├── .gitignore ├── DP.md ├── Diff-Pruning └── exp_code │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── calc_fid.py │ ├── compute_flops.py │ ├── compute_pruned_ssim_curve.py │ ├── compute_ssim.py │ ├── compute_ssim_vis.py │ ├── configs │ ├── bedroom.yml │ ├── celeba.yml │ ├── church.yml │ ├── cifar10.yml │ ├── cifar10_long.yml │ └── cifar10_pruning.yml │ ├── datasets │ ├── __init__.py │ ├── celeba.py │ ├── ffhq.py │ ├── lsun.py │ ├── utils.py │ └── vision.py │ ├── ddpm_full.txt │ ├── draw_ssim_pruned_curve.py │ ├── extract_cifar10.py │ ├── fid_score.py │ ├── finetune.py │ ├── finetune_simple.py │ ├── functions │ ├── __init__.py │ ├── ckpt_util.py │ ├── denoising.py │ └── losses.py │ ├── inception.py │ ├── main.py │ ├── models │ ├── diffusion.py │ └── ema.py │ ├── prune.py │ ├── prune_kd.py │ ├── prune_ssim.py │ ├── prune_test.py │ ├── run │ └── .keep │ ├── runners │ ├── __init__.py │ ├── diffusion.py │ └── diffusion_simple.py │ ├── scripts │ ├── fid_simple_cifar_kim24layer_hp.sh │ ├── fid_simple_cifar_our_hp.sh │ ├── finetune_bedroom_ddpm.sh │ ├── finetune_celeba_ddpm.sh │ ├── finetune_celeba_ddpm_kd.sh │ ├── finetune_church_ddpm.sh │ ├── finetune_cifar_ddpm.sh │ ├── finetune_cifar_ddpm_kd.sh │ ├── finetune_cifar_ddpm_random.sh │ ├── finetune_cifar_ddpm_taylor.sh │ ├── old │ │ ├── run_bedroom_sample_pratrained.sh │ │ ├── run_celeba_pruning_scratch.sh │ │ ├── run_celeba_pruning_taylor.sh │ │ ├── run_celeba_sample_pratrained.sh │ │ ├── run_church_pruning_taylor.sh │ │ ├── run_cifar_pruning_first_order_taylor.sh │ │ ├── run_cifar_pruning_magnitude.sh │ │ ├── run_cifar_pruning_random.sh │ │ ├── run_cifar_pruning_random_kd.sh │ │ ├── run_cifar_pruning_scratch.sh │ │ ├── run_cifar_pruning_second_order_taylor.sh │ │ ├── run_cifar_pruning_taylor.sh │ │ ├── run_cifar_pruning_taylor_kd.sh │ │ └── run_cifar_train.sh │ ├── prune_bedroom_ddpm.sh │ ├── prune_bedroom_ddpm_test.sh │ ├── prune_celeba_ddpm.sh │ ├── prune_celeba_ddpm_ssim.sh │ ├── prune_church_ddpm.sh │ ├── 
prune_church_ddpm_test.sh │ ├── prune_cifar_ddpm.sh │ ├── prune_cifar_ddpm_ssim.sh │ ├── prune_cifar_ddpm_test.sh │ ├── run_celeba.sh │ ├── sample_bedroom_ddpm_pretrained.sh │ ├── sample_bedroom_ddpm_pruning.sh │ ├── sample_celeba_ddpm_pruning.sh │ ├── sample_celeba_pretrained.sh │ ├── sample_church_ddpm_pruning.sh │ ├── sample_church_ddpm_pruning_old.sh │ ├── sample_church_ddpm_test.sh │ ├── sample_church_pretrained.sh │ ├── sample_cifar_ddpm_kim23efficient.sh │ ├── sample_cifar_ddpm_kim24layer.sh │ ├── sample_cifar_ddpm_kim24layermerge.sh │ ├── sample_cifar_ddpm_pretrained.sh │ ├── sample_cifar_ddpm_pruning.sh │ ├── sample_cifar_from_pruned_ddpm_kim23efficient.sh │ ├── sample_cifar_from_pruned_ddpm_kim24layer.sh │ ├── sample_cifar_from_pruned_ddpm_kim24layermerge.sh │ ├── sample_cifar_pretrained.sh │ ├── simple_celeba_our.sh │ ├── simple_cifar_from_pruned_kim23efficient.sh │ ├── simple_cifar_from_pruned_kim24layer.sh │ ├── simple_cifar_from_pruned_kim24layermerge.sh │ ├── simple_cifar_kim23efficient.sh │ ├── simple_cifar_kim24layer.sh │ ├── simple_cifar_kim24layer_hp.sh │ ├── simple_cifar_kim24layermerge.sh │ ├── simple_cifar_our.sh │ ├── simple_cifar_our_hp.sh │ ├── simple_cifar_our_test.sh │ ├── simple_rat_cifar_long_our.sh │ ├── simple_rat_cifar_our.sh │ ├── time_cifar_ddpm_kim23efficient.sh │ ├── time_cifar_ddpm_kim24layer.sh │ ├── time_cifar_ddpm_kim24layermerge.sh │ ├── time_cifar_ddpm_pretrained.sh │ └── time_cifar_ddpm_pruning.sh │ ├── tools │ ├── extract_cifar10.py │ └── transform_weights.py │ ├── torch_pruning │ ├── __init__.py │ ├── _helpers.py │ ├── dependency.py │ ├── importance.py │ ├── ops.py │ ├── pruner │ │ ├── __init__.py │ │ ├── algorithms │ │ │ ├── __init__.py │ │ │ ├── batchnorm_scale_pruner.py │ │ │ ├── group_norm_pruner.py │ │ │ ├── magnitude_based_pruner.py │ │ │ ├── metapruner.py │ │ │ ├── scaling_factor_pruner.py │ │ │ ├── scheduler.py │ │ │ └── taylor_pruner.py │ │ └── function.py │ └── utils │ │ ├── __init__.py │ │ ├── op_counter.py │ 
│ └── utils.py │ └── utils.py ├── EVALUATE.md ├── Efficient-CNN-Depth-Compression ├── .gitignore ├── LICENSE ├── README.md ├── asset │ ├── icml23.yml │ ├── requirements.txt │ └── title.png ├── config │ └── arguments.py ├── exps │ ├── aggregate_imp.py │ ├── generate_tables.py │ ├── inference_trt.py │ ├── main.py │ └── solve_dp.py ├── models │ ├── imagenet │ │ ├── __init__.py │ │ ├── mobilenetv2.py │ │ ├── mobilenetv2_com.py │ │ ├── mobilenetv2_ds.py │ │ ├── vgg.py │ │ └── vgg_com.py │ ├── model_op.py │ └── modules_trt.py └── utils │ ├── __init__.py │ ├── datasets.py │ ├── dp.py │ ├── loaders.py │ ├── logger.py │ ├── measure.py │ ├── misc.py │ ├── table │ ├── mbv2_1.0 │ │ ├── opt_time_fish_gpu1_1228.csv │ │ └── time_fish_gpu1_1228.csv │ ├── mbv2_1.4 │ │ ├── opt_time_fish_gpu1_0103.csv │ │ └── time_fish_gpu1_0103.csv │ └── vgg19_no_trt │ │ ├── opt_time_fish_gpu1_0317.csv │ │ └── time_fish_gpu1_0317.csv │ ├── train.py │ └── txt │ ├── class100.txt │ └── holdout_val.txt ├── HALP ├── Dockerfile ├── LICENSE ├── README.md ├── apex │ ├── .github │ │ └── ISSUE_TEMPLATE │ │ │ └── bug_report.md │ ├── .gitignore │ ├── .gitmodules │ ├── .nojekyll │ ├── LICENSE │ ├── README.md │ ├── apex │ │ ├── RNN │ │ │ ├── README.md │ │ │ ├── RNNBackend.py │ │ │ ├── __init__.py │ │ │ ├── cells.py │ │ │ └── models.py │ │ ├── __init__.py │ │ ├── _autocast_utils.py │ │ ├── amp │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── __version__.py │ │ │ ├── _amp_state.py │ │ │ ├── _initialize.py │ │ │ ├── _process_optimizer.py │ │ │ ├── amp.py │ │ │ ├── compat.py │ │ │ ├── frontend.py │ │ │ ├── handle.py │ │ │ ├── lists │ │ │ │ ├── __init__.py │ │ │ │ ├── functional_overrides.py │ │ │ │ ├── tensor_overrides.py │ │ │ │ └── torch_overrides.py │ │ │ ├── opt.py │ │ │ ├── rnn_compat.py │ │ │ ├── scaler.py │ │ │ ├── utils.py │ │ │ └── wrap.py │ │ ├── contrib │ │ │ ├── __init__.py │ │ │ ├── bottleneck │ │ │ │ ├── __init__.py │ │ │ │ ├── bottleneck.py │ │ │ │ └── halo_exchangers.py │ │ │ ├── clip_grad │ │ │ │ 
├── __init__.py │ │ │ │ └── clip_grad.py │ │ │ ├── conv_bias_relu │ │ │ │ ├── __init__.py │ │ │ │ └── conv_bias_relu.py │ │ │ ├── csrc │ │ │ │ ├── bottleneck │ │ │ │ │ └── bottleneck.cpp │ │ │ │ ├── conv_bias_relu │ │ │ │ │ └── conv_bias_relu.cpp │ │ │ │ ├── cudnn_gbn │ │ │ │ │ ├── cudnn_gbn.cpp │ │ │ │ │ ├── norm_sample.cpp │ │ │ │ │ └── norm_sample.h │ │ │ │ ├── fmha │ │ │ │ │ ├── fmha_api.cpp │ │ │ │ │ └── src │ │ │ │ │ │ ├── fmha.h │ │ │ │ │ │ ├── fmha │ │ │ │ │ │ ├── gemm.h │ │ │ │ │ │ ├── gmem_tile.h │ │ │ │ │ │ ├── kernel_traits.h │ │ │ │ │ │ ├── mask.h │ │ │ │ │ │ ├── smem_tile.h │ │ │ │ │ │ ├── softmax.h │ │ │ │ │ │ └── utils.h │ │ │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h │ │ │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h │ │ │ │ │ │ ├── fmha_fill.cu │ │ │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu │ │ │ │ │ │ ├── fmha_fprop_kernel_1xN.h │ │ │ │ │ │ ├── fmha_kernel.h │ │ │ │ │ │ ├── fmha_noloop_reduce.cu │ │ │ │ │ │ └── fmha_utils.h │ │ │ │ ├── focal_loss │ │ │ │ │ ├── focal_loss_cuda.cpp │ │ │ │ │ └── focal_loss_cuda_kernel.cu │ │ │ │ ├── group_norm │ │ │ │ │ ├── group_norm_nhwc.cpp │ │ │ │ │ ├── group_norm_nhwc.h │ │ │ │ │ ├── group_norm_nhwc_bwd_one_pass.h │ │ │ │ │ ├── group_norm_nhwc_bwd_one_pass_kernel.cuh │ │ │ │ │ ├── group_norm_nhwc_bwd_two_pass.cu │ │ │ │ │ ├── group_norm_nhwc_fwd_one_pass.h │ │ │ │ │ ├── group_norm_nhwc_fwd_one_pass_kernel.cuh │ │ │ │ │ ├── group_norm_nhwc_fwd_two_pass.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_10.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_112.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_120.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_128.cu 
│ │ │ │ │ ├── group_norm_nhwc_one_pass_14.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_16.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_160.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_20.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_24.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_26.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_28.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_30.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_32.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_4.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_40.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_42.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_48.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_56.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_60.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_64.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_70.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_8.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_80.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_84.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_96.cu │ │ │ │ │ ├── group_norm_nhwc_one_pass_98.cu │ │ │ │ │ ├── group_norm_nhwc_op.cpp │ │ │ │ │ ├── macros.h │ │ │ │ │ └── traits.h │ │ │ │ ├── groupbn │ │ │ │ │ ├── batch_norm.cu │ │ │ │ │ ├── batch_norm.h │ │ │ │ │ ├── batch_norm_add_relu.cu │ │ │ │ │ ├── batch_norm_add_relu.h │ │ │ │ │ ├── cuda_utils.h │ │ │ │ │ ├── interface.cpp │ │ │ │ │ ├── ipc.cu │ │ │ │ │ └── nhwc_batch_norm_kernel.h │ │ │ │ ├── index_mul_2d │ │ │ │ │ ├── index_mul_2d_cuda.cpp │ │ │ │ │ └── index_mul_2d_cuda_kernel.cu │ │ │ │ ├── layer_norm │ │ │ │ │ ├── ln.h │ │ │ │ │ ├── ln_api.cpp │ │ │ │ │ ├── ln_bwd_kernels.cuh │ │ │ │ │ ├── ln_bwd_semi_cuda_kernel.cu │ │ │ │ │ ├── ln_fwd_cuda_kernel.cu │ │ │ │ │ ├── ln_fwd_kernels.cuh │ │ │ │ │ ├── ln_kernel_traits.h │ │ │ │ │ └── ln_utils.cuh │ │ │ │ ├── multihead_attn │ │ │ │ │ ├── additive_masked_softmax_dropout_cuda.cu │ │ │ │ │ ├── dropout.cuh │ │ │ │ │ ├── encdec_multihead_attn_cuda.cu │ │ │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu │ │ │ │ │ ├── layer_norm.cuh │ │ │ │ │ ├── masked_softmax_dropout_cuda.cu │ │ │ │ │ ├── 
multihead_attn_frontend.cpp │ │ │ │ │ ├── philox.cuh │ │ │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu │ │ │ │ │ ├── self_multihead_attn_bias_cuda.cu │ │ │ │ │ ├── self_multihead_attn_cuda.cu │ │ │ │ │ ├── self_multihead_attn_norm_add_cuda.cu │ │ │ │ │ ├── softmax.cuh │ │ │ │ │ └── strided_batched_gemm.cuh │ │ │ │ ├── nccl_p2p │ │ │ │ │ ├── nccl_p2p.cpp │ │ │ │ │ ├── nccl_p2p_cuda.cu │ │ │ │ │ ├── nccl_p2p_cuda.cuh │ │ │ │ │ ├── nccl_version.cpp │ │ │ │ │ └── nccl_version_check.cu │ │ │ │ ├── optimizers │ │ │ │ │ ├── fused_adam_cuda.cpp │ │ │ │ │ ├── fused_adam_cuda_kernel.cu │ │ │ │ │ ├── fused_lamb_cuda.cpp │ │ │ │ │ ├── fused_lamb_cuda_kernel.cu │ │ │ │ │ ├── multi_tensor_distopt_adam.cpp │ │ │ │ │ ├── multi_tensor_distopt_adam_kernel.cu │ │ │ │ │ ├── multi_tensor_distopt_lamb.cpp │ │ │ │ │ └── multi_tensor_distopt_lamb_kernel.cu │ │ │ │ ├── peer_memory │ │ │ │ │ ├── peer_memory.cpp │ │ │ │ │ ├── peer_memory_cuda.cu │ │ │ │ │ └── peer_memory_cuda.cuh │ │ │ │ ├── transducer │ │ │ │ │ ├── transducer_joint.cpp │ │ │ │ │ ├── transducer_joint_kernel.cu │ │ │ │ │ ├── transducer_loss.cpp │ │ │ │ │ └── transducer_loss_kernel.cu │ │ │ │ └── xentropy │ │ │ │ │ ├── interface.cpp │ │ │ │ │ └── xentropy_kernel.cu │ │ │ ├── cudnn_gbn │ │ │ │ ├── __init__.py │ │ │ │ └── batch_norm.py │ │ │ ├── examples │ │ │ │ └── multihead_attn │ │ │ │ │ ├── func_test_multihead_attn.py │ │ │ │ │ └── perf_test_multihead_attn.py │ │ │ ├── fmha │ │ │ │ ├── __init__.py │ │ │ │ └── fmha.py │ │ │ ├── focal_loss │ │ │ │ ├── __init__.py │ │ │ │ └── focal_loss.py │ │ │ ├── group_norm │ │ │ │ ├── __init__.py │ │ │ │ └── group_norm.py │ │ │ ├── groupbn │ │ │ │ ├── __init__.py │ │ │ │ └── batch_norm.py │ │ │ ├── index_mul_2d │ │ │ │ ├── __init__.py │ │ │ │ └── index_mul_2d.py │ │ │ ├── layer_norm │ │ │ │ ├── __init__.py │ │ │ │ └── layer_norm.py │ │ │ ├── multihead_attn │ │ │ │ ├── MHA_bwd.png │ │ │ │ ├── MHA_fwd.png │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── 
encdec_multihead_attn.py │ │ │ │ ├── encdec_multihead_attn_func.py │ │ │ │ ├── fast_encdec_multihead_attn_func.py │ │ │ │ ├── fast_encdec_multihead_attn_norm_add_func.py │ │ │ │ ├── fast_self_multihead_attn_func.py │ │ │ │ ├── fast_self_multihead_attn_norm_add_func.py │ │ │ │ ├── mask_softmax_dropout_func.py │ │ │ │ ├── self_multihead_attn.py │ │ │ │ └── self_multihead_attn_func.py │ │ │ ├── openfold_triton │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── _layer_norm_backward_kernels.py │ │ │ │ ├── _layer_norm_config_ampere.py │ │ │ │ ├── _layer_norm_config_hopper.py │ │ │ │ ├── _layer_norm_forward_kernels.py │ │ │ │ ├── _mha_kernel.py │ │ │ │ ├── fused_adam_swa.py │ │ │ │ ├── layer_norm.py │ │ │ │ └── mha.py │ │ │ ├── optimizers │ │ │ │ ├── __init__.py │ │ │ │ ├── distributed_fused_adam.py │ │ │ │ ├── distributed_fused_lamb.py │ │ │ │ ├── fp16_optimizer.py │ │ │ │ ├── fused_adam.py │ │ │ │ ├── fused_lamb.py │ │ │ │ └── fused_sgd.py │ │ │ ├── peer_memory │ │ │ │ ├── __init__.py │ │ │ │ ├── peer_halo_exchanger_1d.py │ │ │ │ └── peer_memory.py │ │ │ ├── sparsity │ │ │ │ ├── COPYRIGHT │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── asp.py │ │ │ │ ├── permutation_lib.py │ │ │ │ ├── permutation_search_kernels │ │ │ │ │ ├── CUDA_kernels │ │ │ │ │ │ └── permutation_search_kernels.cu │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── call_permutation_search_kernels.py │ │ │ │ │ ├── channel_swap.py │ │ │ │ │ ├── exhaustive_search.py │ │ │ │ │ └── permutation_utilities.py │ │ │ │ ├── permutation_tests │ │ │ │ │ ├── README.md │ │ │ │ │ ├── ablation_studies.sh │ │ │ │ │ ├── permutation_test.py │ │ │ │ │ ├── runtime_table.sh │ │ │ │ │ └── unstructured_study.sh │ │ │ │ ├── sparse_masklib.py │ │ │ │ └── test │ │ │ │ │ ├── checkpointing_test_part1.py │ │ │ │ │ ├── checkpointing_test_part2.py │ │ │ │ │ ├── checkpointing_test_reference.py │ │ │ │ │ ├── test_permutation_application.py │ │ │ │ │ └── toy_problem.py │ │ │ ├── test │ │ │ │ ├── __init__.py │ │ │ │ ├── bottleneck │ 
│ │ │ │ ├── __init__.py │ │ │ │ │ └── test_bottleneck_module.py │ │ │ │ ├── clip_grad │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_clip_grad.py │ │ │ │ ├── conv_bias_relu │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_conv_bias_relu.py │ │ │ │ ├── cudnn_gbn │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_cudnn_gbn_with_two_gpus.py │ │ │ │ ├── fmha │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_fmha.py │ │ │ │ ├── focal_loss │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_focal_loss.py │ │ │ │ ├── fused_dense │ │ │ │ │ └── test_fused_dense.py │ │ │ │ ├── group_norm │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_group_norm.py │ │ │ │ ├── index_mul_2d │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_index_mul_2d.py │ │ │ │ ├── layer_norm │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_fast_layer_norm.py │ │ │ │ ├── multihead_attn │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── test_encdec_multihead_attn.py │ │ │ │ │ ├── test_encdec_multihead_attn_norm_add.py │ │ │ │ │ ├── test_fast_self_multihead_attn_bias.py │ │ │ │ │ ├── test_mha_fused_softmax.py │ │ │ │ │ ├── test_self_multihead_attn.py │ │ │ │ │ └── test_self_multihead_attn_norm_add.py │ │ │ │ ├── optimizers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── test_dist_adam.py │ │ │ │ │ └── test_distributed_fused_lamb.py │ │ │ │ ├── peer_memory │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_peer_halo_exchange_module.py │ │ │ │ ├── transducer │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── test_transducer_joint.py │ │ │ │ │ └── test_transducer_loss.py │ │ │ │ └── xentropy │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_label_smoothing.py │ │ │ ├── transducer │ │ │ │ ├── __init__.py │ │ │ │ ├── _transducer_ref.py │ │ │ │ └── transducer.py │ │ │ └── xentropy │ │ │ │ ├── __init__.py │ │ │ │ └── softmax_xentropy.py │ │ ├── fp16_utils │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── fp16_optimizer.py │ │ │ ├── fp16util.py │ │ │ └── loss_scaler.py │ │ ├── fused_dense │ │ │ ├── __init__.py │ │ │ └── fused_dense.py │ │ ├── mlp │ │ │ ├── __init__.py │ │ │ └── mlp.py │ │ 
├── multi_tensor_apply │ │ │ ├── __init__.py │ │ │ └── multi_tensor_apply.py │ │ ├── normalization │ │ │ ├── __init__.py │ │ │ └── fused_layer_norm.py │ │ ├── optimizers │ │ │ ├── __init__.py │ │ │ ├── fused_adagrad.py │ │ │ ├── fused_adam.py │ │ │ ├── fused_lamb.py │ │ │ ├── fused_mixed_precision_lamb.py │ │ │ ├── fused_novograd.py │ │ │ └── fused_sgd.py │ │ ├── parallel │ │ │ ├── LARC.py │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── distributed.py │ │ │ ├── multiproc.py │ │ │ ├── optimized_sync_batchnorm.py │ │ │ ├── optimized_sync_batchnorm_kernel.py │ │ │ ├── sync_batchnorm.py │ │ │ └── sync_batchnorm_kernel.py │ │ └── transformer │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _data │ │ │ ├── __init__.py │ │ │ └── _batchsampler.py │ │ │ ├── _ucc_util.py │ │ │ ├── amp │ │ │ ├── __init__.py │ │ │ └── grad_scaler.py │ │ │ ├── enums.py │ │ │ ├── functional │ │ │ ├── __init__.py │ │ │ └── fused_softmax.py │ │ │ ├── layers │ │ │ ├── __init__.py │ │ │ └── layer_norm.py │ │ │ ├── log_util.py │ │ │ ├── microbatches.py │ │ │ ├── parallel_state.py │ │ │ ├── pipeline_parallel │ │ │ ├── __init__.py │ │ │ ├── _timers.py │ │ │ ├── p2p_communication.py │ │ │ ├── schedules │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── fwd_bwd_no_pipelining.py │ │ │ │ ├── fwd_bwd_pipelining_with_interleaving.py │ │ │ │ └── fwd_bwd_pipelining_without_interleaving.py │ │ │ └── utils.py │ │ │ ├── tensor_parallel │ │ │ ├── __init__.py │ │ │ ├── cross_entropy.py │ │ │ ├── data.py │ │ │ ├── layers.py │ │ │ ├── mappings.py │ │ │ ├── memory.py │ │ │ ├── random.py │ │ │ └── utils.py │ │ │ ├── testing │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── commons.py │ │ │ ├── distributed_test_base.py │ │ │ ├── global_vars.py │ │ │ ├── standalone_bert.py │ │ │ ├── standalone_gpt.py │ │ │ └── standalone_transformer_lm.py │ │ │ └── utils.py │ ├── csrc │ │ ├── amp_C_frontend.cpp │ │ ├── compat.h │ │ ├── flatten_unflatten.cpp │ │ ├── fused_dense.cpp │ │ ├── fused_dense_cuda.cu │ │ ├── 
layer_norm_cuda.cpp │ │ ├── layer_norm_cuda_kernel.cu │ │ ├── megatron │ │ │ ├── fused_weight_gradient_dense.cpp │ │ │ ├── fused_weight_gradient_dense_16bit_prec_cuda.cu │ │ │ ├── fused_weight_gradient_dense_cuda.cu │ │ │ ├── generic_scaled_masked_softmax.cpp │ │ │ ├── generic_scaled_masked_softmax.h │ │ │ ├── generic_scaled_masked_softmax_cuda.cu │ │ │ ├── scaled_masked_softmax.cpp │ │ │ ├── scaled_masked_softmax.h │ │ │ ├── scaled_masked_softmax_cuda.cu │ │ │ ├── scaled_softmax.cpp │ │ │ ├── scaled_softmax_cuda.cu │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ ├── scaled_upper_triang_masked_softmax.h │ │ │ └── scaled_upper_triang_masked_softmax_cuda.cu │ │ ├── mlp.cpp │ │ ├── mlp_cuda.cu │ │ ├── multi_tensor_adagrad.cu │ │ ├── multi_tensor_adam.cu │ │ ├── multi_tensor_apply.cuh │ │ ├── multi_tensor_axpby_kernel.cu │ │ ├── multi_tensor_l2norm_kernel.cu │ │ ├── multi_tensor_l2norm_kernel_mp.cu │ │ ├── multi_tensor_l2norm_scale_kernel.cu │ │ ├── multi_tensor_lamb.cu │ │ ├── multi_tensor_lamb_mp.cu │ │ ├── multi_tensor_lamb_stage_1.cu │ │ ├── multi_tensor_lamb_stage_2.cu │ │ ├── multi_tensor_novograd.cu │ │ ├── multi_tensor_scale_kernel.cu │ │ ├── multi_tensor_sgd_kernel.cu │ │ ├── static_switch.h │ │ ├── syncbn.cpp │ │ ├── type_shim.h │ │ ├── update_scale_hysteresis.cu │ │ └── welford.cu │ ├── docs │ │ ├── Makefile │ │ └── source │ │ │ ├── _static │ │ │ ├── css │ │ │ │ └── pytorch_theme.css │ │ │ └── img │ │ │ │ └── nv-pytorch2.png │ │ │ ├── _templates │ │ │ └── layout.html │ │ │ ├── advanced.rst │ │ │ ├── amp.rst │ │ │ ├── conf.py │ │ │ ├── fp16_utils.rst │ │ │ ├── index.rst │ │ │ ├── layernorm.rst │ │ │ ├── optimizers.rst │ │ │ └── parallel.rst │ ├── examples │ │ ├── README.md │ │ ├── dcgan │ │ │ ├── README.md │ │ │ └── main_amp.py │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ └── README.md │ │ ├── imagenet │ │ │ ├── README.md │ │ │ └── main_amp.py │ │ └── simple │ │ │ └── distributed │ │ │ ├── README.md │ │ │ ├── distributed_data_parallel.py │ │ │ └── run.sh 
│ ├── pyproject.toml │ ├── requirements.txt │ ├── requirements_dev.txt │ ├── setup.py │ └── tests │ │ ├── L0 │ │ ├── run_amp │ │ │ ├── __init__.py │ │ │ ├── test_add_param_group.py │ │ │ ├── test_basic_casts.py │ │ │ ├── test_cache.py │ │ │ ├── test_checkpointing.py │ │ │ ├── test_fused_sgd.py │ │ │ ├── test_larc.py │ │ │ ├── test_multi_tensor_axpby.py │ │ │ ├── test_multi_tensor_l2norm.py │ │ │ ├── test_multi_tensor_scale.py │ │ │ ├── test_multi_tensor_unscale_l2norm.py │ │ │ ├── test_multiple_models_optimizers_losses.py │ │ │ ├── test_promotion.py │ │ │ ├── test_rnn.py │ │ │ ├── test_update_scale_hysteresis.py │ │ │ └── utils.py │ │ ├── run_deprecated │ │ │ └── test_deprecated_warning.py │ │ ├── run_fp16util │ │ │ ├── __init__.py │ │ │ └── test_fp16util.py │ │ ├── run_fused_layer_norm │ │ │ └── test_fused_layer_norm.py │ │ ├── run_mlp │ │ │ └── test_mlp.py │ │ ├── run_optimizers │ │ │ ├── __init__.py │ │ │ ├── test_adam.py │ │ │ ├── test_fused_novograd.py │ │ │ ├── test_fused_optimizer.py │ │ │ └── test_lamb.py │ │ ├── run_test.py │ │ └── run_transformer │ │ │ ├── __init__.py │ │ │ ├── gpt_scaling_test.py │ │ │ ├── test_batch_sampler.py │ │ │ ├── test_bert_minimal.py │ │ │ ├── test_cross_entropy.py │ │ │ ├── test_data.py │ │ │ ├── test_dynamic_batchsize.py │ │ │ ├── test_fused_softmax.py │ │ │ ├── test_gpt_minimal.py │ │ │ ├── test_layers.py │ │ │ ├── test_mapping.py │ │ │ ├── test_microbatches.py │ │ │ ├── test_p2p_comm.py │ │ │ ├── test_parallel_state.py │ │ │ ├── test_pipeline_parallel_fwd_bwd.py │ │ │ ├── test_random.py │ │ │ └── test_transformer_utils.py │ │ ├── L1 │ │ ├── common │ │ │ ├── compare.py │ │ │ ├── main_amp.py │ │ │ └── run_test.sh │ │ ├── cross_product │ │ │ └── run.sh │ │ ├── cross_product_distributed │ │ │ └── run.sh │ │ └── transformer │ │ │ └── pipeline_parallel_fwd_bwd_ucc_async.py │ │ ├── distributed │ │ ├── DDP │ │ │ ├── ddp_race_condition_test.py │ │ │ └── run_race_test.sh │ │ ├── amp_master_params │ │ │ ├── amp_master_params.py │ │ │ 
├── compare.py │ │ │ └── run.sh │ │ └── synced_batchnorm │ │ │ ├── python_single_gpu_unit_test.py │ │ │ ├── single_gpu_unit_test.py │ │ │ ├── test_batchnorm1d.py │ │ │ ├── test_groups.py │ │ │ ├── two_gpu_test_different_batch_size.py │ │ │ ├── two_gpu_unit_test.py │ │ │ └── unit_test.sh │ │ └── docker_extension_builds │ │ └── run.sh ├── assets │ └── pipeline.png ├── configs │ ├── exp_configs │ │ ├── rn34_imagenet_baseline.yaml │ │ ├── rn34_imagenet_baseline_eval.yaml │ │ ├── rn34_imagenet_prune_rat0.2.yaml │ │ ├── rn34_imagenet_prune_rat0.25.yaml │ │ ├── rn34_imagenet_prune_rat0.3.yaml │ │ ├── rn34_imagenet_prune_rat0.35.yaml │ │ ├── rn34_imagenet_prune_rat0.45.yaml │ │ ├── rn50_imagenet_baseline.yaml │ │ ├── rn50_imagenet_baseline_eval.yaml │ │ ├── rn50_imagenet_prune_rat0.1.yaml │ │ ├── rn50_imagenet_prune_rat0.15.yaml │ │ ├── rn50_imagenet_prune_rat0.2.yaml │ │ └── rn50_imagenet_prune_rat0.45.yaml │ └── prune_configs │ │ ├── rn34_fmap.json │ │ ├── rn34_prune_layer.json │ │ ├── rn50_fmap.json │ │ ├── rn50_prune_layer.json │ │ ├── rtx2080_rn34_prune_groups.json │ │ ├── rtx2080_rn50_prune_groups.json │ │ ├── titanv_rn34_prune_groups.json │ │ └── titanv_rn50_prune_groups.json ├── data │ └── dataloaders.py ├── main.py ├── models │ ├── __init__.py │ ├── create_model.py │ ├── resnet.py │ ├── resnet_fused.py │ └── resnet_pruned.py ├── multiproc.py ├── profile_halp.py ├── prune │ ├── cost.py │ ├── importance.py │ ├── prune_config.py │ └── pruner.py ├── train │ ├── lr_schedule.py │ ├── optimizer.py │ └── training.py └── utils │ ├── mixup.py │ ├── model_summary.py │ ├── smoothing.py │ └── utils.py ├── LICENSE ├── README.md ├── asset ├── short_demo.png └── title.png ├── examples ├── .gitignore ├── 0_mbv2_demo.ipynb ├── 1_ddpm_demo.ipynb ├── ckpt │ └── .keep ├── ddpm_cifar10.yml ├── imagenet1000clsidx_to_list.txt └── images │ └── husky.png ├── layer_merge ├── __init__.py ├── aggregate_imp.py ├── ddpm_trainer.py ├── kim23efficient │ ├── __init__.py │ ├── datasets.py │ ├── 
generate_tables.py │ ├── holdout_val.txt │ └── importance.py ├── kim24layer │ ├── __init__.py │ ├── datasets.py │ ├── generate_tables.py │ ├── holdout_train.txt │ ├── holdout_val.txt │ └── importance.py ├── kim24layermerge │ ├── __init__.py │ ├── datasets.py │ ├── generate_tables.py │ ├── holdout_train.txt │ ├── holdout_val.txt │ └── importance.py ├── measure.py ├── models │ ├── __init__.py │ ├── ddpm.py │ ├── ddpm_cfg │ │ ├── __init__.py │ │ ├── bedroom.yml │ │ ├── celeba.yml │ │ ├── church.yml │ │ └── cifar10.yml │ ├── ddpm_datasets │ │ ├── __init__.py │ │ ├── celeba.py │ │ ├── cifar.py │ │ ├── cifar10_holdout_train.txt │ │ ├── cifar10_holdout_val.txt │ │ ├── ffhq.py │ │ ├── lsun.py │ │ ├── utils.py │ │ └── vision.py │ ├── ddpm_layer.py │ ├── ddpm_merged.py │ ├── ddpm_merged_layer.py │ ├── merge_op.py │ ├── mobilenetv2.py │ ├── mobilenetv2_layer.py │ ├── mobilenetv2_merged_layer.py │ ├── resnet.py │ ├── resnet_layer.py │ ├── resnet_merged.py │ └── resnet_merged_layer.py └── trainer.py ├── lymg.yml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | output/ 2 | __pycache__ 3 | *.pkl 4 | *.DS_Store 5 | LUT*/ 6 | model_ckpt/ 7 | cache/ 8 | run_slurm/ 9 | slurm/ 10 | output* 11 | rn34_output* 12 | rn50_output* 13 | rn34_rtx2080* 14 | rn50_rtx2080* 15 | model*.txt 16 | test_skip.py 17 | generate_lst.py 18 | generate_group.py 19 | measure_halp.py 20 | merge_rtx2080.py 21 | test.py 22 | plots/ 23 | pretrained/ 24 | my_util/ 25 | msr* 26 | solve*.sh 27 | *.egg-info 28 | ddpm_chk.txt 29 | ddpm_mgd.txt 30 | .ipynb_checkpoints/ 31 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ 3 | *.log 4 | run/finetune_simple_v2/ 5 | run/*.npz 6 | run/sample_* 7 | run/time_* 8 | data 9 | *.png 
-------------------------------------------------------------------------------- /Diff-Pruning/exp_code/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jiaming Song 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/calc_fid.py: -------------------------------------------------------------------------------- 1 | from cleanfid import fid 2 | import argparse 3 | parser = argparse.ArgumentParser(description=globals()["__doc__"]) 4 | parser.add_argument('--path1', type=str, required=True, help='Path to the images') 5 | parser.add_argument('--path2', type=str, required=True, help='Path to the images') 6 | args = parser.parse_args() 7 | 8 | if args.path2=="cifar10": 9 | score = fid.compute_fid(args.path1, dataset_name="cifar10", dataset_res=32, dataset_split="train") 10 | else: 11 | score = fid.compute_fid(args.path1, args.path2) 12 | print("FID: ", score) -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/compute_flops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random, os 3 | import argparse 4 | from PIL import Image 5 | import torchvision 6 | import numpy as np 7 | import pytorch_msssim 8 | from utils import UnlabeledImageFolder 9 | from tqdm import tqdm 10 | import torch_pruning as tp 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--restore_from', type=str, required=True) 13 | args = parser.parse_args() 14 | 15 | model = torch.load(args.restore_from, map_location='cpu')[0] 16 | example_inputs = {'x': torch.randn(1, 3, 32, 32), 't': torch.ones(1)} 17 | macs, params = tp.utils.count_ops_and_params(model, example_inputs) 18 | print("model: {}, macs: {} G, params: {} M".format(args.restore_from, macs/1e9, params/1e6)) 19 | 20 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/compute_pruned_ssim_curve.py: -------------------------------------------------------------------------------- 1 | import pytorch_msssim 2 | import os 3 | import torch 4 | from PIL import Image 5 | import 
torchvision 6 | 7 | base_folder_name = 'run/prune_ssim_2/0' 8 | folder_name = [os.path.join('run/prune_ssim_2', '{}'.format(k)) for k in range(50, 1000+1, 50)] 9 | n_samples = 32 10 | # test ssim for each folder 11 | folder_ssim = [] 12 | for f in folder_name: 13 | ssim_list = [] 14 | for img_id in range(n_samples): 15 | img1 = Image.open(os.path.join(base_folder_name, f'{img_id}.png')) 16 | img2 = Image.open(os.path.join(f, f'{img_id}.png')) 17 | img1_tensor = torchvision.transforms.ToTensor()(img1) 18 | img2_tensor = torchvision.transforms.ToTensor()(img2) 19 | img1_tensor = img1_tensor.unsqueeze(0) 20 | img2_tensor = img2_tensor.unsqueeze(0) 21 | ssim = pytorch_msssim.ssim(img1_tensor, img2_tensor, data_range=1.0, size_average=True) 22 | ssim_list.append(ssim) 23 | ssim = sum(ssim_list) / len(ssim_list) 24 | folder_ssim.append(ssim.item()) 25 | print(folder_ssim) -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/compute_ssim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random, os 3 | import argparse 4 | from PIL import Image 5 | import torchvision 6 | import numpy as np 7 | import pytorch_msssim 8 | from utils import UnlabeledImageFolder 9 | from tqdm import tqdm 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--path', type=str, required=True, nargs='+') 12 | args = parser.parse_args() 13 | 14 | # generate random index 15 | nrow = 16 16 | img_index = random.sample(list(range(50000)), nrow*nrow) 17 | path1 = args.path[0] 18 | path2 = args.path[1] 19 | print(path1, path2) 20 | img_dst1 = UnlabeledImageFolder(path1, transform=torchvision.transforms.ToTensor(), exts=["png"]) 21 | img_dst2 = UnlabeledImageFolder(path2, transform=torchvision.transforms.ToTensor(), exts=["png"]) 22 | print(len(img_dst1), len(img_dst2)) 23 | 24 | loader1 = torch.utils.data.DataLoader( 25 | img_dst1, 26 | batch_size=100, 27 | shuffle=False, 28 | 
num_workers=4, 29 | drop_last=False, 30 | ) 31 | loader2 = torch.utils.data.DataLoader( 32 | img_dst2, 33 | batch_size=100, 34 | shuffle=False, 35 | num_workers=4, 36 | drop_last=False, 37 | ) 38 | 39 | with torch.no_grad(): 40 | ssim_list = [] 41 | mse_list = [] 42 | for i, (img1, img2) in tqdm(enumerate(zip(loader1, loader2))): 43 | ssim = pytorch_msssim.ssim(img1.cuda(), img2.cuda(), data_range=1.0, size_average=False) 44 | ssim_list.append(ssim.cpu()) 45 | mse = torch.nn.functional.mse_loss(img1.cuda(), img2.cuda(), reduction='none').mean(dim=(1,2,3)) 46 | mse_list.append(mse.cpu()) 47 | 48 | ssim = torch.cat(ssim_list, dim=0) 49 | mse = torch.cat(mse_list, dim=0) 50 | ssim_avg = ssim.mean() 51 | mse_avg = mse.mean() 52 | print("path1: {}, path2: {}, ssim: {}, mse: {}".format(path1, path2, ssim_avg, mse_avg)) 53 | 54 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/compute_ssim_vis.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random, os 3 | import argparse 4 | from PIL import Image 5 | import torchvision 6 | import numpy as np 7 | import pytorch_msssim 8 | from utils import UnlabeledImageFolder 9 | from tqdm import tqdm 10 | img_ids = [159, 149, 144, 127, 86, 41] 11 | image_folder1 = 'run/sample_v2/bedroom_250k/image_samples/images/0' 12 | image_folder2 = 'run/sample_v2/bedroom_official/image_samples/images/0' 13 | base_img_id = 0 14 | ssim_list = [] 15 | for iid in img_ids: 16 | img1 = Image.open(os.path.join(image_folder1, f'{iid}.png')) 17 | img2 = Image.open(os.path.join(image_folder2, f'{iid}.png')) 18 | img1_tensor = torchvision.transforms.ToTensor()(img1).unsqueeze(0) 19 | img2_tensor = torchvision.transforms.ToTensor()(img2).unsqueeze(0) 20 | ssim = pytorch_msssim.ssim(img1_tensor, img2_tensor, data_range=1.0, size_average=True) 21 | ssim_list.append(ssim.item()) 22 | print(ssim_list) 23 | 
-------------------------------------------------------------------------------- /Diff-Pruning/exp_code/configs/bedroom.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "LSUN" 3 | category: "bedroom" 4 | image_size: 256 5 | channels: 3 6 | logit_transform: false 7 | uniform_dequantization: false 8 | gaussian_dequantization: false 9 | random_flip: true 10 | rescaled: true 11 | num_workers: 32 12 | 13 | model: 14 | type: "simple" 15 | in_channels: 3 16 | out_ch: 3 17 | ch: 128 18 | ch_mult: [1, 1, 2, 2, 4, 4] 19 | num_res_blocks: 2 20 | attn_resolutions: [16, ] 21 | dropout: 0.0 22 | var_type: fixedsmall 23 | ema_rate: 0.999 24 | ema: True 25 | resamp_with_conv: True 26 | 27 | diffusion: 28 | beta_schedule: linear 29 | beta_start: 0.0001 30 | beta_end: 0.02 31 | num_diffusion_timesteps: 1000 32 | 33 | training: 34 | batch_size: 8 35 | n_epochs: 10000 36 | n_iters: 5000000 37 | snapshot_freq: 5000 38 | validation_freq: 2000 39 | 40 | sampling: 41 | batch_size: 16 42 | last_only: True 43 | 44 | optim: 45 | weight_decay: 0.000 46 | optimizer: "Adam" 47 | lr: 0.000002 48 | beta1: 0.9 49 | amsgrad: false 50 | eps: 0.00000001 51 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/configs/celeba.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "CELEBA" 3 | image_size: 64 4 | channels: 3 5 | logit_transform: false 6 | uniform_dequantization: false 7 | gaussian_dequantization: false 8 | random_flip: true 9 | rescaled: true 10 | num_workers: 4 11 | 12 | model: 13 | type: "simple" 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: [1, 2, 2, 2, 4] 18 | num_res_blocks: 2 19 | attn_resolutions: [16, ] 20 | dropout: 0.1 21 | var_type: fixedlarge 22 | ema_rate: 0.9999 23 | ema: True 24 | resamp_with_conv: True 25 | 26 | diffusion: 27 | beta_schedule: linear 28 | beta_start: 0.0001 29 | beta_end: 0.02 
30 | num_diffusion_timesteps: 1000 31 | 32 | training: 33 | batch_size: 96 # 128 34 | n_epochs: 10000 35 | n_iters: 5000000 36 | snapshot_freq: 5000 37 | validation_freq: 20000 38 | 39 | sampling: 40 | batch_size: 32 41 | last_only: True 42 | 43 | optim: 44 | weight_decay: 0.000 45 | optimizer: "Adam" 46 | lr: 0.0002 47 | beta1: 0.9 48 | amsgrad: false 49 | eps: 0.00000001 50 | grad_clip: 1.0 51 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/configs/church.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "LSUN" 3 | category: "church_outdoor" 4 | image_size: 256 5 | channels: 3 6 | logit_transform: false 7 | uniform_dequantization: false 8 | gaussian_dequantization: false 9 | random_flip: true 10 | rescaled: true 11 | num_workers: 32 12 | 13 | model: 14 | type: "simple" 15 | in_channels: 3 16 | out_ch: 3 17 | ch: 128 18 | ch_mult: [1, 1, 2, 2, 4, 4] 19 | num_res_blocks: 2 20 | attn_resolutions: [16, ] 21 | dropout: 0.0 22 | var_type: fixedsmall 23 | ema_rate: 0.999 24 | ema: True 25 | resamp_with_conv: True 26 | 27 | diffusion: 28 | beta_schedule: linear 29 | beta_start: 0.0001 30 | beta_end: 0.02 31 | num_diffusion_timesteps: 1000 32 | 33 | training: 34 | batch_size: 8 # 64 35 | n_epochs: 10000 36 | n_iters: 5000000 37 | snapshot_freq: 5000 38 | validation_freq: 2000 39 | 40 | sampling: 41 | batch_size: 16 42 | last_only: True 43 | 44 | optim: 45 | weight_decay: 0.000 46 | optimizer: "Adam" 47 | lr: 0.00002 48 | beta1: 0.9 49 | amsgrad: false 50 | eps: 0.00000001 51 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/configs/cifar10.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "CIFAR10" 3 | image_size: 32 4 | channels: 3 5 | logit_transform: false 6 | uniform_dequantization: false 7 | gaussian_dequantization: false 8 | random_flip: 
true 9 | rescaled: true 10 | num_workers: 4 11 | 12 | model: 13 | type: "simple" 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: [1, 2, 2, 2] 18 | num_res_blocks: 2 19 | attn_resolutions: [16, ] 20 | dropout: 0.1 21 | var_type: fixedlarge 22 | ema_rate: 0.9999 23 | ema: True 24 | resamp_with_conv: True 25 | 26 | diffusion: 27 | beta_schedule: linear 28 | beta_start: 0.0001 29 | beta_end: 0.02 30 | num_diffusion_timesteps: 1000 31 | 32 | training: 33 | batch_size: 128 34 | n_epochs: 256 35 | n_iters: 100000 36 | snapshot_freq: 50000 37 | validation_freq: 2000 38 | 39 | sampling: 40 | batch_size: 64 41 | last_only: True 42 | 43 | optim: 44 | weight_decay: 0.000 45 | optimizer: "Adam" 46 | lr: 0.0002 47 | beta1: 0.9 48 | amsgrad: false 49 | eps: 0.00000001 50 | grad_clip: 1.0 51 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/configs/cifar10_long.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "CIFAR10" 3 | image_size: 32 4 | channels: 3 5 | logit_transform: false 6 | uniform_dequantization: false 7 | gaussian_dequantization: false 8 | random_flip: true 9 | rescaled: true 10 | num_workers: 4 11 | 12 | model: 13 | type: "simple" 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: [1, 2, 2, 2] 18 | num_res_blocks: 2 19 | attn_resolutions: [16, ] 20 | dropout: 0.1 21 | var_type: fixedlarge 22 | ema_rate: 0.9999 23 | ema: True 24 | resamp_with_conv: True 25 | 26 | diffusion: 27 | beta_schedule: linear 28 | beta_start: 0.0001 29 | beta_end: 0.02 30 | num_diffusion_timesteps: 1000 31 | 32 | training: 33 | batch_size: 128 34 | n_epochs: 512 35 | n_iters: 200000 36 | snapshot_freq: 100000 37 | validation_freq: 2000 38 | 39 | sampling: 40 | batch_size: 64 41 | last_only: True 42 | 43 | optim: 44 | weight_decay: 0.000 45 | optimizer: "Adam" 46 | lr: 0.0002 47 | beta1: 0.9 48 | amsgrad: false 49 | eps: 0.00000001 50 | grad_clip: 1.0 51 | 
-------------------------------------------------------------------------------- /Diff-Pruning/exp_code/configs/cifar10_pruning.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "CIFAR10" 3 | image_size: 32 4 | channels: 3 5 | logit_transform: false 6 | uniform_dequantization: false 7 | gaussian_dequantization: false 8 | random_flip: true 9 | rescaled: true 10 | num_workers: 4 11 | 12 | model: 13 | type: "simple" 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: [1, 2, 2, 2] 18 | num_res_blocks: 2 19 | attn_resolutions: [16, ] 20 | dropout: 0.1 21 | var_type: fixedlarge 22 | ema_rate: 0.9999 23 | ema: True 24 | resamp_with_conv: True 25 | 26 | diffusion: 27 | beta_schedule: linear 28 | beta_start: 0.0001 29 | beta_end: 0.02 30 | num_diffusion_timesteps: 1000 31 | 32 | training: 33 | batch_size: 128 34 | n_epochs: 10000 35 | n_iters: 5000000 36 | snapshot_freq: 5000 37 | validation_freq: 2000 38 | 39 | sampling: 40 | batch_size: 64 41 | last_only: True 42 | 43 | optim: 44 | weight_decay: 0.000 45 | optimizer: "Adam" 46 | lr: 0.00002 47 | beta1: 0.9 48 | amsgrad: false 49 | eps: 0.00000001 50 | grad_clip: 1.0 51 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/datasets/ffhq.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import lmdb 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | 7 | 8 | class FFHQ(Dataset): 9 | def __init__(self, path, transform, resolution=8): 10 | self.env = lmdb.open( 11 | path, 12 | max_readers=32, 13 | readonly=True, 14 | lock=False, 15 | readahead=False, 16 | meminit=False, 17 | ) 18 | 19 | if not self.env: 20 | raise IOError('Cannot open lmdb dataset', path) 21 | 22 | with self.env.begin(write=False) as txn: 23 | self.length = int(txn.get('length'.encode('utf-8')).decode('utf-8')) 24 | 25 | self.resolution = resolution 26 | 
self.transform = transform 27 | 28 | def __len__(self): 29 | return self.length 30 | 31 | def __getitem__(self, index): 32 | with self.env.begin(write=False) as txn: 33 | key = f'{self.resolution}-{str(index).zfill(5)}'.encode('utf-8') 34 | img_bytes = txn.get(key) 35 | 36 | buffer = BytesIO(img_bytes) 37 | img = Image.open(buffer) 38 | img = self.transform(img) 39 | target = 0 40 | 41 | return img, target -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/extract_cifar10.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torchvision 3 | from torchvision.datasets import CIFAR10 4 | from tqdm import tqdm 5 | 6 | # Define the path to the folder where the images will be saved 7 | save_path = 'data/cifar10/images' 8 | 9 | # Create the folder if it doesn't exist 10 | if not os.path.exists(save_path): 11 | os.makedirs(save_path) 12 | 13 | # Load the CIFAR10 dataset 14 | dataset = CIFAR10(root='data/cifar10', train=True, download=True) 15 | 16 | # Loop through the dataset and save each image to the folder 17 | for i in tqdm(range(len(dataset))): 18 | image, label = dataset[i] 19 | image_name = f'{i}.png' 20 | image_path = os.path.join(save_path, image_name) 21 | image.save(image_path) -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/functions/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | 3 | 4 | def get_optimizer(config, parameters): 5 | if config.optim.optimizer == 'Adam': 6 | return optim.Adam(parameters, lr=config.optim.lr, weight_decay=config.optim.weight_decay, 7 | betas=(config.optim.beta1, 0.999), amsgrad=config.optim.amsgrad, 8 | eps=config.optim.eps) 9 | elif config.optim.optimizer == 'RMSProp': 10 | return optim.RMSprop(parameters, lr=config.optim.lr, weight_decay=config.optim.weight_decay) 11 | elif 
config.optim.optimizer == 'SGD': 12 | return optim.SGD(parameters, lr=config.optim.lr, momentum=0.9) 13 | else: 14 | raise NotImplementedError( 15 | 'Optimizer {} not understood.'.format(config.optim.optimizer)) 16 | 17 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/functions/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def noise_estimation_loss(model, 5 | x0: torch.Tensor, 6 | t: torch.LongTensor, 7 | e: torch.Tensor, 8 | b: torch.Tensor, keepdim=False): 9 | a = (1-b).cumprod(dim=0).index_select(0, t).view(-1, 1, 1, 1) 10 | x = x0 * a.sqrt() + e * (1.0 - a).sqrt() 11 | output = model(x, t.float()) 12 | if keepdim: 13 | return (e - output).square().sum(dim=(1, 2, 3)) 14 | else: 15 | return (e - output).square().sum(dim=(1, 2, 3)).mean(dim=0) 16 | 17 | def noise_estimation_kd_loss(model, 18 | teacher, 19 | x0: torch.Tensor, 20 | t: torch.LongTensor, 21 | e: torch.Tensor, 22 | b: torch.Tensor, keepdim=False): 23 | a = (1-b).cumprod(dim=0).index_select(0, t).view(-1, 1, 1, 1) 24 | x = x0 * a.sqrt() + e * (1.0 - a).sqrt() 25 | output = model(x, t.float()) 26 | with torch.no_grad(): 27 | teacher_output = teacher(x, t.float()) 28 | if keepdim: 29 | return 0.7*(teacher_output - output).square().sum(dim=(1, 2, 3)) + 0.3 * (e - output).square().sum(dim=(1, 2, 3)) 30 | else: 31 | return 0.7*(teacher_output - output).square().sum(dim=(1, 2, 3)).mean(dim=0) + 0.3 * (e - output).square().sum(dim=(1, 2, 3)).mean(dim=0) 32 | 33 | 34 | loss_registry = { 35 | 'simple': noise_estimation_loss, 36 | } 37 | -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/run/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/Diff-Pruning/exp_code/run/.keep 
-------------------------------------------------------------------------------- /Diff-Pruning/exp_code/runners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/Diff-Pruning/exp_code/runners/__init__.py -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_bedroom_ddpm.sh: -------------------------------------------------------------------------------- 1 | python -B -m torch.distributed.launch --nproc_per_node=6 --master_port 22223 --use_env finetune.py \ 2 | --config bedroom.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune/bedroom_ddpm_$1_0.3_finetuned-continue-v4-2e-5 \ 7 | --doc post_training \ 8 | --skip_type uniform \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --load_pruned_model run/finetune/bedroom_ddpm_taylor_0.3_finetuned-continue-v3-2e-6/logs/post_training/ckpt_65000.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_celeba_ddpm.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config celeba.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune_final/celeba_T=$1_finetuned \ 7 | --doc post_training \ 8 | --skip_type uniform \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --load_pruned_model "run/pruned_final/celeba_T=$1.pth" \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_celeba_ddpm_kd.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config celeba.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune_v2/celeba_ddpm_$1_0.3_finetuned_kd \ 7 | 
--doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --kd \ 13 | --load_pruned_model run/pruned/celeba_ddpm_$1_0.3.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_church_ddpm.sh: -------------------------------------------------------------------------------- 1 | python -B -m torch.distributed.launch --nproc_per_node=4 --master_port 22223 --use_env finetune.py \ 2 | --config church.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune/church_ddpm_$1_0.3_finetuned \ 7 | --doc post_training \ 8 | --skip_type uniform \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --load_pruned_model run/pruned/church_ddpm_$1_0.3.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune_v3/cifar10_ddpm_$1_finetuned_0.05T.pth \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --load_pruned_model run/pruned_v5/cifar10_pruned_$1.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm_kd.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune_v2/cifar10_ddpm_$1_0.3_finetuned_kd \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --kd \ 13 | --load_pruned_model run/pruned/cifar10_pruned_$1_0.3.pth \ 
-------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm_random.sh: -------------------------------------------------------------------------------- 1 | python -B -m torch.distributed.launch --nproc_per_node=2 --master_port 22223 --use_env finetune.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune/cifar10_pruned_random_0.3_finetuned\ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner random \ 13 | --load_pruned_model run/pruned/cifar10_pruned_random_0.3.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm_taylor.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --exp run/finetune/cifar10_pruned_taylor_0.3_real_x_finetuned \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner taylor \ 13 | --load_pruned_model run/pruned/cifar10_pruned_taylor_0.3_real_x.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_bedroom_sample_pratrained.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config bedroom.yml \ 3 | --exp run/ddim_bedroom_official \ 4 | --sample \ 5 | --use_pretrained \ 6 | --timesteps 50 \ 7 | --eta 0 \ 8 | --ni \ 9 | --doc 50steps_quad \ 10 | --skip_type quad \ 11 | --pruning_ratio 0.0 \ 12 | --fid \ 13 | --use_ema -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_celeba_pruning_scratch.sh: 
-------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config celeba.yml \ 3 | --exp run/ddim_celeba_pruning_reinit \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner reinit \ 13 | --taylor_batch_size 96 \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_celeba_pruning_taylor.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config celeba.yml \ 3 | --exp run/ddim_celeba_pruning_taylor \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner taylor \ 13 | --taylor_batch_size 96 \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_celeba_sample_pratrained.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config celeba.yml \ 3 | --exp run/ddim_celeba_official \ 4 | --sample \ 5 | --use_pretrained \ 6 | --timesteps 100 \ 7 | --eta 0 \ 8 | --ni \ 9 | --doc 100steps_quad \ 10 | --skip_type quad \ 11 | --pruning_ratio 0.0 \ 12 | --fid \ 13 | --use_ema \ 14 | --restore_from run/cache/diffusion_models_converted/celeba/ckpt.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_church_pruning_taylor.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config church.yml \ 3 | --exp run/ddim_church_pruning_taylor \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 
| --pruner random \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_first_order_taylor.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_first_order_taylor \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner first_order_taylor \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_magnitude.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_magnitude \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner magnitude \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_random.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_random \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner random \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_random_kd.sh: -------------------------------------------------------------------------------- 1 | python -B prune_kd.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_random \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | 
--skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner random \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_scratch.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_reinit \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner reinit \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_second_order_taylor.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_second_order_taylor \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner second_order_taylor \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_taylor.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_pruning_taylor \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner taylor \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_taylor_kd.sh: -------------------------------------------------------------------------------- 1 | python -B prune_kd.py \ 2 | --config cifar10.yml \ 3 | --exp 
run/ddim_cifar10_pruning_taylor_kd \ 4 | --timesteps 100 \ 5 | --eta 0 \ 6 | --ni \ 7 | --doc post_training \ 8 | --skip_type quad \ 9 | --pruning_ratio 0.3 \ 10 | --use_ema \ 11 | --use_pretrained \ 12 | --pruner taylor \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/old/run_cifar_train.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --exp run/ddim_cifar10_train_v2 \ 4 | --use_pretrained \ 5 | --timesteps 100 \ 6 | --eta 0 \ 7 | --ni \ 8 | --doc post_training_with_0.2_pruning_ratio_v2 \ 9 | --skip_type quad \ 10 | --pruning_ratio 0.2 \ 11 | --use_ema \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_bedroom_ddpm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Execute the Python script with the provided arguments 4 | python -B prune.py \ 5 | --config "bedroom.yml" \ 6 | --timesteps "100" \ 7 | --eta "0" \ 8 | --ni \ 9 | --doc "post_training" \ 10 | --skip_type "quad" \ 11 | --pruning_ratio "0.3" \ 12 | --use_ema \ 13 | --use_pretrained \ 14 | --pruner "$1" \ 15 | --save_pruned_model "run/pruned/bedroom_ddpm_$1_0.3.pth" \ 16 | --taylor_batch_size "4" -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_bedroom_ddpm_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Execute the Python script with the provided arguments 4 | python -B prune_test.py \ 5 | --config "bedroom.yml" \ 6 | --timesteps "100" \ 7 | --eta "0" \ 8 | --ni \ 9 | --doc "post_training" \ 10 | --skip_type "quad" \ 11 | --pruning_ratio "0.05" \ 12 | --use_ema \ 13 | --use_pretrained \ 14 | --pruner "$1" \ 15 | --save_pruned_model "run/pruned_test/bedroom_ddpm_$1.pth" \ 16 | 
--taylor_batch_size "4" -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_celeba_ddpm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Execute the Python script with the provided arguments 4 | python -B prune.py \ 5 | --config "celeba.yml" \ 6 | --timesteps "100" \ 7 | --eta "0" \ 8 | --ni \ 9 | --doc "post_training" \ 10 | --skip_type "quad" \ 11 | --pruning_ratio "0.3" \ 12 | --use_ema \ 13 | --use_pretrained \ 14 | --pruner "ours" \ 15 | --save_pruned_model "run/pruned_final/celeba_T=$1.pth" \ 16 | --taylor_batch_size "64" \ 17 | --thr "$1" -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_celeba_ddpm_ssim.sh: -------------------------------------------------------------------------------- 1 | python -B prune_ssim.py \ 2 | --config celeba.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --doc post_training \ 7 | --skip_type quad \ 8 | --pruning_ratio 0.15 \ 9 | --use_ema \ 10 | --use_pretrained \ 11 | --stage $1 \ 12 | --pruner "ours" \ 13 | --save_pruned_model run/pruned_v4/celeba_pruned.pth \ 14 | --taylor_batch_size 64 -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_church_ddpm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Execute the Python script with the provided arguments 4 | python -B prune.py \ 5 | --config "church.yml" \ 6 | --timesteps "100" \ 7 | --eta "0" \ 8 | --ni \ 9 | --doc "post_training" \ 10 | --skip_type "quad" \ 11 | --pruning_ratio "0.3" \ 12 | --use_ema \ 13 | --use_pretrained \ 14 | --pruner "$1" \ 15 | --save_pruned_model "run/pruned/church_ddpm_$1_0.3.pth" \ 16 | --taylor_batch_size "2" -------------------------------------------------------------------------------- 
/Diff-Pruning/exp_code/scripts/prune_church_ddpm_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Execute the Python script with the provided arguments 4 | python -B prune_test.py \ 5 | --config "church.yml" \ 6 | --timesteps "100" \ 7 | --eta "0" \ 8 | --ni \ 9 | --doc "post_training" \ 10 | --skip_type "quad" \ 11 | --pruning_ratio "0.05" \ 12 | --use_ema \ 13 | --use_pretrained \ 14 | --pruner "$1" \ 15 | --save_pruned_model "run/pruned_test/church_ddpm_$1.pth" \ 16 | --taylor_batch_size "4" -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_cifar_ddpm.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --doc post_training \ 7 | --skip_type quad \ 8 | --pruning_ratio 0.3 \ 9 | --use_ema \ 10 | --use_pretrained \ 11 | --pruner "$1" \ 12 | --save_pruned_model run/pruned_v5/cifar10_pruned_$1_$2.pth \ 13 | --thr $2 \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_cifar_ddpm_ssim.sh: -------------------------------------------------------------------------------- 1 | python -B prune_ssim.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --doc post_training \ 7 | --skip_type quad \ 8 | --pruning_ratio 0.2 \ 9 | --use_ema \ 10 | --use_pretrained \ 11 | --stage $1 \ 12 | --pruner "ours" \ 13 | --save_pruned_model run/pruned_v4/cifar10_pruned.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/prune_cifar_ddpm_test.sh: -------------------------------------------------------------------------------- 1 | python -B prune_test.py \ 2 | --config cifar10.yml \ 3 | --timesteps 100 \ 4 | --eta 0 \ 5 | --ni \ 6 | --doc post_training \ 7 | 
--skip_type quad \ 8 | --pruning_ratio 0.3 \ 9 | --use_ema \ 10 | --use_pretrained \ 11 | --pruner "$1" \ 12 | --save_pruned_model run/pruned_test/cifar10_pruned_$1_0.2.pth \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/run_celeba.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Execute the Python script with the provided arguments 4 | python -B prune.py \ 5 | --config "celeba.yml" \ 6 | --timesteps "100" \ 7 | --eta "0" \ 8 | --ni \ 9 | --doc "post_training" \ 10 | --skip_type "quad" \ 11 | --pruning_ratio "0.3" \ 12 | --use_ema \ 13 | --use_pretrained \ 14 | --pruner "ours" \ 15 | --save_pruned_model "run/pruned_final/celeba_T=$1.pth" \ 16 | --taylor_batch_size "64" \ 17 | --thr "$1" 18 | 19 | python -B finetune.py \ 20 | --config celeba.yml \ 21 | --timesteps 100 \ 22 | --eta 0 \ 23 | --ni \ 24 | --exp run/finetune_final/celeba_T=$1_finetuned \ 25 | --doc post_training \ 26 | --skip_type uniform \ 27 | --pruning_ratio 0.3 \ 28 | --use_ema \ 29 | --use_pretrained \ 30 | --load_pruned_model "run/pruned_final/celeba_T=$1.pth" \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/sample_bedroom_ddpm_pretrained.sh: -------------------------------------------------------------------------------- 1 | python -B -m torch.distributed.launch --nproc_per_node=1 --master_port 22200 --use_env finetune.py \ 2 | --config bedroom.yml \ 3 | --exp $1 \ 4 | --sample \ 5 | --timesteps 100 \ 6 | --eta 0 \ 7 | --ni \ 8 | --doc sample \ 9 | --skip_type uniform \ 10 | --pruning_ratio 0.0 \ 11 | --fid \ 12 | --use_ema \ 13 | --use_pretrained \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/sample_bedroom_ddpm_pruning.sh: -------------------------------------------------------------------------------- 1 | python -B -m 
torch.distributed.launch --nproc_per_node=4 --master_port 22223 --use_env finetune.py \ 2 | --config bedroom.yml \ 3 | --exp $2 \ 4 | --sample \ 5 | --timesteps 100 \ 6 | --eta 0 \ 7 | --ni \ 8 | --doc sample \ 9 | --skip_type uniform \ 10 | --pruning_ratio 0.0 \ 11 | --fid \ 12 | --use_ema \ 13 | --restore_from $1 \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/sample_celeba_ddpm_pruning.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config celeba.yml \ 3 | --exp $2 \ 4 | --sample \ 5 | --timesteps 100 \ 6 | --eta 0 \ 7 | --ni \ 8 | --doc sample \ 9 | --skip_type uniform \ 10 | --pruning_ratio 0.0 \ 11 | --fid \ 12 | --use_ema \ 13 | --restore_from $1 \ -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/sample_celeba_pretrained.sh: -------------------------------------------------------------------------------- 1 | python -B prune.py \ 2 | --config celeba.yml \ 3 | --exp run/sample/ddim_celeba_official \ 4 | --sample \ 5 | --use_pretrained \ 6 | --timesteps 100 \ 7 | --eta 0 \ 8 | --ni \ 9 | --doc official \ 10 | --skip_type uniform \ 11 | --pruning_ratio 0.0 \ 12 | --fid \ 13 | --use_ema -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/scripts/sample_church_ddpm_pruning.sh: -------------------------------------------------------------------------------- 1 | python -B finetune.py \ 2 | --config church.yml \ 3 | --exp run/sample/church_ddpm_350k \ 4 | --sample \ 5 | --timesteps 100 \ 6 | --eta 0 \ 7 | --ni \ 8 | --doc sample \ 9 | --skip_type uniform \ 10 | --pruning_ratio 0.0 \ 11 | --fid \ 12 | --use_ema \ 13 | --restore_from run/finetune_v2/church_pruned_taylor_0.3_finetuned/logs/post_training/ckpt_350000.pth \ -------------------------------------------------------------------------------- 
#!/bin/bash
# Launch finetune.py in --sample mode on the church config through
# torch.distributed.launch with 4 processes on one node.
#
# Usage: sample_church_ddpm_pruning_old.sh CKPT EXP
#   CKPT  forwarded to --restore_from
#   EXP   forwarded to --exp

ckpt="${1:?usage: $0 CKPT EXP}"
exp="${2:?usage: $0 CKPT EXP}"

# Paths are quoted to survive word splitting; the trailing "\" after the last
# option was removed.
python -B -m torch.distributed.launch --nproc_per_node=4 --master_port 22221 --use_env finetune.py \
    --config church.yml \
    --exp "$exp" \
    --sample \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --doc sample \
    --skip_type uniform \
    --pruning_ratio 0.0 \
    --fid \
    --use_ema \
    --restore_from "$ckpt"
#!/bin/bash
# Run finetune.py in --sample mode on the CIFAR-10 config with
# --depth_method kim24layer.
#
# Usage: sample_cifar_ddpm_kim24layer.sh CKPT DEPTH_PATH EXP
#   CKPT        forwarded to --restore_from
#   DEPTH_PATH  forwarded to --depth_path
#   EXP         forwarded to --exp

usage="usage: $0 CKPT DEPTH_PATH EXP"
ckpt="${1:?$usage}"
depth_path="${2:?$usage}"
exp="${3:?$usage}"

# --depth_path is now quoted like the other two path arguments, and the
# dangling "\" after the last option is gone.
python -B finetune.py \
    --config cifar10.yml \
    --exp "$exp" \
    --sample \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --doc sample \
    --skip_type quad \
    --pruning_ratio 0.0 \
    --fid \
    --use_ema \
    --depth_method kim24layer \
    --depth_path "$depth_path" \
    --restore_from "$ckpt"
#!/bin/bash
# Run finetune.py in --sample mode on the CIFAR-10 config with
# --depth_method kim24layer.
# NOTE(review): the option list is identical to sample_cifar_ddpm_kim24layer.sh;
# presumably only the checkpoint passed as CKPT differs -- confirm.
#
# Usage: sample_cifar_from_pruned_ddpm_kim24layer.sh CKPT DEPTH_PATH EXP
#   CKPT        forwarded to --restore_from
#   DEPTH_PATH  forwarded to --depth_path
#   EXP         forwarded to --exp

usage="usage: $0 CKPT DEPTH_PATH EXP"
ckpt="${1:?$usage}"
depth_path="${2:?$usage}"
exp="${3:?$usage}"

# All three paths are quoted; the dangling "\" after the last option is gone.
python -B finetune.py \
    --config cifar10.yml \
    --exp "$exp" \
    --sample \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --doc sample \
    --skip_type quad \
    --pruning_ratio 0.0 \
    --fid \
    --use_ema \
    --depth_method kim24layer \
    --depth_path "$depth_path" \
    --restore_from "$ckpt"
#!/bin/bash
# Run finetune_simple.py on the CIFAR-10 config with --depth_method
# kim23efficient and --from_pruned, restoring weights from a checkpoint.
#
# Usage: simple_cifar_from_pruned_kim23efficient.sh THR DEPTH_PATH PREFIX CKPT
#   THR         forwarded to --thr
#   DEPTH_PATH  forwarded to --depth_path; also the tail of the --exp path
#   PREFIX      text placed before DEPTH_PATH in the --exp path (may be empty)
#   CKPT        forwarded to --restore_from

usage="usage: $0 THR DEPTH_PATH PREFIX CKPT"
thr="${1:?$usage}"
depth_path="${2:?$usage}"
prefix="${3-}"
ckpt="${4:?$usage}"

# Every expansion is quoted so paths with spaces are passed as one argument.
python -B finetune_simple.py \
    --config cifar10.yml \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --exp "run/finetune_simple_v2/${prefix}${depth_path}" \
    --doc post_training \
    --skip_type quad \
    --use_ema \
    --restore_from "$ckpt" \
    --depth_method kim23efficient \
    --depth_path "$depth_path" \
    --thr "$thr" \
    --lr 0.0004 \
    --from_pruned
#!/bin/bash
# Run finetune_simple.py on the CIFAR-10 config with --use_pretrained and
# --depth_method kim23efficient.
#
# Usage: simple_cifar_kim23efficient.sh THR DEPTH_PATH [PREFIX]
#   THR         forwarded to --thr
#   DEPTH_PATH  forwarded to --depth_path; also the tail of the --exp path
#   PREFIX      optional text placed before DEPTH_PATH in the --exp path

usage="usage: $0 THR DEPTH_PATH [PREFIX]"
thr="${1:?$usage}"
depth_path="${2:?$usage}"
prefix="${3-}"

# Expansions are quoted, and the dangling "\" after --lr has been removed.
python -B finetune_simple.py \
    --config cifar10.yml \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --exp "run/finetune_simple_v2/${prefix}${depth_path}" \
    --doc post_training \
    --skip_type quad \
    --use_ema \
    --use_pretrained \
    --depth_method kim23efficient \
    --depth_path "$depth_path" \
    --thr "$thr" \
    --lr 0.0004
#!/bin/bash
# Run finetune_simple.py on the CIFAR-10 config with --depth_method kim24layer,
# taking the learning rate and beta1 from the command line.
#
# Usage: simple_cifar_kim24layer_hp.sh THR DEPTH_PATH LR BETA1
#   THR         forwarded to --thr
#   DEPTH_PATH  forwarded to --depth_path; also the tail of the --exp path
#   LR          forwarded to --lr; also embedded in the --exp path
#   BETA1       forwarded to --beta1; also embedded in the --exp path

usage="usage: $0 THR DEPTH_PATH LR BETA1"
thr="${1:?$usage}"
depth_path="${2:?$usage}"
lr="${3:?$usage}"
beta1="${4:?$usage}"

# Expansions are quoted, and the dangling "\" after --beta1 has been removed.
python -B finetune_simple.py \
    --config cifar10.yml \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --exp "run/finetune_simple_v2/lr${lr}_beta${beta1}_${depth_path}" \
    --doc post_training \
    --skip_type quad \
    --use_ema \
    --use_pretrained \
    --depth_method kim24layer \
    --depth_path "$depth_path" \
    --thr "$thr" \
    --lr "$lr" \
    --beta1 "$beta1"
#!/bin/bash
# Run finetune_simple.py on the CIFAR-10 config with the "ours" pruner, taking
# the threshold and pruning ratio from the command line.
#
# Usage: simple_rat_cifar_our.sh THR RATIO
#   THR    forwarded to --thr; also embedded in the --exp path
#   RATIO  forwarded to --pruning_ratio; also embedded in the --exp path

usage="usage: $0 THR RATIO"
thr="${1:?$usage}"
ratio="${2:?$usage}"

# Expansions are quoted so each value reaches the program as one argument.
python -B finetune_simple.py \
    --config cifar10.yml \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --exp "run/finetune_simple_v2/cifar10_ours/T=${thr}_rat=${ratio}.pth" \
    --doc post_training \
    --skip_type quad \
    --pruning_ratio "$ratio" \
    --use_ema \
    --use_pretrained \
    --thr "$thr" \
    --pruner ours \
    --lr 0.0004
#!/bin/bash
# Run finetune.py in --measure mode on the CIFAR-10 config with
# --depth_method kim24layer.
#
# Usage: time_cifar_ddpm_kim24layer.sh CKPT DEPTH_PATH EXP
#   CKPT        forwarded to --restore_from
#   DEPTH_PATH  forwarded to --depth_path
#   EXP         forwarded to --exp

usage="usage: $0 CKPT DEPTH_PATH EXP"
ckpt="${1:?$usage}"
depth_path="${2:?$usage}"
exp="${3:?$usage}"

# --depth_path is now quoted like the other two path arguments, and the
# dangling "\" after the last option is gone.
python -B finetune.py \
    --config cifar10.yml \
    --exp "$exp" \
    --measure \
    --timesteps 100 \
    --eta 0 \
    --ni \
    --doc sample \
    --skip_type quad \
    --pruning_ratio 0.0 \
    --fid \
    --use_ema \
    --depth_method kim24layer \
    --depth_path "$depth_path" \
    --restore_from "$ckpt"
import torch

_WRAPPER_PREFIX = "module."


def strip_module_prefix(state_dict, prefix=_WRAPPER_PREFIX):
    """Return a copy of ``state_dict`` with a leading ``prefix`` removed from keys.

    Only a prefix at the very start of a key is dropped.  The previous
    ``str.replace`` call removed every occurrence, which mangled names that
    merely contain the text, e.g. ``"module.submodule.w"`` became ``"subw"``.

    Args:
        state_dict: Mapping from parameter name to value.
        prefix: Text stripped from the start of each key when present.

    Returns:
        A new dict with the same values; the input mapping is not modified.
    """
    cut = len(prefix)
    return {
        (name[cut:] if name.startswith(prefix) else name): value
        for name, value in state_dict.items()
    }


def main(src="model.ckpt.old", dst="model.ckpt"):
    """Rewrite the first entry of the checkpoint at ``src`` and save it to ``dst``."""
    state = torch.load(src)
    # Element 0 of the loaded object is the name -> value mapping being renamed.
    print(state[0].keys())
    state[0] = strip_module_prefix(state[0])
    print(state[0].keys())
    torch.save(state, dst)


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- 1 | from .dependency import * 2 | from .pruner import * 3 | from . import _helpers, utils, importance -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/torch_pruning/pruner/__init__.py: -------------------------------------------------------------------------------- 1 | from .function import * 2 | from .algorithms import * -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/torch_pruning/pruner/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .metapruner import MetaPruner 2 | from .magnitude_based_pruner import MagnitudePruner 3 | from .batchnorm_scale_pruner import BNScalePruner 4 | from .group_norm_pruner import GroupNormPruner 5 | from .scaling_factor_pruner import ScalingFactorPruner 6 | from .taylor_pruner import TaylorPruner -------------------------------------------------------------------------------- /Diff-Pruning/exp_code/torch_pruning/pruner/algorithms/batchnorm_scale_pruner.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Callable 3 | from .metapruner import MetaPruner 4 | from .scheduler import linear_scheduler 5 | import torch 6 | import torch.nn as nn 7 | 8 | class BNScalePruner(MetaPruner): 9 | def __init__( 10 | self, 11 | model, 12 | example_inputs, 13 | importance, 14 | reg=1e-5, 15 | iterative_steps=1, 16 | iterative_sparsity_scheduler: Callable = linear_scheduler, 17 | ch_sparsity=0.5, 18 | ch_sparsity_dict=None, 19 | global_pruning=False, 20 | max_ch_sparsity=1.0, 21 | round_to=None, 22 | ignored_layers=None, 23 | customized_pruners=None, 24 | unwrapped_parameters=None, 25 | output_transform=None, 26 | ): 27 | super(BNScalePruner, self).__init__( 28 | model=model, 29 | example_inputs=example_inputs, 30 | 
class UnlabeledImageFolder(torch.utils.data.Dataset):
    """Label-free image dataset built from every matching file under ``root``.

    Args:
        root: Directory that is searched recursively.
        transform: Optional callable applied to each loaded image.
        exts: Extensions to collect.  An entry may be spelled ``"jpg"``,
            ``".jpg"`` or ``"*.jpg"``; all three select the same files.
    """

    def __init__(self, root, transform=None, exts=("*.jpg", "*.png", "*.jpeg", "*.webp")):
        self.root = root
        self.transform = transform
        self.files = []
        for ext in exts:
            # The defaults already carry a "*." prefix; formatting them into
            # "**/*.{ext}" used to yield "**/*.*.jpg", which matches only names
            # with two dots.  Normalise to the bare extension first.
            bare = ext.lstrip("*.")
            pattern = os.path.join(root, "**", "*." + bare)
            self.files.extend(glob(pattern, recursive=True))
        # glob returns entries in arbitrary order; sort so that an index maps
        # to the same file on every run.
        self.files.sort()

    def __len__(self):
        """Return the number of image files that were found."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load file ``idx`` as an RGB image and apply ``transform`` if one is set."""
        path = self.files[idx]
        img = Image.open(path).convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        return img

import torch

def set_dropout(model, p):
    """Set the drop probability of every ``torch.nn.Dropout`` in ``model`` to ``p``."""
    for m in model.modules():
        if isinstance(m, torch.nn.Dropout):
            m.p = p
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Efficient-CNN-Depth-Compression/asset/icml23.yml: -------------------------------------------------------------------------------- 1 | name: icml23 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=4.5=1_gnu 8 | - accimage=0.2.0=py37h37b52e9_2 9 | - ca-certificates=2022.9.24=ha878542_0 10 | - certifi=2022.9.24=pyhd8ed1ab_0 11 | - cudatoolkit=11.3.1=h2bc3f7f_2 12 | - intel-ipp=2019.1.144=h711154d_3 13 | - ld_impl_linux-64=2.35.1=h7274673_9 14 | - libffi=3.3=he6710b0_2 15 | - libgcc-ng=9.3.0=h5101ec6_17 16 | - libgomp=9.3.0=h5101ec6_17 17 | - libjpeg-turbo=2.1.0=h7f98852_0 18 | - libstdcxx-ng=9.3.0=hd4cf53a_17 19 | - ncurses=6.3=h7f8727e_2 20 | - openssl=1.1.1k=h7f98852_0 21 | - pip=21.2.2=py37h06a4308_0 22 | - python=3.7.11=h12debd9_0 23 | - python_abi=3.7=2_cp37m 24 | - readline=8.1.2=h7f8727e_1 25 | - setuptools=58.0.4=py37h06a4308_0 26 | - sqlite=3.37.0=hc218d9a_0 27 | - tk=8.6.11=h1ccaba5_0 28 | - wheel=0.37.1=pyhd3eb1b0_0 29 | - xz=5.2.5=h7b6447c_0 30 | - zlib=1.2.11=h7f8727e_4 31 | -------------------------------------------------------------------------------- /Efficient-CNN-Depth-Compression/asset/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu113 2 | accimage==0.2.0 3 | colorama==0.4.5 4 | einops==0.4.1 5 | 
import os
import sys

sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."))

import argparse
import re

import pandas as pd

# Name of the aggregated file written into the scanned directory.
OUTPUT_NAME = "importance.csv"

parser = argparse.ArgumentParser(description="Inference Time with TensorRT")
parser.add_argument(
    "-d",
    "--dir",
    type=str,
    help="directory name",
)
parser.add_argument(
    "-n",
    "--num",
    type=int,
    help="the number of blks",
)


def natural_key(string_):
    """Return a sort key that orders embedded numbers by value.

    The string is split on runs of decimal digits; digit runs become ``int``
    and the text between them stays ``str``, so ``"b2"`` sorts before
    ``"b10"``.

    ``re.split`` with one capture group always puts the captured digit runs
    at the odd indices.  Using the index instead of ``str.isdigit`` avoids
    calling ``int`` on digit-like characters such as ``"²"``, which
    ``isdigit`` accepts but ``int`` rejects.
    """
    pieces = re.split(r"(\d+)", string_)
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(pieces)]


def _iter_csv_paths(root, exclude=None):
    """Yield the ``.csv`` files under ``root`` in natural order, skipping ``exclude``."""
    skip = None if exclude is None else os.path.normpath(exclude)
    for currentpath, folders, files in os.walk(root):
        # Sorting in place makes os.walk descend in a deterministic order.
        folders.sort(key=natural_key)
        for f in sorted(files, key=natural_key):
            path = os.path.join(currentpath, f)
            if f.endswith(".csv") and os.path.normpath(path) != skip:
                yield path


if __name__ == "__main__":
    args = parser.parse_args()
    out_path = os.path.join(args.dir, OUTPUT_NAME)

    # The output of an earlier run lives in the scanned directory; leaving it
    # out keeps a rerun from counting every row twice.
    frames = []
    for path in _iter_csv_paths(args.dir, exclude=out_path):
        print(os.path.basename(path))
        frames.append(pd.read_csv(path))

    # Concatenate once instead of growing a frame inside the loop.
    res = pd.concat(frames) if frames else pd.DataFrame()
    print(len(res))

    # An explicit check replaces the assert, which disappears under ``python -O``.
    if args.num is not None and len(res) != args.num:
        sys.exit(f"expected {args.num} rows but aggregated {len(res)}")
    res.to_csv(out_path)
class NaiveFeed(nn.Module):
    """Plain wrapper that runs the layers of ``odict`` one after another."""

    def __init__(self, odict: OrderedDict) -> None:
        super().__init__()
        self.md = nn.Sequential(odict)

    def forward(self, x):
        result = self.md(x)
        return result


class SkipFeed(nn.Module):
    """Residual wrapper: ``last(md(x) + x)``.

    ``last`` is a module class (not an instance); it is instantiated with no
    arguments and applied after the addition.
    """

    def __init__(self, odict: OrderedDict, last=nn.Identity) -> None:
        super().__init__()
        self.md = nn.Sequential(odict)
        self.last = last()

    def forward(self, x):
        summed = self.md(x) + x
        return self.last(summed)


class Downsample(nn.Module):
    """Parameter-free shortcut that resizes spatially and doubles the channels.

    The input is resized to ``(W // 2, W // 2)`` where ``W`` is the size of
    dimension 3, then zero-valued channels are placed before and after it so
    the output has twice as many channels as the input.
    """

    def __init__(self, planes) -> None:
        super().__init__()
        # Stored only; forward() does not read it.
        self.planes = planes

    def forward(self, x):
        half_size = x.shape[3] // 2
        half_channels = x.shape[1] // 2
        shrunk = F.interpolate(x, size=(half_size, half_size))
        # Multiplying by zero keeps dtype, device and shape of the resized map.
        blank = shrunk.mul(0)
        leading = blank[:, :half_channels, :, :]
        trailing = blank[:, half_channels:, :, :]
        return torch.cat((leading, shrunk, trailing), 1)


class SkipFeedDown(nn.Module):
    """Residual wrapper with a transformed shortcut: ``last(md(x) + downsample(x))``.

    ``last`` is a module class that gets instantiated here, whereas
    ``downsample`` is an already constructed module used as given.
    """

    def __init__(
        self, odict: OrderedDict, last=nn.Identity, downsample=nn.Identity()
    ) -> None:
        super().__init__()
        self.md = nn.Sequential(odict)
        self.last = last()
        self.downsample = downsample

    def forward(self, x):
        shortcut = self.downsample(x)
        summed = self.md(x) + shortcut
        return self.last(summed)
24 | 22,9,12,105.64399105072022,0.09381249297199233 25 | 23,10,11,7.412527368068695,0.0435099201397835 26 | 24,10,12,12.601713118553162,0.07170721415827588 27 | 25,11,12,7.415155837535858,0.04905487540587592 28 | 26,12,13,7.403466064929962,0.04160782811405336 29 | 27,12,14,12.585364966392516,0.07354652251636305 30 | 28,12,15,105.63326538085937,0.09045596020779985 31 | 29,12,16,179.90253311157227,0.3572176476267134 32 | 30,13,14,2.4252339148521425,0.03501485338327998 33 | 31,13,15,3.372817919254303,0.018959956059685847 34 | 32,13,16,29.415552349090575,0.22555944676716505 35 | 33,14,15,2.4251503944396973,0.0346148784304925 36 | 34,14,16,3.3729427111148835,0.021966171804449645 37 | 35,15,16,2.422551679611206,0.025841686677925104 38 | -------------------------------------------------------------------------------- /Efficient-CNN-Depth-Compression/utils/txt/class100.txt: -------------------------------------------------------------------------------- 1 | n02869837 2 | n01749939 3 | n02488291 4 | n02107142 5 | n13037406 6 | n02091831 7 | n04517823 8 | n04589890 9 | n03062245 10 | n01773797 11 | n01735189 12 | n07831146 13 | n07753275 14 | n03085013 15 | n04485082 16 | n02105505 17 | n01983481 18 | n02788148 19 | n03530642 20 | n04435653 21 | n02086910 22 | n02859443 23 | n13040303 24 | n03594734 25 | n02085620 26 | n02099849 27 | n01558993 28 | n04493381 29 | n02109047 30 | n04111531 31 | n02877765 32 | n04429376 33 | n02009229 34 | n01978455 35 | n02106550 36 | n01820546 37 | n01692333 38 | n07714571 39 | n02974003 40 | n02114855 41 | n03785016 42 | n03764736 43 | n03775546 44 | n02087046 45 | n07836838 46 | n04099969 47 | n04592741 48 | n03891251 49 | n02701002 50 | n03379051 51 | n02259212 52 | n07715103 53 | n03947888 54 | n04026417 55 | n02326432 56 | n03637318 57 | n01980166 58 | n02113799 59 | n02086240 60 | n03903868 61 | n02483362 62 | n04127249 63 | n02089973 64 | n03017168 65 | n02093428 66 | n02804414 67 | n02396427 68 | n04418357 69 | n02172182 70 | 
n01729322 71 | n02113978 72 | n03787032 73 | n02089867 74 | n02119022 75 | n03777754 76 | n04238763 77 | n02231487 78 | n03032252 79 | n02138441 80 | n02104029 81 | n03837869 82 | n03494278 83 | n04136333 84 | n03794056 85 | n03492542 86 | n02018207 87 | n04067472 88 | n03930630 89 | n03584829 90 | n02123045 91 | n04229816 92 | n02100583 93 | n03642806 94 | n04336792 95 | n03259280 96 | n02116738 97 | n02108089 98 | n03424325 99 | n01855672 100 | n02090622 101 | -------------------------------------------------------------------------------- /HALP/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:19.12-py3 2 | ARG DEBIAN_FRONTEND=noninteractive 3 | RUN apt-get update && apt-get -y install sudo dialog apt-utils && rm -rf /var/lib/apt/lists/* 4 | USER root 5 | 6 | # Install some basic utilities 7 | RUN apt-get update && apt-get install -y \ 8 | curl \ 9 | ca-certificates \ 10 | sudo \ 11 | unzip \ 12 | htop \ 13 | wget \ 14 | git \ 15 | bzip2 \ 16 | libx11-6 \ 17 | && rm -rf /var/lib/apt/lists/* 18 | 19 | RUN mkdir -p /workspace/ 20 | ENV HOME=/workspace/ 21 | RUN chmod 777 /workspace/ 22 | 23 | RUN pip install easydict 24 | RUN pip install opencv-python 25 | 26 | RUN pip install tensorboardX 27 | CMD ["python3"] 28 | 29 | WORKDIR /workspace/ 30 | USER root 31 | 32 | -------------------------------------------------------------------------------- /HALP/apex/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve apex 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the Bug** 11 | 12 | **Minimal Steps/Code to Reproduce the Bug** 13 | 18 | 19 | **Expected Behavior** 20 | 21 | 22 | **Environment** 23 | 24 | -------------------------------------------------------------------------------- /HALP/apex/.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "apex/contrib/csrc/multihead_attn/cutlass"] 2 | path = apex/contrib/csrc/multihead_attn/cutlass 3 | url = https://github.com/NVIDIA/cutlass.git 4 | branch = v1.2.0 5 | [submodule "apex/contrib/csrc/cudnn-frontend"] 6 | path = apex/contrib/csrc/cudnn-frontend 7 | url = https://github.com/NVIDIA/cudnn-frontend.git 8 | -------------------------------------------------------------------------------- /HALP/apex/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/.nojekyll -------------------------------------------------------------------------------- /HALP/apex/LICENSE: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /HALP/apex/apex/RNN/README.md: -------------------------------------------------------------------------------- 1 | **This module will be removed by the end of February 2023** 2 | 3 | Under construction... 4 | -------------------------------------------------------------------------------- /HALP/apex/apex/RNN/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM 2 | 3 | __all__ = ['models'] 4 | -------------------------------------------------------------------------------- /HALP/apex/apex/_autocast_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | import torch 4 | 5 | 6 | __all__ = ["_cast_if_autocast_enabled"] 7 | 8 | 9 | def _get_autocast_dtypes() -> Sequence[torch.dtype]: 10 | if torch.cuda.is_bf16_supported(): 11 | return [torch.half, torch.bfloat16] 12 | return [torch.half] 13 | 14 | 15 | def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype: 16 | if not torch.is_autocast_enabled(): 17 | return torch.float or dtype 18 | else: 19 | return torch.get_autocast_gpu_dtype() 20 | 21 | 22 | def _cast_if_autocast_enabled(*args): 23 | if not torch.is_autocast_enabled(): 24 | return args 25 | else: 26 | return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype()) 27 
| -------------------------------------------------------------------------------- /HALP/apex/apex/amp/__init__.py: -------------------------------------------------------------------------------- 1 | from .amp import init, half_function, float_function, promote_function,\ 2 | register_half_function, register_float_function, register_promote_function 3 | from .handle import scale_loss, disable_casts 4 | from .frontend import initialize, state_dict, load_state_dict 5 | from ._amp_state import master_params, _amp_state 6 | -------------------------------------------------------------------------------- /HALP/apex/apex/amp/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 1, 0) 2 | __version__ = '.'.join(map(str, VERSION)) 3 | -------------------------------------------------------------------------------- /HALP/apex/apex/amp/compat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # True for post-0.4, when Variables/Tensors merged. 4 | def variable_is_tensor(): 5 | v = torch.autograd.Variable() 6 | return isinstance(v, torch.Tensor) 7 | 8 | def tensor_is_variable(): 9 | x = torch.Tensor() 10 | return type(x) == torch.autograd.Variable 11 | 12 | # False for post-0.4 13 | def tensor_is_float_tensor(): 14 | x = torch.Tensor() 15 | return type(x) == torch.FloatTensor 16 | 17 | # Akin to `torch.is_tensor`, but returns True for Variable 18 | # objects in pre-0.4. 19 | def is_tensor_like(x): 20 | return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable) 21 | 22 | # Wraps `torch.is_floating_point` if present, otherwise checks 23 | # the suffix of `x.type()`. 
24 | def is_floating_point(x): 25 | if hasattr(torch, 'is_floating_point'): 26 | return torch.is_floating_point(x) 27 | try: 28 | torch_type = x.type() 29 | return torch_type.endswith('FloatTensor') or \ 30 | torch_type.endswith('HalfTensor') or \ 31 | torch_type.endswith('DoubleTensor') 32 | except AttributeError: 33 | return False 34 | 35 | def scalar_python_val(x): 36 | if hasattr(x, 'item'): 37 | return x.item() 38 | else: 39 | if isinstance(x, torch.autograd.Variable): 40 | return x.data[0] 41 | else: 42 | return x[0] 43 | 44 | # Accounts for the possibility that some ops may be removed from a namespace. 45 | def filter_attrs(module, attrs): 46 | return list(attrname for attrname in attrs if hasattr(module, attrname)) 47 | -------------------------------------------------------------------------------- /HALP/apex/apex/amp/lists/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/amp/lists/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/amp/lists/tensor_overrides.py: -------------------------------------------------------------------------------- 1 | from .. import compat 2 | from . 
import torch_overrides 3 | 4 | import importlib 5 | 6 | import torch 7 | 8 | # if compat.variable_is_tensor() and not compat.tensor_is_variable(): 9 | MODULE = torch.Tensor 10 | # else: 11 | # MODULE = torch.autograd.Variable 12 | 13 | 14 | FP16_FUNCS = compat.filter_attrs(MODULE, [ 15 | '__matmul__', 16 | ]) 17 | 18 | FP32_FUNCS = compat.filter_attrs(MODULE, [ 19 | '__ipow__', 20 | '__pow__', 21 | '__rpow__', 22 | 23 | # Cast to fp32 before transfer to CPU 24 | 'cpu', 25 | ]) 26 | 27 | CASTS = compat.filter_attrs(MODULE, [ 28 | '__add__', 29 | '__div__', 30 | '__eq__', 31 | '__ge__', 32 | '__gt__', 33 | '__iadd__', 34 | '__idiv__', 35 | '__imul__', 36 | '__isub__', 37 | '__itruediv__', 38 | '__le__', 39 | '__lt__', 40 | '__mul__', 41 | '__ne__', 42 | '__radd__', 43 | '__rdiv__', 44 | '__rmul__', 45 | '__rsub__', 46 | '__rtruediv__', 47 | '__sub__', 48 | '__truediv__', 49 | ]) 50 | 51 | # None of these, but here to make code cleaner. 52 | SEQUENCE_CASTS = [] 53 | 54 | # We need to grab all the methods from torch_overrides and add them to 55 | # the Tensor lists as well, as almost all methods are duplicated 56 | # between `torch` and `torch.Tensor` (and check with `hasattr`, 57 | # because a few random ones aren't defined on Tensor) 58 | _self_mod = importlib.import_module(__name__) 59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']: 60 | lst = getattr(_self_mod, attrname) 61 | for fn in getattr(torch_overrides, attrname): 62 | if hasattr(MODULE, fn): 63 | lst.append(fn) 64 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/bottleneck/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .bottleneck import Bottleneck, SpatialBottleneck 2 | from .halo_exchangers import HaloExchangerNoComm, HaloExchangerAllGather, HaloExchangerSendRecv, HaloExchangerPeer 3 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/clip_grad/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_grad import clip_grad_norm_ 2 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/conv_bias_relu/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_bias_relu import ConvBiasReLU, ConvBias, ConvBiasMaskReLU, ConvFrozenScaleBiasReLU 2 | 3 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_10.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 10, /* THREADS_PER_BLOCK */ 640) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_112.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 112, /* THREADS_PER_BLOCK */ 448) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_120.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 120, /* THREADS_PER_BLOCK */ 480) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_128.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 128, /* THREADS_PER_BLOCK */ 512) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_14.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 14, /* THREADS_PER_BLOCK */ 224) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_16.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 16, /* THREADS_PER_BLOCK */ 256) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_160.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 160, /* THREADS_PER_BLOCK */ 640) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_20.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 20, /* THREADS_PER_BLOCK */ 640) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_24.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 24, /* THREADS_PER_BLOCK */ 384) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_26.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 26, /* THREADS_PER_BLOCK */ 416) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_28.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 28, /* THREADS_PER_BLOCK */ 448) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_30.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 30, /* THREADS_PER_BLOCK */ 480) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_32.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 32, /* THREADS_PER_BLOCK */ 512) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_4.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | 23 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 4, /* THREADS_PER_BLOCK */ 128) 24 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_40.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 40, /* THREADS_PER_BLOCK */ 640) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_42.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 42, /* THREADS_PER_BLOCK */ 672) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_48.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 48, /* THREADS_PER_BLOCK */ 384) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_56.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 56, /* THREADS_PER_BLOCK */ 448) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_60.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 60, /* THREADS_PER_BLOCK */ 480) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_64.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 64, /* THREADS_PER_BLOCK */ 512) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_70.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 70, /* THREADS_PER_BLOCK */ 560) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_8.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 8, /* THREADS_PER_BLOCK */ 128) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_80.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 80, /* THREADS_PER_BLOCK */ 640) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_84.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 84, /* THREADS_PER_BLOCK */ 672) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_96.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 96, /* THREADS_PER_BLOCK */ 768) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_98.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without modification, are not permit- 5 | * ted. 6 | * 7 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 8 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 10 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 11 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 12 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 13 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 14 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | * 16 | **************************************************************************************************/ 17 | 18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh" 19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh" 20 | #include "macros.h" 21 | 22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 98, /* THREADS_PER_BLOCK */ 392) 23 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/groupbn/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #ifndef CUDA_UTILS_H 3 | #define CUDA_UTILS_H 4 | 5 | namespace at { 6 | namespace cuda { 7 | 8 | namespace utils { 9 | 10 | static inline int MaxSharedMemoryPerMultiprocessor(int device_id) { 11 | return getDeviceProperties(device_id)->sharedMemPerMultiprocessor; 12 | } 13 | 14 | 15 | } 16 | } 17 | } 18 | 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "nccl_p2p_cuda.cuh" 18 | 19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 20 | m.def("get_unique_nccl_id", &apex::contrib::nccl_p2p::get_unique_nccl_id, "get_unique_nccl_id"); 21 | m.def("init_nccl_comm", &apex::contrib::nccl_p2p::init_nccl_comm, "init_nccl_comm"); 22 | m.def("left_right_halo_exchange_inplace", &apex::contrib::nccl_p2p::left_right_halo_exchange_inplace, "left_right_halo_exchange_inplace"); 23 | m.def("left_right_halo_exchange", &apex::contrib::nccl_p2p::left_right_halo_exchange, "left_right_halo_exchange"); 24 | m.def("add_delay", &apex::contrib::nccl_p2p::add_delay, "add_delay"); 25 | } 26 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | #include 19 | #ifndef _nccl_p2p_h_ 20 | #define _nccl_p2p_h_ 21 | 22 | namespace apex { namespace contrib { namespace nccl_p2p { 23 | at::Tensor get_unique_nccl_id(int n); 24 | int init_nccl_comm( 25 | at::Tensor unique_nccl_id, 26 | int my_rank, 27 | int num_ranks 28 | ); 29 | void left_right_halo_exchange_inplace( 30 | int handle, 31 | int left_rank, 32 | int right_rank, 33 | at::Tensor left_output_halo, 34 | at::Tensor right_output_halo, 35 | at::Tensor left_input_halo, 36 | at::Tensor right_input_halo); 37 | std::vector left_right_halo_exchange( 38 | int handle, 39 | int left_rank, 40 | int right_rank, 41 | at::Tensor left_output_halo, 42 | at::Tensor right_output_halo); 43 | void add_delay(int delay); 44 | }}} 45 | #endif 46 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_version.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | // This file is used to check the version of NCCL detected. 3 | #include 4 | 5 | #include 6 | 7 | std::tuple get_nccl_version(); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("get_nccl_version", &get_nccl_version); 11 | } 12 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_version_check.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | // This file is used to check the version of NCCL detected. 
4 | #include 5 | #include 6 | 7 | 8 | std::tuple get_nccl_version() { 9 | return { int(NCCL_MAJOR), int(NCCL_MINOR) }; 10 | } 11 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | const float lr, 8 | const float beta1, 9 | const float beta2, 10 | const float epsilon, 11 | const int step, 12 | const int bias_correction, 13 | const float weight_decay, 14 | const int grad_averaging, 15 | const int mode, 16 | const float global_grad_norm, 17 | const float max_grad_norm); 18 | 19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 20 | m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer"); 21 | } 22 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_fused_adam_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | at::Tensor grad_scale, 8 | float lr, 9 | float beta1, 10 | float beta2, 11 | float eps, 12 | int step, 13 | int mode, 14 | int bias_correction, 15 | float weight_decay); 16 | 17 | void multi_tensor_fused_adam_with_param_remainders_cuda( 18 | int chunk_size, 19 | at::Tensor noop_flag, 20 | std::vector> tensor_lists, 21 | at::Tensor grad_scale, 22 | float lr, 23 | float beta1, 24 | float beta2, 25 | float eps, 26 | int step, 27 | int mode, 28 | int bias_correction, 29 | float weight_decay); 30 | 31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 32 | m.def("multi_tensor_fused_adam", 33 | &multi_tensor_fused_adam_cuda, 34 | "CUDA kernels for multi-tensor Adam, " 35 | "with param copy"); 36 | 
m.def("multi_tensor_fused_adam_with_param_remainders", 37 | &multi_tensor_fused_adam_with_param_remainders_cuda, 38 | "CUDA kernel for multi-tensor Adam, " 39 | "with stored param remainders and param copy"); 40 | } 41 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_compute_update_term_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | at::Tensor per_tensor_beta1, 8 | at::Tensor per_tensor_beta2, 9 | at::Tensor per_tensor_beta3, 10 | at::Tensor per_tensor_bias_correction, 11 | at::Tensor step, 12 | at::Tensor per_tensor_epsilon, 13 | const int mode, 14 | at::Tensor per_tensor_decay, 15 | at::Tensor global_scale, 16 | at::Tensor global_grad_norm, 17 | const float max_grad_norm); 18 | 19 | void multi_tensor_lamb_update_weights_cuda( 20 | int chunk_size, 21 | at::Tensor noop_flag, 22 | std::vector> tensor_lists, 23 | at::Tensor per_tensor_param_norm, 24 | at::Tensor per_tensor_update_norm, 25 | at::Tensor update_norm_offset, 26 | at::Tensor learning_rate, 27 | at::Tensor per_tensor_decay, 28 | at::Tensor global_grad_norm, 29 | bool use_nvlamb); 30 | 31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 32 | m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda, 33 | "Computes update term for LAMB optimizer"); 34 | m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda, 35 | "Applies update term for LAMB optimizer"); 36 | } 37 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/csrc/peer_memory/peer_memory.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "peer_memory_cuda.cuh" 18 | 19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 20 | m.def("allocate_raw", &apex::contrib::peer_memory::allocate_raw, "allocate_raw"); 21 | m.def("free_raw", &apex::contrib::peer_memory::free_raw, "free_raw"); 22 | m.def("zero", &apex::contrib::peer_memory::zero, "zero"); 23 | m.def("get_raw_ipc_address", &apex::contrib::peer_memory::get_raw_ipc_address, "get_raw_ipc_address"); 24 | m.def("get_raw_peers", &apex::contrib::peer_memory::get_raw_peers, "get_raw_peers"); 25 | m.def("blob_view_half", &apex::contrib::peer_memory::blob_view_half, "blob_view_half"); 26 | m.def("blob_view_float", &apex::contrib::peer_memory::blob_view_float, "blob_view_float"); 27 | m.def("blob_view_int", &apex::contrib::peer_memory::blob_view_int, "blob_view_int"); 28 | m.def("push_pull_halos_1d", &apex::contrib::peer_memory::push_pull_halos_1d, "push_pull_halos_1d"); 29 | } 30 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/cudnn_gbn/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch_norm import GroupBatchNorm2d -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/fmha/__init__.py: -------------------------------------------------------------------------------- 1 | from .fmha import 
class FocalLoss(torch.autograd.Function):
    """Autograd function that delegates to the ``focal_loss_cuda`` extension.

    ``forward`` returns the loss produced by the extension and stashes the
    partial gradient it also returns; ``backward`` hands that partial gradient
    back to the extension to obtain the gradient for ``cls_output``.
    """

    @staticmethod
    def forward(
        ctx,
        cls_output,
        cls_targets_at_level,
        num_positives_sum,
        num_real_classes,
        alpha,
        gamma,
        label_smoothing=0.0,
    ):
        """Compute the loss and save what ``backward`` needs."""
        kernel_args = (
            cls_output,
            cls_targets_at_level,
            num_positives_sum,
            num_real_classes,
            alpha,
            gamma,
            label_smoothing,
        )
        loss, partial_grad = focal_loss_cuda.forward(*kernel_args)

        ctx.save_for_backward(partial_grad, num_positives_sum)
        return loss

    @staticmethod
    def backward(ctx, grad_loss):
        """Return the gradient for ``cls_output``; other inputs get ``None``."""
        partial_grad, num_positives_sum = ctx.saved_tensors

        # The backward kernel works in place to save memory: partial_grad and
        # the returned gradient are the same tensor.
        grad_input = focal_loss_cuda.backward(grad_loss, partial_grad, num_positives_sum)

        # One slot per forward() input; only the first one carries a gradient.
        return (grad_input,) + (None,) * 6


def focal_loss(
    cls_output: torch.Tensor,
    cls_targets_at_level: torch.Tensor,
    num_positive_sum: torch.Tensor,
    num_real_classes: int,
    alpha: float,
    gamma: float,
    label_smoothing: float = 0.0,
) -> torch.Tensor:
    """Fused focal loss function.

    Thin functional entry point: forwards every argument, in order, to
    ``FocalLoss.apply`` and returns its result.
    """
    apply_args = (
        cls_output,
        cls_targets_at_level,
        num_positive_sum,
        num_real_classes,
        alpha,
        gamma,
        label_smoothing,
    )
    return FocalLoss.apply(*apply_args)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/multihead_attn/MHA_bwd.png -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/multihead_attn/MHA_fwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/multihead_attn/MHA_fwd.png -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/multihead_attn/__init__.py: -------------------------------------------------------------------------------- 1 | from .self_multihead_attn import SelfMultiheadAttn 2 | from .encdec_multihead_attn import EncdecMultiheadAttn 3 | from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func 4 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16_optimizer import FP16_Optimizer 2 | from .fused_adam import FusedAdam 3 | from .fused_lamb import FusedLAMB 4 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/peer_memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .peer_memory import PeerMemoryPool 2 | from .peer_halo_exchanger_1d import PeerHaloExchanger1d 3 | 4 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/sparsity/COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
2 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_masklib import create_mask 2 | from .asp import ASP 3 | -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/sparsity/permutation_search_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | from .call_permutation_search_kernels import accelerated_search_for_good_permutation 2 | from .permutation_utilities import sum_after_2_to_4 -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/bottleneck/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/bottleneck/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/clip_grad/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/clip_grad/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/conv_bias_relu/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/conv_bias_relu/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/cudnn_gbn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/cudnn_gbn/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/fmha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/fmha/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/focal_loss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/focal_loss/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/group_norm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/group_norm/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/index_mul_2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/index_mul_2d/__init__.py 
-------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/layer_norm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/layer_norm/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/multihead_attn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/multihead_attn/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/optimizers/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/peer_memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/peer_memory/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/transducer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/transducer/__init__.py -------------------------------------------------------------------------------- /HALP/apex/apex/contrib/test/xentropy/__init__.py: 
class SoftmaxCrossEntropyLoss(torch.autograd.Function):
    """Autograd function backed by the ``xentropy_cuda`` extension.

    ``forward`` returns the per-element losses from the extension with every
    position whose label equals ``padding_idx`` zeroed. ``backward`` zeroes the
    incoming gradient at the same positions before calling the extension.
    """

    @staticmethod
    def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
        """Compute the losses and save the state needed by ``backward``.

        Args:
            ctx: autograd context.
            logits: passed through to ``xentropy_cuda.forward``.
            labels: target labels; also used to build the padding mask.
            smoothing: label-smoothing value forwarded to the kernel.
            padding_idx: label value whose loss (and gradient) is forced to 0.
            half_to_float: forwarded to the kernel unchanged.

        Returns:
            The losses tensor with padded positions set to 0.
        """
        losses, max_log_sum_exp = xentropy_cuda.forward(
            logits, labels, smoothing, half_to_float)
        losses.masked_fill_(labels == padding_idx, 0)

        ctx.save_for_backward(logits, max_log_sum_exp, labels)
        # Plain Python scalars are kept on ctx directly: this avoids allocating
        # two CPU tensors per call and keeps `smoothing` bit-identical between
        # forward and backward (no float32 round-trip).
        ctx.smoothing = smoothing
        ctx.padding_idx = padding_idx

        return losses

    @staticmethod
    def backward(ctx, grad_loss):
        """Return the gradient w.r.t. ``logits``; other inputs get ``None``."""
        logits, max_log_sum_exp, labels = ctx.saved_tensors

        # Out-of-place fill: the incoming gradient may be shared with other
        # consumers in the graph, so it must not be modified in place.
        grad_loss = grad_loss.masked_fill(labels == ctx.padding_idx, 0)
        grad_logits = xentropy_cuda.backward(
            grad_loss.contiguous(), logits, max_log_sum_exp,
            labels, ctx.smoothing)

        # One slot per forward() input.
        return grad_logits, None, None, None, None
class MultiTensorApply(object):
    """Callable that prepends a fixed chunk size to multi-tensor op calls.

    Availability is tracked on the class: constructing an instance tries to
    import ``amp_C`` and records the outcome in ``available`` (and the
    ``ImportError``, if any, in ``import_err``).
    """

    available = False
    warned = False
    # ImportError captured when `amp_C` could not be imported, else None.
    import_err = None

    def __init__(self, chunk_size):
        """Store ``chunk_size`` and probe for the ``amp_C`` extension."""
        # Always set, so the attribute exists even when the import fails.
        self.chunk_size = chunk_size
        try:
            import amp_C  # noqa: F401  (imported only to test availability)
        except ImportError as err:
            MultiTensorApply.available = False
            MultiTensorApply.import_err = err
        else:
            MultiTensorApply.available = True

    def check_avail(self):
        """Raise ``RuntimeError`` if the ``amp_C`` import did not succeed."""
        if not MultiTensorApply.available:
            raise RuntimeError(
                "Attempted to call MultiTensorApply method, but MultiTensorApply "
                "is not available, possibly because Apex was installed without "
                "--cpp_ext --cuda_ext.  Original import error message:",
                MultiTensorApply.import_err) from MultiTensorApply.import_err

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        """Invoke ``op(chunk_size, noop_flag_buffer, tensor_lists, *args)``.

        Raises:
            RuntimeError: if the extension is unavailable (see ``check_avail``).
        """
        self.check_avail()

        return op(self.chunk_size,
                  noop_flag_buffer,
                  tensor_lists,
                  *args)
# Everything after the script name is forwarded to each worker process.
argslist = list(sys.argv)[1:]
world_size = torch.cuda.device_count()

# An explicit --world-size overrides the detected GPU count; otherwise the
# detected count is appended so every worker receives the flag.
if '--world-size' in argslist:
    world_size = int(argslist[argslist.index('--world-size')+1])
else:
    argslist.append('--world-size')
    argslist.append(str(world_size))

workers = []
# Log files opened for workers 1..N-1; closed once all workers have exited.
log_files = []

try:
    for i in range(world_size):
        # Set (or overwrite) the --rank value for this worker.
        if '--rank' in argslist:
            argslist[argslist.index('--rank')+1] = str(i)
        else:
            argslist.append('--rank')
            argslist.append(str(i))
        # Rank 0 inherits this process's stdout; other ranks write to GPU_<i>.log.
        if i == 0:
            stdout = None
        else:
            stdout = open("GPU_"+str(i)+".log", "w")
            log_files.append(stdout)
        print(argslist)
        p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
        workers.append(p)

    for p in workers:
        p.wait()
finally:
    # The children hold their own copies of the descriptors, so closing the
    # parent's handles here is safe even if launching failed part-way.
    for log_file in log_files:
        log_file.close()
"MegatronPretrainingRandomSampler", 7 | "MegatronPretrainingSampler", 8 | ] 9 | -------------------------------------------------------------------------------- /HALP/apex/apex/transformer/_ucc_util.py: -------------------------------------------------------------------------------- 1 | from torch import distributed as dist 2 | 3 | HAS_UCC = hasattr(dist, "is_ucc_available") and dist.is_ucc_available() 4 | if not HAS_UCC: 5 | try: 6 | import torch_ucc 7 | HAS_UCC = True 8 | except ImportError: 9 | HAS_UCC = False 10 | -------------------------------------------------------------------------------- /HALP/apex/apex/transformer/amp/__init__.py: -------------------------------------------------------------------------------- 1 | from apex.transformer.amp.grad_scaler import GradScaler 2 | 3 | 4 | __all__ = [ 5 | "GradScaler", 6 | ] 7 | -------------------------------------------------------------------------------- /HALP/apex/apex/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def get_transformer_logger(name: str) -> logging.Logger:
    """Return the logger named after ``name`` with its last extension removed.

    The extension is whatever ``os.path.splitext`` treats as one, so a dotted
    module path loses its final component (``"pkg.sub.mod"`` -> ``"pkg.sub"``).
    """
    stem, _extension = os.path.splitext(name)
    return logging.getLogger(stem)
def get_forward_backward_func(
    virtual_pipeline_model_parallel_size, pipeline_model_parallel_size,
):
    """Select the forward/backward schedule function for the current setup.

    Returns:
        ``forward_backward_no_pipelining`` when the pipeline-model-parallel
        world size is not greater than 1; otherwise the interleaved schedule
        when ``virtual_pipeline_model_parallel_size`` is not ``None``, else the
        non-interleaved pipelining schedule.

    Raises:
        RuntimeError: when the interleaved schedule is selected and the number
            of microbatches is not a multiple of
            ``pipeline_model_parallel_size``.
    """
    # No pipeline parallelism in effect.
    if not (parallel_state.get_pipeline_model_parallel_world_size() > 1):
        return forward_backward_no_pipelining

    # Pipelined, but no virtual stages requested.
    if virtual_pipeline_model_parallel_size is None:
        return forward_backward_pipelining_without_interleaving

    # Interleaved schedule: microbatch count must divide evenly.
    if get_num_microbatches() % pipeline_model_parallel_size != 0:
        msg = "number of microbatches is not divisible by pipeline-parallel size when using interleaved schedule"
        raise RuntimeError(msg)
    return _forward_backward_pipelining_with_interleaving
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | void wgrad_gemm_accum_fp32_cuda_stub( 7 | at::Tensor &input_2d, 8 | at::Tensor &d_output_2d, 9 | at::Tensor &d_weight 10 | ); 11 | 12 | void wgrad_gemm_accum_fp16_cuda_stub( 13 | at::Tensor &input_2d, 14 | at::Tensor &d_output_2d, 15 | at::Tensor &d_weight 16 | ); 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 19 | m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32_cuda_stub, "wgrad gemm accum in fp32"); 20 | m.def("wgrad_gemm_accum_fp16", &wgrad_gemm_accum_fp16_cuda_stub, "wgrad gemm accum in fp16"); 21 | } 22 | -------------------------------------------------------------------------------- /HALP/apex/csrc/static_switch.h: -------------------------------------------------------------------------------- 1 | // From 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 3 | 4 | #pragma once 5 | 6 | /// @param COND - a boolean expression to switch by 7 | /// @param CONST_NAME - a name given for the constexpr bool variable. 8 | /// @param ... - code to execute for true and false 9 | /// 10 | /// Usage: 11 | /// ``` 12 | /// BOOL_SWITCH(flag, BoolConst, [&] { 13 | /// some_function(...); 14 | /// }); 15 | /// ``` 16 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \ 17 | [&] { \ 18 | if (COND) { \ 19 | constexpr static bool CONST_NAME = true; \ 20 | return __VA_ARGS__(); \ 21 | } else { \ 22 | constexpr static bool CONST_NAME = false; \ 23 | return __VA_ARGS__(); \ 24 | } \ 25 | }() 26 | -------------------------------------------------------------------------------- /HALP/apex/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = NVIDIAAPEX 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | gh-pages: 16 | git checkout gh-pages 17 | rm -rf build 18 | rm -rf source 19 | git checkout master -- . 20 | make html 21 | rm -rf ../_modules ../_sources ../_static 22 | mv -fv build/html/* ../ 23 | rm -rf build 24 | git add -A 25 | git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master 26 | 27 | .PHONY: help Makefile 28 | 29 | # Catch-all target: route all unknown targets to Sphinx using the new 30 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 31 | %: Makefile 32 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 33 | -------------------------------------------------------------------------------- /HALP/apex/docs/source/_static/img/nv-pytorch2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/docs/source/_static/img/nv-pytorch2.png -------------------------------------------------------------------------------- /HALP/apex/docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block sidebartitle %} {{ super() }} 3 | 4 | 32 | {% endblock %} 33 | 34 | {% block footer %} {{ super() }} 35 | 36 | 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /HALP/apex/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. 
PyTorch documentation master file, created by 2 | sphinx-quickstart on Fri Dec 23 13:31:47 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :github_url: https://github.com/nvidia/apex 7 | 8 | Apex (A PyTorch Extension) 9 | =================================== 10 | 11 | This site contains the API documentation for Apex (https://github.com/nvidia/apex), 12 | a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. Some of the code here will be included in upstream Pytorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible. 13 | 14 | Installation instructions can be found here: https://github.com/NVIDIA/apex#quick-start. 15 | 16 | Some other useful material, including GTC 2019 and Pytorch DevCon 2019 Slides, can be found here: https://github.com/mcarilli/mixed_precision_references. 17 | 18 | .. toctree:: 19 | :maxdepth: 1 20 | :caption: AMP: Automatic Mixed Precision 21 | 22 | amp 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :caption: Distributed Training 27 | 28 | parallel 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: Fused Optimizers 33 | 34 | optimizers 35 | 36 | .. toctree:: 37 | :maxdepth: 1 38 | :caption: Fused Layer Norm 39 | 40 | layernorm 41 | 42 | .. .. toctree:: 43 | :maxdepth: 1 44 | :caption: Deprecated mixed precision API 45 | fp16_util 46 | 47 | .. RNN 48 | 49 | Indices and tables 50 | ================== 51 | 52 | * :ref:`genindex` 53 | * :ref:`modindex` 54 | -------------------------------------------------------------------------------- /HALP/apex/docs/source/layernorm.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.normalization.fused_layer_norm 5 | =================================== 6 | 7 | .. automodule:: apex.normalization 8 | .. 
currentmodule:: apex.normalization 9 | 10 | .. FusedAdam 11 | ---------- 12 | 13 | .. autoclass:: FusedLayerNorm 14 | :members: 15 | 16 | .. autoclass:: FusedRMSNorm 17 | :members: 18 | -------------------------------------------------------------------------------- /HALP/apex/docs/source/optimizers.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.optimizers 5 | =================================== 6 | 7 | .. automodule:: apex.optimizers 8 | .. currentmodule:: apex.optimizers 9 | 10 | .. FusedAdam 11 | ---------- 12 | 13 | .. autoclass:: FusedAdam 14 | :members: 15 | 16 | .. autoclass:: FusedLAMB 17 | :members: 18 | 19 | .. autoclass:: FusedNovoGrad 20 | :members: 21 | 22 | .. autoclass:: FusedSGD 23 | :members: 24 | -------------------------------------------------------------------------------- /HALP/apex/docs/source/parallel.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | apex.parallel 5 | =================================== 6 | 7 | .. automodule:: apex.parallel 8 | .. currentmodule:: apex.parallel 9 | 10 | .. DistributedDataParallel 11 | ---------- 12 | 13 | .. autoclass:: DistributedDataParallel 14 | :members: 15 | 16 | .. autoclass:: Reducer 17 | :members: 18 | 19 | .. autoclass:: SyncBatchNorm 20 | :members: 21 | 22 | Utility functions 23 | ---------------------------------- 24 | 25 | .. autofunction:: convert_syncbn_model 26 | -------------------------------------------------------------------------------- /HALP/apex/examples/README.md: -------------------------------------------------------------------------------- 1 | This directory contains examples illustrating Apex mixed precision and distributed tools. 2 | 3 | **Note for users of the pre-unification API**: 4 | `deprecated_api` contains examples illustrating the old (pre-unified) APIs. 
These APIs will be removed soon, and users are strongly encouraged to switch. The separate mixed precision tools called `Amp` and `FP16_Optimizer` in the old API are exposed via different flags/optimization levels in the new API. 5 | -------------------------------------------------------------------------------- /HALP/apex/examples/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image must at least have pytorch and CUDA installed. 2 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3 3 | FROM $BASE_IMAGE 4 | ARG BASE_IMAGE 5 | RUN echo "Installing Apex on top of ${BASE_IMAGE}" 6 | # make sure we don't overwrite some existing directory called "apex" 7 | WORKDIR /tmp/unique_for_apex 8 | # uninstall Apex if present, twice to make absolutely sure :) 9 | RUN pip uninstall -y apex || : 10 | RUN pip uninstall -y apex || : 11 | # SHA is something the user can touch to force recreation of this Docker layer, 12 | # and therefore force cloning of the latest version of Apex 13 | RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git 14 | WORKDIR /tmp/unique_for_apex/apex 15 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . 16 | WORKDIR /workspace 17 | -------------------------------------------------------------------------------- /HALP/apex/examples/simple/distributed/README.md: -------------------------------------------------------------------------------- 1 | **distributed_data_parallel.py** and **run.sh** show an example using Amp with 2 | [apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or 3 | [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel) 4 | and the Pytorch multiprocess launcher script, 5 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility). 
6 | The use of `Amp` with DistributedDataParallel does not need to change from ordinary 7 | single-process use. The only gotcha is that wrapping your model with `DistributedDataParallel` must 8 | come after the call to `amp.initialize`. Test via 9 | ```bash 10 | bash run.sh 11 | ``` 12 | 13 | **This is intended purely as an instructional example, not a performance showcase.** 14 | -------------------------------------------------------------------------------- /HALP/apex/examples/simple/distributed/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py 3 | -------------------------------------------------------------------------------- /HALP/apex/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /HALP/apex/requirements.txt: -------------------------------------------------------------------------------- 1 | cxxfilt>=0.2.0 2 | tqdm>=4.28.1 3 | numpy>=1.15.3 4 | PyYAML>=5.1 5 | pytest>=3.5.1 6 | packaging>=14.0 7 | -------------------------------------------------------------------------------- /HALP/apex/requirements_dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | flake8>=3.7.9 3 | Sphinx>=3.0.3 -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_amp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_amp/__init__.py -------------------------------------------------------------------------------- 
/HALP/apex/tests/L0/run_amp/test_larc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import Parameter 6 | 7 | from apex import amp 8 | from apex.parallel.LARC import LARC 9 | from utils import common_init 10 | 11 | 12 | class MyModel(torch.nn.Module): 13 | def __init__(self, unique): 14 | super(MyModel, self).__init__() 15 | self.weight0 = Parameter( 16 | unique + torch.arange(2, device="cuda", dtype=torch.float32) 17 | ) 18 | 19 | def forward(self, input): 20 | return (input * self.weight0).sum() 21 | 22 | 23 | class TestLARC(unittest.TestCase): 24 | def setUp(self): 25 | self.x = torch.ones((2), device="cuda", dtype=torch.float32) 26 | common_init(self) 27 | 28 | def tearDown(self): 29 | pass 30 | 31 | def test_larc_mixed_precision(self): 32 | for opt_level in ["O0", "O1", "O2", "O3"]: 33 | model = MyModel(1) 34 | 35 | optimizer = LARC( 36 | torch.optim.SGD( 37 | [{"params": model.parameters(), "lr": 0.25}], momentum=0.125 38 | ) 39 | ) 40 | 41 | model, optimizer = amp.initialize( 42 | model, optimizer, opt_level=opt_level, verbosity=0 43 | ) 44 | 45 | optimizer.zero_grad() 46 | loss = model(self.x) 47 | with amp.scale_loss(loss, optimizer) as scaled_loss: 48 | scaled_loss.backward() 49 | optimizer.step() 50 | 51 | 52 | if __name__ == "__main__": 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_amp/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | HALF = 'torch.cuda.HalfTensor' 4 | FLOAT = 'torch.cuda.FloatTensor' 5 | 6 | DTYPES = [torch.half, torch.float] 7 | 8 | ALWAYS_HALF = {torch.float: HALF, 9 | torch.half: HALF} 10 | ALWAYS_FLOAT = {torch.float: FLOAT, 11 | torch.half: FLOAT} 12 | MATCH_INPUT = {torch.float: FLOAT, 13 | torch.half: HALF} 14 | 15 | def common_init(test_case): 16 | test_case.h 
= 64 17 | test_case.b = 16 18 | test_case.c = 16 19 | test_case.k = 3 20 | test_case.t = 10 21 | torch.set_default_tensor_type(torch.cuda.FloatTensor) 22 | -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_deprecated/test_deprecated_warning.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | 5 | import apex 6 | from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase 7 | 8 | 9 | def init_model_and_optimizer(): 10 | model = torch.nn.Linear(1, 1, bias=False).cuda() 11 | optimizer = torch.optim.SGD(model.parameters(), 1.0) 12 | return model, optimizer 13 | 14 | 15 | @unittest.skipUnless(torch.cuda.is_available(), "") 16 | class TestDeprecatedWarning(unittest.TestCase): 17 | 18 | def test_amp(self): 19 | model, optimizer = init_model_and_optimizer() 20 | with self.assertWarns(apex.DeprecatedFeatureWarning): 21 | _ = apex.amp.initialize(model, optimizer) 22 | 23 | def test_fp16_model(self): 24 | model, _ = init_model_and_optimizer() 25 | with self.assertWarns(apex.DeprecatedFeatureWarning): 26 | _ = apex.fp16_utils.FP16Model(model) 27 | 28 | def test_fp16_optimizer(self): 29 | _, optimizer = init_model_and_optimizer() 30 | with self.assertWarns(apex.DeprecatedFeatureWarning): 31 | _ = apex.fp16_utils.FP16_Optimizer(optimizer) 32 | 33 | def test_fp16_loss_scaler(self): 34 | with self.assertWarns(apex.DeprecatedFeatureWarning): 35 | apex.fp16_utils.LossScaler() 36 | 37 | 38 | class TestParallel(NcclDistributedTestBase): 39 | 40 | @property 41 | def world_size(self): 42 | return min(torch.cuda.device_count(), 2) 43 | 44 | def test_distributed_data_parallel(self): 45 | model, _ = init_model_and_optimizer() 46 | with self.assertWarns(apex.DeprecatedFeatureWarning): 47 | _ = apex.parallel.DistributedDataParallel(model) 48 | 49 | def test_convert_syncbn_model(self): 50 | model, _ = init_model_and_optimizer() 51 | with 
self.assertWarns(apex.DeprecatedFeatureWarning): 52 | _ = apex.parallel.convert_syncbn_model(model) 53 | 54 | 55 | if __name__ == "__main__": 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_fp16util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_fp16util/__init__.py -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_optimizers/__init__.py -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_transformer/__init__.py -------------------------------------------------------------------------------- /HALP/apex/tests/L0/run_transformer/test_transformer_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | from torch.testing._internal import common_utils 5 | 6 | logging.getLogger("torch").setLevel(logging.WARNING) 7 | 8 | from apex.transformer import parallel_state 9 | from apex.transformer.tensor_parallel import utils 10 | from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase 11 | 12 | logging.getLogger("apex").setLevel(logging.WARNING) 13 | 14 | 15 | class TransformerUtilsTest(NcclDistributedTestBase): 16 | def test_split_tensor_along_last_dim(self): 17 | for 
tensor_model_paralell_world_size in range(1, self.world_size + 1): 18 | if self.world_size % tensor_model_paralell_world_size > 0: 19 | continue 20 | parallel_state.initialize_model_parallel( 21 | tensor_model_parallel_size_=tensor_model_paralell_world_size 22 | ) 23 | 24 | device = "cpu" 25 | input_tensor = torch.randn((100, 100, 100), device=device) 26 | splits = utils.split_tensor_along_last_dim(input_tensor, 10) 27 | last_dim_shapes = torch.tensor( 28 | [int(split.size()[-1]) for split in splits] 29 | ) 30 | 31 | self.assertTrue( 32 | torch.equal(last_dim_shapes, torch.full((10,), 10),), 33 | msg=f"tensor_model_paralell_world_size: {tensor_model_paralell_world_size}", 34 | ) 35 | 36 | parallel_state.destroy_model_parallel() 37 | 38 | 39 | if __name__ == "__main__": 40 | common_utils.run_tests() 41 | -------------------------------------------------------------------------------- /HALP/apex/tests/L1/cross_product/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/" 4 | # DATADIR="/opt/home/apex/examples/imagenet/" 5 | cp ../common/* . 6 | bash run_test.sh single_gpu $1 7 | -------------------------------------------------------------------------------- /HALP/apex/tests/L1/cross_product_distributed/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cp ../common/* . 
4 | bash run_test.sh distributed $1 5 | -------------------------------------------------------------------------------- /HALP/apex/tests/distributed/DDP/run_race_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py 4 | -------------------------------------------------------------------------------- /HALP/apex/tests/distributed/amp_master_params/compare.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | model_params_rank0 = torch.load("rank0model.pth", 4 | map_location = lambda storage, loc: storage.cuda(0)) 5 | model_params_rank1 = torch.load("rank1model.pth", 6 | map_location = lambda storage, loc: storage.cuda(0)) 7 | master_params_rank0 = torch.load("rank0master.pth", 8 | map_location = lambda storage, loc: storage.cuda(0)) 9 | master_params_rank1 = torch.load("rank1master.pth", 10 | map_location = lambda storage, loc: storage.cuda(0)) 11 | 12 | for model_rank0, model_rank1, master_rank0, master_rank1 in zip( 13 | model_params_rank0, 14 | model_params_rank1, 15 | master_params_rank0, 16 | master_params_rank1): 17 | assert torch.allclose(model_rank0, model_rank1), "Model param mismatch" 18 | assert torch.allclose(master_rank0, master_rank1), "Master param mismatch" 19 | # Some debugging/investigation assistance code: 20 | # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0) 21 | # offending_val_half = model_rank0.view(-1)[maxind.item()] 22 | # offending_val_float = master_rank0.view(-1)[maxind.item()] 23 | # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(), 24 | # offending_val_float.half().item()) 25 | # rtol needs to be > 2^-11 because of denormals... 
26 | assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch" 27 | 28 | print("OK: Model and master params match across ranks.") 29 | -------------------------------------------------------------------------------- /HALP/apex/tests/distributed/amp_master_params/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py 3 | 4 | python compare.py 5 | -------------------------------------------------------------------------------- /HALP/apex/tests/distributed/synced_batchnorm/test_batchnorm1d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import apex 3 | 4 | model = apex.parallel.SyncBatchNorm(4).cuda() 5 | model.weight.data.uniform_() 6 | model.bias.data.uniform_() 7 | data = torch.rand((8,4)).cuda() 8 | 9 | model_ref = torch.nn.BatchNorm1d(4).cuda() 10 | model_ref.load_state_dict(model.state_dict()) 11 | data_ref = data.clone() 12 | 13 | output = model(data) 14 | output_ref = model_ref(data_ref) 15 | 16 | assert(output.allclose(output_ref)) 17 | assert(model.running_mean.allclose(model_ref.running_mean)) 18 | assert(model.running_var.allclose(model_ref.running_var)) 19 | -------------------------------------------------------------------------------- /HALP/apex/tests/distributed/synced_batchnorm/unit_test.sh: -------------------------------------------------------------------------------- 1 | python python_single_gpu_unit_test.py 2 | python single_gpu_unit_test.py 3 | python test_batchnorm1d.py 4 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py 5 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp16 6 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_test_different_batch_size.py --apex 7 | #beware, you need a system with at least 4 gpus to test group_size= s: 33 | lr *= 
decay_factor 34 | return lr 35 | 36 | return lr_policy(_lr_fn) 37 | 38 | 39 | def lr_cosine_policy(base_lr, warmup_length, epochs): 40 | def _lr_fn(epoch): 41 | if epoch < warmup_length: 42 | lr = base_lr * (epoch + 1) / warmup_length 43 | else: 44 | e = epoch - warmup_length 45 | es = epochs - warmup_length 46 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 47 | return lr 48 | 49 | return lr_policy(_lr_fn) 50 | -------------------------------------------------------------------------------- /HALP/utils/mixup.py: -------------------------------------------------------------------------------- 1 | """Originated from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Classification/ConvNets/image_classification/mixup.py 2 | """ 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | def mixup(alpha, num_classes, data, target): 10 | with torch.no_grad(): 11 | bs = data.size(0) 12 | c = np.random.beta(alpha, alpha) 13 | 14 | perm = torch.randperm(bs).cuda() 15 | 16 | md = c * data + (1 - c) * data[perm, :] 17 | mt = c * target + (1 - c) * target[perm, :] 18 | return md, mt 19 | 20 | 21 | class MixUpWrapper(object): 22 | def __init__(self, alpha, num_classes, dataloader): 23 | self.alpha = alpha 24 | self.dataloader = dataloader 25 | self.num_classes = num_classes 26 | 27 | def mixup_loader(self, loader): 28 | for input, target in loader: 29 | i, t = mixup(self.alpha, self.num_classes, input, target) 30 | yield i, t 31 | 32 | def __iter__(self): 33 | return self.mixup_loader(self.dataloader) 34 | 35 | 36 | class NLLMultiLabelSmooth(nn.Module): 37 | def __init__(self, smoothing=0.0): 38 | super(NLLMultiLabelSmooth, self).__init__() 39 | self.confidence = 1.0 - smoothing 40 | self.smoothing = smoothing 41 | 42 | def forward(self, x, target): 43 | if self.training: 44 | x = x.float() 45 | target = target.float() 46 | logprobs = torch.nn.functional.log_softmax(x, dim=-1) 47 | 48 | nll_loss = -logprobs * target 49 | nll_loss = 
nll_loss.sum(-1) 50 | 51 | smooth_loss = -logprobs.mean(dim=-1) 52 | 53 | loss = self.confidence * nll_loss + self.smoothing * smooth_loss 54 | 55 | return loss.mean() 56 | else: 57 | return torch.nn.functional.cross_entropy(x, target) 58 | -------------------------------------------------------------------------------- /HALP/utils/smoothing.py: -------------------------------------------------------------------------------- 1 | """Originated from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Classification/ConvNets/image_classification/smoothing.py 2 | """ 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class LabelSmoothing(nn.Module): 9 | """ 10 | NLL loss with label smoothing. 11 | """ 12 | 13 | def __init__(self, smoothing=0.0): 14 | """ 15 | Constructor for the LabelSmoothing module. 16 | 17 | :param smoothing: label smoothing factor 18 | """ 19 | super(LabelSmoothing, self).__init__() 20 | self.confidence = 1.0 - smoothing 21 | self.smoothing = smoothing 22 | 23 | def forward(self, x, target): 24 | logprobs = torch.nn.functional.log_softmax(x, dim=-1) 25 | 26 | nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) 27 | nll_loss = nll_loss.squeeze(1) 28 | smooth_loss = -logprobs.mean(dim=-1) 29 | loss = self.confidence * nll_loss + self.smoothing * smooth_loss 30 | return loss.mean() 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 snu-mllab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to 
do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /asset/short_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/asset/short_demo.png -------------------------------------------------------------------------------- /asset/title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/asset/title.png -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | *.pth 2 | *.ckpt 3 | -------------------------------------------------------------------------------- /examples/ckpt/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/examples/ckpt/.keep -------------------------------------------------------------------------------- /examples/ddpm_cifar10.yml: -------------------------------------------------------------------------------- 1 | data: 2 | 
dataset: "CIFAR10" 3 | image_size: 32 4 | channels: 3 5 | logit_transform: false 6 | uniform_dequantization: false 7 | gaussian_dequantization: false 8 | random_flip: true 9 | rescaled: true 10 | num_workers: 4 11 | 12 | model: 13 | type: "simple" 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: [1, 2, 2, 2] 18 | num_res_blocks: 2 19 | attn_resolutions: [16, ] 20 | dropout: 0.1 21 | var_type: fixedlarge 22 | ema_rate: 0.9999 23 | ema: True 24 | resamp_with_conv: True 25 | 26 | diffusion: 27 | beta_schedule: linear 28 | beta_start: 0.0001 29 | beta_end: 0.02 30 | num_diffusion_timesteps: 1000 31 | 32 | training: 33 | batch_size: 128 34 | n_epochs: 256 35 | n_iters: 100000 36 | snapshot_freq: 50000 37 | validation_freq: 2000 38 | 39 | sampling: 40 | batch_size: 64 41 | last_only: True 42 | 43 | optim: 44 | weight_decay: 0.000 45 | optimizer: "Adam" 46 | lr: 0.0002 47 | beta1: 0.9 48 | amsgrad: false 49 | eps: 0.00000001 50 | grad_clip: 1.0 51 | -------------------------------------------------------------------------------- /examples/images/husky.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/examples/images/husky.png -------------------------------------------------------------------------------- /layer_merge/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/__init__.py -------------------------------------------------------------------------------- /layer_merge/aggregate_imp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Aggregating the parallelized importance table.") 6 | parser.add_argument( 7 | "-d", 8 | "--dir", 9 | 
type=str, 10 | help="directory name", 11 | ) 12 | parser.add_argument( 13 | "-n", 14 | "--num", 15 | type=int, 16 | help="the number of blks", 17 | ) 18 | import re 19 | 20 | 21 | def natural_key(string_): 22 | return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)] 23 | 24 | 25 | def main(): 26 | args = parser.parse_args() 27 | res = pd.DataFrame() 28 | for currentpath, folders, files in os.walk(args.dir): 29 | for f in sorted(files, key=natural_key): 30 | if ".csv" in f: 31 | print(f) 32 | tmp = pd.read_csv(os.path.join(currentpath, f)) 33 | res = pd.concat([res, tmp]) 34 | print(len(res)) 35 | assert len(res) == args.num 36 | res.to_csv(os.path.join(args.dir, "importance.csv")) 37 | -------------------------------------------------------------------------------- /layer_merge/kim23efficient/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/kim23efficient/__init__.py -------------------------------------------------------------------------------- /layer_merge/kim24layer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/kim24layer/__init__.py -------------------------------------------------------------------------------- /layer_merge/kim24layermerge/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/kim24layermerge/__init__.py -------------------------------------------------------------------------------- /layer_merge/models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/models/__init__.py -------------------------------------------------------------------------------- /layer_merge/models/ddpm_cfg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/models/ddpm_cfg/__init__.py -------------------------------------------------------------------------------- /layer_merge/models/ddpm_cfg/bedroom.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "LSUN" 3 | category: "bedroom" 4 | image_size: 256 5 | channels: 3 6 | logit_transform: false 7 | uniform_dequantization: false 8 | gaussian_dequantization: false 9 | random_flip: true 10 | rescaled: true 11 | num_workers: 32 12 | 13 | model: 14 | type: "simple" 15 | in_channels: 3 16 | out_ch: 3 17 | ch: 128 18 | ch_mult: [1, 1, 2, 2, 4, 4] 19 | num_res_blocks: 2 20 | attn_resolutions: [16, ] 21 | dropout: 0.0 22 | var_type: fixedsmall 23 | ema_rate: 0.999 24 | ema: True 25 | resamp_with_conv: True 26 | 27 | diffusion: 28 | beta_schedule: linear 29 | beta_start: 0.0001 30 | beta_end: 0.02 31 | num_diffusion_timesteps: 1000 32 | 33 | training: 34 | batch_size: 8 35 | n_epochs: 10000 36 | n_iters: 5000000 37 | snapshot_freq: 5000 38 | validation_freq: 2000 39 | 40 | sampling: 41 | batch_size: 16 42 | last_only: True 43 | 44 | optim: 45 | weight_decay: 0.000 46 | optimizer: "Adam" 47 | lr: 0.000002 48 | beta1: 0.9 49 | amsgrad: false 50 | eps: 0.00000001 51 | -------------------------------------------------------------------------------- /layer_merge/models/ddpm_cfg/celeba.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "CELEBA" 3 | image_size: 64 4 | channels: 3 5 | logit_transform: false 6 | uniform_dequantization: false 7 
| gaussian_dequantization: false 8 | random_flip: true 9 | rescaled: true 10 | num_workers: 4 11 | 12 | model: 13 | type: "simple" 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: [1, 2, 2, 2, 4] 18 | num_res_blocks: 2 19 | attn_resolutions: [16, ] 20 | dropout: 0.1 21 | var_type: fixedlarge 22 | ema_rate: 0.9999 23 | ema: True 24 | resamp_with_conv: True 25 | 26 | diffusion: 27 | beta_schedule: linear 28 | beta_start: 0.0001 29 | beta_end: 0.02 30 | num_diffusion_timesteps: 1000 31 | 32 | training: 33 | batch_size: 96 # 128 34 | n_epochs: 10000 35 | n_iters: 5000000 36 | snapshot_freq: 5000 37 | validation_freq: 20000 38 | 39 | sampling: 40 | batch_size: 32 41 | last_only: True 42 | 43 | optim: 44 | weight_decay: 0.000 45 | optimizer: "Adam" 46 | lr: 0.0002 47 | beta1: 0.9 48 | amsgrad: false 49 | eps: 0.00000001 50 | grad_clip: 1.0 51 | -------------------------------------------------------------------------------- /layer_merge/models/ddpm_cfg/church.yml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: "LSUN" 3 | category: "church_outdoor" 4 | image_size: 256 5 | channels: 3 6 | logit_transform: false 7 | uniform_dequantization: false 8 | gaussian_dequantization: false 9 | random_flip: true 10 | rescaled: true 11 | num_workers: 32 12 | 13 | model: 14 | type: "simple" 15 | in_channels: 3 16 | out_ch: 3 17 | ch: 128 18 | ch_mult: [1, 1, 2, 2, 4, 4] 19 | num_res_blocks: 2 20 | attn_resolutions: [16, ] 21 | dropout: 0.0 22 | var_type: fixedsmall 23 | ema_rate: 0.999 24 | ema: True 25 | resamp_with_conv: True 26 | 27 | diffusion: 28 | beta_schedule: linear 29 | beta_start: 0.0001 30 | beta_end: 0.02 31 | num_diffusion_timesteps: 1000 32 | 33 | training: 34 | batch_size: 8 # 64 35 | n_epochs: 10000 36 | n_iters: 5000000 37 | snapshot_freq: 5000 38 | validation_freq: 2000 39 | 40 | sampling: 41 | batch_size: 16 42 | last_only: True 43 | 44 | optim: 45 | weight_decay: 0.000 46 | optimizer: "Adam" 47 | 
class FFHQ(Dataset):
    """Map-style dataset that reads encoded images from an LMDB database.

    The database is expected to hold a ``length`` record (the number of
    samples, stored as a UTF-8 decimal string) and one record per image
    under the key ``"{resolution}-{index:05d}"``.

    Args:
        path: Location of the LMDB environment, passed to ``lmdb.open``.
        transform: Callable applied to the opened PIL image; its result is
            returned as the sample.
        resolution: Value used as the key prefix when looking up images.

    Raises:
        IOError: If the environment cannot be opened or has no ``length``
            record.
    """

    def __init__(self, path, transform, resolution=8):
        self.env = lmdb.open(
            path,
            max_readers=32,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
        )

        if not self.env:
            raise IOError('Cannot open lmdb dataset', path)

        with self.env.begin(write=False) as txn:
            raw_length = txn.get('length'.encode('utf-8'))

        # A missing record comes back as None; report it explicitly instead
        # of failing with an AttributeError on ``None.decode``.
        if raw_length is None:
            raise IOError('lmdb dataset has no "length" entry', path)

        self.length = int(raw_length.decode('utf-8'))
        self.resolution = resolution
        self.transform = transform

    def __len__(self):
        """Return the sample count read from the ``length`` record."""
        return self.length

    def __getitem__(self, index):
        """Return ``(transformed_image, 0)`` for the sample at ``index``.

        Raises:
            IndexError: If the database has no record for ``index``. Raising
                ``IndexError`` (rather than letting the image decoder fail on
                an empty buffer) lets ``for sample in dataset`` terminate
                normally once the indices run out.
        """
        key = f'{self.resolution}-{str(index).zfill(5)}'.encode('utf-8')

        with self.env.begin(write=False) as txn:
            img_bytes = txn.get(key)

        if img_bytes is None:
            raise IndexError(
                f'no image stored for index {index} (key {key!r})'
            )

        buffer = BytesIO(img_bytes)
        img = Image.open(buffer)
        img = self.transform(img)
        # The dataset is unlabeled; a constant placeholder target is returned.
        target = 0

        return img, target
"""Packaging script for the ``layer_merge`` distribution."""
from setuptools import find_packages, setup

# Non-Python files bundled with the ``layer_merge`` package.
_PACKAGE_DATA = {
    "layer_merge": [
        "kim23efficient/*.txt",
        "kim24layermerge/*.txt",
        "models/ddpm_cfg/*.yml",
    ],
}

# (command name, "module:function" target) for each console script.
_CONSOLE_SCRIPTS = (
    ("lymg_kim23_dp", "layer_merge.kim23efficient.generate_tables:main"),
    ("lymg_kim23_imp", "layer_merge.kim23efficient.importance:main"),
    ("lymg_kim24_dp", "layer_merge.kim24layermerge.generate_tables:main"),
    ("lymg_kim24_imp", "layer_merge.kim24layermerge.importance:main"),
    ("lymg_kim24lyr_dp", "layer_merge.kim24layer.generate_tables:main"),
    ("lymg_kim24lyr_imp", "layer_merge.kim24layer.importance:main"),
    ("lymg_agg", "layer_merge.aggregate_imp:main"),
)

setup(
    name="layer_merge",
    version="0.1",
    packages=find_packages(),
    package_data=_PACKAGE_DATA,
    entry_points={
        # Rendered in the "name = module:function" form setuptools expects.
        "console_scripts": [
            f"{command} = {target}" for command, target in _CONSOLE_SCRIPTS
        ],
    },
)