├── .gitignore
├── DP.md
├── Diff-Pruning
    └── exp_code
    │   ├── .gitignore
    │   ├── LICENSE
    │   ├── README.md
    │   ├── calc_fid.py
    │   ├── compute_flops.py
    │   ├── compute_pruned_ssim_curve.py
    │   ├── compute_ssim.py
    │   ├── compute_ssim_vis.py
    │   ├── configs
    │       ├── bedroom.yml
    │       ├── celeba.yml
    │       ├── church.yml
    │       ├── cifar10.yml
    │       ├── cifar10_long.yml
    │       └── cifar10_pruning.yml
    │   ├── datasets
    │       ├── __init__.py
    │       ├── celeba.py
    │       ├── ffhq.py
    │       ├── lsun.py
    │       ├── utils.py
    │       └── vision.py
    │   ├── ddpm_full.txt
    │   ├── draw_ssim_pruned_curve.py
    │   ├── extract_cifar10.py
    │   ├── fid_score.py
    │   ├── finetune.py
    │   ├── finetune_simple.py
    │   ├── functions
    │       ├── __init__.py
    │       ├── ckpt_util.py
    │       ├── denoising.py
    │       └── losses.py
    │   ├── inception.py
    │   ├── main.py
    │   ├── models
    │       ├── diffusion.py
    │       └── ema.py
    │   ├── prune.py
    │   ├── prune_kd.py
    │   ├── prune_ssim.py
    │   ├── prune_test.py
    │   ├── run
    │       └── .keep
    │   ├── runners
    │       ├── __init__.py
    │       ├── diffusion.py
    │       └── diffusion_simple.py
    │   ├── scripts
    │       ├── fid_simple_cifar_kim24layer_hp.sh
    │       ├── fid_simple_cifar_our_hp.sh
    │       ├── finetune_bedroom_ddpm.sh
    │       ├── finetune_celeba_ddpm.sh
    │       ├── finetune_celeba_ddpm_kd.sh
    │       ├── finetune_church_ddpm.sh
    │       ├── finetune_cifar_ddpm.sh
    │       ├── finetune_cifar_ddpm_kd.sh
    │       ├── finetune_cifar_ddpm_random.sh
    │       ├── finetune_cifar_ddpm_taylor.sh
    │       ├── old
    │       │   ├── run_bedroom_sample_pratrained.sh
    │       │   ├── run_celeba_pruning_scratch.sh
    │       │   ├── run_celeba_pruning_taylor.sh
    │       │   ├── run_celeba_sample_pratrained.sh
    │       │   ├── run_church_pruning_taylor.sh
    │       │   ├── run_cifar_pruning_first_order_taylor.sh
    │       │   ├── run_cifar_pruning_magnitude.sh
    │       │   ├── run_cifar_pruning_random.sh
    │       │   ├── run_cifar_pruning_random_kd.sh
    │       │   ├── run_cifar_pruning_scratch.sh
    │       │   ├── run_cifar_pruning_second_order_taylor.sh
    │       │   ├── run_cifar_pruning_taylor.sh
    │       │   ├── run_cifar_pruning_taylor_kd.sh
    │       │   └── run_cifar_train.sh
    │       ├── prune_bedroom_ddpm.sh
    │       ├── prune_bedroom_ddpm_test.sh
    │       ├── prune_celeba_ddpm.sh
    │       ├── prune_celeba_ddpm_ssim.sh
    │       ├── prune_church_ddpm.sh
    │       ├── prune_church_ddpm_test.sh
    │       ├── prune_cifar_ddpm.sh
    │       ├── prune_cifar_ddpm_ssim.sh
    │       ├── prune_cifar_ddpm_test.sh
    │       ├── run_celeba.sh
    │       ├── sample_bedroom_ddpm_pretrained.sh
    │       ├── sample_bedroom_ddpm_pruning.sh
    │       ├── sample_celeba_ddpm_pruning.sh
    │       ├── sample_celeba_pretrained.sh
    │       ├── sample_church_ddpm_pruning.sh
    │       ├── sample_church_ddpm_pruning_old.sh
    │       ├── sample_church_ddpm_test.sh
    │       ├── sample_church_pretrained.sh
    │       ├── sample_cifar_ddpm_kim23efficient.sh
    │       ├── sample_cifar_ddpm_kim24layer.sh
    │       ├── sample_cifar_ddpm_kim24layermerge.sh
    │       ├── sample_cifar_ddpm_pretrained.sh
    │       ├── sample_cifar_ddpm_pruning.sh
    │       ├── sample_cifar_from_pruned_ddpm_kim23efficient.sh
    │       ├── sample_cifar_from_pruned_ddpm_kim24layer.sh
    │       ├── sample_cifar_from_pruned_ddpm_kim24layermerge.sh
    │       ├── sample_cifar_pretrained.sh
    │       ├── simple_celeba_our.sh
    │       ├── simple_cifar_from_pruned_kim23efficient.sh
    │       ├── simple_cifar_from_pruned_kim24layer.sh
    │       ├── simple_cifar_from_pruned_kim24layermerge.sh
    │       ├── simple_cifar_kim23efficient.sh
    │       ├── simple_cifar_kim24layer.sh
    │       ├── simple_cifar_kim24layer_hp.sh
    │       ├── simple_cifar_kim24layermerge.sh
    │       ├── simple_cifar_our.sh
    │       ├── simple_cifar_our_hp.sh
    │       ├── simple_cifar_our_test.sh
    │       ├── simple_rat_cifar_long_our.sh
    │       ├── simple_rat_cifar_our.sh
    │       ├── time_cifar_ddpm_kim23efficient.sh
    │       ├── time_cifar_ddpm_kim24layer.sh
    │       ├── time_cifar_ddpm_kim24layermerge.sh
    │       ├── time_cifar_ddpm_pretrained.sh
    │       └── time_cifar_ddpm_pruning.sh
    │   ├── tools
    │       ├── extract_cifar10.py
    │       └── transform_weights.py
    │   ├── torch_pruning
    │       ├── __init__.py
    │       ├── _helpers.py
    │       ├── dependency.py
    │       ├── importance.py
    │       ├── ops.py
    │       ├── pruner
    │       │   ├── __init__.py
    │       │   ├── algorithms
    │       │   │   ├── __init__.py
    │       │   │   ├── batchnorm_scale_pruner.py
    │       │   │   ├── group_norm_pruner.py
    │       │   │   ├── magnitude_based_pruner.py
    │       │   │   ├── metapruner.py
    │       │   │   ├── scaling_factor_pruner.py
    │       │   │   ├── scheduler.py
    │       │   │   └── taylor_pruner.py
    │       │   └── function.py
    │       └── utils
    │       │   ├── __init__.py
    │       │   ├── op_counter.py
    │       │   └── utils.py
    │   └── utils.py
├── EVALUATE.md
├── Efficient-CNN-Depth-Compression
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── asset
    │   ├── icml23.yml
    │   ├── requirements.txt
    │   └── title.png
    ├── config
    │   └── arguments.py
    ├── exps
    │   ├── aggregate_imp.py
    │   ├── generate_tables.py
    │   ├── inference_trt.py
    │   ├── main.py
    │   └── solve_dp.py
    ├── models
    │   ├── imagenet
    │   │   ├── __init__.py
    │   │   ├── mobilenetv2.py
    │   │   ├── mobilenetv2_com.py
    │   │   ├── mobilenetv2_ds.py
    │   │   ├── vgg.py
    │   │   └── vgg_com.py
    │   ├── model_op.py
    │   └── modules_trt.py
    └── utils
    │   ├── __init__.py
    │   ├── datasets.py
    │   ├── dp.py
    │   ├── loaders.py
    │   ├── logger.py
    │   ├── measure.py
    │   ├── misc.py
    │   ├── table
    │       ├── mbv2_1.0
    │       │   ├── opt_time_fish_gpu1_1228.csv
    │       │   └── time_fish_gpu1_1228.csv
    │       ├── mbv2_1.4
    │       │   ├── opt_time_fish_gpu1_0103.csv
    │       │   └── time_fish_gpu1_0103.csv
    │       └── vgg19_no_trt
    │       │   ├── opt_time_fish_gpu1_0317.csv
    │       │   └── time_fish_gpu1_0317.csv
    │   ├── train.py
    │   └── txt
    │       ├── class100.txt
    │       └── holdout_val.txt
├── HALP
    ├── Dockerfile
    ├── LICENSE
    ├── README.md
    ├── apex
    │   ├── .github
    │   │   └── ISSUE_TEMPLATE
    │   │   │   └── bug_report.md
    │   ├── .gitignore
    │   ├── .gitmodules
    │   ├── .nojekyll
    │   ├── LICENSE
    │   ├── README.md
    │   ├── apex
    │   │   ├── RNN
    │   │   │   ├── README.md
    │   │   │   ├── RNNBackend.py
    │   │   │   ├── __init__.py
    │   │   │   ├── cells.py
    │   │   │   └── models.py
    │   │   ├── __init__.py
    │   │   ├── _autocast_utils.py
    │   │   ├── amp
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── __version__.py
    │   │   │   ├── _amp_state.py
    │   │   │   ├── _initialize.py
    │   │   │   ├── _process_optimizer.py
    │   │   │   ├── amp.py
    │   │   │   ├── compat.py
    │   │   │   ├── frontend.py
    │   │   │   ├── handle.py
    │   │   │   ├── lists
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── functional_overrides.py
    │   │   │   │   ├── tensor_overrides.py
    │   │   │   │   └── torch_overrides.py
    │   │   │   ├── opt.py
    │   │   │   ├── rnn_compat.py
    │   │   │   ├── scaler.py
    │   │   │   ├── utils.py
    │   │   │   └── wrap.py
    │   │   ├── contrib
    │   │   │   ├── __init__.py
    │   │   │   ├── bottleneck
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── bottleneck.py
    │   │   │   │   └── halo_exchangers.py
    │   │   │   ├── clip_grad
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── clip_grad.py
    │   │   │   ├── conv_bias_relu
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── conv_bias_relu.py
    │   │   │   ├── csrc
    │   │   │   │   ├── bottleneck
    │   │   │   │   │   └── bottleneck.cpp
    │   │   │   │   ├── conv_bias_relu
    │   │   │   │   │   └── conv_bias_relu.cpp
    │   │   │   │   ├── cudnn_gbn
    │   │   │   │   │   ├── cudnn_gbn.cpp
    │   │   │   │   │   ├── norm_sample.cpp
    │   │   │   │   │   └── norm_sample.h
    │   │   │   │   ├── fmha
    │   │   │   │   │   ├── fmha_api.cpp
    │   │   │   │   │   └── src
    │   │   │   │   │   │   ├── fmha.h
    │   │   │   │   │   │   ├── fmha
    │   │   │   │   │   │       ├── gemm.h
    │   │   │   │   │   │       ├── gmem_tile.h
    │   │   │   │   │   │       ├── kernel_traits.h
    │   │   │   │   │   │       ├── mask.h
    │   │   │   │   │   │       ├── smem_tile.h
    │   │   │   │   │   │       ├── softmax.h
    │   │   │   │   │   │       └── utils.h
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_kernel_1xN_reload.h
    │   │   │   │   │   │   ├── fmha_dgrad_kernel_1xN_reload_nl.h
    │   │   │   │   │   │   ├── fmha_fill.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_kernel_1xN.h
    │   │   │   │   │   │   ├── fmha_kernel.h
    │   │   │   │   │   │   ├── fmha_noloop_reduce.cu
    │   │   │   │   │   │   └── fmha_utils.h
    │   │   │   │   ├── focal_loss
    │   │   │   │   │   ├── focal_loss_cuda.cpp
    │   │   │   │   │   └── focal_loss_cuda_kernel.cu
    │   │   │   │   ├── group_norm
    │   │   │   │   │   ├── group_norm_nhwc.cpp
    │   │   │   │   │   ├── group_norm_nhwc.h
    │   │   │   │   │   ├── group_norm_nhwc_bwd_one_pass.h
    │   │   │   │   │   ├── group_norm_nhwc_bwd_one_pass_kernel.cuh
    │   │   │   │   │   ├── group_norm_nhwc_bwd_two_pass.cu
    │   │   │   │   │   ├── group_norm_nhwc_fwd_one_pass.h
    │   │   │   │   │   ├── group_norm_nhwc_fwd_one_pass_kernel.cuh
    │   │   │   │   │   ├── group_norm_nhwc_fwd_two_pass.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_10.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_112.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_120.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_128.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_14.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_16.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_160.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_20.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_24.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_26.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_28.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_30.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_32.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_4.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_40.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_42.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_48.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_56.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_60.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_64.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_70.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_8.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_80.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_84.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_96.cu
    │   │   │   │   │   ├── group_norm_nhwc_one_pass_98.cu
    │   │   │   │   │   ├── group_norm_nhwc_op.cpp
    │   │   │   │   │   ├── macros.h
    │   │   │   │   │   └── traits.h
    │   │   │   │   ├── groupbn
    │   │   │   │   │   ├── batch_norm.cu
    │   │   │   │   │   ├── batch_norm.h
    │   │   │   │   │   ├── batch_norm_add_relu.cu
    │   │   │   │   │   ├── batch_norm_add_relu.h
    │   │   │   │   │   ├── cuda_utils.h
    │   │   │   │   │   ├── interface.cpp
    │   │   │   │   │   ├── ipc.cu
    │   │   │   │   │   └── nhwc_batch_norm_kernel.h
    │   │   │   │   ├── index_mul_2d
    │   │   │   │   │   ├── index_mul_2d_cuda.cpp
    │   │   │   │   │   └── index_mul_2d_cuda_kernel.cu
    │   │   │   │   ├── layer_norm
    │   │   │   │   │   ├── ln.h
    │   │   │   │   │   ├── ln_api.cpp
    │   │   │   │   │   ├── ln_bwd_kernels.cuh
    │   │   │   │   │   ├── ln_bwd_semi_cuda_kernel.cu
    │   │   │   │   │   ├── ln_fwd_cuda_kernel.cu
    │   │   │   │   │   ├── ln_fwd_kernels.cuh
    │   │   │   │   │   ├── ln_kernel_traits.h
    │   │   │   │   │   └── ln_utils.cuh
    │   │   │   │   ├── multihead_attn
    │   │   │   │   │   ├── additive_masked_softmax_dropout_cuda.cu
    │   │   │   │   │   ├── dropout.cuh
    │   │   │   │   │   ├── encdec_multihead_attn_cuda.cu
    │   │   │   │   │   ├── encdec_multihead_attn_norm_add_cuda.cu
    │   │   │   │   │   ├── layer_norm.cuh
    │   │   │   │   │   ├── masked_softmax_dropout_cuda.cu
    │   │   │   │   │   ├── multihead_attn_frontend.cpp
    │   │   │   │   │   ├── philox.cuh
    │   │   │   │   │   ├── self_multihead_attn_bias_additive_mask_cuda.cu
    │   │   │   │   │   ├── self_multihead_attn_bias_cuda.cu
    │   │   │   │   │   ├── self_multihead_attn_cuda.cu
    │   │   │   │   │   ├── self_multihead_attn_norm_add_cuda.cu
    │   │   │   │   │   ├── softmax.cuh
    │   │   │   │   │   └── strided_batched_gemm.cuh
    │   │   │   │   ├── nccl_p2p
    │   │   │   │   │   ├── nccl_p2p.cpp
    │   │   │   │   │   ├── nccl_p2p_cuda.cu
    │   │   │   │   │   ├── nccl_p2p_cuda.cuh
    │   │   │   │   │   ├── nccl_version.cpp
    │   │   │   │   │   └── nccl_version_check.cu
    │   │   │   │   ├── optimizers
    │   │   │   │   │   ├── fused_adam_cuda.cpp
    │   │   │   │   │   ├── fused_adam_cuda_kernel.cu
    │   │   │   │   │   ├── fused_lamb_cuda.cpp
    │   │   │   │   │   ├── fused_lamb_cuda_kernel.cu
    │   │   │   │   │   ├── multi_tensor_distopt_adam.cpp
    │   │   │   │   │   ├── multi_tensor_distopt_adam_kernel.cu
    │   │   │   │   │   ├── multi_tensor_distopt_lamb.cpp
    │   │   │   │   │   └── multi_tensor_distopt_lamb_kernel.cu
    │   │   │   │   ├── peer_memory
    │   │   │   │   │   ├── peer_memory.cpp
    │   │   │   │   │   ├── peer_memory_cuda.cu
    │   │   │   │   │   └── peer_memory_cuda.cuh
    │   │   │   │   ├── transducer
    │   │   │   │   │   ├── transducer_joint.cpp
    │   │   │   │   │   ├── transducer_joint_kernel.cu
    │   │   │   │   │   ├── transducer_loss.cpp
    │   │   │   │   │   └── transducer_loss_kernel.cu
    │   │   │   │   └── xentropy
    │   │   │   │   │   ├── interface.cpp
    │   │   │   │   │   └── xentropy_kernel.cu
    │   │   │   ├── cudnn_gbn
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── batch_norm.py
    │   │   │   ├── examples
    │   │   │   │   └── multihead_attn
    │   │   │   │   │   ├── func_test_multihead_attn.py
    │   │   │   │   │   └── perf_test_multihead_attn.py
    │   │   │   ├── fmha
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── fmha.py
    │   │   │   ├── focal_loss
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── focal_loss.py
    │   │   │   ├── group_norm
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── group_norm.py
    │   │   │   ├── groupbn
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── batch_norm.py
    │   │   │   ├── index_mul_2d
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── index_mul_2d.py
    │   │   │   ├── layer_norm
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── layer_norm.py
    │   │   │   ├── multihead_attn
    │   │   │   │   ├── MHA_bwd.png
    │   │   │   │   ├── MHA_fwd.png
    │   │   │   │   ├── README.md
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── encdec_multihead_attn.py
    │   │   │   │   ├── encdec_multihead_attn_func.py
    │   │   │   │   ├── fast_encdec_multihead_attn_func.py
    │   │   │   │   ├── fast_encdec_multihead_attn_norm_add_func.py
    │   │   │   │   ├── fast_self_multihead_attn_func.py
    │   │   │   │   ├── fast_self_multihead_attn_norm_add_func.py
    │   │   │   │   ├── mask_softmax_dropout_func.py
    │   │   │   │   ├── self_multihead_attn.py
    │   │   │   │   └── self_multihead_attn_func.py
    │   │   │   ├── openfold_triton
    │   │   │   │   ├── README.md
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── _layer_norm_backward_kernels.py
    │   │   │   │   ├── _layer_norm_config_ampere.py
    │   │   │   │   ├── _layer_norm_config_hopper.py
    │   │   │   │   ├── _layer_norm_forward_kernels.py
    │   │   │   │   ├── _mha_kernel.py
    │   │   │   │   ├── fused_adam_swa.py
    │   │   │   │   ├── layer_norm.py
    │   │   │   │   └── mha.py
    │   │   │   ├── optimizers
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── distributed_fused_adam.py
    │   │   │   │   ├── distributed_fused_lamb.py
    │   │   │   │   ├── fp16_optimizer.py
    │   │   │   │   ├── fused_adam.py
    │   │   │   │   ├── fused_lamb.py
    │   │   │   │   └── fused_sgd.py
    │   │   │   ├── peer_memory
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── peer_halo_exchanger_1d.py
    │   │   │   │   └── peer_memory.py
    │   │   │   ├── sparsity
    │   │   │   │   ├── COPYRIGHT
    │   │   │   │   ├── README.md
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── asp.py
    │   │   │   │   ├── permutation_lib.py
    │   │   │   │   ├── permutation_search_kernels
    │   │   │   │   │   ├── CUDA_kernels
    │   │   │   │   │   │   └── permutation_search_kernels.cu
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── call_permutation_search_kernels.py
    │   │   │   │   │   ├── channel_swap.py
    │   │   │   │   │   ├── exhaustive_search.py
    │   │   │   │   │   └── permutation_utilities.py
    │   │   │   │   ├── permutation_tests
    │   │   │   │   │   ├── README.md
    │   │   │   │   │   ├── ablation_studies.sh
    │   │   │   │   │   ├── permutation_test.py
    │   │   │   │   │   ├── runtime_table.sh
    │   │   │   │   │   └── unstructured_study.sh
    │   │   │   │   ├── sparse_masklib.py
    │   │   │   │   └── test
    │   │   │   │   │   ├── checkpointing_test_part1.py
    │   │   │   │   │   ├── checkpointing_test_part2.py
    │   │   │   │   │   ├── checkpointing_test_reference.py
    │   │   │   │   │   ├── test_permutation_application.py
    │   │   │   │   │   └── toy_problem.py
    │   │   │   ├── test
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── bottleneck
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_bottleneck_module.py
    │   │   │   │   ├── clip_grad
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_clip_grad.py
    │   │   │   │   ├── conv_bias_relu
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_conv_bias_relu.py
    │   │   │   │   ├── cudnn_gbn
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_cudnn_gbn_with_two_gpus.py
    │   │   │   │   ├── fmha
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_fmha.py
    │   │   │   │   ├── focal_loss
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_focal_loss.py
    │   │   │   │   ├── fused_dense
    │   │   │   │   │   └── test_fused_dense.py
    │   │   │   │   ├── group_norm
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_group_norm.py
    │   │   │   │   ├── index_mul_2d
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_index_mul_2d.py
    │   │   │   │   ├── layer_norm
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_fast_layer_norm.py
    │   │   │   │   ├── multihead_attn
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── test_encdec_multihead_attn.py
    │   │   │   │   │   ├── test_encdec_multihead_attn_norm_add.py
    │   │   │   │   │   ├── test_fast_self_multihead_attn_bias.py
    │   │   │   │   │   ├── test_mha_fused_softmax.py
    │   │   │   │   │   ├── test_self_multihead_attn.py
    │   │   │   │   │   └── test_self_multihead_attn_norm_add.py
    │   │   │   │   ├── optimizers
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── test_dist_adam.py
    │   │   │   │   │   └── test_distributed_fused_lamb.py
    │   │   │   │   ├── peer_memory
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_peer_halo_exchange_module.py
    │   │   │   │   ├── transducer
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── test_transducer_joint.py
    │   │   │   │   │   └── test_transducer_loss.py
    │   │   │   │   └── xentropy
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── test_label_smoothing.py
    │   │   │   ├── transducer
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── _transducer_ref.py
    │   │   │   │   └── transducer.py
    │   │   │   └── xentropy
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── softmax_xentropy.py
    │   │   ├── fp16_utils
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── fp16_optimizer.py
    │   │   │   ├── fp16util.py
    │   │   │   └── loss_scaler.py
    │   │   ├── fused_dense
    │   │   │   ├── __init__.py
    │   │   │   └── fused_dense.py
    │   │   ├── mlp
    │   │   │   ├── __init__.py
    │   │   │   └── mlp.py
    │   │   ├── multi_tensor_apply
    │   │   │   ├── __init__.py
    │   │   │   └── multi_tensor_apply.py
    │   │   ├── normalization
    │   │   │   ├── __init__.py
    │   │   │   └── fused_layer_norm.py
    │   │   ├── optimizers
    │   │   │   ├── __init__.py
    │   │   │   ├── fused_adagrad.py
    │   │   │   ├── fused_adam.py
    │   │   │   ├── fused_lamb.py
    │   │   │   ├── fused_mixed_precision_lamb.py
    │   │   │   ├── fused_novograd.py
    │   │   │   └── fused_sgd.py
    │   │   ├── parallel
    │   │   │   ├── LARC.py
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── distributed.py
    │   │   │   ├── multiproc.py
    │   │   │   ├── optimized_sync_batchnorm.py
    │   │   │   ├── optimized_sync_batchnorm_kernel.py
    │   │   │   ├── sync_batchnorm.py
    │   │   │   └── sync_batchnorm_kernel.py
    │   │   └── transformer
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── _data
    │   │   │       ├── __init__.py
    │   │   │       └── _batchsampler.py
    │   │   │   ├── _ucc_util.py
    │   │   │   ├── amp
    │   │   │       ├── __init__.py
    │   │   │       └── grad_scaler.py
    │   │   │   ├── enums.py
    │   │   │   ├── functional
    │   │   │       ├── __init__.py
    │   │   │       └── fused_softmax.py
    │   │   │   ├── layers
    │   │   │       ├── __init__.py
    │   │   │       └── layer_norm.py
    │   │   │   ├── log_util.py
    │   │   │   ├── microbatches.py
    │   │   │   ├── parallel_state.py
    │   │   │   ├── pipeline_parallel
    │   │   │       ├── __init__.py
    │   │   │       ├── _timers.py
    │   │   │       ├── p2p_communication.py
    │   │   │       ├── schedules
    │   │   │       │   ├── __init__.py
    │   │   │       │   ├── common.py
    │   │   │       │   ├── fwd_bwd_no_pipelining.py
    │   │   │       │   ├── fwd_bwd_pipelining_with_interleaving.py
    │   │   │       │   └── fwd_bwd_pipelining_without_interleaving.py
    │   │   │       └── utils.py
    │   │   │   ├── tensor_parallel
    │   │   │       ├── __init__.py
    │   │   │       ├── cross_entropy.py
    │   │   │       ├── data.py
    │   │   │       ├── layers.py
    │   │   │       ├── mappings.py
    │   │   │       ├── memory.py
    │   │   │       ├── random.py
    │   │   │       └── utils.py
    │   │   │   ├── testing
    │   │   │       ├── __init__.py
    │   │   │       ├── arguments.py
    │   │   │       ├── commons.py
    │   │   │       ├── distributed_test_base.py
    │   │   │       ├── global_vars.py
    │   │   │       ├── standalone_bert.py
    │   │   │       ├── standalone_gpt.py
    │   │   │       └── standalone_transformer_lm.py
    │   │   │   └── utils.py
    │   ├── csrc
    │   │   ├── amp_C_frontend.cpp
    │   │   ├── compat.h
    │   │   ├── flatten_unflatten.cpp
    │   │   ├── fused_dense.cpp
    │   │   ├── fused_dense_cuda.cu
    │   │   ├── layer_norm_cuda.cpp
    │   │   ├── layer_norm_cuda_kernel.cu
    │   │   ├── megatron
    │   │   │   ├── fused_weight_gradient_dense.cpp
    │   │   │   ├── fused_weight_gradient_dense_16bit_prec_cuda.cu
    │   │   │   ├── fused_weight_gradient_dense_cuda.cu
    │   │   │   ├── generic_scaled_masked_softmax.cpp
    │   │   │   ├── generic_scaled_masked_softmax.h
    │   │   │   ├── generic_scaled_masked_softmax_cuda.cu
    │   │   │   ├── scaled_masked_softmax.cpp
    │   │   │   ├── scaled_masked_softmax.h
    │   │   │   ├── scaled_masked_softmax_cuda.cu
    │   │   │   ├── scaled_softmax.cpp
    │   │   │   ├── scaled_softmax_cuda.cu
    │   │   │   ├── scaled_upper_triang_masked_softmax.cpp
    │   │   │   ├── scaled_upper_triang_masked_softmax.h
    │   │   │   └── scaled_upper_triang_masked_softmax_cuda.cu
    │   │   ├── mlp.cpp
    │   │   ├── mlp_cuda.cu
    │   │   ├── multi_tensor_adagrad.cu
    │   │   ├── multi_tensor_adam.cu
    │   │   ├── multi_tensor_apply.cuh
    │   │   ├── multi_tensor_axpby_kernel.cu
    │   │   ├── multi_tensor_l2norm_kernel.cu
    │   │   ├── multi_tensor_l2norm_kernel_mp.cu
    │   │   ├── multi_tensor_l2norm_scale_kernel.cu
    │   │   ├── multi_tensor_lamb.cu
    │   │   ├── multi_tensor_lamb_mp.cu
    │   │   ├── multi_tensor_lamb_stage_1.cu
    │   │   ├── multi_tensor_lamb_stage_2.cu
    │   │   ├── multi_tensor_novograd.cu
    │   │   ├── multi_tensor_scale_kernel.cu
    │   │   ├── multi_tensor_sgd_kernel.cu
    │   │   ├── static_switch.h
    │   │   ├── syncbn.cpp
    │   │   ├── type_shim.h
    │   │   ├── update_scale_hysteresis.cu
    │   │   └── welford.cu
    │   ├── docs
    │   │   ├── Makefile
    │   │   └── source
    │   │   │   ├── _static
    │   │   │       ├── css
    │   │   │       │   └── pytorch_theme.css
    │   │   │       └── img
    │   │   │       │   └── nv-pytorch2.png
    │   │   │   ├── _templates
    │   │   │       └── layout.html
    │   │   │   ├── advanced.rst
    │   │   │   ├── amp.rst
    │   │   │   ├── conf.py
    │   │   │   ├── fp16_utils.rst
    │   │   │   ├── index.rst
    │   │   │   ├── layernorm.rst
    │   │   │   ├── optimizers.rst
    │   │   │   └── parallel.rst
    │   ├── examples
    │   │   ├── README.md
    │   │   ├── dcgan
    │   │   │   ├── README.md
    │   │   │   └── main_amp.py
    │   │   ├── docker
    │   │   │   ├── Dockerfile
    │   │   │   └── README.md
    │   │   ├── imagenet
    │   │   │   ├── README.md
    │   │   │   └── main_amp.py
    │   │   └── simple
    │   │   │   └── distributed
    │   │   │       ├── README.md
    │   │   │       ├── distributed_data_parallel.py
    │   │   │       └── run.sh
    │   ├── pyproject.toml
    │   ├── requirements.txt
    │   ├── requirements_dev.txt
    │   ├── setup.py
    │   └── tests
    │   │   ├── L0
    │   │       ├── run_amp
    │   │       │   ├── __init__.py
    │   │       │   ├── test_add_param_group.py
    │   │       │   ├── test_basic_casts.py
    │   │       │   ├── test_cache.py
    │   │       │   ├── test_checkpointing.py
    │   │       │   ├── test_fused_sgd.py
    │   │       │   ├── test_larc.py
    │   │       │   ├── test_multi_tensor_axpby.py
    │   │       │   ├── test_multi_tensor_l2norm.py
    │   │       │   ├── test_multi_tensor_scale.py
    │   │       │   ├── test_multi_tensor_unscale_l2norm.py
    │   │       │   ├── test_multiple_models_optimizers_losses.py
    │   │       │   ├── test_promotion.py
    │   │       │   ├── test_rnn.py
    │   │       │   ├── test_update_scale_hysteresis.py
    │   │       │   └── utils.py
    │   │       ├── run_deprecated
    │   │       │   └── test_deprecated_warning.py
    │   │       ├── run_fp16util
    │   │       │   ├── __init__.py
    │   │       │   └── test_fp16util.py
    │   │       ├── run_fused_layer_norm
    │   │       │   └── test_fused_layer_norm.py
    │   │       ├── run_mlp
    │   │       │   └── test_mlp.py
    │   │       ├── run_optimizers
    │   │       │   ├── __init__.py
    │   │       │   ├── test_adam.py
    │   │       │   ├── test_fused_novograd.py
    │   │       │   ├── test_fused_optimizer.py
    │   │       │   └── test_lamb.py
    │   │       ├── run_test.py
    │   │       └── run_transformer
    │   │       │   ├── __init__.py
    │   │       │   ├── gpt_scaling_test.py
    │   │       │   ├── test_batch_sampler.py
    │   │       │   ├── test_bert_minimal.py
    │   │       │   ├── test_cross_entropy.py
    │   │       │   ├── test_data.py
    │   │       │   ├── test_dynamic_batchsize.py
    │   │       │   ├── test_fused_softmax.py
    │   │       │   ├── test_gpt_minimal.py
    │   │       │   ├── test_layers.py
    │   │       │   ├── test_mapping.py
    │   │       │   ├── test_microbatches.py
    │   │       │   ├── test_p2p_comm.py
    │   │       │   ├── test_parallel_state.py
    │   │       │   ├── test_pipeline_parallel_fwd_bwd.py
    │   │       │   ├── test_random.py
    │   │       │   └── test_transformer_utils.py
    │   │   ├── L1
    │   │       ├── common
    │   │       │   ├── compare.py
    │   │       │   ├── main_amp.py
    │   │       │   └── run_test.sh
    │   │       ├── cross_product
    │   │       │   └── run.sh
    │   │       ├── cross_product_distributed
    │   │       │   └── run.sh
    │   │       └── transformer
    │   │       │   └── pipeline_parallel_fwd_bwd_ucc_async.py
    │   │   ├── distributed
    │   │       ├── DDP
    │   │       │   ├── ddp_race_condition_test.py
    │   │       │   └── run_race_test.sh
    │   │       ├── amp_master_params
    │   │       │   ├── amp_master_params.py
    │   │       │   ├── compare.py
    │   │       │   └── run.sh
    │   │       └── synced_batchnorm
    │   │       │   ├── python_single_gpu_unit_test.py
    │   │       │   ├── single_gpu_unit_test.py
    │   │       │   ├── test_batchnorm1d.py
    │   │       │   ├── test_groups.py
    │   │       │   ├── two_gpu_test_different_batch_size.py
    │   │       │   ├── two_gpu_unit_test.py
    │   │       │   └── unit_test.sh
    │   │   └── docker_extension_builds
    │   │       └── run.sh
    ├── assets
    │   └── pipeline.png
    ├── configs
    │   ├── exp_configs
    │   │   ├── rn34_imagenet_baseline.yaml
    │   │   ├── rn34_imagenet_baseline_eval.yaml
    │   │   ├── rn34_imagenet_prune_rat0.2.yaml
    │   │   ├── rn34_imagenet_prune_rat0.25.yaml
    │   │   ├── rn34_imagenet_prune_rat0.3.yaml
    │   │   ├── rn34_imagenet_prune_rat0.35.yaml
    │   │   ├── rn34_imagenet_prune_rat0.45.yaml
    │   │   ├── rn50_imagenet_baseline.yaml
    │   │   ├── rn50_imagenet_baseline_eval.yaml
    │   │   ├── rn50_imagenet_prune_rat0.1.yaml
    │   │   ├── rn50_imagenet_prune_rat0.15.yaml
    │   │   ├── rn50_imagenet_prune_rat0.2.yaml
    │   │   └── rn50_imagenet_prune_rat0.45.yaml
    │   └── prune_configs
    │   │   ├── rn34_fmap.json
    │   │   ├── rn34_prune_layer.json
    │   │   ├── rn50_fmap.json
    │   │   ├── rn50_prune_layer.json
    │   │   ├── rtx2080_rn34_prune_groups.json
    │   │   ├── rtx2080_rn50_prune_groups.json
    │   │   ├── titanv_rn34_prune_groups.json
    │   │   └── titanv_rn50_prune_groups.json
    ├── data
    │   └── dataloaders.py
    ├── main.py
    ├── models
    │   ├── __init__.py
    │   ├── create_model.py
    │   ├── resnet.py
    │   ├── resnet_fused.py
    │   └── resnet_pruned.py
    ├── multiproc.py
    ├── profile_halp.py
    ├── prune
    │   ├── cost.py
    │   ├── importance.py
    │   ├── prune_config.py
    │   └── pruner.py
    ├── train
    │   ├── lr_schedule.py
    │   ├── optimizer.py
    │   └── training.py
    └── utils
    │   ├── mixup.py
    │   ├── model_summary.py
    │   ├── smoothing.py
    │   └── utils.py
├── LICENSE
├── README.md
├── asset
    ├── short_demo.png
    └── title.png
├── examples
    ├── .gitignore
    ├── 0_mbv2_demo.ipynb
    ├── 1_ddpm_demo.ipynb
    ├── ckpt
    │   └── .keep
    ├── ddpm_cifar10.yml
    ├── imagenet1000clsidx_to_list.txt
    └── images
    │   └── husky.png
├── layer_merge
    ├── __init__.py
    ├── aggregate_imp.py
    ├── ddpm_trainer.py
    ├── kim23efficient
    │   ├── __init__.py
    │   ├── datasets.py
    │   ├── generate_tables.py
    │   ├── holdout_val.txt
    │   └── importance.py
    ├── kim24layer
    │   ├── __init__.py
    │   ├── datasets.py
    │   ├── generate_tables.py
    │   ├── holdout_train.txt
    │   ├── holdout_val.txt
    │   └── importance.py
    ├── kim24layermerge
    │   ├── __init__.py
    │   ├── datasets.py
    │   ├── generate_tables.py
    │   ├── holdout_train.txt
    │   ├── holdout_val.txt
    │   └── importance.py
    ├── measure.py
    ├── models
    │   ├── __init__.py
    │   ├── ddpm.py
    │   ├── ddpm_cfg
    │   │   ├── __init__.py
    │   │   ├── bedroom.yml
    │   │   ├── celeba.yml
    │   │   ├── church.yml
    │   │   └── cifar10.yml
    │   ├── ddpm_datasets
    │   │   ├── __init__.py
    │   │   ├── celeba.py
    │   │   ├── cifar.py
    │   │   ├── cifar10_holdout_train.txt
    │   │   ├── cifar10_holdout_val.txt
    │   │   ├── ffhq.py
    │   │   ├── lsun.py
    │   │   ├── utils.py
    │   │   └── vision.py
    │   ├── ddpm_layer.py
    │   ├── ddpm_merged.py
    │   ├── ddpm_merged_layer.py
    │   ├── merge_op.py
    │   ├── mobilenetv2.py
    │   ├── mobilenetv2_layer.py
    │   ├── mobilenetv2_merged_layer.py
    │   ├── resnet.py
    │   ├── resnet_layer.py
    │   ├── resnet_merged.py
    │   └── resnet_merged_layer.py
    └── trainer.py
├── lymg.yml
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | output/
 2 | __pycache__
 3 | *.pkl
 4 | *.DS_Store
 5 | LUT*/
 6 | model_ckpt/
 7 | cache/
 8 | run_slurm/
 9 | slurm/
10 | output*
11 | rn34_output*
12 | rn50_output*
13 | rn34_rtx2080*
14 | rn50_rtx2080*
15 | model*.txt
16 | test_skip.py
17 | generate_lst.py
18 | generate_group.py
19 | measure_halp.py
20 | merge_rtx2080.py
21 | test.py
22 | plots/
23 | pretrained/
24 | my_util/
25 | msr*
26 | solve*.sh
27 | *.egg-info
28 | ddpm_chk.txt
29 | ddpm_mgd.txt
30 | .ipynb_checkpoints/
31 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | __pycache__
3 | *.log
4 | run/finetune_simple_v2/
5 | run/*.npz
6 | run/sample_*
7 | run/time_*
8 | data
9 | *.png


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Jiaming Song
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/calc_fid.py:
--------------------------------------------------------------------------------
 1 | from cleanfid import fid
 2 | import argparse
 3 | parser = argparse.ArgumentParser(description=globals()["__doc__"])
 4 | parser.add_argument('--path1', type=str, required=True, help='Path to the images')
 5 | parser.add_argument('--path2', type=str, required=True, help='Path to the images')
 6 | args = parser.parse_args()
 7 | 
 8 | if args.path2=="cifar10":
 9 |     score = fid.compute_fid(args.path1, dataset_name="cifar10", dataset_res=32, dataset_split="train")
10 | else:
11 |     score = fid.compute_fid(args.path1, args.path2)
12 | print("FID: ", score)


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/compute_flops.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import random, os
 3 | import argparse
 4 | from PIL import Image
 5 | import torchvision
 6 | import numpy as np
 7 | import pytorch_msssim
 8 | from utils import UnlabeledImageFolder
 9 | from tqdm import tqdm 
10 | import torch_pruning as tp
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--restore_from', type=str, required=True)
13 | args = parser.parse_args()
14 | 
15 | model = torch.load(args.restore_from, map_location='cpu')[0]
16 | example_inputs = {'x': torch.randn(1, 3, 32, 32), 't': torch.ones(1)}
17 | macs, params = tp.utils.count_ops_and_params(model, example_inputs)
18 | print("model: {}, macs: {} G, params: {} M".format(args.restore_from, macs/1e9, params/1e6))
19 | 
20 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/compute_pruned_ssim_curve.py:
--------------------------------------------------------------------------------
 1 | import pytorch_msssim 
 2 | import os
 3 | import torch
 4 | from PIL import Image
 5 | import torchvision
 6 | 
 7 | base_folder_name = 'run/prune_ssim_2/0'
 8 | folder_name = [os.path.join('run/prune_ssim_2', '{}'.format(k)) for k in range(50, 1000+1, 50)]
 9 | n_samples = 32
10 | # test ssim for each folder
11 | folder_ssim = []
12 | for f in folder_name:
13 |     ssim_list = []
14 |     for img_id in range(n_samples):
15 |         img1 = Image.open(os.path.join(base_folder_name, f'{img_id}.png'))
16 |         img2 = Image.open(os.path.join(f, f'{img_id}.png'))
17 |         img1_tensor = torchvision.transforms.ToTensor()(img1)
18 |         img2_tensor = torchvision.transforms.ToTensor()(img2)
19 |         img1_tensor = img1_tensor.unsqueeze(0)
20 |         img2_tensor = img2_tensor.unsqueeze(0)
21 |         ssim = pytorch_msssim.ssim(img1_tensor, img2_tensor, data_range=1.0, size_average=True)
22 |         ssim_list.append(ssim)
23 |     ssim = sum(ssim_list) / len(ssim_list)
24 |     folder_ssim.append(ssim.item())
25 | print(folder_ssim)


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/compute_ssim.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import random, os
 3 | import argparse
 4 | from PIL import Image
 5 | import torchvision
 6 | import numpy as np
 7 | import pytorch_msssim
 8 | from utils import UnlabeledImageFolder
 9 | from tqdm import tqdm 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--path', type=str, required=True, nargs='+')
12 | args = parser.parse_args()
13 | 
14 | # generate random index
15 | nrow = 16
16 | img_index = random.sample(list(range(50000)), nrow*nrow)
17 | path1 = args.path[0]
18 | path2 = args.path[1]
19 | print(path1, path2)
20 | img_dst1 = UnlabeledImageFolder(path1, transform=torchvision.transforms.ToTensor(), exts=["png"])
21 | img_dst2 = UnlabeledImageFolder(path2, transform=torchvision.transforms.ToTensor(), exts=["png"])
22 | print(len(img_dst1), len(img_dst2))
23 | 
24 | loader1 = torch.utils.data.DataLoader(
25 |     img_dst1,
26 |     batch_size=100,
27 |     shuffle=False,
28 |     num_workers=4,
29 |     drop_last=False,
30 | )
31 | loader2 = torch.utils.data.DataLoader(
32 |     img_dst2,
33 |     batch_size=100,
34 |     shuffle=False,
35 |     num_workers=4,
36 |     drop_last=False,
37 | )
38 | 
39 | with torch.no_grad():
40 |     ssim_list = []
41 |     mse_list = []
42 |     for i, (img1, img2) in tqdm(enumerate(zip(loader1, loader2))):
43 |         ssim = pytorch_msssim.ssim(img1.cuda(), img2.cuda(), data_range=1.0, size_average=False)
44 |         ssim_list.append(ssim.cpu())
45 |         mse = torch.nn.functional.mse_loss(img1.cuda(), img2.cuda(), reduction='none').mean(dim=(1,2,3))
46 |         mse_list.append(mse.cpu())
47 | 
48 |     ssim = torch.cat(ssim_list, dim=0)
49 |     mse = torch.cat(mse_list, dim=0)
50 |     ssim_avg = ssim.mean()
51 |     mse_avg = mse.mean()
52 |     print("path1: {}, path2: {}, ssim: {}, mse: {}".format(path1, path2, ssim_avg, mse_avg))
53 | 
54 |     


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/compute_ssim_vis.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import random, os
 3 | import argparse
 4 | from PIL import Image
 5 | import torchvision
 6 | import numpy as np
 7 | import pytorch_msssim
 8 | from utils import UnlabeledImageFolder
 9 | from tqdm import tqdm 
10 | img_ids = [159, 149, 144, 127, 86, 41]
11 | image_folder1 = 'run/sample_v2/bedroom_250k/image_samples/images/0'
12 | image_folder2 = 'run/sample_v2/bedroom_official/image_samples/images/0'
13 | base_img_id = 0
14 | ssim_list = []
15 | for iid in img_ids:
16 |     img1 = Image.open(os.path.join(image_folder1, f'{iid}.png'))
17 |     img2 = Image.open(os.path.join(image_folder2, f'{iid}.png'))
18 |     img1_tensor = torchvision.transforms.ToTensor()(img1).unsqueeze(0)
19 |     img2_tensor = torchvision.transforms.ToTensor()(img2).unsqueeze(0)
20 |     ssim = pytorch_msssim.ssim(img1_tensor, img2_tensor, data_range=1.0, size_average=True)
21 |     ssim_list.append(ssim.item())
22 | print(ssim_list)
23 |     


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/configs/bedroom.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "LSUN"
 3 |     category: "bedroom"
 4 |     image_size: 256
 5 |     channels: 3
 6 |     logit_transform: false
 7 |     uniform_dequantization: false
 8 |     gaussian_dequantization: false
 9 |     random_flip: true
10 |     rescaled: true
11 |     num_workers: 32
12 | 
13 | model:
14 |     type: "simple"
15 |     in_channels: 3
16 |     out_ch: 3
17 |     ch: 128
18 |     ch_mult: [1, 1, 2, 2, 4, 4]
19 |     num_res_blocks: 2
20 |     attn_resolutions: [16, ]
21 |     dropout: 0.0
22 |     var_type: fixedsmall
23 |     ema_rate: 0.999
24 |     ema: True
25 |     resamp_with_conv: True
26 | 
27 | diffusion:
28 |     beta_schedule: linear
29 |     beta_start: 0.0001
30 |     beta_end: 0.02
31 |     num_diffusion_timesteps: 1000
32 | 
33 | training:
34 |     batch_size: 8
35 |     n_epochs: 10000
36 |     n_iters: 5000000
37 |     snapshot_freq: 5000
38 |     validation_freq: 2000
39 | 
40 | sampling:
41 |     batch_size: 16
42 |     last_only: True
43 | 
44 | optim:
45 |     weight_decay: 0.000
46 |     optimizer: "Adam"
47 |     lr: 0.000002
48 |     beta1: 0.9
49 |     amsgrad: false
50 |     eps: 0.00000001
51 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/configs/celeba.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CELEBA"
 3 |     image_size: 64
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2, 4]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 96 # 128
34 |     n_epochs: 10000
35 |     n_iters: 5000000
36 |     snapshot_freq: 5000
37 |     validation_freq: 20000
38 | 
39 | sampling:
40 |     batch_size: 32
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.0002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/configs/church.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "LSUN"
 3 |     category: "church_outdoor"
 4 |     image_size: 256
 5 |     channels: 3
 6 |     logit_transform: false
 7 |     uniform_dequantization: false
 8 |     gaussian_dequantization: false
 9 |     random_flip: true
10 |     rescaled: true
11 |     num_workers: 32
12 | 
13 | model:
14 |     type: "simple"
15 |     in_channels: 3
16 |     out_ch: 3
17 |     ch: 128
18 |     ch_mult: [1, 1, 2, 2, 4, 4]
19 |     num_res_blocks: 2
20 |     attn_resolutions: [16, ]
21 |     dropout: 0.0
22 |     var_type: fixedsmall
23 |     ema_rate: 0.999
24 |     ema: True
25 |     resamp_with_conv: True
26 | 
27 | diffusion:
28 |     beta_schedule: linear
29 |     beta_start: 0.0001
30 |     beta_end: 0.02
31 |     num_diffusion_timesteps: 1000
32 | 
33 | training:
34 |     batch_size: 8 # 64
35 |     n_epochs: 10000
36 |     n_iters: 5000000
37 |     snapshot_freq: 5000
38 |     validation_freq: 2000
39 | 
40 | sampling:
41 |     batch_size: 16
42 |     last_only: True
43 | 
44 | optim:
45 |     weight_decay: 0.000
46 |     optimizer: "Adam"
47 |     lr: 0.00002
48 |     beta1: 0.9
49 |     amsgrad: false
50 |     eps: 0.00000001
51 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/configs/cifar10.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CIFAR10"
 3 |     image_size: 32
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 128
34 |     n_epochs: 256
35 |     n_iters: 100000
36 |     snapshot_freq: 50000
37 |     validation_freq: 2000
38 | 
39 | sampling:
40 |     batch_size: 64
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.0002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/configs/cifar10_long.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CIFAR10"
 3 |     image_size: 32
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 128
34 |     n_epochs: 512
35 |     n_iters: 200000
36 |     snapshot_freq: 100000
37 |     validation_freq: 2000
38 | 
39 | sampling:
40 |     batch_size: 64
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.0002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/configs/cifar10_pruning.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CIFAR10"
 3 |     image_size: 32
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 128
34 |     n_epochs: 10000
35 |     n_iters: 5000000
36 |     snapshot_freq: 5000
37 |     validation_freq: 2000
38 | 
39 | sampling:
40 |     batch_size: 64
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.00002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/datasets/ffhq.py:
--------------------------------------------------------------------------------
 1 | from io import BytesIO
 2 | 
 3 | import lmdb
 4 | from PIL import Image
 5 | from torch.utils.data import Dataset
 6 | 
 7 | 
 8 | class FFHQ(Dataset):
 9 |     def __init__(self, path, transform, resolution=8):
10 |         self.env = lmdb.open(
11 |             path,
12 |             max_readers=32,
13 |             readonly=True,
14 |             lock=False,
15 |             readahead=False,
16 |             meminit=False,
17 |         )
18 | 
19 |         if not self.env:
20 |             raise IOError('Cannot open lmdb dataset', path)
21 | 
22 |         with self.env.begin(write=False) as txn:
23 |             self.length = int(txn.get('length'.encode('utf-8')).decode('utf-8'))
24 | 
25 |         self.resolution = resolution
26 |         self.transform = transform
27 | 
28 |     def __len__(self):
29 |         return self.length
30 | 
31 |     def __getitem__(self, index):
32 |         with self.env.begin(write=False) as txn:
33 |             key = f'{self.resolution}-{str(index).zfill(5)}'.encode('utf-8')
34 |             img_bytes = txn.get(key)
35 | 
36 |         buffer = BytesIO(img_bytes)
37 |         img = Image.open(buffer)
38 |         img = self.transform(img)
39 |         target = 0
40 | 
41 |         return img, target


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/extract_cifar10.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torchvision
 3 | from torchvision.datasets import CIFAR10
 4 | from tqdm import tqdm
 5 | 
 6 | # Define the path to the folder where the images will be saved
 7 | save_path = 'data/cifar10/images'
 8 | 
 9 | # Create the folder if it doesn't exist
10 | if not os.path.exists(save_path):
11 |     os.makedirs(save_path)
12 | 
13 | # Load the CIFAR10 dataset
14 | dataset = CIFAR10(root='data/cifar10', train=True, download=True)
15 | 
16 | # Loop through the dataset and save each image to the folder
17 | for i in tqdm(range(len(dataset))):
18 |     image, label = dataset[i]
19 |     image_name = f'{i}.png'
20 |     image_path = os.path.join(save_path, image_name)
21 |     image.save(image_path)


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/functions/__init__.py:
--------------------------------------------------------------------------------
 1 | import torch.optim as optim
 2 | 
 3 | 
 4 | def get_optimizer(config, parameters):
 5 |     if config.optim.optimizer == 'Adam':
 6 |         return optim.Adam(parameters, lr=config.optim.lr, weight_decay=config.optim.weight_decay,
 7 |                           betas=(config.optim.beta1, 0.999), amsgrad=config.optim.amsgrad,
 8 |                           eps=config.optim.eps)
 9 |     elif config.optim.optimizer == 'RMSProp':
10 |         return optim.RMSprop(parameters, lr=config.optim.lr, weight_decay=config.optim.weight_decay)
11 |     elif config.optim.optimizer == 'SGD':
12 |         return optim.SGD(parameters, lr=config.optim.lr, momentum=0.9)
13 |     else:
14 |         raise NotImplementedError(
15 |             'Optimizer {} not understood.'.format(config.optim.optimizer))
16 | 
17 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/functions/losses.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | def noise_estimation_loss(model,
 5 |                           x0: torch.Tensor,
 6 |                           t: torch.LongTensor,
 7 |                           e: torch.Tensor,
 8 |                           b: torch.Tensor, keepdim=False):
 9 |     a = (1-b).cumprod(dim=0).index_select(0, t).view(-1, 1, 1, 1)
10 |     x = x0 * a.sqrt() + e * (1.0 - a).sqrt()
11 |     output = model(x, t.float())
12 |     if keepdim:
13 |         return (e - output).square().sum(dim=(1, 2, 3))
14 |     else:
15 |         return (e - output).square().sum(dim=(1, 2, 3)).mean(dim=0)
16 | 
17 | def noise_estimation_kd_loss(model,
18 |                              teacher,
19 |                           x0: torch.Tensor,
20 |                           t: torch.LongTensor,
21 |                           e: torch.Tensor,
22 |                           b: torch.Tensor, keepdim=False):
23 |     a = (1-b).cumprod(dim=0).index_select(0, t).view(-1, 1, 1, 1)
24 |     x = x0 * a.sqrt() + e * (1.0 - a).sqrt()
25 |     output = model(x, t.float())
26 |     with torch.no_grad():
27 |         teacher_output = teacher(x, t.float())
28 |     if keepdim:
29 |         return 0.7*(teacher_output - output).square().sum(dim=(1, 2, 3)) + 0.3 * (e - output).square().sum(dim=(1, 2, 3))
30 |     else:
31 |         return 0.7*(teacher_output - output).square().sum(dim=(1, 2, 3)).mean(dim=0) + 0.3 * (e - output).square().sum(dim=(1, 2, 3)).mean(dim=0)
32 | 
33 | 
34 | loss_registry = {
35 |     'simple': noise_estimation_loss,
36 | }
37 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/run/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/Diff-Pruning/exp_code/run/.keep


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/runners/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/Diff-Pruning/exp_code/runners/__init__.py


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_bedroom_ddpm.sh:
--------------------------------------------------------------------------------
 1 | python -B -m torch.distributed.launch --nproc_per_node=6 --master_port 22223 --use_env finetune.py \
 2 | --config bedroom.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune/bedroom_ddpm_$1_0.3_finetuned-continue-v4-2e-5 \
 7 | --doc post_training \
 8 | --skip_type uniform  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --load_pruned_model run/finetune/bedroom_ddpm_taylor_0.3_finetuned-continue-v3-2e-6/logs/post_training/ckpt_65000.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_celeba_ddpm.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config celeba.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_final/celeba_T=$1_finetuned \
 7 | --doc post_training \
 8 | --skip_type uniform  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --load_pruned_model "run/pruned_final/celeba_T=$1.pth" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_celeba_ddpm_kd.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config celeba.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_v2/celeba_ddpm_$1_0.3_finetuned_kd \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --kd \
13 | --load_pruned_model run/pruned/celeba_ddpm_$1_0.3.pth  \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_church_ddpm.sh:
--------------------------------------------------------------------------------
 1 | python -B -m torch.distributed.launch --nproc_per_node=4 --master_port 22223 --use_env finetune.py \
 2 | --config church.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune/church_ddpm_$1_0.3_finetuned \
 7 | --doc post_training \
 8 | --skip_type uniform  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --load_pruned_model run/pruned/church_ddpm_$1_0.3.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_v3/cifar10_ddpm_$1_finetuned_0.05T.pth \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --load_pruned_model run/pruned_v5/cifar10_pruned_$1.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm_kd.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_v2/cifar10_ddpm_$1_0.3_finetuned_kd \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --kd \
13 | --load_pruned_model run/pruned/cifar10_pruned_$1_0.3.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm_random.sh:
--------------------------------------------------------------------------------
 1 | python -B -m torch.distributed.launch --nproc_per_node=2 --master_port 22223 --use_env finetune.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune/cifar10_pruned_random_0.3_finetuned \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner random \
13 | --load_pruned_model run/pruned/cifar10_pruned_random_0.3.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/finetune_cifar_ddpm_taylor.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune/cifar10_pruned_taylor_0.3_real_x_finetuned \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner taylor \
13 | --load_pruned_model run/pruned/cifar10_pruned_taylor_0.3_real_x.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_bedroom_sample_pratrained.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config bedroom.yml \
 3 | --exp run/ddim_bedroom_official \
 4 | --sample \
 5 | --use_pretrained \
 6 | --timesteps 50 \
 7 | --eta 0 \
 8 | --ni \
 9 | --doc 50steps_quad \
10 | --skip_type quad  \
11 | --pruning_ratio 0.0 \
12 | --fid \
13 | --use_ema


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_celeba_pruning_scratch.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config celeba.yml \
 3 | --exp run/ddim_celeba_pruning_reinit \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner reinit \
13 | --taylor_batch_size 96 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_celeba_pruning_taylor.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config celeba.yml \
 3 | --exp run/ddim_celeba_pruning_taylor \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner taylor \
13 | --taylor_batch_size 96 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_celeba_sample_pratrained.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config celeba.yml \
 3 | --exp run/ddim_celeba_official \
 4 | --sample \
 5 | --use_pretrained \
 6 | --timesteps 100 \
 7 | --eta 0 \
 8 | --ni \
 9 | --doc 100steps_quad \
10 | --skip_type quad  \
11 | --pruning_ratio 0.0 \
12 | --fid \
13 | --use_ema \
14 | --restore_from run/cache/diffusion_models_converted/celeba/ckpt.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_church_pruning_taylor.sh:
--------------------------------------------------------------------------------
 1 | # Prune the church.yml model with the Taylor pruner at a 0.3 pruning ratio.
 2 | # The pruner matches the script name and the --exp directory
 3 | # (run/ddim_church_pruning_taylor); it previously said "random".
 4 | python -B prune.py \
 5 | --config church.yml \
 6 | --exp run/ddim_church_pruning_taylor \
 7 | --timesteps 100 \
 8 | --eta 0 \
 9 | --ni \
10 | --doc post_training \
11 | --skip_type quad \
12 | --pruning_ratio 0.3 \
13 | --use_ema \
14 | --use_pretrained \
15 | --pruner taylor


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_first_order_taylor.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_first_order_taylor \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner first_order_taylor \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_magnitude.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_magnitude \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner magnitude \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_random.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_random \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner random \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_random_kd.sh:
--------------------------------------------------------------------------------
 1 | # Run prune_kd.py on the cifar10.yml model with the random pruner (ratio 0.3).
 2 | # The --exp directory carries a _kd suffix so it does not collide with the
 3 | # directory used by run_cifar_pruning_random.sh (same convention as
 4 | # run_cifar_pruning_taylor_kd.sh).
 5 | python -B prune_kd.py \
 6 | --config cifar10.yml \
 7 | --exp run/ddim_cifar10_pruning_random_kd \
 8 | --timesteps 100 \
 9 | --eta 0 \
10 | --ni \
11 | --doc post_training \
12 | --skip_type quad \
13 | --pruning_ratio 0.3 \
14 | --use_ema \
15 | --use_pretrained \
16 | --pruner random


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_scratch.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_reinit \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner reinit \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_second_order_taylor.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_second_order_taylor \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner second_order_taylor \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_taylor.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_taylor \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner taylor \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_pruning_taylor_kd.sh:
--------------------------------------------------------------------------------
 1 | python -B prune_kd.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_pruning_taylor_kd \
 4 | --timesteps 100 \
 5 | --eta 0 \
 6 | --ni \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --pruner taylor \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/old/run_cifar_train.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/ddim_cifar10_train_v2 \
 4 | --use_pretrained \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc post_training_with_0.2_pruning_ratio_v2 \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.2 \
11 | --use_ema \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_bedroom_ddpm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Execute the Python script with the provided arguments
 4 | python -B prune.py \
 5 | --config "bedroom.yml" \
 6 | --timesteps "100" \
 7 | --eta "0" \
 8 | --ni \
 9 | --doc "post_training" \
10 | --skip_type "quad" \
11 | --pruning_ratio "0.3" \
12 | --use_ema \
13 | --use_pretrained \
14 | --pruner "$1" \
15 | --save_pruned_model "run/pruned/bedroom_ddpm_$1_0.3.pth" \
16 | --taylor_batch_size "4"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_bedroom_ddpm_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Execute the Python script with the provided arguments
 4 | python -B prune_test.py \
 5 | --config "bedroom.yml" \
 6 | --timesteps "100" \
 7 | --eta "0" \
 8 | --ni \
 9 | --doc "post_training" \
10 | --skip_type "quad" \
11 | --pruning_ratio "0.05" \
12 | --use_ema \
13 | --use_pretrained \
14 | --pruner "$1" \
15 | --save_pruned_model "run/pruned_test/bedroom_ddpm_$1.pth" \
16 | --taylor_batch_size "4"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_celeba_ddpm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Execute the Python script with the provided arguments
 4 | python -B prune.py \
 5 | --config "celeba.yml" \
 6 | --timesteps "100" \
 7 | --eta "0" \
 8 | --ni \
 9 | --doc "post_training" \
10 | --skip_type "quad" \
11 | --pruning_ratio "0.3" \
12 | --use_ema \
13 | --use_pretrained \
14 | --pruner "ours" \
15 | --save_pruned_model "run/pruned_final/celeba_T=$1.pth" \
16 | --taylor_batch_size "64" \
17 | --thr "$1"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_celeba_ddpm_ssim.sh:
--------------------------------------------------------------------------------
 1 | python -B prune_ssim.py \
 2 | --config celeba.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --doc post_training \
 7 | --skip_type quad  \
 8 | --pruning_ratio 0.15 \
 9 | --use_ema \
10 | --use_pretrained \
11 | --stage $1 \
12 | --pruner "ours" \
13 | --save_pruned_model run/pruned_v4/celeba_pruned.pth \
14 | --taylor_batch_size 64


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_church_ddpm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Execute the Python script with the provided arguments
 4 | python -B prune.py \
 5 | --config "church.yml" \
 6 | --timesteps "100" \
 7 | --eta "0" \
 8 | --ni \
 9 | --doc "post_training" \
10 | --skip_type "quad" \
11 | --pruning_ratio "0.3" \
12 | --use_ema \
13 | --use_pretrained \
14 | --pruner "$1" \
15 | --save_pruned_model "run/pruned/church_ddpm_$1_0.3.pth" \
16 | --taylor_batch_size "2"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_church_ddpm_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Execute the Python script with the provided arguments
 4 | python -B prune_test.py \
 5 | --config "church.yml" \
 6 | --timesteps "100" \
 7 | --eta "0" \
 8 | --ni \
 9 | --doc "post_training" \
10 | --skip_type "quad" \
11 | --pruning_ratio "0.05" \
12 | --use_ema \
13 | --use_pretrained \
14 | --pruner "$1" \
15 | --save_pruned_model "run/pruned_test/church_ddpm_$1.pth" \
16 | --taylor_batch_size "4"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_cifar_ddpm.sh:
--------------------------------------------------------------------------------
 1 | # Usage: prune_cifar_ddpm.sh <pruner> <thr>
 2 | #   $1 is passed to --pruner and embedded in the saved model's file name.
 3 | #   $2 is passed to --thr and embedded in the saved model's file name.
 4 | # Arguments are quoted so values with spaces or glob characters stay one word.
 5 | python -B prune.py \
 6 | --config cifar10.yml \
 7 | --timesteps 100 \
 8 | --eta 0 \
 9 | --ni \
10 | --doc post_training \
11 | --skip_type quad \
12 | --pruning_ratio 0.3 \
13 | --use_ema \
14 | --use_pretrained \
15 | --pruner "$1" \
16 | --save_pruned_model "run/pruned_v5/cifar10_pruned_$1_$2.pth" \
17 | --thr "$2"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_cifar_ddpm_ssim.sh:
--------------------------------------------------------------------------------
 1 | python -B prune_ssim.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --doc post_training \
 7 | --skip_type quad  \
 8 | --pruning_ratio 0.2 \
 9 | --use_ema \
10 | --use_pretrained \
11 | --stage $1 \
12 | --pruner "ours" \
13 | --save_pruned_model run/pruned_v4/cifar10_pruned.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/prune_cifar_ddpm_test.sh:
--------------------------------------------------------------------------------
 1 | python -B prune_test.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --doc post_training \
 7 | --skip_type quad  \
 8 | --pruning_ratio 0.3 \
 9 | --use_ema \
10 | --use_pretrained \
11 | --pruner "$1" \
12 | --save_pruned_model run/pruned_test/cifar10_pruned_$1_0.2.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/run_celeba.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Usage: run_celeba.sh <thr>
 4 | #   $1 is passed to --thr and embedded in the pruned-model and experiment paths.
 5 | # Step 1 prunes the celeba.yml model with the "ours" pruner and saves it;
 6 | # step 2 runs finetune.py on the model file that step 1 saved.
 7 | 
 8 | # Stop at the first failing command so fine-tuning never starts from a missing
 9 | # or stale pruned checkpoint when pruning fails.
10 | set -e
11 | 
12 | # Step 1: prune
13 | python -B prune.py \
14 | --config "celeba.yml" \
15 | --timesteps "100" \
16 | --eta "0" \
17 | --ni \
18 | --doc "post_training" \
19 | --skip_type "quad" \
20 | --pruning_ratio "0.3" \
21 | --use_ema \
22 | --use_pretrained \
23 | --pruner "ours" \
24 | --save_pruned_model "run/pruned_final/celeba_T=$1.pth" \
25 | --taylor_batch_size "64" \
26 | --thr "$1"
27 | 
28 | # Step 2: fine-tune the pruned model
29 | python -B finetune.py \
30 | --config celeba.yml \
31 | --timesteps 100 \
32 | --eta 0 \
33 | --ni \
34 | --exp "run/finetune_final/celeba_T=$1_finetuned" \
35 | --doc post_training \
36 | --skip_type uniform \
37 | --pruning_ratio 0.3 \
38 | --use_ema \
39 | --use_pretrained \
40 | --load_pruned_model "run/pruned_final/celeba_T=$1.pth"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_bedroom_ddpm_pretrained.sh:
--------------------------------------------------------------------------------
 1 | python -B -m torch.distributed.launch --nproc_per_node=1 --master_port 22200 --use_env finetune.py \
 2 | --config bedroom.yml \
 3 | --exp $1 \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type uniform  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --use_pretrained \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_bedroom_ddpm_pruning.sh:
--------------------------------------------------------------------------------
 1 | # Usage: sample_bedroom_ddpm_pruning.sh <checkpoint> <exp_dir>
 2 | #   $1 is passed to --restore_from, $2 to --exp.
 3 | # Both are quoted so paths containing spaces or glob characters stay one word.
 4 | python -B -m torch.distributed.launch --nproc_per_node=4 --master_port 22223 --use_env finetune.py \
 5 | --config bedroom.yml \
 6 | --exp "$2" \
 7 | --sample \
 8 | --timesteps 100 \
 9 | --eta 0 \
10 | --ni \
11 | --doc sample \
12 | --skip_type uniform \
13 | --pruning_ratio 0.0 \
14 | --fid \
15 | --use_ema \
16 | --restore_from "$1"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_celeba_ddpm_pruning.sh:
--------------------------------------------------------------------------------
 1 | # Usage: sample_celeba_ddpm_pruning.sh <checkpoint> <exp_dir>
 2 | #   $1 is passed to --restore_from, $2 to --exp.
 3 | # Both are quoted so paths containing spaces or glob characters stay one word.
 4 | python -B finetune.py \
 5 | --config celeba.yml \
 6 | --exp "$2" \
 7 | --sample \
 8 | --timesteps 100 \
 9 | --eta 0 \
10 | --ni \
11 | --doc sample \
12 | --skip_type uniform \
13 | --pruning_ratio 0.0 \
14 | --fid \
15 | --use_ema \
16 | --restore_from "$1"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_celeba_pretrained.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config celeba.yml \
 3 | --exp run/sample/ddim_celeba_official \
 4 | --sample \
 5 | --use_pretrained \
 6 | --timesteps 100 \
 7 | --eta 0 \
 8 | --ni \
 9 | --doc official \
10 | --skip_type uniform  \
11 | --pruning_ratio 0.0 \
12 | --fid \
13 | --use_ema


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_church_ddpm_pruning.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config church.yml \
 3 | --exp run/sample/church_ddpm_350k \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type uniform  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --restore_from run/finetune_v2/church_pruned_taylor_0.3_finetuned/logs/post_training/ckpt_350000.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_church_ddpm_pruning_old.sh:
--------------------------------------------------------------------------------
 1 | # Usage: sample_church_ddpm_pruning_old.sh <checkpoint> <exp_dir>
 2 | #   $1 is passed to --restore_from, $2 to --exp.
 3 | # Both are quoted so paths containing spaces or glob characters stay one word.
 4 | python -B -m torch.distributed.launch --nproc_per_node=4 --master_port 22221 --use_env finetune.py \
 5 | --config church.yml \
 6 | --exp "$2" \
 7 | --sample \
 8 | --timesteps 100 \
 9 | --eta 0 \
10 | --ni \
11 | --doc sample \
12 | --skip_type uniform \
13 | --pruning_ratio 0.0 \
14 | --fid \
15 | --use_ema \
16 | --restore_from "$1"


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_church_ddpm_test.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config church.yml \
 3 | --exp run/sample/church_ddpm_350k \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type uniform  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --restore_from run/finetune_v2/church_pruned_taylor_0.3_finetuned/logs/post_training/ckpt_350000.pth \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_church_pretrained.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config church.yml \
 3 | --exp run/sample/ddim_church_official \
 4 | --sample \
 5 | --use_pretrained \
 6 | --timesteps 100 \
 7 | --eta 0 \
 8 | --ni \
 9 | --doc official \
10 | --skip_type uniform  \
11 | --pruning_ratio 0.0 \
12 | --fid \
13 | --use_ema


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_ddpm_kim23efficient.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim23efficient \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_ddpm_kim24layer.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim24layer \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_ddpm_kim24layermerge.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim24layermerge \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_ddpm_pretrained.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$1" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --use_pretrained \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_ddpm_pruning.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$2" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_from_pruned_ddpm_kim23efficient.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim23efficient \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_from_pruned_ddpm_kim24layer.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim24layer \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_from_pruned_ddpm_kim24layermerge.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --sample \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim24layermerge \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/sample_cifar_pretrained.sh:
--------------------------------------------------------------------------------
 1 | python -B prune.py \
 2 | --config cifar10.yml \
 3 | --exp run/sample/ddim_cifar10_official \
 4 | --sample \
 5 | --use_pretrained \
 6 | --timesteps 100 \
 7 | --eta 0 \
 8 | --ni \
 9 | --doc sample_100k \
10 | --skip_type quad  \
11 | --pruning_ratio 0.0 \
12 | --fid \
13 | --use_ema


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_celeba_our.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config celeba.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple/celeba_ours_T=$1.pth \
 7 | --doc post_training \
 8 | --skip_type uniform  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --thr $1 \
13 | --pruner ours \
14 | --taylor_batch_size 64


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_from_pruned_kim23efficient.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/$3$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --restore_from "$4" \
11 | --depth_method kim23efficient \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr 0.0004 \
15 | --from_pruned


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_from_pruned_kim24layer.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/$3$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --restore_from "$4" \
11 | --depth_method kim24layer \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr 0.0004 \
15 | --from_pruned


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_from_pruned_kim24layermerge.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/$3$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --restore_from "$4" \
11 | --depth_method kim24layermerge \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr 0.0004 \
15 | --from_pruned


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_kim23efficient.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/$3$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --use_pretrained \
11 | --depth_method kim23efficient \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr 0.0004 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_kim24layer.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/lr0.0004_$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --use_pretrained \
11 | --depth_method kim24layer \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr 0.0004 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_kim24layer_hp.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/lr$3_beta$4_$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --use_pretrained \
11 | --depth_method kim24layer \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr $3 \
15 | --beta1 $4 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_kim24layermerge.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/$3$2 \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --use_ema \
10 | --use_pretrained \
11 | --depth_method kim24layermerge \
12 | --depth_path $2 \
13 | --thr $1 \
14 | --lr 0.0004 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_our.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/cifar10_ours_T=$1.pth \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --thr $1 \
13 | --pruner ours


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_our_hp.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/cifar10_ours_T=$1_lr$2_beta$3.pth \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --thr $1 \
13 | --pruner ours \
14 | --lr $2 \
15 | --beta1 $3 \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_cifar_our_test.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2_test/cifar10_ours_T=$1.pth \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio 0.3 \
10 | --use_ema \
11 | --use_pretrained \
12 | --thr $1 \
13 | --pruner ours


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_rat_cifar_long_our.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10_long.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/cifar10_long_ours/T=$1_rat=$2.pth \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio $2 \
10 | --use_ema \
11 | --use_pretrained \
12 | --thr $1 \
13 | --pruner ours \
14 | --lr 0.0004


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/simple_rat_cifar_our.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune_simple.py \
 2 | --config cifar10.yml \
 3 | --timesteps 100 \
 4 | --eta 0 \
 5 | --ni \
 6 | --exp run/finetune_simple_v2/cifar10_ours/T=$1_rat=$2.pth \
 7 | --doc post_training \
 8 | --skip_type quad  \
 9 | --pruning_ratio $2 \
10 | --use_ema \
11 | --use_pretrained \
12 | --thr $1 \
13 | --pruner ours \
14 | --lr 0.0004


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/time_cifar_ddpm_kim23efficient.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --measure \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim23efficient \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/time_cifar_ddpm_kim24layer.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --measure \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim24layer \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/time_cifar_ddpm_kim24layermerge.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$3" \
 4 | --measure \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --depth_method kim24layermerge \
14 | --depth_path $2 \
15 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/time_cifar_ddpm_pretrained.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$1" \
 4 | --measure \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --use_pretrained \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/scripts/time_cifar_ddpm_pruning.sh:
--------------------------------------------------------------------------------
 1 | python -B finetune.py \
 2 | --config cifar10.yml \
 3 | --exp "$2" \
 4 | --measure \
 5 | --timesteps 100 \
 6 | --eta 0 \
 7 | --ni \
 8 | --doc sample \
 9 | --skip_type quad  \
10 | --pruning_ratio 0.0 \
11 | --fid \
12 | --use_ema \
13 | --restore_from "$1" \


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/tools/extract_cifar10.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torchvision
 3 | from torchvision.datasets import CIFAR10
 4 | from tqdm import tqdm
 5 | 
 6 | # Define the path to the folder where the images will be saved
 7 | save_path = 'data/cifar10/images'
 8 | 
 9 | # Create the folder if it doesn't exist
10 | if not os.path.exists(save_path):
11 |     os.makedirs(save_path)
12 | 
13 | # Load the CIFAR10 dataset
14 | dataset = CIFAR10(root='data/cifar10', train=True, download=True)
15 | 
16 | # Loop through the dataset and save each image to the folder
17 | for i in tqdm(range(len(dataset))):
18 |     image, label = dataset[i]
19 |     image_name = f'{i}.png'
20 |     image_path = os.path.join(save_path, image_name)
21 |     image.save(image_path)


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/tools/transform_weights.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
def _strip_leading(name, prefix="module."):
    """Remove every leading occurrence of ``prefix`` from ``name``.

    Only the front of the name is touched. A plain ``str.replace`` would also
    delete the substring from the middle of a parameter name (for example a
    submodule that is itself called "module").
    """
    while name.startswith(prefix):
        name = name[len(prefix):]
    return name


# Rewrites the first entry of the checkpoint so its parameter names no longer
# start with "module." (presumably added by a DataParallel-style wrapper --
# confirm against the code that wrote model.ckpt.old).
state = torch.load("model.ckpt.old")
old_dict = state[0]
print(old_dict.keys())
state[0] = {_strip_leading(pname): pval for pname, pval in old_dict.items()}
print(state[0].keys())
torch.save(state, "model.ckpt")


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/__init__.py:
--------------------------------------------------------------------------------
1 | from .dependency import *
2 | from .pruner import *
3 | from . import _helpers, utils, importance


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/pruner/__init__.py:
--------------------------------------------------------------------------------
1 | from .function import *
2 | from .algorithms import *


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/pruner/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | from .metapruner import MetaPruner
2 | from .magnitude_based_pruner import MagnitudePruner
3 | from .batchnorm_scale_pruner import BNScalePruner
4 | from .group_norm_pruner import GroupNormPruner
5 | from .scaling_factor_pruner import ScalingFactorPruner 
6 | from .taylor_pruner import TaylorPruner


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/pruner/algorithms/batchnorm_scale_pruner.py:
--------------------------------------------------------------------------------
 1 | from numbers import Number
 2 | from typing import Callable
 3 | from .metapruner import MetaPruner
 4 | from .scheduler import linear_scheduler
 5 | import torch
 6 | import torch.nn as nn
 7 | 
 8 | class BNScalePruner(MetaPruner):
 9 |     def __init__(
10 |         self,
11 |         model,
12 |         example_inputs,
13 |         importance,
14 |         reg=1e-5,
15 |         iterative_steps=1,
16 |         iterative_sparsity_scheduler: Callable = linear_scheduler,
17 |         ch_sparsity=0.5,
18 |         ch_sparsity_dict=None,
19 |         global_pruning=False,
20 |         max_ch_sparsity=1.0,
21 |         round_to=None,
22 |         ignored_layers=None,
23 |         customized_pruners=None,
24 |         unwrapped_parameters=None,
25 |         output_transform=None,
26 |     ):
27 |         super(BNScalePruner, self).__init__(
28 |             model=model,
29 |             example_inputs=example_inputs,
30 |             importance=importance,
31 |             iterative_steps=iterative_steps,
32 |             iterative_sparsity_scheduler=iterative_sparsity_scheduler,
33 |             ch_sparsity=ch_sparsity,
34 |             ch_sparsity_dict=ch_sparsity_dict,
35 |             global_pruning=global_pruning,
36 |             max_ch_sparsity=max_ch_sparsity,
37 |             round_to=round_to,
38 |             ignored_layers=ignored_layers,
39 |             customized_pruners=customized_pruners,
40 |             unwrapped_parameters=unwrapped_parameters,
41 |             output_transform=output_transform,
42 |         )
43 |         self.reg = reg
44 | 
45 |     def regularize(self, model):
46 |         for m in model.modules():
47 |             if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) and m.affine==True:
48 |                 m.weight.grad.data.add_(self.reg*torch.sign(m.weight.data))
49 | 


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/pruner/algorithms/magnitude_based_pruner.py:
--------------------------------------------------------------------------------
1 | from .metapruner import MetaPruner
2 | 
class MagnitudePruner(MetaPruner):
    """MetaPruner under another name: adds no attributes and overrides nothing."""
5 |     


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/pruner/algorithms/scheduler.py:
--------------------------------------------------------------------------------
1 | 
def linear_scheduler(ch_sparsity_dict, steps):
    """Return ``steps + 1`` values rising linearly from 0 to ``ch_sparsity_dict``.

    Entry ``i`` equals ``(i / steps) * ch_sparsity_dict``.
    """
    denominator = float(steps)
    schedule = []
    for step in range(steps + 1):
        schedule.append((step / denominator) * ch_sparsity_dict)
    return schedule


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/torch_pruning/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 | from .op_counter import count_ops_and_params


--------------------------------------------------------------------------------
/Diff-Pruning/exp_code/utils.py:
--------------------------------------------------------------------------------
 1 | import torch, os
 2 | from glob import glob
 3 | from PIL import Image
 4 | 
 5 | class UnlabeledImageFolder(torch.utils.data.Dataset):
 6 |     def __init__(self, root, transform=None, exts=["*.jpg", "*.png", "*.jpeg", "*.webp"]):
 7 |         self.root = root
 8 |         self.files = []
 9 |         self.transform = transform
10 |         for ext in exts:
11 |             self.files.extend(glob(os.path.join(root, '**/*.{}'.format(ext)), recursive=True))
12 | 
13 |     def __len__(self):
14 |         return len(self.files)
15 | 
16 |     def __getitem__(self, idx):
17 |         path = self.files[idx]
18 |         img = Image.open(path).convert("RGB")
19 |         if self.transform is not None:
20 |             img = self.transform(img)
21 |         return img
22 | 
23 | import torch
24 | 
def set_dropout(model, p):
    """Set the drop probability of every ``torch.nn.Dropout`` in ``model`` to ``p``.

    Only instances of ``torch.nn.Dropout`` are touched. The update happens in
    place and nothing is returned.
    """
    dropout_layers = (
        layer for layer in model.modules()
        if isinstance(layer, torch.nn.Dropout)
    )
    for layer in dropout_layers:
        layer.p = p
29 |         


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | exp_result*
 3 | pretrained
 4 | mb_v2_w1.0*
 5 | mb_v2_w1.4*
 6 | vgg19
 7 | kd_exps
 8 | slurm
 9 | run_*
10 | *.log
11 | *.zip
12 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 snu-mllab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/asset/icml23.yml:
--------------------------------------------------------------------------------
 1 | name: icml23
 2 | channels:
 3 |   - conda-forge
 4 |   - defaults
 5 | dependencies:
 6 |   - _libgcc_mutex=0.1=main
 7 |   - _openmp_mutex=4.5=1_gnu
 8 |   - accimage=0.2.0=py37h37b52e9_2
 9 |   - ca-certificates=2022.9.24=ha878542_0
10 |   - certifi=2022.9.24=pyhd8ed1ab_0
11 |   - cudatoolkit=11.3.1=h2bc3f7f_2
12 |   - intel-ipp=2019.1.144=h711154d_3
13 |   - ld_impl_linux-64=2.35.1=h7274673_9
14 |   - libffi=3.3=he6710b0_2
15 |   - libgcc-ng=9.3.0=h5101ec6_17
16 |   - libgomp=9.3.0=h5101ec6_17
17 |   - libjpeg-turbo=2.1.0=h7f98852_0
18 |   - libstdcxx-ng=9.3.0=hd4cf53a_17
19 |   - ncurses=6.3=h7f8727e_2
20 |   - openssl=1.1.1k=h7f98852_0
21 |   - pip=21.2.2=py37h06a4308_0
22 |   - python=3.7.11=h12debd9_0
23 |   - python_abi=3.7=2_cp37m
24 |   - readline=8.1.2=h7f8727e_1
25 |   - setuptools=58.0.4=py37h06a4308_0
26 |   - sqlite=3.37.0=hc218d9a_0
27 |   - tk=8.6.11=h1ccaba5_0
28 |   - wheel=0.37.1=pyhd3eb1b0_0
29 |   - xz=5.2.5=h7b6447c_0
30 |   - zlib=1.2.11=h7f8727e_4
31 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/asset/requirements.txt:
--------------------------------------------------------------------------------
 1 | --extra-index-url https://download.pytorch.org/whl/cu113
 2 | accimage==0.2.0
 3 | colorama==0.4.5
 4 | einops==0.4.1
 5 | fvcore==0.1.5.post20220512
 6 | matplotlib==3.5.1
 7 | numpy==1.21.5
 8 | pandas==1.3.5
 9 | Pillow==9.5.0
10 | progress==1.6
11 | tensorboardX==2.6
12 | timm==0.4.12
13 | torch==1.12.1+cu113
14 | torchvision==0.13.1+cu113
15 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/asset/title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/Efficient-CNN-Depth-Compression/asset/title.png


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/exps/aggregate_imp.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."))
 5 | 
 6 | import pandas as pd
 7 | import argparse
 8 | 
 9 | parser = argparse.ArgumentParser(description="Inference Time with TensorRT")
10 | parser.add_argument(
11 |     "-d",
12 |     "--dir",
13 |     type=str,
14 |     help="directory name",
15 | )
16 | parser.add_argument(
17 |     "-n",
18 |     "--num",
19 |     type=int,
20 |     help="the number of blks",
21 | )
22 | import re
23 | 
24 | 
def natural_key(string_):
    """Sort key that compares embedded digit runs as numbers ("f2" before "f10").

    The string is split on runs of digits; digit pieces become ints and all
    other pieces stay strings.
    """
    key = []
    for piece in re.split(r"(\d+)", string_):
        key.append(int(piece) if piece.isdigit() else piece)
    return key
27 | 
28 | 
if __name__ == "__main__":
    # Collect every CSV below args.dir (files in natural order within each
    # folder), concatenate them and write the result to <dir>/importance.csv.
    args = parser.parse_args()
    out_path = os.path.join(args.dir, "importance.csv")
    frames = []
    for currentpath, folders, files in os.walk(args.dir):
        for f in sorted(files, key=natural_key):
            # Match on the suffix: a substring test also picks up names such
            # as "x.csv.bak".
            if not f.endswith(".csv"):
                continue
            path = os.path.join(currentpath, f)
            # A previous run leaves its output inside the walked tree; reading
            # it back would duplicate every row.
            if os.path.abspath(path) == os.path.abspath(out_path):
                continue
            print(f)
            frames.append(pd.read_csv(path))
    # One concat at the end instead of re-copying the accumulated frame on
    # every file; pd.concat rejects an empty list, hence the fallback.
    res = pd.concat(frames) if frames else pd.DataFrame()
    print(len(res))
    assert len(res) == args.num, "expected {} rows, found {}".format(
        args.num, len(res)
    )
    res.to_csv(out_path)
41 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/models/imagenet/__init__.py:
--------------------------------------------------------------------------------
 1 | from .mobilenetv2 import *
 2 | from .mobilenetv2_com import *
 3 | from .mobilenetv2_ds import *
 4 | from .vgg import *
 5 | from .vgg_com import *
 6 | 
 7 | models = {
 8 |     "mobilenet_v2": mobilenet_v2,
 9 |     "learn_mobilenet_v2": learn_mobilenet_v2,
10 |     "dep_shrink_mobilenet_v2": dep_shrink_mobilenet_v2,
11 |     "vgg19": vgg19_bn,
12 |     "learn_vgg19": learn_vgg19_bn,
13 | }
14 | 
15 | blocks = {
16 |     "mobilenet_v2": InvertedResidual,
17 |     "learn_mobilenet_v2": InvertedResidual,
18 |     "dep_shrink_mobilenet_v2": InvertedResidual,
19 |     "vgg19": VGGBlock,
20 |     "learn_vgg19": LearnVGGBlock,
21 | }
22 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/models/modules_trt.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | import torch
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | 
 6 | from collections import OrderedDict
 7 | 
 8 | 
 9 | class NaiveFeed(nn.Module):
10 |     def __init__(self, odict: OrderedDict) -> None:
11 |         super().__init__()
12 |         self.md = nn.Sequential(odict)
13 | 
14 |     def forward(self, x):
15 |         return self.md(x)
16 | 
17 | 
class SkipFeed(nn.Module):
    """Residual wrapper computing ``last(md(x) + x)``.

    Args:
        odict: ordered layers wrapped in an ``nn.Sequential`` stored as ``md``.
        last: zero-argument factory (default ``nn.Identity``) whose result is
            applied after the skip addition.
    """

    def __init__(self, odict: OrderedDict, last=nn.Identity) -> None:
        super(SkipFeed, self).__init__()
        self.md = nn.Sequential(odict)
        self.last = last()

    def forward(self, x):
        """Add the input to the branch output, then apply ``last``."""
        summed = self.md(x) + x
        return self.last(summed)
26 | 
27 | 
class Downsample(nn.Module):
    """Parameter-free shortcut: resize spatially and pad channels with zeros.

    ``forward`` resizes the input to ``(W // 2, W // 2)`` (both sides derive
    from the last dimension) and surrounds it with zero channels:
    ``C // 2`` in front and ``C - C // 2`` behind, giving ``2 * C`` channels.
    ``planes`` is stored but not read by ``forward``.
    """

    def __init__(self, planes) -> None:
        super(Downsample, self).__init__()
        self.planes = planes

    def forward(self, x):
        half_size = x.shape[3] // 2
        half_channels = x.shape[1] // 2
        resized = F.interpolate(x, size=(half_size, half_size))
        # Zero block derived from the resized tensor by multiplying with 0.
        padding = resized.mul(0)
        front = padding[:, :half_channels, :, :]
        back = padding[:, half_channels:, :, :]
        return torch.cat((front, resized, back), 1)
42 | 
43 | 
class SkipFeedDown(nn.Module):
    """Residual wrapper computing ``last(md(x) + downsample(x))``.

    Args:
        odict: ordered layers wrapped in an ``nn.Sequential`` stored as ``md``.
        last: zero-argument factory (default ``nn.Identity``) whose result is
            applied after the skip addition.
        downsample: module applied to the input on the shortcut path.
            NOTE: the default ``nn.Identity()`` instance is created once at
            definition time and shared by all instances that rely on it.
    """

    def __init__(
        self, odict: OrderedDict, last=nn.Identity, downsample=nn.Identity()
    ) -> None:
        super(SkipFeedDown, self).__init__()
        self.md = nn.Sequential(odict)
        self.last = last()
        self.downsample = downsample

    def forward(self, x):
        """Add the shortcut to the branch output, then apply ``last``."""
        branch = self.md(x)
        shortcut = self.downsample(x)
        return self.last(branch + shortcut)
55 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Useful utils
2 | """
3 | from .logger import *
4 | from .train import *
5 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/utils/table/vgg19_no_trt/time_fish_gpu1_0317.csv:
--------------------------------------------------------------------------------
 1 | id,st,end,time,stdev
 2 | 0,0,1,8.326913743019103,0.017304351327325815
 3 | 1,0,2,8.772203865051269,0.02060655504859982
 4 | 2,1,2,20.50561405181885,0.08567755181542744
 5 | 3,2,3,41.0177968788147,6.546454036595063
 6 | 4,2,4,114.17423645019531,0.37080851651707775
 7 | 5,3,4,14.111193132400512,0.029675090426546084
 8 | 6,4,5,25.186032161712646,0.11286922832404914
 9 | 7,4,6,30.260052652359008,0.28093528033398557
10 | 8,4,7,207.41159881591796,0.36424596242395935
11 | 9,4,8,338.93406860351564,0.1620991236145511
12 | 10,5,6,9.722453293800355,0.042286760436264254
13 | 11,5,7,13.03276577949524,0.07575332235689078
14 | 12,5,8,103.39464534759522,0.20889484580545423
15 | 13,6,7,9.725028190612793,0.04050072965233604
16 | 14,6,8,13.03555697441101,0.0779521749173917
17 | 15,7,8,9.716243696212768,0.04117186801377567
18 | 16,8,9,17.8516659450531,0.1073048045457938
19 | 17,8,10,22.99314765930176,0.21371451202721295
20 | 18,8,11,206.29462867736817,0.2996600949701716
21 | 19,8,12,340.9868132019043,0.26668334990896564
22 | 20,9,10,7.417239518165588,0.03485802200762281
23 | 21,9,11,12.60063491344452,0.0710037261804765
24 | 22,9,12,105.64399105072022,0.09381249297199233
25 | 23,10,11,7.412527368068695,0.0435099201397835
26 | 24,10,12,12.601713118553162,0.07170721415827588
27 | 25,11,12,7.415155837535858,0.04905487540587592
28 | 26,12,13,7.403466064929962,0.04160782811405336
29 | 27,12,14,12.585364966392516,0.07354652251636305
30 | 28,12,15,105.63326538085937,0.09045596020779985
31 | 29,12,16,179.90253311157227,0.3572176476267134
32 | 30,13,14,2.4252339148521425,0.03501485338327998
33 | 31,13,15,3.372817919254303,0.018959956059685847
34 | 32,13,16,29.415552349090575,0.22555944676716505
35 | 33,14,15,2.4251503944396973,0.0346148784304925
36 | 34,14,16,3.3729427111148835,0.021966171804449645
37 | 35,15,16,2.422551679611206,0.025841686677925104
38 | 


--------------------------------------------------------------------------------
/Efficient-CNN-Depth-Compression/utils/txt/class100.txt:
--------------------------------------------------------------------------------
  1 | n02869837
  2 | n01749939
  3 | n02488291
  4 | n02107142
  5 | n13037406
  6 | n02091831
  7 | n04517823
  8 | n04589890
  9 | n03062245
 10 | n01773797
 11 | n01735189
 12 | n07831146
 13 | n07753275
 14 | n03085013
 15 | n04485082
 16 | n02105505
 17 | n01983481
 18 | n02788148
 19 | n03530642
 20 | n04435653
 21 | n02086910
 22 | n02859443
 23 | n13040303
 24 | n03594734
 25 | n02085620
 26 | n02099849
 27 | n01558993
 28 | n04493381
 29 | n02109047
 30 | n04111531
 31 | n02877765
 32 | n04429376
 33 | n02009229
 34 | n01978455
 35 | n02106550
 36 | n01820546
 37 | n01692333
 38 | n07714571
 39 | n02974003
 40 | n02114855
 41 | n03785016
 42 | n03764736
 43 | n03775546
 44 | n02087046
 45 | n07836838
 46 | n04099969
 47 | n04592741
 48 | n03891251
 49 | n02701002
 50 | n03379051
 51 | n02259212
 52 | n07715103
 53 | n03947888
 54 | n04026417
 55 | n02326432
 56 | n03637318
 57 | n01980166
 58 | n02113799
 59 | n02086240
 60 | n03903868
 61 | n02483362
 62 | n04127249
 63 | n02089973
 64 | n03017168
 65 | n02093428
 66 | n02804414
 67 | n02396427
 68 | n04418357
 69 | n02172182
 70 | n01729322
 71 | n02113978
 72 | n03787032
 73 | n02089867
 74 | n02119022
 75 | n03777754
 76 | n04238763
 77 | n02231487
 78 | n03032252
 79 | n02138441
 80 | n02104029
 81 | n03837869
 82 | n03494278
 83 | n04136333
 84 | n03794056
 85 | n03492542
 86 | n02018207
 87 | n04067472
 88 | n03930630
 89 | n03584829
 90 | n02123045
 91 | n04229816
 92 | n02100583
 93 | n03642806
 94 | n04336792
 95 | n03259280
 96 | n02116738
 97 | n02108089
 98 | n03424325
 99 | n01855672
100 | n02090622
101 | 


--------------------------------------------------------------------------------
/HALP/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvcr.io/nvidia/pytorch:19.12-py3
 2 | ARG DEBIAN_FRONTEND=noninteractive
 3 | RUN apt-get update && apt-get -y install sudo dialog apt-utils && rm -rf /var/lib/apt/lists/*
 4 | USER root
 5 | 
 6 | # Install some basic utilities
 7 | RUN apt-get update && apt-get install -y \    
 8 |     curl \
 9 |     ca-certificates \
10 |     sudo \
11 |     unzip \
12 |     htop \
13 |     wget \
14 |     git \
15 |     bzip2 \
16 |     libx11-6 \
17 |  && rm -rf /var/lib/apt/lists/*
18 | 
19 | RUN mkdir -p /workspace/
20 | ENV HOME=/workspace/
21 | RUN chmod 777 /workspace/
22 | 
23 | RUN pip install easydict
24 | RUN pip install opencv-python
25 | 
26 | RUN pip install tensorboardX
27 | CMD ["python3"]
28 | 
29 | WORKDIR /workspace/
30 | USER root
31 | 
32 | 


--------------------------------------------------------------------------------
/HALP/apex/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve apex
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the Bug**
11 | 
12 | **Minimal Steps/Code to Reproduce the Bug**
13 | <!--
14 | Please list the *minimal* steps or provide a code snippet for us to be able to reproduce the bug.
15 | 
16 | A helpful guide on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
17 | --> 
18 | 
19 | **Expected Behavior**
20 | <!-- A clear and concise description of what you expected to happen. -->
21 | 
22 | **Environment**
23 | <!-- OS, version of Python, CUDA, PyTorch; collect these via `python -m torch.utils.collect_env` -->
24 | 


--------------------------------------------------------------------------------
/HALP/apex/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "apex/contrib/csrc/multihead_attn/cutlass"]
2 | 	path = apex/contrib/csrc/multihead_attn/cutlass
3 | 	url = https://github.com/NVIDIA/cutlass.git
4 | 	branch = v1.2.0
5 | [submodule "apex/contrib/csrc/cudnn-frontend"]
6 | 	path = apex/contrib/csrc/cudnn-frontend
7 | 	url = https://github.com/NVIDIA/cudnn-frontend.git
8 | 


--------------------------------------------------------------------------------
/HALP/apex/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/.nojekyll


--------------------------------------------------------------------------------
/HALP/apex/LICENSE:
--------------------------------------------------------------------------------
 1 | All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 4 | 
 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 6 | 
 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 8 | 
 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 | 
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/HALP/apex/apex/RNN/README.md:
--------------------------------------------------------------------------------
1 | **This module will be removed by the end of February 2023**
2 | 
3 | Under construction...
4 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/RNN/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM
2 | 
3 | __all__ = ['models']
4 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/_autocast_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Sequence
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | __all__ = ["_cast_if_autocast_enabled"]
 7 | 
 8 | 
 9 | def _get_autocast_dtypes() -> Sequence[torch.dtype]:
10 |     if torch.cuda.is_bf16_supported():
11 |         return [torch.half, torch.bfloat16]
12 |     return [torch.half]
13 | 
14 | 
15 | def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype:
16 |     if not torch.is_autocast_enabled():
17 |         return torch.float or dtype
18 |     else:
19 |         return torch.get_autocast_gpu_dtype()
20 | 
21 | 
22 | def _cast_if_autocast_enabled(*args):
23 |     if not torch.is_autocast_enabled():
24 |         return args
25 |     else:
26 |         return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
27 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/amp/__init__.py:
--------------------------------------------------------------------------------
1 | from .amp import init, half_function, float_function, promote_function,\
2 |     register_half_function, register_float_function, register_promote_function
3 | from .handle import scale_loss, disable_casts
4 | from .frontend import initialize, state_dict, load_state_dict
5 | from ._amp_state import master_params, _amp_state
6 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/amp/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 1, 0)
2 | __version__ = '.'.join(map(str, VERSION))
3 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/amp/compat.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # True for post-0.4, when Variables/Tensors merged.
 4 | def variable_is_tensor():
 5 |     v = torch.autograd.Variable()
 6 |     return isinstance(v, torch.Tensor)
 7 | 
 8 | def tensor_is_variable():
 9 |     x = torch.Tensor()
10 |     return type(x) == torch.autograd.Variable
11 | 
12 | # False for post-0.4
def tensor_is_float_tensor():
    """Report whether the exact type of ``torch.Tensor()`` equals ``torch.FloatTensor``.

    This is an exact type comparison, not an ``isinstance`` check.
    """
    probe_type = type(torch.Tensor())
    return probe_type == torch.FloatTensor
16 | 
17 | # Akin to `torch.is_tensor`, but returns True for Variable
18 | # objects in pre-0.4.
def is_tensor_like(x):
    """Return True when ``x`` is a tensor or a ``torch.autograd.Variable``."""
    if torch.is_tensor(x):
        return True
    return isinstance(x, torch.autograd.Variable)
21 | 
22 | # Wraps `torch.is_floating_point` if present, otherwise checks
23 | # the suffix of `x.type()`.
def is_floating_point(x):
    """Return whether ``x`` holds floating-point data.

    Delegates to ``torch.is_floating_point`` when torch provides it. Otherwise
    the string from ``x.type()`` is tested for a Float/Half/Double tensor
    suffix, and an ``AttributeError`` during that fallback yields False.
    """
    if hasattr(torch, 'is_floating_point'):
        return torch.is_floating_point(x)
    float_suffixes = ('FloatTensor', 'HalfTensor', 'DoubleTensor')
    try:
        return x.type().endswith(float_suffixes)
    except AttributeError:
        return False
34 | 
def scalar_python_val(x):
    """Extract a Python scalar from ``x``.

    Uses ``x.item()`` when available. Otherwise returns element 0 of
    ``x.data`` for a ``torch.autograd.Variable``, or of ``x`` itself.
    """
    if hasattr(x, 'item'):
        return x.item()
    container = x.data if isinstance(x, torch.autograd.Variable) else x
    return container[0]
43 | 
44 | # Accounts for the possibility that some ops may be removed from a namespace.
def filter_attrs(module, attrs):
    """Return, in order, the names from ``attrs`` that ``module`` defines."""
    present = []
    for attrname in attrs:
        if hasattr(module, attrname):
            present.append(attrname)
    return present
47 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/amp/lists/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/amp/lists/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/amp/lists/tensor_overrides.py:
--------------------------------------------------------------------------------
 1 | from .. import compat
 2 | from . import torch_overrides
 3 | 
 4 | import importlib
 5 | 
 6 | import torch
 7 | 
 8 | # if compat.variable_is_tensor() and not compat.tensor_is_variable():
 9 | MODULE = torch.Tensor
10 | # else:
11 | #     MODULE = torch.autograd.Variable
12 | 
13 | 
14 | FP16_FUNCS = compat.filter_attrs(MODULE, [
15 |     '__matmul__',
16 | ])
17 | 
18 | FP32_FUNCS = compat.filter_attrs(MODULE, [
19 |     '__ipow__',
20 |     '__pow__',
21 |     '__rpow__',
22 | 
23 |     # Cast to fp32 before transfer to CPU
24 |     'cpu',
25 | ])
26 | 
27 | CASTS = compat.filter_attrs(MODULE, [
28 |     '__add__',
29 |     '__div__',
30 |     '__eq__',
31 |     '__ge__',
32 |     '__gt__',
33 |     '__iadd__',
34 |     '__idiv__',
35 |     '__imul__',
36 |     '__isub__',
37 |     '__itruediv__',
38 |     '__le__',
39 |     '__lt__',
40 |     '__mul__',
41 |     '__ne__',
42 |     '__radd__',
43 |     '__rdiv__',
44 |     '__rmul__',
45 |     '__rsub__',
46 |     '__rtruediv__',
47 |     '__sub__',
48 |     '__truediv__',
49 | ])
50 | 
51 | # None of these, but here to make code cleaner.
52 | SEQUENCE_CASTS = []
53 | 
54 | # We need to grab all the methods from torch_overrides and add them to
55 | # the Tensor lists as well, as almost all methods are duplicated
56 | # between `torch` and `torch.Tensor` (and check with `hasattr`,
57 | # because a few random ones aren't defined on Tensor)
58 | _self_mod = importlib.import_module(__name__)
59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
60 |     lst = getattr(_self_mod, attrname)
61 |     for fn in getattr(torch_overrides, attrname):
62 |         if hasattr(MODULE, fn):
63 |             lst.append(fn)
64 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/bottleneck/__init__.py:
--------------------------------------------------------------------------------
1 | from .bottleneck import Bottleneck, SpatialBottleneck
2 | from .halo_exchangers import HaloExchangerNoComm, HaloExchangerAllGather, HaloExchangerSendRecv, HaloExchangerPeer
3 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/clip_grad/__init__.py:
--------------------------------------------------------------------------------
1 | from .clip_grad import clip_grad_norm_
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/conv_bias_relu/__init__.py:
--------------------------------------------------------------------------------
1 | from .conv_bias_relu import ConvBiasReLU, ConvBias, ConvBiasMaskReLU, ConvFrozenScaleBiasReLU
2 | 
3 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_10.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 10 channels per group, 640 threads per block (= 64 x 10).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 10, /* THREADS_PER_BLOCK */ 640)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_112.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 112 channels per group, 448 threads per block (= 4 x 112).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 112, /* THREADS_PER_BLOCK */ 448)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_120.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 120 channels per group, 480 threads per block (= 4 x 120).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 120, /* THREADS_PER_BLOCK */ 480)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_128.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 128 channels per group, 512 threads per block (= 4 x 128).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 128, /* THREADS_PER_BLOCK */ 512)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_14.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 14 channels per group, 224 threads per block (= 16 x 14).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 14, /* THREADS_PER_BLOCK */ 224)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_16.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 16 channels per group, 256 threads per block (= 16 x 16).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 16, /* THREADS_PER_BLOCK */ 256)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_160.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 160 channels per group, 640 threads per block (= 4 x 160).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 160, /* THREADS_PER_BLOCK */ 640)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_20.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 20 channels per group, 640 threads per block (= 32 x 20).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 20, /* THREADS_PER_BLOCK */ 640)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_24.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 24 channels per group, 384 threads per block (= 16 x 24).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 24, /* THREADS_PER_BLOCK */ 384)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_26.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 26 channels per group, 416 threads per block (= 16 x 26).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 26, /* THREADS_PER_BLOCK */ 416)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_28.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 28 channels per group, 448 threads per block (= 16 x 28).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 28, /* THREADS_PER_BLOCK */ 448)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_30.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 30 channels per group, 480 threads per block (= 16 x 30).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 30, /* THREADS_PER_BLOCK */ 480)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_32.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 32 channels per group, 512 threads per block (= 16 x 32).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 32, /* THREADS_PER_BLOCK */ 512)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_4.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 4 channels per group, 128 threads per block (= 32 x 4).
23 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 4, /* THREADS_PER_BLOCK */ 128)
24 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_40.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 40 channels per group, 640 threads per block (= 16 x 40).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 40, /* THREADS_PER_BLOCK */ 640)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_42.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 42 channels per group, 672 threads per block (= 16 x 42).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 42, /* THREADS_PER_BLOCK */ 672)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_48.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 48 channels per group, 384 threads per block (= 8 x 48).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 48, /* THREADS_PER_BLOCK */ 384)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_56.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 56 channels per group, 448 threads per block (= 8 x 56).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 56, /* THREADS_PER_BLOCK */ 448)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_60.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | // Presumably emits the one-pass fwd/bwd group-norm definitions (macro not defined here; likely macros.h) for 60 channels per group, 480 threads per block (= 8 x 60).
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 60, /* THREADS_PER_BLOCK */ 480)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_64.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 64, /* THREADS_PER_BLOCK */ 512)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_70.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 70, /* THREADS_PER_BLOCK */ 560)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_8.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 8, /* THREADS_PER_BLOCK */ 128)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_80.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 80, /* THREADS_PER_BLOCK */ 640)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_84.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 84, /* THREADS_PER_BLOCK */ 672)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_96.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 96, /* THREADS_PER_BLOCK */ 768)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/group_norm/group_norm_nhwc_one_pass_98.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without modification, are not permit-
 5 |  * ted.
 6 |  *
 7 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
 8 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
 9 |  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 
10 |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
11 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
12 |  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
13 |  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
14 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 |  *
16 |  **************************************************************************************************/
17 | 
18 | #include "group_norm_nhwc_fwd_one_pass_kernel.cuh"
19 | #include "group_norm_nhwc_bwd_one_pass_kernel.cuh"
20 | #include "macros.h"
21 | 
22 | GN_FWD_BWD_ONE_PASS_DEFINITION(/* CHANNELS_PER_GROUP */ 98, /* THREADS_PER_BLOCK */ 392)
23 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/groupbn/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_UTILS_H
 2 | #define CUDA_UTILS_H
 3 | 
 4 | #include <ATen/cuda/CUDAContext.h>
 5 | 
 6 | namespace at {
 7 | namespace cuda {
 8 | namespace utils {
 9 | 
10 | // Returns the `sharedMemPerMultiprocessor` field of the device properties
11 | // obtained from getDeviceProperties(device_id).
12 | static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
13 |     return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
14 | }
15 | 
16 | }  // namespace utils
17 | }  // namespace cuda
18 | }  // namespace at
19 | 
20 | #endif  // CUDA_UTILS_H
21 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "nccl_p2p_cuda.cuh"
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // register each nccl_p2p function on the Python module under its C++ name
20 |   m.def("get_unique_nccl_id", &apex::contrib::nccl_p2p::get_unique_nccl_id, "get_unique_nccl_id");
21 |   m.def("init_nccl_comm", &apex::contrib::nccl_p2p::init_nccl_comm, "init_nccl_comm");
22 |   m.def("left_right_halo_exchange_inplace", &apex::contrib::nccl_p2p::left_right_halo_exchange_inplace, "left_right_halo_exchange_inplace");
23 |   m.def("left_right_halo_exchange", &apex::contrib::nccl_p2p::left_right_halo_exchange, "left_right_halo_exchange");
24 |   m.def("add_delay", &apex::contrib::nccl_p2p::add_delay, "add_delay");  // the trailing string is the docstring; here it just repeats the name
25 | }
26 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cuh:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | #include <torch/extension.h>
19 | #ifndef _nccl_p2p_h_
20 | #define _nccl_p2p_h_
21 | 
22 | namespace apex { namespace contrib { namespace nccl_p2p {
23 | 
24 | // Declarations only -- none of these functions is defined in this header;
25 | // torch/extension.h (included above) is the only header pulled in for them.
26 | 
27 | at::Tensor get_unique_nccl_id(int n);
28 | 
29 | int init_nccl_comm(at::Tensor unique_nccl_id, int my_rank, int num_ranks);
30 | 
31 | // Overload-free variant that also takes the two input halo tensors; returns nothing.
32 | void left_right_halo_exchange_inplace(
33 |     int handle, int left_rank, int right_rank,
34 |     at::Tensor left_output_halo, at::Tensor right_output_halo,
35 |     at::Tensor left_input_halo, at::Tensor right_input_halo);
36 | 
37 | // Variant that takes only the output halo tensors and returns a vector of tensors.
38 | std::vector<at::Tensor> left_right_halo_exchange(
39 |     int handle, int left_rank, int right_rank,
40 |     at::Tensor left_output_halo, at::Tensor right_output_halo);
41 | 
42 | void add_delay(int delay);
43 | 
44 | }}}  // namespace apex::contrib::nccl_p2p
45 | #endif
46 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_version.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | // This file is used to check the version of NCCL detected.
 3 | #include <tuple>
 4 | 
 5 | #include <torch/extension.h>
 6 | 
 7 | std::tuple<int, int> get_nccl_version();
 8 | 
 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // expose get_nccl_version to Python; this file only declares it
10 |   m.def("get_nccl_version", &get_nccl_version);
11 | }
12 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/nccl_p2p/nccl_version_check.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | // This file is used to check the version of NCCL detected.
 4 | #include <tuple>
 5 | #include <nccl.h>
 6 | 
 7 | // Returns the NCCL_MAJOR / NCCL_MINOR macro values seen at compile time.
 8 | std::tuple<int, int> get_nccl_version() {
 9 |   return std::make_tuple(int(NCCL_MAJOR), int(NCCL_MINOR));  // `return {a, b}` needs a non-explicit tuple ctor (absent before N4387)
10 | }
11 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lamb_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   const float lr,
 8 |   const float beta1,
 9 |   const float beta2,
10 |   const float epsilon,
11 |   const int step,
12 |   const int bias_correction,
13 |   const float weight_decay,
14 |   const int grad_averaging,
15 |   const int mode,
16 |   const float global_grad_norm,
17 |   const float max_grad_norm);
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |         m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer");  // Python-side name is "lamb", not the C++ name
21 | }
22 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_fused_adam_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   at::Tensor grad_scale,
 8 |   float lr,
 9 |   float beta1,
10 |   float beta2,
11 |   float eps,
12 |   int step,
13 |   int mode,
14 |   int bias_correction,
15 |   float weight_decay);
16 | 
17 | void multi_tensor_fused_adam_with_param_remainders_cuda(
18 |   int chunk_size,
19 |   at::Tensor noop_flag,
20 |   std::vector<std::vector<at::Tensor>> tensor_lists,
21 |   at::Tensor grad_scale,
22 |   float lr,
23 |   float beta1,
24 |   float beta2,
25 |   float eps,
26 |   int step,
27 |   int mode,
28 |   int bias_correction,
29 |   float weight_decay);
30 | 
31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // binds both Adam entry points declared in this file
32 |   m.def("multi_tensor_fused_adam",
33 |         &multi_tensor_fused_adam_cuda,
34 |         "CUDA kernels for multi-tensor Adam, "  // adjacent literals join into a single docstring
35 |         "with param copy");
36 |   m.def("multi_tensor_fused_adam_with_param_remainders",
37 |         &multi_tensor_fused_adam_with_param_remainders_cuda,
38 |         "CUDA kernel for multi-tensor Adam, "
39 |         "with stored param remainders and param copy");
40 | }
41 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lamb_compute_update_term_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   at::Tensor per_tensor_beta1,
 8 |   at::Tensor per_tensor_beta2,
 9 |   at::Tensor per_tensor_beta3,
10 |   at::Tensor per_tensor_bias_correction,
11 |   at::Tensor step,
12 |   at::Tensor per_tensor_epsilon,
13 |   const int mode,
14 |   at::Tensor per_tensor_decay,
15 |   at::Tensor global_scale,
16 |   at::Tensor global_grad_norm,
17 |   const float max_grad_norm);
18 | 
19 | void multi_tensor_lamb_update_weights_cuda(
20 |   int chunk_size,
21 |   at::Tensor noop_flag,
22 |   std::vector<std::vector<at::Tensor>> tensor_lists,
23 |   at::Tensor per_tensor_param_norm,
24 |   at::Tensor per_tensor_update_norm,
25 |   at::Tensor update_norm_offset,
26 |   at::Tensor learning_rate,
27 |   at::Tensor per_tensor_decay,
28 |   at::Tensor global_grad_norm,
29 |   bool use_nvlamb);
30 | 
31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // binds the two LAMB functions declared in this file
32 |   m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda,
33 |         "Computes update term for LAMB optimizer");
34 |   m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda,
35 |         "Applies update term for LAMB optimizer");
36 | }
37 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/csrc/peer_memory/peer_memory.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "peer_memory_cuda.cuh"
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // register each apex::contrib::peer_memory function under its C++ name
20 |     m.def("allocate_raw", &apex::contrib::peer_memory::allocate_raw, "allocate_raw");
21 |     m.def("free_raw", &apex::contrib::peer_memory::free_raw, "free_raw");
22 |     m.def("zero", &apex::contrib::peer_memory::zero, "zero");
23 |     m.def("get_raw_ipc_address", &apex::contrib::peer_memory::get_raw_ipc_address, "get_raw_ipc_address");
24 |     m.def("get_raw_peers", &apex::contrib::peer_memory::get_raw_peers, "get_raw_peers");
25 |     m.def("blob_view_half", &apex::contrib::peer_memory::blob_view_half, "blob_view_half");
26 |     m.def("blob_view_float", &apex::contrib::peer_memory::blob_view_float, "blob_view_float");
27 |     m.def("blob_view_int", &apex::contrib::peer_memory::blob_view_int, "blob_view_int");
28 |     m.def("push_pull_halos_1d", &apex::contrib::peer_memory::push_pull_halos_1d, "push_pull_halos_1d");  // docstrings simply repeat the names
29 | }
30 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/cudnn_gbn/__init__.py:
--------------------------------------------------------------------------------
1 | from .batch_norm import GroupBatchNorm2d


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/fmha/__init__.py:
--------------------------------------------------------------------------------
1 | from .fmha import FMHAFun
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/focal_loss/__init__.py:
--------------------------------------------------------------------------------
 1 | try:  # import probe: every name bound here is deleted again below
 2 |     import torch
 3 |     import focal_loss_cuda
 4 |     from .focal_loss import focal_loss
 5 |     del torch
 6 |     del focal_loss_cuda
 7 |     del focal_loss
 8 | except ImportError as err:  # raised by any of the three imports above; `err` itself is unused
 9 |     print("apex was installed without --focal_loss flag, apex.contrib.focal_loss is not available")
10 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/focal_loss/focal_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import focal_loss_cuda
 4 | 
 5 | 
 6 | class FocalLoss(torch.autograd.Function):
 7 |     @staticmethod
 8 |     def forward(
 9 |         ctx,
10 |         cls_output,
11 |         cls_targets_at_level,
12 |         num_positives_sum,
13 |         num_real_classes,
14 |         alpha,
15 |         gamma,
16 |         label_smoothing=0.0,
17 |     ):
18 |         loss, partial_grad = focal_loss_cuda.forward(
19 |             cls_output,
20 |             cls_targets_at_level,
21 |             num_positives_sum,
22 |             num_real_classes,
23 |             alpha,
24 |             gamma,
25 |             label_smoothing,
26 |         )
27 | 
28 |         ctx.save_for_backward(partial_grad, num_positives_sum)
29 |         return loss
30 | 
31 |     @staticmethod
32 |     def backward(ctx, grad_loss):
33 |         partial_grad, num_positives_sum = ctx.saved_tensors
34 | 
35 |         # The backward kernel is actually in-place to save memory space,
36 |         # partial_grad and grad_input are the same tensor.
37 |         grad_input = focal_loss_cuda.backward(grad_loss, partial_grad, num_positives_sum)
38 | 
39 |         return grad_input, None, None, None, None, None, None
40 | 
41 | 
42 | def focal_loss(
43 |     cls_output: torch.Tensor,
44 |     cls_targets_at_level: torch.Tensor,
45 |     num_positive_sum: torch.Tensor,
46 |     num_real_classes: int,
47 |     alpha: float,
48 |     gamma: float,
49 |     label_smoothing: float = 0.0,
50 | ) -> torch.Tensor:
51 |     """Fused focal loss function."""
52 |     return FocalLoss.apply(
53 |         cls_output,
54 |         cls_targets_at_level,
55 |         num_positive_sum,
56 |         num_real_classes,
57 |         alpha,
58 |         gamma,
59 |         label_smoothing,
60 |     )
61 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/group_norm/__init__.py:
--------------------------------------------------------------------------------
1 | from .group_norm import *
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/groupbn/__init__.py:
--------------------------------------------------------------------------------
 1 | try:  # import probe for the `bnp` extension
 2 |     import torch
 3 |     import bnp
 4 |     from .batch_norm import BatchNorm2d_NHWC  # the one name left bound after the `del`s below
 5 |     del torch
 6 |     del bnp
 7 |     del batch_norm  # submodule attribute that the relative import above set on this package
 8 | except ImportError as err:  # any failed import above ends up here; `err` is unused
 9 |     print("apex was installed without --bnp flag, contrib.groupbn is not available")
10 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/index_mul_2d/__init__.py:
--------------------------------------------------------------------------------
1 | from .index_mul_2d import index_mul_2d
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/layer_norm/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_norm import FastLayerNorm
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/multihead_attn/MHA_bwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/multihead_attn/MHA_bwd.png


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/multihead_attn/MHA_fwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/multihead_attn/MHA_fwd.png


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/multihead_attn/__init__.py:
--------------------------------------------------------------------------------
1 | from .self_multihead_attn import SelfMultiheadAttn
2 | from .encdec_multihead_attn import EncdecMultiheadAttn
3 | from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func
4 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fp16_optimizer import FP16_Optimizer
2 | from .fused_adam import FusedAdam
3 | from .fused_lamb import FusedLAMB
4 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/peer_memory/__init__.py:
--------------------------------------------------------------------------------
1 | from .peer_memory import PeerMemoryPool
2 | from .peer_halo_exchanger_1d import PeerHaloExchanger1d
3 | 
4 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/sparsity/COPYRIGHT:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/sparsity/__init__.py:
--------------------------------------------------------------------------------
1 | from .sparse_masklib import create_mask
2 | from .asp import ASP
3 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/sparsity/permutation_search_kernels/__init__.py:
--------------------------------------------------------------------------------
1 | from .call_permutation_search_kernels import accelerated_search_for_good_permutation
2 | from .permutation_utilities import sum_after_2_to_4


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/bottleneck/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/bottleneck/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/clip_grad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/clip_grad/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/conv_bias_relu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/conv_bias_relu/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/cudnn_gbn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/cudnn_gbn/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/fmha/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/fmha/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/focal_loss/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/focal_loss/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/group_norm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/group_norm/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/index_mul_2d/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/index_mul_2d/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/layer_norm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/layer_norm/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/multihead_attn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/multihead_attn/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/optimizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/optimizers/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/peer_memory/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/peer_memory/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/transducer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/transducer/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/test/xentropy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/contrib/test/xentropy/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/transducer/__init__.py:
--------------------------------------------------------------------------------
1 | from .transducer import TransducerJoint
2 | from .transducer import TransducerLoss
3 | from . import _transducer_ref
4 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/xentropy/__init__.py:
--------------------------------------------------------------------------------
1 | from .softmax_xentropy import SoftmaxCrossEntropyLoss
2 | 
3 | 
4 | __all__ = [
5 |     "SoftmaxCrossEntropyLoss",
6 | ]
7 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/contrib/xentropy/softmax_xentropy.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import xentropy_cuda
 4 | 
 5 | 
 6 | class SoftmaxCrossEntropyLoss(torch.autograd.Function):
 7 |     @staticmethod
 8 |     def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
 9 |         losses, max_log_sum_exp = xentropy_cuda.forward(
10 |             logits, labels, smoothing, half_to_float)
11 |         losses.masked_fill_(labels==padding_idx, 0)
12 | 
13 |         ctx.save_for_backward(logits, max_log_sum_exp, labels,
14 |             torch.FloatTensor([smoothing]),
15 |             torch.LongTensor([padding_idx]))
16 | 
17 |         return losses
18 | 
19 |     @staticmethod
20 |     def backward(ctx, grad_loss):
21 |         logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors
22 | 
23 |         if not grad_loss.is_contiguous():
24 |             grad_loss = grad_loss.contiguous()
25 |         grad_loss.masked_fill_(labels==padding_idx.item(), 0)
26 |         grad_logits = xentropy_cuda.backward(
27 |             grad_loss.contiguous(), logits, max_log_sum_exp,
28 |             labels, smoothing.item())
29 | 
30 |         return grad_logits, None, None, None, None
31 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/fp16_utils/README.md:
--------------------------------------------------------------------------------
 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing PyTorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user.  To use `FP16_Optimizer`, only two lines of one's Python model need to change.
 2 | 
 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
 4 | 
 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple)
 6 | 
 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
 8 | 
 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model)
10 | 
11 | 
12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses.  
13 | 
14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management)
15 | 
16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling.  These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically.
17 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/fp16_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from .fp16util import (
 2 |     BN_convert_float,
 3 |     network_to_half,
 4 |     prep_param_lists,
 5 |     model_grads_to_master_grads,
 6 |     master_params_to_model_params,
 7 |     tofp16,
 8 |     to_python_float,
 9 |     clip_grad_norm,
10 |     convert_module,
11 |     convert_network,
12 |     FP16Model,
13 | )
14 | 
15 | from .fp16_optimizer import FP16_Optimizer
16 | from .loss_scaler import LossScaler, DynamicLossScaler
17 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/fused_dense/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_dense import *
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/mlp/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlp import *
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/multi_tensor_apply/__init__.py:
--------------------------------------------------------------------------------
1 | from .multi_tensor_apply import MultiTensorApply
2 | 
# Module-level MultiTensorApply instance constructed with a chunk size of
# 2048*32 (= 65536).
multi_tensor_applier = MultiTensorApply(2048*32)
4 | 
5 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/multi_tensor_apply/multi_tensor_apply.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class MultiTensorApply(object):
 4 |     available = False
 5 |     warned = False
 6 | 
 7 |     def __init__(self, chunk_size):
 8 |         try:
 9 |             import amp_C
10 |             MultiTensorApply.available = True
11 |             self.chunk_size = chunk_size
12 |         except ImportError as err:
13 |             MultiTensorApply.available = False
14 |             MultiTensorApply.import_err = err
15 | 
16 |     def check_avail(self):
17 |         if MultiTensorApply.available == False:
18 |             raise RuntimeError(
19 |                 "Attempted to call MultiTensorApply method, but MultiTensorApply "
20 |                 "is not available, possibly because Apex was installed without "
21 |                 "--cpp_ext --cuda_ext.  Original import error message:",
22 |                 MultiTensorApply.import_err)
23 | 
24 |     def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
25 |         self.check_avail()
26 | 
27 |         return op(self.chunk_size,
28 |                   noop_flag_buffer,
29 |                   tensor_lists,
30 |                   *args)
31 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/normalization/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_layer_norm import FusedLayerNorm, MixedFusedLayerNorm, FusedRMSNorm, MixedFusedRMSNorm
2 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_sgd import FusedSGD
2 | from .fused_adam import FusedAdam
3 | from .fused_novograd import FusedNovoGrad
4 | from .fused_lamb import FusedLAMB
5 | from .fused_adagrad import FusedAdagrad
6 | from .fused_mixed_precision_lamb import FusedMixedPrecisionLamb
7 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/parallel/multiproc.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import sys
 3 | import subprocess
 4 | 
 5 | def docstring_hack():
 6 |     """
 7 |     Multiproc file which will launch a set of processes locally for multi-gpu
 8 |     usage: python -m apex.parallel.multiproc main.py ...
 9 |     """
10 |     pass
11 | 
12 | argslist = list(sys.argv)[1:]
13 | world_size = torch.cuda.device_count()
14 | 
15 | if '--world-size' in argslist:
16 |     world_size = int(argslist[argslist.index('--world-size')+1])
17 | else:
18 |     argslist.append('--world-size')
19 |     argslist.append(str(world_size))
20 | 
21 | workers = []
22 | 
23 | for i in range(world_size):
24 |     if '--rank' in argslist:
25 |         argslist[argslist.index('--rank')+1] = str(i)
26 |     else:
27 |         argslist.append('--rank')
28 |         argslist.append(str(i))
29 |     stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w")
30 |     print(argslist)
31 |     p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
32 |     workers.append(p)
33 | 
34 | for p in workers:
35 |     p.wait()
36 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/__init__.py:
--------------------------------------------------------------------------------
 1 | from apex.transformer import amp
 2 | from apex.transformer import functional
 3 | from apex.transformer import parallel_state
 4 | from apex.transformer import pipeline_parallel
 5 | from apex.transformer import tensor_parallel
 6 | from apex.transformer import utils
 7 | from apex.transformer.enums import LayerType
 8 | from apex.transformer.enums import AttnType
 9 | from apex.transformer.enums import AttnMaskType
10 | 
11 | 
12 | __all__ = [
13 |     "amp",
14 |     "functional",
15 |     "parallel_state",
16 |     "pipeline_parallel",
17 |     "tensor_parallel",
18 |     "utils",
19 |     # enums.py
20 |     "LayerType",
21 |     "AttnType",
22 |     "AttnMaskType",
23 | ]
24 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/_data/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer._data._batchsampler import MegatronPretrainingRandomSampler
2 | from apex.transformer._data._batchsampler import MegatronPretrainingSampler
3 | 
4 | 
5 | __all__ = [
6 |     "MegatronPretrainingRandomSampler",
7 |     "MegatronPretrainingSampler",
8 | ]
9 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/_ucc_util.py:
--------------------------------------------------------------------------------
 1 | from torch import distributed as dist
 2 | 
 3 | HAS_UCC = hasattr(dist, "is_ucc_available") and dist.is_ucc_available()
 4 | if not HAS_UCC:
 5 |     try:
 6 |         import torch_ucc
 7 |         HAS_UCC = True
 8 |     except ImportError:
 9 |         HAS_UCC = False
10 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/amp/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer.amp.grad_scaler import GradScaler
2 | 
3 | 
4 | __all__ = [
5 |     "GradScaler",
6 | ]
7 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/enums.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import enum
16 | 
17 | 
class LayerType(enum.Enum):
    """Enumeration with two members: ``encoder`` (1) and ``decoder`` (2)."""
    encoder = 1
    decoder = 2
21 | 
22 | 
class AttnType(enum.Enum):
    """Enumeration with two members: ``self_attn`` (1) and ``cross_attn`` (2)."""
    self_attn = 1
    cross_attn = 2
26 | 
27 | 
class AttnMaskType(enum.Enum):
    """Enumeration with two members: ``padding`` (1) and ``causal`` (2)."""
    padding = 1
    causal = 2
31 | 
32 | 
class ModelType(enum.Enum):
    """Enumeration with two members: ``encoder_or_decoder`` (1) and ``encoder_and_decoder`` (2)."""
    encoder_or_decoder = 1
    encoder_and_decoder = 2
36 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/functional/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer.functional.fused_softmax import FusedScaleMaskSoftmax
2 | 
3 | __all__ = [
4 |     "FusedScaleMaskSoftmax",
5 | ]
6 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/layers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | from apex.transformer.layers.layer_norm import FastLayerNorm
 3 | from apex.transformer.layers.layer_norm import FusedLayerNorm
 4 | from apex.transformer.layers.layer_norm import MixedFusedLayerNorm
 5 | 
 6 | 
 7 | __all__ = [
 8 |     "FastLayerNorm",
 9 |     "FusedLayerNorm",
10 |     "MixedFusedLayerNorm",
11 | ]
12 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/log_util.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | 
 4 | 
 5 | def get_transformer_logger(name: str) -> logging.Logger:
 6 |     name_wo_ext = os.path.splitext(name)[0]
 7 |     return logging.getLogger(name_wo_ext)
 8 | 
 9 | 
def set_logging_level(verbosity) -> None:
    """Change the logging severity of Apex's library root logger.

    Args:
        verbosity: value passed straight to ``setLevel`` on
            ``apex._library_root_logger``.
    """
    # Imported inside the function rather than at module import time.
    from apex import _library_root_logger as root_logger

    root_logger.setLevel(verbosity)
19 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer.pipeline_parallel.schedules import get_forward_backward_func
2 | from apex.transformer.pipeline_parallel.schedules.common import build_model
3 | 
4 | 
5 | __all__ = [
6 |     "get_forward_backward_func",
7 |     "build_model",
8 | ]
9 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/pipeline_parallel/schedules/__init__.py:
--------------------------------------------------------------------------------
 1 | from apex.transformer import parallel_state
 2 | from apex.transformer.pipeline_parallel.utils import get_num_microbatches
 3 | from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import (
 4 |     forward_backward_no_pipelining,
 5 | )
 6 | from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import (
 7 |     _forward_backward_pipelining_with_interleaving,
 8 | )
 9 | from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import (
10 |     forward_backward_pipelining_without_interleaving,
11 | )
12 | 
13 | __all__ = [
14 |     "get_forward_backward_func",
15 | ]
16 | 
17 | 
class ExperimentalWarning(Warning):
    """Warning subclass that adds no behavior; it is not referenced elsewhere in this module."""
    pass
20 | 
21 | 
def get_forward_backward_func(
    virtual_pipeline_model_parallel_size, pipeline_model_parallel_size,
):
    """Pick the forward/backward schedule function for the current setup.

    Args:
        virtual_pipeline_model_parallel_size: when not ``None`` (and pipeline
            parallelism is active), the interleaved schedule is selected.
        pipeline_model_parallel_size: the number of microbatches must be
            divisible by this value for the interleaved schedule.

    Returns:
        ``forward_backward_no_pipelining`` when the pipeline-parallel world
        size is not greater than 1; otherwise the interleaved or
        non-interleaved pipelining schedule function.

    Raises:
        RuntimeError: if the interleaved schedule is selected and the number
            of microbatches is not divisible by
            ``pipeline_model_parallel_size``.
    """
    # No pipeline parallelism: nothing to schedule across stages.
    if not (parallel_state.get_pipeline_model_parallel_world_size() > 1):
        return forward_backward_no_pipelining

    if virtual_pipeline_model_parallel_size is None:
        return forward_backward_pipelining_without_interleaving

    if get_num_microbatches() % pipeline_model_parallel_size != 0:
        msg = "number of microbatches is not divisible by pipeline-parallel size when using interleaved schedule"
        raise RuntimeError(msg)
    return _forward_backward_pipelining_with_interleaving
36 | 


--------------------------------------------------------------------------------
/HALP/apex/apex/transformer/testing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/apex/transformer/testing/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/csrc/compat.h:
--------------------------------------------------------------------------------
// If TORCH_CHECK is not already defined, alias it to AT_CHECK.
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

// DATA_PTR expands to `data_ptr` when VERSION_GE_1_3 is defined and to
// `data` otherwise.
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
10 | 


--------------------------------------------------------------------------------
/HALP/apex/csrc/flatten_unflatten.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | #include <torch/csrc/utils/tensor_flatten.h>
 3 | // https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h
 4 | 
// Returns the single tensor produced by passing `tensors` to
// torch::utils::flatten_dense_tensors.
at::Tensor flatten(std::vector<at::Tensor> tensors)
{
  return torch::utils::flatten_dense_tensors(tensors);
}
 9 | 
// Returns the vector of tensors produced by passing `flat` and `tensors` to
// torch::utils::unflatten_dense_tensors.
// NOTE(review): `tensors` presumably supplies the reference sizes for the
// split -- confirm against tensor_flatten.h.
std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tensor> tensors)
{
  return torch::utils::unflatten_dense_tensors(flat, tensors);
}
14 | 
// Python bindings: exposes the two wrappers above as `flatten` and `unflatten`.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("flatten", &flatten, "Flatten dense tensors");
  m.def("unflatten", &unflatten, "Unflatten dense tensors");
}
19 | 


--------------------------------------------------------------------------------
/HALP/apex/csrc/megatron/fused_weight_gradient_dense.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | #include <cstdio>
 4 | #include <vector>
 5 | 
// Declaration only: the definition is not in this file.
void wgrad_gemm_accum_fp32_cuda_stub(
  at::Tensor &input_2d,
  at::Tensor &d_output_2d,
  at::Tensor &d_weight
);

// Declaration only: the definition is not in this file.
void wgrad_gemm_accum_fp16_cuda_stub(
  at::Tensor &input_2d,
  at::Tensor &d_output_2d,
  at::Tensor &d_weight
);
17 | 
// Python bindings: exposes the two stubs declared above as
// `wgrad_gemm_accum_fp32` and `wgrad_gemm_accum_fp16`.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32_cuda_stub, "wgrad gemm accum in fp32");
    m.def("wgrad_gemm_accum_fp16", &wgrad_gemm_accum_fp16_cuda_stub, "wgrad gemm accum in fp16");
}
22 | 


--------------------------------------------------------------------------------
/HALP/apex/csrc/static_switch.h:
--------------------------------------------------------------------------------
 1 | // From
 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
 3 | 
 4 | #pragma once
 5 | 
/// Turns a runtime boolean into a compile-time constant: the macro expands to
/// an immediately-invoked lambda that branches on COND and, in each branch,
/// defines CONST_NAME as a `constexpr static bool` before calling the
/// trailing argument with no arguments and returning its result.
///
/// @param COND       - a boolean expression to switch by
/// @param CONST_NAME - a name given for the constexpr bool variable.
/// @param ...       - a callable (e.g. a lambda); it is invoked once, in the
///                     branch selected by COND
///
/// Usage:
/// ```
/// BOOL_SWITCH(flag, BoolConst, [&] {
///     some_function<BoolConst>(...);
/// });
/// ```
#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
  [&] {                                         \
    if (COND) {                                 \
      constexpr static bool CONST_NAME = true;  \
      return __VA_ARGS__();                     \
    } else {                                    \
      constexpr static bool CONST_NAME = false; \
      return __VA_ARGS__();                     \
    }                                           \
  }()
26 | 


--------------------------------------------------------------------------------
/HALP/apex/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = NVIDIAAPEX
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | gh-pages:
16 | 	git checkout gh-pages
17 | 	rm -rf build
18 | 	rm -rf source
19 | 	git checkout master -- .
20 | 	make html
21 | 	rm -rf ../_modules ../_sources ../_static
22 | 	mv -fv build/html/* ../
23 | 	rm -rf build
24 | 	git add -A
25 | 	git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
26 | 
27 | .PHONY: help Makefile
28 | 
29 | # Catch-all target: route all unknown targets to Sphinx using the new
30 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
31 | %: Makefile
32 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
33 | 


--------------------------------------------------------------------------------
/HALP/apex/docs/source/_static/img/nv-pytorch2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/docs/source/_static/img/nv-pytorch2.png


--------------------------------------------------------------------------------
/HALP/apex/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
 1 | {% extends "!layout.html" %}
 2 |   {% block sidebartitle %} {{ super() }}
 3 | 
 4 |   <style>
 5 |     /* Sidebar header (and topbar for mobile) */
 6 |     .wy-side-nav-search, .wy-nav-top {
 7 |       background: #76b900;
 8 |     }
 9 | 
10 |     .wy-side-nav-search a:link, .wy-nav-top a:link {
11 |       color: #fff;
12 |     }
13 |     .wy-side-nav-search a:visited, .wy-nav-top a:visited {
14 |       color: #fff;
15 |     }
16 |     .wy-side-nav-search a:hover, .wy-nav-top a:hover {
17 |       color: #fff;
18 |     }
19 | 
20 |     .wy-menu-vertical a:link, .wy-menu-vertical a:visited {
21 |       color: #d9d9d9
22 |     }
23 | 
24 |     .wy-menu-vertical a:active {
25 |       background-color: #76b900
26 |     }
27 | 
28 |     .wy-side-nav-search>div.version {
29 |       color: rgba(0, 0, 0, 0.3)
30 |     }
31 |   </style>
32 |   {% endblock %}
33 | 
34 |   {% block footer %} {{ super() }}
35 | 
36 |   <style>
37 |   a:link, a:visited {
38 |     color: #76b900;
39 |   }
40 | 
41 |   a:hover {
42 |     color: #8c0;
43 |   }
44 | 
45 |   .rst-content dl:not(.docutils) dt {
46 |     background: rgba(118, 185, 0, 0.1);
47 |     color: rgba(59,93,0,1);
48 |     border-top: solid 3px rgba(59,93,0,1);
49 |   }
50 |   </style>
51 |   {% endblock %}
52 | 


--------------------------------------------------------------------------------
/HALP/apex/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. PyTorch documentation master file, created by
 2 |    sphinx-quickstart on Fri Dec 23 13:31:47 2016.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | :github_url: https://github.com/nvidia/apex
 7 | 
 8 | Apex (A PyTorch Extension)
 9 | ===================================
10 | 
11 | This site contains the API documentation for Apex (https://github.com/nvidia/apex),
12 | a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.  Some of the code here will be included in upstream PyTorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible.
13 | 
14 | Installation instructions can be found here:  https://github.com/NVIDIA/apex#quick-start.
15 | 
16 | Some other useful material, including GTC 2019 and PyTorch DevCon 2019 Slides, can be found here:  https://github.com/mcarilli/mixed_precision_references.
17 | 
18 | .. toctree::
19 |    :maxdepth: 1
20 |    :caption: AMP:  Automatic Mixed Precision
21 | 
22 |    amp
23 | 
24 | .. toctree::
25 |    :maxdepth: 1
26 |    :caption: Distributed Training
27 | 
28 |    parallel
29 | 
30 | .. toctree::
31 |    :maxdepth: 1
32 |    :caption: Fused Optimizers
33 | 
34 |    optimizers
35 | 
36 | .. toctree::
37 |    :maxdepth: 1
38 |    :caption: Fused Layer Norm
39 | 
40 |    layernorm
41 | 
42 | ..   .. toctree::
43 |      :maxdepth: 1
44 |      :caption: Deprecated mixed precision API
45 |      fp16_util
46 | 
47 | ..   RNN
48 |    
49 | Indices and tables
50 | ==================
51 | 
52 | * :ref:`genindex`
53 | * :ref:`modindex`
54 | 


--------------------------------------------------------------------------------
/HALP/apex/docs/source/layernorm.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.normalization.fused_layer_norm
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.normalization
 8 | .. currentmodule:: apex.normalization
 9 | 
10 | .. FusedLayerNorm
11 |    ----------
12 | 
13 | .. autoclass:: FusedLayerNorm
14 |     :members:
15 | 
16 | .. autoclass:: FusedRMSNorm
17 |     :members:
18 | 


--------------------------------------------------------------------------------
/HALP/apex/docs/source/optimizers.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.optimizers
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.optimizers
 8 | .. currentmodule:: apex.optimizers
 9 | 
10 | .. FusedAdam
11 |    ----------
12 | 
13 | .. autoclass:: FusedAdam
14 |     :members:
15 | 
16 | .. autoclass:: FusedLAMB
17 |     :members:
18 | 
19 | .. autoclass:: FusedNovoGrad
20 |     :members:
21 | 
22 | .. autoclass:: FusedSGD
23 |     :members:
24 | 


--------------------------------------------------------------------------------
/HALP/apex/docs/source/parallel.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.parallel
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.parallel
 8 | .. currentmodule:: apex.parallel
 9 | 
10 | .. DistributedDataParallel
11 |    ----------
12 | 
13 | .. autoclass:: DistributedDataParallel
14 |     :members:
15 | 
16 | .. autoclass:: Reducer
17 |     :members:
18 | 
19 | .. autoclass:: SyncBatchNorm
20 |     :members:
21 | 
22 | Utility functions
23 | ----------------------------------
24 | 
25 | .. autofunction:: convert_syncbn_model
26 | 


--------------------------------------------------------------------------------
/HALP/apex/examples/README.md:
--------------------------------------------------------------------------------
1 | This directory contains examples illustrating Apex mixed precision and distributed tools.
2 | 
3 | **Note for users of the pre-unification API**:
4 | `deprecated_api` contains examples illustrating the old (pre-unified) APIs.  These APIs will be removed soon, and users are strongly encouraged to switch.  The separate mixed precision tools called `Amp` and `FP16_Optimizer` in the old API are exposed via different flags/optimization levels in the new API.
5 | 


--------------------------------------------------------------------------------
/HALP/apex/examples/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base image must at least have pytorch and CUDA installed.
 2 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3
 3 | FROM $BASE_IMAGE
 4 | ARG BASE_IMAGE
 5 | RUN echo "Installing Apex on top of ${BASE_IMAGE}"
 6 | # make sure we don't overwrite some existing directory called "apex"
 7 | WORKDIR /tmp/unique_for_apex
 8 | # uninstall Apex if present, twice to make absolutely sure :)
 9 | RUN pip uninstall -y apex || :
10 | RUN pip uninstall -y apex || :
11 | # SHA is something the user can touch to force recreation of this Docker layer,
12 | # and therefore force cloning of the latest version of Apex
13 | RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
14 | WORKDIR /tmp/unique_for_apex/apex
15 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
16 | WORKDIR /workspace
17 | 


--------------------------------------------------------------------------------
/HALP/apex/examples/simple/distributed/README.md:
--------------------------------------------------------------------------------
 1 | **distributed_data_parallel.py** and **run.sh** show an example using Amp with
 2 | [apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
 3 | [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
 4 | and the PyTorch multiprocess launcher script,
 5 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
 6 | The use of `Amp` with DistributedDataParallel does not need to change from ordinary 
 7 | single-process use.  The only gotcha is that wrapping your model with `DistributedDataParallel` must
 8 | come after the call to `amp.initialize`.  Test via
 9 | ```bash
10 | bash run.sh
11 | ```
12 | 
13 | **This is intended purely as an instructional example, not a performance showcase.**
14 | 


--------------------------------------------------------------------------------
/HALP/apex/examples/simple/distributed/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
3 | 


--------------------------------------------------------------------------------
/HALP/apex/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools",
4 |     "wheel",
5 | ]
6 | build-backend = "setuptools.build_meta"
7 | 


--------------------------------------------------------------------------------
/HALP/apex/requirements.txt:
--------------------------------------------------------------------------------
1 | cxxfilt>=0.2.0
2 | tqdm>=4.28.1
3 | numpy>=1.15.3
4 | PyYAML>=5.1
5 | pytest>=3.5.1
6 | packaging>=14.0
7 | 


--------------------------------------------------------------------------------
/HALP/apex/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | flake8>=3.7.9
3 | Sphinx>=3.0.3


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_amp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_amp/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_amp/test_larc.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | from torch.nn import Parameter
 6 | 
 7 | from apex import amp
 8 | from apex.parallel.LARC import LARC
 9 | from utils import common_init
10 | 
11 | 
12 | class MyModel(torch.nn.Module):  # Minimal module holding one two-element CUDA parameter.
13 |     def __init__(self, unique):  # `unique` is added to every element of the initial weight.
14 |         super(MyModel, self).__init__()
15 |         self.weight0 = Parameter(
16 |             unique + torch.arange(2, device="cuda", dtype=torch.float32)  # arange(2) is [0, 1], shifted by `unique`
17 |         )
18 | 
19 |     def forward(self, input):  # Returns the sum of the element-wise product input * weight0.
20 |         return (input * self.weight0).sum()
21 | 
22 | 
23 | class TestLARC(unittest.TestCase):  # Runs one LARC-wrapped SGD step under each amp opt level.
24 |     def setUp(self):
25 |         self.x = torch.ones((2), device="cuda", dtype=torch.float32)  # (2) is the int 2: a length-2 vector of ones
26 |         common_init(self)  # helper imported from the sibling `utils` module
27 | 
28 |     def tearDown(self):
29 |         pass  # nothing is restored after a test
30 | 
31 |     def test_larc_mixed_precision(self):  # Smoke test: no assertions, it passes if no exception is raised.
32 |         for opt_level in ["O0", "O1", "O2", "O3"]:
33 |             model = MyModel(1)  # fresh model for every opt level
34 | 
35 |             optimizer = LARC(
36 |                 torch.optim.SGD(
37 |                     [{"params": model.parameters(), "lr": 0.25}], momentum=0.125
38 |                 )
39 |             )
40 | 
41 |             model, optimizer = amp.initialize(
42 |                 model, optimizer, opt_level=opt_level, verbosity=0
43 |             )
44 | 
45 |             optimizer.zero_grad()
46 |             loss = model(self.x)
47 |             with amp.scale_loss(loss, optimizer) as scaled_loss:  # backward runs on the scaled loss
48 |                 scaled_loss.backward()
49 |             optimizer.step()
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     unittest.main()
54 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_amp/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | HALF = 'torch.cuda.HalfTensor'  # type-name string for CUDA half tensors
 4 | FLOAT = 'torch.cuda.FloatTensor'  # type-name string for CUDA float tensors
 5 | 
 6 | DTYPES = [torch.half, torch.float]  # the two dtypes used as keys in the tables below
 7 | 
 8 | ALWAYS_HALF = {torch.float: HALF,  # every dtype maps to HALF
 9 |                torch.half: HALF}
10 | ALWAYS_FLOAT = {torch.float: FLOAT,  # every dtype maps to FLOAT
11 |                 torch.half: FLOAT}
12 | MATCH_INPUT = {torch.float: FLOAT,  # each dtype maps to its own type name
13 |                torch.half: HALF}  # NOTE(review): presumably expected output types per input dtype; confirm in the tests that import these
14 | 
15 | def common_init(test_case):  # Attach shared integer constants to `test_case` and set the default tensor type.
16 |     test_case.h = 64
17 |     test_case.b = 16
18 |     test_case.c = 16
19 |     test_case.k = 3
20 |     test_case.t = 10
21 |     torch.set_default_tensor_type(torch.cuda.FloatTensor)  # process-wide setting; it is not restored here
22 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_deprecated/test_deprecated_warning.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | 
 5 | import apex
 6 | from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase
 7 | 
 8 | 
 9 | def init_model_and_optimizer():  # Build the tiny model/optimizer pair shared by the tests below.
10 |     model = torch.nn.Linear(1, 1, bias=False).cuda()  # 1-in / 1-out linear layer without bias, moved to CUDA
11 |     optimizer = torch.optim.SGD(model.parameters(), 1.0)  # second positional argument is the learning rate
12 |     return model, optimizer
13 | 
14 | 
15 | @unittest.skipUnless(torch.cuda.is_available(), "")
16 | class TestDeprecatedWarning(unittest.TestCase):
17 | 
18 |     def test_amp(self):
19 |         model, optimizer = init_model_and_optimizer()
20 |         with self.assertWarns(apex.DeprecatedFeatureWarning):
21 |             _ = apex.amp.initialize(model, optimizer)
22 | 
23 |     def test_fp16_model(self):
24 |         model, _ = init_model_and_optimizer()
25 |         with self.assertWarns(apex.DeprecatedFeatureWarning):
26 |             _ = apex.fp16_utils.FP16Model(model)
27 | 
28 |     def test_fp16_optimizer(self):
29 |         _, optimizer = init_model_and_optimizer()
30 |         with self.assertWarns(apex.DeprecatedFeatureWarning):
31 |             _ = apex.fp16_utils.FP16_Optimizer(optimizer)
32 | 
33 |     def test_fp16_loss_scaler(self):
34 |         with self.assertWarns(apex.DeprecatedFeatureWarning):
35 |              apex.fp16_utils.LossScaler()
36 | 
37 | 
38 | class TestParallel(NcclDistributedTestBase):  # Deprecation-warning checks for the apex.parallel entry points.
39 | 
40 |     @property
41 |     def world_size(self):  # use at most two of the visible GPUs
42 |         return min(torch.cuda.device_count(), 2)
43 | 
44 |     def test_distributed_data_parallel(self):  # wrapping a model in apex DDP must warn
45 |         model, _ = init_model_and_optimizer()
46 |         with self.assertWarns(apex.DeprecatedFeatureWarning):
47 |             _ = apex.parallel.DistributedDataParallel(model)
48 | 
49 |     def test_convert_syncbn_model(self):  # convert_syncbn_model must warn
50 |         model, _ = init_model_and_optimizer()
51 |         with self.assertWarns(apex.DeprecatedFeatureWarning):
52 |             _ = apex.parallel.convert_syncbn_model(model)
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     unittest.main()
57 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_fp16util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_fp16util/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_optimizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_optimizers/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/apex/tests/L0/run_transformer/__init__.py


--------------------------------------------------------------------------------
/HALP/apex/tests/L0/run_transformer/test_transformer_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import torch
 4 | from torch.testing._internal import common_utils
 5 | 
 6 | logging.getLogger("torch").setLevel(logging.WARNING)
 7 | 
 8 | from apex.transformer import parallel_state
 9 | from apex.transformer.tensor_parallel import utils
10 | from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase
11 | 
12 | logging.getLogger("apex").setLevel(logging.WARNING)
13 | 
14 | 
15 | class TransformerUtilsTest(NcclDistributedTestBase):  # Tests for apex.transformer.tensor_parallel.utils.
16 |     def test_split_tensor_along_last_dim(self):  # Expects ten splits whose last dimension is 10 each.
17 |         for tensor_model_paralell_world_size in range(1, self.world_size + 1):  # candidate sizes 1..world_size
18 |             if self.world_size % tensor_model_paralell_world_size > 0:  # skip sizes that do not divide world_size
19 |                 continue
20 |             parallel_state.initialize_model_parallel(
21 |                 tensor_model_parallel_size_=tensor_model_paralell_world_size
22 |             )
23 | 
24 |             device = "cpu"  # the split itself is exercised on a CPU tensor
25 |             input_tensor = torch.randn((100, 100, 100), device=device)
26 |             splits = utils.split_tensor_along_last_dim(input_tensor, 10)
27 |             last_dim_shapes = torch.tensor(
28 |                 [int(split.size()[-1]) for split in splits]  # size of the last dimension of every split
29 |             )
30 | 
31 |             self.assertTrue(
32 |                 torch.equal(last_dim_shapes, torch.full((10,), 10),),  # ten entries, each equal to 10
33 |                 msg=f"tensor_model_paralell_world_size: {tensor_model_paralell_world_size}",
34 |             )
35 | 
36 |             parallel_state.destroy_model_parallel()  # tear down before the next size is initialized
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     common_utils.run_tests()
41 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/L1/cross_product/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
4 | # DATADIR="/opt/home/apex/examples/imagenet/"
5 | cp ../common/* .
6 | bash run_test.sh single_gpu $1
7 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/L1/cross_product_distributed/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cp ../common/* .
4 | bash run_test.sh distributed $1
5 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/distributed/DDP/run_race_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py
4 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/distributed/amp_master_params/compare.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | model_params_rank0 = torch.load("rank0model.pth",  # the four .pth files are read from the current directory
 4 |                            map_location = lambda storage, loc: storage.cuda(0))  # remap every storage onto GPU 0
 5 | model_params_rank1 = torch.load("rank1model.pth",
 6 |                                  map_location = lambda storage, loc: storage.cuda(0))
 7 | master_params_rank0 = torch.load("rank0master.pth",
 8 |                                  map_location = lambda storage, loc: storage.cuda(0))
 9 | master_params_rank1 = torch.load("rank1master.pth",
10 |                                  map_location = lambda storage, loc: storage.cuda(0))
11 | 
12 | for model_rank0, model_rank1, master_rank0, master_rank1 in zip(  # zip stops at the shortest input; lengths are not checked
13 |         model_params_rank0,
14 |         model_params_rank1,
15 |         master_params_rank0,
16 |         master_params_rank1):
17 |     assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"  # rank 0 vs rank 1, default tolerances
18 |     assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
19 |     # Some debugging/investigation assistance code:
20 |     # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
21 |     # offending_val_half = model_rank0.view(-1)[maxind.item()]
22 |     # offending_val_float = master_rank0.view(-1)[maxind.item()]
23 |     # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
24 |     #       offending_val_float.half().item())
25 |     # rtol needs to be > 2^-11 because of denormals...
26 |     assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"  # master param is cast to half first
27 | 
28 | print("OK:  Model and master params match across ranks.")
29 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/distributed/amp_master_params/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
3 | 
4 | python compare.py
5 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/distributed/synced_batchnorm/test_batchnorm1d.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import apex
 3 | 
 4 | model = apex.parallel.SyncBatchNorm(4).cuda()  # module under test, 4 features
 5 | model.weight.data.uniform_()  # randomize weight and bias in place
 6 | model.bias.data.uniform_()
 7 | data = torch.rand((8,4)).cuda()  # random input of shape (8, 4) on the GPU
 8 | 
 9 | model_ref = torch.nn.BatchNorm1d(4).cuda()  # reference implementation
10 | model_ref.load_state_dict(model.state_dict())  # start the reference from the same state as the module under test
11 | data_ref = data.clone()
12 | 
13 | output = model(data)
14 | output_ref = model_ref(data_ref)
15 | 
16 | assert(output.allclose(output_ref))  # forward outputs must agree
17 | assert(model.running_mean.allclose(model_ref.running_mean))  # running statistics must agree as well
18 | assert(model.running_var.allclose(model_ref.running_var))
19 | 


--------------------------------------------------------------------------------
/HALP/apex/tests/distributed/synced_batchnorm/unit_test.sh:
--------------------------------------------------------------------------------
1 | python python_single_gpu_unit_test.py
2 | python single_gpu_unit_test.py
3 | python test_batchnorm1d.py
4 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py
5 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp16
6 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_test_different_batch_size.py --apex
7 | # Beware: you need a system with at least 4 GPUs to test group_size < world_size.
8 | #python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
9 | 


--------------------------------------------------------------------------------
/HALP/assets/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/HALP/assets/pipeline.png


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_baseline.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | # data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: true
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 90


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_baseline_eval.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | # data_root: /ssd_data/imagenet/
 6 | data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: false
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 30


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_prune_rat0.2.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | # data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: true
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 30
30 | 
31 | # pruning
32 | prune_start_iter: 0
33 | prune_interval: 40
34 | prune_steps: 30
35 | prune_ratio: 0.2
36 | disable_layer_prune: false
37 | layer_cfg: configs/prune_configs/rn34_prune_layer.json
38 | fmap_cfg: configs/prune_configs/rn34_fmap.json
39 | group_size_cfg: configs/prune_configs/rtx2080_rn34_prune_groups.json
40 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
41 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_prune_rat0.25.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | # data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: true
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 30
30 | 
31 | # pruning
32 | prune_start_iter: 0
33 | prune_interval: 40
34 | prune_steps: 30
35 | prune_ratio: 0.25
36 | disable_layer_prune: false
37 | layer_cfg: configs/prune_configs/rn34_prune_layer.json
38 | fmap_cfg: configs/prune_configs/rn34_fmap.json
39 | group_size_cfg: configs/prune_configs/rtx2080_rn34_prune_groups.json
40 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
41 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_prune_rat0.3.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | # data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: true
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 30
30 | 
31 | # pruning
32 | prune_start_iter: 0
33 | prune_interval: 40
34 | prune_steps: 30
35 | prune_ratio: 0.3
36 | disable_layer_prune: false
37 | layer_cfg: configs/prune_configs/rn34_prune_layer.json
38 | fmap_cfg: configs/prune_configs/rn34_fmap.json
39 | group_size_cfg: configs/prune_configs/rtx2080_rn34_prune_groups.json
40 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
41 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_prune_rat0.35.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | # data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: true
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 30
30 | 
31 | # pruning
32 | prune_start_iter: 0
33 | prune_interval: 40
34 | prune_steps: 30
35 | prune_ratio: 0.35
36 | disable_layer_prune: false
37 | layer_cfg: configs/prune_configs/rn34_prune_layer.json
38 | fmap_cfg: configs/prune_configs/rn34_fmap.json
39 | group_size_cfg: configs/prune_configs/rtx2080_rn34_prune_groups.json
40 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
41 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn34_imagenet_prune_rat0.45.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet34
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | # data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 7 | class_num: 1000
 8 | workers: 10
 9 | batch_size: 128
10 | optimizer_batch_size: 1024
11 | 
12 | learning_rate: 1.024
13 | momentum: 0.875
14 | nesterov: false
15 | weight_decay: 3.0517578125e-05
16 | bn_weight_decay: 0.0
17 | lr_schedule: cosine
18 | warmup: 8
19 | epochs: 90
20 | 
21 | fp16: false
22 | amp: true
23 | opt_level: O1
24 | static_loss_scale: 128
25 | dynamic_loss_scale: true
26 | label_smoothing: 0.1
27 | mixup: 0.0
28 | 
29 | ckpt_freq: 30
30 | 
31 | # pruning
32 | prune_start_iter: 0
33 | prune_interval: 40
34 | prune_steps: 30
35 | prune_ratio: 0.45
36 | disable_layer_prune: false
37 | layer_cfg: configs/prune_configs/rn34_prune_layer.json
38 | fmap_cfg: configs/prune_configs/rn34_fmap.json
39 | group_size_cfg: configs/prune_configs/rtx2080_rn34_prune_groups.json
40 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
41 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn50_imagenet_baseline.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet50
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | class_num: 1000
 7 | workers: 10
 8 | batch_size: 128
 9 | optimizer_batch_size: 1024
10 | 
11 | learning_rate: 1.024
12 | momentum: 0.875
13 | nesterov: false
14 | weight_decay: 3.0517578125e-05
15 | bn_weight_decay: 0.0
16 | lr_schedule: cosine
17 | warmup: 8
18 | epochs: 90
19 | 
20 | fp16: false
21 | amp: true
22 | opt_level: O1
23 | static_loss_scale: 128
24 | dynamic_loss_scale: true
25 | label_smoothing: 0.1
26 | mixup: 0.0
27 | 
28 | ckpt_freq: 30


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn50_imagenet_baseline_eval.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet50
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /data_large/readonly/ImageNet-Fast/imagenet/
 6 | class_num: 1000
 7 | workers: 10
 8 | batch_size: 128
 9 | optimizer_batch_size: 1024
10 | 
11 | learning_rate: 1.024
12 | momentum: 0.875
13 | nesterov: false
14 | weight_decay: 3.0517578125e-05
15 | bn_weight_decay: 0.0
16 | lr_schedule: cosine
17 | warmup: 8
18 | epochs: 90
19 | 
20 | fp16: false
21 | amp: false
22 | opt_level: O1
23 | static_loss_scale: 128
24 | dynamic_loss_scale: true
25 | label_smoothing: 0.1
26 | mixup: 0.0
27 | 
28 | ckpt_freq: 30


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn50_imagenet_prune_rat0.1.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet50
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | class_num: 1000
 7 | workers: 10
 8 | batch_size: 128
 9 | optimizer_batch_size: 1024
10 | 
11 | learning_rate: 1.024
12 | momentum: 0.875
13 | nesterov: false
14 | weight_decay: 3.0517578125e-05
15 | bn_weight_decay: 0.0
16 | lr_schedule: cosine
17 | warmup: 8
18 | epochs: 90
19 | 
20 | fp16: false
21 | amp: true
22 | opt_level: O1
23 | static_loss_scale: 128
24 | dynamic_loss_scale: true
25 | label_smoothing: 0.1
26 | mixup: 0.0
27 | 
28 | ckpt_freq: 30
29 | 
30 | # pruning
31 | prune_start_iter: 0
32 | prune_interval: 40
33 | prune_steps: 30
34 | prune_ratio: 0.1
35 | disable_layer_prune: false
36 | layer_cfg: configs/prune_configs/rn50_prune_layer.json
37 | fmap_cfg: configs/prune_configs/rn50_fmap.json
38 | group_size_cfg: configs/prune_configs/rtx2080_rn50_prune_groups.json
39 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
40 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn50_imagenet_prune_rat0.15.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet50
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | class_num: 1000
 7 | workers: 10
 8 | batch_size: 128
 9 | optimizer_batch_size: 1024
10 | 
11 | learning_rate: 1.024
12 | momentum: 0.875
13 | nesterov: false
14 | weight_decay: 3.0517578125e-05
15 | bn_weight_decay: 0.0
16 | lr_schedule: cosine
17 | warmup: 8
18 | epochs: 90
19 | 
20 | fp16: false
21 | amp: true
22 | opt_level: O1
23 | static_loss_scale: 128
24 | dynamic_loss_scale: true
25 | label_smoothing: 0.1
26 | mixup: 0.0
27 | 
28 | ckpt_freq: 30
29 | 
30 | # pruning
31 | prune_start_iter: 0
32 | prune_interval: 40
33 | prune_steps: 30
34 | prune_ratio: 0.15
35 | disable_layer_prune: false
36 | layer_cfg: configs/prune_configs/rn50_prune_layer.json
37 | fmap_cfg: configs/prune_configs/rn50_fmap.json
38 | group_size_cfg: configs/prune_configs/rtx2080_rn50_prune_groups.json
39 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
40 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn50_imagenet_prune_rat0.2.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet50
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | class_num: 1000
 7 | workers: 10
 8 | batch_size: 128
 9 | optimizer_batch_size: 1024
10 | 
11 | learning_rate: 1.024
12 | momentum: 0.875
13 | nesterov: false
14 | weight_decay: 3.0517578125e-05
15 | bn_weight_decay: 0.0
16 | lr_schedule: cosine
17 | warmup: 8
18 | epochs: 90
19 | 
20 | fp16: false
21 | amp: true
22 | opt_level: O1
23 | static_loss_scale: 128
24 | dynamic_loss_scale: true
25 | label_smoothing: 0.1
26 | mixup: 0.0
27 | 
28 | ckpt_freq: 30
29 | 
30 | # pruning
31 | prune_start_iter: 0
32 | prune_interval: 40
33 | prune_steps: 30
34 | prune_ratio: 0.2
35 | disable_layer_prune: false
36 | layer_cfg: configs/prune_configs/rn50_prune_layer.json
37 | fmap_cfg: configs/prune_configs/rn50_fmap.json
38 | group_size_cfg: configs/prune_configs/rtx2080_rn50_prune_groups.json
39 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
40 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/exp_configs/rn50_imagenet_prune_rat0.45.yaml:
--------------------------------------------------------------------------------
 1 | arch: resnet50
 2 | enable_bias: false
 3 | 
 4 | dataset_name: imagenet
 5 | data_root: /ssd_data/imagenet/
 6 | class_num: 1000
 7 | workers: 10
 8 | batch_size: 128
 9 | optimizer_batch_size: 1024
10 | 
11 | learning_rate: 1.024
12 | momentum: 0.875
13 | nesterov: false
14 | weight_decay: 3.0517578125e-05
15 | bn_weight_decay: 0.0
16 | lr_schedule: cosine
17 | warmup: 8
18 | epochs: 90
19 | 
20 | fp16: false
21 | amp: true
22 | opt_level: O1
23 | static_loss_scale: 128
24 | dynamic_loss_scale: true
25 | label_smoothing: 0.1
26 | mixup: 0.0
27 | 
28 | ckpt_freq: 30
29 | 
30 | # pruning
31 | prune_start_iter: 0
32 | prune_interval: 40
33 | prune_steps: 30
34 | prune_ratio: 0.45
35 | disable_layer_prune: false
36 | layer_cfg: configs/prune_configs/rn50_prune_layer.json
37 | fmap_cfg: configs/prune_configs/rn50_fmap.json
38 | group_size_cfg: configs/prune_configs/rtx2080_rn50_prune_groups.json
39 | latency_lut_file: LUT/rn50_rtx2080_cudnn_batch256.pkl
40 | lut_bs: 256


--------------------------------------------------------------------------------
/HALP/configs/prune_configs/rn34_fmap.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "module.conv1": 224,
 3 |     "module.layer1.0.conv1": 56,
 4 |     "module.layer1.0.conv2": 56,
 5 |     "module.layer1.1.conv1": 56,
 6 |     "module.layer1.1.conv2": 56,
 7 |     "module.layer1.2.conv1": 56,
 8 |     "module.layer1.2.conv2": 56,
 9 |     "module.layer2.0.conv1": 56,
10 |     "module.layer2.0.conv2": 28,
11 |     "module.layer2.1.conv1": 28,
12 |     "module.layer2.1.conv2": 28,
13 |     "module.layer2.2.conv1": 28,
14 |     "module.layer2.2.conv2": 28,
15 |     "module.layer2.3.conv1": 28,
16 |     "module.layer2.3.conv2": 28,
17 |     "module.layer2.0.downsample.0": 56,
18 |     "module.layer3.0.conv1": 28,
19 |     "module.layer3.0.conv2": 14,
20 |     "module.layer3.1.conv1": 14,
21 |     "module.layer3.1.conv2": 14,
22 |     "module.layer3.2.conv1": 14,
23 |     "module.layer3.2.conv2": 14,
24 |     "module.layer3.3.conv1": 14,
25 |     "module.layer3.3.conv2": 14,
26 |     "module.layer3.4.conv1": 14,
27 |     "module.layer3.4.conv2": 14,
28 |     "module.layer3.5.conv1": 14,
29 |     "module.layer3.5.conv2": 14,
30 |     "module.layer3.0.downsample.0": 28,
31 |     "module.layer4.0.conv1": 14,
32 |     "module.layer4.0.conv2": 7,
33 |     "module.layer4.1.conv1": 7,
34 |     "module.layer4.1.conv2": 7,
35 |     "module.layer4.2.conv1": 7,
36 |     "module.layer4.2.conv2": 7,
37 |     "module.layer4.0.downsample.0": 14
38 | }


--------------------------------------------------------------------------------
/HALP/configs/prune_configs/rtx2080_rn34_prune_groups.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "module.conv1": 2,
 3 |     "module.layer1.0.conv1": 32,
 4 |     "module.layer1.0.conv2": 32,
 5 |     "module.layer1.1.conv1": 32,
 6 |     "module.layer1.1.conv2": 32,
 7 |     "module.layer1.2.conv1": 32,
 8 |     "module.layer1.2.conv2": 32,
 9 |     "module.layer2.0.conv1": 64,
10 |     "module.layer2.0.conv2": 64,
11 |     "module.layer2.1.conv1": 64,
12 |     "module.layer2.1.conv2": 64,
13 |     "module.layer2.2.conv1": 64,
14 |     "module.layer2.2.conv2": 64,
15 |     "module.layer2.3.conv1": 64,
16 |     "module.layer2.3.conv2": 64,
17 |     "module.layer2.0.downsample.0": 64,
18 |     "module.layer3.0.conv1": 64,
19 |     "module.layer3.0.conv2": 32,
20 |     "module.layer3.1.conv1": 32,
21 |     "module.layer3.1.conv2": 32,
22 |     "module.layer3.2.conv1": 32,
23 |     "module.layer3.2.conv2": 32,
24 |     "module.layer3.3.conv1": 32,
25 |     "module.layer3.3.conv2": 32,
26 |     "module.layer3.4.conv1": 32,
27 |     "module.layer3.4.conv2": 32,
28 |     "module.layer3.5.conv1": 32,
29 |     "module.layer3.5.conv2": 32,
30 |     "module.layer3.0.downsample.0": 128,
31 |     "module.layer4.0.conv1": 64,
32 |     "module.layer4.0.conv2": 64,
33 |     "module.layer4.1.conv1": 64,
34 |     "module.layer4.1.conv2": 64,
35 |     "module.layer4.2.conv1": 64,
36 |     "module.layer4.2.conv2": 64,
37 |     "module.layer4.0.downsample.0": 128
38 | }


--------------------------------------------------------------------------------
/HALP/configs/prune_configs/titanv_rn34_prune_groups.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "module.conv1": 2,
 3 |     "module.layer1.0.conv1": 32,
 4 |     "module.layer1.0.conv2": 32,
 5 |     "module.layer1.1.conv1": 32,
 6 |     "module.layer1.1.conv2": 32,
 7 |     "module.layer1.2.conv1": 32,
 8 |     "module.layer1.2.conv2": 32,
 9 |     "module.layer2.0.conv1": 32,
10 |     "module.layer2.0.conv2": 32,
11 |     "module.layer2.1.conv1": 32,
12 |     "module.layer2.1.conv2": 32,
13 |     "module.layer2.2.conv1": 32,
14 |     "module.layer2.2.conv2": 32,
15 |     "module.layer2.3.conv1": 32,
16 |     "module.layer2.3.conv2": 32,
17 |     "module.layer2.0.downsample.0": 32,
18 |     "module.layer3.0.conv1": 64,
19 |     "module.layer3.0.conv2": 32,
20 |     "module.layer3.1.conv1": 32,
21 |     "module.layer3.1.conv2": 32,
22 |     "module.layer3.2.conv1": 32,
23 |     "module.layer3.2.conv2": 32,
24 |     "module.layer3.3.conv1": 32,
25 |     "module.layer3.3.conv2": 32,
26 |     "module.layer3.4.conv1": 32,
27 |     "module.layer3.4.conv2": 32,
28 |     "module.layer3.5.conv1": 32,
29 |     "module.layer3.5.conv2": 32,
30 |     "module.layer3.0.downsample.0": 64,
31 |     "module.layer4.0.conv1": 64,
32 |     "module.layer4.0.conv2": 32,
33 |     "module.layer4.1.conv1": 32,
34 |     "module.layer4.1.conv2": 32,
35 |     "module.layer4.2.conv1": 32,
36 |     "module.layer4.2.conv2": 32,
37 |     "module.layer4.0.downsample.0": 128
38 | }


--------------------------------------------------------------------------------
/HALP/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # Copyright (C) 2022 NVIDIA Corporation. All rights reserved.
 3 | #
 4 | # Official PyTorch implementation of NeurIPS2022 paper
 5 | # Structural Pruning via Latency-Saliency Knapsack
 6 | # Maying Shen, Hongxu Yin, Pavlo Molchanov, Lei Mao, Jianna Liu and Jose M. Alvarez
 7 | #
 8 | # This work is licensed under the NVIDIA Source Code License
 9 | # To view a copy of this license, see the LICENSE file.
10 | # --------------------------------------------------------
11 | 
12 | from models.create_model import get_model, fuse_model
13 | 
14 | __all__ = get_model, fuse_model
15 | 


--------------------------------------------------------------------------------
/HALP/train/lr_schedule.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # Copyright (C) 2022 NVIDIA Corporation. All rights reserved.
 3 | #
 4 | # Official PyTorch implementation of NeurIPS2022 paper
 5 | # Structural Pruning via Latency-Saliency Knapsack
 6 | # Maying Shen, Hongxu Yin, Pavlo Molchanov, Lei Mao, Jianna Liu and Jose M. Alvarez
 7 | #
 8 | # This work is licensed under the NVIDIA Source Code License
 9 | # To view a copy of this license, see the LICENSE file.
10 | # --------------------------------------------------------
11 | 
12 | import numpy as np
13 | 
14 | 
15 | def lr_policy(lr_fn):
16 |     def _modify_lr(optimizer, epoch):
17 |         lr = lr_fn(epoch)
18 | 
19 |         for param_group in optimizer.param_groups:
20 |             param_group["lr"] = lr
21 | 
22 |     return _modify_lr
23 | 
24 | 
25 | def lr_step_policy(base_lr, steps, decay_factor, warmup_length):
26 |     def _lr_fn(epoch):
27 |         if epoch < warmup_length:
28 |             lr = base_lr * (epoch + 1) / warmup_length
29 |         else:
30 |             lr = base_lr
31 |             for s in steps:
32 |                 if epoch >= s:
33 |                     lr *= decay_factor
34 |         return lr
35 | 
36 |     return lr_policy(_lr_fn)
37 | 
38 | 
39 | def lr_cosine_policy(base_lr, warmup_length, epochs):
40 |     def _lr_fn(epoch):
41 |         if epoch < warmup_length:
42 |             lr = base_lr * (epoch + 1) / warmup_length
43 |         else:
44 |             e = epoch - warmup_length
45 |             es = epochs - warmup_length
46 |             lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
47 |         return lr
48 | 
49 |     return lr_policy(_lr_fn)
50 | 


--------------------------------------------------------------------------------
/HALP/utils/mixup.py:
--------------------------------------------------------------------------------
 1 | """Originated from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Classification/ConvNets/image_classification/mixup.py
 2 | """
 3 | 
 4 | import numpy as np
 5 | import torch
 6 | import torch.nn as nn
 7 | 
 8 | 
 9 | def mixup(alpha, num_classes, data, target):
10 |     with torch.no_grad():
11 |         bs = data.size(0)
12 |         c = np.random.beta(alpha, alpha)
13 | 
14 |         perm = torch.randperm(bs).cuda()
15 | 
16 |         md = c * data + (1 - c) * data[perm, :]
17 |         mt = c * target + (1 - c) * target[perm, :]
18 |         return md, mt
19 | 
20 | 
21 | class MixUpWrapper(object):
22 |     def __init__(self, alpha, num_classes, dataloader):
23 |         self.alpha = alpha
24 |         self.dataloader = dataloader
25 |         self.num_classes = num_classes
26 | 
27 |     def mixup_loader(self, loader):
28 |         for input, target in loader:
29 |             i, t = mixup(self.alpha, self.num_classes, input, target)
30 |             yield i, t
31 | 
32 |     def __iter__(self):
33 |         return self.mixup_loader(self.dataloader)
34 | 
35 | 
36 | class NLLMultiLabelSmooth(nn.Module):
37 |     def __init__(self, smoothing=0.0):
38 |         super(NLLMultiLabelSmooth, self).__init__()
39 |         self.confidence = 1.0 - smoothing
40 |         self.smoothing = smoothing
41 | 
42 |     def forward(self, x, target):
43 |         if self.training:
44 |             x = x.float()
45 |             target = target.float()
46 |             logprobs = torch.nn.functional.log_softmax(x, dim=-1)
47 | 
48 |             nll_loss = -logprobs * target
49 |             nll_loss = nll_loss.sum(-1)
50 | 
51 |             smooth_loss = -logprobs.mean(dim=-1)
52 | 
53 |             loss = self.confidence * nll_loss + self.smoothing * smooth_loss
54 | 
55 |             return loss.mean()
56 |         else:
57 |             return torch.nn.functional.cross_entropy(x, target)
58 | 


--------------------------------------------------------------------------------
/HALP/utils/smoothing.py:
--------------------------------------------------------------------------------
 1 | """Originated from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/Classification/ConvNets/image_classification/smoothing.py
 2 | """
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | 
 7 | 
 8 | class LabelSmoothing(nn.Module):
 9 |     """
10 |     NLL loss with label smoothing.
11 |     """
12 | 
13 |     def __init__(self, smoothing=0.0):
14 |         """
15 |         Constructor for the LabelSmoothing module.
16 | 
17 |         :param smoothing: label smoothing factor
18 |         """
19 |         super(LabelSmoothing, self).__init__()
20 |         self.confidence = 1.0 - smoothing
21 |         self.smoothing = smoothing
22 | 
23 |     def forward(self, x, target):
24 |         logprobs = torch.nn.functional.log_softmax(x, dim=-1)
25 | 
26 |         nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
27 |         nll_loss = nll_loss.squeeze(1)
28 |         smooth_loss = -logprobs.mean(dim=-1)
29 |         loss = self.confidence * nll_loss + self.smoothing * smooth_loss
30 |         return loss.mean()
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 snu-mllab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/asset/short_demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/asset/short_demo.png


--------------------------------------------------------------------------------
/asset/title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/asset/title.png


--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | *.pth
2 | *.ckpt
3 | 


--------------------------------------------------------------------------------
/examples/ckpt/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/examples/ckpt/.keep


--------------------------------------------------------------------------------
/examples/ddpm_cifar10.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CIFAR10"
 3 |     image_size: 32
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 128
34 |     n_epochs: 256
35 |     n_iters: 100000
36 |     snapshot_freq: 50000
37 |     validation_freq: 2000
38 | 
39 | sampling:
40 |     batch_size: 64
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.0002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/examples/images/husky.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/examples/images/husky.png


--------------------------------------------------------------------------------
/layer_merge/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/__init__.py


--------------------------------------------------------------------------------
/layer_merge/aggregate_imp.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | import argparse
 4 | 
 5 | parser = argparse.ArgumentParser(description="Aggregating the parallelized importance table.")
 6 | parser.add_argument(
 7 |     "-d",
 8 |     "--dir",
 9 |     type=str,
10 |     help="directory name",
11 | )
12 | parser.add_argument(
13 |     "-n",
14 |     "--num",
15 |     type=int,
16 |     help="the number of blks",
17 | )
18 | import re
19 | 
20 | 
21 | def natural_key(string_):
22 |     return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)]
23 | 
24 | 
25 | def main():
26 |     args = parser.parse_args()
27 |     res = pd.DataFrame()
28 |     for currentpath, folders, files in os.walk(args.dir):
29 |         for f in sorted(files, key=natural_key):
30 |             if ".csv" in f:
31 |                 print(f)
32 |                 tmp = pd.read_csv(os.path.join(currentpath, f))
33 |                 res = pd.concat([res, tmp])
34 |     print(len(res))
35 |     assert len(res) == args.num
36 |     res.to_csv(os.path.join(args.dir, "importance.csv"))
37 | 


--------------------------------------------------------------------------------
/layer_merge/kim23efficient/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/kim23efficient/__init__.py


--------------------------------------------------------------------------------
/layer_merge/kim24layer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/kim24layer/__init__.py


--------------------------------------------------------------------------------
/layer_merge/kim24layermerge/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/kim24layermerge/__init__.py


--------------------------------------------------------------------------------
/layer_merge/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/models/__init__.py


--------------------------------------------------------------------------------
/layer_merge/models/ddpm_cfg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snu-mllab/LayerMerge/3dcddb6b8a90c84afe8400fada63b1d5c47377c5/layer_merge/models/ddpm_cfg/__init__.py


--------------------------------------------------------------------------------
/layer_merge/models/ddpm_cfg/bedroom.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "LSUN"
 3 |     category: "bedroom"
 4 |     image_size: 256
 5 |     channels: 3
 6 |     logit_transform: false
 7 |     uniform_dequantization: false
 8 |     gaussian_dequantization: false
 9 |     random_flip: true
10 |     rescaled: true
11 |     num_workers: 32
12 | 
13 | model:
14 |     type: "simple"
15 |     in_channels: 3
16 |     out_ch: 3
17 |     ch: 128
18 |     ch_mult: [1, 1, 2, 2, 4, 4]
19 |     num_res_blocks: 2
20 |     attn_resolutions: [16, ]
21 |     dropout: 0.0
22 |     var_type: fixedsmall
23 |     ema_rate: 0.999
24 |     ema: True
25 |     resamp_with_conv: True
26 | 
27 | diffusion:
28 |     beta_schedule: linear
29 |     beta_start: 0.0001
30 |     beta_end: 0.02
31 |     num_diffusion_timesteps: 1000
32 | 
33 | training:
34 |     batch_size: 8
35 |     n_epochs: 10000
36 |     n_iters: 5000000
37 |     snapshot_freq: 5000
38 |     validation_freq: 2000
39 | 
40 | sampling:
41 |     batch_size: 16
42 |     last_only: True
43 | 
44 | optim:
45 |     weight_decay: 0.000
46 |     optimizer: "Adam"
47 |     lr: 0.000002
48 |     beta1: 0.9
49 |     amsgrad: false
50 |     eps: 0.00000001
51 | 


--------------------------------------------------------------------------------
/layer_merge/models/ddpm_cfg/celeba.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CELEBA"
 3 |     image_size: 64
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2, 4]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 96 # 128
34 |     n_epochs: 10000
35 |     n_iters: 5000000
36 |     snapshot_freq: 5000
37 |     validation_freq: 20000
38 | 
39 | sampling:
40 |     batch_size: 32
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.0002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/layer_merge/models/ddpm_cfg/church.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "LSUN"
 3 |     category: "church_outdoor"
 4 |     image_size: 256
 5 |     channels: 3
 6 |     logit_transform: false
 7 |     uniform_dequantization: false
 8 |     gaussian_dequantization: false
 9 |     random_flip: true
10 |     rescaled: true
11 |     num_workers: 32
12 | 
13 | model:
14 |     type: "simple"
15 |     in_channels: 3
16 |     out_ch: 3
17 |     ch: 128
18 |     ch_mult: [1, 1, 2, 2, 4, 4]
19 |     num_res_blocks: 2
20 |     attn_resolutions: [16, ]
21 |     dropout: 0.0
22 |     var_type: fixedsmall
23 |     ema_rate: 0.999
24 |     ema: True
25 |     resamp_with_conv: True
26 | 
27 | diffusion:
28 |     beta_schedule: linear
29 |     beta_start: 0.0001
30 |     beta_end: 0.02
31 |     num_diffusion_timesteps: 1000
32 | 
33 | training:
34 |     batch_size: 8 # 64
35 |     n_epochs: 10000
36 |     n_iters: 5000000
37 |     snapshot_freq: 5000
38 |     validation_freq: 2000
39 | 
40 | sampling:
41 |     batch_size: 16
42 |     last_only: True
43 | 
44 | optim:
45 |     weight_decay: 0.000
46 |     optimizer: "Adam"
47 |     lr: 0.00002
48 |     beta1: 0.9
49 |     amsgrad: false
50 |     eps: 0.00000001
51 | 


--------------------------------------------------------------------------------
/layer_merge/models/ddpm_cfg/cifar10.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |     dataset: "CIFAR10"
 3 |     image_size: 32
 4 |     channels: 3
 5 |     logit_transform: false
 6 |     uniform_dequantization: false
 7 |     gaussian_dequantization: false
 8 |     random_flip: true
 9 |     rescaled: true
10 |     num_workers: 4
11 | 
12 | model:
13 |     type: "simple"
14 |     in_channels: 3
15 |     out_ch: 3
16 |     ch: 128
17 |     ch_mult: [1, 2, 2, 2]
18 |     num_res_blocks: 2
19 |     attn_resolutions: [16, ]
20 |     dropout: 0.1
21 |     var_type: fixedlarge
22 |     ema_rate: 0.9999
23 |     ema: True
24 |     resamp_with_conv: True
25 | 
26 | diffusion:
27 |     beta_schedule: linear
28 |     beta_start: 0.0001
29 |     beta_end: 0.02
30 |     num_diffusion_timesteps: 1000
31 | 
32 | training:
33 |     batch_size: 128
34 |     n_epochs: 256
35 |     n_iters: 100000
36 |     snapshot_freq: 50000
37 |     validation_freq: 2000
38 | 
39 | sampling:
40 |     batch_size: 64
41 |     last_only: True
42 | 
43 | optim:
44 |     weight_decay: 0.000
45 |     optimizer: "Adam"
46 |     lr: 0.0002
47 |     beta1: 0.9
48 |     amsgrad: false
49 |     eps: 0.00000001
50 |     grad_clip: 1.0
51 | 


--------------------------------------------------------------------------------
/layer_merge/models/ddpm_datasets/ffhq.py:
--------------------------------------------------------------------------------
 1 | from io import BytesIO
 2 | 
 3 | import lmdb
 4 | from PIL import Image
 5 | from torch.utils.data import Dataset
 6 | 
 7 | 
 8 | class FFHQ(Dataset):
 9 |     def __init__(self, path, transform, resolution=8):
10 |         self.env = lmdb.open(
11 |             path,
12 |             max_readers=32,
13 |             readonly=True,
14 |             lock=False,
15 |             readahead=False,
16 |             meminit=False,
17 |         )
18 | 
19 |         if not self.env:
20 |             raise IOError('Cannot open lmdb dataset', path)
21 | 
22 |         with self.env.begin(write=False) as txn:
23 |             self.length = int(txn.get('length'.encode('utf-8')).decode('utf-8'))
24 | 
25 |         self.resolution = resolution
26 |         self.transform = transform
27 | 
28 |     def __len__(self):
29 |         return self.length
30 | 
31 |     def __getitem__(self, index):
32 |         with self.env.begin(write=False) as txn:
33 |             key = f'{self.resolution}-{str(index).zfill(5)}'.encode('utf-8')
34 |             img_bytes = txn.get(key)
35 | 
36 |         buffer = BytesIO(img_bytes)
37 |         img = Image.open(buffer)
38 |         img = self.transform(img)
39 |         target = 0
40 | 
41 |         return img, target


--------------------------------------------------------------------------------
/lymg.yml:
--------------------------------------------------------------------------------
 1 | name: lymg
 2 | channels:
 3 |   - conda-forge
 4 |   - defaults
 5 | dependencies:
 6 |   - _libgcc_mutex=0.1=main
 7 |   - _openmp_mutex=4.5=1_gnu
 8 |   - accimage=0.2.0=py37h37b52e9_2
 9 |   - ca-certificates=2022.9.24=ha878542_0
10 |   - certifi=2022.9.24=pyhd8ed1ab_0
11 |   - cudatoolkit=11.3.1=h2bc3f7f_2
12 |   - intel-ipp=2019.1.144=h711154d_3
13 |   - ld_impl_linux-64=2.35.1=h7274673_9
14 |   - libffi=3.3=he6710b0_2
15 |   - libgcc-ng=9.3.0=h5101ec6_17
16 |   - libgomp=9.3.0=h5101ec6_17
17 |   - libjpeg-turbo=2.1.0=h7f98852_0
18 |   - libstdcxx-ng=9.3.0=hd4cf53a_17
19 |   - ncurses=6.3=h7f8727e_2
20 |   - openssl=1.1.1k=h7f98852_0
21 |   - pip=21.2.2=py37h06a4308_0
22 |   - python=3.7.11=h12debd9_0
23 |   - python_abi=3.7=2_cp37m
24 |   - readline=8.1.2=h7f8727e_1
25 |   - setuptools=58.0.4=py37h06a4308_0
26 |   - sqlite=3.37.0=hc218d9a_0
27 |   - tk=8.6.11=h1ccaba5_0
28 |   - wheel=0.37.1=pyhd3eb1b0_0
29 |   - xz=5.2.5=h7b6447c_0
30 |   - zlib=1.2.11=h7f8727e_4
31 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | --extra-index-url https://download.pytorch.org/whl/cu113
 2 | accimage==0.2.0
 3 | colorama==0.4.5
 4 | einops==0.4.1
 5 | fvcore==0.1.5.post20220512
 6 | matplotlib==3.5.1
 7 | numpy==1.21.5
 8 | pandas==1.3.5
 9 | Pillow==9.0.0
10 | progress==1.6
11 | tensorboardX==2.6
12 | timm==0.4.12
13 | torch==1.12.1+cu113
14 | torchvision==0.13.1+cu113
15 | lmdb==1.4.1
16 | tensorboard==2.11.2
17 | accelerate==0.20.3
18 | jupyter==1.0.0
19 | notebook==6.4.12
20 | gdown==4.7.3
21 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(  # packaging metadata and command-line entry points for layer_merge
 4 |     name="layer_merge",
 5 |     version="0.1",
 6 |     packages=find_packages(),  # packages discovered automatically by setuptools
 7 |     package_data={  # non-Python files bundled with the ``layer_merge`` package
 8 |         "layer_merge": [
 9 |             "kim23efficient/*.txt",
10 |             "kim24layermerge/*.txt",
11 |             "models/ddpm_cfg/*.yml",
12 |         ]
13 |     },
14 |     entry_points={
15 |         "console_scripts": [  # each entry is ``command = module:function``
16 |             "lymg_kim23_dp = layer_merge.kim23efficient.generate_tables:main",
17 |             "lymg_kim23_imp = layer_merge.kim23efficient.importance:main",
18 |             "lymg_kim24_dp = layer_merge.kim24layermerge.generate_tables:main",
19 |             "lymg_kim24_imp = layer_merge.kim24layermerge.importance:main",
20 |             "lymg_kim24lyr_dp = layer_merge.kim24layer.generate_tables:main",
21 |             "lymg_kim24lyr_imp = layer_merge.kim24layer.importance:main",
22 |             "lymg_agg = layer_merge.aggregate_imp:main",
23 |         ]
24 |     },
25 | )
26 | 


--------------------------------------------------------------------------------