├── .github
    └── ISSUE_TEMPLATE
    │   └── bug_report.md
├── .gitignore
├── .gitmodules
├── .jenkins
    └── docker
    │   ├── build.sh
    │   └── launch.sh
├── .nojekyll
├── Dockerfile
├── LICENSE
├── README.md
├── apex
    ├── RNN
    │   ├── README.md
    │   ├── RNNBackend.py
    │   ├── __init__.py
    │   ├── cells.py
    │   └── models.py
    ├── __init__.py
    ├── _autocast_utils.py
    ├── amp
    │   ├── README.md
    │   ├── __init__.py
    │   ├── __version__.py
    │   ├── _amp_state.py
    │   ├── _initialize.py
    │   ├── _process_optimizer.py
    │   ├── amp.py
    │   ├── compat.py
    │   ├── frontend.py
    │   ├── handle.py
    │   ├── lists
    │   │   ├── __init__.py
    │   │   ├── functional_overrides.py
    │   │   ├── tensor_overrides.py
    │   │   └── torch_overrides.py
    │   ├── opt.py
    │   ├── rnn_compat.py
    │   ├── scaler.py
    │   ├── utils.py
    │   └── wrap.py
    ├── contrib
    │   ├── __init__.py
    │   ├── bottleneck
    │   │   ├── __init__.py
    │   │   ├── bottleneck.py
    │   │   ├── bottleneck_module_test.py
    │   │   ├── halo_exchangers.py
    │   │   └── test.py
    │   ├── clip_grad
    │   │   ├── __init__.py
    │   │   └── clip_grad.py
    │   ├── conv_bias_relu
    │   │   ├── __init__.py
    │   │   └── conv_bias_relu.py
    │   ├── csrc
    │   │   ├── bottleneck
    │   │   │   └── bottleneck.cpp
    │   │   ├── conv_bias_relu
    │   │   │   └── conv_bias_relu.cpp
    │   │   ├── fmha
    │   │   │   ├── fmha_api.cpp
    │   │   │   └── src
    │   │   │   │   ├── fmha.h
    │   │   │   │   ├── fmha
    │   │   │   │       ├── gemm.h
    │   │   │   │       ├── gmem_tile.h
    │   │   │   │       ├── kernel_traits.h
    │   │   │   │       ├── mask.h
    │   │   │   │       ├── smem_tile.h
    │   │   │   │       ├── softmax.h
    │   │   │   │       └── utils.h
    │   │   │   │   ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_dgrad_kernel_1xN_reload.h
    │   │   │   │   ├── fmha_dgrad_kernel_1xN_reload_nl.h
    │   │   │   │   ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
    │   │   │   │   ├── fmha_fprop_kernel_1xN.h
    │   │   │   │   ├── fmha_kernel.h
    │   │   │   │   ├── fmha_noloop_reduce.cu
    │   │   │   │   └── fmha_utils.h
    │   │   ├── focal_loss
    │   │   │   ├── focal_loss_cuda.cpp
    │   │   │   └── focal_loss_cuda_kernel.cu
    │   │   ├── groupbn
    │   │   │   ├── batch_norm.cu
    │   │   │   ├── batch_norm.h
    │   │   │   ├── batch_norm_add_relu.cu
    │   │   │   ├── batch_norm_add_relu.h
    │   │   │   ├── cuda_utils.h
    │   │   │   ├── dnn.h
    │   │   │   ├── interface.cpp
    │   │   │   ├── ipc.cu
    │   │   │   └── nhwc_batch_norm_kernel.h
    │   │   ├── index_mul_2d
    │   │   │   ├── index_mul_2d_cuda.cpp
    │   │   │   └── index_mul_2d_cuda_kernel.cu
    │   │   ├── layer_norm
    │   │   │   ├── ln.h
    │   │   │   ├── ln_api.cpp
    │   │   │   ├── ln_bwd_kernels.cuh
    │   │   │   ├── ln_bwd_semi_cuda_kernel.cu
    │   │   │   ├── ln_fwd_cuda_kernel.cu
    │   │   │   ├── ln_fwd_kernels.cuh
    │   │   │   ├── ln_kernel_traits.h
    │   │   │   └── ln_utils.cuh
    │   │   ├── multihead_attn
    │   │   │   ├── additive_masked_softmax_dropout_cuda.cu
    │   │   │   ├── dropout.cuh
    │   │   │   ├── encdec_multihead_attn_cuda.cu
    │   │   │   ├── encdec_multihead_attn_norm_add_cuda.cu
    │   │   │   ├── layer_norm.cuh
    │   │   │   ├── masked_softmax_dropout_cuda.cu
    │   │   │   ├── multihead_attn_frontend.cpp
    │   │   │   ├── philox.cuh
    │   │   │   ├── self_multihead_attn_bias_additive_mask_cuda.cu
    │   │   │   ├── self_multihead_attn_bias_cuda.cu
    │   │   │   ├── self_multihead_attn_cuda.cu
    │   │   │   ├── self_multihead_attn_norm_add_cuda.cu
    │   │   │   ├── softmax.cuh
    │   │   │   └── strided_batched_gemm.cuh
    │   │   ├── nccl_allocator
    │   │   │   └── NCCLAllocator.cpp
    │   │   ├── nccl_p2p
    │   │   │   ├── nccl_p2p.cpp
    │   │   │   ├── nccl_p2p_cuda.cu
    │   │   │   ├── nccl_p2p_cuda.cuh
    │   │   │   ├── nccl_version.cpp
    │   │   │   └── nccl_version_check.cu
    │   │   ├── optimizers
    │   │   │   ├── fused_adam_cuda.cpp
    │   │   │   ├── fused_adam_cuda_kernel.cu
    │   │   │   ├── fused_lamb_cuda.cpp
    │   │   │   ├── fused_lamb_cuda_kernel.cu
    │   │   │   ├── multi_tensor_distopt_adam.cpp
    │   │   │   ├── multi_tensor_distopt_adam_kernel.cu
    │   │   │   ├── multi_tensor_distopt_lamb.cpp
    │   │   │   └── multi_tensor_distopt_lamb_kernel.cu
    │   │   ├── peer_memory
    │   │   │   ├── peer_memory.cpp
    │   │   │   ├── peer_memory_cuda.cu
    │   │   │   └── peer_memory_cuda.cuh
    │   │   ├── transducer
    │   │   │   ├── transducer_joint.cpp
    │   │   │   ├── transducer_joint_kernel.cu
    │   │   │   ├── transducer_loss.cpp
    │   │   │   └── transducer_loss_kernel.cu
    │   │   └── xentropy
    │   │   │   ├── interface.cpp
    │   │   │   └── xentropy_kernel.cu
    │   ├── examples
    │   │   └── multihead_attn
    │   │   │   ├── func_test_multihead_attn.py
    │   │   │   └── perf_test_multihead_attn.py
    │   ├── fmha
    │   │   ├── __init__.py
    │   │   └── fmha.py
    │   ├── focal_loss
    │   │   ├── __init__.py
    │   │   └── focal_loss.py
    │   ├── groupbn
    │   │   ├── __init__.py
    │   │   └── batch_norm.py
    │   ├── index_mul_2d
    │   │   ├── __init__.py
    │   │   └── index_mul_2d.py
    │   ├── layer_norm
    │   │   ├── __init__.py
    │   │   └── layer_norm.py
    │   ├── multihead_attn
    │   │   ├── MHA_bwd.png
    │   │   ├── MHA_fwd.png
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── encdec_multihead_attn.py
    │   │   ├── encdec_multihead_attn_func.py
    │   │   ├── fast_encdec_multihead_attn_func.py
    │   │   ├── fast_encdec_multihead_attn_norm_add_func.py
    │   │   ├── fast_self_multihead_attn_func.py
    │   │   ├── fast_self_multihead_attn_norm_add_func.py
    │   │   ├── mask_softmax_dropout_func.py
    │   │   ├── self_multihead_attn.py
    │   │   └── self_multihead_attn_func.py
    │   ├── nccl_allocator
    │   │   ├── __init__.py
    │   │   └── nccl_allocator.py
    │   ├── optimizers
    │   │   ├── __init__.py
    │   │   ├── distributed_fused_adam.py
    │   │   ├── distributed_fused_lamb.py
    │   │   ├── fp16_optimizer.py
    │   │   ├── fused_adam.py
    │   │   ├── fused_lamb.py
    │   │   └── fused_sgd.py
    │   ├── peer_memory
    │   │   ├── __init__.py
    │   │   ├── peer_halo_exchange_module_tests.py
    │   │   ├── peer_halo_exchanger_1d.py
    │   │   └── peer_memory.py
    │   ├── sparsity
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── asp.py
    │   │   ├── permutation_lib.py
    │   │   ├── permutation_search_kernels
    │   │   │   ├── CUDA_kernels
    │   │   │   │   └── permutation_search_kernels.cu
    │   │   │   ├── __init__.py
    │   │   │   ├── call_permutation_search_kernels.py
    │   │   │   ├── exhaustive_search.py
    │   │   │   └── permutation_utilities.py
    │   │   ├── sparse_masklib.py
    │   │   └── test
    │   │   │   ├── checkpointing_test_part1.py
    │   │   │   ├── checkpointing_test_part2.py
    │   │   │   ├── checkpointing_test_reference.py
    │   │   │   └── toy_problem.py
    │   ├── test
    │   │   ├── clip_grad
    │   │   │   └── test_clip_grad.py
    │   │   ├── conv_bias_relu
    │   │   │   └── test_conv_bias_relu.py
    │   │   ├── fmha
    │   │   │   └── test_fmha.py
    │   │   ├── focal_loss
    │   │   │   └── test_focal_loss.py
    │   │   ├── fused_dense
    │   │   │   ├── test_gelu.py
    │   │   │   └── test_half.py
    │   │   ├── groupbn
    │   │   │   ├── test_groupbn.py
    │   │   │   └── test_groupbn_channel_last.py
    │   │   ├── index_mul_2d
    │   │   │   └── test_index_mul_2d.py
    │   │   ├── layer_norm
    │   │   │   └── test_fast_layer_norm.py
    │   │   ├── multihead_attn
    │   │   │   ├── test_encdec_multihead_attn.py
    │   │   │   ├── test_encdec_multihead_attn_norm_add.py
    │   │   │   ├── test_fast_self_multihead_attn_bias.py
    │   │   │   ├── test_mha_fused_softmax.py
    │   │   │   ├── test_self_multihead_attn.py
    │   │   │   └── test_self_multihead_attn_norm_add.py
    │   │   ├── optimizers
    │   │   │   ├── test_dist_adam.py
    │   │   │   └── test_distributed_fused_lamb.py
    │   │   ├── run_rocm_extensions.py
    │   │   ├── test_label_smoothing.py
    │   │   └── transducer
    │   │   │   ├── test_transducer_joint.py
    │   │   │   ├── test_transducer_loss.py
    │   │   │   └── transducer_ref.py
    │   ├── transducer
    │   │   ├── __init__.py
    │   │   └── transducer.py
    │   └── xentropy
    │   │   ├── __init__.py
    │   │   └── softmax_xentropy.py
    ├── fp16_utils
    │   ├── README.md
    │   ├── __init__.py
    │   ├── fp16_optimizer.py
    │   ├── fp16util.py
    │   └── loss_scaler.py
    ├── fused_dense
    │   ├── __init__.py
    │   └── fused_dense.py
    ├── mlp
    │   ├── __init__.py
    │   └── mlp.py
    ├── multi_tensor_apply
    │   ├── __init__.py
    │   └── multi_tensor_apply.py
    ├── normalization
    │   ├── __init__.py
    │   └── fused_layer_norm.py
    ├── optimizers
    │   ├── __init__.py
    │   ├── fused_adagrad.py
    │   ├── fused_adam.py
    │   ├── fused_lamb.py
    │   ├── fused_lars.py
    │   ├── fused_mixed_precision_lamb.py
    │   ├── fused_novograd.py
    │   └── fused_sgd.py
    ├── parallel
    │   ├── LARC.py
    │   ├── README.md
    │   ├── __init__.py
    │   ├── distributed.py
    │   ├── multiproc.py
    │   ├── optimized_sync_batchnorm.py
    │   ├── optimized_sync_batchnorm_kernel.py
    │   ├── sync_batchnorm.py
    │   └── sync_batchnorm_kernel.py
    ├── testing
    │   ├── __init__.py
    │   └── common_utils.py
    └── transformer
    │   ├── README.md
    │   ├── __init__.py
    │   ├── _data
    │       ├── __init__.py
    │       └── _batchsampler.py
    │   ├── amp
    │       ├── __init__.py
    │       └── grad_scaler.py
    │   ├── enums.py
    │   ├── functional
    │       ├── __init__.py
    │       ├── fused_rope.py
    │       └── fused_softmax.py
    │   ├── layers
    │       ├── __init__.py
    │       └── layer_norm.py
    │   ├── log_util.py
    │   ├── microbatches.py
    │   ├── parallel_state.py
    │   ├── pipeline_parallel
    │       ├── __init__.py
    │       ├── _timers.py
    │       ├── p2p_communication.py
    │       ├── schedules
    │       │   ├── __init__.py
    │       │   ├── common.py
    │       │   ├── fwd_bwd_no_pipelining.py
    │       │   ├── fwd_bwd_pipelining_with_interleaving.py
    │       │   └── fwd_bwd_pipelining_without_interleaving.py
    │       └── utils.py
    │   ├── tensor_parallel
    │       ├── __init__.py
    │       ├── cross_entropy.py
    │       ├── data.py
    │       ├── layers.py
    │       ├── mappings.py
    │       ├── memory.py
    │       ├── random.py
    │       └── utils.py
    │   ├── testing
    │       ├── __init__.py
    │       ├── arguments.py
    │       ├── commons.py
    │       ├── distributed_test_base.py
    │       ├── global_vars.py
    │       ├── standalone_bert.py
    │       ├── standalone_gpt.py
    │       └── standalone_transformer_lm.py
    │   └── utils.py
├── build.sh
├── csrc
    ├── amp_C_frontend.cpp
    ├── compat.h
    ├── flatten_unflatten.cpp
    ├── fused_dense_base.cpp
    ├── fused_dense_cuda.cu
    ├── layer_norm_cuda.cpp
    ├── layer_norm_cuda_kernel.cu
    ├── megatron
    │   ├── fused_bias_swiglu.cpp
    │   ├── fused_bias_swiglu_cuda.cu
    │   ├── fused_rotary_positional_embedding.cpp
    │   ├── fused_rotary_positional_embedding.h
    │   ├── fused_rotary_positional_embedding_cuda.cu
    │   ├── fused_weight_gradient_dense.cpp
    │   ├── fused_weight_gradient_dense_16bit_prec_cuda.cu
    │   ├── fused_weight_gradient_dense_cuda.cu
    │   ├── generic_scaled_masked_softmax.h
    │   ├── generic_scaled_masked_softmax_cpu.cpp
    │   ├── generic_scaled_masked_softmax_cuda.cu
    │   ├── scaled_masked_softmax.h
    │   ├── scaled_masked_softmax_cpu.cpp
    │   ├── scaled_masked_softmax_cuda.cu
    │   ├── scaled_softmax_cpu.cpp
    │   ├── scaled_softmax_cuda.cu
    │   ├── scaled_upper_triang_masked_softmax.h
    │   ├── scaled_upper_triang_masked_softmax_cpu.cpp
    │   └── scaled_upper_triang_masked_softmax_cuda.cu
    ├── mlp.cpp
    ├── mlp_cuda.cu
    ├── multi_tensor_adagrad.cu
    ├── multi_tensor_adam.cu
    ├── multi_tensor_apply.cuh
    ├── multi_tensor_apply_base.cuh
    ├── multi_tensor_axpby_kernel.cu
    ├── multi_tensor_l2norm_kernel.cu
    ├── multi_tensor_l2norm_kernel_mp.cu
    ├── multi_tensor_l2norm_scale_kernel.cu
    ├── multi_tensor_lamb.cu
    ├── multi_tensor_lamb_mp.cu
    ├── multi_tensor_lamb_stage_1.cu
    ├── multi_tensor_lamb_stage_2.cu
    ├── multi_tensor_lars.cu
    ├── multi_tensor_novograd.cu
    ├── multi_tensor_scale_kernel.cu
    ├── multi_tensor_sgd_kernel.cu
    ├── static_switch.h
    ├── syncbn.cpp
    ├── type_shim.h
    └── welford.cu
├── docs
    ├── Makefile
    └── source
    │   ├── _static
    │       ├── css
    │       │   └── pytorch_theme.css
    │       └── img
    │       │   └── nv-pytorch2.png
    │   ├── _templates
    │       └── layout.html
    │   ├── advanced.rst
    │   ├── amp.rst
    │   ├── conf.py
    │   ├── fp16_utils.rst
    │   ├── index.rst
    │   ├── layernorm.rst
    │   ├── optimizers.rst
    │   └── parallel.rst
├── examples
    ├── README.md
    ├── dcgan
    │   ├── README.md
    │   └── main_amp.py
    ├── docker
    │   ├── Dockerfile
    │   └── README.md
    ├── imagenet
    │   ├── README.md
    │   └── main_amp.py
    └── simple
    │   └── distributed
    │       ├── README.md
    │       ├── distributed_data_parallel.py
    │       └── run.sh
├── pyproject.toml
├── requirements.txt
├── requirements_dev.txt
├── setup.py
├── tests
    ├── L0
    │   ├── run_amp
    │   │   ├── __init__.py
    │   │   ├── test_add_param_group.py
    │   │   ├── test_basic_casts.py
    │   │   ├── test_cache.py
    │   │   ├── test_checkpointing.py
    │   │   ├── test_fused_sgd.py
    │   │   ├── test_larc.py
    │   │   ├── test_multi_tensor_axpby.py
    │   │   ├── test_multi_tensor_l2norm.py
    │   │   ├── test_multi_tensor_scale.py
    │   │   ├── test_multiple_models_optimizers_losses.py
    │   │   ├── test_promotion.py
    │   │   ├── test_rnn.py
    │   │   └── utils.py
    │   ├── run_fp16util
    │   │   ├── __init__.py
    │   │   └── test_fp16util.py
    │   ├── run_fused_dense
    │   │   ├── test_fused_dense.py
    │   │   └── test_gelu.py
    │   ├── run_fused_layer_norm
    │   │   └── test_fused_layer_norm.py
    │   ├── run_mlp
    │   │   └── test_mlp.py
    │   ├── run_optimizers
    │   │   ├── __init__.py
    │   │   ├── test_adam.py
    │   │   ├── test_fused_novograd.py
    │   │   ├── test_fused_optimizer.py
    │   │   ├── test_fused_optimizer_channels_last.py
    │   │   └── test_lamb.py
    │   ├── run_rocm.sh
    │   ├── run_test.py
    │   └── run_transformer
    │   │   ├── __init__.py
    │   │   ├── gpt_scaling_test.py
    │   │   ├── run_bert_minimal_test.py
    │   │   ├── run_dynamic_batchsize_test.py
    │   │   ├── run_gpt_minimal_test.py
    │   │   ├── test_batch_sampler.py
    │   │   ├── test_cross_entropy.py
    │   │   ├── test_data.py
    │   │   ├── test_fused_bias_swiglu.py
    │   │   ├── test_fused_rope.py
    │   │   ├── test_fused_softmax.py
    │   │   ├── test_layers.py
    │   │   ├── test_mapping.py
    │   │   ├── test_microbatches.py
    │   │   ├── test_p2p_comm.py
    │   │   ├── test_parallel_state.py
    │   │   ├── test_pipeline_parallel_fwd_bwd.py
    │   │   ├── test_random.py
    │   │   ├── test_transformer_module.py
    │   │   └── test_transformer_utils.py
    ├── L1
    │   ├── common
    │   │   ├── compare.py
    │   │   ├── main_amp.py
    │   │   └── run_test.sh
    │   ├── cross_product
    │   │   └── run.sh
    │   ├── cross_product_distributed
    │   │   └── run.sh
    │   └── transformer
    │   │   └── pipeline_parallel_fwd_bwd_ucc_async.py
    ├── distributed
    │   ├── DDP
    │   │   ├── ddp_race_condition_test.py
    │   │   └── run_race_test.sh
    │   ├── amp_master_params
    │   │   ├── amp_master_params.py
    │   │   ├── compare.py
    │   │   └── run.sh
    │   ├── run_rocm_distributed.sh
    │   └── synced_batchnorm
    │   │   ├── python_single_gpu_unit_test.py
    │   │   ├── single_gpu_unit_test.py
    │   │   ├── test_batchnorm1d.py
    │   │   ├── test_groups.py
    │   │   ├── two_gpu_test_different_batch_size.py
    │   │   ├── two_gpu_unit_test.py
    │   │   └── unit_test.sh
    └── docker_extension_builds
    │   └── run.sh
└── version.txt


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve apex
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the Bug**
11 | 
12 | **Minimal Steps/Code to Reproduce the Bug**
13 | <!--
14 | Please list the *minimal* steps or provide a code snippet for us to be able to reproduce the bug.
15 | 
16 | A helpful guide on on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
17 | --> 
18 | 
19 | **Expected Behavior**
20 | <!-- A clear and concise description of what you expected to happen. -->
21 | 
22 | **Environment**
23 | <!-- OS, version of Python, CUDA, PyTorch; collect these via `python -m torch.utils.collect_env` -->
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | apex.egg-info
  2 | dist
  3 | build
  4 | docs/build
  5 | *~
  6 | __pycache__
  7 | .vscode
  8 | 
  9 | # Copied from https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
 10 | # Byte-compiled / optimized / DLL files
 11 | __pycache__/
 12 | *.py[cod]
 13 | *$py.class
 14 | 
 15 | # C extensions
 16 | *.so
 17 | 
 18 | # Distribution / packaging
 19 | .Python
 20 | build/
 21 | develop-eggs/
 22 | dist/
 23 | downloads/
 24 | eggs/
 25 | .eggs/
 26 | lib/
 27 | lib64/
 28 | parts/
 29 | sdist/
 30 | var/
 31 | wheels/
 32 | share/python-wheels/
 33 | *.egg-info/
 34 | .installed.cfg
 35 | *.egg
 36 | MANIFEST
 37 | 
 38 | # PyInstaller
 39 | #  Usually these files are written by a python script from a template
 40 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 41 | *.manifest
 42 | *.spec
 43 | 
 44 | # Installer logs
 45 | pip-log.txt
 46 | pip-delete-this-directory.txt
 47 | 
 48 | # Unit test / coverage reports
 49 | htmlcov/
 50 | .tox/
 51 | .nox/
 52 | .coverage
 53 | .coverage.*
 54 | .cache
 55 | nosetests.xml
 56 | coverage.xml
 57 | *.cover
 58 | *.py,cover
 59 | .hypothesis/
 60 | .pytest_cache/
 61 | cover/
 62 | 
 63 | # Translations
 64 | *.mo
 65 | *.pot
 66 | 
 67 | # Django stuff:
 68 | *.log
 69 | local_settings.py
 70 | db.sqlite3
 71 | db.sqlite3-journal
 72 | 
 73 | # Flask stuff:
 74 | instance/
 75 | .webassets-cache
 76 | 
 77 | # Scrapy stuff:
 78 | .scrapy
 79 | 
 80 | # Sphinx documentation
 81 | docs/_build/
 82 | 
 83 | # PyBuilder
 84 | .pybuilder/
 85 | target/
 86 | 
 87 | # Jupyter Notebook
 88 | .ipynb_checkpoints
 89 | 
 90 | # IPython
 91 | profile_default/
 92 | ipython_config.py
 93 | 
 94 | # pyenv
 95 | #   For a library or package, you might want to ignore these files since the code is
 96 | #   intended to run in multiple environments; otherwise, check them in:
 97 | # .python-version
 98 | 
 99 | # pipenv
100 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | #   install all needed dependencies.
104 | #Pipfile.lock
105 | 
106 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
107 | __pypackages__/
108 | 
109 | # Celery stuff
110 | celerybeat-schedule
111 | celerybeat.pid
112 | 
113 | # SageMath parsed files
114 | *.sage.py
115 | 
116 | # Environments
117 | .env
118 | .venv
119 | env/
120 | venv/
121 | ENV/
122 | env.bak/
123 | venv.bak/
124 | 
125 | # Spyder project settings
126 | .spyderproject
127 | .spyproject
128 | 
129 | # Rope project settings
130 | .ropeproject
131 | 
132 | # mkdocs documentation
133 | /site
134 | 
135 | # mypy
136 | .mypy_cache/
137 | .dmypy.json
138 | dmypy.json
139 | 
140 | # Pyre type checker
141 | .pyre/
142 | 
143 | # pytype static type analyzer
144 | .pytype/
145 | 
146 | # Cython debug symbols
147 | cython_debug/
148 | *.hip
149 | *_hip.*
150 | *hip* 
151 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "apex/contrib/csrc/cudnn-frontend"]
2 | 	path = apex/contrib/csrc/cudnn-frontend
3 | 	url = https://github.com/NVIDIA/cudnn-frontend.git
4 | 


--------------------------------------------------------------------------------
/.jenkins/docker/build.sh:
--------------------------------------------------------------------------------
1 | sudo docker build . --rm -t apex
2 | 


--------------------------------------------------------------------------------
/.jenkins/docker/launch.sh:
--------------------------------------------------------------------------------
1 | sudo docker run -it -v $HOME:/data --rm --privileged --device=/dev/dri --device=/dev/kfd --network host --group-add video apex
2 | 


--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/.nojekyll


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG FROM_IMAGE=lcskrishna/rocm-pytorch:rocm3.3_ubuntu16.04_py3.6_pytorch_bfloat16_mgpu
2 | 
3 | FROM ${FROM_IMAGE}
4 | RUN \
5 |     git clone --recursive https://github.com/ROCmSoftwarePlatform/apex.git && \
6 |     cd apex && \
7 |     python3.6 setup.py install --cpp_ext --cuda_ext
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 4 | 
 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 6 | 
 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 8 | 
 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 | 
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/apex/RNN/README.md:
--------------------------------------------------------------------------------
1 | Under construction...
2 | 


--------------------------------------------------------------------------------
/apex/RNN/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM
2 | 
3 | __all__ = ['models']
4 | 


--------------------------------------------------------------------------------
/apex/RNN/cells.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | from .RNNBackend import RNNCell
 6 | 
 7 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
 8 | 
 9 | import math 
10 | 
11 | 
12 | class mLSTMRNNCell(RNNCell):
13 |     """
14 |     mLSTMRNNCell
15 |     """
16 | 
17 |     def __init__(self, input_size, hidden_size, bias = False, output_size = None):
18 |         gate_multiplier = 4
19 |         super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
20 | 
21 |         self.w_mih = nn.Parameter(torch.empty(self.output_size, self.input_size))
22 |         self.w_mhh = nn.Parameter(torch.empty(self.output_size, self.output_size))
23 | 
24 |         self.reset_parameters()
25 | 
26 |     def forward(self, input):
27 |         """
28 |         mLSTMRNNCell.forward()
29 |         """
30 |         #if not inited or bsz has changed this will create hidden states
31 |         self.init_hidden(input.size()[0])
32 | 
33 |         hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
34 | 
35 |         self.hidden = list(
36 |                            self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
37 |                            b_ih=self.b_ih, b_hh=self.b_hh)
38 |         )
39 |         
40 |         if self.output_size != self.hidden_size:
41 |             self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
42 |         return tuple(self.hidden)
43 | 
44 | 
45 |     def new_like(self, new_input_size=None):
46 |         if new_input_size is None:
47 |             new_input_size = self.input_size
48 |         
49 |         return type(self)(
50 |             new_input_size,
51 |             self.hidden_size,
52 |             self.bias,
53 |             self.output_size)
54 | 
55 | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
56 |     """
57 |     mLSTMCell
58 |     """
59 | 
60 |     if input.is_cuda:
61 |         igates = F.linear(input, w_ih)
62 |         m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
63 |         hgates = F.linear(m, w_hh)
64 | 
65 |         state = fusedBackend.LSTMFused.apply
66 |         return state(igates, hgates, hidden[1], b_ih, b_hh)
67 | 
68 |     hx, cx = hidden
69 |     
70 |     m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
71 |     gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
72 | 
73 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
74 | 
75 |     ingate = F.sigmoid(ingate)
76 |     forgetgate = F.sigmoid(forgetgate)
77 |     cellgate = F.tanh(cellgate)
78 |     outgate = F.sigmoid(outgate)
79 |     
80 |     cy = (forgetgate * cx) + (ingate * cellgate)
81 |     hy = outgate * F.tanh(cy)
82 |     
83 |     return hy, cy
84 |                                                                             
85 | 


--------------------------------------------------------------------------------
/apex/RNN/models.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
 4 | 
 5 | from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
 6 | from .cells import mLSTMRNNCell, mLSTMCell
 7 | 
 8 | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
 9 |     """
10 |     :class:`toRNNBackend`
11 |     """
12 | 
13 |     if bidirectional:
14 |         return bidirectionalRNN(inputRNN, num_layers, dropout = dropout)
15 |     else:
16 |         return stackedRNN(inputRNN, num_layers, dropout = dropout)
17 | 
18 | 
19 | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
20 |     """
21 |     :class:`LSTM`
22 |     """
23 |     inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
24 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
25 | 
26 | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
27 |     """
28 |     :class:`GRU`
29 |     """
30 |     inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
31 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
32 | 
33 | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
34 |     """
35 |     :class:`ReLU`
36 |     """
37 |     inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
38 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
39 | 
40 | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
41 |     """
42 |     :class:`Tanh`
43 |     """
44 |     inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
45 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
46 |         
47 | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
48 |     """
49 |     :class:`mLSTM`
50 |     """
51 |     inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
52 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/apex/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import warnings
 3 | 
 4 | # May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
 5 | import torch
 6 | 
 7 | 
 8 | if torch.distributed.is_available():
 9 |     from . import parallel
10 | 
11 | from . import amp
12 | from . import fp16_utils
13 | 
14 | # For optimizers and normalization there is no Python fallback.
15 | # Absence of cuda backend is a hard error.
16 | # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
17 | # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
18 | # so they expect those backends to be available, but for some reason they actually aren't
19 | # available (for example because they built improperly in a way that isn't revealed until
20 | # load time) the error message is timely and visible.
21 | from . import optimizers
22 | from . import normalization
23 | from . import transformer
24 | 
25 | 
26 | # Logging utilities for apex.transformer module
27 | class RankInfoFormatter(logging.Formatter):
28 | 
29 |     def format(self, record):
30 |         from apex.transformer.parallel_state import get_rank_info
31 |         record.rank_info = get_rank_info()
32 |         return super().format(record)
33 | 
34 | 
35 | _library_root_logger = logging.getLogger(__name__)
36 | handler = logging.StreamHandler()
37 | handler.setFormatter(RankInfoFormatter("%(asctime)s - PID:%(process)d - rank:%(rank_info)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s", "%y-%m-%d %H:%M:%S"))
38 | _library_root_logger.addHandler(handler)
39 | _library_root_logger.propagate = False
40 | 
41 | 
42 | def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool:
43 |     cudnn_available = torch.backends.cudnn.is_available()
44 |     cudnn_version = torch.backends.cudnn.version() if cudnn_available else None
45 |     if not (cudnn_available and (cudnn_version >= required_cudnn_version)):
46 |         warnings.warn(
47 |             f"`{global_option}` depends on cuDNN {required_cudnn_version} or later, "
48 |             f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}"
49 |         )
50 |         return False
51 |     return True
52 | 


--------------------------------------------------------------------------------
/apex/_autocast_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Sequence
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | __all__ = ["_cast_if_autocast_enabled"]
 7 | 
 8 | 
 9 | def _get_autocast_dtypes() -> Sequence[torch.dtype]:
10 |     if torch.cuda.is_bf16_supported():
11 |         return [torch.half, torch.bfloat16]
12 |     return [torch.half]
13 | 
14 | 
15 | def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype:
16 |     if not torch.is_autocast_enabled():
17 |         return torch.float or dtype
18 |     else:
19 |         return torch.get_autocast_gpu_dtype()
20 | 
21 | 
22 | def _cast_if_autocast_enabled(*args):
23 |     if not torch.is_autocast_enabled():
24 |         return args
25 |     else:
26 |         return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
27 | 


--------------------------------------------------------------------------------
/apex/amp/README.md:
--------------------------------------------------------------------------------
 1 | # amp: Automatic Mixed Precision
 2 | 
 3 | ## Annotating User Functions
 4 | 
 5 | Nearly all PyTorch user code needs nothing more than the two steps
 6 | above to use amp. After all, custom layers are built out of simpler
 7 | PyTorch components, and amp already can see those.
 8 | 
 9 | However, any custom C++ or CUDA code is outside of amp's (default)
10 | view of things. For example, suppose I implemented a new recurrent
11 | cell called a "forgetful recurrent unit" that calls directly into a
12 | CUDA backend:
13 | 
14 | ```python
15 | from backend import FRUBackend
16 | 
17 | def fru(input, hidden, weight, bias):
18 |     # call to CUDA code
19 |     FRUBackend(input, hidden, weight, bias)
20 | ```
21 | 
22 | In this case, it is possible to get a runtime type mismatch. For
23 | example, you might have `input` in fp16, and `weight` in fp32, and amp
24 | doesn't have the visibility to insert an appropriate cast.
25 | 
26 | amp exposes two ways to handle "invisible" backend code: function
27 | annotations and explicit registration.
28 | 
29 | #### Function annotation
30 | 
31 | The first way to handle backend code is a set of function annotations:
32 | 
33 | - `@amp.half_function`
34 | - `@amp.float_function`
35 | - `@amp.promote_function`
36 | 
37 | These correspond to:
38 | 
39 | - Cast all arguments to fp16
40 | - Cast all argumnets fo fp32
41 | - If there are any type mismatches, cast everything to the widest type
42 | 
43 | In our example, we believe that the FRU unit is fp16-safe and will get
44 | performance gains from casting its arguments to fp16, so we write:
45 | 
46 | ```python
47 | @amp.half_function
48 | def fru(input, hidden, weight, bias):
49 |     #...
50 | ```
51 | 
52 | #### Explicit registration
53 | 
54 | The other way to handle backend code is with explicit function
55 | registration:
56 | 
57 | - `amp.register_half_function(module, function_name)`
58 | - `amp.register_float_function(module, function_name)`
59 | - `amp.register_promote_function(module, function_name)`
60 | 
61 | When using this API, `module` is the containing class or module for
62 | the function, and `function_name` is the _string_ name of the
63 | function. Note that the function must be registered before the call to
64 | `amp.initalize()`.
65 | 
66 | For our FRU unit, we can register the backend function directly:
67 | 
68 | ```python
69 | import backend
70 | 
71 | amp.register_half_function(backend, 'FRUBackend')
72 | ```
73 | 


--------------------------------------------------------------------------------
/apex/amp/__init__.py:
--------------------------------------------------------------------------------
1 | from .amp import init, half_function, bfloat16_function, float_function, promote_function,\
2 |     register_half_function, register_bfloat16_function, register_float_function, register_promote_function
3 | from .handle import scale_loss, disable_casts
4 | from .frontend import initialize, state_dict, load_state_dict
5 | from ._amp_state import master_params, _amp_state
6 | 


--------------------------------------------------------------------------------
/apex/amp/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 1, 0)
2 | __version__ = '.'.join(map(str, VERSION))
3 | 


--------------------------------------------------------------------------------
/apex/amp/_amp_state.py:
--------------------------------------------------------------------------------
 1 | # This is a "header object" that allows different amp modules to communicate.
 2 | # I'm a C++ guy, not a python guy.  I decided this approach because it seemed most C++-like.
 3 | # But apparently it's ok:
 4 | # http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
 5 | import torch
 6 | 
 7 | 
 8 | class AmpState(object):
 9 |     def __init__(self):
10 |         self.hard_override=False
11 |         self.allow_incoming_model_not_fp32 = False
12 |         self.verbosity=1
13 | 
14 | 
15 | # Attribute stash.  Could also just stash things as global module attributes.
16 | _amp_state = AmpState()
17 | 
18 | 
19 | def warn_or_err(msg):
20 |     if _amp_state.hard_override:
21 |         print("Warning:  " + msg)
22 |     else:
23 |         raise RuntimeError(msg)
24 |         # I'm not sure if allowing hard_override is a good idea.
25 |         # + "  If you're sure you know what you're doing, supply " +
26 |         #                    "hard_override=True to amp.initialize.")
27 | 
28 | 
29 | def maybe_print(msg, rank0=False):
30 |     distributed = torch.distributed.is_available() and \
31 |         torch.distributed.is_initialized() and \
32 |         torch.distributed.get_world_size() > 1
33 |     if _amp_state.verbosity > 0:
34 |         if rank0:
35 |             if distributed:
36 |                 if torch.distributed.get_rank() == 0:
37 |                     print(msg)
38 |             else:
39 |                 print(msg)
40 |         else:
41 |             print(msg)
42 | 
43 | 
44 | # def iter_params(param_groups):
45 | #     for group in param_groups:
46 | #         for p in group['params']:
47 | #             yield p
48 | 
49 | 
50 | def master_params(optimizer):
51 |     """
52 |     Generator expression that iterates over the params owned by ``optimizer``.
53 | 
54 |     Args:
55 |         optimizer: An optimizer previously returned from ``amp.initialize``.
56 |     """
57 |     for group in optimizer.param_groups:
58 |         for p in group['params']:
59 |             yield p
60 | 


--------------------------------------------------------------------------------
/apex/amp/compat.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # True for post-0.4, when Variables/Tensors merged.
 4 | def variable_is_tensor():
 5 |     v = torch.autograd.Variable()
 6 |     return isinstance(v, torch.Tensor)
 7 | 
 8 | def tensor_is_variable():
 9 |     x = torch.Tensor()
10 |     return type(x) == torch.autograd.Variable
11 | 
12 | # False for post-0.4
13 | def tensor_is_float_tensor():
14 |     x = torch.Tensor()
15 |     return type(x) == torch.FloatTensor
16 | 
17 | # Akin to `torch.is_tensor`, but returns True for Variable
18 | # objects in pre-0.4.
19 | def is_tensor_like(x):
20 |     return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
21 | 
22 | # Wraps `torch.is_floating_point` if present, otherwise checks
23 | # the suffix of `x.type()`.
24 | def is_floating_point(x):
25 |     if hasattr(torch, 'is_floating_point'):
26 |         return torch.is_floating_point(x)
27 |     try:
28 |         torch_type = x.type()
29 |         return torch_type.endswith('FloatTensor') or \
30 |             torch_type.endswith('HalfTensor') or \
31 |             torch_type.endswith('DoubleTensor') or \
32 |             torch_type.endswith('BFloat16Tensor')
33 |     except AttributeError:
34 |         return False
35 | 
36 | def scalar_python_val(x):
37 |     if hasattr(x, 'item'):
38 |         return x.item()
39 |     else:
40 |         if isinstance(x, torch.autograd.Variable):
41 |             return x.data[0]
42 |         else:
43 |             return x[0]
44 | 
45 | # Accounts for the possibility that some ops may be removed from a namespace.
46 | def filter_attrs(module, attrs):
47 |     return list(attrname for attrname in attrs if hasattr(module, attrname))
48 | 


--------------------------------------------------------------------------------
/apex/amp/lists/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/apex/amp/lists/__init__.py


--------------------------------------------------------------------------------
/apex/amp/lists/functional_overrides.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # TODO: think about the following two. They do weird things.
 3 | # - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
 4 | # - torch.nn.utils.weight_norm
 5 | 
 6 | # Notes:
 7 | # F.instance_norm uses batch_norm internally. Which correctly handles
 8 | #   fp16 in/out with fp32 weights. So we shouldn't do anything for
 9 | #   either of these.
10 | # F.normalize calls `input.norm()` internally, so it's redundant, but
11 | #   kept here in case impl. changes.
12 | # F.cosine_similarity is same: calls `x.norm()` internally.
13 | 
14 | import torch.nn.functional
15 | 
16 | MODULE = torch.nn.functional
17 | 
18 | FP16_FUNCS = [
19 |     'conv1d',
20 |     'conv2d',
21 |     'conv3d',
22 |     'conv_transpose1d',
23 |     'conv_transpose2d',
24 |     'conv_transpose3d',
25 |     'conv_tbc', # Undocumented / maybe new?
26 |     'linear',
27 | ]
28 | 
29 | BFLOAT16_FUNCS = [
30 |     'conv1d',
31 |     'conv2d',
32 |     'conv3d',
33 |     'conv_transpose1d',
34 |     'conv_transpose2d',
35 |     'conv_transpose3d',
36 |     'conv_tbc', # Undocumented / maybe new?
37 |     'linear',
38 | ]
39 | 
40 | FP32_FUNCS = [
41 | 
42 |     # Interpolation/Upsampling TODO:  Remove for 1.2
43 |     'interpolate',
44 |     'grid_sample',
45 | 
46 |     # Pointwise
47 |     'softplus',
48 |     'softmin',
49 |     'log_softmax',
50 |     'softmax',
51 |     'gelu',
52 |     
53 |     # Normalization
54 |     'layer_norm',
55 |     'group_norm',
56 |     'local_response_norm',
57 |     'normalize',
58 |     'cosine_similarity',
59 | 
60 |     # Loss functions
61 |     # TODO: which of these can be fp16?
62 |     'poisson_nll_loss',
63 |     'cosine_embedding_loss',
64 |     'cross_entropy',
65 |     'hinge_embedding_loss',
66 |     'kl_div',
67 |     'l1_loss',
68 |     'mse_loss',
69 |     'margin_ranking_loss',
70 |     'multilabel_margin_loss',
71 |     'multilabel_soft_margin_loss',
72 |     'multi_margin_loss',
73 |     'nll_loss',
74 |     'binary_cross_entropy_with_logits',
75 |     'smooth_l1_loss',
76 |     'soft_margin_loss',
77 |     'triplet_margin_loss',
78 |     'ctc_loss'
79 | ]
80 | 
81 | BANNED_FUNCS = [
82 |     ('binary_cross_entropy',
83 |      ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
84 |       "It requires that the output of the previous function be already a FloatTensor. \n\n"
85 |       "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
86 |       "    torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
87 |       "that is compatible with amp.\nAnother option is to add\n"
88 |       "    amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
89 |       "If you _really_ know what you are doing, you can disable this warning by passing "
90 |       "allow_banned=True to `amp.init()`."))
91 | ]
92 | 


--------------------------------------------------------------------------------
/apex/amp/lists/tensor_overrides.py:
--------------------------------------------------------------------------------
 1 | from .. import compat
 2 | from . import torch_overrides
 3 | 
 4 | import importlib
 5 | 
 6 | import torch
 7 | 
 8 | # if compat.variable_is_tensor() and not compat.tensor_is_variable():
 9 | MODULE = torch.Tensor
10 | # else:
11 | #     MODULE = torch.autograd.Variable
12 | 
13 | 
14 | FP16_FUNCS = compat.filter_attrs(MODULE, [
15 |     '__matmul__',
16 | ])
17 | 
18 | BFLOAT16_FUNCS = [
19 |     '__matmul__',
20 | ]
21 | 
22 | FP32_FUNCS = compat.filter_attrs(MODULE, [
23 |     '__ipow__',
24 |     '__pow__',
25 |     '__rpow__',
26 | 
27 |     # Cast to fp32 before transfer to CPU
28 |     'cpu',
29 | ])
30 | 
31 | CASTS = compat.filter_attrs(MODULE, [
32 |     '__add__',
33 |     '__div__',
34 |     '__eq__',
35 |     '__ge__',
36 |     '__gt__',
37 |     '__iadd__',
38 |     '__idiv__',
39 |     '__imul__',
40 |     '__isub__',
41 |     '__itruediv__',
42 |     '__le__',
43 |     '__lt__',
44 |     '__mul__',
45 |     '__ne__',
46 |     '__radd__',
47 |     '__rdiv__',
48 |     '__rmul__',
49 |     '__rsub__',
50 |     '__rtruediv__',
51 |     '__sub__',
52 |     '__truediv__',
53 | ])
54 | 
55 | # None of these, but here to make code cleaner.
56 | SEQUENCE_CASTS = []
57 | 
58 | # We need to grab all the methods from torch_overrides and add them to
59 | # the Tensor lists as well, as almost all methods are duplicated
60 | # between `torch` and `torch.Tensor` (and check with `hasattr`,
61 | # because a few random ones aren't defined on Tensor)
62 | _self_mod = importlib.import_module(__name__)
63 | for attrname in ['FP16_FUNCS', 'BFLOAT16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
64 |     lst = getattr(_self_mod, attrname)
65 |     for fn in getattr(torch_overrides, attrname):
66 |         if hasattr(MODULE, fn):
67 |             lst.append(fn)
68 | 


--------------------------------------------------------------------------------
/apex/amp/lists/torch_overrides.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | 
  3 | from .. import utils
  4 | 
  5 | MODULE = torch
  6 | 
  7 | FP16_FUNCS = [
  8 |     # Low level functions wrapped by torch.nn layers.
  9 |     # The wrapper layers contain the weights which are then passed in as a parameter
 10 |     # to these functions.
 11 |     'conv1d',
 12 |     'conv2d',
 13 |     'conv3d',
 14 |     'conv_transpose1d',
 15 |     'conv_transpose2d',
 16 |     'conv_transpose3d',
 17 |     'conv_tbc',
 18 |     'prelu',
 19 | 
 20 |     # BLAS
 21 |     'addmm',
 22 |     'addmv',
 23 |     'addr',
 24 |     'matmul',
 25 |     'mm',
 26 |     'mv',
 27 | ]
 28 | 
 29 | BFLOAT16_FUNCS = [
 30 |     # Low level functions wrapped by torch.nn layers.
 31 |     # The wrapper layers contain the weights which are then passed in as a parameter
 32 |     # to these functions.
 33 |     'conv1d',
 34 |     'conv2d',
 35 |     'conv3d',
 36 |     'conv_transpose1d',
 37 |     'conv_transpose2d',
 38 |     'conv_transpose3d',
 39 |     'conv_tbc',
 40 | 
 41 |     # BLAS
 42 |     'addmm',
 43 |     'addmv',
 44 |     'addr',
 45 |     'matmul',
 46 |     'mm',
 47 |     'mv',
 48 | ]
 49 | 
 50 | FP32_FUNCS = [
 51 |     # Pointwise
 52 |     'acos',
 53 |     'asin',
 54 |     'cosh',
 55 |     'erfinv',
 56 |     'exp',
 57 |     'expm1',
 58 |     'log',
 59 |     'log10',
 60 |     'log2',
 61 |     'reciprocal',
 62 |     'rsqrt',
 63 |     'sinh',
 64 |     'tan',
 65 | 
 66 |     # Other math
 67 |     'pow',
 68 | 
 69 |     # Reduction
 70 |     'cumprod',
 71 |     'cumsum',
 72 |     'dist',
 73 |     # 'mean',
 74 |     'norm',
 75 |     'prod',
 76 |     'std',
 77 |     'sum',
 78 |     'var',
 79 | 
 80 |     # Misc
 81 |     'renorm'
 82 | ]
 83 | 
 84 | version_strings = torch.__version__.split('.')
 85 | version_major = version_strings[0]
 86 | version_minor = version_strings[1]
 87 | version_num = float(version_major + "." + version_minor)
 88 | # Before torch 1.1, mean must be blacklisted.
 89 | if version_num < 1.1:
 90 |     FP32_FUNCS.append('mean')
 91 | 
 92 | # Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
 93 | # check the CUDA version -- if at least 9.1, then put the bmm
 94 | # functions on the fp16 list. Otherwise, put them on the fp32 list.
 95 | _bmms = ['addbmm',
 96 |          'baddbmm',
 97 |          'bmm']
 98 | 
 99 | if utils.is_cuda_enabled():
100 |   # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802
101 |   if utils.get_cuda_version() >= (9, 1, 0):
102 |       FP16_FUNCS.extend(_bmms)
103 |   else:
104 |       FP32_FUNCS.extend(_bmms)
105 | 
106 | # Multi-tensor fns that may need type promotion
107 | CASTS = [
108 |     # Multi-tensor math
109 |     'addcdiv',
110 |     'addcmul',
111 |     'atan2',
112 |     'cross',
113 |     'bilinear',
114 |     'dot',
115 | 
116 |     # Element-wise _or_ tensor-wise math
117 |     'add',
118 |     'div',
119 |     'mul',
120 | 
121 |     # Comparison
122 |     'eq',
123 |     'equal',
124 |     'ge',
125 |     'gt',
126 |     'le',
127 |     'lt',
128 |     'ne'
129 | ]
130 | 
131 | # Functions that take sequence arguments. We need to inspect the whole
132 | # sequence and cast to the widest type.
133 | SEQUENCE_CASTS = [
134 |     'cat',
135 |     'stack'
136 | ]
137 | 


--------------------------------------------------------------------------------
/apex/amp/rnn_compat.py:
--------------------------------------------------------------------------------
 1 | from . import utils, wrap
 2 | 
 3 | import torch
 4 | _VF = torch._C._VariableFunctions
 5 | RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
 6 | 
 7 | def _gen_VF_wrapper(name):
 8 |     def wrapper(*args, **kwargs):
 9 |         return getattr(_VF, name)(*args, **kwargs)
10 |     return wrapper
11 | 
12 | # Some python magic to generate an object that has the rnn cell functions
13 | # defined on it, all of which call into corresponding _VF version.
14 | # Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
15 | # imported at module scope within torch.nn.modules.rnn).  This should
16 | # not affect third-party importers of _VF.py.
17 | class VariableFunctionsShim(object):
18 |     def __init__(self):
19 |         for name in RNN_NAMES:
20 |             for suffix in ['', '_cell']:
21 |                fn_name = name + suffix
22 |                setattr(self, fn_name, _gen_VF_wrapper(fn_name))
23 | 
24 | def has_old_rnns():
25 |     try:
26 |         torch.nn.backends.thnn.backend.LSTMCell
27 |         return True
28 |     except:
29 |         return False
30 | 
31 | def whitelist_rnn_cells(cast_fn, handle, verbose):
32 |     # Different module + function names in old/new RNN cases
33 |     if has_old_rnns():
34 |         fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
35 |         mod = torch.nn.backends.thnn.backend
36 |     else:
37 |         fn_names = [x + '_cell' for x in RNN_NAMES]
38 |         mod = torch.nn.modules.rnn._VF
39 |         assert isinstance(mod, VariableFunctionsShim)
40 | 
41 |     # Insert casts on cell functions
42 |     for fn in fn_names:
43 |         wrap.cached_cast(mod, fn, cast_fn, handle,
44 |                          try_caching=True, verbose=verbose)
45 | 
46 |     if has_old_rnns():
47 |         # Special handling of `backward` for fused gru / lstm:
48 |         # The `backward` method calls Tensor.sum() (blacklist) internally,
49 |         # and then the resulting grad_input has the wrong type.
50 |         # TODO: where else is this a problem?
51 |         for rnn_type in ['GRUFused', 'LSTMFused']:
52 |             mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
53 |             wrap.disable_casts(mod, 'backward', handle)
54 | 


--------------------------------------------------------------------------------
/apex/contrib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/apex/contrib/__init__.py


--------------------------------------------------------------------------------
/apex/contrib/bottleneck/__init__.py:
--------------------------------------------------------------------------------
1 | from .bottleneck import Bottleneck, SpatialBottleneck
2 | from .halo_exchangers import HaloExchangerNoComm, HaloExchangerAllGather, HaloExchangerSendRecv, HaloExchangerPeer
3 | 


--------------------------------------------------------------------------------
/apex/contrib/bottleneck/test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from bottleneck import Bottleneck
 3 | torch.manual_seed(23337)
 4 | 
 5 | # use True to print layerwise sum for all outputs in reference code path
 6 | DEBUG = False#True
 7 | 
 8 | for stride, o_channel in [(1,32), (1,128), (2,32)]:
 9 |     print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
10 |     a_ = torch.randn(17,32,28,28)
11 | 
12 |     a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
13 |     model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last)
14 | 
15 |     # test model
16 |     b = model(a)
17 |     b.mean().backward()
18 |     d_grad = a.grad.float()
19 |     a.grad = None
20 |     torch.cuda.synchronize()
21 | 
22 |     if DEBUG:
23 |         print("[DEBUG] ref dx :", d_grad.sum().item())
24 |         # print wgrad. we don't need to reset since later cpp print before accumulation
25 |         for i, w in enumerate(model.w_conv):
26 |             print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item())
27 | 
28 |     wgrads = []
29 |     for w in model.w_conv:
30 |         wgrads.append(w.grad.float())
31 | 
32 |     model.use_cudnn = True
33 |     model.zero_grad()
34 |     c = model(a)
35 |     c.mean().backward()
36 | 
37 |     torch.cuda.synchronize()
38 |     print("comparing native and channels_last:")
39 |     print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item())
40 |     print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
41 |     for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
42 |         print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
43 | 
44 |     nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_()
45 |     nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half()
46 |     for p,q in zip(model.parameters(), nhwc_model.parameters()):
47 |         # model's storage is already in nhwc, we clone and assign to explicit nhwc model
48 |         q.data.copy_(p.data.permute(0,2,3,1).contiguous())
49 |     for p,q in zip(model.buffers(), nhwc_model.buffers()):
50 |         q.data.copy_(p.data)
51 | 
52 |     d = nhwc_model(nhwc_a)
53 |     d.mean().backward()
54 |     torch.cuda.synchronize()
55 | 
56 |     # reset reference to cudnn channels_last permute
57 |     #c_s = c.storage().tolist()
58 |     #d_s = d.storage().tolist()
59 |     #print(max([x-y for x,y in zip(c_s,d_s)]))
60 |     c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous()
61 |     d_grad = a.grad.float().permute(0,2,3,1).contiguous()
62 |     wgrads = []
63 |     for w in model.w_conv:
64 |         wgrads.append(w.grad.float().permute(0,2,3,1).contiguous())
65 | 
66 |     torch.cuda.synchronize()
67 |     print("comparing nhwc and channels_last:")
68 |     print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item())
69 |     print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
70 |     for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
71 |         print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
72 | 


--------------------------------------------------------------------------------
/apex/contrib/clip_grad/__init__.py:
--------------------------------------------------------------------------------
1 | from .clip_grad import clip_grad_norm_
2 | 


--------------------------------------------------------------------------------
/apex/contrib/conv_bias_relu/__init__.py:
--------------------------------------------------------------------------------
1 | from .conv_bias_relu import ConvBiasReLU, ConvBias, ConvBiasMaskReLU 
2 | 
3 | 


--------------------------------------------------------------------------------
/apex/contrib/conv_bias_relu/conv_bias_relu.py:
--------------------------------------------------------------------------------
 1 | import pdb
 2 | 
 3 | import torch
 4 | from torch.autograd import gradcheck
 5 | 
 6 | from apex import check_cudnn_version_and_warn
 7 | import fused_conv_bias_relu
 8 | 
 9 | check_cudnn_version_and_warn(__name__, 8400)
10 | 
11 | 
12 | class ConvBiasReLU_(torch.autograd.Function):
13 |     @staticmethod
14 |     @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
15 |     def forward(ctx, x, weight, bias, padding, stride):
16 |         outputs = fused_conv_bias_relu.forward([x, weight, bias], padding, stride)
17 |         ctx.save_for_backward(x, weight, outputs[0])
18 |         ctx.padding = padding
19 |         ctx.stride = stride
20 | 
21 |         return outputs[0]
22 | 
23 |     @staticmethod
24 |     @torch.cuda.amp.custom_bwd
25 |     def backward(ctx, grad_output):
26 |         bwd_args = [*ctx.saved_tensors, grad_output]
27 |         padding = ctx.padding
28 |         stride = ctx.stride
29 |         grads = fused_conv_bias_relu.backward(bwd_args, padding, stride)
30 | 
31 |         return grads[0], grads[1], grads[2], None, None
32 | 
33 | 
34 | class ConvBiasMaskReLU_(torch.autograd.Function):
35 |     @staticmethod
36 |     @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
37 |     def forward(ctx, x, weight, bias, mask, padding, stride):
38 |         outputs = fused_conv_bias_relu.forward_mask([x, weight, bias, mask], padding, stride)
39 |         ctx.save_for_backward(x, weight, outputs[0])
40 |         ctx.padding = padding
41 |         ctx.stride = stride
42 | 
43 |         return outputs[0]
44 | 
45 |     @staticmethod
46 |     @torch.cuda.amp.custom_bwd
47 |     def backward(ctx, grad_output):
48 |         bwd_args = [*ctx.saved_tensors, grad_output]
49 |         padding = ctx.padding
50 |         stride = ctx.stride
51 |         grads = fused_conv_bias_relu.backward(bwd_args, padding, stride)
52 | 
53 |         return grads[0], grads[1], grads[2], None, None, None
54 | 
55 | 
56 | class ConvBias_(torch.autograd.Function):
57 |     @staticmethod
58 |     @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
59 |     def forward(ctx, x, weight, bias, padding, stride):
60 |         outputs = fused_conv_bias_relu.forward_no_relu([x, weight, bias], padding, stride)
61 |         ctx.save_for_backward(x, weight)
62 |         ctx.padding = padding
63 |         ctx.stride = stride
64 | 
65 |         return outputs[0]
66 | 
67 |     @staticmethod
68 |     @torch.cuda.amp.custom_bwd
69 |     def backward(ctx, grad_output):
70 |         bwd_args = [*ctx.saved_tensors, grad_output]
71 |         padding = ctx.padding
72 |         stride = ctx.stride
73 |         grads = fused_conv_bias_relu.backward_no_relu(bwd_args, padding, stride)
74 | 
75 |         return grads[0], grads[1], grads[2], None, None
76 | 
77 | 
78 | ConvBiasReLU = ConvBiasReLU_.apply
79 | ConvBiasMaskReLU = ConvBiasMaskReLU_.apply
80 | ConvBias = ConvBias_.apply
81 | 
82 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/focal_loss/focal_loss_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/torch.h>
 2 | 
 3 | #include <vector>
 4 | #include <cstdint>
 5 | 
 6 | // CUDA forward declarations
 7 | 
 8 | std::vector<at::Tensor> focal_loss_forward_cuda(
 9 |   const at::Tensor &cls_output,
10 |   const at::Tensor &cls_targets_at_level,
11 |   const at::Tensor &num_positives_sum,
12 |   const int64_t num_real_classes,
13 |   const float alpha,
14 |   const float gamma,
15 |   const float smoothing_factor);
16 | 
17 | at::Tensor focal_loss_backward_cuda(
18 |   const at::Tensor &grad_output,
19 |   const at::Tensor &partial_grad,
20 |   const at::Tensor &num_positives_sum);
21 | 
22 | // C++ interface
23 | 
24 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
25 | #define CHECK_CONTIGUOUS(x)                                                    \
26 |   TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
27 | #define CHECK_INPUT(x)                                                         \
28 |   CHECK_CUDA(x);                                                               \
29 |   CHECK_CONTIGUOUS(x)
30 | 
31 | std::vector<at::Tensor> focal_loss_forward(
32 |   const at::Tensor &cls_output,
33 |   const at::Tensor &cls_targets_at_level,
34 |   const at::Tensor &num_positives_sum,
35 |   const int64_t num_real_classes,
36 |   const float alpha,
37 |   const float gamma,
38 |   const float smoothing_factor
39 | ) {
40 |   CHECK_INPUT(cls_output);
41 |   CHECK_INPUT(cls_targets_at_level);
42 |   CHECK_INPUT(num_positives_sum);
43 | 
44 |   return focal_loss_forward_cuda(
45 |     cls_output,
46 |     cls_targets_at_level,
47 |     num_positives_sum,
48 |     num_real_classes,
49 |     alpha,
50 |     gamma,
51 |     smoothing_factor);
52 | }
53 | 
54 | at::Tensor focal_loss_backward(
55 |   const at::Tensor &grad_output,
56 |   const at::Tensor &partial_grad,
57 |   const at::Tensor &num_positives_sum
58 | ) {
59 |   CHECK_INPUT(grad_output);
60 |   CHECK_INPUT(partial_grad);
61 | 
62 |   return focal_loss_backward_cuda(grad_output, partial_grad, num_positives_sum);
63 | }
64 | 
65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
66 |   m.def("forward", &focal_loss_forward,
67 |         "Focal loss calculation forward (CUDA)",
68 |         py::call_guard<py::gil_scoped_release>());
69 |   m.def("backward", &focal_loss_backward,
70 |         "Focal loss calculation backward (CUDA)",
71 |         py::call_guard<py::gil_scoped_release>());
72 | }
73 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/groupbn/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifdef USE_ROCM
 2 | #include <ATen/hip/HIPContext.h>
 3 | #else
 4 | #include <ATen/cuda/CUDAContext.h>
 5 | #endif
 6 | #ifndef CUDA_UTILS_H
 7 | #define CUDA_UTILS_H
 8 | 
 9 | namespace at {
10 | namespace cuda {
11 | 
12 | namespace utils {
13 | 
14 | static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
15 | #ifdef USE_ROCM
16 |     return getDeviceProperties(device_id)->maxSharedMemoryPerMultiProcessor;
17 | #else
18 |     return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
19 | #endif
20 | }
21 | 
22 | 
23 | }
24 | }
25 | }
26 | 
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/groupbn/dnn.h:
--------------------------------------------------------------------------------
 1 | #ifndef DNN_H
 2 | #define DNN_H
 3 | 
 4 | #ifdef USE_ROCM
 5 | #include <miopen/miopen.h>
 6 | #define DNN_STATUS_SUCCESS miopenStatusSuccess
 7 | #define DNN_DATA_HALF miopenHalf
 8 | #define DNN_TENSOR_FORMAT 0
 9 | 
10 | using dnnTensorFormat_t = int;
11 | using dnnDataType_t = miopenDataType_t;
12 | using dnnStatus_t = miopenStatus_t;
13 | using dnnTensorDescriptor_t = miopenTensorDescriptor_t;
14 | #else
15 | #include <cudnn.h>
16 | #define DNN_STATUS_SUCCESS CUDNN_STATUS_SUCCESS
17 | #define DNN_DATA_HALF CUDNN_DATA_HALF
18 | #define DNN_TENSOR_FORMAT CUDNN_TENSOR_NHWC
19 | 
20 | using dnnTensorFormat_t = cudnnTensorFormat_t;
21 | using dnnDataType_t = cudnnDataType_t;
22 | using dnnStatus_t = cudnnStatus_t;
23 | using dnnTensorDescriptor_t = cudnnTensorDescriptor_t;
24 | #endif
25 | 
26 | #endif // DNN_H
27 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/multihead_attn/philox.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | // Philox CUDA.
 3 | 
 4 | namespace {
 5 | 
 6 | class Philox {
 7 | public:
 8 |   __device__ inline Philox(unsigned long long seed,
 9 |                            unsigned long long subsequence,
10 |                            unsigned long long offset) {
11 |     key.x = (unsigned int)seed;
12 |     key.y = (unsigned int)(seed >> 32);
13 |     counter = make_uint4(0, 0, 0, 0);
14 |     counter.z = (unsigned int)(subsequence);
15 |     counter.w = (unsigned int)(subsequence >> 32);
16 |     STATE = 0;
17 |     incr_n(offset / 4);
18 |   }
19 |   __device__ inline uint4 operator()() {
20 |     if (STATE == 0) {
21 |       uint4 counter_ = counter;
22 |       uint2 key_ = key;
23 |       // 7-round philox
24 |       for (int i = 0; i < 6; i++) {
25 |         counter_ = single_round(counter_, key_);
26 |         key_.x += (kPhilox10A);
27 |         key_.y += (kPhilox10B);
28 |       }
29 |       output = single_round(counter_, key_);
30 |       incr();
31 |     }
32 |     // return a float4 directly
33 |     // unsigned long ret;
34 |     // switch(STATE) {
35 |     //  case 0: ret = output.x; break;
36 |     //  case 1: ret = output.y; break;
37 |     //  case 2: ret = output.z; break;
38 |     //  case 3: ret = output.w; break;
39 |     //}
40 |     // STATE = (STATE + 1) % 4;
41 |     return output;
42 |   }
43 | 
44 | private:
45 |   uint4 counter;
46 |   uint4 output;
47 |   uint2 key;
48 |   unsigned int STATE;
49 |   __device__ inline void incr_n(unsigned long long n) {
50 |     unsigned int nlo = (unsigned int)(n);
51 |     unsigned int nhi = (unsigned int)(n >> 32);
52 |     counter.x += nlo;
53 |     if (counter.x < nlo)
54 |       nhi++;
55 |     counter.y += nhi;
56 |     if (nhi <= counter.y)
57 |       return;
58 |     if (++counter.z)
59 |       return;
60 |     ++counter.w;
61 |   }
62 |   __device__ inline void incr() {
63 |     if (++counter.x)
64 |       return;
65 |     if (++counter.y)
66 |       return;
67 |     if (++counter.z)
68 |       return;
69 |     ++counter.w;
70 |   }
71 |   __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
72 |                                     unsigned int *result_high) {
73 |     *result_high = __umulhi(a, b);
74 |     return a * b;
75 |   }
76 |   __device__ inline uint4 single_round(uint4 ctr, uint2 key) {
77 |     unsigned int hi0;
78 |     unsigned int hi1;
79 |     unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
80 |     unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
81 |     uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
82 |     return ret;
83 |   }
84 |   static const unsigned long kPhilox10A = 0x9E3779B9;
85 |   static const unsigned long kPhilox10B = 0xBB67AE85;
86 |   static const unsigned long kPhiloxSA = 0xD2511F53;
87 |   static const unsigned long kPhiloxSB = 0xCD9E8D57;
88 | };
89 | // Inverse of 2^32.
90 | constexpr float M_RAN_INVM32 = 2.3283064e-10f;
91 | __device__ __inline__ float4 uniform4(uint4 x) {
92 |   return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,
93 |                      x.w * M_RAN_INVM32);
94 | }
95 | 
96 | } // namespace
97 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/nccl_allocator/NCCLAllocator.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <c10/cuda/CUDACachingAllocator.h>
 3 | #include <c10/util/Exception.h>
 4 | #include <torch/csrc/cuda/CUDAPluggableAllocator.h>
 5 | #include <torch/extension.h>
 6 | 
 7 | #include <nccl.h>
 8 | 
 9 | #define NCCL_CHECK(cmd)                                                        \
10 |   do {                                                                         \
11 |     ncclResult_t result = cmd;                                                 \
12 |     if (result != ncclSuccess) {                                               \
13 |       std::string err = "NCCL error in: " + std::string(__FILE__) + ":" +      \
14 |                         std::to_string(__LINE__) + ", " +                      \
15 |                         std::string(ncclGetErrorString(result));               \
16 |       TORCH_CHECK(false, err);                                                 \
17 |     }                                                                          \
18 |   } while (0)
19 | 
20 | void *nccl_alloc_plug(size_t size, int device, void *stream) {
21 |   void *ptr;
22 |   NCCL_CHECK(ncclMemAlloc(&ptr, size));
23 |   return ptr;
24 | }
25 | 
26 | void nccl_free_plug(void *ptr, std::size_t size, int device, void *stream) {
27 |   NCCL_CHECK(ncclMemFree(ptr));
28 | }
29 | 
30 | std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator> nccl_allocator;
31 | 
32 | void maybe_init() {
33 |   if (!nccl_allocator) {
34 |     nccl_allocator = std::make_shared<
35 |         torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator>(
36 |         nccl_alloc_plug, nccl_free_plug);
37 |   }
38 | }
39 | 
40 | std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
41 | get_nccl_allocator() {
42 |   maybe_init();
43 |   return nccl_allocator;
44 | }
45 | 
46 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
47 |   m.def("get_nccl_allocator", []() { return get_nccl_allocator(); });
48 | };


--------------------------------------------------------------------------------
/apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "nccl_p2p_cuda.cuh"
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |   m.def("get_unique_nccl_id", &apex::contrib::nccl_p2p::get_unique_nccl_id, "get_unique_nccl_id");
21 |   m.def("init_nccl_comm", &apex::contrib::nccl_p2p::init_nccl_comm, "init_nccl_comm");
22 |   m.def("left_right_halo_exchange_inplace", &apex::contrib::nccl_p2p::left_right_halo_exchange_inplace, "left_right_halo_exchange_inplace");
23 |   m.def("left_right_halo_exchange", &apex::contrib::nccl_p2p::left_right_halo_exchange, "left_right_halo_exchange");
24 |   m.def("add_delay", &apex::contrib::nccl_p2p::add_delay, "add_delay");
25 | }
26 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cuh:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | #include <torch/extension.h>
19 | #ifndef _nccl_p2p_h_
20 | #define _nccl_p2p_h_
21 | 
22 | namespace apex { namespace contrib { namespace nccl_p2p {
23 | at::Tensor get_unique_nccl_id(int n);
24 | int init_nccl_comm(
25 |         at::Tensor unique_nccl_id, 
26 |         int my_rank, 
27 |         int num_ranks
28 |         );
29 | void left_right_halo_exchange_inplace(
30 |         int handle,
31 |         int left_rank,
32 |         int right_rank,
33 | 	at::Tensor left_output_halo,
34 | 	at::Tensor right_output_halo,
35 | 	at::Tensor left_input_halo,
36 | 	at::Tensor right_input_halo);
37 | std::vector<at::Tensor> left_right_halo_exchange(
38 |         int handle,
39 |         int left_rank,
40 |         int right_rank,
41 |         at::Tensor left_output_halo, 
42 |         at::Tensor right_output_halo);
43 | void add_delay(int delay);
44 | }}}
45 | #endif
46 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/nccl_p2p/nccl_version.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | // This file is used to check the version of NCCL detected.
 3 | #include <tuple>
 4 | 
 5 | #include <torch/extension.h>
 6 | 
 7 | std::tuple<int, int> get_nccl_version();
 8 | 
 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
10 |   m.def("get_nccl_version", &get_nccl_version);
11 | }


--------------------------------------------------------------------------------
/apex/contrib/csrc/nccl_p2p/nccl_version_check.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | // This file is used to check the version of NCCL detected.
 4 | #include <tuple>
 5 | #include <nccl.h>
 6 | 
 7 | 
 8 | std::tuple<int, int> get_nccl_version() {
 9 |   return { int(NCCL_MAJOR), int(NCCL_MINOR) };
10 | }


--------------------------------------------------------------------------------
/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lamb_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   const float lr,
 8 |   const float beta1,
 9 |   const float beta2,
10 |   const float epsilon,
11 |   const int step,
12 |   const int bias_correction,
13 |   const float weight_decay,
14 |   const int grad_averaging,
15 |   const int mode,
16 |   const float global_grad_norm,
17 |   const float max_grad_norm);
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |         m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer");
21 | }
22 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_fused_adam_cuda(
 4 |     int chunk_size, at::Tensor noop_flag,
 5 |     std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor grad_scale,
 6 |     float lr, float beta1, float beta2, float eps, int step, int mode,
 7 |     int bias_correction, float weight_decay);
 8 | 
 9 | void multi_tensor_fused_adam_capturable_cuda(
10 |     int chunk_size, at::Tensor noop_flag,
11 |     std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor grad_scale,
12 |     at::Tensor lr, float beta1, float beta2, float eps, at::Tensor step,
13 |     int mode, int bias_correction, float weight_decay);
14 | 
15 | void multi_tensor_fused_adam_with_param_remainders_cuda(
16 |     int chunk_size, at::Tensor noop_flag,
17 |     std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor grad_scale,
18 |     float lr, float beta1, float beta2, float eps, int step, int mode,
19 |     int bias_correction, float weight_decay);
20 | 
21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
22 |   m.def("multi_tensor_fused_adam", &multi_tensor_fused_adam_cuda,
23 |         "CUDA kernels for multi-tensor Adam, "
24 |         "with param copy",
25 |         py::call_guard<py::gil_scoped_release>());
26 |   m.def("multi_tensor_fused_adam_capturable",
27 |         &multi_tensor_fused_adam_capturable_cuda,
28 |         "CUDA kernels for multi-tensor Adam, "
29 |         "with param copy, capturable for CUDA graph",
30 |         py::call_guard<py::gil_scoped_release>());
31 |   m.def("multi_tensor_fused_adam_with_param_remainders",
32 |         &multi_tensor_fused_adam_with_param_remainders_cuda,
33 |         "CUDA kernel for multi-tensor Adam, "
34 |         "with stored param remainders and param copy",
35 |         py::call_guard<py::gil_scoped_release>());
36 | }


--------------------------------------------------------------------------------
/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lamb_compute_update_term_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   at::Tensor per_tensor_beta1,
 8 |   at::Tensor per_tensor_beta2,
 9 |   at::Tensor per_tensor_beta3,
10 |   at::Tensor per_tensor_bias_correction,
11 |   at::Tensor step,
12 |   at::Tensor per_tensor_epsilon,
13 |   const int mode,
14 |   at::Tensor per_tensor_decay,
15 |   at::Tensor global_scale,
16 |   at::Tensor global_grad_norm,
17 |   const float max_grad_norm);
18 | 
19 | void multi_tensor_lamb_update_weights_cuda(
20 |   int chunk_size,
21 |   at::Tensor noop_flag,
22 |   std::vector<std::vector<at::Tensor>> tensor_lists,
23 |   at::Tensor per_tensor_param_norm,
24 |   at::Tensor per_tensor_update_norm,
25 |   at::Tensor update_norm_offset,
26 |   at::Tensor learning_rate,
27 |   at::Tensor per_tensor_decay,
28 |   at::Tensor global_grad_norm,
29 |   bool use_nvlamb);
30 | 
31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
32 |   m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda,
33 |         "Computes update term for LAMB optimizer", py::call_guard<py::gil_scoped_release>());
34 |   m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda,
35 |         "Applies update term for LAMB optimizer", py::call_guard<py::gil_scoped_release>());
36 | }
37 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/peer_memory/peer_memory.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "peer_memory_cuda.cuh"
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |     m.def("allocate_raw", &apex::contrib::peer_memory::allocate_raw, "allocate_raw");
21 |     m.def("free_raw", &apex::contrib::peer_memory::free_raw, "free_raw");
22 |     m.def("zero", &apex::contrib::peer_memory::zero, "zero");
23 |     m.def("get_raw_ipc_address", &apex::contrib::peer_memory::get_raw_ipc_address, "get_raw_ipc_address");
24 |     m.def("get_raw_peers", &apex::contrib::peer_memory::get_raw_peers, "get_raw_peers");
25 |     m.def("blob_view_half", &apex::contrib::peer_memory::blob_view_half, "blob_view_half");
26 |     m.def("blob_view_float", &apex::contrib::peer_memory::blob_view_float, "blob_view_float");
27 |     m.def("blob_view_int", &apex::contrib::peer_memory::blob_view_int, "blob_view_int");
28 |     m.def("push_pull_halos_1d", &apex::contrib::peer_memory::push_pull_halos_1d, "push_pull_halos_1d");
29 | }
30 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/peer_memory/peer_memory_cuda.cuh:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | #include <torch/extension.h>
19 | #ifndef _peer_memory_h_
20 | #define _peer_memory_h_ 
21 | 
22 | namespace apex { namespace contrib { namespace peer_memory {
23 |     int64_t allocate_raw(int64_t size);
24 |     void free_raw(int64_t raw);
25 |     void zero(int64_t raw, int64_t size);
26 |     at::Tensor get_raw_ipc_address(int64_t raw);
27 |     std::vector<int64_t> get_raw_peers(at::Tensor ipc_addresses, int peer_rank, int64_t raw);
28 |     at::Tensor blob_view_half(int64_t raw, std::vector<int64_t> shape, bool channels_last);
29 |     at::Tensor blob_view_float(int64_t raw, std::vector<int64_t> shape, bool channels_last);
30 |     at::Tensor blob_view_int(int64_t raw, std::vector<int64_t> shape, bool channels_last);
31 |     void push_pull_halos_1d(
32 |         bool diagnostics,
33 |         bool explicit_nhwc,
34 |         int numSM,                      // number of SMs to use
35 | 	bool top_zero,			// true if top halo should be zeroed
36 |         at::Tensor top_out_halo,        // top output halo in sender device memory
37 |         at::Tensor top_out_tx,          // top output transfer buffer in sender peer pool memory
38 | 	at::Tensor top_inp_tx,		// top input transfer buffer in top neighbor peer pool memory
39 |         at::Tensor top_inp_halo,        // top input halo in receiver device memory
40 | 	bool btm_zero,			// true if btm halo should be zeroed
41 |         at::Tensor btm_out_halo,        // btm output halo in sender device memory
42 |         at::Tensor btm_out_tx,          // btm output transfer buffer in sender peer pool memory
43 | 	at::Tensor btm_inp_tx,		// btm input transfer buffer in btm neighbor peer pool memory
44 |         at::Tensor btm_inp_halo,        // btm input halo in receiver device memory
45 |         at::Tensor top_signal,          // top input signal in receiver device memory
46 |         at::Tensor btm_signal,          // btm input signal in receiver device memory
47 |         at::Tensor waits                // top and btm signals for this rank
48 |         );
49 | } } }
50 | #endif
51 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/transducer/transducer_joint.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | #include <ATen/Functions.h>
 3 | 
 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
 5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
 7 | 
 8 | std::vector<torch::Tensor> transducer_joint_cuda_forward(
 9 |     torch::Tensor f,
10 |     torch::Tensor g,
11 |     torch::Tensor fLen,
12 |     torch::Tensor gLen,
13 |     torch::Tensor batchOffset,
14 |     int64_t packedBatch,
15 |     int opt,
16 |     bool packOutput,
17 |     bool relu,
18 |     bool dropout,
19 |     float dropoutProb,
20 |     int tileSize);
21 | 
22 | 
23 | std::vector<torch::Tensor> transducer_joint_cuda_backward(
24 |     std::vector<torch::Tensor> in,
25 |     torch::Tensor fLen,
26 |     torch::Tensor gLen,
27 |     torch::Tensor batchOffset,
28 |     int maxFLen,
29 |     int maxGLen,
30 |     bool packOutput,
31 |     float scale);
32 | 
33 | std::vector<torch::Tensor> transducer_joint_forward(
34 |     torch::Tensor f,
35 |     torch::Tensor g,
36 |     torch::Tensor fLen,
37 |     torch::Tensor gLen,
38 |     torch::Tensor batchOffset,
39 |     int64_t packedBatch,
40 |     int opt,
41 |     bool packOutput,
42 |     bool relu,
43 |     bool dropout,
44 |     float dropoutProb,
45 |     int tileSize) {
46 |     CHECK_INPUT(f);
47 |     CHECK_INPUT(g);
48 |     CHECK_INPUT(fLen);
49 |     CHECK_INPUT(gLen);
50 |     if (packOutput)
51 |         CHECK_INPUT(batchOffset);
52 |     return transducer_joint_cuda_forward(
53 |         f, 
54 |         g, 
55 |         fLen, 
56 |         gLen,
57 |         batchOffset,
58 |         packedBatch,
59 |         opt,
60 |         packOutput,
61 |         relu,
62 |         dropout,
63 |         dropoutProb,
64 |         tileSize);
65 | }
66 | 
67 | std::vector<torch::Tensor> transducer_joint_backward(
68 |     std::vector<torch::Tensor> in,
69 |     torch::Tensor fLen,
70 |     torch::Tensor gLen,
71 |     torch::Tensor batchOffset,
72 |     int maxFLen,
73 |     int maxGLen,
74 |     bool packOutput,
75 |     float scale) {
76 |     for (auto t : in){
77 |         CHECK_INPUT(t);
78 |     }
79 |     CHECK_INPUT(fLen);
80 |     CHECK_INPUT(gLen);
81 |     if (packOutput)
82 |         CHECK_INPUT(batchOffset);
83 |     return transducer_joint_cuda_backward(
84 |         in, 
85 |         fLen, 
86 |         gLen,
87 |         batchOffset,
88 |         maxFLen,
89 |         maxGLen,
90 |         packOutput,
91 |         scale);
92 | }
93 | 
94 | 
95 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
96 |   m.def("forward", &transducer_joint_forward, "transducer joint forward (CUDA)");
97 |   m.def("backward", &transducer_joint_backward, "transducer joint backward (CUDA)");
98 | }


--------------------------------------------------------------------------------
/apex/contrib/csrc/transducer/transducer_loss.cpp:
--------------------------------------------------------------------------------
  1 | #include <torch/extension.h>
  2 | #include <vector>
  3 | 
  4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
  5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
  6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
  7 | 
  8 | std::vector<torch::Tensor> transducer_loss_cuda_forward(
  9 |     torch::Tensor x,
 10 |     torch::Tensor label,
 11 |     torch::Tensor audLen,
 12 |     torch::Tensor txtLen,
 13 |     torch::Tensor batchOffset,
 14 |     int maxFLen,
 15 |     int blankIdx,
 16 |     int opt,
 17 |     bool packedInput);
 18 | 
 19 | torch::Tensor transducer_loss_cuda_backward(
 20 |     torch::Tensor x,
 21 |     torch::Tensor lossGrad,
 22 |     torch::Tensor alpha,
 23 |     torch::Tensor beta,
 24 |     torch::Tensor audLen,
 25 |     torch::Tensor txtLen,
 26 |     torch::Tensor label,
 27 |     torch::Tensor batchOffset,
 28 |     int maxFLen,
 29 |     int blankIdx,
 30 |     int opt,
 31 |     bool fuseSoftmaxBackward,
 32 |     bool packedInput);
 33 | 
 34 | 
 35 | std::vector<torch::Tensor> transducer_loss_forward(
 36 |     torch::Tensor x,
 37 |     torch::Tensor label,
 38 |     torch::Tensor fLen,
 39 |     torch::Tensor yLen,
 40 |     torch::Tensor batchOffset,
 41 |     int maxFLen,
 42 |     int blankIdx,
 43 |     int opt,
 44 |     bool packedInput
 45 |     ) {
 46 | 
 47 |     CHECK_INPUT(x);
 48 |     CHECK_INPUT(label);
 49 |     CHECK_INPUT(fLen);
 50 |     CHECK_INPUT(yLen);
 51 |     if (packedInput)
 52 |         CHECK_INPUT(batchOffset);
 53 |     return transducer_loss_cuda_forward(
 54 |         x, 
 55 |         label, 
 56 |         fLen, 
 57 |         yLen, 
 58 |         batchOffset,
 59 |         maxFLen,
 60 |         blankIdx, 
 61 |         opt,
 62 |         packedInput);
 63 | }
 64 | 
 65 | torch::Tensor transducer_loss_backward(
 66 |     torch::Tensor x,
 67 |     torch::Tensor lossGrad,
 68 |     torch::Tensor alpha,
 69 |     torch::Tensor beta,
 70 |     torch::Tensor fLen,
 71 |     torch::Tensor yLen,
 72 |     torch::Tensor label,
 73 |     torch::Tensor batchOffset,
 74 |     int maxFLen,
 75 |     int blankIdx,
 76 |     int opt,
 77 |     bool fuseSoftmaxBackward,
 78 |     bool packedInput){
 79 | 
 80 |     CHECK_INPUT(x);
 81 |     CHECK_INPUT(label);
 82 |     CHECK_INPUT(lossGrad);
 83 |     CHECK_INPUT(alpha);
 84 |     CHECK_INPUT(beta);
 85 |     CHECK_INPUT(fLen);
 86 |     CHECK_INPUT(yLen);
 87 |     if (packedInput)
 88 |         CHECK_INPUT(batchOffset);
 89 | 
 90 |     return transducer_loss_cuda_backward(
 91 |         x,
 92 |         lossGrad,
 93 |         alpha,
 94 |         beta,
 95 |         fLen,
 96 |         yLen,
 97 |         label,
 98 |         batchOffset,
 99 |         maxFLen,
100 |         blankIdx,
101 |         opt,
102 |         fuseSoftmaxBackward,
103 |         packedInput);
104 | }
105 | 
106 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
107 |   m.def("forward", &transducer_loss_forward, "transducer loss forward (CUDA)", py::call_guard<py::gil_scoped_release>());
108 |   m.def("backward", &transducer_loss_backward, "transducer loss backward (CUDA)", py::call_guard<py::gil_scoped_release>());
109 | }
110 | 


--------------------------------------------------------------------------------
/apex/contrib/csrc/xentropy/interface.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | // CUDA forward declarations
 4 | 
 5 | std::vector<at::Tensor> softmax_xentropy_cuda(
 6 |     const at::Tensor &input,
 7 |     const at::Tensor &labels,
 8 |     const float smoothing,
 9 |     const bool half_to_float);
10 | 
11 | at::Tensor softmax_xentropy_backward_cuda(
12 |     const at::Tensor &grad_loss,
13 |     const at::Tensor &logits,
14 |     const at::Tensor &max_log_sum_exp,
15 |     const at::Tensor &labels,
16 |     const float smoothing);
17 | 
18 | // C++ interface
19 | 
20 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
21 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
22 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
23 | 
24 | std::vector<at::Tensor> softmax_xentropy_forward(
25 |     const at::Tensor &input,
26 |     const at::Tensor &labels,
27 |     const float smoothing,
28 |     const bool half_to_float) {
29 |     CHECK_CUDA(input);
30 |     CHECK_INPUT(labels);
31 | 
32 |     return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
33 | }
34 | 
35 | at::Tensor softmax_xentropy_backward(
36 |     const at::Tensor &grad_loss,
37 |     const at::Tensor &logits,
38 |     const at::Tensor &max_log_sum_exp,
39 |     const at::Tensor &labels,
40 |     const float smoothing)  {
41 |     CHECK_CUDA(grad_loss);
42 |     CHECK_CUDA(logits);
43 |     CHECK_INPUT(max_log_sum_exp);
44 |     CHECK_INPUT(labels);
45 | 
46 |     return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
47 | }
48 | 
49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
50 |     m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)");
51 |     m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)");
52 | }
53 | 


--------------------------------------------------------------------------------
/apex/contrib/fmha/__init__.py:
--------------------------------------------------------------------------------
1 | from .fmha import FMHAFun
2 | 


--------------------------------------------------------------------------------
/apex/contrib/focal_loss/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import torch
 3 |     import focal_loss_cuda
 4 |     from .focal_loss import focal_loss
 5 |     del torch
 6 |     del focal_loss_cuda
 7 |     del focal_loss
 8 | except ImportError as err:
 9 |     print("apex was installed without --focal_loss flag, apex.contrib.focal_loss is not available")
10 | 


--------------------------------------------------------------------------------
/apex/contrib/focal_loss/focal_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import focal_loss_cuda
 4 | 
 5 | 
 6 | class FocalLoss(torch.autograd.Function):
 7 |     @staticmethod
 8 |     def forward(
 9 |         ctx,
10 |         cls_output,
11 |         cls_targets_at_level,
12 |         num_positives_sum,
13 |         num_real_classes,
14 |         alpha,
15 |         gamma,
16 |         label_smoothing=0.0,
17 |     ):
18 |         loss, partial_grad = focal_loss_cuda.forward(
19 |             cls_output,
20 |             cls_targets_at_level,
21 |             num_positives_sum,
22 |             num_real_classes,
23 |             alpha,
24 |             gamma,
25 |             label_smoothing,
26 |         )
27 | 
28 |         ctx.save_for_backward(partial_grad, num_positives_sum)
29 |         return loss
30 | 
31 |     @staticmethod
32 |     def backward(ctx, grad_loss):
33 |         partial_grad, num_positives_sum = ctx.saved_tensors
34 | 
35 |         # The backward kernel is actually in-place to save memory space,
36 |         # partial_grad and grad_input are the same tensor.
37 |         grad_input = focal_loss_cuda.backward(grad_loss, partial_grad, num_positives_sum)
38 | 
39 |         return grad_input, None, None, None, None, None, None
40 | 
41 | 
42 | def focal_loss(
43 |     cls_output: torch.Tensor,
44 |     cls_targets_at_level: torch.Tensor,
45 |     num_positive_sum: torch.Tensor,
46 |     num_real_classes: int,
47 |     alpha: float,
48 |     gamma: float,
49 |     label_smoothing: float = 0.0,
50 | ) -> torch.Tensor:
51 |     """Fused focal loss function."""
52 |     return FocalLoss.apply(
53 |         cls_output,
54 |         cls_targets_at_level,
55 |         num_positive_sum,
56 |         num_real_classes,
57 |         alpha,
58 |         gamma,
59 |         label_smoothing,
60 |     )
61 | 


--------------------------------------------------------------------------------
/apex/contrib/groupbn/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import torch
 3 |     import bnp
 4 |     from .batch_norm import BatchNorm2d_NHWC
 5 |     del torch
 6 |     del bnp
 7 |     del batch_norm
 8 | except ImportError as err:
 9 |     print("apex was installed without --bnp flag, contrib.groupbn is not available")
10 | 


--------------------------------------------------------------------------------
/apex/contrib/index_mul_2d/__init__.py:
--------------------------------------------------------------------------------
1 | from .index_mul_2d import index_mul_2d
2 | 


--------------------------------------------------------------------------------
/apex/contrib/layer_norm/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_norm import FastLayerNorm
2 | 


--------------------------------------------------------------------------------
/apex/contrib/layer_norm/layer_norm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.nn import init
 3 | 
 4 | from apex._autocast_utils import _cast_if_autocast_enabled
 5 | import fast_layer_norm
 6 | 
 7 | 
 8 | class FastLayerNormFN(torch.autograd.Function):
 9 |     @staticmethod
10 |     def forward(ctx, x, gamma, beta, epsilon):
11 |         x = x.contiguous()
12 |         gamma = gamma.contiguous()
13 |         beta = beta.contiguous()
14 |         hidden_size = gamma.numel()
15 |         xmat = x.view((-1, hidden_size))
16 |         ymat, mu, rsigma = fast_layer_norm.ln_fwd(xmat, gamma, beta, epsilon)
17 |         ctx.save_for_backward(x, gamma, mu, rsigma)
18 |         return ymat.view(x.shape)
19 | 
20 |     @staticmethod
21 |     def backward(ctx, dy):
22 |         # assert dy.is_contiguous()
23 |         dy = dy.contiguous()  # this happens!
24 |         x, gamma, mu, rsigma = ctx.saved_tensors
25 | 
26 |         hidden_size = gamma.numel()
27 |         xmat = x.view((-1, hidden_size))
28 |         dymat = dy.view(xmat.shape)
29 |         dxmat, dgamma, dbeta, _, _ = fast_layer_norm.ln_bwd(dymat, xmat, mu, rsigma, gamma)
30 |         dx = dxmat.view(x.shape)
31 |         return dx, dgamma, dbeta, None
32 | 
33 | 
34 | def _fast_layer_norm(x, weight, bias, epsilon):
35 |     args = _cast_if_autocast_enabled(x, weight, bias, epsilon)
36 |     with torch.cuda.amp.autocast(enabled=False):
37 |         return FastLayerNormFN.apply(*args)
38 | 
39 | 
40 | class FastLayerNorm(torch.nn.Module):
41 |     def __init__(self, hidden_size, eps=1e-5):
42 |         super().__init__()
43 |         self.epsilon = eps
44 |         self.weight = torch.nn.Parameter(torch.empty(hidden_size))
45 |         self.bias = torch.nn.Parameter(torch.empty(hidden_size))
46 |         self.reset_parameters()
47 | 
48 |     def reset_parameters(self):
49 |         init.ones_(self.weight)
50 |         init.zeros_(self.bias)
51 | 
52 |     def forward(self, x):
53 |         return _fast_layer_norm(x, self.weight, self.bias, self.epsilon)
54 | 


--------------------------------------------------------------------------------
/apex/contrib/multihead_attn/MHA_bwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/apex/contrib/multihead_attn/MHA_bwd.png


--------------------------------------------------------------------------------
/apex/contrib/multihead_attn/MHA_fwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/apex/contrib/multihead_attn/MHA_fwd.png


--------------------------------------------------------------------------------
/apex/contrib/multihead_attn/README.md:
--------------------------------------------------------------------------------
 1 | # Fast Multihead Attention 
 2 | 
 3 | This implementation has two main features :
 4 | * A C++ implementation to avoid the CPU overheads of Pytorch found with smaller batch sizes.
 5 | * The removal of all copies and transposes found in standard implementations of Multihead Attention.
 6 | 
 7 | |                                            | Python Version | C++ Version |
 8 | | :----------------------------------------- | :------------: | :---------: |
 9 | | Layer Norm and Residual Add Variant        | X              | X           |
10 | | Includes Linear Biases                     | X              |             |
11 | | Reduces CPU Overheads                      |                | X           |
12 | | Fuses masking with Softmax                 |                | X           |
13 | | Removes Transposes and Copies              | X              | X           |
14 | | Includes Self and Encoder/Decoder Variants | X              | X           |
15 | 
16 | ## How to Instantiate
17 | 
18 | `SelfMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)`
19 | `EncdecMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)`
20 | 
21 |  `impl` has two options:
22 |  * `fast` uses C++ Version
23 |  * `default` uses Python Version
24 | 
25 | ## Instructions to build on Linux
26 | 
27 | ```
28 | $ git clone https://github.com/NVIDIA/apex
29 | $ cd apex
30 | $ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" ./
31 | ```
32 | ## Try Performance Tests Yourself!
33 | Perf test script is found here!
34 | ```
35 | cd contrib/examples/multihead_attn
36 | ```
37 | #### Fast Multihead Attention
38 | ```
39 | python perf_test_multihead_attn.py --ref
40 | ```
41 | #### Fast Multihead Attention with C++ Implementation
42 | ```
43 | python perf_test_multihead_attn.py
44 | ```
45 | #### Compare with `torch.nn.MultiheadAttn`
46 | ```
47 | python perf_test_multihead_attn.py --native
48 | ```
49 | #### Test your own range!
50 | ```
51 | python perf_test_multihead_attn.py --seq-length 64 --num-seqs-start 10 --num-seqs-stop 120 --num-seqs-inc 5
52 | ```
53 | 
54 | ## Performance Comparisons
55 | 
56 | * Performance was measured with 64 token sequence lengths on an NVIDIA TitanV card.
57 | * Time is measured across multiple layers to simulate an in model scenario.
58 | 
59 | ![Multihead Attention Forward](MHA_fwd.png)
60 | ![Multihead Attention Backward](MHA_bwd.png)
61 | 


--------------------------------------------------------------------------------
/apex/contrib/multihead_attn/__init__.py:
--------------------------------------------------------------------------------
1 | from .self_multihead_attn import SelfMultiheadAttn
2 | from .encdec_multihead_attn import EncdecMultiheadAttn
3 | from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func
4 | 


--------------------------------------------------------------------------------
/apex/contrib/multihead_attn/mask_softmax_dropout_func.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import fast_multihead_attn
 4 | 
 5 | 
 6 | class MaskSoftmaxDropout(torch.autograd.Function):
 7 |     @staticmethod
 8 |     def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, dropout_prob):
 9 |         heads_t = torch.tensor([heads])
10 |         dropout_prob_t = torch.tensor([dropout_prob])
11 |         null_tensor = torch.tensor([])
12 |         use_mask = pad_mask is not None
13 |         use_mask_t = torch.tensor([use_mask])
14 |         mask_additive_t = torch.tensor([mask_additive])
15 | 
16 |         if mask_additive:
17 |             dropout_results, dropout_mask, softmax_results = fast_multihead_attn.additive_mask_softmax_dropout_forward(
18 |                 use_mask, is_training, heads, inputs, pad_mask if use_mask else null_tensor, dropout_prob
19 |             )
20 |             # fast_additive_mask_softmax_dropout.forward(                           \
21 |         else:
22 |             dropout_results, dropout_mask, softmax_results = fast_multihead_attn.mask_softmax_dropout_forward(
23 |                 use_mask, is_training, heads, inputs, pad_mask if use_mask else null_tensor, dropout_prob
24 |             )
25 |             # fast_mask_softmax_dropout.forward(                           \
26 | 
27 |         ctx.save_for_backward(
28 |             use_mask_t,
29 |             heads_t,
30 |             softmax_results,
31 |             dropout_mask,
32 |             pad_mask if use_mask else null_tensor,
33 |             mask_additive_t,
34 |             dropout_prob_t,
35 |         )
36 | 
37 |         return dropout_results.detach()
38 | 
39 |     @staticmethod
40 |     def backward(ctx, output_grads):
41 |         (
42 |             use_mask_t,
43 |             heads_t,
44 |             softmax_results,
45 |             dropout_mask,
46 |             pad_mask,
47 |             mask_additive_t,
48 |             dropout_prob_t,
49 |         ) = ctx.saved_tensors
50 | 
51 |         if mask_additive_t[0]:
52 |             input_grads = fast_multihead_attn.additive_mask_softmax_dropout_backward(
53 |                 use_mask_t[0], heads_t[0], output_grads, softmax_results, dropout_mask, dropout_prob_t[0]
54 |             )
55 |             # fast_additive_mask_softmax_dropout.backward(                          \
56 |         else:
57 |             input_grads = fast_multihead_attn.mask_softmax_dropout_backward(
58 |                 use_mask_t[0], heads_t[0], output_grads, softmax_results, dropout_mask, pad_mask, dropout_prob_t[0]
59 |             )
60 |             # fast_mask_softmax_dropout.backward(                          \
61 |         return None, None, input_grads, None, None, None
62 | 
63 | 
64 | fast_mask_softmax_dropout_func = MaskSoftmaxDropout.apply
65 | 


--------------------------------------------------------------------------------
/apex/contrib/nccl_allocator/__init__.py:
--------------------------------------------------------------------------------
1 | from .nccl_allocator import *


--------------------------------------------------------------------------------
/apex/contrib/nccl_allocator/nccl_allocator.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import _apex_nccl_allocator
 4 | 
 5 | from contextlib import nullcontext
 6 | 
 7 | 
 8 | __all__ = ["init", "nccl_mem", "create_nccl_mem_pool"]
 9 | 
10 | 
11 | def create_nccl_mem_pool():
12 |     _allocator = _apex_nccl_allocator.get_nccl_allocator()
13 |     _pool = torch.cuda.MemPool(_allocator)
14 |     return _pool
15 | 
16 | 
17 | def init() -> None:
18 |     os.environ["NCCL_NVLS_ENABLE"] = "1"
19 |     os.environ["TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK"] = "0"
20 | 
21 | 
22 | class nccl_mem:
23 |     def __init__(self, pool, enabled = True, device = None, group = None):
24 |         self.device = None
25 |         self.group = None
26 |         self.mem_context = None
27 |         self.pool = pool
28 | 
29 |         if enabled:
30 |             if device is None:
31 |                 self.device = torch.device("cuda", torch.cuda.current_device())
32 |             elif isinstance(device, int):
33 |                 self.device = torch.device("cuda", device)
34 |             elif isinstance(device, str):
35 |                 assert "cuda" in device, "only cuda devices are supported"
36 |                 self.device = torch.device(device)
37 | 
38 |             if group is None:
39 |                 self.group = torch.distributed.distributed_c10d._get_default_group()
40 |             else:
41 |                 self.group = group
42 | 
43 |             self.mem_context = torch.cuda.use_mem_pool(self.pool)
44 |         else:
45 |             self.mem_context = nullcontext()
46 | 
47 |     def __enter__(self):
48 |         self.mem_context.__enter__()
49 |         if self.group is not None:
50 |             backend = self.group._get_backend(self.device)
51 |             try:
52 |                 backend.deregister_mem_pool(self.pool)
53 |             except RuntimeError:
54 |                 pass
55 | 
56 |     def __exit__(self, *args):
57 |         if self.group is not None:
58 |             backend = self.group._get_backend(self.device)
59 |             try:
60 |                 backend.register_mem_pool(self.pool)
61 |             except RuntimeError:
62 |                 pass
63 |         self.mem_context.__exit__(*args)


--------------------------------------------------------------------------------
/apex/contrib/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fp16_optimizer import FP16_Optimizer
2 | from .fused_adam import FusedAdam
3 | from .fused_lamb import FusedLAMB
4 | 


--------------------------------------------------------------------------------
/apex/contrib/peer_memory/__init__.py:
--------------------------------------------------------------------------------
1 | from .peer_memory import PeerMemoryPool
2 | from .peer_halo_exchanger_1d import PeerHaloExchanger1d
3 | 
4 | 


--------------------------------------------------------------------------------
/apex/contrib/sparsity/__init__.py:
--------------------------------------------------------------------------------
1 | from .sparse_masklib import create_mask
2 | from .asp import ASP
3 | 


--------------------------------------------------------------------------------
/apex/contrib/sparsity/permutation_search_kernels/__init__.py:
--------------------------------------------------------------------------------
1 | from .call_permutation_search_kernels import accelerated_search_for_good_permutation
2 | from .permutation_utilities import sum_after_2_to_4


--------------------------------------------------------------------------------
/apex/contrib/sparsity/test/checkpointing_test_part2.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | import torch
 4 | from apex.optimizers import FusedAdam
 5 | from apex.contrib.sparsity import ASP
 6 | 
 7 | def build_model(args):
 8 |     od = OrderedDict()
 9 |     for i in range(args.num_layers):
10 |         if i == 0:
11 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features)
12 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features])
13 |         elif i == args.num_layers-1:
14 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features)
15 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features])
16 |         else:
17 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features)
18 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features])
19 |     return torch.nn.Sequential(od)
20 | 
21 | def train_step(args, model, optimizer, input_batch, target_batch, step):
22 |     predicted_target = model(input_batch)
23 |     loss = ((predicted_target-target_batch)**2).sum()
24 |     loss.backward()
25 |     optimizer.step()
26 |     optimizer.zero_grad()
27 |     step = step + 1
28 |     #print("Step %d :: loss=%e" % (step, loss.item()))
29 |     return step
30 | 
31 | def train_loop(args, model, optimizer, step, num_steps):
32 |     for i in range(num_steps):
33 |         input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
34 |         target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
35 |         step = train_step(args, model, optimizer, input_batch, target_batch, step)
36 |     return step
37 | 
38 | def main(step, args, model_state_dict, optimizer_state_dict):
39 |     #
40 |     # PART2
41 |     #
42 | 
43 |     model = build_model(args).cuda()
44 |     one_ll = next(model.children()).weight
45 |     optimizer = FusedAdam(model.parameters())
46 |     ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask)
47 |     ASP.init_optimizer_for_pruning(optimizer)
48 | 
49 |     torch.manual_seed(args.seed2)
50 |     model.load_state_dict(model_state_dict)
51 |     optimizer.load_state_dict(optimizer_state_dict)
52 | 
53 |     print("Model sparsity is %s" % ("enabled" if ASP.is_sparsity_enabled() else "disabled"))
54 | 
55 |     # train for a few steps with sparse weights
56 |     print("SPARSE :: ",one_ll)
57 |     step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)
58 | 
59 | if __name__ == '__main__':
60 |     checkpoint = torch.load("part1.chkp")
61 |     class Args:
62 |         verbosity = checkpoint['verbosity']
63 |         seed = 4873
64 |         seed2 = checkpoint['seed2']
65 |         pattern = checkpoint['pattern']
66 |         whitelist = checkpoint['whitelist']
67 |         allow_recompute_mask = checkpoint['allow_recompute_mask']
68 |         batch_size = 32
69 |         input_features = 8
70 |         output_features = 8
71 |         hidden_features = 32
72 |         num_layers = 4
73 |         num_dense_steps = 2000
74 |         num_sparse_steps = 3000
75 |         num_sparse_steps_2 = 1000
76 |         checkpoint_path = "part1.chkp"
77 |     args = Args()
78 | 
79 |     main(checkpoint['step'], args, checkpoint['model_state_dict'], checkpoint['optimizer_state_dict'])
80 | 


--------------------------------------------------------------------------------
/apex/contrib/sparsity/test/checkpointing_test_reference.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | import torch
 4 | from apex.optimizers import FusedAdam
 5 | from apex.contrib.sparsity import ASP
 6 | 
 7 | #
 8 | # Reference run for checkpointing test (part1 + part2)
 9 | #
10 | 
11 | def build_model(args):
12 |     od = OrderedDict()
13 |     for i in range(args.num_layers):
14 |         if i == 0:
15 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features)
16 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features])
17 |         elif i == args.num_layers-1:
18 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features)
19 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features])
20 |         else:
21 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features)
22 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features])
23 |     return torch.nn.Sequential(od)
24 | 
25 | def train_step(args, model, optimizer, input_batch, target_batch, step):
26 |     predicted_target = model(input_batch)
27 |     loss = ((predicted_target-target_batch)**2).sum()
28 |     loss.backward()
29 |     optimizer.step()
30 |     optimizer.zero_grad()
31 |     step = step + 1
32 |     #print("Step %d :: loss=%e" % (step, loss.item()))
33 |     return step
34 | 
35 | def train_loop(args, model, optimizer, step, num_steps):
36 |     for i in range(num_steps):
37 |         input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
38 |         target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
39 |         step = train_step(args, model, optimizer, input_batch, target_batch, step)
40 |     return step
41 | 
42 | def main(args):
43 |     #
44 |     # PART1
45 |     #
46 | 
47 |     torch.manual_seed(args.seed)
48 | 
49 |     model = build_model(args).cuda()
50 |     one_ll = next(model.children()).weight
51 |     optimizer = FusedAdam(model.parameters())
52 |     ASP.init_model_for_pruning(model, args.pattern, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask)
53 |     ASP.init_optimizer_for_pruning(optimizer)
54 | 
55 |     step = 0
56 | 
57 |     # train for a few steps with dense weights
58 |     print("DENSE :: ",one_ll)
59 |     step = train_loop(args, model, optimizer, step, args.num_dense_steps)
60 | 
61 |     # simulate sparsity by inserting zeros into existing dense weights
62 |     ASP.compute_sparse_masks()
63 | 
64 |     # train for a few steps with sparse weights
65 |     print("SPARSE :: ",one_ll)
66 |     step = train_loop(args, model, optimizer, step, args.num_sparse_steps)
67 | 
68 |     #
69 |     # PART 2
70 |     #
71 | 
72 |     torch.manual_seed(args.seed2)
73 | 
74 |     # train for a few steps with sparse weights
75 |     print("SPARSE :: ",one_ll)
76 |     step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)
77 | 
78 | if __name__ == '__main__':
79 |     class Args:
80 |         seed = 4873
81 |         seed2 = 99875
82 |         pattern = "m4n2_2d_best"
83 |         whitelist = [torch.nn.Linear]
84 |         allow_recompute_mask = True
85 |         batch_size = 32
86 |         input_features = 8
87 |         output_features = 8
88 |         hidden_features = 32
89 |         num_layers = 4
90 |         num_dense_steps = 2000
91 |         num_sparse_steps = 3000
92 |         num_sparse_steps_2 = 1000
93 |         checkpoint_path = "part1.chkp"
94 |     args = Args()
95 | 
96 |     main(args)
97 | 


--------------------------------------------------------------------------------
/apex/contrib/sparsity/test/toy_problem.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | import torch
 4 | from apex.optimizers import FusedAdam
 5 | from apex.contrib.sparsity import ASP
 6 | 
 7 | def build_model(args):
 8 |     od = OrderedDict()
 9 |     for i in range(args.num_layers):
10 |         if i == 0:
11 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features)
12 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features])
13 |         elif i == args.num_layers-1:
14 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features)
15 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features])
16 |         else:
17 |             od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features)
18 |             od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features])
19 |     return torch.nn.Sequential(od)
20 | 
21 | def train_step(args, model, optimizer, input_batch, target_batch, step):
22 |     predicted_target = model(input_batch)
23 |     loss = ((predicted_target-target_batch)**2).sum()
24 |     loss.backward()
25 |     optimizer.step()
26 |     optimizer.zero_grad()
27 |     step = step + 1
28 |     #print("Step %d :: loss=%e" % (step, loss.item()))
29 |     return step
30 | 
31 | def train_loop(args, model, optimizer, step, num_steps):
32 |     for i in range(num_steps):
33 |         input_batch = torch.randn([args.batch_size, args.input_features]).cuda()
34 |         target_batch = torch.randn([args.batch_size, args.output_features]).cuda()
35 |         step = train_step(args, model, optimizer, input_batch, target_batch, step)
36 |     return step
37 | 
38 | def main(args):
39 |     model = build_model(args).cuda()
40 |     one_ll = next(model.children()).weight
41 |     optimizer = FusedAdam(model.parameters())
42 |     # only prune linear layers, even though we also support conv1d, conv2d and conv3d
43 |     ASP.init_model_for_pruning(model, "m4n2_1d", whitelist=[torch.nn.Linear], allow_recompute_mask=True)
44 |     ASP.init_optimizer_for_pruning(optimizer)
45 | 
46 |     step = 0
47 | 
48 |     # train for a few steps with dense weights
49 |     print("DENSE :: ",one_ll)
50 |     step = train_loop(args, model, optimizer, step, args.num_dense_steps)
51 | 
52 |     # simulate sparsity by inserting zeros into existing dense weights
53 |     ASP.compute_sparse_masks()
54 | 
55 |     # train for a few steps with sparse weights
56 |     print("SPARSE :: ",one_ll)
57 |     step = train_loop(args, model, optimizer, step, args.num_sparse_steps)
58 | 
59 |     # recompute sparse masks
60 |     ASP.compute_sparse_masks()
61 | 
62 |     # train for a few steps with sparse weights
63 |     print("SPARSE :: ",one_ll)
64 |     step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)
65 | 
66 |     # turn off sparsity
67 |     print("SPARSE :: ",one_ll)
68 |     ASP.restore_pruned_weights()
69 | 
70 |     # train for a few steps with dense weights
71 |     print("DENSE :: ",one_ll)
72 |     step = train_loop(args, model, optimizer, step, args.num_dense_steps_2)
73 | 
74 | if __name__ == '__main__':
75 |     class Args:
76 |         batch_size = 32
77 |         input_features = 16
78 |         output_features = 8
79 |         hidden_features = 40
80 |         num_layers = 4
81 |         num_dense_steps = 2000
82 |         num_sparse_steps = 3000
83 |         num_sparse_steps_2 = 1000
84 |         num_dense_steps_2 = 1500
85 |     args = Args()
86 | 
87 |     main(args)
88 | 


--------------------------------------------------------------------------------
/apex/contrib/test/focal_loss/test_focal_loss.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | import torch.nn.functional as F
 5 | 
 6 | reference_available = True
 7 | try:
 8 |     from torchvision.ops.focal_loss import sigmoid_focal_loss
 9 | except ImportError:
10 |     reference_available = False
11 | 
12 | from apex.contrib.focal_loss import focal_loss
13 | 
14 | 
15 | @unittest.skipIf(not reference_available, "Reference implementation `torchvision.ops.focal_loss.sigmoid_focal_loss` is not available.")
16 | class FocalLossTest(unittest.TestCase):
17 | 
18 |     N_SAMPLES = 12
19 |     N_CLASSES = 8
20 |     ALPHA = 0.24
21 |     GAMMA = 2.0
22 |     REDUCTION = "sum"
23 | 
24 |     def test_focal_loss(self) -> None:
25 |         if not reference_available:
26 |             self.skipTest("This test needs `torchvision` for `torchvision.ops.focal_loss.sigmoid_focal_loss`.")
27 |         else:
28 |             x = torch.randn(FocalLossTest.N_SAMPLES, FocalLossTest.N_CLASSES).cuda()
29 |             with torch.no_grad():
30 |                 x_expected = x.clone()
31 |                 x_actual = x.clone()
32 |             x_expected.requires_grad_()
33 |             x_actual.requires_grad_()
34 | 
35 |             classes = torch.randint(0, FocalLossTest.N_CLASSES, (FocalLossTest.N_SAMPLES,)).cuda()
36 |             with torch.no_grad():
37 |                 y = F.one_hot(classes, FocalLossTest.N_CLASSES).float()
38 | 
39 |             expected = sigmoid_focal_loss(
40 |                 x_expected,
41 |                 y,
42 |                 alpha=FocalLossTest.ALPHA,
43 |                 gamma=FocalLossTest.GAMMA,
44 |                 reduction=FocalLossTest.REDUCTION,
45 |             )
46 | 
47 |             actual = sum([focal_loss.FocalLoss.apply(
48 |                 x_actual[i:i+1],
49 |                 classes[i:i+1].long(),
50 |                 torch.ones([], device="cuda"),
51 |                 FocalLossTest.N_CLASSES,
52 |                 FocalLossTest.ALPHA,
53 |                 FocalLossTest.GAMMA,
54 |                 0.0,
55 |             ) for i in range(FocalLossTest.N_SAMPLES)])
56 | 
57 |             # forward parity
58 |             torch.testing.assert_close(expected, actual)
59 | 
60 |             expected.backward()
61 |             actual.backward()
62 | 
63 |             # grad parity
64 |             torch.testing.assert_close(x_expected.grad, x_actual.grad)
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     torch.manual_seed(42)
69 |     unittest.main()
70 | 


--------------------------------------------------------------------------------
/apex/contrib/test/fused_dense/test_gelu.py:
--------------------------------------------------------------------------------
 1 | from apex import FusedDenseGeluDense
 2 | import torch
 3 | import torch.nn.functional as F
 4 | 
 5 | batch_size   = 4
 6 | in_features  = 3
 7 | intermediate_features = 3
 8 | out_features = 2
 9 | 
10 | #tst_dtype = torch.float8_e4m3
11 | # tst_dtype = torch.float8_e5m2
12 | tst_dtype = torch.float16
13 | 
14 | # I = torch.randn(batch_size, in_features, dtype=tst_dtype, device='cuda')
15 | I = torch.tensor([[1., 2. , 3., 4.],
16 |                   [1., 2. , 3., 4.],
17 |                   [1., 2. , 3., 4.],
18 |                   [1., 2. , 3., 4.],
19 |                   [1., 2. , 3., 4.]],dtype=tst_dtype, device='cuda')
20 | 
21 | # W = torch.randn(out_features, in_features, dtype=tst_dtype, device='cuda')
22 | W = torch.tensor([[1., 1. , 1. , 1. ],
23 |                   [2., 2. , 2. , 2. ],
24 |                   [3., 3. , 3. , 3. ]],dtype=tst_dtype, device='cuda')
25 | 
26 | # b = torch.randn(in_features, dtype=tst_dtype, device='cuda')
27 | b = torch.tensor([1, 1, 1], dtype=tst_dtype, device='cuda')
28 | 
29 | print("Torch-A:\n", I)
30 | print("Torch-B:\n", W)
31 | print("Torch-b:\n", b)
32 | 
33 | C  = torch.matmul(I, W.t())+b
34 | gelu_output = F.gelu(C)
35 | print("Torch-C:\n", C)
36 | print("Torch-Geli:\n", gelu_output)
37 | 
38 | denseGlue = FusedDenseGeluDense.fused_dense_gelu_dense_function(in_features, intermediate_features, out_features)
39 | denseGlue.to(dtype=tst_dtype)
40 | denseGlue.cuda()
41 | y_tst = denseGlue(I)
42 | 
43 | print("Torch-aC:\n", aC)
44 | print("GELU tensor:\n", gelu_output)
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/apex/contrib/test/fused_dense/test_half.py:
--------------------------------------------------------------------------------
 1 | from apex import fused_dense
 2 | import torch
 3 | 
 4 | batch_size   = 5
 5 | in_features  = 4
 6 | out_features = 3
 7 | 
 8 | tst_dtype = torch.float8_e5m2
 9 | 
10 | I = torch.randn(batch_size, in_features, dtype=tst_dtype, device='cuda')
11 | 
12 | W = torch.randn(in_features, out_features, dtype=tst_dtype, device='cuda')
13 | 
14 | b = torch.randn(out_features, dtype=tst_dtype, device='cuda')
15 | 
16 | print("Torch-A:\n", I)
17 | print("Torch-B:\n", W)
18 | print("Torch-b:\n", b)
19 | 
20 | 
21 | aC = fused_dense.fused_dense_function(I, W, b)
22 | print("Torch-aC:\n", aC)
23 | torch.testing.assert_close(C,  aC,  atol=1e-3, rtol=1e-3, equal_nan=True)
24 | 


--------------------------------------------------------------------------------
/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import unittest
 3 | import torch.nn.functional as F
 4 | from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func
 5 | 
 6 | class FusedSoftmaxTest(unittest.TestCase):
 7 |     def setUp(self, seed=1234):
 8 |         torch.manual_seed(seed)
 9 |         torch.cuda.manual_seed_all(seed)
10 | 
11 |         self.seq_length   = 80
12 |         self.sequences    = 10
13 |         self.hidden_dim   = 1024
14 |         self.heads        = 16
15 |         self.dropout_prob = 0.0
16 | 
17 |         self.mask = (torch.randn(self.sequences,self.seq_length)>0).cuda()
18 |         self.mask = self.mask.half()*-10000
19 |         self.ref_inputs = torch.randn(self.heads * self.sequences, self.seq_length, self.seq_length, 
20 |                                       dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
21 |         
22 |         self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True)
23 | 
24 |     def test_fused_softmax(self) :
25 |         grads = torch.randn_like(self.tst_inputs)
26 |         y_ref = self.ref_inputs.view(self.sequences, self.heads, self.seq_length, self.seq_length)
27 |         y_ref = y_ref + self.mask.unsqueeze(1).unsqueeze(2)
28 |         y_ref = y_ref.view(self.sequences*self.heads, self.seq_length, self.seq_length) 
29 |         y_ref = F.softmax(y_ref, dim=-1)
30 |         y_ref = torch._fused_dropout(y_ref, 1.0)    
31 |    
32 |         y_tst = fast_mask_softmax_dropout_func(True, self.heads, self.tst_inputs, self.mask, True, 0.0)        
33 |         y_ref[0].backward(grads)
34 |         y_tst.backward(grads)
35 | 
36 |         self.assertTrue(torch.allclose(self.ref_inputs,  self.tst_inputs,  atol=1e-5, rtol=1e-5))
37 |         self.assertTrue(torch.allclose(y_ref[0], y_tst, atol=1e-3, rtol=1e-3))
38 |         self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3))
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     unittest.main()
43 | 


--------------------------------------------------------------------------------
/apex/contrib/test/run_rocm_extensions.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import sys
 3 | 
 4 | 
 5 | test_dirs = ["groupbn", "fused_dense", "layer_norm", "multihead_attn", "transducer", "focal_loss", "index_mul_2d", "."] # "." for test_label_smoothing.py
 6 | ROCM_BLACKLIST = [
 7 |     "layer_norm"
 8 | ]
 9 | 
10 | runner = unittest.TextTestRunner(verbosity=2)
11 | 
12 | errcode = 0
13 | 
14 | for test_dir in test_dirs:
15 |     if test_dir in ROCM_BLACKLIST:
16 |         continue
17 |     suite = unittest.TestLoader().discover(test_dir)
18 | 
19 |     print("\nExecuting tests from " + test_dir)
20 | 
21 |     result = runner.run(suite)
22 | 
23 |     if not result.wasSuccessful():
24 |         errcode = 1
25 | 
26 | sys.exit(errcode)
27 | 


--------------------------------------------------------------------------------
/apex/contrib/transducer/__init__.py:
--------------------------------------------------------------------------------
1 | from .transducer import TransducerJoint
2 | from .transducer import TransducerLoss


--------------------------------------------------------------------------------
/apex/contrib/xentropy/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import torch
 3 |     import xentropy_cuda
 4 |     from .softmax_xentropy import SoftmaxCrossEntropyLoss
 5 |     del torch
 6 |     del xentropy_cuda
 7 |     del softmax_xentropy
 8 | except ImportError as err:
 9 |     print("apex was installed without --xentropy flag, contrib.xentropy is not available")
10 | 


--------------------------------------------------------------------------------
/apex/contrib/xentropy/softmax_xentropy.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import xentropy_cuda
 3 | 
 4 | class SoftmaxCrossEntropyLoss(torch.autograd.Function):
 5 |     @staticmethod
 6 |     def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
 7 |         losses, max_log_sum_exp = xentropy_cuda.forward(
 8 |             logits, labels, smoothing, half_to_float)
 9 |         losses.masked_fill_(labels==padding_idx, 0)
10 | 
11 |         ctx.save_for_backward(logits, max_log_sum_exp, labels,
12 |             torch.FloatTensor([smoothing]),
13 |             torch.LongTensor([padding_idx]))
14 | 
15 |         return losses
16 | 
17 |     @staticmethod
18 |     def backward(ctx, grad_loss):
19 |         logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors
20 | 
21 |         if not grad_loss.is_contiguous():
22 |             grad_loss = grad_loss.contiguous()
23 |         grad_loss.masked_fill_(labels==padding_idx.item(), 0)
24 |         grad_logits = xentropy_cuda.backward(
25 |             grad_loss.contiguous(), logits, max_log_sum_exp,
26 |             labels, smoothing.item())
27 | 
28 |         return grad_logits, None, None, None, None
29 | 


--------------------------------------------------------------------------------
/apex/fp16_utils/README.md:
--------------------------------------------------------------------------------
 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user.  To use `FP16_Optimizer`, only two lines of one's Python model need to change.
 2 | 
 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
 4 | 
 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple)
 6 | 
 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
 8 | 
 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model)
10 | 
11 | 
12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses.  
13 | 
14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management)
15 | 
16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling.  These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically.
17 | 


--------------------------------------------------------------------------------
/apex/fp16_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from .fp16util import (
 2 |     BN_convert_float,
 3 |     network_to_half,
 4 |     prep_param_lists,
 5 |     model_grads_to_master_grads,
 6 |     master_params_to_model_params,
 7 |     tofp16,
 8 |     to_python_float,
 9 |     clip_grad_norm,
10 |     convert_module,
11 |     convert_network,
12 |     FP16Model,
13 | )
14 | 
15 | from .fp16_optimizer import FP16_Optimizer
16 | from .loss_scaler import LossScaler, DynamicLossScaler
17 | 


--------------------------------------------------------------------------------
/apex/fused_dense/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_dense import *
2 | 


--------------------------------------------------------------------------------
/apex/mlp/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlp import *
2 | 


--------------------------------------------------------------------------------
/apex/mlp/mlp.py:
--------------------------------------------------------------------------------
 1 | from copy import copy
 2 | import math
 3 | 
 4 | import torch
 5 | from torch import nn
 6 | 
 7 | from apex._autocast_utils import _cast_if_autocast_enabled
 8 | import mlp_cuda
 9 | 
10 | 
11 | class MlpFunction(torch.autograd.Function):
12 |     @staticmethod
13 |     def forward(ctx, bias, activation, *args):
14 |         output = mlp_cuda.forward(bias, activation, args)
15 |         ctx.save_for_backward(*args)
16 |         ctx.outputs = output
17 |         ctx.bias = bias
18 |         ctx.activation = activation
19 |         return output[0]
20 | 
21 |     @staticmethod
22 |     def backward(ctx, grad_o):
23 |         grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
24 |         del ctx.outputs
25 |         return (None, None, *grads)
26 | 
27 | 
28 | def mlp_function(bias, activation, *args):
29 |     autocast_args = _cast_if_autocast_enabled(bias, activation, *args)
30 |     return MlpFunction.apply(*autocast_args)
31 | 
32 | 
33 | class MLP(torch.nn.Module):
34 |     """Launch MLP in C++
35 | 
36 |     Args:
37 |         mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
38 |         bias (bool): Default True:
39 |         relu (bool): Default True
40 |     """
41 |     def __init__(self, mlp_sizes, bias=True, activation='relu'):
42 |         super(MLP, self).__init__()
43 |         self.num_layers = len(mlp_sizes) - 1
44 |         self.mlp_sizes = copy(mlp_sizes)
45 |         self.bias = 1 if bias else 0
46 | 
47 |         if activation is 'none':
48 |             self.activation = 0
49 |         elif activation is 'relu':
50 |             self.activation = 1
51 |         elif activation is 'sigmoid':
52 |             self.activation = 2
53 |         else:
54 |             raise TypeError("activation must be relu or none.")
55 | 
56 |         self.weights = []
57 |         self.biases = []
58 |         for i in range(self.num_layers):
59 |             w = torch.nn.Parameter(torch.empty(mlp_sizes[i+1], mlp_sizes[i]))
60 |             self.weights.append(w)
61 |             name = 'weight_{}'.format(i)
62 |             setattr(self, name, w)
63 |             if self.bias:
64 |                 b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
65 |                 self.biases.append(b)
66 |                 name = 'bias_{}'.format(i)
67 |                 setattr(self, name, b)
68 | 
69 |         self.reset_parameters()
70 | 
71 |     def reset_parameters(self):
72 |         for weight in self.weights:
73 |             dimsum = weight.size(0) + weight.size(1)
74 |             std = math.sqrt(2. / float(dimsum))
75 |             nn.init.normal_(weight, 0., std)
76 |         if self.bias:
77 |             for bias in self.biases:
78 |                 std = math.sqrt(1. / float(bias.size(0)))
79 |                 nn.init.normal_(bias, 0., std)
80 | 
81 |     def forward(self, input):
82 |         return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)
83 | 
84 |     def extra_repr(self):
85 |         s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
86 |         return s
87 | 


--------------------------------------------------------------------------------
/apex/multi_tensor_apply/__init__.py:
--------------------------------------------------------------------------------
1 | from .multi_tensor_apply import MultiTensorApply
2 | 
3 | multi_tensor_applier = MultiTensorApply(256*32)
4 | multi_tensor_applier_l2norm = MultiTensorApply(2048*32)
5 | 
6 | 


--------------------------------------------------------------------------------
/apex/multi_tensor_apply/multi_tensor_apply.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class MultiTensorApply(object):
 4 |     available = False
 5 |     warned = False
 6 | 
 7 |     def __init__(self, chunk_size):
 8 |         try:
 9 |             import amp_C
10 |             MultiTensorApply.available = True
11 |             self.chunk_size = chunk_size
12 |         except ImportError as err:
13 |             MultiTensorApply.available = False
14 |             MultiTensorApply.import_err = err
15 | 
16 |     def check_avail(self):
17 |         if MultiTensorApply.available == False:
18 |             raise RuntimeError(
19 |                 "Attempted to call MultiTensorApply method, but MultiTensorApply "
20 |                 "is not available, possibly because Apex was installed without "
21 |                 "--cpp_ext --cuda_ext.  Original import error message:",
22 |                 MultiTensorApply.import_err)
23 | 
24 |     def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
25 |         self.check_avail()
26 | 
27 |         return op(self.chunk_size,
28 |                   noop_flag_buffer,
29 |                   tensor_lists,
30 |                   *args)
31 | 


--------------------------------------------------------------------------------
/apex/normalization/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_layer_norm import FusedLayerNorm, MixedFusedLayerNorm, FusedRMSNorm, MixedFusedRMSNorm
2 | 


--------------------------------------------------------------------------------
/apex/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_sgd import FusedSGD
2 | from .fused_adam import FusedAdam
3 | from .fused_novograd import FusedNovoGrad
4 | from .fused_lamb import FusedLAMB
5 | from .fused_adagrad import FusedAdagrad
6 | from .fused_mixed_precision_lamb import FusedMixedPrecisionLamb
7 | from .fused_lars import FusedLARS
8 | 


--------------------------------------------------------------------------------
/apex/parallel/README.md:
--------------------------------------------------------------------------------
 1 | ## Distributed Data Parallel
 2 | 
 3 | distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library.
 4 | 
 5 | `apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with
 6 | computation in the backward pass and bucketing smaller transfers to reduce the total number of
 7 | transfers required.
 8 | 
 9 | multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs.
10 | 
11 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html)
12 | 
13 | #### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed)
14 | 
15 | #### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
16 | 
17 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex)
18 | 
19 | ### Synchronized Batch Normalization
20 | 
21 | `apex.parallel.SyncBatchNorm` has similar APIs as with `torch.nn.BatchNorm*N*d`.
22 | It reduces stats on the first (channel) dimension of the Tensor and accepts
23 | arbitrary spatial dimensions.
24 | 
25 | #### Installation
26 | 
27 | Apex provides two sync BN implementation:
28 | 
29 | 1. There is the Python-only implementation, which is the default implementation
30 | when install with `python setup.py install`.
31 | It uses PyTorch primitive operations and distributed communication package from
32 | `torch.distributed`.
33 | 
34 |    - _Python-only implementation requires input tensor to be of same data type as
35 | layer_
36 | 
37 | 2. We also provide implementation with kernels through CUDA/C++ extension with
38 | improved performance. We are experimenting with Welford and Kahan for reduction
39 | hoping to get better accuracy.
40 |    To use the kernel implementation, user need to install Apex with CUDA extension
41 | enabled `python setup.py install --cuda_ext`.
42 | 
43 |    - _Custom kernel implementation supports fp16 input with fp32 layer as cudnn.
44 | This is required to run imagenet example in fp16._
45 | 
46 |    - _Currently kernel implementation only supports GPU._
47 | 
48 | #### HowTo
49 | 
50 | 1. User could use `apex.parallel.SyncBatchNorm` by building their module with
51 | the layer explicitly.
52 | 
53 | ```
54 | import apex
55 | input_t = torch.randn(3, 5, 20).cuda()
56 | sbn = apex.parallel.SyncBatchNorm(5).cuda()
57 | output_t = sbn(input)
58 | ```
59 | 
60 | 2. User could also take a constructed `torch.nn.Model` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through utility function `apex.parallel.convert_syncbn_model`.
61 | 
62 | ```
63 | # model is an instance of torch.nn.Module
64 | import apex
65 | sync_bn_model = apex.parallel.convert_syncbn_model(model)
66 | ```
67 | 


--------------------------------------------------------------------------------
/apex/parallel/multiproc.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import sys
 3 | import subprocess
 4 | 
 5 | def docstring_hack():
 6 |     """
 7 |     Multiproc file which will launch a set of processes locally for multi-gpu
 8 |     usage: python -m apex.parallel.multiproc main.py ...
 9 |     """
10 |     pass
11 | 
12 | argslist = list(sys.argv)[1:]
13 | world_size = torch.cuda.device_count()
14 | 
15 | if '--world-size' in argslist:
16 |     world_size = int(argslist[argslist.index('--world-size')+1])
17 | else:
18 |     argslist.append('--world-size')
19 |     argslist.append(str(world_size))
20 | 
21 | workers = []
22 | 
23 | for i in range(world_size):
24 |     if '--rank' in argslist:
25 |         argslist[argslist.index('--rank')+1] = str(i)
26 |     else:
27 |         argslist.append('--rank')
28 |         argslist.append(str(i))
29 |     stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w")
30 |     print(argslist)
31 |     p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
32 |     workers.append(p)
33 | 
34 | for p in workers:
35 |     p.wait()
36 | 


--------------------------------------------------------------------------------
/apex/testing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/apex/testing/__init__.py


--------------------------------------------------------------------------------
/apex/testing/common_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | This file contains common utility functions for running the unit tests on ROCM.
 3 | '''
 4 | 
 5 | import torch
 6 | import os
 7 | import sys
 8 | from functools import wraps
 9 | import unittest
10 | 
11 | 
12 | TEST_WITH_ROCM = os.getenv('APEX_TEST_WITH_ROCM', '0') == '1'
13 | SKIP_FLAKY_TEST = os.getenv('APEX_SKIP_FLAKY_TEST', '0') == '1'
14 | 
15 | ## Wrapper to skip the unit tests.
16 | def skipIfRocm(fn):
17 |     @wraps(fn)
18 |     def wrapper(*args, **kwargs):
19 |         if TEST_WITH_ROCM:
20 |             raise unittest.SkipTest("test doesn't currently work on ROCm stack.")
21 |         else:
22 |             fn(*args, **kwargs)
23 |     return wrapper
24 | 
25 | ## Wrapper to skip the flaky unit tests.
26 | def skipFlakyTest(fn):
27 |     @wraps(fn)
28 |     def wrapper(*args, **kwargs):
29 |         if SKIP_FLAKY_TEST:
30 |             raise unittest.SkipTest("Test is flaky.")
31 |         else:
32 |             fn(*args, **kwargs)
33 |     return wrapper
34 | 


--------------------------------------------------------------------------------
/apex/transformer/README.md:
--------------------------------------------------------------------------------
 1 | # apex.transformer
 2 | 
 3 | `apex.transformer` is a module which enables efficient large Transformer models at scale.
 4 | 
 5 | `apex.transformer.tensor_parallel` and `apex.transformer.pipeline_parallel` are both based on [NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM)'s module.
 6 | The former is based on `megatron.mpu` and the latter is on `megatron.schedules` and `megatron.p2p_communication`.
 7 | 
 8 | ## Tensor Model Parallel (TP)
 9 | 
10 | APEX's tensor model parallel utilities provides some `torch.nn.Module`'s, custom fused kernels, and PRNG state handling.
11 | See Appendix B.2 of [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) for the details of
12 | PRNG state handling.
13 | 
14 | ## Pipeline Model Parallel (PP)
15 | APEX's pipeline model parallel functions require models to have `.set_input_tensor` because
16 | the input tensor for `.forward` method can be `None`.
17 | 
18 | The following is a really casual sketch of training script with apex pp.
19 | 
20 | ```python
21 | import torch
22 | import torch.nn as nn
23 | import torch.nn.functional as F
24 | 
25 | from apex.transformer import parallel_state
26 | from apex.transformer.pipeline_parallel import get_forward_backward_func
27 | 
28 | 
29 | class Model(nn.Module):
30 | 
31 |     ...
32 | 
33 |     def __init__(self, *args, **kwargs):
34 |         super().__init__()
35 |         pre_process = kwargs.pop("pre_process")
36 |         post_process = kwargs.pop("post_process")
37 | 
38 |     def set_input_tensor(self, tensor):
39 |         self.input_tensor = tensor
40 | 
41 |     def forward(self, x, ...):
42 |         if parallel_state.is_pipeline_first_stage():
43 |             input = x
44 |         else:
45 |             input = self.input_tensor
46 |         ...
47 | 
48 | 
49 | def model_provider_func(*args, **kwargs):
50 |     return Model(*args, **kwargs)
51 | 
52 | 
53 | def loss_func(pred, label):
54 |     loss = ...
55 |     averaged_loss = average_losses_across_data_parallel_group([loss])
56 |     return loss, {'nice_loss': averaged_loss}
57 | 
58 | 
59 | def forward_step_func(batch, model):
60 |     input, label = process_batch(batch)
61 |     out = model(input)
62 |     return out, partial(loss_func, label)
63 | 
64 | 
65 | forward_backward_func = get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
66 | 
67 | 
68 | parallel_state.initialize_model_parallel(
69 |     tensor_model_parallel_size,
70 |     pipeline_model_parallel_size,
71 |     virtual_pipeline_model_parallel_size,
72 | )
73 | # The following line basically is equivalent to `build_model(Model, wrap_with_ddp, virtual_pipeline_model_parallel_size, *model_args, **model_kwargs)`
74 | model = build_model(model_provider_func, wrap_with_ddp, virtual_pipeline_model_parallel_size, *model_args, **model_kwargs)
75 | optimizer = ...
76 | data_loader = ...
77 | for epoch in range(num_epochs):
78 |     for batch in data_loader:
79 |         forward_backward_func(forward_step_func, batch, model, forward_only=False, tensor_shape)
80 |         optimizer.step()
81 | ```
82 | 


--------------------------------------------------------------------------------
/apex/transformer/__init__.py:
--------------------------------------------------------------------------------
 1 | from apex.transformer import amp
 2 | from apex.transformer import functional
 3 | from apex.transformer import parallel_state
 4 | from apex.transformer import pipeline_parallel
 5 | from apex.transformer import tensor_parallel
 6 | from apex.transformer import utils
 7 | from apex.transformer.enums import LayerType
 8 | from apex.transformer.enums import AttnType
 9 | from apex.transformer.enums import AttnMaskType
10 | 
11 | 
12 | __all__ = [
13 |     "amp",
14 |     "functional",
15 |     "parallel_state",
16 |     "pipeline_parallel",
17 |     "tensor_parallel",
18 |     "utils",
19 |     # enums.py
20 |     "LayerType",
21 |     "AttnType",
22 |     "AttnMaskType",
23 | ]
24 | 


--------------------------------------------------------------------------------
/apex/transformer/_data/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer._data._batchsampler import MegatronPretrainingRandomSampler
2 | from apex.transformer._data._batchsampler import MegatronPretrainingSampler
3 | 
4 | 
5 | __all__ = [
6 |     "MegatronPretrainingRandomSampler",
7 |     "MegatronPretrainingSampler",
8 | ]
9 | 


--------------------------------------------------------------------------------
/apex/transformer/amp/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer.amp.grad_scaler import GradScaler
2 | 
3 | 
4 | __all__ = [
5 |     "GradScaler",
6 | ]
7 | 


--------------------------------------------------------------------------------
/apex/transformer/enums.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import enum
16 | 
17 | 
18 | class LayerType(enum.Enum):
19 |     encoder = 1
20 |     decoder = 2
21 | 
22 | 
23 | class AttnType(enum.Enum):
24 |     self_attn = 1
25 |     cross_attn = 2
26 | 
27 | 
28 | class AttnMaskType(enum.Enum):
29 |     padding = 1
30 |     causal = 2
31 | 
32 | 
33 | class ModelType(enum.Enum):
34 |     encoder_or_decoder = 1
35 |     encoder_and_decoder = 2
36 | 


--------------------------------------------------------------------------------
/apex/transformer/functional/__init__.py:
--------------------------------------------------------------------------------
 1 | from apex.transformer.functional.fused_rope import (
 2 |     fused_apply_rotary_pos_emb,
 3 |     fused_apply_rotary_pos_emb_cached,
 4 |     fused_apply_rotary_pos_emb_thd,
 5 |     fused_apply_rotary_pos_emb_2d,
 6 | )
 7 | from apex.transformer.functional.fused_softmax import FusedScaleMaskSoftmax
 8 | 
 9 | __all__ = [
10 |     "FusedScaleMaskSoftmax",
11 |     "fused_apply_rotary_pos_emb",
12 |     "fused_apply_rotary_pos_emb_cached",
13 |     "fused_apply_rotary_pos_emb_thd",
14 |     "fused_apply_rotary_pos_emb_2d",
15 | ]
16 | 


--------------------------------------------------------------------------------
/apex/transformer/layers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | from apex.transformer.layers.layer_norm import FastLayerNorm
 3 | from apex.transformer.layers.layer_norm import FusedLayerNorm
 4 | from apex.transformer.layers.layer_norm import MixedFusedLayerNorm
 5 | 
 6 | 
 7 | __all__ = [
 8 |     "FastLayerNorm",
 9 |     "FusedLayerNorm",
10 |     "MixedFusedLayerNorm",
11 | ]
12 | 


--------------------------------------------------------------------------------
/apex/transformer/log_util.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | 
 4 | 
 5 | def get_transformer_logger(name: str) -> logging.Logger:
 6 |     name_wo_ext = os.path.splitext(name)[0]
 7 |     return logging.getLogger(name_wo_ext)
 8 | 
 9 | 
10 | def set_logging_level(verbosity) -> None:
11 |     """Change logging severity.
12 | 
13 |     Args:
14 |         verbosity
15 |     """
16 |     from apex import _library_root_logger
17 | 
18 |     _library_root_logger.setLevel(verbosity)
19 | 


--------------------------------------------------------------------------------
/apex/transformer/pipeline_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | from apex.transformer.pipeline_parallel.schedules import get_forward_backward_func
2 | from apex.transformer.pipeline_parallel.schedules.common import build_model
3 | 
4 | 
5 | __all__ = [
6 |     "get_forward_backward_func",
7 |     "build_model",
8 | ]
9 | 


--------------------------------------------------------------------------------
/apex/transformer/pipeline_parallel/_timers.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | class _Timer:
 7 |     """Timer."""
 8 | 
 9 |     def __init__(self, name):
10 |         self.name_ = name
11 |         self.elapsed_ = 0.0
12 |         self.started_ = False
13 |         self.start_time = time.time()
14 | 
15 |     def start(self):
16 |         """Start the timer."""
17 |         assert not self.started_, "timer has already been started"
18 |         torch.cuda.synchronize()
19 |         self.start_time = time.time()
20 |         self.started_ = True
21 | 
22 |     def stop(self):
23 |         """Stop the timer."""
24 |         assert self.started_, "timer is not started"
25 |         torch.cuda.synchronize()
26 |         self.elapsed_ += time.time() - self.start_time
27 |         self.started_ = False
28 | 
29 |     def reset(self):
30 |         """Reset timer."""
31 |         self.elapsed_ = 0.0
32 |         self.started_ = False
33 | 
34 |     def elapsed(self, reset=True):
35 |         """Calculate the elapsed time."""
36 |         started_ = self.started_
37 |         # If the timing in progress, end it first.
38 |         if self.started_:
39 |             self.stop()
40 |         # Get the elapsed time.
41 |         elapsed_ = self.elapsed_
42 |         # Reset the elapsed time
43 |         if reset:
44 |             self.reset()
45 |         # If timing was in progress, set it back.
46 |         if started_:
47 |             self.start()
48 |         return elapsed_
49 | 
50 | 
51 | class _Timers:
52 |     """Group of timers."""
53 | 
54 |     def __init__(self):
55 |         self.timers = {}
56 | 
57 |     def __call__(self, name):
58 |         if name not in self.timers:
59 |             self.timers[name] = _Timer(name)
60 |         return self.timers[name]
61 | 
62 |     def write(self, names, writer, iteration, normalizer=1.0, reset=False):
63 |         """Write timers to a tensorboard writer"""
64 |         # currently when using add_scalars,
65 |         # torch.utils.add_scalars makes each timer its own run, which
66 |         # polutes the runs list, so we just add each as a scalar
67 |         assert normalizer > 0.0
68 |         for name in names:
69 |             value = self.timers[name].elapsed(reset=reset) / normalizer
70 |             writer.add_scalar(name + "-time", value, iteration)
71 | 
72 |     def log(self, names, normalizer=1.0, reset=True):
73 |         """Log a group of timers."""
74 |         assert normalizer > 0.0
75 |         string = "time (ms)"
76 |         for name in names:
77 |             elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer
78 |             string += " | {}: {:.2f}".format(name, elapsed_time)
79 |         if torch.distributed.is_initialized():
80 |             if torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1):
81 |                 print(string, flush=True)
82 |         else:
83 |             print(string, flush=True)
84 | 


--------------------------------------------------------------------------------
/apex/transformer/pipeline_parallel/schedules/__init__.py:
--------------------------------------------------------------------------------
 1 | from apex.transformer import parallel_state
 2 | from apex.transformer.pipeline_parallel.utils import get_num_microbatches
 3 | from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import (
 4 |     forward_backward_no_pipelining,
 5 | )
 6 | from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import (
 7 |     _forward_backward_pipelining_with_interleaving,
 8 | )
 9 | from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import (
10 |     forward_backward_pipelining_without_interleaving,
11 | )
12 | 
13 | __all__ = [
14 |     "get_forward_backward_func",
15 | ]
16 | 
17 | 
18 | class ExperimentalWarning(Warning):
19 |     pass
20 | 
21 | 
22 | def get_forward_backward_func(
23 |     virtual_pipeline_model_parallel_size, pipeline_model_parallel_size,
24 | ):
25 |     if parallel_state.get_pipeline_model_parallel_world_size() > 1:
26 |         if virtual_pipeline_model_parallel_size is not None:
27 |             if get_num_microbatches() % pipeline_model_parallel_size != 0:
28 |                 msg = "number of microbatches is not divisible by pipeline-parallel size when using interleaved schedule"
29 |                 raise RuntimeError(msg)
30 |             forward_backward_func = _forward_backward_pipelining_with_interleaving
31 |         else:
32 |             forward_backward_func = forward_backward_pipelining_without_interleaving
33 |     else:
34 |         forward_backward_func = forward_backward_no_pipelining
35 |     return forward_backward_func
36 | 


--------------------------------------------------------------------------------
/apex/transformer/tensor_parallel/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Model parallel utility interface."""
16 | 
17 | from apex.transformer.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
18 | 
19 | from apex.transformer.tensor_parallel.data import broadcast_data
20 | 
21 | from apex.transformer.tensor_parallel.layers import (
22 |     ColumnParallelLinear,
23 |     RowParallelLinear,
24 |     VocabParallelEmbedding,
25 |     set_tensor_model_parallel_attributes,
26 |     set_defaults_if_not_set_tensor_model_parallel_attributes,
27 |     copy_tensor_model_parallel_attributes,
28 | )
29 | 
30 | from apex.transformer.tensor_parallel.mappings import (
31 |     copy_to_tensor_model_parallel_region,
32 |     gather_from_tensor_model_parallel_region,
33 |     reduce_from_tensor_model_parallel_region,
34 |     scatter_to_tensor_model_parallel_region,
35 |     scatter_to_sequence_parallel_region,
36 | )
37 | 
38 | from .random import (
39 |     checkpoint,
40 |     get_cuda_rng_tracker,
41 |     init_checkpointed_activations_memory_buffer,
42 |     model_parallel_cuda_manual_seed,
43 |     reset_checkpointed_activations_memory_buffer,
44 | )
45 | 
46 | from apex.transformer.tensor_parallel.utils import split_tensor_along_last_dim
47 | 
48 | 
49 | __all__ = [
50 |     # cross_entropy.py
51 |     "vocab_parallel_cross_entropy",
52 |     # data.py
53 |     "broadcast_data",
54 |     # layers.py
55 |     "ColumnParallelLinear",
56 |     "RowParallelLinear",
57 |     "VocabParallelEmbedding",
58 |     "set_tensor_model_parallel_attributes",
59 |     "set_defaults_if_not_set_tensor_model_parallel_attributes",
60 |     "copy_tensor_model_parallel_attributes",
61 |     # mappings.py
62 |     "copy_to_tensor_model_parallel_region",
63 |     "gather_from_tensor_model_parallel_region",
64 |     "reduce_from_tensor_model_parallel_region",
65 |     "scatter_to_tensor_model_parallel_region",
66 |     "scatter_to_sequence_parallel_region",
67 |     # random.py
68 |     "checkpoint",
69 |     "get_cuda_rng_tracker",
70 |     "init_checkpointed_activations_memory_buffer",
71 |     "model_parallel_cuda_manual_seed",
72 |     "reset_checkpointed_activations_memory_buffer",
73 |     # utils.py
74 |     "split_tensor_along_last_dim",
75 | ]
76 | 


--------------------------------------------------------------------------------
/apex/transformer/tensor_parallel/utils.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from typing import List, Sequence
16 | 
17 | import torch
18 | 
19 | from apex.transformer.utils import divide
20 | 
21 | 
22 | def split_tensor_along_last_dim(
23 |     tensor: torch.Tensor,
24 |     num_partitions: int,
25 |     contiguous_split_chunks: bool = False,
26 | ) -> List[torch.Tensor]:
27 |     """Split a tensor along its last dimension.
28 |     Arguments:
29 |         tensor: input tensor.
30 |         num_partitions: number of partitions to split the tensor
31 |         contiguous_split_chunks: If True, make each chunk contiguous
32 |                                  in memory.
33 |     """
34 |     # Get the size and dimension.
35 |     last_dim = tensor.dim() - 1
36 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
37 |     # Split.
38 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
39 |     # Note: torch.split does not create contiguous tensors by default.
40 |     if contiguous_split_chunks:
41 |         return tuple(chunk.contiguous() for chunk in tensor_list)
42 | 
43 |     return tensor_list
44 | 
45 | 
46 | class VocabUtility:
47 |     """Split the vocabulary into `world_size` chunks and return the
48 |     first and last index of the vocabulary belonging to the `rank`
49 |     partition: Note that indices in [fist, last)"""
50 | 
51 |     @staticmethod
52 |     def vocab_range_from_per_partition_vocab_size(
53 |         per_partition_vocab_size: int, rank, world_size: int
54 |     ) -> Sequence[int]:
55 |         index_f = rank * per_partition_vocab_size
56 |         index_l = index_f + per_partition_vocab_size
57 |         return index_f, index_l
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
61 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
62 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
63 |             per_partition_vocab_size, rank, world_size
64 |         )
65 | 


--------------------------------------------------------------------------------
/apex/transformer/testing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/apex/transformer/testing/__init__.py


--------------------------------------------------------------------------------
/apex/transformer/utils.py:
--------------------------------------------------------------------------------
 1 | """Utility functions used by both `pipeline_parallel` and `tensor_parallel`"""
 2 | import torch
 3 | 
 4 | from apex.transformer import parallel_state
 5 | 
 6 | 
 7 | def ensure_divisibility(numerator, denominator):
 8 |     """Ensure that numerator is divisible by the denominator."""
 9 |     assert numerator % denominator == 0, "{} is not divisible by {}".format(
10 |         numerator, denominator
11 |     )
12 | 
13 | 
14 | def divide(numerator, denominator):
15 |     """Ensure that numerator is divisible by the denominator and return
16 |     the division value."""
17 |     ensure_divisibility(numerator, denominator)
18 |     return numerator // denominator
19 | 
20 | 
21 | def split_tensor_into_1d_equal_chunks(tensor):
22 |     """Break a tensor into equal 1D chunks."""
23 |     data = tensor.view(-1)
24 |     partition_size = (
25 |         torch.numel(data) // parallel_state.get_tensor_model_parallel_world_size()
26 |     )
27 |     start_index = partition_size * parallel_state.get_tensor_model_parallel_rank()
28 |     end_index = start_index + partition_size
29 |     return data[start_index:end_index]
30 | 
31 | 
32 | def gather_split_1d_tensor(tensor):
33 |     """Opposite of above function, gather values from model parallel ranks."""
34 |     world_size = parallel_state.get_tensor_model_parallel_world_size()
35 |     numel = torch.numel(tensor)
36 |     numel_gathered = world_size * numel
37 |     gathered = torch.empty(
38 |         numel_gathered,
39 |         dtype=tensor.dtype,
40 |         device=torch.cuda.current_device(),
41 |         requires_grad=False,
42 |     )
43 |     torch.distributed._all_gather_base(
44 |         gathered,
45 |         tensor,
46 |         group=parallel_state.get_tensor_model_parallel_group()
47 |         )
48 |     return gathered
49 | 


--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | 
 3 | export PYTORCH_ROCM_ARCH=gfx942
 4 | # export TENSILE_DB=0x40
 5 | # export HIPBLASLT_LOG_MASK=0xff
 6 | 
 7 | 
 8 | python setup.py develop --cuda_ext --cpp_ext
 9 | cp build/lib.linux-x86_64-cpython-39/fused_dense_cuda.cpython-39-x86_64-linux-gnu.so /opt/conda/envs/py_3.9/lib/python3.9/site-packages/.
10 | 
11 | # export HIPBLASLT_LOG_FILE=hipblaslt_bgrad.log
12 | 
13 | # python apex/contrib/test/fused_dense/test_fused_dense_1.py
14 | 
15 | # python apex/contrib/test/fused_dense/test_half_T.py
16 | # python apex/contrib/test/fused_dense/test_half_NT.py
17 | 


--------------------------------------------------------------------------------
/csrc/compat.h:
--------------------------------------------------------------------------------
 1 | #ifndef TORCH_CHECK
 2 | #define TORCH_CHECK AT_CHECK
 3 | #endif
 4 | 
 5 | #ifdef VERSION_GE_1_3
 6 | #define DATA_PTR data_ptr
 7 | #else
 8 | #define DATA_PTR data
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/csrc/flatten_unflatten.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | #include <torch/csrc/utils/tensor_flatten.h>
 3 | // https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h
 4 | 
 5 | at::Tensor flatten(std::vector<at::Tensor> tensors)
 6 | {
 7 |   return torch::utils::flatten_dense_tensors(tensors);
 8 | }
 9 | 
10 | std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tensor> tensors)
11 | {
12 |   return torch::utils::unflatten_dense_tensors(flat, tensors);
13 | }
14 | 
15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
16 |   m.def("flatten", &flatten, "Flatten dense tensors");
17 |   m.def("unflatten", &unflatten, "Unflatten dense tensors");
18 | }
19 | 


--------------------------------------------------------------------------------
/csrc/fused_dense_base.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | #include <torch/torch.h>
 3 | #include <vector>
 4 | #include <cublasLt.h>
 5 | #include <stdio.h>
 6 | 
 7 | at::Tensor linear_bias_forward(                      at::Tensor input,   at::Tensor weight,    at::Tensor bias);
 8 | 
 9 | std::vector<at::Tensor> linear_bias_backward(        at::Tensor input,   at::Tensor weight,    at::Tensor d_output);
10 | 
11 | std::vector<at::Tensor> linear_gelu_linear_forward(  at::Tensor input,   at::Tensor weight1,   at::Tensor bias1,     at::Tensor weight2,   at::Tensor bias2);
12 | 
13 | std::vector<at::Tensor> linear_gelu_linear_backward( at::Tensor input,   at::Tensor gelu_in,   at::Tensor output1,   at::Tensor weight1,   at::Tensor weight2, at::Tensor d_output2);
14 | 
15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
16 |   m.def("linear_bias_forward",         &linear_bias_forward,         "linear bias forward");
17 |   m.def("linear_bias_backward",        &linear_bias_backward,        "linear bias backward");
18 |   m.def("linear_gelu_linear_forward",  &linear_gelu_linear_forward,  "linear gelu linear forward");
19 |   m.def("linear_gelu_linear_backward", &linear_gelu_linear_backward, "linear gelu linear backward");
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/csrc/megatron/fused_bias_swiglu.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | // Function declarations
 4 | torch::Tensor fused_bias_swiglu_forward(torch::Tensor input, torch::Tensor bias);
 5 | torch::Tensor fused_bias_swiglu_backward(torch::Tensor grad_output, torch::Tensor input, torch::Tensor bias);
 6 | 
 7 | // Register functions for PyTorch extension
 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 9 |     m.def("forward", &fused_bias_swiglu_forward, "Fused Bias SwiGLU Forward (CUDA)");
10 |     m.def("backward", &fused_bias_swiglu_backward, "Fused Bias SwiGLU Backward (CUDA)");
11 | }


--------------------------------------------------------------------------------
/csrc/megatron/fused_weight_gradient_dense.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | #include <cstdio>
 4 | #include <vector>
 5 | 
 6 | void wgrad_gemm_accum_fp32_cuda_stub(
 7 |   at::Tensor &input_2d,
 8 |   at::Tensor &d_output_2d,
 9 |   at::Tensor &d_weight
10 | );
11 | 
12 | void wgrad_gemm_accum_fp16_cuda_stub(
13 |   at::Tensor &input_2d,
14 |   at::Tensor &d_output_2d,
15 |   at::Tensor &d_weight
16 | );
17 | 
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
19 |     m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32_cuda_stub, "wgrad gemm accum in fp32", py::call_guard<py::gil_scoped_release>());
20 |     m.def("wgrad_gemm_accum_fp16", &wgrad_gemm_accum_fp16_cuda_stub, "wgrad gemm accum in fp16", py::call_guard<py::gil_scoped_release>());
21 | }
22 | 


--------------------------------------------------------------------------------
/csrc/megatron/generic_scaled_masked_softmax_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn
22 | {
23 |   namespace fused_softmax
24 |   {
25 |     namespace generic_scaled_masked_softmax
26 |     {
27 | 
28 |       torch::Tensor fwd_cuda(
29 |           torch::Tensor const &input,
30 |           torch::Tensor const &mask,
31 |           float scale_factor);
32 | 
33 |       torch::Tensor bwd_cuda(
34 |           torch::Tensor const &output_grads,
35 |           torch::Tensor const &softmax_results,
36 |           float scale_factor);
37 | 
38 |       torch::Tensor fwd(
39 |           torch::Tensor const &input,
40 |           torch::Tensor const &mask,
41 |           float scale_factor)
42 |       {
43 |         TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
44 |         TORCH_CHECK((input.scalar_type() == at::ScalarType::Half) ||
45 |                        (input.scalar_type() == at::ScalarType::BFloat16),
46 |                    "Only fp16 and bf16 are supported");
47 |         TORCH_CHECK(mask.dim() == 4, "expected 4D tensor");
48 | 
49 |         return fwd_cuda(input, mask, scale_factor);
50 |       }
51 | 
52 |       torch::Tensor bwd(
53 |           torch::Tensor const &output_grads,
54 |           torch::Tensor const &softmax_results,
55 |           float scale_factor)
56 |       {
57 | 
58 |         TORCH_CHECK(output_grads.dim() == 4, "expected 3D tensor");
59 |         TORCH_CHECK(softmax_results.dim() == 4, "expected 3D tensor");
60 | 
61 |         TORCH_CHECK((output_grads.scalar_type() == at::ScalarType::Half) ||
62 |                        (output_grads.scalar_type() == at::ScalarType::BFloat16),
63 |                    "Only fp16 and bf16 are supported");
64 |         TORCH_CHECK((softmax_results.scalar_type() == at::ScalarType::Half) ||
65 |                        (softmax_results.scalar_type() == at::ScalarType::BFloat16),
66 |                    "Only fp16 and bf16 are supported");
67 | 
68 |         return bwd_cuda(output_grads, softmax_results, scale_factor);
69 |       }
70 | 
71 |     } // end namespace generic_scaled_masked_softmax
72 |   }   // end namespace fused_softmax
73 | } // end namespace multihead_attn
74 | 
75 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
76 |   m.def("forward",
77 |         &multihead_attn::fused_softmax::generic_scaled_masked_softmax::fwd,
78 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.", py::call_guard<py::gil_scoped_release>());
79 | 
80 |   m.def("backward",
81 |         &multihead_attn::fused_softmax::generic_scaled_masked_softmax::bwd,
82 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.", py::call_guard<py::gil_scoped_release>());
83 | }
84 | 


--------------------------------------------------------------------------------
/csrc/megatron/scaled_masked_softmax_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <torch/extension.h>
18 | #include <vector>
19 | 
20 | namespace multihead_attn {
21 | namespace fused_softmax {
22 | namespace scaled_masked_softmax {
23 | 
24 | torch::Tensor fwd_cuda(
25 |     torch::Tensor const& input, 
26 |     torch::Tensor const& mask,
27 |     float scale_factor);
28 | 
29 | torch::Tensor bwd_cuda(
30 |     torch::Tensor const& output_grads, 
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor);
33 | 
34 | int get_batch_per_block_cuda(
35 |     int query_seq_len,
36 |     int key_seq_len,
37 |     int batches,
38 |     int attn_heads);
39 | 
40 | torch::Tensor fwd(
41 |     torch::Tensor const& input,
42 |     torch::Tensor const& mask,
43 |     float scale_factor) {
44 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
45 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
46 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
47 |       "Only fp16 and bf16 are supported");
48 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
49 | 
50 |   return fwd_cuda(input, mask, scale_factor);
51 | }
52 | 
53 | torch::Tensor bwd(
54 |     torch::Tensor const& output_grads, 
55 |     torch::Tensor const& softmax_results,
56 |     float scale_factor) {
57 | 
58 |   AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
59 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
60 | 
61 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
62 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
63 |       "Only fp16 and bf16 are supported");
64 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
65 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
66 |       "Only fp16 and bf16 are supported");
67 | 
68 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
69 | }
70 | 
71 | int get_batch_per_block(
72 |     int query_seq_len,
73 |     int key_seq_len,
74 |     int batches,
75 |     int attn_heads) {
76 |     return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
77 | }
78 | 
79 | } // end namespace scaled_masked_softmax
80 | } // end namespace fused_softmax
81 | } // end namespace multihead_attn
82 | 
83 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
84 |   m.def("forward", 
85 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 
86 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.");
87 | 
88 |   m.def("backward",
89 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
90 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.");
91 | 
92 |   m.def("get_batch_per_block",
93 |         &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
94 |         "Return Batch per block size."
95 |   );
96 | }
97 | 


--------------------------------------------------------------------------------
/csrc/megatron/scaled_softmax_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input, 
27 |     float scale_factor);
28 | 
29 | torch::Tensor bwd_cuda(
30 |     torch::Tensor const& output_grads, 
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor);
33 | 
34 | torch::Tensor fwd(
35 |     torch::Tensor const& input,
36 |     float scale_factor) {
37 |   TORCH_CHECK(input.dim() == 4, "expected 4D tensor");
38 |   TORCH_CHECK((input.scalar_type() == at::ScalarType::Half) ||
39 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
40 |       "Only fp16 and bf16 are supported");
41 | 
42 |   return fwd_cuda(input, scale_factor);
43 | }
44 | 
45 | torch::Tensor bwd(
46 |     torch::Tensor const& output_grads, 
47 |     torch::Tensor const& softmax_results,
48 |     float scale_factor) {
49 | 
50 |   TORCH_CHECK(output_grads.dim() == 4, "expected 3D tensor");
51 |   TORCH_CHECK(softmax_results.dim() == 4, "expected 3D tensor");
52 | 
53 |   TORCH_CHECK((output_grads.scalar_type() == at::ScalarType::Half) ||
54 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
55 |       "Only fp16 and bf16 are supported");
56 |   TORCH_CHECK((softmax_results.scalar_type() == at::ScalarType::Half) ||
57 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
58 |       "Only fp16 and bf16 are supported");
59 | 
60 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
61 | }
62 | 
63 | } // end namespace scaled_softmax
64 | } // end namespace fused_softmax
65 | } // end namespace multihead_attn
66 | 
67 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
68 |   m.def("forward", 
69 |         &multihead_attn::fused_softmax::scaled_softmax::fwd, 
70 | 	"Self Multihead Attention scaled, softmax -- Forward.", py::call_guard<py::gil_scoped_release>());
71 |   m.def("backward", 
72 |         &multihead_attn::fused_softmax::scaled_softmax::bwd,
73 | 	"Self Multihead Attention scaled, softmax -- Backward.", py::call_guard<py::gil_scoped_release>());
74 | }
75 | 
76 | 


--------------------------------------------------------------------------------
/csrc/megatron/scaled_upper_triang_masked_softmax_cpu.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <torch/extension.h>
18 | #include <vector>
19 | 
20 | namespace multihead_attn {
21 | namespace fused_softmax {
22 | namespace scaled_upper_triang_masked_softmax {
23 | 
24 | torch::Tensor fwd_cuda(
25 |     torch::Tensor const& input, 
26 |     float scale_factor);
27 | 
28 | torch::Tensor bwd_cuda(
29 |     torch::Tensor const& output_grads, 
30 |     torch::Tensor const& softmax_results,
31 |     float scale_factor);
32 | 
33 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
34 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
35 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
36 | 	     (input.scalar_type() == at::ScalarType::BFloat16), 
37 |       "Only fp16 and bf16 are supported");
38 | 
39 |   return fwd_cuda(input, scale_factor);
40 | }
41 | 
42 | torch::Tensor bwd(
43 |     torch::Tensor const& output_grads, 
44 |     torch::Tensor const& softmax_results,
45 |     float scale_factor) {
46 | 
47 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
48 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
49 | 
50 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
51 | 	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
52 |       "Only fp16 and bf16 are supported");
53 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
54 | 	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
55 |       "Only fp16 and bf16 are supported");
56 | 
57 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
58 | }
59 | 
60 | } // end namespace scaled_upper_triang_masked_softmax
61 | } // end namespace fused_softmax
62 | } // end namespace multihead_attn
63 | 
64 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
65 |   m.def("forward", 
66 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
67 | 	"Self Multihead Attention scaled, time masked softmax -- Forward.");
68 |   m.def("backward", 
69 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
70 | 	"Self Multihead Attention scaled, time masked softmax -- Backward.");
71 | }
72 | 


--------------------------------------------------------------------------------
/csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include <cuda.h>
19 | #include <cuda_runtime.h>
20 | #include <cuda_fp16.h>
21 | //#include <cuda_profiler_api.h>
22 | #include <ATen/cuda/CUDAContext.h>
23 | #include <torch/extension.h>
24 | #include "scaled_upper_triang_masked_softmax.h"
25 | #include "type_shim.h"
26 | 
27 | namespace multihead_attn {
28 | namespace fused_softmax {
29 | namespace scaled_upper_triang_masked_softmax {
30 | 
31 | torch::Tensor fwd_cuda(
32 |     torch::Tensor const& input, 
33 |     float scale_factor)
34 | {
35 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
36 |   const int attn_batches = input.size(0);
37 |   const int seq_len = input.size(1);
38 |   TORCH_INTERNAL_ASSERT(seq_len <= 16384);
39 | 
40 |   // Output 
41 |   auto act_options = input.options().requires_grad(false);
42 |   torch::Tensor softmax_results = 
43 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
44 | 
45 |   // Softmax Intermediate Result Ptr
46 |   void* input_ptr = static_cast<void*>(input.data_ptr());
47 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
48 | 
49 |   DISPATCH_HALF_AND_BFLOAT(
50 |       input.scalar_type(),
51 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
52 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
53 | 	  reinterpret_cast<scalar_t*>(softmax_results_ptr),
54 | 	  reinterpret_cast<const scalar_t*>(input_ptr),
55 | 	  scale_factor,
56 | 	  seq_len,
57 | 	  seq_len,
58 | 	  attn_batches);
59 |       );
60 |   return softmax_results;
61 | }
62 | 				      
63 | 
64 | torch::Tensor bwd_cuda(
65 |     torch::Tensor const& output_grads_, 
66 |     torch::Tensor const& softmax_results_, 
67 |     float scale_factor)  {
68 | 	
69 |   auto output_grads = output_grads_.contiguous();
70 |   auto softmax_results = softmax_results_.contiguous();
71 | 
72 |   //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
73 |   const int attn_batches = output_grads.size(0);
74 |   const int seq_len = output_grads.size(1);
75 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
76 | 
77 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
78 | 
79 |   //Softmax Grad
80 |   DISPATCH_HALF_AND_BFLOAT(
81 |       output_grads_.scalar_type(),
82 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
83 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
84 |           reinterpret_cast<scalar_t*>(output_grads_ptr), 
85 | 	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
86 | 	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
87 | 	  scale_factor,
88 | 	  seq_len,
89 | 	  seq_len,
90 | 	  attn_batches);
91 |       );
92 |   
93 |   //backward pass is completely in-place
94 |   return output_grads;
95 | }
96 | }
97 | }
98 | }
99 | 


--------------------------------------------------------------------------------
/csrc/multi_tensor_adagrad.cu:
--------------------------------------------------------------------------------
  1 | #include <ATen/ATen.h>
  2 | #include <ATen/AccumulateType.h>
  3 | #include <ATen/cuda/CUDAContext.h>
  4 | #include <ATen/cuda/Exceptions.h>
  5 | // Another possibility:
  6 | // #include <torch/all.h>
  7 | 
  8 | #include <assert.h>
  9 | 
 10 | #include "multi_tensor_apply.cuh"
 11 | #include "type_shim.h"
 12 | 
 13 | #define BLOCK_SIZE 1024
 14 | #define ILP 4
 15 | 
 16 | typedef enum {
 17 |   ADAGRAD_MODE_0 = 0, // L2 regularization mode.
 18 |   ADAGRAD_MODE_1 = 1, // AdamW-style weight decay.
 19 | 
 20 | } adagradMode_t;
 21 | 
 22 | using MATH_T = float;
 23 | 
 24 | template <typename T> struct AdagradFunctor {
 25 |   __device__ __forceinline__ void
 26 |   operator()(int chunk_size, volatile int *noop_gmem, TensorListMetadata<3> &tl,
 27 |              const float epsilon, const float lr, adagradMode_t mode,
 28 |              const float weight_decay) {
 29 |     int tensor_loc = tl.block_to_tensor[blockIdx.x];
 30 |     int chunk_idx = tl.block_to_chunk[blockIdx.x];
 31 |     int n = tl.sizes[tensor_loc];
 32 | 
 33 |     T *g = (T *)tl.addresses[0][tensor_loc];
 34 |     g += chunk_idx * chunk_size;
 35 | 
 36 |     T *p = (T *)tl.addresses[1][tensor_loc];
 37 |     p += chunk_idx * chunk_size;
 38 | 
 39 |     T *h = (T *)tl.addresses[2][tensor_loc];
 40 |     h += chunk_idx * chunk_size;
 41 | 
 42 |     n -= chunk_idx * chunk_size;
 43 | 
 44 |     // see note in multi_tensor_scale_kernel.cu
 45 |     for (int i_start = 0; i_start < n && i_start < chunk_size;
 46 |          i_start += blockDim.x * ILP) {
 47 |       MATH_T r_g[ILP];
 48 |       MATH_T r_p[ILP];
 49 |       MATH_T r_h[ILP];
 50 | #pragma unroll
 51 |       for (int ii = 0; ii < ILP; ii++) {
 52 |         int i = i_start + threadIdx.x + ii * blockDim.x;
 53 |         if (i < n && i < chunk_size) {
 54 |           r_g[ii] = g[i];
 55 |           r_p[ii] = p[i];
 56 |           r_h[ii] = h[i];
 57 |         } else {
 58 |           r_g[ii] = MATH_T(0);
 59 |           r_p[ii] = MATH_T(0);
 60 |           r_h[ii] = MATH_T(0);
 61 |         }
 62 |       }
 63 | #pragma unroll
 64 |       for (int ii = 0; ii < ILP; ii++) {
 65 |         if (mode == ADAGRAD_MODE_0) { // L2
 66 |           r_g[ii] = r_g[ii] + weight_decay * r_p[ii];
 67 |           r_h[ii] = r_h[ii] + r_g[ii] * r_g[ii];
 68 |           r_p[ii] = r_p[ii] - lr * (r_g[ii] / (sqrtf(r_h[ii]) + epsilon));
 69 |         } else { // AdamW-style
 70 |           r_h[ii] = r_h[ii] + r_g[ii] * r_g[ii];
 71 |           r_p[ii] = r_p[ii] - lr * (r_g[ii] / (sqrtf(r_h[ii]) + epsilon) + weight_decay * r_p[ii]);
 72 |         }
 73 |       }
 74 | #pragma unroll
 75 |       for (int ii = 0; ii < ILP; ii++) {
 76 |         int i = i_start + threadIdx.x + ii * blockDim.x;
 77 |         if (i < n && i < chunk_size) {
 78 |           p[i] = r_p[ii];
 79 |           h[i] = r_h[ii];
 80 |         }
 81 |       }
 82 |     }
 83 |   }
 84 | };
 85 | 
 86 | void multi_tensor_adagrad_cuda(
 87 |     int chunk_size, at::Tensor noop_flag,
 88 |     std::vector<std::vector<at::Tensor>> tensor_lists, const float lr,
 89 |     const float epsilon, const int mode, const float weight_decay) {
 90 |   using namespace at;
 91 | 
 92 |   // Assume single type across p,g,h now
 93 |   DISPATCH_DOUBLE_FLOAT_AND_HALF_AND_BFLOAT16(
 94 |       tensor_lists[0][0].scalar_type(), 0, "adagrad",
 95 |       multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
 96 |                             AdagradFunctor<scalar_t_0>(), epsilon, lr,
 97 |                             (adagradMode_t)mode, weight_decay);)
 98 | 
 99 |   AT_CUDA_CHECK(cudaGetLastError());
100 | }
101 | 


--------------------------------------------------------------------------------
/csrc/static_switch.h:
--------------------------------------------------------------------------------
 1 | // From
 2 | // https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
 3 | 
 4 | #pragma once
 5 | 
 6 | /// @param COND       - a boolean expression to switch by
 7 | /// @param CONST_NAME - a name given for the constexpr bool variable.
 8 | /// @param ...       - code to execute for true and false
 9 | ///
10 | /// Usage:
11 | /// ```
12 | /// BOOL_SWITCH(flag, BoolConst, [&] {
13 | ///     some_function<BoolConst>(...);
14 | /// });
15 | /// ```
16 | #define BOOL_SWITCH(COND, CONST_NAME, ...)      \
17 |   [&] {                                         \
18 |     if (COND) {                                 \
19 |       constexpr static bool CONST_NAME = true;  \
20 |       return __VA_ARGS__();                     \
21 |     } else {                                    \
22 |       constexpr static bool CONST_NAME = false; \
23 |       return __VA_ARGS__();                     \
24 |     }                                           \
25 |   }()


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = NVIDIAAPEX
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | gh-pages:
16 | 	git checkout gh-pages
17 | 	rm -rf build
18 | 	rm -rf source
19 | 	git checkout master -- .
20 | 	make html
21 | 	rm -rf ../_modules ../_sources ../_static
22 | 	mv -fv build/html/* ../
23 | 	rm -rf build
24 | 	git add -A
25 | 	git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
26 | 
27 | .PHONY: help Makefile
28 | 
29 | # Catch-all target: route all unknown targets to Sphinx using the new
30 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
31 | %: Makefile
32 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
33 | 


--------------------------------------------------------------------------------
/docs/source/_static/css/pytorch_theme.css:
--------------------------------------------------------------------------------
  1 | body {
  2 |     font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
  3 | }
  4 | 
  5 | /* Default header fonts are ugly */
  6 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
  7 |     font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
  8 | }
  9 | 
 10 | /* Use white for docs background */
 11 | .wy-side-nav-search {
 12 |     background-color: #fff;
 13 | }
 14 | 
 15 | .wy-nav-content-wrap, .wy-menu li.current > a  {
 16 |     background-color: #fff;
 17 | }
 18 | 
 19 | @media screen and (min-width: 1400px) {
 20 |     .wy-nav-content-wrap {
 21 |         background-color: rgba(0, 0, 0, 0.0470588);
 22 |     }
 23 | 
 24 |     .wy-nav-content {
 25 |         background-color: #fff;
 26 |     }
 27 | }
 28 | 
 29 | /* Fixes for mobile */
 30 | .wy-nav-top {
 31 |     background-color: #fff;
 32 |     background-image: url('../img/apex.jpg');
 33 |     background-repeat: no-repeat;
 34 |     background-position: center;
 35 |     padding: 0;
 36 |     margin: 0.4045em 0.809em;
 37 |     color: #333;
 38 | }
 39 | 
 40 | .wy-nav-top > a {
 41 |     display: none;
 42 | }
 43 | 
 44 | @media screen and (max-width: 768px) {
 45 |     .wy-side-nav-search>a img.logo {
 46 |         height: 60px;
 47 |     }
 48 | }
 49 | 
 50 | /* This is needed to ensure that logo above search scales properly */
 51 | .wy-side-nav-search a {
 52 |     display: block;
 53 | }
 54 | 
 55 | /* This ensures that multiple constructors will remain in separate lines. */
 56 | .rst-content dl:not(.docutils) dt {
 57 |     display: table;
 58 | }
 59 | 
 60 | /* Use our red for literals (it's very similar to the original color) */
 61 | .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
 62 |     color: #F05732;
 63 | }
 64 | 
 65 | .rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
 66 | .rst-content code.xref, a .rst-content tt, a .rst-content code {
 67 |     color: #404040;
 68 | }
 69 | 
 70 | /* Change link colors (except for the menu) */
 71 | 
 72 | a {
 73 |     color: #F05732;
 74 | }
 75 | 
 76 | a:hover {
 77 |     color: #F05732;
 78 | }
 79 | 
 80 | 
 81 | a:visited {
 82 |     color: #D44D2C;
 83 | }
 84 | 
 85 | .wy-menu a {
 86 |     color: #b3b3b3;
 87 | }
 88 | 
 89 | .wy-menu a:hover {
 90 |     color: #b3b3b3;
 91 | }
 92 | 
 93 | /* Default footer text is quite big */
 94 | footer {
 95 |     font-size: 80%;
 96 | }
 97 | 
 98 | footer .rst-footer-buttons {
 99 |     font-size: 125%; /* revert footer settings - 1/80% = 125% */
100 | }
101 | 
102 | footer p {
103 |     font-size: 100%;
104 | }
105 | 
106 | /* For hidden headers that appear in TOC tree */
107 | /* see http://stackoverflow.com/a/32363545/3343043 */
108 | .rst-content .hidden-section {
109 |     display: none;
110 | }
111 | 
112 | nav .hidden-section {
113 |     display: inherit;
114 | }
115 | 
116 | .wy-side-nav-search>div.version {
117 |     color: #000;
118 | }
119 | 


--------------------------------------------------------------------------------
/docs/source/_static/img/nv-pytorch2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/docs/source/_static/img/nv-pytorch2.png


--------------------------------------------------------------------------------
/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
 1 | {% extends "!layout.html" %}
 2 |   {% block sidebartitle %} {{ super() }}
 3 | 
 4 |   <style>
 5 |     /* Sidebar header (and topbar for mobile) */
 6 |     .wy-side-nav-search, .wy-nav-top {
 7 |       background: #76b900;
 8 |     }
 9 | 
10 |     .wy-side-nav-search a:link, .wy-nav-top a:link {
11 |       color: #fff;
12 |     }
13 |     .wy-side-nav-search a:visited, .wy-nav-top a:visited {
14 |       color: #fff;
15 |     }
16 |     .wy-side-nav-search a:hover, .wy-nav-top a:hover {
17 |       color: #fff;
18 |     }
19 | 
20 |     .wy-menu-vertical a:link, .wy-menu-vertical a:visited {
21 |       color: #d9d9d9
22 |     }
23 | 
24 |     .wy-menu-vertical a:active {
25 |       background-color: #76b900
26 |     }
27 | 
28 |     .wy-side-nav-search>div.version {
29 |       color: rgba(0, 0, 0, 0.3)
30 |     }
31 |   </style>
32 |   {% endblock %}
33 | 
34 |   {% block footer %} {{ super() }}
35 | 
36 |   <style>
37 |   a:link, a:visited {
38 |     color: #76b900;
39 |   }
40 | 
41 |   a:hover {
42 |     color: #8c0;
43 |   }
44 | 
45 |   .rst-content dl:not(.docutils) dt {
46 |     background: rgba(118, 185, 0, 0.1);
47 |     color: rgba(59,93,0,1);
48 |     border-top: solid 3px rgba(59,93,0,1);
49 |   }
50 |   </style>
51 |   {% endblock %}
52 | 


--------------------------------------------------------------------------------
/docs/source/fp16_utils.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.fp16_utils
 5 | ===================================
 6 | 
 7 | This submodule contains utilities designed to streamline the mixed precision training recipe 
 8 | presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions 
 9 | `Training Neural Networks with Mixed Precision: Theory and Practice`_ and 
10 | `Training Neural Networks with Mixed Precision: Real Examples`_.
11 | For Pytorch users, Real Examples in particular is recommended.
12 | 
13 | Full runnable Python scripts demonstrating ``apex.fp16_utils`` 
14 | can be found on the Github page:
15 | 
16 | | `Simple FP16_Optimizer demos`_
17 | |
18 | | `Distributed Mixed Precision Training with imagenet`_
19 | |
20 | | `Mixed Precision Training with word_language_model`_
21 | |
22 | |
23 | 
24 | .. _`on Parallel Forall`:
25 |     https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
26 | .. _`Training Neural Networks with Mixed Precision: Theory and Practice`:
27 |     http://on-demand.gputechconf.com/gtc/2018/video/S8923/
28 | .. _`Training Neural Networks with Mixed Precision: Real Examples`:
29 |     http://on-demand.gputechconf.com/gtc/2018/video/S81012/
30 | .. _`Simple FP16_Optimizer demos`:
31 |     https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple
32 | .. _`Distributed Mixed Precision Training with imagenet`:
33 |     https://github.com/NVIDIA/apex/tree/master/examples/imagenet
34 | .. _`Mixed Precision Training with word_language_model`:
35 |     https://github.com/NVIDIA/apex/tree/master/examples/word_language_model
36 | 
37 | .. automodule:: apex.fp16_utils
38 | .. currentmodule:: apex.fp16_utils
39 | 
40 | Automatic management of master params + loss scaling
41 | ----------------------------------------------------
42 | 
43 | .. autoclass:: FP16_Optimizer
44 |     :members:
45 | 
46 | .. autoclass:: LossScaler
47 |     :members:
48 | 
49 | .. autoclass:: DynamicLossScaler
50 |     :members:
51 | 
52 | Manual master parameter management
53 | ----------------------------------
54 | 
55 | .. autofunction:: prep_param_lists
56 | 
57 | .. autofunction:: master_params_to_model_params
58 | 
59 | .. autofunction:: model_grads_to_master_grads
60 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. PyTorch documentation master file, created by
 2 |    sphinx-quickstart on Fri Dec 23 13:31:47 2016.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | :github_url: https://github.com/nvidia/apex
 7 | 
 8 | Apex (A PyTorch Extension)
 9 | ===================================
10 | 
11 | This site contains the API documentation for Apex (https://github.com/nvidia/apex),
12 | a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.  Some of the code here will be included in upstream Pytorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible.
13 | 
14 | Installation instructions can be found here:  https://github.com/NVIDIA/apex#quick-start.
15 | 
16 | Some other useful material, including GTC 2019 and Pytorch DevCon 2019 Slides, can be found here:  https://github.com/mcarilli/mixed_precision_references.
17 | 
18 | .. toctree::
19 |    :maxdepth: 1
20 |    :caption: AMP:  Automatic Mixed Precision
21 | 
22 |    amp
23 | 
24 | .. toctree::
25 |    :maxdepth: 1
26 |    :caption: Distributed Training
27 | 
28 |    parallel
29 | 
30 | .. toctree::
31 |    :maxdepth: 1
32 |    :caption: Fused Optimizers
33 | 
34 |    optimizers
35 | 
36 | .. toctree::
37 |    :maxdepth: 1
38 |    :caption: Fused Layer Norm
39 | 
40 |    layernorm
41 | 
42 | ..   .. toctree::
43 |      :maxdepth: 1
44 |      :caption: Deprecated mixed precision API
45 |      fp16_util
46 | 
47 | ..   RNN
48 |    
49 | Indices and tables
50 | ==================
51 | 
52 | * :ref:`genindex`
53 | * :ref:`modindex`
54 | 


--------------------------------------------------------------------------------
/docs/source/layernorm.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.normalization.fused_layer_norm
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.normalization
 8 | .. currentmodule:: apex.normalization
 9 | 
10 | .. FusedAdam
11 |    ----------
12 | 
13 | .. autoclass:: FusedLayerNorm
14 |     :members:
15 | 
16 | .. autoclass:: FusedRMSNorm
17 |     :members:
18 | 


--------------------------------------------------------------------------------
/docs/source/optimizers.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.optimizers
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.optimizers
 8 | .. currentmodule:: apex.optimizers
 9 | 
10 | .. FusedAdam
11 |    ----------
12 | 
13 | .. autoclass:: FusedAdam
14 |     :members:
15 | 
16 | .. autoclass:: FusedLAMB
17 |     :members:
18 | 
19 | .. autoclass:: FusedNovoGrad
20 |     :members:
21 | 
22 | .. autoclass:: FusedSGD
23 |     :members:
24 | 


--------------------------------------------------------------------------------
/docs/source/parallel.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.parallel
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.parallel
 8 | .. currentmodule:: apex.parallel
 9 | 
10 | .. DistributedDataParallel
11 |    ----------
12 | 
13 | .. autoclass:: DistributedDataParallel
14 |     :members:
15 | 
16 | .. autoclass:: Reducer
17 |     :members:
18 | 
19 | .. autoclass:: SyncBatchNorm
20 |     :members:
21 | 
22 | Utility functions
23 | ----------------------------------
24 | 
25 | .. autofunction:: convert_syncbn_model
26 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | This directory contains examples illustrating Apex mixed precision and distributed tools.
2 | 
3 | **Note for users of the pre-unification API**:
4 | `deprecated_api` contains examples illustrating the old (pre-unified) APIs.  These APIs will be removed soon, and users are strongly encouraged to switch.  The separate mixed precision tools called `Amp` and `FP16_Optimizer` in the old API are exposed via different flags/optimization levels in the new API.
5 | 


--------------------------------------------------------------------------------
/examples/dcgan/README.md:
--------------------------------------------------------------------------------
 1 | # Mixed Precision DCGAN Training in PyTorch
 2 | 
 3 | `main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/dcgan](https://github.com/pytorch/examples/tree/master/dcgan).
 4 | It implements Automatic Mixed Precision (Amp) training of the DCGAN example for different datasets. Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s.  For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).
 5 | 
 6 | We introduce these changes to the PyTorch DCGAN example as described in the [Multiple models/optimizers/losses](https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses) section of the documentation::
 7 | ```
 8 | # Added after models and optimizers construction
 9 | [netD, netG], [optimizerD, optimizerG] = amp.initialize(
10 |     [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
11 | ...
12 | # loss.backward() changed to:
13 | with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
14 |     errD_real_scaled.backward()
15 | ...
16 | with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
17 |     errD_fake_scaled.backward()
18 | ...
19 | with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
20 |     errG_scaled.backward()
21 | ```
22 | 
23 | Note that we use different `loss_scalers` for each computed loss.
24 | Using a separate loss scaler per loss is [optional, not required](https://nvidia.github.io/apex/advanced.html#optionally-have-amp-use-a-different-loss-scaler-per-loss).
25 | 
26 | To improve the numerical stability, we swapped `nn.Sigmoid() + nn.BCELoss()` to `nn.BCEWithLogitsLoss()`.
27 | 
28 | With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**
29 | 
30 | "Pure FP32" training:
31 | ```
32 | $ python main_amp.py --opt_level O0
33 | ```
34 | Recommended mixed precision training:
35 | ```
36 | $ python main_amp.py --opt_level O1
37 | ```
38 | 
39 | Have a look at the original [DCGAN example](https://github.com/pytorch/examples/tree/master/dcgan) for more information about the used arguments.
40 | 
41 | To enable mixed precision training, we introduce the `--opt_level` argument.
42 | 


--------------------------------------------------------------------------------
/examples/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base image must at least have pytorch and CUDA installed.
 2 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:22.02-py3
 3 | FROM $BASE_IMAGE
 4 | ARG BASE_IMAGE
 5 | RUN echo "Installing Apex on top of ${BASE_IMAGE}"
 6 | # make sure we don't overwrite some existing directory called "apex"
 7 | WORKDIR /tmp/unique_for_apex
 8 | # uninstall Apex if present, twice to make absolutely sure :)
 9 | RUN pip uninstall -y apex || :
10 | RUN pip uninstall -y apex || :
11 | # SHA is something the user can touch to force recreation of this Docker layer,
12 | # and therefore force cloning of the latest version of Apex
13 | RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
14 | WORKDIR /tmp/unique_for_apex/apex
15 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
16 | WORKDIR /workspace
17 | 


--------------------------------------------------------------------------------
/examples/docker/README.md:
--------------------------------------------------------------------------------
 1 | ## Option 1:  Create a new container with Apex
 2 | 
 3 | **Dockerfile** installs the latest Apex on top of an existing image.  Run
 4 | ```
 5 | docker build -t new_image_with_apex .
 6 | ```
 7 | By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
 8 | which requires an NVIDIA GPU Cloud (NGC) account.  If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
 9 | 
10 | Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
11 | `BASE_IMAGE` must have Pytorch and Cuda installed.  For example, any
12 | `-devel` image for Pytorch 1.0 and later from the
13 | [official Pytorch Dockerhub](https://hub.docker.com/r/pytorch/pytorch) may be used:
14 | ```
15 | docker build --build-arg BASE_IMAGE=1.3-cuda10.1-cudnn7-devel -t new_image_with_apex .
16 | ```
17 | 
18 | If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
19 | 
20 | **Warning:**
21 | Currently, the non-`-devel` images on Pytorch Dockerhub do not contain the Cuda compiler `nvcc`.  Therefore,
22 | images whose name does not contain `-devel` are not eligible candidates for `BASE_IMAGE`.
23 | 
24 | ### Running your Apex container
25 | 
26 | Like any Cuda-enabled Pytorch container, a container with Apex should be run via [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), for example:
27 | ```
28 | docker run --runtime=nvidia -it --rm --ipc=host new_image_with_apex
29 | ```
30 | 
31 | ## Option 2:  Install Apex in a running container
32 | 
33 | Instead of building a new container, it is also a viable option to `git clone https://github.com/NVIDIA/apex.git` on bare metal, mount the Apex repo into your container at launch by running, for example,
34 | ```
35 | docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
36 | ```
37 | then go to /apex/in/container within the running container and
38 | ```
39 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
40 | ```
41 | 


--------------------------------------------------------------------------------
/examples/simple/distributed/README.md:
--------------------------------------------------------------------------------
 1 | **distributed_data_parallel.py** and **run.sh** show an example using Amp with
 2 | [apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
 3 | [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
 4 | and the Pytorch multiprocess launcher script,
 5 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
 6 | The use of `Amp` with DistributedDataParallel does not need to change from ordinary 
 7 | single-process use.  The only gotcha is that wrapping your model with `DistributedDataParallel` must
 8 | come after the call to `amp.initialize`.  Test via
 9 | ```bash
10 | bash run.sh
11 | ```
12 | 
13 | **This is intended purely as an instructional example, not a performance showcase.**
14 | 


--------------------------------------------------------------------------------
/examples/simple/distributed/distributed_data_parallel.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import os
 4 | from apex import amp
 5 | # FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
 6 | from apex.parallel import DistributedDataParallel
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | # FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
10 | # automatically by torch.distributed.launch.
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | args = parser.parse_args()
13 | 
14 | # FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
15 | # the 'WORLD_SIZE' environment variable will also be set automatically.
16 | args.distributed = False
17 | if 'WORLD_SIZE' in os.environ:
18 |     args.distributed = int(os.environ['WORLD_SIZE']) > 1
19 | 
20 | if args.distributed:
21 |     # FOR DISTRIBUTED:  Set the device according to local_rank.
22 |     torch.cuda.set_device(args.local_rank)
23 | 
24 |     # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
25 |     # environment variables, and requires that you use init_method=`env://`.
26 |     torch.distributed.init_process_group(backend='nccl',
27 |                                          init_method='env://')
28 | 
29 | torch.backends.cudnn.benchmark = True
30 | 
31 | N, D_in, D_out = 64, 1024, 16
32 | 
33 | # Each process receives its own batch of "fake input data" and "fake target data."
34 | # The "training loop" in each process just uses this fake batch over and over.
35 | # https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
36 | # example of distributed data sampling for both training and validation.
37 | x = torch.randn(N, D_in, device='cuda')
38 | y = torch.randn(N, D_out, device='cuda')
39 | 
40 | model = torch.nn.Linear(D_in, D_out).cuda()
41 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
42 | 
43 | model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
44 | 
45 | if args.distributed:
46 |     # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
47 |     # apex.parallel.DistributedDataParallel.
48 |     model = DistributedDataParallel(model)
49 |     # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
50 |     # model = torch.nn.parallel.DistributedDataParallel(model,
51 |     #                                                   device_ids=[args.local_rank],
52 |     #                                                   output_device=args.local_rank)
53 | 
54 | loss_fn = torch.nn.MSELoss()
55 | 
56 | for t in range(500):
57 |     optimizer.zero_grad()
58 |     y_pred = model(x)
59 |     loss = loss_fn(y_pred, y)
60 |     with amp.scale_loss(loss, optimizer) as scaled_loss:
61 |         scaled_loss.backward()
62 |     optimizer.step()
63 | 
64 | if args.local_rank == 0:
65 |     print("final loss = ", loss)
66 | 


--------------------------------------------------------------------------------
/examples/simple/distributed/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
3 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools",
4 |     "wheel",
5 | ]
6 | build-backend = "setuptools.build_meta"
7 | 
8 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cxxfilt>=0.2.0
2 | tqdm>=4.28.1
3 | numpy
4 | PyYAML>=5.1
5 | pytest>=3.5.1
6 | packaging>=14.0
7 | matplotlib>=3.8


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | flake8>=3.7.9
3 | Sphinx>=3.0.3


--------------------------------------------------------------------------------
/tests/L0/run_amp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/tests/L0/run_amp/__init__.py


--------------------------------------------------------------------------------
/tests/L0/run_amp/test_larc.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | from torch.nn import Parameter
 6 | 
 7 | from apex import amp
 8 | from apex.parallel.LARC import LARC
 9 | from utils import common_init, common_reset
10 | from apex.amp import _amp_state
11 | 
12 | 
13 | class MyModel(torch.nn.Module):
14 |     def __init__(self, unique):
15 |         super(MyModel, self).__init__()
16 |         self.weight0 = Parameter(
17 |             unique + torch.arange(2, device="cuda", dtype=torch.float32)
18 |         )
19 | 
20 |     def forward(self, input):
21 |         return (input * self.weight0).sum()
22 | 
23 | 
24 | class TestLARC(unittest.TestCase):
25 |     def setUp(self):
26 |         self.x = torch.ones((2), device="cuda", dtype=torch.float32)
27 |         common_init(self)
28 | 
29 |     def tearDown(self):
30 |         common_reset(self)
31 | 
32 |     def test_larc_mixed_precision(self):
33 |         for opt_level in ["O0", "O1", "O2", "O3"]:
34 |             model = MyModel(1)
35 | 
36 |             optimizer = LARC(
37 |                 torch.optim.SGD(
38 |                     [{"params": model.parameters(), "lr": 0.25}], momentum=0.125
39 |                 )
40 |             )
41 | 
42 |             model, optimizer = amp.initialize(
43 |                 model, optimizer, opt_level=opt_level, verbosity=0
44 |             )
45 | 
46 |             optimizer.zero_grad()
47 |             loss = model(self.x)
48 |             with amp.scale_loss(loss, optimizer) as scaled_loss:
49 |                 scaled_loss.backward()
50 |             optimizer.step()
51 | 
52 |             if opt_level != "O0":
53 |                 _amp_state.handle._deactivate()
54 | 
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     unittest.main()
59 | 


--------------------------------------------------------------------------------
/tests/L0/run_amp/test_multi_tensor_l2norm.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import functools as ft
 4 | import itertools as it
 5 | 
 6 | from apex import amp
 7 | import torch
 8 | from torch import nn
 9 | import torch.nn.functional as F
10 | 
11 | from utils import common_init, HALF, FLOAT,\
12 |     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT, common_reset
13 | 
14 | try:
15 |   import amp_C
16 |   from amp_C import multi_tensor_l2norm
17 |   from apex.multi_tensor_apply import MultiTensorApply
18 |   disabled = False
19 | except ImportError as err:
20 |   print("amp_C fused kernels unavailable, disabling TestMultiTensorApply.  ImportError was ", err)
21 |   disabled = True
22 | 
23 | 
24 | class TestMultiTensorL2Norm(unittest.TestCase):
25 | 
26 |     def setUp(self):
27 |         common_init(self)
28 |         self.val = 4.0
29 |         self.overflow_buf = torch.tensor(1, dtype=torch.int, device='cuda').zero_()
30 | 
31 |     def tearDown(self):
32 |         common_reset(self)
33 | 
34 |     # The tensor creation here is written for convenience, not speed.
35 |     def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type, per_tensor):
36 |         self.overflow_buf.zero_()
37 |         a = torch.cuda.FloatTensor(sizea).fill_(self.val)
38 |         b = torch.cuda.FloatTensor(sizeb).fill_(self.val)
39 | 
40 |         in_list = []
41 |         for i in range(repeat_tensors):
42 |             in_list += [a.clone().to(in_type), b.clone().to(in_type)]
43 | 
44 |         if per_tensor:
45 |             norm, norm_per_tensor = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], True)
46 |             normab = torch.cat((a.norm().view(1), b.norm().view(1)))
47 |             norm_per_tensor = norm_per_tensor.view(-1, 2)
48 |         else:
49 |             norm, _ = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], True)
50 | 
51 |         reference = torch.cuda.FloatTensor((sizea + sizeb)*repeat_tensors).fill_(self.val).norm()
52 | 
53 |         self.assertTrue(torch.allclose(norm, reference))
54 |         if per_tensor:
55 |           self.assertTrue(torch.allclose(norm_per_tensor, normab))
56 |         self.assertTrue(self.overflow_buf.item() == 0)
57 | 
58 |     @unittest.skipIf(disabled, "amp_C is unavailable")
59 |     def test_fuzz(self):
60 |         input_size_pairs = (
61 |             (7777*77, 555*555),
62 |             (777, 555),
63 |             (555, 2048*32+1),
64 |             (2048*32+1, 555),
65 |             (555, 2048*32),
66 |             (2048*32, 555),
67 |             (33333, 555),
68 |             (555, 33333))
69 |         appliers = (
70 |             MultiTensorApply(2048*32),
71 |             MultiTensorApply(333),
72 |             MultiTensorApply(33333))
73 |         repeat_tensors = (
74 |             1,
75 |             55)
76 | 
77 |         for sizea, sizeb in input_size_pairs:
78 |           for applier in appliers:
79 |             for repeat in repeat_tensors:
80 |               for in_type in (torch.float32, torch.float16):
81 |                 for per_tensor in (False, True):
82 |                   self.l2norm(sizea, sizeb, applier, repeat, in_type, per_tensor)
83 | 
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     unittest.main()
88 | 


--------------------------------------------------------------------------------
/tests/L0/run_amp/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | HALF = 'torch.cuda.HalfTensor'
 4 | FLOAT = 'torch.cuda.FloatTensor'
 5 | BFLOAT16 = 'torch.cuda.BFloat16Tensor'
 6 | 
 7 | DTYPES = [torch.half, torch.float]
 8 | 
 9 | DTYPES2 = [torch.bfloat16, torch.float]
10 | 
11 | ALWAYS_HALF = {torch.float: HALF,
12 |                torch.half: HALF}
13 | ALWAYS_BFLOAT16 = {torch.bfloat16: BFLOAT16,
14 |                    torch.float: BFLOAT16}
15 | ALWAYS_FLOAT = {torch.float: FLOAT,
16 |                 torch.half: FLOAT}
17 | MATCH_INPUT = {torch.float: FLOAT,
18 |                torch.half: HALF,
19 |                torch.bfloat16: BFLOAT16}
20 | 
21 | def common_init(test_case):
22 |     test_case.h = 64
23 |     test_case.b = 16
24 |     test_case.c = 16
25 |     test_case.k = 3
26 |     test_case.t = 10
27 |     torch.set_default_device('cuda')
28 |     torch.set_default_dtype(torch.float)
29 | 
30 | 
31 | def common_reset(test_case):
32 |     torch.set_default_device('cpu')
33 | 


--------------------------------------------------------------------------------
/tests/L0/run_fp16util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/tests/L0/run_fp16util/__init__.py


--------------------------------------------------------------------------------
/tests/L0/run_fp16util/test_fp16util.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from apex.fp16_utils import FP16Model
 7 | 
 8 | 
 9 | class DummyBlock(nn.Module):
10 |     def __init__(self):
11 |         super(DummyBlock, self).__init__()
12 | 
13 |         self.conv = nn.Conv2d(10, 10, 2)
14 |         self.bn = nn.BatchNorm2d(10, affine=True)
15 | 
16 |     def forward(self, x):
17 |         return self.conv(self.bn(x))
18 | 
19 | 
20 | class DummyNet(nn.Module):
21 |     def __init__(self):
22 |         super(DummyNet, self).__init__()
23 | 
24 |         self.conv1 = nn.Conv2d(3, 10, 2)
25 |         self.bn1 = nn.BatchNorm2d(10, affine=False)
26 |         self.db1 = DummyBlock()
27 |         self.db2 = DummyBlock()
28 | 
29 |     def forward(self, x):
30 |         out = x
31 |         out = self.conv1(out)
32 |         out = self.bn1(out)
33 |         out = self.db1(out)
34 |         out = self.db2(out)
35 |         return out
36 | 
37 | 
38 | class DummyNetWrapper(nn.Module):
39 |     def __init__(self):
40 |         super(DummyNetWrapper, self).__init__()
41 | 
42 |         self.bn = nn.BatchNorm2d(3, affine=True)
43 |         self.dn = DummyNet()
44 | 
45 |     def forward(self, x):
46 |         return self.dn(self.bn(x))
47 | 
48 | 
49 | class TestFP16Model(unittest.TestCase):
50 |     def setUp(self):
51 |         self.N = 64
52 |         self.C_in = 3
53 |         self.H_in = 16
54 |         self.W_in = 32
55 |         self.in_tensor = torch.randn((self.N, self.C_in, self.H_in, self.W_in)).cuda()
56 |         self.orig_model = DummyNetWrapper().cuda()
57 |         self.fp16_model = FP16Model(self.orig_model)
58 | 
59 |     def test_params_and_buffers(self):
60 |         exempted_modules = [
61 |             self.fp16_model.network.bn,
62 |             self.fp16_model.network.dn.db1.bn,
63 |             self.fp16_model.network.dn.db2.bn,
64 |         ]
65 |         for m in self.fp16_model.modules():
66 |             expected_dtype = torch.float if (m in exempted_modules) else torch.half
67 |             for p in m.parameters(recurse=False):
68 |                 assert p.dtype == expected_dtype
69 |             for b in m.buffers(recurse=False):
70 |                 assert b.dtype in (expected_dtype, torch.int64)
71 | 
72 |     def test_output_is_half(self):
73 |         out_tensor = self.fp16_model(self.in_tensor)
74 |         assert out_tensor.dtype == torch.half
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     unittest.main()
79 | 


--------------------------------------------------------------------------------
/tests/L0/run_fused_dense/test_fused_dense.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import unittest
 3 | import torch.nn.functional as F
 4 | from apex import fused_dense
 5 | from torch import nn
 6 | from apex import amp
 7 | 
 8 | class FusedDenseTest(unittest.TestCase):
 9 |     def setUp(self, seed=0):
10 |         torch.manual_seed(seed)
11 |         # torch.cuda.manual_seed_all(seed)
12 | 
13 |         self.seq_length   = 512
14 |         self.sequences    = 3
15 |         self.hidden_dim   = 1024
16 | 
17 |         self.ref_inputs = torch.randn(self.sequences*self.seq_length, self.hidden_dim,
18 |                                       dtype=torch.float16, device=torch.device("cuda")).half().requires_grad_(True)
19 | 
20 |         self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True)
21 |         self.dense = fused_dense.FusedDense(1024, 3072)
22 |         self.dense.half()
23 |         self.dense.cuda()
24 | 
25 | 
26 |     def test_fused_dense(self) :
27 |         y_tst = self.dense(self.tst_inputs)
28 |         y_ref = torch.matmul(self.ref_inputs,self.dense.weight.t())+self.dense.bias
29 |         dy = torch.randn_like(y_tst).half()
30 |         y_tst.backward(dy)
31 |         dw_ref = torch.matmul(dy.t(), self.ref_inputs)
32 |         dx_ref = torch.matmul(dy, self.dense.weight.clone())
33 |         db_ref = dy.sum(0, False)
34 | 
35 |         self.assertTrue(torch.allclose(self.ref_inputs,  self.tst_inputs,        atol=1e-5, rtol=1e-5))
36 |         self.assertTrue(torch.allclose(y_ref,            y_tst,                  atol=1e-3, rtol=1e-3, equal_nan=True))
37 |         self.assertTrue(torch.allclose(dw_ref,           self.dense.weight.grad, atol=1e-3, rtol=1e-3, equal_nan=True))
38 |         self.assertTrue(torch.allclose(dx_ref,           self.tst_inputs.grad,   atol=1e-3, rtol=1e-3, equal_nan=True))
39 |         self.assertTrue(torch.allclose(db_ref,           self.dense.bias.grad,   atol=1e-3, rtol=1e-3, equal_nan=True))
40 | 
41 | 
42 | if __name__ == '__main__':
43 |     unittest.main()


--------------------------------------------------------------------------------
/tests/L0/run_fused_dense/test_gelu.py:
--------------------------------------------------------------------------------
 1 | from apex import fused_dense 
 2 | import torch
 3 | import torch.nn.functional as F
 4 | import unittest
 5 | 
 6 | 
 7 | class FusedDenseGeluDenseTest(unittest.TestCase):
 8 | 
 9 |     def test_fused_dense_gelu_dense(self) :
10 |         batch_size   = 4
11 |         in_features  = 3
12 |         intermediate_features = 3
13 |         out_features = 2
14 | 
15 |         #tst_dtype = torch.float8_e4m3
16 |         # tst_dtype = torch.float8_e5m2
17 |         tst_dtype = torch.float16
18 | 
19 |         I = torch.randn(batch_size, in_features, dtype=tst_dtype, device='cuda')
20 | 
21 |         denseGelu = fused_dense.FusedDenseGeluDense(in_features, intermediate_features, out_features)
22 |         denseGelu.to(dtype=tst_dtype)
23 |         denseGelu.cuda()
24 | 
25 |         #get weight and bias from the denseGelu module
26 |         W1 = denseGelu.weight1
27 |         b1 = denseGelu.bias1
28 |         W2 = denseGelu.weight2
29 |         b2 = denseGelu.bias2
30 | 
31 |         C1  = torch.matmul(I, W1.t())+b1
32 |         gelu_output = F.gelu(C1)
33 |         y_ref = torch.matmul(gelu_output, W2.t())+b2
34 |         y_tst = denseGelu(I)
35 |         torch.testing.assert_close(y_ref,  y_tst,  atol=1e-3, rtol=1e-3, equal_nan=True)
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     unittest.main()
40 | 


--------------------------------------------------------------------------------
/tests/L0/run_optimizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/tests/L0/run_optimizers/__init__.py


--------------------------------------------------------------------------------
/tests/L0/run_rocm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | APEX_TEST_WITH_ROCM=1 APEX_SKIP_FLAKY_TEST=1 python run_test.py
3 | 


--------------------------------------------------------------------------------
/tests/L0/run_test.py:
--------------------------------------------------------------------------------
 1 | """L0 Tests Runner.
 2 | 
 3 | How to run this script?
 4 | 
 5 | 1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
 6 | 2. Run one of the tests (e.g. fused layer norm):
 7 |     `python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
 8 | 3. Run two or more of the tests (e.g. optimizers and fused layer norm):
 9 |     `python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
10 | """
11 | import argparse
12 | import os
13 | import unittest
14 | import sys
15 | 
16 | from apex.testing.common_utils import TEST_WITH_ROCM
17 | from apex.testing.common_utils import SKIP_FLAKY_TEST
18 | 
19 | TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
20 | 
21 | #the tests that are allowed
22 | TEST_DIRS = [
23 |     "run_amp",
24 |     "run_fp16util",
25 |     "run_optimizers",
26 |     "run_fused_layer_norm",
27 |     "run_mlp",
28 |     "run_fused_dense",
29 |     "run_transformer",       
30 | ]
31 | 
32 | #the tests that are run by default
33 | DEFAULT_TEST_DIRS = [
34 |     "run_amp",
35 |     "run_fp16util",
36 |     "run_optimizers",
37 |     "run_fused_layer_norm",
38 |     "run_mlp",
39 |     "run_fused_dense",
40 |     "run_transformer",
41 | ]
42 | 
43 | 
44 | def parse_args():
45 |     parser = argparse.ArgumentParser(
46 |         description="L0 test runner",
47 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
48 |     )
49 |     parser.add_argument(
50 |         "--include",
51 |         nargs="+",
52 |         choices=TEST_DIRS,
53 |         default=DEFAULT_TEST_DIRS,
54 |         help="select a set of tests to run (defaults to ALL tests).",
55 |     )
56 |     args, _ = parser.parse_known_args()
57 |     return args
58 | 
59 | 
60 | def main(args):
61 |     runner = unittest.TextTestRunner(verbosity=2)
62 |     errcode = 0
63 |     for test_dir in args.include:
64 |         test_dir = os.path.join(TEST_ROOT, test_dir)
65 |         print(test_dir)
66 |         suite = unittest.TestLoader().discover(test_dir)
67 | 
68 |         print("\nExecuting tests from " + test_dir)
69 |         result = runner.run(suite)
70 |         if not result.wasSuccessful():
71 |             errcode = 1
72 | 
73 |     sys.exit(errcode)
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     args = parse_args()
78 |     main(args)
79 | 
80 | 


--------------------------------------------------------------------------------
/tests/L0/run_transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ROCm/apex/89c37c81523484bf0c5b75054e0208952b8fe710/tests/L0/run_transformer/__init__.py


--------------------------------------------------------------------------------
/tests/L0/run_transformer/test_data.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import torch.testing
 4 | from torch.testing._internal import common_utils
 5 | 
 6 | logging.getLogger("torch").setLevel(logging.WARNING)
 7 | 
 8 | from apex.transformer import parallel_state
 9 | from apex.transformer.tensor_parallel import data as data_utils
10 | from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase
11 | from apex.transformer.testing.distributed_test_base import UccDistributedTestBase
12 | 
13 | logging.getLogger("torch").setLevel(logging.WARNING)
14 | 
15 | 
16 | class BroadcastDataTestBase:
17 |     def test_broadcast_data(self):
18 |         tensor_model_parallel_world_size: int = self.world_size // (
19 |             1 + self.world_size > 1
20 |         )
21 |         parallel_state.initialize_model_parallel(
22 |             tensor_model_parallel_size_=tensor_model_parallel_world_size
23 |         )
24 | 
25 |         target_key_size = {
26 |             "key1": [7, 11],
27 |             "key2": [8, 2, 1],
28 |             "key3": [13],
29 |             "key4": [5, 1, 2],
30 |             "key5": [5, 12],
31 |         }
32 |         keys = [k for k in target_key_size]
33 | 
34 |         data = {}
35 |         data_t = {}
36 |         with torch.no_grad():
37 |             for key in target_key_size:
38 |                 data[key] = torch.randint(0, 1000, size=target_key_size[key])
39 |                 data_t[key] = data[key].clone()
40 |             # "key_x" is supposed to be ignored.
41 |             data["key_x"] = torch.rand(5)
42 |             data_t["key_x"] = data["key_x"].clone()
43 |         if parallel_state.get_tensor_model_parallel_rank() != 0:
44 |             data = None
45 | 
46 |         data_utils._check_data_types(keys, data_t, torch.int64)
47 |         key_size, _, _ = data_utils._build_key_size_numel_dictionaries(keys, data)
48 | 
49 |         for key in keys:
50 |             self.assertEqual(target_key_size[key], key_size[key])
51 | 
52 |         broadcasted_data = data_utils.broadcast_data(keys, data, torch.int64)
53 |         for key in keys:
54 |             self.assertEqual(broadcasted_data[key], data_t[key].cuda())
55 | 
56 |         parallel_state.destroy_model_parallel()
57 | 
58 | 
59 | class NcclBroadcastDataTest(BroadcastDataTestBase, NcclDistributedTestBase): pass
60 | class UccBroadcastDataTest(BroadcastDataTestBase, UccDistributedTestBase): pass
61 | 
62 | 
63 | if __name__ == "__main__":
64 |     common_utils.run_tests()
65 | 


--------------------------------------------------------------------------------
/tests/L0/run_transformer/test_fused_bias_swiglu.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import fused_bias_swiglu
 3 | from torch.testing._internal import common_utils
 4 | import torch.nn.functional as F
 5 | 
 6 | 
 7 | class TestFusedBiasSwiGLU(common_utils.TestCase):
 8 | 
 9 |     def swiglu(self, y):
10 |         y_1, y_2 = torch.chunk(y, 2, -1)
11 |         return F.silu(y_1) * y_2
12 | 
13 |     def bias_swiglu(self, y, bias):
14 |         y = y + bias
15 |         return self.swiglu(y)
16 | 
17 |     def swiglu_back(self, g, y):
18 |         y_1, y_2 = torch.chunk(y, 2, -1)
19 |         return torch.cat(
20 |             (g * torch.sigmoid(y_1) * (1 + y_1 * (1 - torch.sigmoid(y_1))) * y_2, g * F.silu(y_1)), -1
21 |         )
22 | 
23 |     def bias_swiglu_back(self, g, y, bias):
24 |         y = y + bias
25 |         return self.swiglu_back(g, y)
26 | 
27 |     def test_fused_bias_swiglu(self):
28 |         # Inputs
29 |         batch_size, hidden_dim = 16, 512
30 |         dtypes = [torch.float32, torch.float64, torch.float16]
31 | 
32 |         for dtype in dtypes:
33 |             print(f"Testing with data type: {dtype}")
34 |             input = torch.randn(batch_size, hidden_dim, device="cuda", dtype=dtype)
35 |             bias = torch.randn(hidden_dim, device="cuda", dtype=dtype)
36 | 
37 |             try:
38 |                 actual = fused_bias_swiglu.forward(input, bias)
39 |                 expected = self.bias_swiglu(input, bias)
40 | 
41 |                 self.assertEqual(actual, expected, atol=1e-3, rtol=1e-3)
42 | 
43 |                 grad_output = torch.randn(batch_size, hidden_dim // 2, device="cuda", dtype=dtype)  # Output gradient
44 |                 actual_grad = fused_bias_swiglu.backward(grad_output, input, bias)
45 |                 expected_grad = self.bias_swiglu_back(grad_output, input, bias)
46 |                 self.assertEqual(actual_grad, expected_grad, atol=1e-3, rtol=1e-3)
47 | 
48 |                 print(f"Test succeeded for data type: {dtype}")
49 |             except AssertionError as e:
50 |                 print(f"Test failed for data type: {dtype}")
51 |                 print(e)
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     common_utils.run_tests()


--------------------------------------------------------------------------------
/tests/L0/run_transformer/test_transformer_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import torch
 4 | from torch.testing._internal import common_utils
 5 | 
 6 | logging.getLogger("torch").setLevel(logging.WARNING)
 7 | 
 8 | from apex.transformer import parallel_state
 9 | from apex.transformer.tensor_parallel import utils
10 | from apex.transformer.testing.distributed_test_base import NcclDistributedTestBase
11 | 
12 | logging.getLogger("apex").setLevel(logging.WARNING)
13 | 
14 | 
15 | class TransformerUtilsTest(NcclDistributedTestBase):
16 |     def test_split_tensor_along_last_dim(self):
17 |         for tensor_model_paralell_world_size in range(1, self.world_size + 1):
18 |             if self.world_size % tensor_model_paralell_world_size > 0:
19 |                 continue
20 |             with self.subTest(
21 |                 tensor_model_paralell_world_size=tensor_model_paralell_world_size
22 |             ):
23 |                 parallel_state.initialize_model_parallel(
24 |                     tensor_model_parallel_size_=tensor_model_paralell_world_size
25 |                 )
26 | 
27 |                 device = "cpu"
28 |                 input_tensor = torch.randn((100, 100, 100), device=device)
29 |                 splits = utils.split_tensor_along_last_dim(input_tensor, 10)
30 |                 last_dim_shapes = torch.tensor(
31 |                     [int(split.size()[-1]) for split in splits]
32 |                 )
33 | 
34 |                 self.assertTrue(torch.equal(last_dim_shapes, torch.full((10,), 10),))
35 | 
36 |                 parallel_state.destroy_model_parallel()
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     common_utils.run_tests()
41 | 


--------------------------------------------------------------------------------
/tests/L1/common/compare.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import torch
 3 | 
 4 | parser = argparse.ArgumentParser(description='Compare')
 5 | parser.add_argument('--opt-level', type=str)
 6 | parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
 7 | parser.add_argument('--loss-scale', type=str, default=None)
 8 | parser.add_argument('--fused-adam', action='store_true')
 9 | parser.add_argument('--use_baseline', action='store_true')
10 | args = parser.parse_args()
11 | 
12 | base_file = str(args.opt_level) + "_" +\
13 |             str(args.loss_scale) + "_" +\
14 |             str(args.keep_batchnorm_fp32) + "_" +\
15 |             str(args.fused_adam)
16 | 
17 | file_e = "True_" + base_file
18 | file_p = "False_" + base_file
19 | if args.use_baseline:
20 |     file_b = "baselines/True_" + base_file
21 | 
22 | dict_e = torch.load(file_e)
23 | dict_p = torch.load(file_p)
24 | if args.use_baseline:
25 |     dict_b = torch.load(file_b)
26 | 
27 | torch.set_printoptions(precision=10)
28 | 
29 | print(file_e)
30 | print(file_p)
31 | if args.use_baseline:
32 |     print(file_b)
33 | 
34 | # ugly duplication here...
35 | if not args.use_baseline:
36 |     for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
37 |         assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
38 | 
39 |         loss_e = dict_e["Loss"][n]
40 |         loss_p = dict_p["Loss"][n]
41 |         assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
42 |         print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
43 |               i_e,
44 |               loss_e,
45 |               loss_p,
46 |               dict_e["Speed"][n],
47 |               dict_p["Speed"][n]))
48 | else:
49 |     for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
50 |         assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
51 | 
52 |         loss_e = dict_e["Loss"][n]
53 |         loss_p = dict_p["Loss"][n]
54 |         loss_b = dict_b["Loss"][n]
55 |         assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
56 |         assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)
57 |         print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
58 |               i_e,
59 |               loss_b,
60 |               loss_e,
61 |               loss_p,
62 |               dict_b["Speed"][n],
63 |               dict_e["Speed"][n],
64 |               dict_p["Speed"][n]))
65 | 


--------------------------------------------------------------------------------
/tests/L1/cross_product/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
4 | # DATADIR="/opt/home/apex/examples/imagenet/"
5 | cp ../common/* .
6 | bash run_test.sh single_gpu $1
7 | 


--------------------------------------------------------------------------------
/tests/L1/cross_product_distributed/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cp ../common/* .
4 | bash run_test.sh distributed $1
5 | 


--------------------------------------------------------------------------------
/tests/distributed/DDP/ddp_race_condition_test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.distributed as dist
 3 | from torch.nn import Parameter
 4 | from torch.nn import Module
 5 | from apex.parallel import DistributedDataParallel as DDP
 6 | import argparse
 7 | import os
 8 | 
 9 | 
10 | parser = argparse.ArgumentParser(description='allreduce hook example')
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | args = parser.parse_args()
13 | 
14 | args.distributed = False
15 | if 'WORLD_SIZE' in os.environ:
16 |     args.distributed = int(os.environ['WORLD_SIZE']) > 1
17 | 
18 | if args.distributed:
19 |     args.gpu = args.local_rank % torch.cuda.device_count()
20 |     torch.cuda.set_device(args.gpu)
21 |     torch.distributed.init_process_group(backend='nccl',
22 |                                          init_method='env://')
23 |     args.world_size = torch.distributed.get_world_size()
24 | 
25 | torch.set_printoptions(precision=10)
26 | torch.manual_seed(args.local_rank)
27 | 
28 | class Model(Module):
29 |     def __init__(self):
30 |         super(Model, self).__init__()
31 |         self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0))
32 |         self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0))
33 |     def forward(self, input):
34 |         return (input*self.a)*self.b
35 | 
36 | model = Model()
37 | # model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
38 | # model = DDP(model, delay_allreduce=True)
39 | # model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])
40 | model = DDP(model, message_size=1, allreduce_trigger_params=[model.b], num_allreduce_streams=3)
41 | 
42 | x = torch.cuda.FloatTensor(4096*4096)
43 | 
44 | passed = True
45 | torch.cuda.cudart().cudaProfilerStart()
46 | for i in range(10):
47 |     x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
48 |     model.zero_grad()
49 |     out = model(x)
50 |     loss = out.sum()
51 |     # torch.cuda.nvtx.range_push("backward")
52 |     loss.backward()
53 |     # torch.cuda.nvtx.range_pop()
54 |     
55 |     # torch.cuda.nvtx.range_push("synchronize() + info")
56 |     # torch.cuda.synchronize()
57 |     print("i = {}".format(i))
58 |     def info(name, param, val):
59 |         expected = val*4096*4096*(2.*i+1)/2.
60 |         actual = param.grad.data.sum().item()
61 |         print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
62 |               param.grad.data_ptr(), expected, actual))
63 |         return (expected == actual)
64 |     if not info("model.a", model.module.a, 2.):  passed = False
65 |     if not info("model.b", model.module.b, 1.):  passed = False
66 |     # torch.cuda.nvtx.range_pop()
67 | torch.cuda.cudart().cudaProfilerStop()
68 | 
69 | print("passed = ", passed)
70 | 


--------------------------------------------------------------------------------
/tests/distributed/DDP/run_race_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py
4 | 


--------------------------------------------------------------------------------
/tests/distributed/amp_master_params/amp_master_params.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import os
 4 | from apex import amp
 5 | # FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
 6 | from apex.parallel import DistributedDataParallel
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | # FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
10 | # automatically by torch.distributed.launch.
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | parser.add_argument("--opt_level", default="O2", type=str)
13 | args = parser.parse_args()
14 | 
15 | # FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
16 | # the 'WORLD_SIZE' environment variable will also be set automatically.
17 | args.distributed = False
18 | if 'WORLD_SIZE' in os.environ:
19 |     args.distributed = int(os.environ['WORLD_SIZE']) > 1
20 | 
21 | if args.distributed:
22 |     # FOR DISTRIBUTED:  Set the device according to local_rank.
23 |     torch.cuda.set_device(args.local_rank)
24 | 
25 |     # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
26 |     # environment variables, and requires that you use init_method=`env://`.
27 |     torch.distributed.init_process_group(backend='nccl',
28 |                                          init_method='env://')
29 | 
30 |     torch.manual_seed(torch.distributed.get_rank())
31 | 
32 | torch.backends.cudnn.benchmark = True
33 | 
34 | N, D_in, D_out = 64, 1024, 16
35 | 
36 | # Each process receives its own batch of "fake input data" and "fake target data."
37 | # The "training loop" in each process just uses this fake batch over and over.
38 | # https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
39 | # example of distributed data sampling for both training and validation.
40 | x = torch.randn(N, D_in, device='cuda')
41 | y = torch.randn(N, D_out, device='cuda')
42 | 
43 | model = torch.nn.Linear(D_in, D_out).cuda()
44 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
45 | 
46 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)
47 | 
48 | if args.distributed:
49 |     # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
50 |     # apex.parallel.DistributedDataParallel.
51 |     model = DistributedDataParallel(model)
52 |     # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
53 |     # model = torch.nn.parallel.DistributedDataParallel(model,
54 |     #                                                   device_ids=[args.local_rank],
55 |     #                                                   output_device=args.local_rank)
56 | 
57 | loss_fn = torch.nn.MSELoss()
58 | 
59 | for t in range(500):
60 |     optimizer.zero_grad()
61 |     y_pred = model(x)
62 |     loss = loss_fn(y_pred, y)
63 |     with amp.scale_loss(loss, optimizer) as scaled_loss:
64 |         scaled_loss.backward()
65 |     optimizer.step()
66 | 
67 | if args.local_rank == 0:
68 |     print("final loss = ", loss)
69 | 
70 | torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
71 | torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(torch.distributed.get_rank()))
72 | 


--------------------------------------------------------------------------------
/tests/distributed/amp_master_params/compare.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | model_params_rank0 = torch.load("rank0model.pth",
 4 |                            map_location = lambda storage, loc: storage.cuda(0))
 5 | model_params_rank1 = torch.load("rank1model.pth",
 6 |                                  map_location = lambda storage, loc: storage.cuda(0))
 7 | master_params_rank0 = torch.load("rank0master.pth",
 8 |                                  map_location = lambda storage, loc: storage.cuda(0))
 9 | master_params_rank1 = torch.load("rank1master.pth",
10 |                                  map_location = lambda storage, loc: storage.cuda(0))
11 | 
12 | for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
13 |         model_params_rank0,
14 |         model_params_rank1,
15 |         master_params_rank0,
16 |         master_params_rank1):
17 |     # converting model params to float is a hack since allclose doesn't support bfloat16 yet.
18 |     model_rank0 = model_rank0.float()
19 |     model_rank1 = model_rank1.float()
20 |     assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
21 |     assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
22 |     # Some debugging/investigation assistance code:
23 |     # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
24 |     # offending_val_half = model_rank0.view(-1)[maxind.item()]
25 |     # offending_val_float = master_rank0.view(-1)[maxind.item()]
26 |     # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
27 |     #       offending_val_float.half().item())
28 |     # rtol needs to be > 2^-11 because of denormals...
29 |     assert torch.allclose(model_rank0, master_rank0, rtol=.005), "Model-master mismatch"
30 | 
31 | print("OK:  Model and master params match across ranks.")
32 | 


--------------------------------------------------------------------------------
/tests/distributed/amp_master_params/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
3 | 
4 | python compare.py
5 | 


--------------------------------------------------------------------------------
/tests/distributed/run_rocm_distributed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | set -e
 3 | 
 4 | # To run the test on 2 gpus
 5 | export WORLD_SIZE=2
 6 | 
 7 | torchrun=`dirname \`which python\``/torchrun
 8 | 
 9 | # Test with opt_level="O2"
10 | echo "running opt_level O2"
11 | # python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
12 | python $torchrun --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
13 | python amp_master_params/compare.py
14 | 
15 | # delete the model files
16 | echo -e "O2 test completed. Deleting model files\n"
17 | rm rank0model.pth
18 | rm rank1model.pth
19 | rm rank0master.pth
20 | rm rank1master.pth
21 | 
22 | 
23 | # Test with opt_level="O5"
24 | #echo "running opt_level O5"
25 | #python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
26 | #python amp_master_params/compare.py
27 | 
28 | ## delete the model files
29 | #echo "O5 test completed. Deleting model files"
30 | #rm rank0model.pth
31 | #rm rank1model.pth
32 | #rm rank0master.pth
33 | #rm rank1master.pth
34 | 
35 | ## Run the Sync BN Tests.
36 | echo "Running syncbn tests"
37 | python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py
38 | python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py --fp16
39 | python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
40 | echo "Running syncbn python only tests"
41 | python synced_batchnorm/python_single_gpu_unit_test.py
42 | echo "Running syncbn batchnorm1d tests"
43 | python synced_batchnorm/test_batchnorm1d.py 
44 | #beware, you need a system with at least 4 gpus to test group_size<world_size    (currently fail both on upstream and rocm fork)
45 | #python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
46 | 
47 | ## Run the DDP Tests
48 | echo "running DDP tests"
49 | HIP_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 DDP/ddp_race_condition_test.py
50 | 


--------------------------------------------------------------------------------
/tests/distributed/synced_batchnorm/test_batchnorm1d.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import apex
 3 | 
 4 | model = apex.parallel.SyncBatchNorm(4).cuda()
 5 | model.weight.data.uniform_()
 6 | model.bias.data.uniform_()
 7 | data = torch.rand((8,4)).cuda()
 8 | 
 9 | model_ref = torch.nn.BatchNorm1d(4).cuda()
10 | model_ref.load_state_dict(model.state_dict())
11 | data_ref = data.clone()
12 | 
13 | output = model(data)
14 | output_ref = model_ref(data_ref)
15 | 
16 | assert(output.allclose(output_ref))
17 | assert(model.running_mean.allclose(model_ref.running_mean))
18 | assert(model.running_var.allclose(model_ref.running_var))
19 | 


--------------------------------------------------------------------------------
/tests/distributed/synced_batchnorm/unit_test.sh:
--------------------------------------------------------------------------------
1 | python python_single_gpu_unit_test.py || exit 1
2 | python single_gpu_unit_test.py || exit 1
3 | python test_batchnorm1d.py || exit 1
4 | torchrun --nnodes 1 --nproc-per-node 2  two_gpu_unit_test.py || exit 1
5 | torchrun --nnodes 1 --nproc-per-node 2  two_gpu_unit_test.py --fp16  || exit 1
6 | torchrun --nnodes 1 --nproc-per-node 2  two_gpu_test_different_batch_size.py --apex || exit 1
7 | #beware, you need a system with at least 4 gpus to test group_size<world_size
8 | # torchrun --nnodes 1 --nproc-per-node 4 test_groups.py --group_size=2 
9 | 


--------------------------------------------------------------------------------
/tests/docker_extension_builds/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | print_banner() {
 4 |   printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
 5 | }
 6 | 
 7 | print_green() {
 8 |   printf "\e[30m\e[42m$1\e[0m\n"
 9 | }
10 | 
11 | print_red() {
12 |   printf "\e[30m\e[41m$1\e[0m\n"
13 | }
14 | 
15 | images=(
16 | "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.08-py3-devel"
17 | "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
18 | "pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
19 | "pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"
20 | "pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
21 | "pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
22 | "pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
23 | )
24 | 
25 | branch="master"
26 | 
27 | # Associative array for exit codes
28 | declare -A exit_codes
29 | for image in images
30 | do
31 |   exit_codes[$image]="None"
32 | done
33 | 
34 | for image in "${images[@]}"
35 | do
36 |   print_banner "$image"
37 |   set -x
38 |   docker pull $image
39 |   # Trying python setup.py install instead of pip install to ensure direct access to error codes.
40 |   # Maybe pip install would be ok too but this works.
41 |   docker run --runtime=nvidia --rm $image /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e;  python setup.py install --cuda_ext --cpp_ext"
42 |   exit_code=$?
43 |   set +x
44 |   if [ $exit_code != 0 ]
45 |   then
46 |     print_red "Exit code: $exit_code"
47 |   else
48 |     print_green "Exit code: $exit_code"
49 |   fi
50 |   exit_codes[$image]=$exit_code
51 | done
52 | 
53 | success=0
54 | for image in "${images[@]}"
55 | do
56 |   exit_code=${exit_codes[$image]}
57 |   if [ $exit_code != 0 ]
58 |   then
59 |     print_red "$image : $exit_code"
60 |     success=1
61 |   else
62 |     print_green "$image : $exit_code"
63 |   fi
64 | done
65 | 
66 | if [ $success != 0 ]
67 | then
68 |   print_red "Overall status:  failure"
69 | else
70 |   print_green "Overall status:  success"
71 | fi
72 | 
73 | exit $success
74 | 


--------------------------------------------------------------------------------
/version.txt:
--------------------------------------------------------------------------------
1 | 1.8.0a0
2 | 


--------------------------------------------------------------------------------