├── README.md ├── apex ├── RNN │ ├── README.md │ ├── RNNBackend.py │ ├── __init__.py │ ├── cells.py │ └── models.py ├── __init__.py ├── amp │ ├── README.md │ ├── __init__.py │ ├── __version__.py │ ├── _amp_state.py │ ├── _initialize.py │ ├── _process_optimizer.py │ ├── amp.py │ ├── compat.py │ ├── frontend.py │ ├── handle.py │ ├── lists │ │ ├── __init__.py │ │ ├── functional_overrides.py │ │ ├── tensor_overrides.py │ │ └── torch_overrides.py │ ├── opt.py │ ├── rnn_compat.py │ ├── scaler.py │ ├── utils.py │ └── wrap.py ├── contrib │ ├── __init__.py │ ├── bottleneck │ │ ├── __init__.py │ │ ├── bottleneck.py │ │ └── test.py │ ├── csrc │ │ ├── bottleneck │ │ │ └── bottleneck.cpp │ │ ├── fmha │ │ │ ├── fmha_api.cpp │ │ │ └── src │ │ │ │ ├── fmha.h │ │ │ │ ├── fmha │ │ │ │ ├── gemm.h │ │ │ │ ├── gmem_tile.h │ │ │ │ ├── kernel_traits.h │ │ │ │ ├── mask.h │ │ │ │ ├── smem_tile.h │ │ │ │ ├── softmax.h │ │ │ │ └── utils.h │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_kernel_1xN.h │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h │ │ │ │ ├── fmha_kernel.h │ │ │ │ ├── fmha_noloop_reduce.cu │ │ │ │ └── fmha_utils.h │ │ ├── groupbn │ │ │ ├── batch_norm.cu │ │ │ ├── batch_norm.h │ │ │ ├── batch_norm_add_relu.cu │ │ │ ├── batch_norm_add_relu.h │ │ │ ├── cuda_utils.h │ │ │ ├── interface.cpp │ │ │ ├── ipc.cu │ │ │ └── nhwc_batch_norm_kernel.h │ │ ├── layer_norm │ │ │ ├── ln_api.cpp │ │ │ ├── ln_bwd_semi_cuda_kernel.cu │ │ │ ├── ln_fwd_cuda_kernel.cu │ │ │ ├── ln_kernel_traits.h │ │ │ └── utils.cuh │ │ ├── multihead_attn │ │ │ ├── additive_masked_softmax_dropout.cpp │ │ │ ├── additive_masked_softmax_dropout_cuda.cu │ │ │ ├── dropout.h │ │ │ ├── encdec_multihead_attn.cpp │ │ │ ├── encdec_multihead_attn_cuda.cu │ │ │ ├── encdec_multihead_attn_norm_add.cpp │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu │ │ │ ├── layer_norm.h │ │ │ ├── masked_softmax_dropout.cpp │ │ │ ├── masked_softmax_dropout_cuda.cu │ │ │ ├── philox.h │ │ │ ├── self_multihead_attn.cpp │ │ │ ├── self_multihead_attn_bias.cpp │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu │ │ │ ├── self_multihead_attn_bias_cuda.cu │ │ │ ├── self_multihead_attn_cuda.cu │ │ │ ├── self_multihead_attn_norm_add.cpp │ │ │ ├── self_multihead_attn_norm_add_cuda.cu │ │ │ ├── softmax.h │ │ │ └── strided_batched_gemm.h │ │ ├── optimizers │ │ │ ├── fused_adam_cuda.cpp │ │ │ ├── fused_adam_cuda_kernel.cu │ │ │ ├── fused_lamb_cuda.cpp │ │ │ ├── fused_lamb_cuda_kernel.cu │ │ │ ├── multi_tensor_distopt_adam.cpp │ │ │ ├── multi_tensor_distopt_adam_kernel.cu │ │ │ ├── multi_tensor_distopt_lamb.cpp │ │ │ └── multi_tensor_distopt_lamb_kernel.cu │ │ ├── transducer │ │ │ ├── transducer_joint.cpp │ │ │ ├── transducer_joint_kernel.cu │ │ │ ├── transducer_loss.cpp │ │ │ └── transducer_loss_kernel.cu │ │ └── xentropy │ │ │ ├── interface.cpp │ │ │ └── xentropy_kernel.cu │ ├── examples │ │ └── multihead_attn │ │ │ ├── func_test_multihead_attn.py │ │ │ └── perf_test_multihead_attn.py │ ├── fmha │ │ ├── __init__.py │ │ └── fmha.py │ ├── 
groupbn │ │ ├── __init__.py │ │ └── batch_norm.py │ ├── layer_norm │ │ ├── __init__.py │ │ └── layer_norm.py │ ├── multihead_attn │ │ ├── MHA_bwd.png │ │ ├── MHA_fwd.png │ │ ├── README.md │ │ ├── __init__.py │ │ ├── encdec_multihead_attn.py │ │ ├── encdec_multihead_attn_func.py │ │ ├── fast_encdec_multihead_attn_func.py │ │ ├── fast_encdec_multihead_attn_norm_add_func.py │ │ ├── fast_self_multihead_attn_func.py │ │ ├── fast_self_multihead_attn_norm_add_func.py │ │ ├── mask_softmax_dropout_func.py │ │ ├── self_multihead_attn.py │ │ └── self_multihead_attn_func.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── fp16_optimizer.cpython-37.pyc │ │ │ ├── fused_adam.cpython-37.pyc │ │ │ └── fused_lamb.cpython-37.pyc │ │ ├── distributed_fused_adam.py │ │ ├── distributed_fused_adam_v2.py │ │ ├── distributed_fused_adam_v3.py │ │ ├── distributed_fused_lamb.py │ │ ├── fp16_optimizer.py │ │ ├── fused_adam.py │ │ ├── fused_lamb.py │ │ └── fused_sgd.py │ ├── sparsity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── asp.py │ │ ├── sparse_masklib.py │ │ └── test │ │ │ ├── checkpointing_test_part1.py │ │ │ ├── checkpointing_test_part2.py │ │ │ ├── checkpointing_test_reference.py │ │ │ └── toy_problem.py │ ├── test │ │ ├── fmha │ │ │ └── test_fmha.py │ │ ├── layer_norm │ │ │ └── test_fast_layer_norm.py │ │ ├── multihead_attn │ │ │ ├── test_encdec_multihead_attn.py │ │ │ ├── test_encdec_multihead_attn_norm_add.py │ │ │ ├── test_fast_self_multihead_attn_bias.py │ │ │ ├── test_mha_fused_softmax.py │ │ │ ├── test_self_multihead_attn.py │ │ │ └── test_self_multihead_attn_norm_add.py │ │ ├── test_label_smoothing.py │ │ └── transducer │ │ │ ├── test_transducer_joint.py │ │ │ ├── test_transducer_loss.py │ │ │ └── transducer_ref.py │ ├── transducer │ │ ├── __init__.py │ │ └── transducer.py │ └── xentropy │ │ ├── __init__.py │ │ └── softmax_xentropy.py ├── fp16_utils │ ├── README.md │ ├── __init__.py │ ├── fp16_optimizer.py │ ├── fp16util.py │ └── loss_scaler.py ├── mlp │ ├── __init__.py │ └── mlp.py ├── multi_tensor_apply │ ├── __init__.py │ └── multi_tensor_apply.py ├── normalization │ ├── __init__.py │ └── fused_layer_norm.py ├── optimizers │ ├── __init__.py │ ├── fused_adagrad.py │ ├── fused_adam.py │ ├── fused_lamb.py │ ├── fused_novograd.py │ └── fused_sgd.py ├── parallel │ ├── LARC.py │ ├── README.md │ ├── __init__.py │ ├── distributed.py │ ├── multiproc.py │ ├── optimized_sync_batchnorm.py │ ├── optimized_sync_batchnorm_kernel.py │ ├── sync_batchnorm.py │ └── sync_batchnorm_kernel.py ├── pyprof │ ├── FAQs.md │ ├── README.md │ ├── __init__.py │ ├── examples │ │ ├── .gitignore │ │ ├── apex │ │ │ ├── README.md │ │ │ ├── fused_adam.py │ │ │ ├── fused_layer_norm.py │ │ │ └── test.sh │ │ ├── custom_func_module │ │ │ ├── README.md │ │ │ ├── custom_function.py │ │ │ ├── custom_module.py │ │ │ └── test.sh │ │ ├── imagenet │ │ │ ├── imagenet.py │ │ │ └── test.sh │ │ ├── jit │ │ │ ├── README.md │ │ │ ├── jit_script_function.py │ │ │ ├── jit_script_method.py │ │ │ ├── jit_trace_function.py │ │ │ ├── jit_trace_method.py │ │ │ └── test.sh │ │ ├── lenet.py │ │ ├── operators.py │ │ ├── simple.py │ │ └── user_annotation │ │ │ ├── README.md │ │ │ ├── resnet.py │ │ │ └── test.sh │ ├── nvtx │ │ ├── __init__.py │ │ └── nvmarker.py │ ├── parse │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── db.py │ │ ├── kernel.py │ │ ├── nvvp.py │ │ └── parse.py │ └── prof │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── activation.py │ │ ├── base.py │ │ ├── blas.py │ │ ├── conv.py │ │ ├── 
convert.py │ │ ├── data.py │ │ ├── dropout.py │ │ ├── embedding.py │ │ ├── index_slice_join_mutate.py │ │ ├── linear.py │ │ ├── loss.py │ │ ├── misc.py │ │ ├── normalization.py │ │ ├── optim.py │ │ ├── output.py │ │ ├── pointwise.py │ │ ├── pooling.py │ │ ├── prof.py │ │ ├── randomSample.py │ │ ├── recurrentCell.py │ │ ├── reduction.py │ │ ├── softmax.py │ │ ├── usage.py │ │ └── utility.py └── reparameterization │ ├── README.md │ ├── __init__.py │ ├── reparameterization.py │ └── weight_norm.py ├── data ├── dataloader.py └── dataset.zip ├── generation_utils.py ├── model ├── KoBART │ ├── model.py │ └── processor.py ├── setting.py └── utils.py ├── modeling_bart.py ├── train.py └── train_test.sh /README.md: -------------------------------------------------------------------------------- 1 | # KoBART-pytorch 2 | 🧀 KoBART summarization using PyTorch + copy mechanism 3 | 4 | ## Data 5 | - Data statistics 6 | - Train Data : 29,432 7 | - Valid Data : 7,358 8 | - Test Data : 9,182 9 | 10 | ## How to Train 11 | - KoBART fine-tuning + Copy Mechanism 12 | > **Warning** 13 | > - Since the Python library itself is modified directly (the two files below overwrite the corresponding files in the installed `transformers` package), using a virtual environment is recommended. 😎 14 | > - generation_utils.py -> /site-packages/transformers/_generation_utils.py_ 15 | > - modeling_bart.py -> /site-packages/transformers/models/bart/_modeling_bart.py_ 16 | - bash train_test.sh 17 | ``` 18 | [Training] 19 | python train.py --train True --test False --batch_size 16 --max_len 512 --lr 5e-05 --epochs 10 20 | 21 | [Testing-rouge] 22 | python train.py --train False --test True --batch_size 16 --max_len 512 23 | ``` 24 | 25 | ## Model Performance 26 | - Test data's [ROUGE score](https://en.wikipedia.org/wiki/ROUGE_(metric)) 27 | ### Base 28 | | | rouge-1 |rouge-2|rouge-l| 29 | |-------|--------:|--------:|--------:| 30 | | Precision|0.5333|0.3463|0.4534| 31 | | Recall|0.5775|0.3737|0.4869| 32 | | F1|0.5381|0.3452|0.4555| 33 | 34 | ### Copy Mechanism 35 | | | rouge-1 |rouge-2|rouge-l| 36 | |-------|--------:|--------:|--------:| 37 | | Precision|0.5698|0.3776|0.4882| 38 | | Recall|0.5561|0.3612|0.4717| 39 | | F1|0.5460|0.3545|0.4654| 40 | 41 | ## Examples 42 | | | |Text| 43 | |-------|:--------|:--------| 44 |1|기사|경기도와 경기도시공사는 광교원천, 동탄호수공원, 성남판교 등 3개 지구에 건립 예정인 총 730가구의 경기행복주택 입주자를 모집한다고 26일 밝혔다. 모집 기간은 다음달 2일부터 11일까지이며, ‘경기도시공사 임대주택 청약센터(https://apply.gico.or.kr)’에서 인터넷 청약접수로 진행된다. 광교원천 경기행복주택은 전용면적 16㎡형 대학생 40가구와 청년 20가구, 26㎡형 청년 186가구와 고령자 24가구, 주거급여수급자 30가구까지 총 300가구를 모집한다. 보증금 2천729만4천~4천783만3천 원에 월 임대료는 11만8천~20만7천 원이다. 입주 예정은 오는 2020년 11월이다. 동탄호수공원 경기행복주택은 동탄2신도시에 6개동 995가구 조성되는 대규모 단지이다. 이번 입주자 모집에서는 공급면적 44㎡형 신혼부부 130가구를 우선 모집하며 임대조건은 보증금 5천만 원에 월 임대료 20만8천 원이다. 내년 12월 입주 예정으로, 나머지 가구는 연말에 모집할 예정이다 성남판교 경기행복주택은 전용면적 16㎡형 창업인 100가구와 청년 124가구, 26㎡형 청년 46가구와 고령자 30가구 등 총 300가구를 모집하며, 보증금 3천876만~6천992만 원에 월 임대료 14만5천~26만2천 원이다. 김준태 도 도시주택실장은 ""도내 청년층을 주요 대상으로 한 주거복지정책인 경기행복주택은 2022년까지 1만호를 공급하는 과정에서 매년 공급물량이 늘어날 예정""이라며 ""경기행복주택 사업에도 많은 관심을 가져달라""고 말했다.| 45 |1|모델요약|경기도와 경기도시공사는 광교원천, 동탄호수공원, 성남판교 등 3개 지구에 건립 예정인 총 730가구의 경기행복주택 입주자를 모집한다고 26일 밝혔으며 모집 기간은 다음달 2일부터 11일까지이며, 보증금 2천729만4천~4천783만3천 원에 월 임대료는 11만8천~20만7천 원이다.| 46 |2|기사|전남개발공사, ‘일자리창출’우수기관 선정 전남개발공사 청사 전경. 전남개발공사가 일자리창출 우수기관에 선정돼 행정안전부장관 표창을 수상했다. 6일 전남개방공사에 따르면 지난 3일 세종컨벤션센터에서 열린 2019년 상반기 지방공사·공단 CEO 리더십 포럼에서 이같이 수상 했다고 밝혔다. 전남개발공사는 문재인 정부 역점 사업인 일자리창출 정책에 적극 부응하며, 지역의 고용시장 활성화하기 위해 신규사업 발굴 등에 역점을 뒀다. 이 결과 지난해 2회에 걸쳐 10명을 채용하는 등 정부의 청년 및 장애인 의무고용에 대한 기준을 충족시켰다는 평가를 받았다. 
또한 지역내 사회 초년생의 안정적인 취업에 도움이 될 수 있는 양질의 일자리 경험 및 역량을 쌓을 수 있도록 전라남도가 중점 추진하고 있는 ‘청년 내일로 프로젝트’에 참여해 7명의 지역인재를 선발하기도 했다. 특히 전남개발공사의 채용은 공정성과 투명성 위해 전면 블라인드 절차에 따라 진행되며, 특히 면접은 전원 외부면접위원으로 진행된다. 올해 채용은 전반기에 2명을 채용했으며하반기에는 추가로 5명이내의 규모로 진행될 예정이다. 한편 전남개발공사는 지난 2004년 전남도가 설립한 지방공기업으로 남악신도시, 빛가람 혁신도시, 여수경도해양관광단지 개발사업 등을 시행했고, 여수 죽림지구 택지개발사업도 추진할 계획이다.| 47 | |2|모델요약|전남개발공사는 지난 3일 세종컨벤션센터에서 열린 2019년 상반기 지방공사·공단 CEO 리더십 포럼에서 일자리창출 우수기관에 선정돼 행정안전부장관 표창을 수상했다.| 48 | |3|기사|광주시는 지역 유망강소기업을 발굴·육성하기 위해 운영하고 있는 ‘100대 명품강소기업 육성사업’에 참가할 기업을 모집한다. 명품강소기업 육성사업은 성장 잠재력과 뛰어난 기술력을 가진 지역 중소·중견기업을 발굴해 지역경제를 견인할 글로벌 기업으로 육성하기 위한 시책으로, 2014년 시작돼 올해로 6년째를 맞았다. 모집대상은 공고일 현재 본사와 주사업장이 광주에 위치한 제조업 및 지식서비스산업 기업으로 총 30개사다. 이번 모집은 현재 제3기 명품강소기업 27개사의 지정기간(3년)이 만료됨에 따라 이들 기업의 재지정 여부와 함께 재지정 포기·탈락 기업 결원분을 채우기 위해 추진됐다. 선정조건은 명품강소기업은 매출액 50억원 이상(지식서비스산업은 10억원 이상)이면서 최근 5년 간 연평균 매출액 증가율 5% 이상이거나, 최근 3년 간 매출액 대비 R&D 투자 비율이 1% 이상인 기업이다. 명품강소기업으로 선정되면 광주시 자금 지원, 기업진단 컨설팅, 성장전략 마련, 해외마케팅 등 기업중심 맞춤형 지원과 함께 다양한 우대 혜택을 받게 된다. 또 중앙정부(중소벤처기업부)와 연계한 기업성장사다리를 통해 단계별 성장전략 지원도 받을 수 있어 명품강소기업 선정 이후 글로벌 기업으로 발돋움할 수 있을 것으로 기대된다. 신청은 31일까지 광주테크노파크로 방문 접수하면 된다. 자세한 내용은 광주시 홈페이지(http://www.gwangju.go.kr) 고시·공고란을 참고하거나 광주시 기업육성과(062-613-3871)로 문의하면 된다. 광주시는 신청기업을 대상으로 1차 서류심사, 2차 발표·현장평가를 거쳐 8월 선정위원회에서 최종 확정할 계획이다.| 49 | |3|모델요약|광주시는 지역 중소·중견기업을 발굴해 지역경제를 견인할 글로벌 기업으로 육성하기 위해 운영하고 있는 '100대 명품강소기업 육성사업'에 참가할 기업을 모집한다.| 50 | 51 | ## Reference 52 | - [KoBART](https://github.com/SKT-AI/KoBART) 53 | - [KoBART-summarization](https://github.com/seujung/KoBART-summarization) 54 | 55 | -------------------------------------------------------------------------------- /apex/RNN/README.md: -------------------------------------------------------------------------------- 1 | Under construction... 2 | -------------------------------------------------------------------------------- /apex/RNN/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM 2 | 3 | __all__ = ['models'] 4 | -------------------------------------------------------------------------------- /apex/RNN/cells.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .RNNBackend import RNNCell 6 | 7 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend 8 | 9 | import math 10 | 11 | 12 | class mLSTMRNNCell(RNNCell): 13 | """ 14 | mLSTMRNNCell 15 | """ 16 | 17 | def __init__(self, input_size, hidden_size, bias = False, output_size = None): 18 | gate_multiplier = 4 19 | super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size) 20 | 21 | self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size)) 22 | self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size)) 23 | 24 | self.reset_parameters() 25 | 26 | def forward(self, input): 27 | """ 28 | mLSTMRNNCell.forward() 29 | """ 30 | #if not inited or bsz has changed this will create hidden states 31 | self.init_hidden(input.size()[0]) 32 | 33 | hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden 34 | 35 | self.hidden = list( 36 | self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh, 37 | b_ih=self.b_ih, b_hh=self.b_hh) 38 | ) 39 | 40 | if self.output_size != self.hidden_size: 41 | self.hidden[0] = F.linear(self.hidden[0], self.w_ho) 42 | return tuple(self.hidden) 43 | 
44 | 45 | def new_like(self, new_input_size=None): 46 | if new_input_size is None: 47 | new_input_size = self.input_size 48 | 49 | return type(self)( 50 | new_input_size, 51 | self.hidden_size, 52 | self.bias, 53 | self.output_size) 54 | 55 | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None): 56 | """ 57 | mLSTMCell 58 | """ 59 | 60 | if input.is_cuda: 61 | igates = F.linear(input, w_ih) 62 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) 63 | hgates = F.linear(m, w_hh) 64 | 65 | state = fusedBackend.LSTMFused.apply 66 | return state(igates, hgates, hidden[1], b_ih, b_hh) 67 | 68 | hx, cx = hidden 69 | 70 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) 71 | gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh) 72 | 73 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 74 | 75 | ingate = F.sigmoid(ingate) 76 | forgetgate = F.sigmoid(forgetgate) 77 | cellgate = F.tanh(cellgate) 78 | outgate = F.sigmoid(outgate) 79 | 80 | cy = (forgetgate * cx) + (ingate * cellgate) 81 | hy = outgate * F.tanh(cy) 82 | 83 | return hy, cy 84 | 85 | -------------------------------------------------------------------------------- /apex/RNN/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell 4 | 5 | from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell 6 | from .cells import mLSTMRNNCell, mLSTMCell 7 | 8 | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0): 9 | """ 10 | :class:`toRNNBackend` 11 | """ 12 | 13 | if bidirectional: 14 | return bidirectionalRNN(inputRNN, num_layers, dropout = dropout) 15 | else: 16 | return stackedRNN(inputRNN, num_layers, dropout = dropout) 17 | 18 | 19 | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 20 | """ 21 | :class:`LSTM` 22 | """ 23 | inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size) 24 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 25 | 26 | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 27 | """ 28 | :class:`GRU` 29 | """ 30 | inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size) 31 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 32 | 33 | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 34 | """ 35 | :class:`ReLU` 36 | """ 37 | inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size) 38 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 39 | 40 | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 41 | """ 42 | :class:`Tanh` 43 | """ 44 | inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size) 45 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 46 | 47 | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 48 | """ 49 | :class:`mLSTM` 50 | """ 51 | inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size) 52 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 53 | 54 | 55 | 
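The factory functions above all funnel through `toRNNBackend`, which wraps a single cell into a `stackedRNN` (or `bidirectionalRNN`) from `RNNBackend.py`. A minimal construction sketch follows, assuming a CUDA-enabled apex install on a PyTorch version that still exposes the legacy `torch.nn._functions.rnn` cells these modules import; the forward-call convention of the returned module is defined in `RNNBackend.py` (not shown in this listing), so the input shape used below is an assumption:

```python
import torch
from apex.RNN import mLSTM

# Grounded in models.py: every factory shares the signature
# (input_size, hidden_size, num_layers, bias=True, batch_first=False,
#  dropout=0, bidirectional=False, output_size=None).
rnn = mLSTM(input_size=32, hidden_size=64, num_layers=2).cuda()

# Assumption: with batch_first=False (the default), the stacked backend
# consumes sequences shaped (seq_len, batch, input_size).
x = torch.randn(16, 4, 32, device="cuda")
out = rnn(x)
```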
-------------------------------------------------------------------------------- /apex/__init__.py: -------------------------------------------------------------------------------- 1 | # May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten 2 | import torch 3 | import warnings 4 | 5 | if torch.distributed.is_available(): 6 | from . import parallel 7 | 8 | from . import amp 9 | from . import fp16_utils 10 | 11 | # For optimizers and normalization there is no Python fallback. 12 | # Absence of cuda backend is a hard error. 13 | # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda 14 | # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext 15 | # so they expect those backends to be available, but for some reason they actually aren't 16 | # available (for example because they built improperly in a way that isn't revealed until 17 | # load time) the error message is timely and visible. 18 | from . import optimizers 19 | from . import normalization 20 | from . import pyprof -------------------------------------------------------------------------------- /apex/amp/README.md: -------------------------------------------------------------------------------- 1 | # amp: Automatic Mixed Precision 2 | 3 | ## Annotating User Functions 4 | 5 | Nearly all PyTorch user code needs nothing more than the two steps 6 | above to use amp. After all, custom layers are built out of simpler 7 | PyTorch components, and amp already can see those. 8 | 9 | However, any custom C++ or CUDA code is outside of amp's (default) 10 | view of things. For example, suppose I implemented a new recurrent 11 | cell called a "forgetful recurrent unit" that calls directly into a 12 | CUDA backend: 13 | 14 | ```python 15 | from backend import FRUBackend 16 | 17 | def fru(input, hidden, weight, bias): 18 | # call to CUDA code 19 | FRUBackend(input, hidden, weight, bias) 20 | ``` 21 | 22 | In this case, it is possible to get a runtime type mismatch. For 23 | example, you might have `input` in fp16, and `weight` in fp32, and amp 24 | doesn't have the visibility to insert an appropriate cast. 25 | 26 | amp exposes two ways to handle "invisible" backend code: function 27 | annotations and explicit registration. 28 | 29 | #### Function annotation 30 | 31 | The first way to handle backend code is a set of function annotations: 32 | 33 | - `@amp.half_function` 34 | - `@amp.float_function` 35 | - `@amp.promote_function` 36 | 37 | These correspond to: 38 | 39 | - Cast all arguments to fp16 40 | - Cast all arguments to fp32 41 | - If there are any type mismatches, cast everything to the widest type 42 | 43 | In our example, we believe that the FRU unit is fp16-safe and will get 44 | performance gains from casting its arguments to fp16, so we write: 45 | 46 | ```python 47 | @amp.half_function 48 | def fru(input, hidden, weight, bias): 49 | #... 50 | ``` 51 | 52 | #### Explicit registration 53 | 54 | The other way to handle backend code is with explicit function 55 | registration: 56 | 57 | - `amp.register_half_function(module, function_name)` 58 | - `amp.register_float_function(module, function_name)` 59 | - `amp.register_promote_function(module, function_name)` 60 | 61 | When using this API, `module` is the containing class or module for 62 | the function, and `function_name` is the _string_ name of the 63 | function. Note that the function must be registered before the call to 64 | `amp.initialize()`.
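To make that ordering concrete before returning to the FRU example below, here is a hedged end-to-end sketch. It assumes a CUDA-capable apex install; `my_backend` and `fru_op` are placeholders standing in for a real compiled extension (it is assumed, not verified here, that any module-like object exposing the function is accepted), and the model, optimizer, and data are throwaway examples:

```python
import types
import torch
from apex import amp

# Placeholder for a compiled backend module; in real code this would be the
# extension that owns the raw CUDA op (like `backend` in the FRU example).
my_backend = types.SimpleNamespace(fru_op=lambda *args: args[0])

# 1) Register the backend function first...
amp.register_half_function(my_backend, 'fru_op')

# 2) ...then let amp.initialize() patch the model, optimizer, and the
#    registered functions.
model = torch.nn.Linear(128, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# 3) Train as usual, routing the backward pass through amp's loss scaling.
inputs = torch.randn(8, 128, device="cuda")
loss = model(inputs).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```

The only hard requirement the sketch illustrates is the order: the `register_*_function` calls must run before `amp.initialize`.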
65 | 66 | For our FRU unit, we can register the backend function directly: 67 | 68 | ```python 69 | import backend 70 | 71 | amp.register_half_function(backend, 'FRUBackend') 72 | ``` 73 | -------------------------------------------------------------------------------- /apex/amp/__init__.py: -------------------------------------------------------------------------------- 1 | from .amp import init, half_function, float_function, promote_function,\ 2 | register_half_function, register_float_function, register_promote_function 3 | from .handle import scale_loss, disable_casts 4 | from .frontend import initialize, state_dict, load_state_dict 5 | from ._amp_state import master_params, _amp_state 6 | -------------------------------------------------------------------------------- /apex/amp/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 1, 0) 2 | __version__ = '.'.join(map(str, VERSION)) 3 | -------------------------------------------------------------------------------- /apex/amp/_amp_state.py: -------------------------------------------------------------------------------- 1 | # This is a "header object" that allows different amp modules to communicate. 2 | # I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like. 3 | # But apparently it's ok: 4 | # http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm 5 | import os 6 | import torch 7 | 8 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 9 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 10 | 11 | 12 | if TORCH_MAJOR == 1 and TORCH_MINOR < 8: 13 | from torch._six import container_abcs 14 | else: 15 | import collections.abc as container_abcs 16 | 17 | 18 | class AmpState(object): 19 | def __init__(self): 20 | self.hard_override=False 21 | self.allow_incoming_model_not_fp32 = False 22 | self.verbosity=1 23 | 24 | 25 | # Attribute stash. Could also just stash things as global module attributes. 26 | _amp_state = AmpState() 27 | 28 | 29 | def warn_or_err(msg): 30 | if _amp_state.hard_override: 31 | print("Warning: " + msg) 32 | else: 33 | raise RuntimeError(msg) 34 | # I'm not sure if allowing hard_override is a good idea. 35 | # + " If you're sure you know what you're doing, supply " + 36 | # "hard_override=True to amp.initialize.") 37 | 38 | 39 | def maybe_print(msg, rank0=False): 40 | distributed = torch.distributed.is_available() and \ 41 | torch.distributed.is_initialized() and \ 42 | torch.distributed.get_world_size() > 1 43 | if _amp_state.verbosity > 0: 44 | if rank0: 45 | if distributed: 46 | if torch.distributed.get_rank() == 0: 47 | print(msg) 48 | else: 49 | print(msg) 50 | else: 51 | print(msg) 52 | 53 | 54 | # def iter_params(param_groups): 55 | # for group in param_groups: 56 | # for p in group['params']: 57 | # yield p 58 | 59 | 60 | def master_params(optimizer): 61 | """ 62 | Generator expression that iterates over the params owned by ``optimizer``. 63 | 64 | Args: 65 | optimizer: An optimizer previously returned from ``amp.initialize``. 66 | """ 67 | for group in optimizer.param_groups: 68 | for p in group['params']: 69 | yield p 70 | -------------------------------------------------------------------------------- /apex/amp/compat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # True for post-0.4, when Variables/Tensors merged. 
4 | def variable_is_tensor(): 5 | v = torch.autograd.Variable() 6 | return isinstance(v, torch.Tensor) 7 | 8 | def tensor_is_variable(): 9 | x = torch.Tensor() 10 | return type(x) == torch.autograd.Variable 11 | 12 | # False for post-0.4 13 | def tensor_is_float_tensor(): 14 | x = torch.Tensor() 15 | return type(x) == torch.FloatTensor 16 | 17 | # Akin to `torch.is_tensor`, but returns True for Variable 18 | # objects in pre-0.4. 19 | def is_tensor_like(x): 20 | return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable) 21 | 22 | # Wraps `torch.is_floating_point` if present, otherwise checks 23 | # the suffix of `x.type()`. 24 | def is_floating_point(x): 25 | if hasattr(torch, 'is_floating_point'): 26 | return torch.is_floating_point(x) 27 | try: 28 | torch_type = x.type() 29 | return torch_type.endswith('FloatTensor') or \ 30 | torch_type.endswith('HalfTensor') or \ 31 | torch_type.endswith('DoubleTensor') 32 | except AttributeError: 33 | return False 34 | 35 | def scalar_python_val(x): 36 | if hasattr(x, 'item'): 37 | return x.item() 38 | else: 39 | if isinstance(x, torch.autograd.Variable): 40 | return x.data[0] 41 | else: 42 | return x[0] 43 | 44 | # Accounts for the possibility that some ops may be removed from a namespace. 45 | def filter_attrs(module, attrs): 46 | return list(attrname for attrname in attrs if hasattr(module, attrname)) 47 | -------------------------------------------------------------------------------- /apex/amp/lists/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/amp/lists/__init__.py -------------------------------------------------------------------------------- /apex/amp/lists/functional_overrides.py: -------------------------------------------------------------------------------- 1 | 2 | # TODO: think about the following two. They do weird things. 3 | # - torch.nn.utils.clip_grad (but it should always be fp32 anyway) 4 | # - torch.nn.utils.weight_norm 5 | 6 | # Notes: 7 | # F.instance_norm uses batch_norm internally. Which correctly handles 8 | # fp16 in/out with fp32 weights. So we shouldn't do anything for 9 | # either of these. 10 | # F.normalize calls `input.norm()` internally, so it's redundant, but 11 | # kept here in case impl. changes. 12 | # F.cosine_similarity is same: calls `x.norm()` internally. 13 | 14 | import torch.nn.functional 15 | 16 | MODULE = torch.nn.functional 17 | 18 | FP16_FUNCS = [ 19 | 'conv1d', 20 | 'conv2d', 21 | 'conv3d', 22 | 'conv_transpose1d', 23 | 'conv_transpose2d', 24 | 'conv_transpose3d', 25 | 'conv_tbc', # Undocumented / maybe new? 26 | 'linear', 27 | ] 28 | 29 | FP32_FUNCS = [ 30 | 31 | # Interpolation/Upsampling TODO: Remove for 1.2 32 | 'interpolate', 33 | 'grid_sample', 34 | 35 | # Pointwise 36 | 'softplus', 37 | 'softmin', 38 | 'log_softmax', 39 | 'softmax', 40 | 'gelu', 41 | 42 | # Normalization 43 | 'layer_norm', 44 | 'group_norm', 45 | 'local_response_norm', 46 | 'normalize', 47 | 'cosine_similarity', 48 | 49 | # Loss functions 50 | # TODO: which of these can be fp16? 
51 | 'poisson_nll_loss', 52 | 'cosine_embedding_loss', 53 | 'cross_entropy', 54 | 'hinge_embedding_loss', 55 | 'kl_div', 56 | 'l1_loss', 57 | 'mse_loss', 58 | 'margin_ranking_loss', 59 | 'multilabel_margin_loss', 60 | 'multilabel_soft_margin_loss', 61 | 'multi_margin_loss', 62 | 'nll_loss', 63 | 'binary_cross_entropy_with_logits', 64 | 'smooth_l1_loss', 65 | 'soft_margin_loss', 66 | 'triplet_margin_loss', 67 | 'ctc_loss' 68 | ] 69 | 70 | BANNED_FUNCS = [ 71 | ('binary_cross_entropy', 72 | ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` " 73 | "It requires that the output of the previous function be already a FloatTensor. \n\n" 74 | "Most models have a Sigmoid right before BCELoss. In that case, you can use\n" 75 | " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer " 76 | "that is compatible with amp.\nAnother option is to add\n" 77 | " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n" 78 | "If you _really_ know what you are doing, you can disable this warning by passing " 79 | "allow_banned=True to `amp.init()`.")) 80 | ] 81 | -------------------------------------------------------------------------------- /apex/amp/lists/tensor_overrides.py: -------------------------------------------------------------------------------- 1 | from .. import compat 2 | from . import torch_overrides 3 | 4 | import importlib 5 | 6 | import torch 7 | 8 | # if compat.variable_is_tensor() and not compat.tensor_is_variable(): 9 | MODULE = torch.Tensor 10 | # else: 11 | # MODULE = torch.autograd.Variable 12 | 13 | 14 | FP16_FUNCS = compat.filter_attrs(MODULE, [ 15 | '__matmul__', 16 | ]) 17 | 18 | FP32_FUNCS = compat.filter_attrs(MODULE, [ 19 | '__ipow__', 20 | '__pow__', 21 | '__rpow__', 22 | 23 | # Cast to fp32 before transfer to CPU 24 | 'cpu', 25 | ]) 26 | 27 | CASTS = compat.filter_attrs(MODULE, [ 28 | '__add__', 29 | '__div__', 30 | '__eq__', 31 | '__ge__', 32 | '__gt__', 33 | '__iadd__', 34 | '__idiv__', 35 | '__imul__', 36 | '__isub__', 37 | '__itruediv__', 38 | '__le__', 39 | '__lt__', 40 | '__mul__', 41 | '__ne__', 42 | '__radd__', 43 | '__rdiv__', 44 | '__rmul__', 45 | '__rsub__', 46 | '__rtruediv__', 47 | '__sub__', 48 | '__truediv__', 49 | ]) 50 | 51 | # None of these, but here to make code cleaner. 52 | SEQUENCE_CASTS = [] 53 | 54 | # We need to grab all the methods from torch_overrides and add them to 55 | # the Tensor lists as well, as almost all methods are duplicated 56 | # between `torch` and `torch.Tensor` (and check with `hasattr`, 57 | # because a few random ones aren't defined on Tensor) 58 | _self_mod = importlib.import_module(__name__) 59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']: 60 | lst = getattr(_self_mod, attrname) 61 | for fn in getattr(torch_overrides, attrname): 62 | if hasattr(MODULE, fn): 63 | lst.append(fn) 64 | -------------------------------------------------------------------------------- /apex/amp/lists/torch_overrides.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .. import utils 4 | 5 | MODULE = torch 6 | 7 | FP16_FUNCS = [ 8 | # Low level functions wrapped by torch.nn layers. 9 | # The wrapper layers contain the weights which are then passed in as a parameter 10 | # to these functions. 
11 | 'conv1d', 12 | 'conv2d', 13 | 'conv3d', 14 | 'conv_transpose1d', 15 | 'conv_transpose2d', 16 | 'conv_transpose3d', 17 | 'conv_tbc', 18 | 'prelu', 19 | 20 | # BLAS 21 | 'addmm', 22 | 'addmv', 23 | 'addr', 24 | 'matmul', 25 | 'mm', 26 | 'mv', 27 | ] 28 | 29 | FP32_FUNCS = [ 30 | # Pointwise 31 | 'acos', 32 | 'asin', 33 | 'cosh', 34 | 'erfinv', 35 | 'exp', 36 | 'expm1', 37 | 'log', 38 | 'log10', 39 | 'log2', 40 | 'reciprocal', 41 | 'rsqrt', 42 | 'sinh', 43 | 'tan', 44 | 45 | # Other math 46 | 'pow', 47 | 48 | # Reduction 49 | 'cumprod', 50 | 'cumsum', 51 | 'dist', 52 | # 'mean', 53 | 'norm', 54 | 'prod', 55 | 'std', 56 | 'sum', 57 | 'var', 58 | 59 | # Misc 60 | 'renorm' 61 | ] 62 | 63 | version_strings = torch.__version__.split('.') 64 | version_major = version_strings[0] 65 | version_minor = version_strings[1] 66 | version_num = float(version_major + "." + version_minor) 67 | # Before torch 1.1, mean must be blacklisted. 68 | if version_num < 1.1: 69 | FP32_FUNCS.append('mean') 70 | 71 | # Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We 72 | # check the CUDA version -- if at least 9.1, then put the bmm 73 | # functions on the fp16 list. Otherwise, put them on the fp32 list. 74 | _bmms = ['addbmm', 75 | 'baddbmm', 76 | 'bmm'] 77 | 78 | if utils.is_cuda_enabled(): 79 | # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802 80 | if utils.get_cuda_version() >= (9, 1, 0): 81 | FP16_FUNCS.extend(_bmms) 82 | else: 83 | FP32_FUNCS.extend(_bmms) 84 | 85 | # Multi-tensor fns that may need type promotion 86 | CASTS = [ 87 | # Multi-tensor math 88 | 'addcdiv', 89 | 'addcmul', 90 | 'atan2', 91 | 'cross', 92 | 'bilinear', 93 | 'dot', 94 | 95 | # Element-wise _or_ tensor-wise math 96 | 'add', 97 | 'div', 98 | 'mul', 99 | 100 | # Comparison 101 | 'eq', 102 | 'equal', 103 | 'ge', 104 | 'gt', 105 | 'le', 106 | 'lt', 107 | 'ne' 108 | ] 109 | 110 | # Functions that take sequence arguments. We need to inspect the whole 111 | # sequence and cast to the widest type. 112 | SEQUENCE_CASTS = [ 113 | 'cat', 114 | 'stack' 115 | ] 116 | -------------------------------------------------------------------------------- /apex/amp/opt.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import warnings 3 | 4 | from .scaler import LossScaler, master_params 5 | from ._amp_state import maybe_print 6 | 7 | import numpy as np 8 | 9 | class OptimWrapper(object): 10 | def __init__(self, optimizer, amp_handle, num_loss): 11 | self._optimizer = optimizer 12 | self._amp_handle = amp_handle 13 | self._num_loss = num_loss 14 | self._loss_idx = 0 15 | self._skip_next = [False] * num_loss 16 | self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)] 17 | 18 | @contextlib.contextmanager 19 | def scale_loss(self, loss): 20 | if not self._amp_handle.is_active(): 21 | yield loss 22 | return 23 | 24 | # When there are multiple losses per-optimizer, we need 25 | # to save out current grad accumulation, since we won't be 26 | # able to unscale this particulare loss once the grads are 27 | # all mixed together. 
28 | cached_grads = [] 29 | if self._loss_idx > 0: 30 | for p in master_params(self._optimizer): 31 | if p.grad is not None: 32 | cached_grads.append(p.grad.data.detach().clone()) 33 | else: 34 | cached_grads.append(None) 35 | self._optimizer.zero_grad() 36 | 37 | loss_scale = self._cur_loss_scaler().loss_scale() 38 | yield loss * loss_scale 39 | 40 | self._cur_loss_scaler().clear_overflow_state() 41 | self._cur_loss_scaler().unscale( 42 | master_params(self._optimizer), 43 | master_params(self._optimizer), 44 | loss_scale) 45 | self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale() 46 | self._loss_idx += 1 47 | 48 | if len(cached_grads) > 0: 49 | for p, cached_grad in zip(master_params(self._optimizer), 50 | cached_grads): 51 | if cached_grad is not None: 52 | p.grad.data.add_(cached_grad) 53 | cached_grads = [] 54 | 55 | def _cur_loss_scaler(self): 56 | assert 0 <= self._loss_idx < self._num_loss 57 | return self._loss_scaler[self._loss_idx] 58 | 59 | def step(self, closure=None): 60 | if not self._amp_handle.is_active(): 61 | return self._optimizer.step(closure=closure) 62 | 63 | self._loss_idx = 0 64 | 65 | for group in self._optimizer.param_groups: 66 | for p in group['params']: 67 | self._amp_handle.remove_cache(p) 68 | 69 | if closure is not None: 70 | raise NotImplementedError( 71 | 'The `closure` argument is unsupported by the amp ' + 72 | 'optimizer wrapper.') 73 | if any(self._skip_next): 74 | maybe_print('Gradient overflow, skipping update') 75 | self._skip_next = [False] * self._num_loss 76 | else: 77 | return self._optimizer.step(closure=closure) 78 | 79 | # Forward any attribute lookups 80 | def __getattr__(self, attr): 81 | return getattr(self._optimizer, attr) 82 | 83 | # Forward all torch.optim.Optimizer methods 84 | def __getstate__(self): 85 | return self._optimizer.__getstate__() 86 | 87 | def __setstate__(self): 88 | return self._optimizer.__setstate__() 89 | 90 | def __repr__(self): 91 | return self._optimizer.__repr__() 92 | 93 | def state_dict(self): 94 | return self._optimizer.state_dict() 95 | 96 | def load_state_dict(self, state_dict): 97 | return self._optimizer.load_state_dict(state_dict) 98 | 99 | def zero_grad(self): 100 | return self._optimizer.zero_grad() 101 | 102 | def add_param_group(self, param_group): 103 | return self._optimizer.add_param_group(param_group) 104 | -------------------------------------------------------------------------------- /apex/amp/rnn_compat.py: -------------------------------------------------------------------------------- 1 | from . import utils, wrap 2 | 3 | import torch 4 | _VF = torch._C._VariableFunctions 5 | RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm'] 6 | 7 | def _gen_VF_wrapper(name): 8 | def wrapper(*args, **kwargs): 9 | return getattr(_VF, name)(*args, **kwargs) 10 | return wrapper 11 | 12 | # Some python magic to generate an object that has the rnn cell functions 13 | # defined on it, all of which call into corresponding _VF version. 14 | # Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF" 15 | # imported at module scope within torch.nn.modules.rnn). This should 16 | # not affect third-party importers of _VF.py. 
17 | class VariableFunctionsShim(object): 18 | def __init__(self): 19 | for name in RNN_NAMES: 20 | for suffix in ['', '_cell']: 21 | fn_name = name + suffix 22 | setattr(self, fn_name, _gen_VF_wrapper(fn_name)) 23 | 24 | def has_old_rnns(): 25 | try: 26 | torch.nn.backends.thnn.backend.LSTMCell 27 | return True 28 | except: 29 | return False 30 | 31 | def whitelist_rnn_cells(handle, verbose): 32 | # Different module + function names in old/new RNN cases 33 | if has_old_rnns(): 34 | fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell'] 35 | mod = torch.nn.backends.thnn.backend 36 | else: 37 | fn_names = [x + '_cell' for x in RNN_NAMES] 38 | mod = torch.nn.modules.rnn._VF 39 | assert isinstance(mod, VariableFunctionsShim) 40 | 41 | # Insert casts on cell functions 42 | for fn in fn_names: 43 | wrap.cached_cast(mod, fn, utils.maybe_half, handle, 44 | try_caching=True, verbose=verbose) 45 | 46 | if has_old_rnns(): 47 | # Special handling of `backward` for fused gru / lstm: 48 | # The `backward` method calls Tensor.sum() (blacklist) internally, 49 | # and then the resulting grad_input has the wrong type. 50 | # TODO: where else is this a problem? 51 | for rnn_type in ['GRUFused', 'LSTMFused']: 52 | mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type) 53 | wrap.disable_casts(mod, 'backward', handle) 54 | -------------------------------------------------------------------------------- /apex/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/__init__.py -------------------------------------------------------------------------------- /apex/contrib/bottleneck/__init__.py: -------------------------------------------------------------------------------- 1 | from .bottleneck import Bottleneck 2 | -------------------------------------------------------------------------------- /apex/contrib/bottleneck/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bottleneck import Bottleneck 3 | torch.manual_seed(23337) 4 | 5 | # use True to print layerwise sum for all outputs in reference code path 6 | DEBUG = False#True 7 | 8 | for stride, o_channel in [(1,32), (1,128), (2,32)]: 9 | print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel) 10 | a_ = torch.randn(17,32,28,28) 11 | 12 | a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_() 13 | model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last) 14 | 15 | # test model 16 | b = model(a) 17 | b.mean().backward() 18 | d_grad = a.grad.float() 19 | a.grad = None 20 | torch.cuda.synchronize() 21 | 22 | if DEBUG: 23 | print("[DEBUG] ref dx :", d_grad.sum().item()) 24 | # print wgrad. 
we don't need to reset since later cpp print before accumulation 25 | for i, w in enumerate(model.w_conv): 26 | print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item()) 27 | 28 | wgrads = [] 29 | for w in model.w_conv: 30 | wgrads.append(w.grad.float()) 31 | 32 | model.use_cudnn = True 33 | model.zero_grad() 34 | c = model(a) 35 | c.mean().backward() 36 | 37 | torch.cuda.synchronize() 38 | print("comparing native and channels_last:") 39 | print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item()) 40 | print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item()) 41 | for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)): 42 | print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item()) 43 | 44 | nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_() 45 | nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half() 46 | for p,q in zip(model.parameters(), nhwc_model.parameters()): 47 | # model's storage is already in nhwc, we clone and assign to explicit nhwc model 48 | q.data.copy_(p.data.permute(0,2,3,1).contiguous()) 49 | for p,q in zip(model.buffers(), nhwc_model.buffers()): 50 | q.data.copy_(p.data) 51 | 52 | d = nhwc_model(nhwc_a) 53 | d.mean().backward() 54 | torch.cuda.synchronize() 55 | 56 | # reset reference to cudnn channels_last permute 57 | #c_s = c.storage().tolist() 58 | #d_s = d.storage().tolist() 59 | #print(max([x-y for x,y in zip(c_s,d_s)])) 60 | c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous() 61 | d_grad = a.grad.float().permute(0,2,3,1).contiguous() 62 | wgrads = [] 63 | for w in model.w_conv: 64 | wgrads.append(w.grad.float().permute(0,2,3,1).contiguous()) 65 | 66 | torch.cuda.synchronize() 67 | print("comparing nhwc and channels_last:") 68 | print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item()) 69 | print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item()) 70 | for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)): 71 | print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item()) 72 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha/mask.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | namespace fmha { 31 | 32 | 33 | template 34 | struct Mask { 35 | using Mma_tile = fmha::Hmma_tile; 36 | 37 | template 38 | __device__ Mask(const Params ¶ms, const BInfo &blockInfo, int tidx) { 39 | 40 | actual_seqlen = blockInfo.actual_seqlen; 41 | 42 | const int warp = tidx / Cta_tile::THREADS_PER_WARP; 43 | const int lane = tidx % Cta_tile::THREADS_PER_WARP; 44 | 45 | static_assert(Cta_tile::WARPS_K == 1, ""); 46 | 47 | // find the warp in the Cta tile 48 | const int warp_n = (warp / Cta_tile::WARPS_M); 49 | const int warp_m = (warp % Cta_tile::WARPS_M); 50 | // decompose warp into 8x4 tile 51 | const int quad = lane / 4; 52 | const int tid = (lane % 4) * 2; 53 | row = warp_m * 16 + quad; 54 | col = warp_n * 16 + tid; 55 | } 56 | 57 | inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const { 58 | 59 | // ii and jj iterate over the 2x4 fragment 60 | const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen; 61 | //&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen; 62 | return col_valid; 63 | // return row_valid && col_valid; 64 | } 65 | 66 | inline __device__ void load(int it) { 67 | row_offset = it * Cta_tile::M + row; 68 | } 69 | int row_offset; 70 | 71 | int row; 72 | int col; 73 | int actual_seqlen; 74 | }; 75 | 76 | } // namespace fmha 77 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_dgrad_kernel_1xN_reload.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 128, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_dgrad_fp16_128_64_sm80_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::compute_dv_1xN(params); 35 | fmha::compute_dq_dk_1xN(params); 36 | } 37 | 38 | void run_fmha_dgrad_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream) { 39 | 40 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 41 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 42 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 43 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 44 | 45 | using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>; 46 | constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE; 47 | static_assert(smem_size_s == 16 * 128 * 2); 48 | static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N); 49 | 50 | constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax; 51 | constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v; 52 | constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk); 53 | 54 | if( smem_size >= 48 * 1024 ) { 55 | FMHA_CHECK_CUDA(cudaFuncSetAttribute( 56 | fmha_dgrad_fp16_128_64_sm80_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 57 | } 58 | dim3 grid(params.h, params.b); 59 | fmha_dgrad_fp16_128_64_sm80_kernel<<>>(params); 60 | } 61 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_dgrad_kernel_1xN_reload.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 256, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_dgrad_fp16_256_64_sm80_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::compute_dv_1xN(params); 35 | fmha::compute_dq_dk_1xN(params); 36 | } 37 | 38 | void run_fmha_dgrad_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream) { 39 | 40 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 41 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 42 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 43 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 44 | 45 | using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>; 46 | constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE; 47 | static_assert(smem_size_s == 16 * 256 * 2); 48 | static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N); 49 | 50 | constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax; 51 | constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v; 52 | constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk); 53 | 54 | if( smem_size >= 48 * 1024 ) { 55 | FMHA_CHECK_CUDA(cudaFuncSetAttribute( 56 | fmha_dgrad_fp16_256_64_sm80_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 57 | } 58 | dim3 grid(params.h, params.b); 59 | fmha_dgrad_fp16_256_64_sm80_kernel<<>>(params); 60 | } 61 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_dgrad_kernel_1xN_reload.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 384, 64, 16, 1, 8, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_dgrad_fp16_384_64_sm80_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::compute_dv_1xN(params); 35 | fmha::compute_dq_dk_1xN(params); 36 | } 37 | 38 | void run_fmha_dgrad_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream) { 39 | 40 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 41 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 42 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 43 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 44 | 45 | using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>; 46 | constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE; 47 | static_assert(smem_size_s == 16 * 384 * 2); 48 | static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N); 49 | 50 | constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax; 51 | constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v; 52 | constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk); 53 | 54 | if( smem_size >= 48 * 1024 ) { 55 | FMHA_CHECK_CUDA(cudaFuncSetAttribute( 56 | fmha_dgrad_fp16_384_64_sm80_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 57 | } 58 | dim3 grid(params.h, params.b); 59 | fmha_dgrad_fp16_384_64_sm80_kernel<<>>(params); 60 | } 61 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_fprop_kernel_1xN.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 128, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_fprop_fp16_128_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::device_1xN<Kernel_traits, true>(params); 35 | } 36 | 37 | extern "C" __global__ void fmha_fprop_fp16_128_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) { 38 | fmha::device_1xN<Kernel_traits, false>(params); 39 | } 40 | 41 | void run_fmha_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) { 42 | 43 | auto kernel = is_training ? &fmha_fprop_fp16_128_64_sm80_train_kernel : &fmha_fprop_fp16_128_64_sm80_predict_kernel; 44 | 45 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 46 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 47 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 48 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 49 | 50 | constexpr int smem_size = smem_size_q + std::max(smem_size_v, smem_size_o + smem_size_softmax); 51 | 52 | if( smem_size >= 48 * 1024 ) { 53 | FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 54 | } 55 | 56 | dim3 grid(params.h, params.b); 57 | kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params); 58 | } 59 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_fprop_kernel_1xN.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 256, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_fprop_fp16_256_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::device_1xN<Kernel_traits, true>(params); 35 | } 36 | 37 | extern "C" __global__ void fmha_fprop_fp16_256_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) { 38 | fmha::device_1xN<Kernel_traits, false>(params); 39 | } 40 | 41 | void run_fmha_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) { 42 | 43 | auto kernel = is_training ? &fmha_fprop_fp16_256_64_sm80_train_kernel : &fmha_fprop_fp16_256_64_sm80_predict_kernel; 44 | 45 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 46 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 47 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 48 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 49 | 50 | constexpr int smem_size = smem_size_q + std::max(smem_size_v, smem_size_o + smem_size_softmax); 51 | 52 | if( smem_size >= 48 * 1024 ) { 53 | FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 54 | } 55 | 56 | dim3 grid(params.h, params.b); 57 | kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params); 58 | } 59 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_fprop_kernel_1xN_reload_v.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 384, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_fprop_fp16_384_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::device_1xN<Kernel_traits, true>(params); 35 | } 36 | 37 | extern "C" __global__ void fmha_fprop_fp16_384_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) { 38 | fmha::device_1xN<Kernel_traits, false>(params); 39 | } 40 | 41 | void run_fmha_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) { 42 | 43 | auto kernel = is_training ?
&fmha_fprop_fp16_384_64_sm80_train_kernel : &fmha_fprop_fp16_384_64_sm80_predict_kernel; 44 | 45 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 46 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 47 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 48 | 49 | constexpr int smem_size = smem_size_v + smem_size_o + smem_size_softmax; 50 | 51 | if( smem_size >= 48 * 1024 ) { 52 | FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 53 | } 54 | 55 | dim3 grid(params.h, params.b); 56 | kernel<<>>(params); 57 | } 58 | -------------------------------------------------------------------------------- /apex/contrib/csrc/groupbn/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #ifndef CUDA_UTILS_H 3 | #define CUDA_UTILS_H 4 | 5 | namespace at { 6 | namespace cuda { 7 | 8 | namespace utils { 9 | 10 | static inline int MaxSharedMemoryPerMultiprocessor(int device_id) { 11 | return getDeviceProperties(device_id)->sharedMemPerMultiprocessor; 12 | } 13 | 14 | 15 | } 16 | } 17 | } 18 | 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /apex/contrib/csrc/groupbn/ipc.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "THC/THC.h" 6 | 7 | #include 8 | 9 | #include "compat.h" 10 | 11 | 12 | #define cudaCheckErrors(msg) \ 13 | do { \ 14 | cudaError_t __err = cudaGetLastError(); \ 15 | if (__err != cudaSuccess) { \ 16 | fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 17 | msg, cudaGetErrorString(__err), \ 18 | __FILE__, __LINE__); \ 19 | fprintf(stderr, "*** FAILED - ABORTING\n"); \ 20 | exit(1); \ 21 | } \ 22 | } while (0) 23 | 24 | template<> 25 | struct std::hash { 26 | size_t operator() (const cudaIpcMemHandle_t& handle) const { 27 | size_t hash = 0; 28 | uint8_t* ptr = (uint8_t*)&handle; 29 | assert(sizeof(uint8_t) == 1); 30 | for (int i=0; i 39 | struct std::equal_to { 40 | bool operator() (const cudaIpcMemHandle_t &lhs, 41 | const cudaIpcMemHandle_t &rhs) const { 42 | return (std::memcmp((void*) &lhs, 43 | (void*) &rhs, 44 | sizeof(cudaIpcMemHandle_t)) == 0); 45 | } 46 | }; 47 | 48 | namespace { 49 | 50 | namespace gpuipc { 51 | //from: src/operator/nn/cudnn/nhwc_batch_norm_kernel.h 52 | // The number of threads per pixel. 53 | const int THREADS_PER_PIXEL = 16; 54 | // The number of elements per ldg. 
55 | const int ELEMENTS_PER_LDG = 4; 56 | // The number of reducing ops, each uses its own space : mean, var, dscale, dbias 57 | const int REDUCE_OPS = 4; 58 | // Maximum block.y supported - limited due to buffer allocation 59 | const int MAX_BLOCK_Y = 256; 60 | const int MAX_OFFSET = REDUCE_OPS*MAX_BLOCK_Y; 61 | const int BYTES_PER_ELEM = 4; 62 | // Buffer size per sync step 63 | const int SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET*THREADS_PER_PIXEL*2*ELEMENTS_PER_LDG*BYTES_PER_ELEM; 64 | }; 65 | 66 | class IpcMemHandleRegistry { 67 | public: 68 | void* getPtr(const cudaIpcMemHandle_t& handle, int64_t offset) { 69 | if (registry_.count(handle) == 0) { 70 | registry_.insert(std::make_pair(handle, RegistryEntry())); 71 | registry_[handle].dev_ptr = ipcOpenMem(handle); 72 | } 73 | registry_[handle].ref_count++; 74 | return (((uint8_t*)registry_[handle].dev_ptr) + offset); 75 | } 76 | 77 | void releasePtr(const cudaIpcMemHandle_t& handle) { 78 | if (registry_.count(handle) == 0) { 79 | } 80 | if (--registry_[handle].ref_count == 0) { 81 | ipcCloseMem(registry_[handle].dev_ptr); 82 | registry_.erase(handle); 83 | } 84 | } 85 | 86 | struct RegistryEntry { 87 | void* dev_ptr; 88 | int ref_count; 89 | RegistryEntry() : dev_ptr(NULL) , ref_count(0) {} 90 | }; 91 | 92 | protected: 93 | std::unordered_map registry_; 94 | 95 | void* ipcOpenMem(const cudaIpcMemHandle_t& handle) { 96 | void *data; 97 | cudaIpcOpenMemHandle(&data, handle, cudaIpcMemLazyEnablePeerAccess); 98 | cudaCheckErrors("ipc init"); 99 | return data; 100 | } 101 | 102 | void ipcCloseMem(void* dev_ptr) { 103 | cudaIpcCloseMemHandle(dev_ptr); 104 | cudaCheckErrors("ipc close"); 105 | } 106 | 107 | }; 108 | 109 | } 110 | 111 | static IpcMemHandleRegistry ipc_mem_registry; 112 | 113 | int64_t get_buffer_size(const int bn_sync_steps) { 114 | return bn_sync_steps * gpuipc::SINGLE_SYNC_BUFFER_BYTES; 115 | } 116 | 117 | void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) { 118 | cudaIpcMemHandle_t my_handle; 119 | memcpy((unsigned char *)(&my_handle), handle.DATA_PTR(), sizeof(my_handle)); 120 | return ipc_mem_registry.getPtr(my_handle, offset); 121 | } 122 | 123 | void close_remote_data(const at::Tensor& handle) { 124 | cudaIpcMemHandle_t my_handle; 125 | memcpy((unsigned char *)(&my_handle), handle.DATA_PTR(), sizeof(my_handle)); 126 | ipc_mem_registry.releasePtr(my_handle); 127 | } 128 | 129 | void* get_data_ptr( 130 | const at::Tensor& data) { 131 | return data.DATA_PTR(); 132 | } 133 | -------------------------------------------------------------------------------- /apex/contrib/csrc/layer_norm/ln_api.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/cuda/CUDAContext.h" 3 | 4 | void ln_fwd_cuda(at::Tensor &y, at::Tensor &mu, at::Tensor &rsigma, 5 | const at::Tensor &x, const at::Tensor &gamma, 6 | const at::Tensor &beta, const float epsilon, const int rows, const int cols, 7 | cudaStream_t stream); 8 | 9 | void ln_bwd_cuda(at::Tensor &dx, at::Tensor &dgamma, at::Tensor &dbeta, 10 | const at::Tensor &dw, const at::Tensor &x, 11 | const at::Tensor &mu, const at::Tensor &rsigma, 12 | const at::Tensor &gamma, const int rows, const int cols, cudaStream_t stream); 13 | 14 | 15 | std::vector ln_fwd(const at::Tensor &x, // BxSxhidden_size 16 | const at::Tensor &gamma, // hidden_size 17 | const at::Tensor &beta, // hidden_size 18 | const float epsilon 19 | ) { 20 | 21 | TORCH_CHECK(x.is_cuda()) 22 | TORCH_CHECK(gamma.is_cuda()) 23 | TORCH_CHECK(beta.is_cuda()) 
24 | 25 | TORCH_CHECK(x.is_contiguous()); 26 | auto sizes = x.sizes(); 27 | TORCH_CHECK(sizes.size() == 2); 28 | 29 | const int rows = sizes[0]; 30 | const int cols = sizes[1]; 31 | 32 | auto dtype = x.scalar_type(); 33 | 34 | TORCH_CHECK(gamma.dtype() == dtype); 35 | TORCH_CHECK(beta.dtype() == dtype); 36 | 37 | TORCH_CHECK(gamma.sizes() == beta.sizes()); 38 | TORCH_CHECK(gamma.numel() == cols); 39 | 40 | TORCH_CHECK(epsilon >= 0.f); 41 | 42 | auto stream = at::cuda::getCurrentCUDAStream().stream(); 43 | 44 | auto y = torch::empty_like(x); 45 | 46 | auto opts = x.options(); 47 | 48 | auto mu = torch::empty({rows}, opts.dtype(torch::kFloat32)); 49 | auto rsigma = torch::empty({rows}, opts.dtype(torch::kFloat32)); 50 | 51 | ln_fwd_cuda(y, mu, rsigma, x, gamma, beta, epsilon, rows, cols, stream); 52 | 53 | return {y, mu, rsigma}; 54 | } 55 | 56 | 57 | 58 | std::vector ln_bwd(const at::Tensor &dw, // BxSxhidden_size 59 | const at::Tensor &x, // BxSxhidden_size 60 | const at::Tensor &mu, // BxS, FP32! 61 | const at::Tensor &rsigma, // BxS, FP32! 62 | const at::Tensor &gamma // hidden_size 63 | ) { 64 | 65 | TORCH_CHECK(x.is_cuda()); 66 | TORCH_CHECK(dw.is_cuda()); 67 | TORCH_CHECK(mu.is_cuda()); 68 | TORCH_CHECK(rsigma.is_cuda()); 69 | TORCH_CHECK(gamma.is_cuda()); 70 | 71 | TORCH_CHECK(x.is_contiguous()); 72 | TORCH_CHECK(dw.is_contiguous()); 73 | 74 | auto sizes = x.sizes(); 75 | TORCH_CHECK(sizes.size() == 2); 76 | TORCH_CHECK(dw.sizes() == sizes); 77 | auto rows = sizes[0]; 78 | auto cols = sizes[1]; 79 | 80 | auto dtype = x.scalar_type(); 81 | TORCH_CHECK(dw.dtype() == dtype); 82 | TORCH_CHECK(gamma.dtype() == dtype); 83 | TORCH_CHECK(mu.dtype() == torch::kFloat32); 84 | TORCH_CHECK(rsigma.dtype() == torch::kFloat32); 85 | TORCH_CHECK(mu.sizes() == rsigma.sizes()); 86 | TORCH_CHECK(mu.numel() == rows); 87 | 88 | TORCH_CHECK(gamma.numel() == cols); 89 | 90 | 91 | auto stream = at::cuda::getCurrentCUDAStream().stream(); 92 | 93 | auto dx = torch::empty_like(x); 94 | auto dgamma = torch::empty_like(gamma); 95 | auto dbeta = torch::empty_like(gamma); 96 | 97 | ln_bwd_cuda(dx, dgamma, dbeta, dw, x, mu, rsigma, gamma, rows, cols, stream); 98 | 99 | return {dx, dgamma, dbeta}; 100 | } 101 | 102 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 103 | m.doc() = "CUDA LayerNorm"; // optional module docstring 104 | m.def("ln_fwd", &ln_fwd, "Run LayerNorm forward kernel"); 105 | m.def("ln_bwd", &ln_bwd, "Run LayerNorm backward kernel"); 106 | } 107 | -------------------------------------------------------------------------------- /apex/contrib/csrc/layer_norm/ln_kernel_traits.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | constexpr uint32_t THREADS_PER_WARP = 32; 4 | 5 | template 7 | struct Kernel_traits { 8 | enum { WARPS_M = WARPS_M_ }; 9 | enum { WARPS_N = WARPS_N_ }; 10 | enum { COLS = COLS_ }; 11 | enum { BYTES_PER_LDG = BYTES_PER_LDG_ }; 12 | 13 | using Vec = Vec; 14 | 15 | using vec_t = typename Vec::vec_t; 16 | using base_t = typename Vec::base_t; 17 | using packed_t = typename Vec::packed_t; 18 | using compute_t = typename Vec::compute_t; 19 | using packed_compute_t = typename Vec::packed_compute_t; 20 | 21 | enum { THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP }; 22 | enum { THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW }; 23 | enum { ROWS_PER_CTA = WARPS_M }; 24 | 25 | enum { BYTES_PER_ROW = COLS * sizeof(base_t) }; 26 | enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG }; 27 | enum {SMEM_BYTES = ROWS_PER_CTA * COLS * 
sizeof(compute_t)}; 28 | }; 29 | -------------------------------------------------------------------------------- /apex/contrib/csrc/layer_norm/utils.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "torch/extension.h" 4 | #include // for CUDNN_CHECK 5 | 6 | #define DIVUP(x, y) (((x) + ((y)-1)) / (y)) 7 | 8 | #define DISPATCH_FLOAT_AND_HALF(TYPE, NAME, ...) \ 9 | [&] { \ 10 | const auto &the_type = TYPE; \ 11 | /* don't use TYPE again in case it is an expensive or side-effect op */ \ 12 | at::ScalarType _st = ::detail::scalar_type(the_type); \ 13 | switch (_st) { \ 14 | AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ 15 | AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ 16 | default: \ 17 | AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ 18 | } \ 19 | }() 20 | 21 | template struct Vec_type {}; 22 | 23 | template <> struct Vec_type<16> { 24 | using Type = uint4; 25 | static __device__ inline Type zero() { return make_uint4(0, 0, 0, 0); } 26 | }; 27 | template <> struct Vec_type<8> { 28 | using Type = uint2; 29 | static __device__ inline Type zero() { return make_uint2(0, 0); } 30 | }; 31 | 32 | template <> struct Vec_type<4> { 33 | using Type = uint32_t; 34 | static __device__ inline Type zero() { return 0; } 35 | }; 36 | 37 | template <> struct Vec_type<2> { 38 | using Type = uint16_t; 39 | static __device__ inline Type zero() { return 0; } 40 | }; 41 | 42 | template struct TypeInfo { 43 | using base_t = T; 44 | using packed_t = T; 45 | using compute_t = float; 46 | using packed_compute_t = float; 47 | }; 48 | 49 | template <> struct TypeInfo { 50 | using base_t = half; 51 | using packed_t = half2; 52 | using compute_t = float; 53 | using packed_compute_t = float2; 54 | }; 55 | 56 | template struct Vec { 57 | 58 | using base_t = typename TypeInfo::base_t; 59 | using packed_t = typename TypeInfo::packed_t; 60 | using compute_t = typename TypeInfo::compute_t; 61 | using packed_compute_t = typename TypeInfo::packed_compute_t; 62 | 63 | static_assert(Bytes % sizeof(base_t) == 0, ""); 64 | static_assert(Bytes % sizeof(packed_t) == 0, ""); 65 | enum { BYTES_PER_THREAD = Bytes }; 66 | enum { NUM_ELTS = Bytes / sizeof(base_t) }; 67 | enum { NUM_PACKED = Bytes / sizeof(packed_t) }; 68 | using vec_t = typename Vec_type::Type; 69 | using store_t = union { 70 | vec_t raw; 71 | base_t elt[NUM_ELTS]; 72 | packed_t packed[NUM_PACKED]; 73 | }; 74 | store_t data; 75 | 76 | __device__ Vec() { data.raw = Vec_type::zero(); } 77 | 78 | __device__ inline void load_from(const char *ptr) { 79 | data.raw = *reinterpret_cast(ptr); 80 | } 81 | 82 | __device__ inline void load_or_zero(const char *ptr, const bool is_valid) { 83 | data.raw = is_valid ? 
*reinterpret_cast(ptr) 84 | : Vec_type::zero(); 85 | } 86 | 87 | __device__ inline void store_to(char *ptr) const { 88 | *reinterpret_cast(ptr) = data.raw; 89 | } 90 | 91 | __device__ inline void store_valid(char *ptr, const bool is_valid) const { 92 | if (is_valid) 93 | *reinterpret_cast(ptr) = data.raw; 94 | } 95 | }; 96 | -------------------------------------------------------------------------------- /apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace multihead_attn { 6 | namespace fused_softmax { 7 | namespace additive_mask_softmax_dropout { 8 | 9 | std::vector fwd_cuda( 10 | bool is_training, 11 | int heads, 12 | torch::Tensor const& input, 13 | const half* pad_mask, 14 | float dropout_prob 15 | ); 16 | 17 | torch::Tensor bwd_cuda( 18 | int heads, 19 | torch::Tensor const& output_grads, 20 | torch::Tensor const& softmax_results, 21 | torch::Tensor const& dropout_mask, 22 | float dropout_prob 23 | ); 24 | 25 | // C++ interface 26 | 27 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 28 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 29 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 30 | 31 | std::vector fwd( 32 | bool use_mask, 33 | bool is_training, 34 | int heads, 35 | torch::Tensor const& input, 36 | torch::Tensor const& pad_mask, 37 | float dropout_prob 38 | ) 39 | { 40 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 41 | AT_ASSERTM(input.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 42 | 43 | if (use_mask) { 44 | AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor"); 45 | AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Half, "Only BYTE is supported"); 46 | } 47 | 48 | return fwd_cuda( 49 | is_training, 50 | heads, 51 | input, 52 | use_mask ? 
static_cast(pad_mask.data_ptr()) : nullptr, 53 | dropout_prob 54 | ); 55 | } 56 | 57 | torch::Tensor bwd( 58 | bool use_mask, 59 | int heads, 60 | torch::Tensor const& output_grads, 61 | torch::Tensor const& softmax_results, 62 | torch::Tensor const& dropout_mask, 63 | float dropout_prob 64 | ) 65 | { 66 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 67 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 68 | AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor"); 69 | 70 | AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 71 | AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 72 | // AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte, "Only BYTE is supported"); 73 | 74 | return bwd_cuda( 75 | heads, 76 | output_grads, 77 | softmax_results, 78 | dropout_mask, 79 | dropout_prob 80 | ); 81 | } 82 | 83 | } // end namespace mask_softmax_dropout 84 | } // end namespace fused_softmax 85 | } // end namespace multihead_attn 86 | 87 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 88 | m.def("forward", &multihead_attn::fused_softmax::additive_mask_softmax_dropout::fwd, "Self Multihead Attention masked softmax dropout -- Forward."); 89 | m.def("backward", &multihead_attn::fused_softmax::additive_mask_softmax_dropout::bwd, "Self Multihead Attention masked softmax dropout -- Backward."); 90 | } 91 | 92 | -------------------------------------------------------------------------------- /apex/contrib/csrc/multihead_attn/philox.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //Philox CUDA. 3 | 4 | class Philox { 5 | public: 6 | __device__ inline Philox(unsigned long long seed, 7 | unsigned long long subsequence, 8 | unsigned long long offset) { 9 | key.x = (unsigned int)seed; 10 | key.y = (unsigned int)(seed >> 32); 11 | counter = make_uint4(0, 0, 0, 0); 12 | counter.z = (unsigned int)(subsequence); 13 | counter.w = (unsigned int)(subsequence >> 32); 14 | STATE = 0; 15 | incr_n(offset / 4); 16 | } 17 | __device__ inline uint4 operator()() { 18 | if(STATE == 0) { 19 | uint4 counter_ = counter; 20 | uint2 key_ = key; 21 | //7-round philox 22 | for(int i = 0; i < 6; i++) { 23 | counter_ = single_round(counter_, key_); 24 | key_.x += (kPhilox10A); key_.y += (kPhilox10B); 25 | } 26 | output = single_round(counter_, key_); 27 | incr(); 28 | } 29 | //return a float4 directly 30 | //unsigned long ret; 31 | //switch(STATE) { 32 | // case 0: ret = output.x; break; 33 | // case 1: ret = output.y; break; 34 | // case 2: ret = output.z; break; 35 | // case 3: ret = output.w; break; 36 | //} 37 | //STATE = (STATE + 1) % 4; 38 | return output; 39 | } 40 | private: 41 | uint4 counter; 42 | uint4 output; 43 | uint2 key; 44 | unsigned int STATE; 45 | __device__ inline void incr_n(unsigned long long n) { 46 | unsigned int nlo = (unsigned int)(n); 47 | unsigned int nhi = (unsigned int)(n >> 32); 48 | counter.x += nlo; 49 | if (counter.x < nlo) 50 | nhi++; 51 | counter.y += nhi; 52 | if (nhi <= counter.y) 53 | return; 54 | if (++counter.z) 55 | return; 56 | ++counter.w; 57 | } 58 | __device__ inline void incr() { 59 | if (++counter.x) 60 | return; 61 | if (++counter.y) 62 | return; 63 | if (++counter.z) 64 | return; 65 | ++counter.w; 66 | } 67 | __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, 68 | unsigned int *result_high) { 69 | *result_high = __umulhi(a, b); 70 | return a*b; 71 | } 72 | __device__ inline uint4 
single_round(uint4 ctr, uint2 key) { 73 | unsigned int hi0; 74 | unsigned int hi1; 75 | unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); 76 | unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); 77 | uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; 78 | return ret; 79 | } 80 | static const unsigned long kPhilox10A = 0x9E3779B9; 81 | static const unsigned long kPhilox10B = 0xBB67AE85; 82 | static const unsigned long kPhiloxSA = 0xD2511F53; 83 | static const unsigned long kPhiloxSB = 0xCD9E8D57; 84 | }; 85 | // Inverse of 2^32. 86 | #define M_RAN_INVM32 2.3283064e-10f 87 | __device__ __inline__ float4 uniform4(uint4 x) { 88 | return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,x.w * M_RAN_INVM32); 89 | 90 | } 91 | -------------------------------------------------------------------------------- /apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | const float lr, 8 | const float beta1, 9 | const float beta2, 10 | const float epsilon, 11 | const int step, 12 | const int bias_correction, 13 | const float weight_decay, 14 | const int grad_averaging, 15 | const int mode, 16 | const float global_grad_norm, 17 | const float max_grad_norm); 18 | 19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 20 | m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer"); 21 | } 22 | -------------------------------------------------------------------------------- /apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_fused_adam_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | at::Tensor per_tensor_beta1, 8 | at::Tensor per_tensor_beta2, 9 | at::Tensor per_tensor_bias_correction, 10 | at::Tensor per_tensor_eps, 11 | at::Tensor per_tensor_weight_decay, 12 | float lr, 13 | float grad_scale, 14 | int step, 15 | int mode); 16 | 17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 18 | m.def("multi_tensor_fused_adam", &multi_tensor_fused_adam_cuda, 19 | "Multi tensor Adam optimized CUDA implementation."); 20 | } 21 | -------------------------------------------------------------------------------- /apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_compute_update_term_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | at::Tensor per_tensor_beta1, 8 | at::Tensor per_tensor_beta2, 9 | at::Tensor per_tensor_beta3, 10 | at::Tensor per_tensor_bias_correction, 11 | at::Tensor step, 12 | at::Tensor per_tensor_epsilon, 13 | const int mode, 14 | at::Tensor per_tensor_decay, 15 | at::Tensor global_scale, 16 | at::Tensor global_grad_norm, 17 | const float max_grad_norm); 18 | 19 | void multi_tensor_lamb_update_weights_cuda( 20 | int chunk_size, 21 | at::Tensor noop_flag, 22 | std::vector> tensor_lists, 23 | at::Tensor per_tensor_param_norm, 24 | at::Tensor per_tensor_update_norm, 25 | at::Tensor update_norm_offset, 26 | at::Tensor learning_rate, 27 | at::Tensor per_tensor_decay, 28 | at::Tensor global_grad_norm, 29 | bool use_nvlamb); 30 | 31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 32 | 
m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda, 33 | "Computes update term for LAMB optimizer"); 34 | m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda, 35 | "Applies update term for LAMB optimizer"); 36 | } 37 | -------------------------------------------------------------------------------- /apex/contrib/csrc/transducer/transducer_joint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 7 | 8 | std::vector transducer_joint_cuda_forward( 9 | torch::Tensor f, 10 | torch::Tensor g, 11 | torch::Tensor fLen, 12 | torch::Tensor gLen, 13 | torch::Tensor batchOffset, 14 | int64_t packedBatch, 15 | int opt, 16 | bool packOutput, 17 | bool relu, 18 | bool dropout, 19 | float dropoutProb, 20 | int tileSize); 21 | 22 | 23 | std::vector transducer_joint_cuda_backward( 24 | std::vector in, 25 | torch::Tensor fLen, 26 | torch::Tensor gLen, 27 | torch::Tensor batchOffset, 28 | int maxFLen, 29 | int maxGLen, 30 | bool packOutput, 31 | float scale); 32 | 33 | std::vector transducer_joint_forward( 34 | torch::Tensor f, 35 | torch::Tensor g, 36 | torch::Tensor fLen, 37 | torch::Tensor gLen, 38 | torch::Tensor batchOffset, 39 | int64_t packedBatch, 40 | int opt, 41 | bool packOutput, 42 | bool relu, 43 | bool dropout, 44 | float dropoutProb, 45 | int tileSize) { 46 | CHECK_INPUT(f); 47 | CHECK_INPUT(g); 48 | CHECK_INPUT(fLen); 49 | CHECK_INPUT(gLen); 50 | if (packOutput) 51 | CHECK_INPUT(batchOffset); 52 | return transducer_joint_cuda_forward( 53 | f, 54 | g, 55 | fLen, 56 | gLen, 57 | batchOffset, 58 | packedBatch, 59 | opt, 60 | packOutput, 61 | relu, 62 | dropout, 63 | dropoutProb, 64 | tileSize); 65 | } 66 | 67 | std::vector transducer_joint_backward( 68 | std::vector in, 69 | torch::Tensor fLen, 70 | torch::Tensor gLen, 71 | torch::Tensor batchOffset, 72 | int maxFLen, 73 | int maxGLen, 74 | bool packOutput, 75 | float scale) { 76 | for (auto t : in){ 77 | CHECK_INPUT(t); 78 | } 79 | CHECK_INPUT(fLen); 80 | CHECK_INPUT(gLen); 81 | if (packOutput) 82 | CHECK_INPUT(batchOffset); 83 | return transducer_joint_cuda_backward( 84 | in, 85 | fLen, 86 | gLen, 87 | batchOffset, 88 | maxFLen, 89 | maxGLen, 90 | packOutput, 91 | scale); 92 | } 93 | 94 | 95 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 96 | m.def("forward", &transducer_joint_forward, "transducer joint forward (CUDA)"); 97 | m.def("backward", &transducer_joint_backward, "transducer joint backward (CUDA)"); 98 | } -------------------------------------------------------------------------------- /apex/contrib/csrc/transducer/transducer_loss.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 7 | 8 | std::vector transducer_loss_cuda_forward( 9 | torch::Tensor x, 10 | torch::Tensor label, 11 | torch::Tensor audLen, 12 | torch::Tensor txtLen, 13 | torch::Tensor batchOffset, 14 | int maxFLen, 15 | int blankIdx, 16 | int opt, 17 | bool packedInput); 18 | 19 | torch::Tensor transducer_loss_cuda_backward( 20 
| torch::Tensor x, 21 | torch::Tensor lossGrad, 22 | torch::Tensor alpha, 23 | torch::Tensor beta, 24 | torch::Tensor audLen, 25 | torch::Tensor txtLen, 26 | torch::Tensor label, 27 | torch::Tensor batchOffset, 28 | int maxFLen, 29 | int blankIdx, 30 | int opt, 31 | bool fuseSoftmaxBackward, 32 | bool packedInput); 33 | 34 | 35 | std::vector transducer_loss_forward( 36 | torch::Tensor x, 37 | torch::Tensor label, 38 | torch::Tensor fLen, 39 | torch::Tensor yLen, 40 | torch::Tensor batchOffset, 41 | int maxFLen, 42 | int blankIdx, 43 | int opt, 44 | bool packedInput 45 | ) { 46 | 47 | CHECK_INPUT(x); 48 | CHECK_INPUT(label); 49 | CHECK_INPUT(fLen); 50 | CHECK_INPUT(yLen); 51 | if (packedInput) 52 | CHECK_INPUT(batchOffset); 53 | return transducer_loss_cuda_forward( 54 | x, 55 | label, 56 | fLen, 57 | yLen, 58 | batchOffset, 59 | maxFLen, 60 | blankIdx, 61 | opt, 62 | packedInput); 63 | } 64 | 65 | torch::Tensor transducer_loss_backward( 66 | torch::Tensor x, 67 | torch::Tensor lossGrad, 68 | torch::Tensor alpha, 69 | torch::Tensor beta, 70 | torch::Tensor fLen, 71 | torch::Tensor yLen, 72 | torch::Tensor label, 73 | torch::Tensor batchOffset, 74 | int maxFLen, 75 | int blankIdx, 76 | int opt, 77 | bool fuseSoftmaxBackward, 78 | bool packedInput){ 79 | 80 | CHECK_INPUT(x); 81 | CHECK_INPUT(label); 82 | CHECK_INPUT(lossGrad); 83 | CHECK_INPUT(alpha); 84 | CHECK_INPUT(beta); 85 | CHECK_INPUT(fLen); 86 | CHECK_INPUT(yLen); 87 | if (packedInput) 88 | CHECK_INPUT(batchOffset); 89 | 90 | return transducer_loss_cuda_backward( 91 | x, 92 | lossGrad, 93 | alpha, 94 | beta, 95 | fLen, 96 | yLen, 97 | label, 98 | batchOffset, 99 | maxFLen, 100 | blankIdx, 101 | opt, 102 | fuseSoftmaxBackward, 103 | packedInput); 104 | } 105 | 106 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 107 | m.def("forward", &transducer_loss_forward, "transducer loss forward (CUDA)"); 108 | m.def("backward", &transducer_loss_backward, "transducer loss backward (CUDA)"); 109 | } 110 | -------------------------------------------------------------------------------- /apex/contrib/csrc/xentropy/interface.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // CUDA forward declarations 4 | 5 | std::vector softmax_xentropy_cuda( 6 | const at::Tensor &input, 7 | const at::Tensor &labels, 8 | const float smoothing, 9 | const bool half_to_float); 10 | 11 | at::Tensor softmax_xentropy_backward_cuda( 12 | const at::Tensor &grad_loss, 13 | const at::Tensor &logits, 14 | const at::Tensor &max_log_sum_exp, 15 | const at::Tensor &labels, 16 | const float smoothing); 17 | 18 | // C++ interface 19 | 20 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 21 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 22 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 23 | 24 | std::vector softmax_xentropy_forward( 25 | const at::Tensor &input, 26 | const at::Tensor &labels, 27 | const float smoothing, 28 | const bool half_to_float) { 29 | CHECK_CUDA(input); 30 | CHECK_INPUT(labels); 31 | 32 | return softmax_xentropy_cuda(input, labels, smoothing, half_to_float); 33 | } 34 | 35 | at::Tensor softmax_xentropy_backward( 36 | const at::Tensor &grad_loss, 37 | const at::Tensor &logits, 38 | const at::Tensor &max_log_sum_exp, 39 | const at::Tensor &labels, 40 | const float smoothing) { 41 | CHECK_CUDA(grad_loss); 42 | CHECK_CUDA(logits); 43 | CHECK_INPUT(max_log_sum_exp); 44 | CHECK_INPUT(labels); 45 | 46 | return 
softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing); 47 | } 48 | 49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 50 | m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)"); 51 | m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)"); 52 | } 53 | -------------------------------------------------------------------------------- /apex/contrib/fmha/__init__.py: -------------------------------------------------------------------------------- 1 | from .fmha import FMHAFun 2 | -------------------------------------------------------------------------------- /apex/contrib/fmha/fmha.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | ############################################################################### 27 | 28 | 29 | import torch 30 | import torch.nn.functional as F 31 | import fmhalib as mha 32 | 33 | class FMHAFun(torch.autograd.Function): 34 | @staticmethod 35 | def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training): 36 | batch_size = cu_seqlens.numel() - 1 37 | if batch_size < 4: 38 | context, S_dmask = mha.fwd_nl(qkv, cu_seqlens, p_dropout, max_s, is_training, None) 39 | else: 40 | context, S_dmask = mha.fwd(qkv, cu_seqlens, p_dropout, max_s, is_training, None) 41 | ctx.save_for_backward(qkv, S_dmask) 42 | ctx.cu_seqlens = cu_seqlens 43 | ctx.p_dropout = p_dropout 44 | ctx.max_s = max_s 45 | return context 46 | 47 | @staticmethod 48 | def backward(ctx, dout): 49 | qkv, S_dmask = ctx.saved_tensors 50 | batch_size = ctx.cu_seqlens.numel() - 1 51 | if batch_size < 4: 52 | dqkv, dp, _ = mha.bwd_nl(dout, qkv, S_dmask, ctx.cu_seqlens, ctx.p_dropout, ctx.max_s) 53 | else: 54 | dqkv, dp = mha.bwd(dout, qkv, S_dmask, ctx.cu_seqlens, ctx.p_dropout, ctx.max_s) 55 | 56 | return dqkv, None, None, None, None, None, None 57 | 58 | class FMHA(torch.nn.Module): 59 | 60 | def __init__(self, config): 61 | 62 | super(FMHA, self).__init__() 63 | 64 | self.p_dropout = config.attention_probs_dropout_prob 65 | self.h = config.num_attention_heads 66 | self.hidden_size = config.hidden_size 67 | self.d = self.hidden_size // self.h 68 | assert self.d * self.h == self.hidden_size, "Invalid hidden size/num_heads" 69 | 70 | def forward(self, qkv, cu_seqlens, max_s, is_training=True): 71 | 72 | ctx = FMHAFun.apply(qkv.view(-1, 3, self.h, self.d), cu_seqlens, self.p_dropout, max_s, is_training) 73 | 74 | return ctx.view(-1, self.hidden_size) 75 | -------------------------------------------------------------------------------- /apex/contrib/groupbn/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | import bnp 4 | from .batch_norm import BatchNorm2d_NHWC 5 | del torch 6 | del bnp 7 | del batch_norm 8 | except ImportError as err: 9 | print("apex was installed without --bnp flag, contrib.groupbn is not available") 10 | -------------------------------------------------------------------------------- /apex/contrib/layer_norm/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_norm import FastLayerNorm 2 | -------------------------------------------------------------------------------- /apex/contrib/layer_norm/layer_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import init 3 | 4 | import fast_layer_norm 5 | 6 | class FastLayerNormFN(torch.autograd.Function): 7 | @staticmethod 8 | def forward(ctx, x, gamma, beta, epsilon): 9 | x = x.contiguous() 10 | gamma = gamma.contiguous() 11 | beta = beta.contiguous() 12 | hidden_size = gamma.numel() 13 | xmat = x.view((-1, hidden_size)) 14 | ymat, mu, rsigma = fast_layer_norm.ln_fwd(xmat, gamma, beta, epsilon) 15 | ctx.save_for_backward(x, gamma, mu, rsigma) 16 | return ymat.view(x.shape) 17 | 18 | @staticmethod 19 | def backward(ctx, dy): 20 | #assert dy.is_contiguous() 21 | dy = dy.contiguous() # this happens! 
22 | x, gamma, mu, rsigma = ctx.saved_tensors 23 | 24 | hidden_size = gamma.numel() 25 | xmat = x.view((-1, hidden_size)) 26 | dymat = dy.view(xmat.shape) 27 | dxmat, dgamma, dbeta = fast_layer_norm.ln_bwd(dymat, xmat, mu, rsigma, gamma) 28 | dx = dxmat.view(x.shape) 29 | return dx, dgamma, dbeta, None 30 | 31 | class FastLayerNorm(torch.nn.Module): 32 | def __init__(self, hidden_size, eps=1e-5): 33 | super(FastLayerNorm, self).__init__() 34 | self.epsilon = eps 35 | self.weight = torch.nn.Parameter(torch.Tensor(hidden_size)) 36 | self.bias = torch.nn.Parameter(torch.Tensor(hidden_size)) 37 | self.reset_parameters() 38 | 39 | def reset_parameters(self): 40 | init.ones_(self.weight) 41 | init.zeros_(self.bias) 42 | 43 | def forward(self, x): 44 | return FastLayerNormFN.apply(x, self.weight, self.bias, self.epsilon) 45 | -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/MHA_bwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/multihead_attn/MHA_bwd.png -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/MHA_fwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/multihead_attn/MHA_fwd.png -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/README.md: -------------------------------------------------------------------------------- 1 | # Fast Multihead Attention 2 | 3 | This implementation has two main features : 4 | * A C++ implementation to avoid the CPU overheads of Pytorch found with smaller batch sizes. 5 | * The removal of all copies and transposes found in standard implementations of Multihead Attention. 6 | 7 | | | Python Version | C++ Version | 8 | | :----------------------------------------- | :------------: | :---------: | 9 | | Layer Norm and Residual Add Variant | X | X | 10 | | Includes Linear Biases | X | | 11 | | Reduces CPU Overheads | | X | 12 | | Fuses masking with Softmax | | X | 13 | | Removes Transposes and Copies | X | X | 14 | | Includes Self and Encoder/Decoder Variants | X | X | 15 | 16 | ## How to Instantiate 17 | 18 | `SelfMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)` 19 | `EncdecMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)` 20 | 21 | `impl` has two options: 22 | * `fast` uses C++ Version 23 | * `default` uses Python Version 24 | 25 | ## Instructions to build on Linux 26 | 27 | ``` 28 | $ git clone https://github.com/NVIDIA/apex 29 | $ cd apex 30 | $ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" ./ 31 | ``` 32 | ## Try Performance Tests Yourself! 33 | Perf test script is found here! 
34 | ``` 35 | cd contrib/examples/multihead_attn 36 | ``` 37 | #### Fast Multihead Attention 38 | ``` 39 | python perf_test_multihead_attn.py --ref 40 | ``` 41 | #### Fast Multihead Attention with C++ Implementation 42 | ``` 43 | python perf_test_multihead_attn.py 44 | ``` 45 | #### Compare with `torch.nn.MultiheadAttn` 46 | ``` 47 | python perf_test_multihead_attn.py --native 48 | ``` 49 | #### Test your own range! 50 | ``` 51 | python perf_test_multihead_attn.py --seq-length 64 --num-seqs-start 10 --num-seqs-stop 120 --num-seqs-inc 5 52 | ``` 53 | 54 | ## Performance Comparisons 55 | 56 | * Performance was measured with 64 token sequence lengths on an NVIDIA TitanV card. 57 | * Time is measured across multiple layers to simulate an in model scenario. 58 | 59 | ![Multihead Attention Forward](MHA_fwd.png) 60 | ![Multihead Attention Backward](MHA_bwd.png) 61 | -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/__init__.py: -------------------------------------------------------------------------------- 1 | from .self_multihead_attn import SelfMultiheadAttn 2 | from .encdec_multihead_attn import EncdecMultiheadAttn 3 | from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func 4 | -------------------------------------------------------------------------------- /apex/contrib/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16_optimizer import FP16_Optimizer 2 | from .fused_adam import FusedAdam 3 | from .fused_lamb import FusedLAMB 4 | -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/fp16_optimizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/fp16_optimizer.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/fused_adam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/fused_adam.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/fused_lamb.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/fused_lamb.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/sparsity/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to ASP 2 | 3 | This serves as a quick-start for ASP (Automatic SParsity), a tool that enables sparse training and inference for PyTorch models by adding 
2 lines of Python. 4 | 5 | ## Importing ASP 6 | ``` 7 | from apex.contrib.sparsity import ASP 8 | ``` 9 | 10 | ## Initializing ASP 11 | 12 | Apart from the import statement, it is sufficient to add just the following line of code before the training phase to augment the model and the optimizer for sparse training/inference: 13 | ``` 14 | ASP.prune_trained_model(model, optimizer) 15 | ``` 16 | 17 | In the context of a typical PyTorch training loop, it might look like this: 18 | ``` 19 | ASP.prune_trained_model(model, optimizer) 20 | 21 | x, y = DataLoader(args) 22 | for epoch in range(epochs): 23 | y_pred = model(x) 24 | loss = loss_function(y_pred, y) 25 | loss.backward() 26 | optimizer.step() 27 | 28 | torch.save(...) 29 | ``` 30 | The `prune_trained_model` step calculates the sparse mask and applies it to the weights. This is done once, i.e., sparse locations in the weights matrix remain fixed after this step. 31 | 32 | ## Generate a Sparse Network 33 | 34 | The following approach serves as a guiding example of how to generate a pruned model that can use Sparse Tensor Cores in the NVIDIA Ampere Architecture. This approach generates a model for deployment, i.e. inference mode. 35 | 36 | ``` 37 | (1) Given a fully trained (dense) network, prune parameter values in a 2:4 sparse pattern. 38 | (2) Fine-tune the pruned model using the same optimization method and hyper-parameters (learning rate, schedule, number of epochs, etc.) as those used to obtain the trained model. 39 | (3) (If required) Quantize the model. 40 | ``` 41 | 42 | In code, below is a sketch of how to use ASP for this approach (steps 1 and 2 above). 43 | 44 | ``` 45 | 46 | model = define_model(..., pretrained=True) # define model architecture and load parameter tensors with trained values (by reading a trained checkpoint) 47 | criterion = ... # compare ground truth with model prediction; use the same criterion as used to generate the dense trained model 48 | optimizer = ... # optimize model parameters; use the same optimizer as used to generate the dense trained model 49 | lr_scheduler = ... # learning rate scheduler; use the same schedule as used to generate the dense trained model 50 | 51 | from apex.contrib.sparsity import ASP 52 | ASP.prune_trained_model(model, optimizer) # prune the trained model 53 | 54 | x, y = DataLoader(args) 55 | for epoch in range(epochs): # train the pruned model for the same number of epochs as used to generate the dense trained model 56 | y_pred = model(x) 57 | loss = criterion(y_pred, y) 58 | lr_scheduler.step() 59 | loss.backward() 60 | optimizer.step() 61 | 62 | torch.save(...) # saves the pruned checkpoint with sparsity masks 63 | ``` 64 | 65 | ## Non-Standard Usage 66 | 67 | If your goal is to easily prepare a network for accelerated inference, please follow the recipe above. However, ASP can also be used to perform experiments with advanced techniques such as training with sparsity from initialization. For example, in order to recompute the sparse mask in between training steps, use the following method: 68 | 69 | ``` 70 | ASP.compute_sparse_masks() 71 | ``` 72 | 73 | A more thorough example can be found in `./test/toy_problem.py`.
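As a condensed companion to that test, the following is a minimal sketch of the non-standard flow, built only from the calls used elsewhere in this package (`init_model_for_pruning`, `init_optimizer_for_pruning`, `compute_sparse_masks`). The toy model, the SGD optimizer, the random data, the `"m4n2_2d_best"` pattern string, and the recompute schedule are illustrative assumptions, not a prescribed recipe:

```
import torch
from apex.contrib.sparsity import ASP

# Toy two-layer model; any model whose prunable layers appear in the whitelist works.
model = torch.nn.Sequential(
    torch.nn.Linear(64, 256),
    torch.nn.Linear(256, 64),
).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Augment the model and optimizer once, before training starts.
ASP.init_model_for_pruning(model, "m4n2_2d_best",
                           whitelist=[torch.nn.Linear],
                           allow_recompute_mask=True)
ASP.init_optimizer_for_pruning(optimizer)

# Compute the initial 2:4 masks, then keep refreshing them as the weights evolve.
ASP.compute_sparse_masks()
for step in range(1000):
    x = torch.randn(32, 64).cuda()
    y = torch.randn(32, 64).cuda()
    loss = ((model(x) - y) ** 2).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if (step + 1) % 250 == 0:
        # Recompute masks between training steps (requires allow_recompute_mask=True).
        ASP.compute_sparse_masks()
```

Roughly speaking, `prune_trained_model(model, optimizer)` bundles the two `init_*` calls and a single `compute_sparse_masks()` into one step; the sketch above only exposes those pieces so the mask can be recomputed during training.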
74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_masklib import create_mask 2 | from .asp import ASP 3 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/checkpointing_test_part1.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | def build_model(args): 8 | od = OrderedDict() 9 | for i in range(args.num_layers): 10 | if i == 0: 11 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 12 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 13 | elif i == args.num_layers-1: 14 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 15 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 16 | else: 17 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 18 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 19 | return torch.nn.Sequential(od) 20 | 21 | def train_step(args, model, optimizer, input_batch, target_batch, step): 22 | predicted_target = model(input_batch) 23 | loss = ((predicted_target-target_batch)**2).sum() 24 | loss.backward() 25 | optimizer.step() 26 | optimizer.zero_grad() 27 | step = step + 1 28 | #print("Step %d :: loss=%e" % (step, loss.item())) 29 | return step 30 | 31 | def train_loop(args, model, optimizer, step, num_steps): 32 | for i in range(num_steps): 33 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 34 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 35 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 36 | return step 37 | 38 | def main(args): 39 | # 40 | # PART1 41 | # 42 | 43 | torch.manual_seed(args.seed) 44 | 45 | model = build_model(args).cuda() 46 | one_ll = next(model.children()).weight 47 | optimizer = FusedAdam(model.parameters()) 48 | ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask) 49 | ASP.init_optimizer_for_pruning(optimizer) 50 | 51 | step = 0 52 | 53 | # train for a few steps with dense weights 54 | print("DENSE :: ",one_ll) 55 | step = train_loop(args, model, optimizer, step, args.num_dense_steps) 56 | 57 | # simulate sparsity by inserting zeros into existing dense weights 58 | ASP.enable_sparsity() 59 | 60 | # train for a few steps with sparse weights 61 | print("SPARSE :: ",one_ll) 62 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps) 63 | 64 | torch.save({ 65 | 'step': step, 66 | 'verbosity': args.verbosity, 67 | 'seed2': args.seed2, 68 | 'pattern': args.pattern, 69 | 'whitelist': args.whitelist, 70 | 'allow_recompute_mask': args.allow_recompute_mask, 71 | 'model_state_dict': model.state_dict(), 72 | 'optimizer_state_dict': optimizer.state_dict(), 73 | }, args.checkpoint_path) 74 | 75 | if __name__ == '__main__': 76 | class Args: 77 | verbosity=3 78 | seed = 4873 79 | seed2 = 99875 80 | pattern = "m4n2_2d_best" 81 | whitelist = [torch.nn.Linear] 82 | allow_recompute_mask = True 83 
| batch_size = 32 84 | input_features = 8 85 | output_features = 8 86 | hidden_features = 32 87 | num_layers = 4 88 | num_dense_steps = 2000 89 | num_sparse_steps = 3000 90 | num_sparse_steps_2 = 1000 91 | checkpoint_path = "part1.chkp" 92 | args = Args() 93 | 94 | main(args) 95 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/checkpointing_test_part2.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | def build_model(args): 8 | od = OrderedDict() 9 | for i in range(args.num_layers): 10 | if i == 0: 11 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 12 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 13 | elif i == args.num_layers-1: 14 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 15 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 16 | else: 17 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 18 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 19 | return torch.nn.Sequential(od) 20 | 21 | def train_step(args, model, optimizer, input_batch, target_batch, step): 22 | predicted_target = model(input_batch) 23 | loss = ((predicted_target-target_batch)**2).sum() 24 | loss.backward() 25 | optimizer.step() 26 | optimizer.zero_grad() 27 | step = step + 1 28 | #print("Step %d :: loss=%e" % (step, loss.item())) 29 | return step 30 | 31 | def train_loop(args, model, optimizer, step, num_steps): 32 | for i in range(num_steps): 33 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 34 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 35 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 36 | return step 37 | 38 | def main(step, args, model_state_dict, optimizer_state_dict): 39 | # 40 | # PART2 41 | # 42 | 43 | model = build_model(args).cuda() 44 | one_ll = next(model.children()).weight 45 | optimizer = FusedAdam(model.parameters()) 46 | ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask) 47 | ASP.init_optimizer_for_pruning(optimizer) 48 | 49 | torch.manual_seed(args.seed2) 50 | model.load_state_dict(model_state_dict) 51 | optimizer.load_state_dict(optimizer_state_dict) 52 | 53 | print("Model sparsity is %s" % ("enabled" if ASP.sparsity_is_enabled() else "disabled")) 54 | 55 | # train for a few steps with sparse weights 56 | print("SPARSE :: ",one_ll) 57 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 58 | 59 | if __name__ == '__main__': 60 | checkpoint = torch.load("part1.chkp") 61 | class Args: 62 | verbosity = checkpoint['verbosity'] 63 | seed = 4873 64 | seed2 = checkpoint['seed2'] 65 | pattern = checkpoint['pattern'] 66 | whitelist = checkpoint['whitelist'] 67 | allow_recompute_mask = checkpoint['allow_recompute_mask'] 68 | batch_size = 32 69 | input_features = 8 70 | output_features = 8 71 | hidden_features = 32 72 | num_layers = 4 73 | num_dense_steps = 2000 74 | num_sparse_steps = 3000 75 | num_sparse_steps_2 = 1000 76 | checkpoint_path = "part1.chkp" 77 | args = Args() 78 | 79 | 
main(checkpoint['step'], args, checkpoint['model_state_dict'], checkpoint['optimizer_state_dict']) 80 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/checkpointing_test_reference.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | # 8 | # Reference run for checkpointing test (part1 + part2) 9 | # 10 | 11 | def build_model(args): 12 | od = OrderedDict() 13 | for i in range(args.num_layers): 14 | if i == 0: 15 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 16 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 17 | elif i == args.num_layers-1: 18 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 19 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 20 | else: 21 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 22 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 23 | return torch.nn.Sequential(od) 24 | 25 | def train_step(args, model, optimizer, input_batch, target_batch, step): 26 | predicted_target = model(input_batch) 27 | loss = ((predicted_target-target_batch)**2).sum() 28 | loss.backward() 29 | optimizer.step() 30 | optimizer.zero_grad() 31 | step = step + 1 32 | #print("Step %d :: loss=%e" % (step, loss.item())) 33 | return step 34 | 35 | def train_loop(args, model, optimizer, step, num_steps): 36 | for i in range(num_steps): 37 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 38 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 39 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 40 | return step 41 | 42 | def main(args): 43 | # 44 | # PART1 45 | # 46 | 47 | torch.manual_seed(args.seed) 48 | 49 | model = build_model(args).cuda() 50 | one_ll = next(model.children()).weight 51 | optimizer = FusedAdam(model.parameters()) 52 | ASP.init_model_for_pruning(model, args.pattern, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask) 53 | ASP.init_optimizer_for_pruning(optimizer) 54 | 55 | step = 0 56 | 57 | # train for a few steps with dense weights 58 | print("DENSE :: ",one_ll) 59 | step = train_loop(args, model, optimizer, step, args.num_dense_steps) 60 | 61 | # simulate sparsity by inserting zeros into existing dense weights 62 | ASP.enable_sparsity() 63 | 64 | # train for a few steps with sparse weights 65 | print("SPARSE :: ",one_ll) 66 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps) 67 | 68 | # 69 | # PART 2 70 | # 71 | 72 | torch.manual_seed(args.seed2) 73 | 74 | # train for a few steps with sparse weights 75 | print("SPARSE :: ",one_ll) 76 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 77 | 78 | if __name__ == '__main__': 79 | class Args: 80 | seed = 4873 81 | seed2 = 99875 82 | pattern = "m4n2_2d_best" 83 | whitelist = [torch.nn.Linear] 84 | allow_recompute_mask = True 85 | batch_size = 32 86 | input_features = 8 87 | output_features = 8 88 | hidden_features = 32 89 | num_layers = 4 90 | num_dense_steps = 2000 91 | num_sparse_steps = 3000 92 | num_sparse_steps_2 = 1000 93 | checkpoint_path = "part1.chkp" 94 | args = Args() 95 | 96 | 
main(args) 97 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/toy_problem.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | def build_model(args): 8 | od = OrderedDict() 9 | for i in range(args.num_layers): 10 | if i == 0: 11 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 12 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 13 | elif i == args.num_layers-1: 14 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 15 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 16 | else: 17 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 18 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 19 | return torch.nn.Sequential(od) 20 | 21 | def train_step(args, model, optimizer, input_batch, target_batch, step): 22 | predicted_target = model(input_batch) 23 | loss = ((predicted_target-target_batch)**2).sum() 24 | loss.backward() 25 | optimizer.step() 26 | optimizer.zero_grad() 27 | step = step + 1 28 | #print("Step %d :: loss=%e" % (step, loss.item())) 29 | return step 30 | 31 | def train_loop(args, model, optimizer, step, num_steps): 32 | for i in range(num_steps): 33 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 34 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 35 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 36 | return step 37 | 38 | def main(args): 39 | model = build_model(args).cuda() 40 | one_ll = next(model.children()).weight 41 | optimizer = FusedAdam(model.parameters()) 42 | # only prune linear layers, even though we also support conv1d, conv2d and conv3d 43 | ASP.init_model_for_pruning(model, "m4n2_1d", whitelist=[torch.nn.Linear], allow_recompute_mask=True) 44 | ASP.init_optimizer_for_pruning(optimizer) 45 | 46 | step = 0 47 | 48 | # train for a few steps with dense weights 49 | print("DENSE :: ",one_ll) 50 | step = train_loop(args, model, optimizer, step, args.num_dense_steps) 51 | 52 | # simulate sparsity by inserting zeros into existing dense weights 53 | ASP.compute_sparse_masks() 54 | 55 | # train for a few steps with sparse weights 56 | print("SPARSE :: ",one_ll) 57 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps) 58 | 59 | # recompute sparse masks 60 | ASP.compute_sparse_masks() 61 | 62 | # train for a few steps with sparse weights 63 | print("SPARSE :: ",one_ll) 64 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 65 | 66 | # turn off sparsity 67 | print("SPARSE :: ",one_ll) 68 | ASP.restore_pruned_weights() 69 | 70 | # train for a few steps with dense weights 71 | print("DENSE :: ",one_ll) 72 | step = train_loop(args, model, optimizer, step, args.num_dense_steps_2) 73 | 74 | if __name__ == '__main__': 75 | class Args: 76 | batch_size = 32 77 | input_features = 16 78 | output_features = 8 79 | hidden_features = 40 80 | num_layers = 4 81 | num_dense_steps = 2000 82 | num_sparse_steps = 3000 83 | num_sparse_steps_2 = 1000 84 | num_dense_steps_2 = 1500 85 | args = Args() 86 | 87 | main(args) 88 | 
-------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import unittest 4 | 5 | from apex.contrib.multihead_attn import EncdecMultiheadAttn 6 | 7 | class EncdecMultiheadAttnNormAddTest(unittest.TestCase): 8 | def setUp(self, seed=1234): 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | 12 | self.seq_length = 80 13 | self.sequences = 10 14 | self.hidden_dim = 1024 15 | self.heads = 16 16 | self.dropout_prob = 0.0 17 | 18 | self.ref_layer = EncdecMultiheadAttn(self.hidden_dim, 19 | self.heads, 20 | dropout=self.dropout_prob, 21 | bias=False, 22 | include_norm_add=True, 23 | impl='default') 24 | self.ref_layer.cuda().half() 25 | self.ref_layer.reset_parameters() 26 | self.ref_inputs_q = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 27 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 28 | self.ref_inputs_k = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 29 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 30 | 31 | # Reset seed so parameters are identical 32 | torch.manual_seed(seed) 33 | torch.cuda.manual_seed_all(seed) 34 | 35 | self.tst_layer = EncdecMultiheadAttn(self.hidden_dim, 36 | self.heads, 37 | dropout=self.dropout_prob, 38 | bias=False, 39 | include_norm_add=True, 40 | impl='fast') 41 | self.tst_layer.cuda().half() 42 | self.tst_layer.reset_parameters() 43 | 44 | self.tst_inputs_q = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 45 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 46 | self.tst_inputs_k = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 47 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 48 | 49 | def test_encdec_multihead_attn_norm_add(self) : 50 | grads = torch.randn_like(self.tst_inputs_q) 51 | 52 | ref_outputs,_ = self.ref_layer.forward(self.ref_inputs_q, 53 | self.ref_inputs_k, 54 | self.ref_inputs_k, 55 | key_padding_mask=None, 56 | need_weights=False, 57 | attn_mask=None, 58 | is_training=True) 59 | 60 | tst_outputs,_ = self.tst_layer.forward(self.tst_inputs_q, 61 | self.tst_inputs_k, 62 | self.tst_inputs_k, 63 | key_padding_mask=None, 64 | need_weights=False, 65 | attn_mask=None, 66 | is_training=True) 67 | 68 | self.ref_inputs_q.backward(grads) 69 | self.tst_inputs_q.backward(grads) 70 | 71 | self.assertTrue(torch.allclose(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5)) 72 | self.assertTrue(torch.allclose(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5)) 73 | self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)) 74 | self.assertTrue(torch.allclose(self.ref_inputs_q.grad, self.tst_inputs_q.grad, atol=1e-3, rtol=1e-3)) 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import unittest 4 | 5 | from apex.contrib.multihead_attn import SelfMultiheadAttn 6 | 7 | class SelfMultiheadAttnTest(unittest.TestCase): 8 | def setUp(self, seed=1234): 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | 12 | self.seq_length = 80 13 | self.sequences = 10 14 | 
self.hidden_dim = 1024 15 | self.heads = 16 16 | self.dropout_prob = 0.0 17 | 18 | self.ref_layer = SelfMultiheadAttn(self.hidden_dim, 19 | self.heads, 20 | dropout=self.dropout_prob, 21 | bias=True, 22 | include_norm_add=False, 23 | separate_qkv_params=True, 24 | mask_additive=True, 25 | impl='default') 26 | self.ref_layer.cuda().half() 27 | self.ref_layer.reset_parameters() 28 | self.ref_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 29 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 30 | # Reset seed so parameters are identical 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed_all(seed) 33 | 34 | self.tst_layer = SelfMultiheadAttn(self.hidden_dim, 35 | self.heads, 36 | dropout=self.dropout_prob, 37 | bias=True, 38 | include_norm_add=False, 39 | separate_qkv_params=True, 40 | mask_additive=True, 41 | impl='fast') 42 | self.tst_layer.cuda().half() 43 | self.tst_layer.reset_parameters() 44 | 45 | self.tst_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 46 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 47 | 48 | def test_self_multihead_attn_additive_mask(self) : 49 | grads = torch.randn_like(self.tst_inputs) 50 | mask = ((torch.randn(self.sequences, self.seq_length) > 0) * -10000.0).half().cuda() 51 | 52 | ref_outputs,_ = self.ref_layer.forward(self.ref_inputs, 53 | self.ref_inputs, 54 | self.ref_inputs, 55 | key_padding_mask=mask, 56 | need_weights=False, 57 | attn_mask=None, 58 | is_training=True) 59 | 60 | tst_outputs,_ = self.tst_layer.forward(self.tst_inputs, 61 | self.tst_inputs, 62 | self.tst_inputs, 63 | key_padding_mask=mask, 64 | need_weights=False, 65 | attn_mask=None, 66 | is_training=True) 67 | 68 | 69 | self.ref_inputs.backward(grads) 70 | self.tst_inputs.backward(grads) 71 | 72 | self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)) 73 | self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)) 74 | self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)) 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_mha_fused_softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | import torch.nn.functional as F 4 | from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func 5 | 6 | class FusedSoftmaxTest(unittest.TestCase): 7 | def setUp(self, seed=1234): 8 | torch.manual_seed(seed) 9 | torch.cuda.manual_seed_all(seed) 10 | 11 | self.seq_length = 80 12 | self.sequences = 10 13 | self.hidden_dim = 1024 14 | self.heads = 16 15 | self.dropout_prob = 0.0 16 | 17 | self.mask = (torch.randn(self.sequences,self.seq_length)>0).cuda() 18 | self.mask = self.mask.half()*-10000 19 | self.ref_inputs = torch.randn(self.heads * self.sequences, self.seq_length, self.seq_length, 20 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 21 | 22 | self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True) 23 | 24 | def test_fused_softmax(self) : 25 | grads = torch.randn_like(self.tst_inputs) 26 | y_ref = self.ref_inputs.view(self.sequences, self.heads, self.seq_length, self.seq_length) 27 | y_ref = y_ref + self.mask.unsqueeze(1).unsqueeze(2) 28 | y_ref = y_ref.view(self.sequences*self.heads, self.seq_length, self.seq_length) 29 | y_ref = F.softmax(y_ref, 
dim=-1) 30 | y_ref = torch._fused_dropout(y_ref, 1.0) 31 | 32 | y_tst = fast_mask_softmax_dropout_func(True, self.heads, self.tst_inputs, self.mask, True, 0.0) 33 | y_ref[0].backward(grads) 34 | y_tst.backward(grads) 35 | 36 | self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)) 37 | self.assertTrue(torch.allclose(y_ref[0], y_tst, atol=1e-3, rtol=1e-3)) 38 | self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import unittest 4 | 5 | from apex.contrib.multihead_attn import SelfMultiheadAttn 6 | 7 | class SelfMultiheadAttnNormAddTest(unittest.TestCase): 8 | def setUp(self, seed=1234): 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | 12 | self.seq_length = 80 13 | self.sequences = 10 14 | self.hidden_dim = 1024 15 | self.heads = 16 16 | self.dropout_prob = 0.0 17 | 18 | self.ref_layer = SelfMultiheadAttn(self.hidden_dim, 19 | self.heads, 20 | dropout=self.dropout_prob, 21 | bias=False, 22 | include_norm_add=True, 23 | impl='default') 24 | self.ref_layer.cuda().half() 25 | self.ref_layer.reset_parameters() 26 | self.ref_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 27 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 28 | 29 | # Reset seed so parameters are identical 30 | torch.manual_seed(seed) 31 | torch.cuda.manual_seed_all(seed) 32 | 33 | self.tst_layer = SelfMultiheadAttn(self.hidden_dim, 34 | self.heads, 35 | dropout=self.dropout_prob, 36 | bias=False, 37 | include_norm_add=True, 38 | impl='fast') 39 | self.tst_layer.cuda().half() 40 | self.tst_layer.reset_parameters() 41 | 42 | self.tst_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 43 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 44 | 45 | def test_self_multihead_attn_norm_add(self) : 46 | grads = torch.randn_like(self.tst_inputs) 47 | 48 | ref_outputs,_ = self.ref_layer.forward(self.ref_inputs, 49 | self.ref_inputs, 50 | self.ref_inputs, 51 | key_padding_mask=None, 52 | need_weights=False, 53 | attn_mask=None, 54 | is_training=True) 55 | 56 | tst_outputs,_ = self.tst_layer.forward(self.tst_inputs, 57 | self.tst_inputs, 58 | self.tst_inputs, 59 | key_padding_mask=None, 60 | need_weights=False, 61 | attn_mask=None, 62 | is_training=True) 63 | 64 | self.ref_inputs.backward(grads) 65 | self.tst_inputs.backward(grads) 66 | 67 | self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)) 68 | self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)) 69 | self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)) 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /apex/contrib/transducer/__init__.py: -------------------------------------------------------------------------------- 1 | from .transducer import TransducerJoint 2 | from .transducer import TransducerLoss -------------------------------------------------------------------------------- /apex/contrib/xentropy/__init__.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | import xentropy_cuda 4 | from .softmax_xentropy import SoftmaxCrossEntropyLoss 5 | del torch 6 | del xentropy_cuda 7 | del softmax_xentropy 8 | except ImportError as err: 9 | print("apex was installed without --xentropy flag, contrib.xentropy is not available") 10 | -------------------------------------------------------------------------------- /apex/contrib/xentropy/softmax_xentropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import xentropy_cuda 3 | 4 | class SoftmaxCrossEntropyLoss(torch.autograd.Function): 5 | @staticmethod 6 | def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False): 7 | losses, max_log_sum_exp = xentropy_cuda.forward( 8 | logits, labels, smoothing, half_to_float) 9 | losses.masked_fill_(labels==padding_idx, 0) 10 | 11 | ctx.save_for_backward(logits, max_log_sum_exp, labels, 12 | torch.FloatTensor([smoothing]), 13 | torch.LongTensor([padding_idx])) 14 | 15 | return losses 16 | 17 | @staticmethod 18 | def backward(ctx, grad_loss): 19 | logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors 20 | 21 | if not grad_loss.is_contiguous(): 22 | grad_loss = grad_loss.contiguous() 23 | grad_loss.masked_fill_(labels==padding_idx.item(), 0) 24 | grad_logits = xentropy_cuda.backward( 25 | grad_loss.contiguous(), logits, max_log_sum_exp, 26 | labels, smoothing.item()) 27 | 28 | return grad_logits, None, None, None, None 29 | -------------------------------------------------------------------------------- /apex/fp16_utils/README.md: -------------------------------------------------------------------------------- 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's Python model need to change. 2 | 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) 4 | 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple) 6 | 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 8 | 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) 10 | 11 | 12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses. 13 | 14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management) 15 | 16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically. 
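As a rough illustration of the two-line change described above (a minimal sketch; constructor options such as `static_loss_scale` and `dynamic_loss_scale` are covered in the API documentation linked above):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(512, 512).cuda().half()                 # placeholder fp16 model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)  # change 1: wrap the existing optimizer

x = torch.randn(64, 512, dtype=torch.float16, device="cuda")
loss = model(x).float().sum()
optimizer.zero_grad()
optimizer.backward(loss)                                        # change 2: instead of loss.backward()
optimizer.step()
```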
17 | -------------------------------------------------------------------------------- /apex/fp16_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16util import ( 2 | BN_convert_float, 3 | network_to_half, 4 | prep_param_lists, 5 | model_grads_to_master_grads, 6 | master_params_to_model_params, 7 | tofp16, 8 | to_python_float, 9 | clip_grad_norm, 10 | convert_module, 11 | convert_network, 12 | FP16Model, 13 | ) 14 | 15 | from .fp16_optimizer import FP16_Optimizer 16 | from .loss_scaler import LossScaler, DynamicLossScaler 17 | -------------------------------------------------------------------------------- /apex/mlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import * 2 | -------------------------------------------------------------------------------- /apex/mlp/mlp.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | import math 3 | import torch 4 | from torch import nn 5 | import mlp_cuda 6 | from .. import amp 7 | 8 | class MlpFunction(torch.autograd.Function): 9 | @staticmethod 10 | def forward(ctx, bias, activation, *args): 11 | output = mlp_cuda.forward(bias, activation, args) 12 | ctx.save_for_backward(*args) 13 | ctx.outputs = output 14 | ctx.bias = bias 15 | ctx.activation = activation 16 | return output[0] 17 | 18 | @staticmethod 19 | def backward(ctx, grad_o): 20 | grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors) 21 | del ctx.outputs 22 | return (None, None, *grads) 23 | 24 | mlp_function = amp.half_function(MlpFunction.apply) 25 | 26 | class MLP(torch.nn.Module): 27 | """Launch MLP in C++ 28 | 29 | Args: 30 | mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024 31 | bias (bool): Default True 32 | activation (str): 'none', 'relu' or 'sigmoid'. Default 'relu' 33 | """ 34 | def __init__(self, mlp_sizes, bias=True, activation='relu'): 35 | super(MLP, self).__init__() 36 | self.num_layers = len(mlp_sizes) - 1 37 | self.mlp_sizes = copy(mlp_sizes) 38 | self.bias = 1 if bias else 0 39 | 40 | if activation == 'none': 41 | self.activation = 0 42 | elif activation == 'relu': 43 | self.activation = 1 44 | elif activation == 'sigmoid': 45 | self.activation = 2 46 | else: 47 | raise TypeError("activation must be 'none', 'relu' or 'sigmoid'.") 48 | 49 | self.weights = [] 50 | self.biases = [] 51 | for i in range(self.num_layers): 52 | w = torch.nn.Parameter(torch.empty(mlp_sizes[i+1], mlp_sizes[i])) 53 | self.weights.append(w) 54 | name = 'weight_{}'.format(i) 55 | setattr(self, name, w) 56 | if self.bias: 57 | b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1])) 58 | self.biases.append(b) 59 | name = 'bias_{}'.format(i) 60 | setattr(self, name, b) 61 | 62 | self.reset_parameters() 63 | 64 | def reset_parameters(self): 65 | for weight in self.weights: 66 | dimsum = weight.size(0) + weight.size(1) 67 | std = math.sqrt(2. / float(dimsum)) 68 | nn.init.normal_(weight, 0., std) 69 | if self.bias: 70 | for bias in self.biases: 71 | std = math.sqrt(1.
/ float(bias.size(0))) 72 | nn.init.normal_(bias, 0., std) 73 | 74 | def forward(self, input): 75 | return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases) 76 | 77 | def extra_repr(self): 78 | s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}" 79 | return s 80 | -------------------------------------------------------------------------------- /apex/multi_tensor_apply/__init__.py: -------------------------------------------------------------------------------- 1 | from .multi_tensor_apply import MultiTensorApply 2 | 3 | multi_tensor_applier = MultiTensorApply(2048*32) 4 | 5 | -------------------------------------------------------------------------------- /apex/multi_tensor_apply/multi_tensor_apply.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class MultiTensorApply(object): 4 | available = False 5 | warned = False 6 | 7 | def __init__(self, chunk_size): 8 | try: 9 | import amp_C 10 | MultiTensorApply.available = True 11 | self.chunk_size = chunk_size 12 | except ImportError as err: 13 | MultiTensorApply.available = False 14 | MultiTensorApply.import_err = err 15 | 16 | def check_avail(self): 17 | if MultiTensorApply.available == False: 18 | raise RuntimeError( 19 | "Attempted to call MultiTensorApply method, but MultiTensorApply " 20 | "is not available, possibly because Apex was installed without " 21 | "--cpp_ext --cuda_ext. Original import error message:", 22 | MultiTensorApply.import_err) 23 | 24 | def __call__(self, op, noop_flag_buffer, tensor_lists, *args): 25 | self.check_avail() 26 | 27 | return op(self.chunk_size, 28 | noop_flag_buffer, 29 | tensor_lists, 30 | *args) 31 | -------------------------------------------------------------------------------- /apex/normalization/__init__.py: -------------------------------------------------------------------------------- 1 | from .fused_layer_norm import FusedLayerNorm 2 | -------------------------------------------------------------------------------- /apex/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .fused_sgd import FusedSGD 2 | from .fused_adam import FusedAdam 3 | from .fused_novograd import FusedNovoGrad 4 | from .fused_lamb import FusedLAMB 5 | from .fused_adagrad import FusedAdagrad -------------------------------------------------------------------------------- /apex/parallel/README.md: -------------------------------------------------------------------------------- 1 | ## Distributed Data Parallel 2 | 3 | distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library. 4 | 5 | `apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with 6 | computation in the backward pass and bucketing smaller transfers to reduce the total number of 7 | transfers required. 8 | 9 | multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs. 
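A minimal sketch of how a training script might use the wrapper together with the launcher (the `--rank`/`--world-size` arguments are the ones `multiproc.py` appends; the rendezvous address below is a placeholder):

```python
# main.py -- launch with: python -m apex.parallel.multiproc main.py ...
import argparse
import torch
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--rank", type=int, default=0)         # set per process by multiproc
parser.add_argument("--world-size", type=int, default=1)   # set by multiproc
args = parser.parse_args()

torch.cuda.set_device(args.rank)
torch.distributed.init_process_group(
    backend="nccl", init_method="tcp://127.0.0.1:23456",   # placeholder rendezvous
    world_size=args.world_size, rank=args.rank)

model = torch.nn.Linear(10, 10).cuda()                     # placeholder model
model = DDP(model)   # gradients are all-reduced and overlapped with the backward pass
```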
10 |  11 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html) 12 |  13 | #### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed) 14 |  15 | #### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 16 |  17 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex) 18 |  19 | ### Synchronized Batch Normalization 20 |  21 | `apex.parallel.SyncBatchNorm` has an API similar to `torch.nn.BatchNorm*N*d`. 22 | It reduces stats on the first (channel) dimension of the Tensor and accepts 23 | arbitrary spatial dimensions. 24 |  25 | #### Installation 26 |  27 | Apex provides two sync BN implementations: 28 |  29 | 1. A Python-only implementation, which is the default 30 | when installing with `python setup.py install`. 31 | It uses PyTorch primitive operations and the distributed communication package from 32 | `torch.distributed`. 33 |  34 | - _The Python-only implementation requires the input tensor to be of the same data type as 35 | the layer._ 36 |  37 | 2. An implementation with custom kernels through a CUDA/C++ extension, with 38 | improved performance. We are experimenting with Welford and Kahan summation for the reduction, 39 | hoping to get better accuracy. 40 | To use the kernel implementation, users need to install Apex with the CUDA extension 41 | enabled: `python setup.py install --cuda_ext`. 42 |  43 | - _The custom kernel implementation supports fp16 input with an fp32 layer, as cuDNN does. 44 | This is required to run the imagenet example in fp16._ 45 |  46 | - _Currently the kernel implementation only supports GPU._ 47 |  48 | #### HowTo 49 |  50 | 1. Users can use `apex.parallel.SyncBatchNorm` by building their module with 51 | the layer explicitly. 52 |  53 | ``` 54 | import apex 55 | input_t = torch.randn(3, 5, 20).cuda() 56 | sbn = apex.parallel.SyncBatchNorm(5).cuda() 57 | output_t = sbn(input_t) 58 | ``` 59 |  60 | 2. Users can also take a constructed `torch.nn.Module` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through the utility function `apex.parallel.convert_syncbn_model`. 61 |  62 | ``` 63 | # model is an instance of torch.nn.Module 64 | import apex 65 | sync_bn_model = apex.parallel.convert_syncbn_model(model) 66 | ``` 67 | -------------------------------------------------------------------------------- /apex/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 |  3 | if hasattr(torch.distributed, 'ReduceOp'): 4 | ReduceOp = torch.distributed.ReduceOp 5 | elif hasattr(torch.distributed, 'reduce_op'): 6 | ReduceOp = torch.distributed.reduce_op 7 | else: 8 | ReduceOp = torch.distributed.deprecated.reduce_op 9 |  10 | from .distributed import DistributedDataParallel, Reducer 11 | # This is tricky because I'd like SyncBatchNorm to be exposed the same way 12 | # for both the cuda-enabled and python-fallback versions, and I don't want 13 | # to suppress the error information. 14 | try: 15 | import syncbn 16 | from .optimized_sync_batchnorm import SyncBatchNorm 17 | except ImportError as err: 18 | from .sync_batchnorm import SyncBatchNorm 19 | SyncBatchNorm.syncbn_import_error = err 20 |  21 | def convert_syncbn_model(module, process_group=None, channel_last=False): 22 | ''' 23 | Recursively traverse module and its children to replace all instances of 24 | ``torch.nn.modules.batchnorm._BatchNorm`` with :class:`apex.parallel.SyncBatchNorm`.
25 | 26 | All ``torch.nn.BatchNorm*N*d`` wrap around 27 | ``torch.nn.modules.batchnorm._BatchNorm``, so this function lets you easily switch 28 | to use sync BN. 29 | 30 | Args: 31 | module (torch.nn.Module): input module 32 | 33 | Example:: 34 | 35 | >>> # model is an instance of torch.nn.Module 36 | >>> import apex 37 | >>> sync_bn_model = apex.parallel.convert_syncbn_model(model) 38 | ''' 39 | mod = module 40 | if isinstance(module, torch.nn.modules.instancenorm._InstanceNorm): 41 | return module 42 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 43 | mod = SyncBatchNorm(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats, process_group, channel_last=channel_last) 44 | mod.running_mean = module.running_mean 45 | mod.running_var = module.running_var 46 | mod.num_batches_tracked = module.num_batches_tracked 47 | if module.affine: 48 | mod.weight.data = module.weight.data.clone().detach() 49 | mod.bias.data = module.bias.data.clone().detach() 50 | for name, child in module.named_children(): 51 | mod.add_module(name, convert_syncbn_model(child, 52 | process_group=process_group, 53 | channel_last=channel_last)) 54 | # TODO(jie) should I delete model explicitly? 55 | del module 56 | return mod 57 | 58 | def create_syncbn_process_group(group_size): 59 | ''' 60 | Creates process groups to be used for syncbn of a give ``group_size`` and returns 61 | process group that current GPU participates in. 62 | 63 | ``group_size`` must divide the total number of GPUs (world_size). 64 | 65 | ``group_size`` of 0 would be considered as =world_size. In this case ``None`` will be returned. 66 | 67 | ``group_size`` of 1 would be equivalent to using non-sync bn, but will still carry the overhead. 68 | 69 | Args: 70 | group_size (int): number of GPU's to collaborate for sync bn 71 | 72 | Example:: 73 | 74 | >>> # model is an instance of torch.nn.Module 75 | >>> import apex 76 | >>> group = apex.parallel.create_syncbn_process_group(group_size) 77 | ''' 78 | 79 | if group_size==0: 80 | return None 81 | 82 | world_size = torch.distributed.get_world_size() 83 | assert(world_size >= group_size) 84 | assert(world_size % group_size == 0) 85 | 86 | group=None 87 | for group_num in (range(world_size//group_size)): 88 | group_ids = range(group_num*group_size, (group_num+1)*group_size) 89 | cur_group = torch.distributed.new_group(ranks=group_ids) 90 | if (torch.distributed.get_rank()//group_size == group_num): 91 | group = cur_group 92 | #can not drop out and return here, every process must go through creation of all subgroups 93 | 94 | assert(group is not None) 95 | return group 96 | -------------------------------------------------------------------------------- /apex/parallel/multiproc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import subprocess 4 | 5 | def docstring_hack(): 6 | """ 7 | Multiproc file which will launch a set of processes locally for multi-gpu 8 | usage: python -m apex.parallel.multiproc main.py ... 
9 | """ 10 | pass 11 | 12 | argslist = list(sys.argv)[1:] 13 | world_size = torch.cuda.device_count() 14 | 15 | if '--world-size' in argslist: 16 | world_size = int(argslist[argslist.index('--world-size')+1]) 17 | else: 18 | argslist.append('--world-size') 19 | argslist.append(str(world_size)) 20 | 21 | workers = [] 22 | 23 | for i in range(world_size): 24 | if '--rank' in argslist: 25 | argslist[argslist.index('--rank')+1] = str(i) 26 | else: 27 | argslist.append('--rank') 28 | argslist.append(str(i)) 29 | stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w") 30 | print(argslist) 31 | p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) 32 | workers.append(p) 33 | 34 | for p in workers: 35 | p.wait() 36 | -------------------------------------------------------------------------------- /apex/parallel/sync_batchnorm_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd.function import Function 3 | 4 | from apex.parallel import ReduceOp 5 | 6 | 7 | class SyncBatchnormFunction(Function): 8 | 9 | @staticmethod 10 | def forward(ctx, input, weight, bias, running_mean, running_variance, eps, process_group, world_size): 11 | torch.cuda.nvtx.range_push("sync_BN_fw") 12 | # transpose it to channel last to support broadcasting for input with different rank 13 | c_last_input = input.transpose(1, -1).contiguous().clone() 14 | 15 | ctx.save_for_backward(c_last_input, weight, bias, 16 | running_mean, running_variance) 17 | ctx.eps = eps 18 | ctx.process_group = process_group 19 | ctx.world_size = world_size 20 | 21 | c_last_input = (c_last_input - running_mean) / \ 22 | torch.sqrt(running_variance + eps) 23 | 24 | if weight is not None: 25 | c_last_input = c_last_input * weight 26 | if bias is not None: 27 | c_last_input = c_last_input + bias 28 | 29 | torch.cuda.nvtx.range_pop() 30 | return c_last_input.transpose(1, -1).contiguous().clone() 31 | 32 | @staticmethod 33 | def backward(ctx, grad_output): 34 | torch.cuda.nvtx.range_push("sync_BN_bw") 35 | # mini batch mean & var are calculated by forward path. 36 | # mu = 1./N*np.sum(h, axis = 0) 37 | # var = 1./N*np.sum((h-mu)**2, axis = 0) 38 | c_last_input, weight, bias, running_mean, running_variance = ctx.saved_tensors 39 | 40 | eps = ctx.eps 41 | process_group = ctx.process_group 42 | world_size = ctx.world_size 43 | grad_input = grad_weight = grad_bias = None 44 | num_features = running_mean.size()[0] 45 | 46 | # transpose it to channel last to support broadcasting for input with different rank 47 | torch.cuda.nvtx.range_push("carilli field") 48 | c_last_grad = grad_output.transpose(1, -1).contiguous() 49 | # squash non-channel dimension so we can easily calculate mean 50 | c_grad = c_last_grad.view(-1, num_features).contiguous() 51 | torch.cuda.nvtx.range_pop() 52 | 53 | # calculate grad_input 54 | if ctx.needs_input_grad[0]: 55 | # dh = gamma * (var + eps)**(-1. / 2.) 
* (dy - np.mean(dy, axis=0) 56 | # - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0)) 57 | mean_dy = c_grad.mean(0) 58 | mean_dy_xmu = (c_last_grad * (c_last_input - 59 | running_mean)).view(-1, num_features).mean(0) 60 | if torch.distributed.is_initialized(): 61 | torch.distributed.all_reduce( 62 | mean_dy, ReduceOp.SUM, process_group) 63 | mean_dy = mean_dy / world_size 64 | torch.distributed.all_reduce( 65 | mean_dy_xmu, ReduceOp.SUM, process_group) 66 | mean_dy_xmu = mean_dy_xmu / world_size 67 | c_last_grad_input = (c_last_grad - mean_dy - (c_last_input - running_mean) / ( 68 | running_variance + eps) * mean_dy_xmu) / torch.sqrt(running_variance + eps) 69 | if weight is not None: 70 | c_last_grad_input.mul_(weight) 71 | grad_input = c_last_grad_input.transpose(1, -1).contiguous() 72 | 73 | # calculate grad_weight 74 | grad_weight = None 75 | if weight is not None and ctx.needs_input_grad[1]: 76 | # dgamma = np.sum((h - mu) * (var + eps)**(-1. / 2.) * dy, axis=0) 77 | grad_weight = ((c_last_input - running_mean) / torch.sqrt( 78 | running_variance + eps) * c_last_grad).view(-1, num_features).sum(0) 79 | 80 | # calculate grad_bias 81 | grad_bias = None 82 | if bias is not None and ctx.needs_input_grad[2]: 83 | # dbeta = np.sum(dy, axis=0) 84 | grad_bias = c_grad.sum(0) 85 | 86 | torch.cuda.nvtx.range_pop() 87 | return grad_input, grad_weight, grad_bias, None, None, None, None, None 88 | -------------------------------------------------------------------------------- /apex/pyprof/FAQs.md: -------------------------------------------------------------------------------- 1 | 1. How do I intercept the Adam optimizer in APEX ? 2 | 3 | ```python 4 | from apex import pyprof 5 | import fused_adam_cuda 6 | pyprof.nvtx.wrap(fused_adam_cuda, 'adam') 7 | ``` 8 | 9 | 2. If you are using JIT and/or AMP, the correct initialization sequence is 10 | 1. Let any JIT to finish. 11 | 2. Initlialize pyprof `pyprof.nvtx.init()`. 12 | 3. Initialize AMP. 13 | 14 | 3. How do I profile with `torch.distributed.launch` ? 15 | 16 | ```python 17 | nvprof -f -o net%p.sql \ 18 | --profile-from-start off \ 19 | --profile-child-processes \ 20 | python -m torch.distributed.launch net.py 21 | ``` 22 | -------------------------------------------------------------------------------- /apex/pyprof/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from . import nvtx, prof 4 | -------------------------------------------------------------------------------- /apex/pyprof/examples/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.sql 3 | *.dict 4 | *.csv 5 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/README.md: -------------------------------------------------------------------------------- 1 | This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`. 
2 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/fused_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import fused_adam_cuda 3 | from apex.optimizers import FusedAdam, FP16_Optimizer 4 | from apex import pyprof 5 | 6 | pyprof.nvtx.init() 7 | pyprof.nvtx.wrap(fused_adam_cuda, 'adam') 8 | 9 | model = torch.nn.Linear(10, 20).cuda().half() 10 | criterion = torch.nn.CrossEntropyLoss().cuda() 11 | optimizer = FusedAdam(model.parameters()) 12 | optimizer = FP16_Optimizer(optimizer) 13 | 14 | x = torch.ones(32, 10).cuda().half() 15 | target = torch.empty(32, dtype=torch.long).random_(20).cuda() 16 | y = model(x) 17 | loss = criterion(y, target) 18 | optimizer.zero_grad() 19 | loss.backward() 20 | optimizer.step() 21 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import fused_layer_norm_cuda 3 | from apex.normalization import FusedLayerNorm 4 | from apex import pyprof 5 | 6 | pyprof.nvtx.init() 7 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward') 8 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward') 9 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine') 10 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine') 11 | 12 | input = torch.randn(20, 5, 10, 10).cuda() 13 | 14 | # With Learnable Parameters 15 | m = FusedLayerNorm(input.size()[1:]).cuda() 16 | output = m(input) 17 | 18 | # Without Learnable Parameters 19 | m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda() 20 | output = m(input) 21 | 22 | # Normalize over last two dimensions 23 | m = FusedLayerNorm([10, 10]).cuda() 24 | output = m(input) 25 | 26 | # Normalize over last dimension of size 10 27 | m = FusedLayerNorm(10).cuda() 28 | output = m(input) 29 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql python $f" 20 | nvprof -fo $sql python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | $prof -w 130 $dict 29 | \rm $sql $dict 30 | done 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/README.md: -------------------------------------------------------------------------------- 1 | This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`, however, users can add support for bytes and flops calculation for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class. 
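A rough sketch of what such an extension might look like. The import path and the method set below (`op`, `mod`, `tc`, `params`, `flops`, `bytes`) are assumptions about the `OperatorLayerBase` interface; check `pyprof/prof/base.py` for the actual abstract methods before relying on this:

```python
from apex.pyprof.prof.base import OperatorLayerBase   # assumed import path

class FooProf(OperatorLayerBase):
    """Hypothetical flop/byte accounting for a custom 'foo' op: elementwise add of two n-by-n fp32 tensors."""

    def __init__(self, n):
        self.n = n

    def op(self):      return "foo"
    def mod(self):     return "Foo"
    def tc(self):      return "-"                       # no tensor cores used
    def params(self):  return {"shape": (self.n, self.n), "dtype": "float32"}
    def flops(self):   return self.n * self.n           # one add per element
    def bytes(self):   return 3 * 4 * self.n * self.n   # read two fp32 tensors, write one
```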
2 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/custom_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | #Initialize pyprof 7 | pyprof.nvtx.init() 8 | 9 | class Foo(torch.autograd.Function): 10 | @staticmethod 11 | def forward(ctx, in1, in2): 12 | out = in1 + in2 #This could be a custom C/C++ function. 13 | return out 14 | 15 | @staticmethod 16 | def backward(ctx, grad): 17 | in1_grad = grad #This could be a custom C/C++ function. 18 | in2_grad = grad #This could be a custom C/C++ function. 19 | return in1_grad, in2_grad 20 | 21 | #Hook the forward and backward functions to pyprof 22 | pyprof.nvtx.wrap(Foo, 'forward') 23 | pyprof.nvtx.wrap(Foo, 'backward') 24 | 25 | foo = Foo.apply 26 | 27 | x = torch.ones(4,4).cuda() 28 | y = torch.ones(4,4).cuda() 29 | 30 | with torch.autograd.profiler.emit_nvtx(): 31 | profiler.start() 32 | z = foo(x,y) 33 | profiler.stop() 34 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/custom_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | pyprof.nvtx.init() 7 | 8 | class Foo(torch.nn.Module): 9 | def __init__(self, size): 10 | super(Foo, self).__init__() 11 | self.n = torch.nn.Parameter(torch.ones(size)) 12 | self.m = torch.nn.Parameter(torch.ones(size)) 13 | 14 | def forward(self, input): 15 | return self.n*input + self.m 16 | 17 | #Hook the forward function to pyprof 18 | pyprof.nvtx.wrap(Foo, 'forward') 19 | 20 | foo = Foo(4) 21 | foo.cuda() 22 | x = torch.ones(4).cuda() 23 | 24 | with torch.autograd.profiler.emit_nvtx(): 25 | profiler.start() 26 | z = foo(x) 27 | profiler.stop() 28 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql python $f" 20 | nvprof -fo $sql python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | $prof -w 130 $dict 29 | \rm $sql $dict 30 | done 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/imagenet/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 
8 | 9 | parse="python -m apex.pyprof.parse" 10 | prof="python -m apex.pyprof.prof" 11 | 12 | for net in "resnet50" 13 | do 14 | for optim in adam sgd 15 | do 16 | for batch in 32 64 17 | do 18 | base="torchvision".$net.$optim.$batch 19 | sql=$base.sql 20 | dict=$base.dict 21 | 22 | #NVprof 23 | echo "nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch" 24 | nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch 25 | 26 | #Parse 27 | echo $parse $sql 28 | $parse $sql > $dict 29 | 30 | #Prof 31 | echo $prof $dict 32 | $prof -w 130 $dict 33 | # \rm $sql $dict 34 | done 35 | done 36 | done 37 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/README.md: -------------------------------------------------------------------------------- 1 | *As of this writing, these examples do not work 2 | because of changes being proposed in PyTorch.* 3 | 4 | There are two ways to use PyTorch JIT 5 | - Scripting 6 | - Tracing 7 | 8 | In addition, we can JIT a 9 | - Stand alone function 10 | - Class / class method 11 | 12 | This directory has an example for each of the 4 cases. 13 | Intercepting (monkey patching) JITted code has a few extra steps, 14 | which are explained through comments. 15 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_script_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | #The following creates an object "foo" of type ScriptModule 8 | #The new object has a function called "forward" 9 | 10 | @torch.jit.script 11 | def foo(x, y): 12 | return torch.sigmoid(x) + y 13 | 14 | #Initialize pyprof after the JIT step 15 | pyprof.nvtx.init() 16 | 17 | #Assign a name to the object "foo" 18 | foo.__name__ = "foo" 19 | 20 | #Hook up the forward function to pyprof 21 | pyprof.nvtx.wrap(foo, 'forward') 22 | 23 | x = torch.zeros(4,4).cuda() 24 | y = torch.ones(4,4).cuda() 25 | 26 | with torch.autograd.profiler.emit_nvtx(): 27 | profiler.start() 28 | z = foo(x, y) 29 | profiler.stop() 30 | print(z) 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_script_method.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | class Foo(torch.jit.ScriptModule): 8 | def __init__(self, size): 9 | super(Foo, self).__init__() 10 | self.n = torch.nn.Parameter(torch.ones(size)) 11 | self.m = torch.nn.Parameter(torch.ones(size)) 12 | 13 | @torch.jit.script_method 14 | def forward(self, input): 15 | return self.n*input + self.m 16 | 17 | #Initialize pyprof after the JIT step 18 | pyprof.nvtx.init() 19 | 20 | #Hook up the forward function to pyprof 21 | pyprof.nvtx.wrap(Foo, 'forward') 22 | 23 | foo = Foo(4) 24 | foo.cuda() 25 | x = torch.ones(4).cuda() 26 | 27 | with torch.autograd.profiler.emit_nvtx(): 28 | profiler.start() 29 | z = foo(x) 30 | profiler.stop() 31 | print(z) 32 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_trace_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import 
torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | def foo(x, y): 8 | return torch.sigmoid(x) + y 9 | 10 | x = torch.zeros(4,4).cuda() 11 | y = torch.ones(4,4).cuda() 12 | 13 | #JIT the function using tracing 14 | #This returns an object of type ScriptModule with a forward method. 15 | traced_foo = torch.jit.trace(foo, (x,y)) 16 | 17 | #Initialize pyprof after the JIT step 18 | pyprof.nvtx.init() 19 | 20 | #Assign a name to the object "traced_foo" 21 | traced_foo.__dict__['__name__'] = "foo" 22 | 23 | #Hook up the forward function to pyprof 24 | pyprof.nvtx.wrap(traced_foo, 'forward') 25 | 26 | with torch.autograd.profiler.emit_nvtx(): 27 | profiler.start() 28 | z = traced_foo(x, y) 29 | profiler.stop() 30 | print(z) 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_trace_method.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | class Foo(torch.nn.Module): 8 | def __init__(self, size): 9 | super(Foo, self).__init__() 10 | self.n = torch.nn.Parameter(torch.ones(size)) 11 | self.m = torch.nn.Parameter(torch.ones(size)) 12 | 13 | def forward(self, input): 14 | return self.n*input + self.m 15 | 16 | foo = Foo(4) 17 | foo.cuda() 18 | x = torch.ones(4).cuda() 19 | 20 | #JIT the class using tracing 21 | traced_foo = torch.jit.trace(foo, x) 22 | 23 | #Initialize pyprof after the JIT step 24 | pyprof.nvtx.init() 25 | 26 | #Assign a name to the object "traced_foo" 27 | traced_foo.__dict__['__name__'] = "foo" 28 | 29 | #Hook up the forward function to pyprof 30 | pyprof.nvtx.wrap(traced_foo, 'forward') 31 | 32 | with torch.autograd.profiler.emit_nvtx(): 33 | profiler.start() 34 | z = traced_foo(x) 35 | profiler.stop() 36 | print(z) 37 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 
8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql python $f" 20 | nvprof -fo $sql python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | $prof -w 130 $dict 29 | \rm $sql $dict 30 | done 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/lenet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.cuda.profiler as profiler 7 | import torch.optim as optim 8 | 9 | from apex import pyprof 10 | pyprof.nvtx.init() 11 | 12 | class LeNet5(nn.Module): 13 | def __init__(self): 14 | super(LeNet5, self).__init__() 15 | # 1 input image channel, 6 output channels, 5x5 square convolution 16 | # kernel 17 | self.conv1 = nn.Conv2d(1, 6, 5) 18 | self.conv2 = nn.Conv2d(6, 16, 5) 19 | # an affine operation: y = Wx + b 20 | self.fc1 = nn.Linear(16 * 5 * 5, 120) 21 | self.fc2 = nn.Linear(120, 84) 22 | self.fc3 = nn.Linear(84, 10) 23 | 24 | def forward(self, x): 25 | # Max pooling over a (2, 2) window 26 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) 27 | # If the size is a square you can only specify a single number 28 | x = F.max_pool2d(F.relu(self.conv2(x)), 2) 29 | x = x.view(-1, self.num_flat_features(x)) 30 | x = F.relu(self.fc1(x)) 31 | x = F.relu(self.fc2(x)) 32 | x = self.fc3(x) 33 | return x 34 | 35 | def num_flat_features(self, x): 36 | size = x.size()[1:] # all dimensions except the batch dimension 37 | num_features = 1 38 | for s in size: 39 | num_features *= s 40 | return num_features 41 | 42 | with torch.autograd.profiler.emit_nvtx(): 43 | 44 | net = LeNet5().cuda() 45 | 46 | input = torch.randn(1, 1, 32, 32).cuda() 47 | out = net(input) 48 | 49 | target = torch.randn(10) # a dummy target, for example 50 | target = target.view(1, -1).cuda() # make it the same shape as output 51 | criterion = nn.MSELoss() 52 | 53 | # create your optimizer 54 | optimizer = optim.SGD(net.parameters(), lr=0.01) 55 | 56 | # in your training loop: 57 | optimizer.zero_grad() # zero the gradient buffers 58 | 59 | profiler.start() 60 | output = net(input) 61 | loss = criterion(output, target) 62 | loss.backward() 63 | optimizer.step() # Does the update 64 | profiler.stop() 65 | 66 | -------------------------------------------------------------------------------- /apex/pyprof/examples/operators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This file checks all Python operators. 
5 | """ 6 | 7 | import sys 8 | import torch 9 | import torch.cuda.profiler as profiler 10 | import operator 11 | import inspect 12 | 13 | #Import and initialize pyprof 14 | from apex import pyprof 15 | pyprof.nvtx.init() 16 | 17 | X = 1024 18 | Y = 1024 19 | 20 | fa = torch.rand(X, Y).cuda() 21 | fb = torch.rand(X, Y).cuda() 22 | fc = torch.rand(X, Y).cuda() 23 | 24 | ia = torch.randint(0, 100, (X, Y)).cuda() 25 | ib = torch.randint(0, 100, (X, Y)).cuda() 26 | 27 | sa = torch.ones(1,1).cuda() 28 | sb = torch.ones(1,1).cuda() 29 | 30 | ba = fa.byte() 31 | 32 | unaryOps = ["abs", "__abs__", "neg", "__neg__",] 33 | invertOps = ["inv", "invert", "__inv__", "__invert__",] #imlemented only for byte tensors 34 | #pos, __pos__ is not implemented for tensors 35 | 36 | binaryOps = [] 37 | binaryOps += [ "lt", "__lt__", "le", "__le__", "eq", "__eq__", "ne", "__ne__", "ge", "__ge__", "gt", "__gt__" ] 38 | binaryOps += [ "add", "__add__", "sub", "__sub__", "mul", "__mul__", "floordiv", "__floordiv__", "truediv", "__truediv__", "pow", "__pow__", "mod", "__mod__"] 39 | binaryOps += [ "and_", "__and__", "or_", "__or__", "xor", "__xor__", "lshift", "__lshift__", "rshift", "__rshift__"] 40 | 41 | inplaceOps = [] 42 | inplaceOps += ["iadd", "__iadd__", "isub", "__isub__", "imul", "__imul__", "ifloordiv", "__ifloordiv__", "itruediv", "__itruediv__", "imod", "__imod__",] 43 | #ipow, __ipow__ is not implemented in pytorch 44 | inplaceOps += [ "iand", "__iand__", "ior", "__ior__", "ixor", "__ixor__", "ilshift", "__ilshift__", "irshift", "__irshift__",] 45 | 46 | matmulOps = [ "matmul", "__matmul__" ] 47 | inplacematmulOps = [ "imatmul", "__imatmul__" ] 48 | 49 | reverseIntBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rfloordiv__", "__rpow__",] 50 | reverseFloatBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__",] 51 | 52 | ''' 53 | TODO 54 | .concat(a, b) 55 | .__concat__(a, b) 56 | .contains(a, b) 57 | .__contains__(a, b) 58 | .countOf(a, b) 59 | .delitem(a, b) 60 | .__delitem__(a, b) 61 | .getitem(a, b) 62 | .__getitem__(a, b) 63 | .indexOf(a, b) 64 | .setitem(a, b, c) 65 | .__setitem__(a, b, c) 66 | .length_hint(obj, default=0) 67 | .iconcat(a, b) 68 | .__iconcat__(a, b) 69 | .index(a) 70 | .__index__(a) 71 | ''' 72 | 73 | #Context manager 74 | with torch.autograd.profiler.emit_nvtx(): 75 | 76 | #Start profiler 77 | profiler.start() 78 | 79 | for op in unaryOps: 80 | assert hasattr(operator, op) 81 | f = getattr(operator, op) 82 | assert inspect.isbuiltin(f) 83 | c = f(ia) 84 | 85 | for op in invertOps: 86 | assert hasattr(operator, op) 87 | f = getattr(operator, op) 88 | assert inspect.isbuiltin(f) 89 | c = f(ba) 90 | 91 | for op in binaryOps: 92 | assert hasattr(operator, op) 93 | f = getattr(operator, op) 94 | assert inspect.isbuiltin(f) 95 | c = f(ia, ib) 96 | c = f(ia, 2) 97 | 98 | for op in inplaceOps: 99 | assert hasattr(operator, op) 100 | f = getattr(operator, op) 101 | assert inspect.isbuiltin(f) 102 | ia = f(ia, ib) 103 | ia = f(ia, 2) 104 | 105 | for op in matmulOps: 106 | assert hasattr(operator, op) 107 | f = getattr(operator, op) 108 | assert inspect.isbuiltin(f) 109 | c = f(fa, fb) 110 | 111 | for op in inplacematmulOps: 112 | assert hasattr(operator, op) 113 | f = getattr(operator, op) 114 | assert inspect.isbuiltin(f) 115 | fa = f(fa, fb) 116 | 117 | for op in reverseIntBinaryOps: 118 | assert hasattr(torch.Tensor, op) 119 | f = getattr(torch.Tensor, op) 120 | ia = f(ia, ib) 121 | 122 | for op in reverseFloatBinaryOps: 123 | assert 
hasattr(torch.Tensor, op) 124 | f = getattr(torch.Tensor, op) 125 | fa = f(fa, fb) 126 | 127 | ''' 128 | #c = fa[3] 129 | #c = fa[3][3] 130 | #c = torch.min(fa, 3) 131 | c = torch.sum(fa) 132 | c = torch.max(fa) 133 | c = -fa 134 | #fc[2][2] = fa[2][2] 135 | 136 | c = a_scalar and b_scalar 137 | c = a_scalar or b_scalar 138 | c = not a_scalar 139 | 140 | c = a is b 141 | c = a is not b 142 | ''' 143 | 144 | #Stop profiler 145 | profiler.stop() 146 | -------------------------------------------------------------------------------- /apex/pyprof/examples/simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This simple file provides an example of how to 5 | - import the pyprof library and initialize it 6 | - use the emit_nvtx context manager 7 | - start and stop the profiler 8 | 9 | Only kernels within profiler.start and profiler.stop calls are profiled. 10 | To profile 11 | $ nvprof -f -o simple.sql --profile-from-start off ./simple.py 12 | """ 13 | 14 | import sys 15 | import torch 16 | import torch.cuda.profiler as profiler 17 | 18 | #Import and initialize pyprof 19 | from apex import pyprof 20 | pyprof.nvtx.init() 21 | 22 | a = torch.randn(5, 5).cuda() 23 | b = torch.randn(5, 5).cuda() 24 | 25 | #Context manager 26 | with torch.autograd.profiler.emit_nvtx(): 27 | 28 | #Start profiler 29 | profiler.start() 30 | 31 | c = a + b 32 | c = torch.mul(a,b) 33 | c = torch.matmul(a,b) 34 | c = torch.argmax(a, dim=1) 35 | c = torch.nn.functional.pad(a, (1,1)) 36 | 37 | #Stop profiler 38 | profiler.stop() 39 | -------------------------------------------------------------------------------- /apex/pyprof/examples/user_annotation/README.md: -------------------------------------------------------------------------------- 1 | Nvidia NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm) 2 | are a useful tool to capture and observe events and code ranges etc. 3 | Using PyTorch APIs e.g, `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()` users can easily add their own NVTX range markers. These markers can then be observed in the Nvidia Visual Profiler (NVVP). 4 | 5 | While inserting NVTX markers (strings), if the users follow a specific string pattern `"layer:your_string_here"` e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention`, then `pyprof` will display the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when used with the `-c layer` option. 6 | 7 | NVTX range markers can be nested and if users follow the above string pattern, the output of `prof.py` will show all the markers associated with a kernel. 8 | 9 | The file `resnet.py` (a simplified version of the torchvision model) shows an example of how users can add (nested) NVTX markers with information which can greatly aid in understanding and analysis of networks. 10 | 11 | Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`. The tool will work seamlessly even if there are other markers or no markers at all. 12 | 13 | ### To run 14 | 15 | ```sh 16 | nvprof -fo resnet.sql --profile-from-start off python resnet.py 17 | parse.py resnet.sql > resnet.dict 18 | prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict 19 | ``` 20 | 21 | The file `resnet.sql` can also be opened with NVVP as usual. 
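As a minimal sketch of the `"layer:..."` pattern described above (the module and marker names below are made up for illustration and are not taken from `resnet.py`), nested NVTX ranges can be pushed and popped around sub-modules like this:

```python
import torch
import torch.cuda.nvtx as nvtx

class Block(torch.nn.Module):
    # Toy module; "block1" and the nested marker names are illustrative only.
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        nvtx.range_push("layer:block1")        # outer marker for the whole block
        nvtx.range_push("layer:block1_conv")   # nested marker for the convolution
        x = self.conv(x)
        nvtx.range_pop()
        nvtx.range_push("layer:block1_relu")   # nested marker for the activation
        x = self.relu(x)
        nvtx.range_pop()
        nvtx.range_pop()
        return x
```

Kernels launched inside these ranges should then show `block1`, `block1_conv` and `block1_relu` in the `layer` column when `prof.py` is run with a `-c` list that includes `layer`, as in the command above.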
22 | -------------------------------------------------------------------------------- /apex/pyprof/examples/user_annotation/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql --profile-from-start off python $f" 20 | nvprof -fo $sql --profile-from-start off python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | #$prof -w 130 $dict 29 | $prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict 30 | \rm $sql $dict 31 | done 32 | -------------------------------------------------------------------------------- /apex/pyprof/nvtx/__init__.py: -------------------------------------------------------------------------------- 1 | from .nvmarker import init 2 | from .nvmarker import add_wrapper as wrap 3 | -------------------------------------------------------------------------------- /apex/pyprof/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/pyprof/parse/__init__.py -------------------------------------------------------------------------------- /apex/pyprof/parse/__main__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | try: 4 | from .parse import main 5 | except ImportError as e: 6 | warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?") 7 | raise e 8 | 9 | if __name__ == '__main__': 10 | main() 11 | -------------------------------------------------------------------------------- /apex/pyprof/parse/db.py: -------------------------------------------------------------------------------- 1 | import sys, sqlite3 2 | 3 | class DB(object): 4 | """ 5 | This class provides functions for DB operations 6 | with exception handling.
7 | """ 8 | 9 | def __init__(self, dbFile): 10 | try: 11 | conn = sqlite3.connect(dbFile) 12 | conn.row_factory = sqlite3.Row 13 | c = conn.cursor() 14 | except: 15 | print("Error opening {}".format(dbFile)) 16 | sys.exit(1) 17 | 18 | self.conn = conn 19 | self.c = c 20 | 21 | def select(self, cmd): 22 | try: 23 | self.c.execute(cmd) 24 | #rows = self.c.fetchall() 25 | rows = [dict(row) for row in self.c.fetchall()] 26 | except sqlite3.Error as e: 27 | print(e) 28 | sys.exit(1) 29 | except: 30 | print("Uncaught error in SQLite access while executing {}".format(cmd)) 31 | sys.exit(1) 32 | 33 | #print(rows) 34 | return rows 35 | 36 | def insert(self, cmd, data): 37 | try: 38 | self.c.execute(cmd, data) 39 | except sqlite3.Error as e: 40 | print(e) 41 | sys.exit(1) 42 | except: 43 | print("Uncaught error in SQLite access while executing {}".format(cmd)) 44 | sys.exit(1) 45 | 46 | def execute(self, cmd): 47 | try: 48 | self.c.execute(cmd) 49 | except sqlite3.Error as e: 50 | print(e) 51 | sys.exit(1) 52 | except: 53 | print("Uncaught error in SQLite access while executing {}".format(cmd)) 54 | sys.exit(1) 55 | 56 | def commit(self): 57 | self.conn.commit() 58 | 59 | def close(self): 60 | self.c.close() 61 | self.conn.close() 62 | -------------------------------------------------------------------------------- /apex/pyprof/parse/parse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Parse the SQL db and print a dictionary for every kernel. 5 | """ 6 | 7 | import sys 8 | import argparse 9 | from tqdm import tqdm 10 | 11 | from .db import DB 12 | from .kernel import Kernel 13 | from .nvvp import NVVP 14 | 15 | def parseArgs(): 16 | parser = argparse.ArgumentParser(prog=sys.argv[0], description="Parse SQL (nvvp) db.") 17 | parser.add_argument("file", 18 | type=str, 19 | default=None, 20 | help="SQL db (nvvp) file.") 21 | 22 | args = parser.parse_args() 23 | return args 24 | 25 | def main(): 26 | args = parseArgs() 27 | 28 | db = DB(args.file) 29 | nvvp = NVVP(db) 30 | 31 | kInfo = nvvp.getKernelInfo() 32 | if len(kInfo) == 0: 33 | print("Found 0 kernels. Exiting.", file=sys.stderr) 34 | db.close() 35 | sys.exit(0) 36 | else: 37 | print("Found {} kernels. Getting info for each kernel.".format(len(kInfo)), file=sys.stderr) 38 | 39 | nvvp.createMarkerTable() 40 | 41 | prevSeqId = -1 42 | prevSubSeqId = -1 43 | prevOp = "na" 44 | 45 | Kernel.profStart = nvvp.getProfileStart() 46 | 47 | for i in tqdm(range(len(kInfo)), ascii=True): 48 | info = kInfo[i] 49 | k = Kernel() 50 | 51 | #Set kernel info 52 | k.setKernelInfo(info) 53 | 54 | #Get, set kernel name 55 | name = nvvp.getString(k.kNameId) 56 | k.setKernelName(name) 57 | 58 | #Get runtime info 59 | info = nvvp.getCPUInfo(k.corrId) 60 | k.setRunTimeInfo(info) 61 | 62 | #Get and set marker and seqid info 63 | info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime) 64 | k.setMarkerInfo(info) 65 | 66 | #If the seqId contains both 0 and non zero integers, remove 0. 67 | if any(seq != 0 for seq in k.seqId) and (0 in k.seqId): 68 | k.seqId.remove(0) 69 | 70 | #Set direction (it uses seq id) 71 | k.setDirection() 72 | 73 | #Set op 74 | k.setOp() 75 | 76 | #The following code is based on heuristics. 77 | #TODO: Refactor. 78 | #Assign subSeqId, adjust seqId and altSeqId 79 | #seqId can be 0. 80 | #A kernel can have multiple seqIds both in fprop and bprop. 81 | #In bprop, seqIds might not decrease monotonically. I have observed a few blips. 
82 | if len(k.seqId): 83 | assert (k.dir in ["fprop", "bprop"]) 84 | if (k.dir == "fprop"): 85 | #Check if there is a sequence id larger than the previous 86 | inc = (k.seqId[-1] > prevSeqId) 87 | if inc: 88 | currSeqId = [x for x in k.seqId if x > prevSeqId][0] 89 | else: 90 | currSeqId = prevSeqId 91 | else: 92 | currSeqId = k.seqId[0] 93 | 94 | #if ((currSeqId == prevSeqId) and (k.op == prevOp)): 95 | if ((currSeqId == prevSeqId) and (k.op == prevOp)) or ((k.op[0] == "forward") and (k.op == prevOp) and (k.mod[0] in ["LSTMCell", "GRUCell", "RNNCell"])): 96 | #The second condition is to trap cases when pytorch does not use cudnn for a LSTMCell. 97 | k.subSeqId = prevSubSeqId + 1 98 | 99 | prevSeqId = currSeqId 100 | prevSubSeqId = k.subSeqId 101 | prevOp = k.op 102 | 103 | #Keep currSeqId in k.seqId, move everything else to k.altSeqId 104 | for s in k.seqId: 105 | if s != currSeqId: 106 | k.seqId.remove(s) 107 | k.altSeqId.append(s) 108 | 109 | for s in k.altSeqId: 110 | if s == currSeqId: 111 | k.altSeqId.remove(s) 112 | 113 | k.altSeqId = list(set(k.altSeqId)) 114 | if (len(k.altSeqId)): 115 | (k.altSeqId).sort() 116 | 117 | k.print() 118 | 119 | db.close() 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /apex/pyprof/prof/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, prof 2 | -------------------------------------------------------------------------------- /apex/pyprof/prof/__main__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | try: 4 | from .prof import main 5 | except ImportError as e: 6 | warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?") 7 | raise e 8 | 9 | if __name__ == '__main__': 10 | main() 11 | -------------------------------------------------------------------------------- /apex/pyprof/prof/activation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Activation(OperatorLayerBase): 6 | """ 7 | This class handles the various activation functions. 
8 | """ 9 | 10 | ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"] 11 | 12 | def __init__(self, d): 13 | marker = eval(d.argMarker[0]) 14 | mod = marker['mod'] 15 | op = marker['op'] 16 | args = marker['args'] 17 | 18 | self.marker = marker 19 | self.mod_ = mod 20 | self.op_ = op 21 | self.args = args 22 | 23 | assert (mod in ["torch.nn.functional", "torch", "Tensor"]) 24 | 25 | #Filter out named parameters 26 | args = list(filter(lambda x : x['name'] == '', args)) 27 | 28 | assert (len(args) >= 1) 29 | arg = args[0] 30 | assert (arg['type'] == "tensor") 31 | 32 | self.i = arg 33 | self.dir = d.dir 34 | 35 | def params(self): 36 | p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])]) 37 | return p 38 | 39 | def flops(self): 40 | direction = self.dir 41 | tensor = self.i['shape'] 42 | t = self.i['dtype'] 43 | 44 | # TODO: revise 45 | elems = Utility.numElems(tensor) 46 | return elems 47 | 48 | def bytes(self): 49 | direction = self.dir 50 | tensor = self.i['shape'] 51 | t = self.i['dtype'] 52 | 53 | elems = Utility.numElems(tensor) 54 | elems = elems * (2 if direction == "fprop" else 3) 55 | 56 | return elems * Utility.typeToBytes(t) 57 | 58 | def tc(self): 59 | return "-" 60 | 61 | def op(self): 62 | return self.op_ 63 | 64 | def mod(self): 65 | return self.mod_ 66 | -------------------------------------------------------------------------------- /apex/pyprof/prof/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | class OperatorLayerBase(ABC): 4 | """ 5 | Base class for all layers and operators. 6 | Every derived class should have the following functions. 7 | """ 8 | 9 | @abstractmethod 10 | def tc(self): 11 | """ 12 | Tensor core usage by the kernel. 13 | Return "1" (yes), "0" (no, but possible), "-" (not applicable) 14 | """ 15 | pass 16 | 17 | @abstractmethod 18 | def params(self): 19 | """ 20 | Kernel parameters to be printed. 21 | """ 22 | pass 23 | 24 | @abstractmethod 25 | def flops(self): 26 | """ 27 | Note that 1 FMA = 2 flops. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def bytes(self): 33 | pass 34 | 35 | @abstractmethod 36 | def mod(self): 37 | """ 38 | Name of the module/class e.g. torch.nn.functional. 39 | """ 40 | pass 41 | 42 | @abstractmethod 43 | def op(self): 44 | """ 45 | Name of the operator e.g. sigmoid. 46 | """ 47 | pass 48 | -------------------------------------------------------------------------------- /apex/pyprof/prof/convert.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Convert(OperatorLayerBase): 6 | """ 7 | Class to handle convert operations. 
8 | """ 9 | ops = ["byte", "char", "double", "float", "half", "int", "long", "short", "to"] 10 | 11 | def __init__(self, d): 12 | marker = eval(d.argMarker[0]) 13 | mod = marker['mod'] 14 | op = marker['op'] 15 | args = marker['args'] 16 | 17 | self.marker = marker 18 | self.mod_ = mod 19 | self.op_ = op 20 | self.args = args 21 | 22 | assert (mod == "Tensor") 23 | assert (op in Convert.ops) 24 | assert (len(args) == 1) 25 | 26 | #The argument could be a tensor or scalar 27 | t = args[0] 28 | if t['type'] == "tensor": 29 | shape = t['shape'] 30 | stype = t['dtype'] 31 | else: 32 | shape = (1,) 33 | stype = t['type'] 34 | if self.op_ == "to": 35 | op = stype 36 | 37 | self.shape = shape 38 | self.stype = stype 39 | self.dtype = op 40 | 41 | def params(self): 42 | p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)]) 43 | return p 44 | 45 | def op(self): 46 | return self.op_ 47 | 48 | def mod(self): 49 | return self.mod_ 50 | 51 | def tc(self): 52 | return "-" 53 | 54 | def elems(self): 55 | return Utility.numElems(self.shape) 56 | 57 | def flops(self): 58 | return 0 59 | 60 | def bytes(self): 61 | b = self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype)) 62 | return b 63 | -------------------------------------------------------------------------------- /apex/pyprof/prof/data.py: -------------------------------------------------------------------------------- 1 | from .utility import Utility 2 | 3 | class Data(object): 4 | """ 5 | Class to store all the data for every kernel e.g. name, bytes, flops, device, stream etc. 6 | """ 7 | def __init__(self, kernel): 8 | #Available from NVprof 9 | self.tid = kernel['tid'] 10 | self.device = kernel['device'] 11 | self.stream = kernel['stream'] 12 | self.grid = str(kernel['grid']).replace(" ","").replace("(","").replace(")","") 13 | self.block = str(kernel['block']).replace(" ","").replace("(","").replace(")","") 14 | self.name = kernel['kShortName'].replace(" ","_") 15 | self.lName = kernel['kLongName'] 16 | self.sil = kernel['kDuration'] #units ns 17 | 18 | self.index = None 19 | 20 | #Markers 21 | self.argMarker = kernel['marker'] 22 | self.modMarker = kernel['reprMarkers'] 23 | self.seqMarker = kernel['seqMarker'] 24 | 25 | self.layer = kernel['layer'] 26 | self.trace = kernel['trace'] 27 | 28 | self.seqId = kernel['seqId'] 29 | self.altSeqId = kernel['altSeqId'] 30 | 31 | self.dir = kernel['dir'] 32 | self.sub = kernel['subSeqId'] 33 | 34 | self.mod = "na" 35 | self.op = "na" 36 | self.params = {"na":"na"} 37 | self.tc = "na" 38 | self.flops = 0 39 | self.bytes = 0 40 | 41 | def setParams(self, params): 42 | #Remove space from params 43 | qaz = "" 44 | for key,value in params.items(): 45 | if "type" not in key: 46 | qaz += "{}={},".format(key,value) 47 | else: 48 | if type(value) is str: 49 | qaz += "{},".format(Utility.typeToString(value)) 50 | else: 51 | qaz += "{}".format(value) 52 | 53 | self.params = qaz.replace(" ", "") 54 | 55 | -------------------------------------------------------------------------------- /apex/pyprof/prof/dropout.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Dropout(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | 
self.args = args 17 | 18 | assert (mod == "torch.nn.functional") 19 | assert (op == "dropout") 20 | #assert (len(args) == 1) 21 | 22 | self.shape = args[0]['shape'] 23 | self.type = args[0]['dtype'] 24 | self.dir = d.dir 25 | 26 | return 27 | 28 | def params(self): 29 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 30 | return p 31 | 32 | def op(self): 33 | return self.op_ 34 | 35 | def mod(self): 36 | return self.mod_ 37 | 38 | def tc(self): 39 | return "-" 40 | 41 | def elems(self): 42 | return Utility.numElems(self.shape) 43 | 44 | def bytes(self): 45 | #Ignoring the cost of writing and reading the mask 46 | return Utility.typeToBytes(self.type) * self.elems() * 2 47 | 48 | def flops(self): 49 | # Note: This is approximate and depends on the RNG 50 | return 5*self.elems() 51 | -------------------------------------------------------------------------------- /apex/pyprof/prof/embedding.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Embedding(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod == "torch.nn.functional") 19 | assert (op == "embedding") 20 | 21 | self.ishape = args[0]['shape'] 22 | self.itype = args[0]['dtype'] 23 | 24 | self.eshape = args[1]['shape'] 25 | self.etype = args[1]['dtype'] 26 | 27 | assert (len(self.eshape) == 2) 28 | 29 | self.dir = d.dir 30 | self.sub = d.sub 31 | return 32 | 33 | def params(self): 34 | p = OrderedDict([('I', self.ishape), ('itype', self.itype), ('E', self.eshape), ('etype', self.etype)]) 35 | return p 36 | 37 | def op(self): 38 | return self.op_ 39 | 40 | def mod(self): 41 | return self.mod_ 42 | 43 | def tc(self): 44 | return "-" 45 | 46 | def bytes(self): 47 | ishape = self.ishape 48 | itype = self.itype 49 | eshape = self.eshape 50 | etype = self.etype 51 | 52 | ielems = Utility.numElems(ishape) 53 | 54 | b = 0 55 | if self.dir == "fprop": 56 | #indices 57 | b += ielems * Utility.typeToBytes(itype) 58 | #read and write the embedding matrix 59 | b += ielems * eshape[1] * 2 * Utility.typeToBytes(etype) 60 | else: 61 | #3 times the size of the incoming gradient 62 | b = ielems * eshape[1] * 3 * Utility.typeToBytes(etype) 63 | 64 | if self.sub > 0: 65 | b = 0 66 | 67 | return b 68 | 69 | def flops(self): 70 | # Note: not implemented yet 71 | return 0 72 | -------------------------------------------------------------------------------- /apex/pyprof/prof/loss.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | #TODO: Add support for additional loss functions. 
6 | 7 | class MSELoss(OperatorLayerBase): 8 | 9 | def __init__(self, d): 10 | marker = eval(d.argMarker[0]) 11 | mod = marker['mod'] 12 | op = marker['op'] 13 | args = marker['args'] 14 | 15 | self.marker = marker 16 | self.mod_ = mod 17 | self.op_ = op 18 | self.args = args 19 | 20 | assert (mod == "torch.nn.functional") 21 | assert (op == "mse_loss") 22 | assert (len(args) == 3) 23 | 24 | #Get input, target and reduction 25 | if (args[0]['name'] == ""): 26 | x = args[0] 27 | else: 28 | x = list(filter(lambda x : x['name'] == "input", args))[0] 29 | 30 | if (args[1]['name'] == ""): 31 | y = args[1] 32 | else: 33 | y = list(filter(lambda x : x['name'] == "target", args))[0] 34 | 35 | if (args[2]['name'] == ""): 36 | r = args[2] 37 | else: 38 | r = list(filter(lambda x : x['name'] == "reduction", args))[0] 39 | 40 | assert (x['type'] == y['type'] == "tensor") 41 | assert (x['shape'] == y['shape']) 42 | assert (x['dtype'] == y['dtype']) 43 | assert (r['type'] == "str") 44 | assert (r['value'] in ["none", "mean", "sum"]) 45 | 46 | self.shape = x['shape'] 47 | self.type = x['dtype'] 48 | self.red = r['value'] 49 | self.dir = d.dir 50 | 51 | def params(self): 52 | p = OrderedDict([('T', self.shape), ('type', self.type), ('red', self.red)]) 53 | return p 54 | 55 | def elems(self): 56 | red = self.red 57 | e = Utility.numElems(self.shape) 58 | 59 | if self.dir == "fprop": 60 | if red == "none": 61 | e *= 3 62 | else: 63 | e *= 2 64 | else: 65 | if red == "none": 66 | e *= 4 67 | else: 68 | e *= 3 69 | return e 70 | 71 | def bytes(self): 72 | return self.elems() * Utility.typeToBytes(self.type) 73 | 74 | def flops(self): 75 | return self.elems() * 2 + 1 76 | 77 | def tc(self): 78 | return "-" 79 | 80 | def op(self): 81 | return self.op_ 82 | 83 | def mod(self): 84 | return self.mod_ 85 | -------------------------------------------------------------------------------- /apex/pyprof/prof/normalization.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class BatchNorm(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (op == "batch_norm") 19 | assert (len(args) == 8) 20 | i = args[0] 21 | assert (i['type'] == "tensor") 22 | 23 | self.shape = i['shape'] 24 | self.type = i['dtype'] 25 | self.dir = d.dir 26 | 27 | def params(self): 28 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 29 | return p 30 | 31 | def tc(self): 32 | return "-" 33 | 34 | def op(self): 35 | return self.op_ 36 | 37 | def mod(self): 38 | return self.mod_ 39 | 40 | def elems(self): 41 | return Utility.numElems(self.shape) 42 | 43 | def flops(self): 44 | # Variance algo-dependent, but this is a reasonable value. 45 | return self.elems() * 8 46 | 47 | def bytes(self): 48 | e = self.elems() 49 | if self.dir == "fprop": 50 | e *= 4 51 | else: 52 | e *= 5 53 | 54 | return e * Utility.typeToBytes(self.type) 55 | -------------------------------------------------------------------------------- /apex/pyprof/prof/optim.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | #TODO: Add support for other optimizers. 
6 | 7 | class Adam(OperatorLayerBase): 8 | 9 | def __init__(self, d): 10 | marker = eval(d.argMarker[0]) 11 | mod = marker['mod'] 12 | op = marker['op'] 13 | args = marker['args'] 14 | 15 | self.marker = marker 16 | self.mod_ = mod 17 | self.op_ = op 18 | self.args = args 19 | 20 | assert(op == "adam") 21 | assert (len(args) == 12) or (len(args) == 14) 22 | w, hw, m, v, g = args[0:5] 23 | assert (w['shape'] == m['shape'] == v['shape'] == g['shape']) 24 | assert (hw['shape'] == w['shape']) or (hw['shape'] == (0,)) #hw could be null 25 | assert (w['type'] == m['type'] == v['type'] == g['type'] == hw['type'] == "tensor") 26 | assert (w['dtype'] == m['dtype'] == v['dtype'] == "float32") 27 | 28 | self.w = w 29 | self.g = g 30 | 31 | def params(self): 32 | p = OrderedDict([('T',self.w['shape']), ('wtype',self.w['dtype']), ('gtype',self.g['dtype'])]) 33 | return p 34 | 35 | def flops(self): 36 | return 0 37 | 38 | def bytes(self): 39 | wshape = self.w['shape'] 40 | wtype = self.w['dtype'] 41 | gtype = self.g['dtype'] 42 | b = 0 43 | 44 | elems = Utility.numElems(wshape) 45 | 46 | #Get time to stream read/write w, m, v 47 | b += 6 * elems * Utility.typeToBytes(wtype) 48 | 49 | #Get time to read "g" 50 | b += elems * Utility.typeToBytes(gtype) 51 | 52 | if wtype != gtype: #mixed precision 53 | #Get time to write "hw" 54 | b += elems * Utility.typeToBytes(gtype) 55 | 56 | return b 57 | 58 | def tc(self): 59 | return "-" 60 | 61 | def op(self): 62 | return self.op_ 63 | 64 | def mod(self): 65 | return self.mod_ 66 | -------------------------------------------------------------------------------- /apex/pyprof/prof/pooling.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | 4 | # Work in progress.
5 | 6 | #poolFuncs = ["max_pool2d_with_indices_forward", "max_pool2d_with_indices"] 7 | class MaxPool2d(object): 8 | 9 | def parse(marker): 10 | 11 | def convert2Tuple(arg): 12 | assert (arg['type'] in ["int", "tuple"]) 13 | if arg['type'] == "int": 14 | return (arg['value'], arg['value']) 15 | else: 16 | return arg['value'] 17 | 18 | mod = marker['mod'] 19 | op = marker['op'] 20 | args = marker['args'] 21 | assert (mod == "torch.nn.functional") 22 | assert (op == "max_pool2d") 23 | assert (len(args) >= 2) 24 | 25 | #input 26 | assert (args[0]['name'] == "") 27 | inp = args[0] 28 | assert (inp['type'] == "tensor") 29 | i = inp['shape'] 30 | t = inp['dtype'] 31 | assert (len(i) == 4) #nchw tensor 32 | 33 | #kernel 34 | if (args[1]['name'] == ""): 35 | k = args[1] 36 | else: 37 | k = list(filter(lambda x : x['name'] == "kernel_size", args))[0] 38 | k = convert2Tuple(k) 39 | 40 | #stride 41 | s = k #default value 42 | if ((len(args) >= 3) and args[2] == ""): 43 | s = args[2] 44 | s = convert2Tuple(s) 45 | elif any(x['name'] == "stride" for x in args): 46 | s = list(filter(lambda x : x['name'] == "stride", args))[0] 47 | s = convert2Tuple(s) 48 | 49 | #padding 50 | p = (0,0) 51 | if ((len(args) >= 4) and args[3] == ""): 52 | p = args[3] 53 | p = convert2Tuple(p) 54 | elif any(x['name'] == "padding" for x in args): 55 | p = list(filter(lambda x : x['name'] == "padding", args))[0] 56 | p = convert2Tuple(p) 57 | 58 | params = OrderedDict([('T', i), ('K', k), ('s',s), ('p',p), ('type', t)]) 59 | return params 60 | -------------------------------------------------------------------------------- /apex/pyprof/prof/randomSample.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class RandPerm(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod == "torch") 19 | assert (op == "randperm") 20 | assert (len(args) == 1) 21 | n = args[0] 22 | assert n['type'] == "int" 23 | self.n = n['value'] 24 | 25 | def params(self): 26 | p = OrderedDict([('N', self.n)]) 27 | return p 28 | 29 | def tc(self): 30 | return "-" 31 | 32 | def op(self): 33 | return self.op_ 34 | 35 | def mod(self): 36 | return self.mod_ 37 | 38 | def bytes(self): 39 | return self.n * Utility.typeToBytes("int64") 40 | 41 | def flops(self): 42 | # Depends on RNG but this is probably a reasonable assumption. 
43 | return self.n * 3 44 | -------------------------------------------------------------------------------- /apex/pyprof/prof/reduction.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Mean(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod in ["torch", "Tensor"]) 19 | assert (op == "mean") 20 | 21 | #Filter out named parameters 22 | args = list(filter(lambda x : x['name'] == '', args)) 23 | 24 | assert (len(args) <= 2) 25 | i = args[0] 26 | 27 | self.shape = i['shape'] 28 | self.type = i['dtype'] 29 | self.dir = d.dir 30 | self.sub = d.sub 31 | 32 | def params(self): 33 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 34 | return p 35 | 36 | def tc(self): 37 | return "-" 38 | 39 | def op(self): 40 | return self.op_ 41 | 42 | def mod(self): 43 | return self.mod_ 44 | 45 | def elems(self): 46 | return Utility.numElems(self.shape) 47 | 48 | def bytes(self): 49 | if self.sub == 0: 50 | return self.elems() * Utility.typeToBytes(self.type) 51 | else: 52 | return 0 53 | 54 | def flops(self): 55 | if self.sub == 0: 56 | return self.elems() + 1 57 | else: 58 | return 0 59 | 60 | class Sum(OperatorLayerBase): 61 | 62 | def __init__(self, d): 63 | marker = eval(d.argMarker[0]) 64 | mod = marker['mod'] 65 | op = marker['op'] 66 | args = marker['args'] 67 | 68 | self.marker = marker 69 | self.mod_ = mod 70 | self.op_ = op 71 | self.args = args 72 | 73 | assert (mod in ["torch", "Tensor"]) 74 | assert (op == "sum") 75 | assert (len(args) >= 1) 76 | 77 | #Get input 78 | if (args[0]['name'] == ""): 79 | i = args[0] 80 | else: 81 | i = list(filter(lambda x : x['name'] == "input", args))[0] 82 | 83 | self.shape = i['shape'] 84 | self.type = i['dtype'] 85 | 86 | def params(self): 87 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 88 | return p 89 | 90 | def tc(self): 91 | return "-" 92 | 93 | def op(self): 94 | return self.op_ 95 | 96 | def mod(self): 97 | return self.mod_ 98 | 99 | def elems(self): 100 | return Utility.numElems(self.shape) 101 | 102 | def flops(self): 103 | # Note: This is incorrect, need to calculate actual flops (say via nvprof) 104 | return self.elems() 105 | 106 | def bytes(self): 107 | return self.elems() * Utility.typeToBytes(self.type) 108 | 109 | class Norm(OperatorLayerBase): 110 | 111 | def __init__(self, d): 112 | marker = eval(d.argMarker[0]) 113 | mod = marker['mod'] 114 | op = marker['op'] 115 | args = marker['args'] 116 | 117 | self.marker = marker 118 | self.mod_ = mod 119 | self.op_ = op 120 | self.args = args 121 | 122 | assert (mod in ["torch", "Tensor"]) 123 | assert (op == "norm") 124 | #assert (len(args) == 1) 125 | i = args[0] 126 | self.shape = i['shape'] 127 | self.type = i['dtype'] 128 | 129 | def params(self): 130 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 131 | return p 132 | 133 | def elems(self): 134 | return Utility.numElems(self.shape) 135 | 136 | def bytes(self): 137 | return self.elems() * Utility.typeToBytes(self.type) 138 | 139 | def flops(self): 140 | # square and add plus sqrt 141 | return 2 * self.elems() + 1 142 | 143 | def tc(self): 144 | return "-" 145 | 146 | def op(self): 147 | return self.op_ 148 | 149 | def mod(self): 150 | 
return self.mod_ 151 | -------------------------------------------------------------------------------- /apex/pyprof/prof/softmax.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Softmax(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod == "torch.nn.functional") 19 | assert (op == "softmax") 20 | 21 | #Filter out named parameters 22 | args = list(filter(lambda x : x['name'] == '', args)) 23 | 24 | assert (len(args) <= 2) 25 | self.shape = args[0]['shape'] 26 | self.type = args[0]['dtype'] 27 | self.dir = d.dir 28 | 29 | return 30 | 31 | def op(self): 32 | return self.op_ 33 | 34 | def mod(self): 35 | return self.mod_ 36 | 37 | def tc(self): 38 | return "-" 39 | 40 | def params(self): 41 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 42 | return p 43 | 44 | def elems(self): 45 | return Utility.numElems(self.shape) 46 | 47 | def flops(self): 48 | # Note: exp, sum-reduce, divide 49 | #flops = elems * 3 50 | return 0 51 | 52 | def bytes(self): 53 | b = self.elems() * Utility.typeToBytes(self.type) 54 | b *= 3 if self.dir == "fprop" else 5 #verify 55 | return b 56 | 57 | class LogSoftmax(OperatorLayerBase): 58 | 59 | def __init__(self, d): 60 | marker = eval(d.argMarker[0]) 61 | mod = marker['mod'] 62 | op = marker['op'] 63 | args = marker['args'] 64 | 65 | self.marker = marker 66 | self.mod_ = mod 67 | self.op_ = op 68 | self.args = args 69 | 70 | assert (mod == "torch.nn.functional") 71 | assert (op == "log_softmax") 72 | 73 | #Filter out named parameters 74 | args = list(filter(lambda x : x['name'] == '', args)) 75 | 76 | assert (len(args) <= 2) 77 | 78 | #Get input 79 | if (args[0]['name'] == ""): 80 | i = args[0] 81 | else: 82 | i = list(filter(lambda x : x['name'] == "input", args))[0] 83 | 84 | t = i['dtype'] 85 | 86 | self.shape = i['shape'] 87 | self.type = i['dtype'] 88 | self.dir = d.dir 89 | return 90 | 91 | def op(self): 92 | return self.op_ 93 | 94 | def mod(self): 95 | return self.mod_ 96 | 97 | def tc(self): 98 | return "-" 99 | 100 | def params(self): 101 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 102 | return p 103 | 104 | def elems(self): 105 | return Utility.numElems(self.shape) 106 | 107 | def flops(self): 108 | # Note: exp, sum-reduce, divide, log 109 | #flops = elems * 4 110 | return 0 111 | 112 | def bytes(self): 113 | b = self.elems() * Utility.typeToBytes(self.type) 114 | b *= 3 if self.dir == "fprop" else 5 #verify 115 | return b 116 | -------------------------------------------------------------------------------- /apex/pyprof/prof/usage.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | def parseArgs(): 5 | """ 6 | Print usage and parse arguments. 7 | """ 8 | 9 | def check_cols(value): 10 | valid = ["idx", "seq", "altseq", "tid", "layer", "trace", "dir", "sub", "mod", "op", "kernel", "params", "sil", "tc", "device", "stream", "grid", "block", "flops", "bytes"] 11 | cols = value.split(",") 12 | for col in cols: 13 | if col not in valid: 14 | raise argparse.ArgumentTypeError("{} is not a valid column name. 
Valid column names are {}.".format(col, ",".join(valid))) 15 | return cols 16 | 17 | def openFile(f): 18 | try: 19 | d = open(f, "r") 20 | return d 21 | except IOError: 22 | print("Error opening file {}. Exiting.".format(f), file=sys.stderr) 23 | sys.exit(1) 24 | 25 | parser = argparse.ArgumentParser(prog=sys.argv[0], description="PyTorch Profiler", formatter_class=argparse.RawTextHelpFormatter) 26 | parser.add_argument("file", 27 | nargs='?', 28 | type=str, 29 | default=None, 30 | help="Output of parse.py (Python dictionary).") 31 | 32 | parser.add_argument("-c", 33 | type=check_cols, 34 | default="idx,dir,sub,mod,op,kernel,params,sil", 35 | help='''Comma separated names of columns to print. 36 | idx: Index 37 | seq: PyTorch Sequence Id 38 | altseq: PyTorch Alternate Sequence Id 39 | tid: Thread Id 40 | layer: User annotated NVTX string (can be nested) 41 | trace: Function Call Trace 42 | dir: Direction 43 | sub: Sub Sequence Id 44 | mod: Module 45 | op: Operation 46 | kernel: Kernel Name 47 | params: Parameters 48 | sil: Silicon Time (in ns) 49 | tc: Tensor Core Usage 50 | device: GPU Device Id 51 | stream: Stream Id 52 | grid: Grid Dimensions 53 | block: Block Dimensions 54 | flops: Floating point ops (FMA = 2 FLOPs) 55 | bytes: Number of bytes in and out of DRAM 56 | e.g. -c idx,kernel,sil''') 57 | 58 | group = parser.add_mutually_exclusive_group() 59 | group.add_argument("--csv", 60 | action="store_true", 61 | default=False, 62 | help="Print a CSV output.") 63 | group.add_argument("-w", 64 | type=int, 65 | default=0, 66 | help="Width of columnated output.") 67 | 68 | args = parser.parse_args() 69 | if args.file is None: 70 | args.file = sys.stdin 71 | else: 72 | args.file = openFile(args.file) 73 | return args 74 | -------------------------------------------------------------------------------- /apex/pyprof/prof/utility.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | class Utility(object): 4 | 5 | @staticmethod 6 | def numElems(shape): 7 | assert (type(shape) == tuple) 8 | return reduce(lambda x,y: x*y, shape, 1) 9 | 10 | @staticmethod 11 | def typeToBytes(t): 12 | if (t in ["uint8", "int8", "byte", "char", "bool"]): 13 | return 1 14 | elif (t in ["float16", "half", "int16", "short"]): 15 | return 2 16 | elif (t in ["float32", "float", "int32", "int"]): 17 | return 4 18 | elif (t in ["int64", "long", "float64", "double"]): 19 | return 8 20 | assert False 21 | 22 | @staticmethod 23 | def typeToString(t): 24 | if (t in ["uint8", "byte", "char",]): 25 | return "uint8" 26 | elif (t in ["int8",]): 27 | return "int8" 28 | elif (t in ["int16", "short",]): 29 | return "int16" 30 | elif (t in ["float16", "half"]): 31 | return "fp16" 32 | elif (t in ["float32", "float"]): 33 | return "fp32" 34 | elif (t in ["int32", "int",]): 35 | return "int32" 36 | elif (t in ["int64", "long"]): 37 | return "int64" 38 | elif (t in ["float64", "double",]): 39 | return "fp64" 40 | elif (t in ["bool",]): 41 | return "bool" 42 | assert False 43 | 44 | @staticmethod 45 | def hasNVTX(marker): 46 | if type(marker) is str: 47 | try: 48 | marker = eval(marker) 49 | except: 50 | return False 51 | 52 | if type(marker) is dict: 53 | keys = marker.keys() 54 | return ("mod" in keys) and ("op" in keys) and ("args" in keys) 55 | else: 56 | return False 57 | 58 | @staticmethod 59 | def isscalar(t): 60 | return (t in ["float", "int"]) 61 | --------------------------------------------------------------------------------
/apex/reparameterization/README.md: -------------------------------------------------------------------------------- 1 | Under construction... 2 | -------------------------------------------------------------------------------- /apex/reparameterization/weight_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parameter import Parameter 3 | from ..fp16_utils import Fused_Weight_Norm 4 | import time 5 | 6 | from .reparameterization import Reparameterization 7 | 8 | def _norm(p, dim): 9 | """Computes the norm over all dimensions except dim""" 10 | if dim is None: 11 | return p.norm() 12 | elif dim == 0: 13 | output_size = (p.size(0),) + (1,) * (p.dim() - 1) 14 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) 15 | elif dim == p.dim() - 1: 16 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),) 17 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) 18 | return _norm(p.transpose(0, dim), 0).transpose(0, dim) 19 | 20 | HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor) 21 | 22 | class WeightNorm(Reparameterization): 23 | r""" 24 | Weight normalization is a reparameterization that decouples the magnitude 25 | of a weight tensor from its direction. This replaces the parameter specified 26 | by `name` (e.g. "weight") with two parameters: one specifying the magnitude 27 | (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). 28 | Weight normalization is implemented via a hook that recomputes the weight 29 | tensor from the magnitude and direction before every :meth:`~Module.forward` 30 | call. 31 | 32 | .. math:: 33 | \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} 34 | 35 | By default, with `dim=0`, the norm is computed independently per output 36 | channel/plane. To compute a norm over the entire weight tensor, use 37 | `dim=None`. 38 | """ 39 | def compute_weight(self, module=None, name=None): 40 | """ 41 | Computes weight normalized weight value to assign value to module attribute 42 | with name `name`. 43 | Arguments: 44 | module (nn.Module): module with weight we'd like to reparameterize 45 | Returns: 46 | w (Tensor): Tensor object containing value of reparameterized weight 47 | """ 48 | if module is None: 49 | module = self.module 50 | if name is None: 51 | name = self.name 52 | module, name = Reparameterization.get_module_and_name(module, name) 53 | g = getattr(module, name + '_g') 54 | v = getattr(module, name + '_v') 55 | 56 | fused_weight_norm = Fused_Weight_Norm.apply 57 | v = v.contiguous() 58 | w = fused_weight_norm(v, g, self.dim) 59 | 60 | return w 61 | 62 | def reparameterize(self, name, weight, dim): 63 | """ 64 | Creates Parameters v and gto be used for weight normalization 65 | and creates names that for attributes for the module these Parameters 66 | will correspond to. The parameters will be registered according to the names 67 | provided. 
68 | Arguments: 69 | module (nn.Module): module with weight we'd like to reparameterize 70 | name (str, optional): name of weight parameter 71 | dim (int, optional): dimension over which to compute parameterization 72 | Returns: 73 | names (list, str): names of Parameters to be used for reparameterization 74 | params (list, Parameter): Parameters to be used for reparameterization 75 | """ 76 | names = [name + '_g', name + '_v'] 77 | params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)] 78 | return names, params 79 | -------------------------------------------------------------------------------- /data/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/data/dataset.zip -------------------------------------------------------------------------------- /model/KoBART/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from kobart import get_pytorch_kobart_model 4 | from transformers import BartForConditionalGeneration 5 | 6 | 7 | class KoBARTConditionalGeneration(nn.Module): 8 | def __init__(self, args, tokenizer): 9 | super(KoBARTConditionalGeneration, self).__init__() 10 | self.model = BartForConditionalGeneration.from_pretrained(get_pytorch_kobart_model(), 11 | output_attentions=True, 12 | output_hidden_states=True) 13 | 14 | self.vocab_size = self.model.config.vocab_size 15 | 16 | self.args = args 17 | self.linear_copy = nn.Linear(768, 1) 18 | self.tokenizer = tokenizer 19 | self.loss_fct = nn.CrossEntropyLoss() 20 | 21 | def forward(self, inputs, mode): 22 | if mode != 'test': 23 | outs = self.model(input_ids=inputs['input_ids'], 24 | attention_mask=inputs['attention_mask'], 25 | decoder_input_ids=inputs['decoder_input_ids'], 26 | decoder_attention_mask=inputs['decoder_attention_mask'], 27 | labels=inputs['labels'], return_dict=True) 28 | 29 | encoder_input_ids = inputs['input_ids'] 30 | 31 | logits = outs.logits 32 | last_hidden_state = outs.decoder_hidden_states[-1] 33 | last_attention_weight = torch.softmax(outs.cross_attentions[-1], dim=-1) 34 | 35 | p_copy = torch.sigmoid(self.linear_copy(last_hidden_state)) 36 | previous_word_pro = torch.softmax(logits, dim=-1) * (1 - p_copy) 37 | 38 | encoder_word_attention = p_copy * torch.mean(last_attention_weight, dim=1) 39 | 40 | mask = torch.where(encoder_input_ids == self.tokenizer.pad_token_id, 41 | encoder_word_attention.new_zeros(encoder_input_ids.shape), 42 | encoder_word_attention.new_ones(encoder_input_ids.shape)) 43 | 44 | encoder_word_attention = encoder_word_attention * mask.unsqueeze(1) 45 | personal_words = encoder_input_ids.unsqueeze(1).repeat(1, encoder_word_attention.shape[1], 1) 46 | word_pro = torch.scatter_add(previous_word_pro, 2, personal_words, encoder_word_attention) 47 | 48 | loss = self.loss_fct(word_pro.view(-1, self.vocab_size), inputs['labels'].view(-1)) 49 | 50 | return loss 51 | 52 | else: 53 | outputs = self.model.generate(inputs['input_ids'], 54 | max_length=self.args.max_len, 55 | num_beams=5, 56 | linear_copy=self.linear_copy) 57 | 58 | return outputs 59 | -------------------------------------------------------------------------------- /model/setting.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import logging 4 | import numpy as np 5 | from argparse import ArgumentParser 6 | 7 | 8 | 
class Arguments(): 9 | 10 | def __init__(self): 11 | self.parser = ArgumentParser() 12 | 13 | def add_type_of_processing(self): 14 | self.add_argument('--opt_level', type=str, default='O1') 15 | self.add_argument('--fp16', type=str, default='True') 16 | self.add_argument('--train', type=str, default='True') 17 | self.add_argument('--test', type=str, default='True') 18 | self.add_argument('--device', type=str, default=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')) 19 | 20 | def add_hyper_parameters(self): 21 | self.add_argument('--patient', type=int, default=5) 22 | self.add_argument('--dropout', type=int, default=0.1) 23 | self.add_argument('--max_len', type=int, default=256) 24 | self.add_argument('--batch_size', type=int, default=32) 25 | self.add_argument('--epochs', type=int, default=10) 26 | self.add_argument('--seed', type=int, default=1) 27 | self.add_argument('--lr', type=float, default=0.00003) 28 | self.add_argument('--warmup_ratio', type=float, default=0.1) 29 | 30 | def add_data_parameters(self): 31 | self.add_argument('--train_data', type=str, default='train.tsv') 32 | self.add_argument('--test_data', type=str, default='test.tsv') 33 | self.add_argument('--valid_data', type=str, default='valid.tsv') 34 | self.add_argument('--path_to_data', type=str, default='./data/') 35 | self.add_argument('--path_to_save', type=str, default='./output/') 36 | self.add_argument('--ckpt', type=str, default='best_ckpt.pt') 37 | 38 | def print_args(self, args): 39 | for idx, (key, value) in enumerate(args.__dict__.items()): 40 | if idx == 0:print("argparse{\n", "\t", key, ":", value) 41 | elif idx == len(args.__dict__) - 1:print("\t", key, ":", value, "\n}") 42 | else:print("\t", key, ":", value) 43 | 44 | def add_argument(self, *args, **kw_args): 45 | return self.parser.add_argument(*args, **kw_args) 46 | 47 | def parse(self): 48 | args = self.parser.parse_args() 49 | self.print_args(args) 50 | 51 | return args 52 | 53 | 54 | class Setting(): 55 | 56 | def set_logger(self): 57 | 58 | _logger = logging.getLogger() 59 | formatter = logging.Formatter( 60 | '[%(levelname)s] %(asctime)s [ %(message)s ] | file::%(filename)s | line::%(lineno)s') 61 | 62 | stream_handler = logging.StreamHandler() 63 | stream_handler.setFormatter(formatter) 64 | 65 | _logger.addHandler(stream_handler) 66 | _logger.setLevel(logging.DEBUG) 67 | 68 | return _logger 69 | 70 | def set_seed(self, args): 71 | 72 | seed = args.seed 73 | 74 | random.seed(seed) 75 | np.random.seed(seed) 76 | 77 | torch.manual_seed(seed) 78 | torch.backends.cudnn.deterministic = True 79 | torch.backends.cudnn.benchmark = False 80 | 81 | torch.cuda.manual_seed(seed) 82 | torch.cuda.manual_seed_all(seed) 83 | 84 | def run(self): 85 | 86 | parser = Arguments() 87 | parser.add_type_of_processing() 88 | parser.add_hyper_parameters() 89 | parser.add_data_parameters() 90 | 91 | args = parser.parse() 92 | logger = self.set_logger() 93 | self.set_seed(args) 94 | 95 | return args, logger 96 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | from model.setting import Setting, Arguments 4 | from model.KoBART.processor import Processor 5 | 6 | 7 | def main(args, logger) -> None: 8 | 9 | processor = Processor(args) 10 | config = processor.model_setting() 11 | logger.info('Model Setting Complete') 12 | 13 | if args.train == 'True': 14 | logger.info('Start Training') 15 | 16 | 
for epoch in range(args.epochs): 17 | start_time = time.time() 18 | 19 | train_loss = processor.train() 20 | valid_loss = processor.valid() 21 | 22 | end_time = time.time() 23 | epoch_mins, epoch_secs = processor.metric.cal_time(start_time, end_time) 24 | 25 | performance = {'tl': train_loss, 'vl': valid_loss, 26 | 'ep': epoch, 'epm': epoch_mins, 'eps': epoch_secs} 27 | 28 | processor.metric.save_model(config, performance, processor.model_checker) 29 | 30 | if processor.model_checker['early_stop']: 31 | logger.info('Early Stopping') 32 | break 33 | 34 | if args.test == 'True': 35 | logger.info("Start Test") 36 | 37 | rouge_score = processor.test() 38 | print(f'\n{rouge_score}') 39 | 40 | processor.metric.print_size_of_model(config['model']) 41 | 42 | 43 | if __name__ == '__main__': 44 | args, logger = Setting().run() 45 | main(args, logger) 46 | -------------------------------------------------------------------------------- /train_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES=1 python train.py --train True --test False --batch_size 14 --max_len 512 --lr 5e-05 --epochs 10 3 | CUDA_VISIBLE_DEVICES=1 python train.py --train False --test True --batch_size 14 --max_len 512 4 | --------------------------------------------------------------------------------