├── README.md ├── apex ├── RNN │ ├── README.md │ ├── RNNBackend.py │ ├── __init__.py │ ├── cells.py │ └── models.py ├── __init__.py ├── amp │ ├── README.md │ ├── __init__.py │ ├── __version__.py │ ├── _amp_state.py │ ├── _initialize.py │ ├── _process_optimizer.py │ ├── amp.py │ ├── compat.py │ ├── frontend.py │ ├── handle.py │ ├── lists │ │ ├── __init__.py │ │ ├── functional_overrides.py │ │ ├── tensor_overrides.py │ │ └── torch_overrides.py │ ├── opt.py │ ├── rnn_compat.py │ ├── scaler.py │ ├── utils.py │ └── wrap.py ├── contrib │ ├── __init__.py │ ├── bottleneck │ │ ├── __init__.py │ │ ├── bottleneck.py │ │ └── test.py │ ├── csrc │ │ ├── bottleneck │ │ │ └── bottleneck.cpp │ │ ├── fmha │ │ │ ├── fmha_api.cpp │ │ │ └── src │ │ │ │ ├── fmha.h │ │ │ │ ├── fmha │ │ │ │ ├── gemm.h │ │ │ │ ├── gmem_tile.h │ │ │ │ ├── kernel_traits.h │ │ │ │ ├── mask.h │ │ │ │ ├── smem_tile.h │ │ │ │ ├── softmax.h │ │ │ │ └── utils.h │ │ │ │ ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload.h │ │ │ │ ├── fmha_dgrad_kernel_1xN_reload_nl.h │ │ │ │ ├── fmha_fprop_fp16_128_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_fp16_256_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_fp16_384_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_fp16_512_64_kernel.sm80.cu │ │ │ │ ├── fmha_fprop_kernel_1xN.h │ │ │ │ ├── fmha_fprop_kernel_1xN_nl.h │ │ │ │ ├── fmha_fprop_kernel_1xN_reload_v.h │ │ │ │ ├── fmha_kernel.h │ │ │ │ ├── fmha_noloop_reduce.cu │ │ │ │ └── fmha_utils.h │ │ ├── groupbn │ │ │ ├── batch_norm.cu │ │ │ ├── batch_norm.h │ │ │ ├── batch_norm_add_relu.cu │ │ │ ├── batch_norm_add_relu.h │ │ │ ├── cuda_utils.h │ │ │ ├── interface.cpp │ │ │ ├── ipc.cu │ │ │ └── nhwc_batch_norm_kernel.h │ │ ├── layer_norm │ │ │ ├── ln_api.cpp │ │ │ ├── ln_bwd_semi_cuda_kernel.cu │ │ │ ├── ln_fwd_cuda_kernel.cu │ │ │ ├── ln_kernel_traits.h │ │ │ └── utils.cuh │ │ ├── multihead_attn │ │ │ ├── additive_masked_softmax_dropout.cpp │ │ │ ├── additive_masked_softmax_dropout_cuda.cu │ │ │ ├── dropout.h │ │ │ ├── encdec_multihead_attn.cpp │ │ │ ├── encdec_multihead_attn_cuda.cu │ │ │ ├── encdec_multihead_attn_norm_add.cpp │ │ │ ├── encdec_multihead_attn_norm_add_cuda.cu │ │ │ ├── layer_norm.h │ │ │ ├── masked_softmax_dropout.cpp │ │ │ ├── masked_softmax_dropout_cuda.cu │ │ │ ├── philox.h │ │ │ ├── self_multihead_attn.cpp │ │ │ ├── self_multihead_attn_bias.cpp │ │ │ ├── self_multihead_attn_bias_additive_mask.cpp │ │ │ ├── self_multihead_attn_bias_additive_mask_cuda.cu │ │ │ ├── self_multihead_attn_bias_cuda.cu │ │ │ ├── self_multihead_attn_cuda.cu │ │ │ ├── self_multihead_attn_norm_add.cpp │ │ │ ├── self_multihead_attn_norm_add_cuda.cu │ │ │ ├── softmax.h │ │ │ └── strided_batched_gemm.h │ │ ├── optimizers │ │ │ ├── fused_adam_cuda.cpp │ │ │ ├── fused_adam_cuda_kernel.cu │ │ │ ├── fused_lamb_cuda.cpp │ │ │ ├── fused_lamb_cuda_kernel.cu │ │ │ ├── multi_tensor_distopt_adam.cpp │ │ │ ├── multi_tensor_distopt_adam_kernel.cu │ │ │ ├── multi_tensor_distopt_lamb.cpp │ │ │ └── multi_tensor_distopt_lamb_kernel.cu │ │ ├── transducer │ │ │ ├── transducer_joint.cpp │ │ │ ├── transducer_joint_kernel.cu │ │ │ ├── transducer_loss.cpp │ │ │ └── transducer_loss_kernel.cu │ │ └── xentropy │ │ │ ├── interface.cpp │ │ │ └── xentropy_kernel.cu │ ├── examples │ │ └── multihead_attn │ │ │ ├── func_test_multihead_attn.py │ │ │ └── perf_test_multihead_attn.py │ ├── fmha │ │ ├── __init__.py │ │ └── fmha.py │ ├── 
groupbn │ │ ├── __init__.py │ │ └── batch_norm.py │ ├── layer_norm │ │ ├── __init__.py │ │ └── layer_norm.py │ ├── multihead_attn │ │ ├── MHA_bwd.png │ │ ├── MHA_fwd.png │ │ ├── README.md │ │ ├── __init__.py │ │ ├── encdec_multihead_attn.py │ │ ├── encdec_multihead_attn_func.py │ │ ├── fast_encdec_multihead_attn_func.py │ │ ├── fast_encdec_multihead_attn_norm_add_func.py │ │ ├── fast_self_multihead_attn_func.py │ │ ├── fast_self_multihead_attn_norm_add_func.py │ │ ├── mask_softmax_dropout_func.py │ │ ├── self_multihead_attn.py │ │ └── self_multihead_attn_func.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── fp16_optimizer.cpython-37.pyc │ │ │ ├── fused_adam.cpython-37.pyc │ │ │ └── fused_lamb.cpython-37.pyc │ │ ├── distributed_fused_adam.py │ │ ├── distributed_fused_adam_v2.py │ │ ├── distributed_fused_adam_v3.py │ │ ├── distributed_fused_lamb.py │ │ ├── fp16_optimizer.py │ │ ├── fused_adam.py │ │ ├── fused_lamb.py │ │ └── fused_sgd.py │ ├── sparsity │ │ ├── README.md │ │ ├── __init__.py │ │ ├── asp.py │ │ ├── sparse_masklib.py │ │ └── test │ │ │ ├── checkpointing_test_part1.py │ │ │ ├── checkpointing_test_part2.py │ │ │ ├── checkpointing_test_reference.py │ │ │ └── toy_problem.py │ ├── test │ │ ├── fmha │ │ │ └── test_fmha.py │ │ ├── layer_norm │ │ │ └── test_fast_layer_norm.py │ │ ├── multihead_attn │ │ │ ├── test_encdec_multihead_attn.py │ │ │ ├── test_encdec_multihead_attn_norm_add.py │ │ │ ├── test_fast_self_multihead_attn_bias.py │ │ │ ├── test_mha_fused_softmax.py │ │ │ ├── test_self_multihead_attn.py │ │ │ └── test_self_multihead_attn_norm_add.py │ │ ├── test_label_smoothing.py │ │ └── transducer │ │ │ ├── test_transducer_joint.py │ │ │ ├── test_transducer_loss.py │ │ │ └── transducer_ref.py │ ├── transducer │ │ ├── __init__.py │ │ └── transducer.py │ └── xentropy │ │ ├── __init__.py │ │ └── softmax_xentropy.py ├── fp16_utils │ ├── README.md │ ├── __init__.py │ ├── fp16_optimizer.py │ ├── fp16util.py │ └── loss_scaler.py ├── mlp │ ├── __init__.py │ └── mlp.py ├── multi_tensor_apply │ ├── __init__.py │ └── multi_tensor_apply.py ├── normalization │ ├── __init__.py │ └── fused_layer_norm.py ├── optimizers │ ├── __init__.py │ ├── fused_adagrad.py │ ├── fused_adam.py │ ├── fused_lamb.py │ ├── fused_novograd.py │ └── fused_sgd.py ├── parallel │ ├── LARC.py │ ├── README.md │ ├── __init__.py │ ├── distributed.py │ ├── multiproc.py │ ├── optimized_sync_batchnorm.py │ ├── optimized_sync_batchnorm_kernel.py │ ├── sync_batchnorm.py │ └── sync_batchnorm_kernel.py ├── pyprof │ ├── FAQs.md │ ├── README.md │ ├── __init__.py │ ├── examples │ │ ├── .gitignore │ │ ├── apex │ │ │ ├── README.md │ │ │ ├── fused_adam.py │ │ │ ├── fused_layer_norm.py │ │ │ └── test.sh │ │ ├── custom_func_module │ │ │ ├── README.md │ │ │ ├── custom_function.py │ │ │ ├── custom_module.py │ │ │ └── test.sh │ │ ├── imagenet │ │ │ ├── imagenet.py │ │ │ └── test.sh │ │ ├── jit │ │ │ ├── README.md │ │ │ ├── jit_script_function.py │ │ │ ├── jit_script_method.py │ │ │ ├── jit_trace_function.py │ │ │ ├── jit_trace_method.py │ │ │ └── test.sh │ │ ├── lenet.py │ │ ├── operators.py │ │ ├── simple.py │ │ └── user_annotation │ │ │ ├── README.md │ │ │ ├── resnet.py │ │ │ └── test.sh │ ├── nvtx │ │ ├── __init__.py │ │ └── nvmarker.py │ ├── parse │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── db.py │ │ ├── kernel.py │ │ ├── nvvp.py │ │ └── parse.py │ └── prof │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── activation.py │ │ ├── base.py │ │ ├── blas.py │ │ ├── conv.py │ │ ├── 
convert.py │ │ ├── data.py │ │ ├── dropout.py │ │ ├── embedding.py │ │ ├── index_slice_join_mutate.py │ │ ├── linear.py │ │ ├── loss.py │ │ ├── misc.py │ │ ├── normalization.py │ │ ├── optim.py │ │ ├── output.py │ │ ├── pointwise.py │ │ ├── pooling.py │ │ ├── prof.py │ │ ├── randomSample.py │ │ ├── recurrentCell.py │ │ ├── reduction.py │ │ ├── softmax.py │ │ ├── usage.py │ │ └── utility.py └── reparameterization │ ├── README.md │ ├── __init__.py │ ├── reparameterization.py │ └── weight_norm.py ├── data ├── dataloader.py └── dataset.zip ├── generation_utils.py ├── model ├── KoBART │ ├── model.py │ └── processor.py ├── setting.py └── utils.py ├── modeling_bart.py ├── train.py └── train_test.sh /README.md: -------------------------------------------------------------------------------- 1 | # KoBART-pytorch 2 | 🧀 KoBART summarization using PyTorch + copy mechanism 3 | 4 | ## Data 5 | - Data statistics 6 | - Train Data : 29,432 7 | - Valid Data : 7,358 8 | - Test Data : 9,182 9 | 10 | ## How to Train 11 | - KoBART fine-tuning + Copy Mechanism 12 | > **Warning** 13 | > - Since the Python library itself is modified directly (the two files below overwrite the corresponding files in the installed `transformers` package), using a virtual environment is recommended. 😎 14 | > - generation_utils.py -> /site-packages/transformers/_generation_utils.py_ 15 | > - modeling_bart.py -> /site-packages/transformers/models/bart/_modeling_bart.py_ 16 | - bash train_test.sh 17 | ``` 18 | [Training] 19 | python train.py --train True --test False --batch_size 16 --max_len 512 --lr 5e-05 --epochs 10 20 | 21 | [Testing-rouge] 22 | python train.py --train False --test True --batch_size 16 --max_len 512 23 | ``` 24 | 25 | ## Model Performance 26 | - Test data's [ROUGE score](https://en.wikipedia.org/wiki/ROUGE_(metric)) 27 | ### Base 28 | | | rouge-1 |rouge-2|rouge-l| 29 | |-------|--------:|--------:|--------:| 30 | | Precision|0.5333|0.3463|0.4534| 31 | | Recall|0.5775|0.3737|0.4869| 32 | | F1|0.5381|0.3452|0.4555| 33 | 34 | ### Copy Mechanism 35 | | | rouge-1 |rouge-2|rouge-l| 36 | |-------|--------:|--------:|--------:| 37 | | Precision|0.5698|0.3776|0.4882| 38 | | Recall|0.5561|0.3612|0.4717| 39 | | F1|0.5460|0.3545|0.4654| 40 | 41 | ## Examples 42 | | | |Text| 43 | |-------|:--------|:--------| 44 |1|기사|경기도와 경기도시공사는 광교원천, 동탄호수공원, 성남판교 등 3개 지구에 건립 예정인 총 730가구의 경기행복주택 입주자를 모집한다고 26일 밝혔다. 모집 기간은 다음달 2일부터 11일까지이며, ‘경기도시공사 임대주택 청약센터(https://apply.gico.or.kr)’에서 인터넷 청약접수로 진행된다. 광교원천 경기행복주택은 전용면적 16㎡형 대학생 40가구와 청년 20가구, 26㎡형 청년 186가구와 고령자 24가구, 주거급여수급자 30가구까지 총 300가구를 모집한다. 보증금 2천729만4천~4천783만3천 원에 월 임대료는 11만8천~20만7천 원이다. 입주 예정은 오는 2020년 11월이다. 동탄호수공원 경기행복주택은 동탄2신도시에 6개동 995가구 조성되는 대규모 단지이다. 이번 입주자 모집에서는 공급면적 44㎡형 신혼부부 130가구를 우선 모집하며 임대조건은 보증금 5천만 원에 월 임대료 20만8천 원이다. 내년 12월 입주 예정으로, 나머지 가구는 연말에 모집할 예정이다 성남판교 경기행복주택은 전용면적 16㎡형 창업인 100가구와 청년 124가구, 26㎡형 청년 46가구와 고령자 30가구 등 총 300가구를 모집하며, 보증금 3천876만~6천992만 원에 월 임대료 14만5천~26만2천 원이다. 김준태 도 도시주택실장은 ""도내 청년층을 주요 대상으로 한 주거복지정책인 경기행복주택은 2022년까지 1만호를 공급하는 과정에서 매년 공급물량이 늘어날 예정""이라며 ""경기행복주택 사업에도 많은 관심을 가져달라""고 말했다.| 45 |1|모델요약|경기도와 경기도시공사는 광교원천, 동탄호수공원, 성남판교 등 3개 지구에 건립 예정인 총 730가구의 경기행복주택 입주자를 모집한다고 26일 밝혔으며 모집 기간은 다음달 2일부터 11일까지이며, 보증금 2천729만4천~4천783만3천 원에 월 임대료는 11만8천~20만7천 원이다.| 46 |2|기사|전남개발공사, ‘일자리창출’우수기관 선정 전남개발공사 청사 전경. 전남개발공사가 일자리창출 우수기관에 선정돼 행정안전부장관 표창을 수상했다. 6일 전남개방공사에 따르면 지난 3일 세종컨벤션센터에서 열린 2019년 상반기 지방공사·공단 CEO 리더십 포럼에서 이같이 수상 했다고 밝혔다. 전남개발공사는 문재인 정부 역점 사업인 일자리창출 정책에 적극 부응하며, 지역의 고용시장 활성화하기 위해 신규사업 발굴 등에 역점을 뒀다. 이 결과 지난해 2회에 걸쳐 10명을 채용하는 등 정부의 청년 및 장애인 의무고용에 대한 기준을 충족시켰다는 평가를 받았다. 
또한 지역내 사회 초년생의 안정적인 취업에 도움이 될 수 있는 양질의 일자리 경험 및 역량을 쌓을 수 있도록 전라남도가 중점 추진하고 있는 ‘청년 내일로 프로젝트’에 참여해 7명의 지역인재를 선발하기도 했다. 특히 전남개발공사의 채용은 공정성과 투명성 위해 전면 블라인드 절차에 따라 진행되며, 특히 면접은 전원 외부면접위원으로 진행된다. 올해 채용은 전반기에 2명을 채용했으며하반기에는 추가로 5명이내의 규모로 진행될 예정이다. 한편 전남개발공사는 지난 2004년 전남도가 설립한 지방공기업으로 남악신도시, 빛가람 혁신도시, 여수경도해양관광단지 개발사업 등을 시행했고, 여수 죽림지구 택지개발사업도 추진할 계획이다.| 47 | |2|모델요약|전남개발공사는 지난 3일 세종컨벤션센터에서 열린 2019년 상반기 지방공사·공단 CEO 리더십 포럼에서 일자리창출 우수기관에 선정돼 행정안전부장관 표창을 수상했다.| 48 | |3|기사|광주시는 지역 유망강소기업을 발굴·육성하기 위해 운영하고 있는 ‘100대 명품강소기업 육성사업’에 참가할 기업을 모집한다. 명품강소기업 육성사업은 성장 잠재력과 뛰어난 기술력을 가진 지역 중소·중견기업을 발굴해 지역경제를 견인할 글로벌 기업으로 육성하기 위한 시책으로, 2014년 시작돼 올해로 6년째를 맞았다. 모집대상은 공고일 현재 본사와 주사업장이 광주에 위치한 제조업 및 지식서비스산업 기업으로 총 30개사다. 이번 모집은 현재 제3기 명품강소기업 27개사의 지정기간(3년)이 만료됨에 따라 이들 기업의 재지정 여부와 함께 재지정 포기·탈락 기업 결원분을 채우기 위해 추진됐다. 선정조건은 명품강소기업은 매출액 50억원 이상(지식서비스산업은 10억원 이상)이면서 최근 5년 간 연평균 매출액 증가율 5% 이상이거나, 최근 3년 간 매출액 대비 R&D 투자 비율이 1% 이상인 기업이다. 명품강소기업으로 선정되면 광주시 자금 지원, 기업진단 컨설팅, 성장전략 마련, 해외마케팅 등 기업중심 맞춤형 지원과 함께 다양한 우대 혜택을 받게 된다. 또 중앙정부(중소벤처기업부)와 연계한 기업성장사다리를 통해 단계별 성장전략 지원도 받을 수 있어 명품강소기업 선정 이후 글로벌 기업으로 발돋움할 수 있을 것으로 기대된다. 신청은 31일까지 광주테크노파크로 방문 접수하면 된다. 자세한 내용은 광주시 홈페이지(http://www.gwangju.go.kr) 고시·공고란을 참고하거나 광주시 기업육성과(062-613-3871)로 문의하면 된다. 광주시는 신청기업을 대상으로 1차 서류심사, 2차 발표·현장평가를 거쳐 8월 선정위원회에서 최종 확정할 계획이다.| 49 | |3|모델요약|광주시는 지역 중소·중견기업을 발굴해 지역경제를 견인할 글로벌 기업으로 육성하기 위해 운영하고 있는 '100대 명품강소기업 육성사업'에 참가할 기업을 모집한다.| 50 | 51 | ## Reference 52 | - [KoBART](https://github.com/SKT-AI/KoBART) 53 | - [KoBART-summarization](https://github.com/seujung/KoBART-summarization) 54 | 55 | -------------------------------------------------------------------------------- /apex/RNN/README.md: -------------------------------------------------------------------------------- 1 | Under construction... 2 | -------------------------------------------------------------------------------- /apex/RNN/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM 2 | 3 | __all__ = ['models'] 4 | -------------------------------------------------------------------------------- /apex/RNN/cells.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .RNNBackend import RNNCell 6 | 7 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend 8 | 9 | import math 10 | 11 | 12 | class mLSTMRNNCell(RNNCell): 13 | """ 14 | mLSTMRNNCell 15 | """ 16 | 17 | def __init__(self, input_size, hidden_size, bias = False, output_size = None): 18 | gate_multiplier = 4 19 | super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size) 20 | 21 | self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size)) 22 | self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size)) 23 | 24 | self.reset_parameters() 25 | 26 | def forward(self, input): 27 | """ 28 | mLSTMRNNCell.forward() 29 | """ 30 | #if not inited or bsz has changed this will create hidden states 31 | self.init_hidden(input.size()[0]) 32 | 33 | hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden 34 | 35 | self.hidden = list( 36 | self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh, 37 | b_ih=self.b_ih, b_hh=self.b_hh) 38 | ) 39 | 40 | if self.output_size != self.hidden_size: 41 | self.hidden[0] = F.linear(self.hidden[0], self.w_ho) 42 | return tuple(self.hidden) 43 | 
44 | 45 | def new_like(self, new_input_size=None): 46 | if new_input_size is None: 47 | new_input_size = self.input_size 48 | 49 | return type(self)( 50 | new_input_size, 51 | self.hidden_size, 52 | self.bias, 53 | self.output_size) 54 | 55 | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None): 56 | """ 57 | mLSTMCell 58 | """ 59 | 60 | if input.is_cuda: 61 | igates = F.linear(input, w_ih) 62 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) 63 | hgates = F.linear(m, w_hh) 64 | 65 | state = fusedBackend.LSTMFused.apply 66 | return state(igates, hgates, hidden[1], b_ih, b_hh) 67 | 68 | hx, cx = hidden 69 | 70 | m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh) 71 | gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh) 72 | 73 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 74 | 75 | ingate = F.sigmoid(ingate) 76 | forgetgate = F.sigmoid(forgetgate) 77 | cellgate = F.tanh(cellgate) 78 | outgate = F.sigmoid(outgate) 79 | 80 | cy = (forgetgate * cx) + (ingate * cellgate) 81 | hy = outgate * F.tanh(cy) 82 | 83 | return hy, cy 84 | 85 | -------------------------------------------------------------------------------- /apex/RNN/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell 4 | 5 | from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell 6 | from .cells import mLSTMRNNCell, mLSTMCell 7 | 8 | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0): 9 | """ 10 | :class:`toRNNBackend` 11 | """ 12 | 13 | if bidirectional: 14 | return bidirectionalRNN(inputRNN, num_layers, dropout = dropout) 15 | else: 16 | return stackedRNN(inputRNN, num_layers, dropout = dropout) 17 | 18 | 19 | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 20 | """ 21 | :class:`LSTM` 22 | """ 23 | inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size) 24 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 25 | 26 | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 27 | """ 28 | :class:`GRU` 29 | """ 30 | inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size) 31 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 32 | 33 | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 34 | """ 35 | :class:`ReLU` 36 | """ 37 | inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size) 38 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 39 | 40 | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 41 | """ 42 | :class:`Tanh` 43 | """ 44 | inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size) 45 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 46 | 47 | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None): 48 | """ 49 | :class:`mLSTM` 50 | """ 51 | inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size) 52 | return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout) 53 | 54 | 55 | 
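The factory functions above all funnel through `toRNNBackend`, which wraps a single cell into a `stackedRNN` (or `bidirectionalRNN`) from `RNNBackend.py`. A minimal construction sketch follows, assuming a CUDA-enabled apex install on a PyTorch version that still exposes the legacy `torch.nn._functions.rnn` cells these modules import; the forward-call convention of the returned module is defined in `RNNBackend.py` (not shown in this listing), so the input shape used below is an assumption:

```python
import torch
from apex.RNN import mLSTM

# Grounded in models.py: every factory shares the signature
# (input_size, hidden_size, num_layers, bias=True, batch_first=False,
#  dropout=0, bidirectional=False, output_size=None).
rnn = mLSTM(input_size=32, hidden_size=64, num_layers=2).cuda()

# Assumption: with batch_first=False (the default), the stacked backend
# consumes sequences shaped (seq_len, batch, input_size).
x = torch.randn(16, 4, 32, device="cuda")
out = rnn(x)
```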
-------------------------------------------------------------------------------- /apex/__init__.py: -------------------------------------------------------------------------------- 1 | # May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten 2 | import torch 3 | import warnings 4 | 5 | if torch.distributed.is_available(): 6 | from . import parallel 7 | 8 | from . import amp 9 | from . import fp16_utils 10 | 11 | # For optimizers and normalization there is no Python fallback. 12 | # Absence of cuda backend is a hard error. 13 | # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda 14 | # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext 15 | # so they expect those backends to be available, but for some reason they actually aren't 16 | # available (for example because they built improperly in a way that isn't revealed until 17 | # load time) the error message is timely and visible. 18 | from . import optimizers 19 | from . import normalization 20 | from . import pyprof -------------------------------------------------------------------------------- /apex/amp/README.md: -------------------------------------------------------------------------------- 1 | # amp: Automatic Mixed Precision 2 | 3 | ## Annotating User Functions 4 | 5 | Nearly all PyTorch user code needs nothing more than the two steps 6 | above to use amp. After all, custom layers are built out of simpler 7 | PyTorch components, and amp already can see those. 8 | 9 | However, any custom C++ or CUDA code is outside of amp's (default) 10 | view of things. For example, suppose I implemented a new recurrent 11 | cell called a "forgetful recurrent unit" that calls directly into a 12 | CUDA backend: 13 | 14 | ```python 15 | from backend import FRUBackend 16 | 17 | def fru(input, hidden, weight, bias): 18 | # call to CUDA code 19 | FRUBackend(input, hidden, weight, bias) 20 | ``` 21 | 22 | In this case, it is possible to get a runtime type mismatch. For 23 | example, you might have `input` in fp16, and `weight` in fp32, and amp 24 | doesn't have the visibility to insert an appropriate cast. 25 | 26 | amp exposes two ways to handle "invisible" backend code: function 27 | annotations and explicit registration. 28 | 29 | #### Function annotation 30 | 31 | The first way to handle backend code is a set of function annotations: 32 | 33 | - `@amp.half_function` 34 | - `@amp.float_function` 35 | - `@amp.promote_function` 36 | 37 | These correspond to: 38 | 39 | - Cast all arguments to fp16 40 | - Cast all arguments to fp32 41 | - If there are any type mismatches, cast everything to the widest type 42 | 43 | In our example, we believe that the FRU unit is fp16-safe and will get 44 | performance gains from casting its arguments to fp16, so we write: 45 | 46 | ```python 47 | @amp.half_function 48 | def fru(input, hidden, weight, bias): 49 | #... 50 | ``` 51 | 52 | #### Explicit registration 53 | 54 | The other way to handle backend code is with explicit function 55 | registration: 56 | 57 | - `amp.register_half_function(module, function_name)` 58 | - `amp.register_float_function(module, function_name)` 59 | - `amp.register_promote_function(module, function_name)` 60 | 61 | When using this API, `module` is the containing class or module for 62 | the function, and `function_name` is the _string_ name of the 63 | function. Note that the function must be registered before the call to 64 | `amp.initialize()`.
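To make that ordering concrete before returning to the FRU example below, here is a hedged end-to-end sketch. It assumes a CUDA-capable apex install; `my_backend` and `fru_op` are placeholders standing in for a real compiled extension (it is assumed, not verified here, that any module-like object exposing the function is accepted), and the model, optimizer, and data are throwaway examples:

```python
import types
import torch
from apex import amp

# Placeholder for a compiled backend module; in real code this would be the
# extension that owns the raw CUDA op (like `backend` in the FRU example).
my_backend = types.SimpleNamespace(fru_op=lambda *args: args[0])

# 1) Register the backend function first...
amp.register_half_function(my_backend, 'fru_op')

# 2) ...then let amp.initialize() patch the model, optimizer, and the
#    registered functions.
model = torch.nn.Linear(128, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# 3) Train as usual, routing the backward pass through amp's loss scaling.
inputs = torch.randn(8, 128, device="cuda")
loss = model(inputs).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```

The only hard requirement the sketch illustrates is the order: the `register_*_function` calls must run before `amp.initialize`.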
65 | 66 | For our FRU unit, we can register the backend function directly: 67 | 68 | ```python 69 | import backend 70 | 71 | amp.register_half_function(backend, 'FRUBackend') 72 | ``` 73 | -------------------------------------------------------------------------------- /apex/amp/__init__.py: -------------------------------------------------------------------------------- 1 | from .amp import init, half_function, float_function, promote_function,\ 2 | register_half_function, register_float_function, register_promote_function 3 | from .handle import scale_loss, disable_casts 4 | from .frontend import initialize, state_dict, load_state_dict 5 | from ._amp_state import master_params, _amp_state 6 | -------------------------------------------------------------------------------- /apex/amp/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 1, 0) 2 | __version__ = '.'.join(map(str, VERSION)) 3 | -------------------------------------------------------------------------------- /apex/amp/_amp_state.py: -------------------------------------------------------------------------------- 1 | # This is a "header object" that allows different amp modules to communicate. 2 | # I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like. 3 | # But apparently it's ok: 4 | # http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm 5 | import os 6 | import torch 7 | 8 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 9 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 10 | 11 | 12 | if TORCH_MAJOR == 1 and TORCH_MINOR < 8: 13 | from torch._six import container_abcs 14 | else: 15 | import collections.abc as container_abcs 16 | 17 | 18 | class AmpState(object): 19 | def __init__(self): 20 | self.hard_override=False 21 | self.allow_incoming_model_not_fp32 = False 22 | self.verbosity=1 23 | 24 | 25 | # Attribute stash. Could also just stash things as global module attributes. 26 | _amp_state = AmpState() 27 | 28 | 29 | def warn_or_err(msg): 30 | if _amp_state.hard_override: 31 | print("Warning: " + msg) 32 | else: 33 | raise RuntimeError(msg) 34 | # I'm not sure if allowing hard_override is a good idea. 35 | # + " If you're sure you know what you're doing, supply " + 36 | # "hard_override=True to amp.initialize.") 37 | 38 | 39 | def maybe_print(msg, rank0=False): 40 | distributed = torch.distributed.is_available() and \ 41 | torch.distributed.is_initialized() and \ 42 | torch.distributed.get_world_size() > 1 43 | if _amp_state.verbosity > 0: 44 | if rank0: 45 | if distributed: 46 | if torch.distributed.get_rank() == 0: 47 | print(msg) 48 | else: 49 | print(msg) 50 | else: 51 | print(msg) 52 | 53 | 54 | # def iter_params(param_groups): 55 | # for group in param_groups: 56 | # for p in group['params']: 57 | # yield p 58 | 59 | 60 | def master_params(optimizer): 61 | """ 62 | Generator expression that iterates over the params owned by ``optimizer``. 63 | 64 | Args: 65 | optimizer: An optimizer previously returned from ``amp.initialize``. 66 | """ 67 | for group in optimizer.param_groups: 68 | for p in group['params']: 69 | yield p 70 | -------------------------------------------------------------------------------- /apex/amp/compat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # True for post-0.4, when Variables/Tensors merged. 
4 | def variable_is_tensor(): 5 | v = torch.autograd.Variable() 6 | return isinstance(v, torch.Tensor) 7 | 8 | def tensor_is_variable(): 9 | x = torch.Tensor() 10 | return type(x) == torch.autograd.Variable 11 | 12 | # False for post-0.4 13 | def tensor_is_float_tensor(): 14 | x = torch.Tensor() 15 | return type(x) == torch.FloatTensor 16 | 17 | # Akin to `torch.is_tensor`, but returns True for Variable 18 | # objects in pre-0.4. 19 | def is_tensor_like(x): 20 | return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable) 21 | 22 | # Wraps `torch.is_floating_point` if present, otherwise checks 23 | # the suffix of `x.type()`. 24 | def is_floating_point(x): 25 | if hasattr(torch, 'is_floating_point'): 26 | return torch.is_floating_point(x) 27 | try: 28 | torch_type = x.type() 29 | return torch_type.endswith('FloatTensor') or \ 30 | torch_type.endswith('HalfTensor') or \ 31 | torch_type.endswith('DoubleTensor') 32 | except AttributeError: 33 | return False 34 | 35 | def scalar_python_val(x): 36 | if hasattr(x, 'item'): 37 | return x.item() 38 | else: 39 | if isinstance(x, torch.autograd.Variable): 40 | return x.data[0] 41 | else: 42 | return x[0] 43 | 44 | # Accounts for the possibility that some ops may be removed from a namespace. 45 | def filter_attrs(module, attrs): 46 | return list(attrname for attrname in attrs if hasattr(module, attrname)) 47 | -------------------------------------------------------------------------------- /apex/amp/lists/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/amp/lists/__init__.py -------------------------------------------------------------------------------- /apex/amp/lists/functional_overrides.py: -------------------------------------------------------------------------------- 1 | 2 | # TODO: think about the following two. They do weird things. 3 | # - torch.nn.utils.clip_grad (but it should always be fp32 anyway) 4 | # - torch.nn.utils.weight_norm 5 | 6 | # Notes: 7 | # F.instance_norm uses batch_norm internally. Which correctly handles 8 | # fp16 in/out with fp32 weights. So we shouldn't do anything for 9 | # either of these. 10 | # F.normalize calls `input.norm()` internally, so it's redundant, but 11 | # kept here in case impl. changes. 12 | # F.cosine_similarity is same: calls `x.norm()` internally. 13 | 14 | import torch.nn.functional 15 | 16 | MODULE = torch.nn.functional 17 | 18 | FP16_FUNCS = [ 19 | 'conv1d', 20 | 'conv2d', 21 | 'conv3d', 22 | 'conv_transpose1d', 23 | 'conv_transpose2d', 24 | 'conv_transpose3d', 25 | 'conv_tbc', # Undocumented / maybe new? 26 | 'linear', 27 | ] 28 | 29 | FP32_FUNCS = [ 30 | 31 | # Interpolation/Upsampling TODO: Remove for 1.2 32 | 'interpolate', 33 | 'grid_sample', 34 | 35 | # Pointwise 36 | 'softplus', 37 | 'softmin', 38 | 'log_softmax', 39 | 'softmax', 40 | 'gelu', 41 | 42 | # Normalization 43 | 'layer_norm', 44 | 'group_norm', 45 | 'local_response_norm', 46 | 'normalize', 47 | 'cosine_similarity', 48 | 49 | # Loss functions 50 | # TODO: which of these can be fp16? 
51 | 'poisson_nll_loss', 52 | 'cosine_embedding_loss', 53 | 'cross_entropy', 54 | 'hinge_embedding_loss', 55 | 'kl_div', 56 | 'l1_loss', 57 | 'mse_loss', 58 | 'margin_ranking_loss', 59 | 'multilabel_margin_loss', 60 | 'multilabel_soft_margin_loss', 61 | 'multi_margin_loss', 62 | 'nll_loss', 63 | 'binary_cross_entropy_with_logits', 64 | 'smooth_l1_loss', 65 | 'soft_margin_loss', 66 | 'triplet_margin_loss', 67 | 'ctc_loss' 68 | ] 69 | 70 | BANNED_FUNCS = [ 71 | ('binary_cross_entropy', 72 | ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` " 73 | "It requires that the output of the previous function be already a FloatTensor. \n\n" 74 | "Most models have a Sigmoid right before BCELoss. In that case, you can use\n" 75 | " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer " 76 | "that is compatible with amp.\nAnother option is to add\n" 77 | " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n" 78 | "If you _really_ know what you are doing, you can disable this warning by passing " 79 | "allow_banned=True to `amp.init()`.")) 80 | ] 81 | -------------------------------------------------------------------------------- /apex/amp/lists/tensor_overrides.py: -------------------------------------------------------------------------------- 1 | from .. import compat 2 | from . import torch_overrides 3 | 4 | import importlib 5 | 6 | import torch 7 | 8 | # if compat.variable_is_tensor() and not compat.tensor_is_variable(): 9 | MODULE = torch.Tensor 10 | # else: 11 | # MODULE = torch.autograd.Variable 12 | 13 | 14 | FP16_FUNCS = compat.filter_attrs(MODULE, [ 15 | '__matmul__', 16 | ]) 17 | 18 | FP32_FUNCS = compat.filter_attrs(MODULE, [ 19 | '__ipow__', 20 | '__pow__', 21 | '__rpow__', 22 | 23 | # Cast to fp32 before transfer to CPU 24 | 'cpu', 25 | ]) 26 | 27 | CASTS = compat.filter_attrs(MODULE, [ 28 | '__add__', 29 | '__div__', 30 | '__eq__', 31 | '__ge__', 32 | '__gt__', 33 | '__iadd__', 34 | '__idiv__', 35 | '__imul__', 36 | '__isub__', 37 | '__itruediv__', 38 | '__le__', 39 | '__lt__', 40 | '__mul__', 41 | '__ne__', 42 | '__radd__', 43 | '__rdiv__', 44 | '__rmul__', 45 | '__rsub__', 46 | '__rtruediv__', 47 | '__sub__', 48 | '__truediv__', 49 | ]) 50 | 51 | # None of these, but here to make code cleaner. 52 | SEQUENCE_CASTS = [] 53 | 54 | # We need to grab all the methods from torch_overrides and add them to 55 | # the Tensor lists as well, as almost all methods are duplicated 56 | # between `torch` and `torch.Tensor` (and check with `hasattr`, 57 | # because a few random ones aren't defined on Tensor) 58 | _self_mod = importlib.import_module(__name__) 59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']: 60 | lst = getattr(_self_mod, attrname) 61 | for fn in getattr(torch_overrides, attrname): 62 | if hasattr(MODULE, fn): 63 | lst.append(fn) 64 | -------------------------------------------------------------------------------- /apex/amp/lists/torch_overrides.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .. import utils 4 | 5 | MODULE = torch 6 | 7 | FP16_FUNCS = [ 8 | # Low level functions wrapped by torch.nn layers. 9 | # The wrapper layers contain the weights which are then passed in as a parameter 10 | # to these functions. 
11 | 'conv1d', 12 | 'conv2d', 13 | 'conv3d', 14 | 'conv_transpose1d', 15 | 'conv_transpose2d', 16 | 'conv_transpose3d', 17 | 'conv_tbc', 18 | 'prelu', 19 | 20 | # BLAS 21 | 'addmm', 22 | 'addmv', 23 | 'addr', 24 | 'matmul', 25 | 'mm', 26 | 'mv', 27 | ] 28 | 29 | FP32_FUNCS = [ 30 | # Pointwise 31 | 'acos', 32 | 'asin', 33 | 'cosh', 34 | 'erfinv', 35 | 'exp', 36 | 'expm1', 37 | 'log', 38 | 'log10', 39 | 'log2', 40 | 'reciprocal', 41 | 'rsqrt', 42 | 'sinh', 43 | 'tan', 44 | 45 | # Other math 46 | 'pow', 47 | 48 | # Reduction 49 | 'cumprod', 50 | 'cumsum', 51 | 'dist', 52 | # 'mean', 53 | 'norm', 54 | 'prod', 55 | 'std', 56 | 'sum', 57 | 'var', 58 | 59 | # Misc 60 | 'renorm' 61 | ] 62 | 63 | version_strings = torch.__version__.split('.') 64 | version_major = version_strings[0] 65 | version_minor = version_strings[1] 66 | version_num = float(version_major + "." + version_minor) 67 | # Before torch 1.1, mean must be blacklisted. 68 | if version_num < 1.1: 69 | FP32_FUNCS.append('mean') 70 | 71 | # Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We 72 | # check the CUDA version -- if at least 9.1, then put the bmm 73 | # functions on the fp16 list. Otherwise, put them on the fp32 list. 74 | _bmms = ['addbmm', 75 | 'baddbmm', 76 | 'bmm'] 77 | 78 | if utils.is_cuda_enabled(): 79 | # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802 80 | if utils.get_cuda_version() >= (9, 1, 0): 81 | FP16_FUNCS.extend(_bmms) 82 | else: 83 | FP32_FUNCS.extend(_bmms) 84 | 85 | # Multi-tensor fns that may need type promotion 86 | CASTS = [ 87 | # Multi-tensor math 88 | 'addcdiv', 89 | 'addcmul', 90 | 'atan2', 91 | 'cross', 92 | 'bilinear', 93 | 'dot', 94 | 95 | # Element-wise _or_ tensor-wise math 96 | 'add', 97 | 'div', 98 | 'mul', 99 | 100 | # Comparison 101 | 'eq', 102 | 'equal', 103 | 'ge', 104 | 'gt', 105 | 'le', 106 | 'lt', 107 | 'ne' 108 | ] 109 | 110 | # Functions that take sequence arguments. We need to inspect the whole 111 | # sequence and cast to the widest type. 112 | SEQUENCE_CASTS = [ 113 | 'cat', 114 | 'stack' 115 | ] 116 | -------------------------------------------------------------------------------- /apex/amp/opt.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import warnings 3 | 4 | from .scaler import LossScaler, master_params 5 | from ._amp_state import maybe_print 6 | 7 | import numpy as np 8 | 9 | class OptimWrapper(object): 10 | def __init__(self, optimizer, amp_handle, num_loss): 11 | self._optimizer = optimizer 12 | self._amp_handle = amp_handle 13 | self._num_loss = num_loss 14 | self._loss_idx = 0 15 | self._skip_next = [False] * num_loss 16 | self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)] 17 | 18 | @contextlib.contextmanager 19 | def scale_loss(self, loss): 20 | if not self._amp_handle.is_active(): 21 | yield loss 22 | return 23 | 24 | # When there are multiple losses per-optimizer, we need 25 | # to save out current grad accumulation, since we won't be 26 | # able to unscale this particulare loss once the grads are 27 | # all mixed together. 
28 | cached_grads = [] 29 | if self._loss_idx > 0: 30 | for p in master_params(self._optimizer): 31 | if p.grad is not None: 32 | cached_grads.append(p.grad.data.detach().clone()) 33 | else: 34 | cached_grads.append(None) 35 | self._optimizer.zero_grad() 36 | 37 | loss_scale = self._cur_loss_scaler().loss_scale() 38 | yield loss * loss_scale 39 | 40 | self._cur_loss_scaler().clear_overflow_state() 41 | self._cur_loss_scaler().unscale( 42 | master_params(self._optimizer), 43 | master_params(self._optimizer), 44 | loss_scale) 45 | self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale() 46 | self._loss_idx += 1 47 | 48 | if len(cached_grads) > 0: 49 | for p, cached_grad in zip(master_params(self._optimizer), 50 | cached_grads): 51 | if cached_grad is not None: 52 | p.grad.data.add_(cached_grad) 53 | cached_grads = [] 54 | 55 | def _cur_loss_scaler(self): 56 | assert 0 <= self._loss_idx < self._num_loss 57 | return self._loss_scaler[self._loss_idx] 58 | 59 | def step(self, closure=None): 60 | if not self._amp_handle.is_active(): 61 | return self._optimizer.step(closure=closure) 62 | 63 | self._loss_idx = 0 64 | 65 | for group in self._optimizer.param_groups: 66 | for p in group['params']: 67 | self._amp_handle.remove_cache(p) 68 | 69 | if closure is not None: 70 | raise NotImplementedError( 71 | 'The `closure` argument is unsupported by the amp ' + 72 | 'optimizer wrapper.') 73 | if any(self._skip_next): 74 | maybe_print('Gradient overflow, skipping update') 75 | self._skip_next = [False] * self._num_loss 76 | else: 77 | return self._optimizer.step(closure=closure) 78 | 79 | # Forward any attribute lookups 80 | def __getattr__(self, attr): 81 | return getattr(self._optimizer, attr) 82 | 83 | # Forward all torch.optim.Optimizer methods 84 | def __getstate__(self): 85 | return self._optimizer.__getstate__() 86 | 87 | def __setstate__(self): 88 | return self._optimizer.__setstate__() 89 | 90 | def __repr__(self): 91 | return self._optimizer.__repr__() 92 | 93 | def state_dict(self): 94 | return self._optimizer.state_dict() 95 | 96 | def load_state_dict(self, state_dict): 97 | return self._optimizer.load_state_dict(state_dict) 98 | 99 | def zero_grad(self): 100 | return self._optimizer.zero_grad() 101 | 102 | def add_param_group(self, param_group): 103 | return self._optimizer.add_param_group(param_group) 104 | -------------------------------------------------------------------------------- /apex/amp/rnn_compat.py: -------------------------------------------------------------------------------- 1 | from . import utils, wrap 2 | 3 | import torch 4 | _VF = torch._C._VariableFunctions 5 | RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm'] 6 | 7 | def _gen_VF_wrapper(name): 8 | def wrapper(*args, **kwargs): 9 | return getattr(_VF, name)(*args, **kwargs) 10 | return wrapper 11 | 12 | # Some python magic to generate an object that has the rnn cell functions 13 | # defined on it, all of which call into corresponding _VF version. 14 | # Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF" 15 | # imported at module scope within torch.nn.modules.rnn). This should 16 | # not affect third-party importers of _VF.py. 
17 | class VariableFunctionsShim(object): 18 | def __init__(self): 19 | for name in RNN_NAMES: 20 | for suffix in ['', '_cell']: 21 | fn_name = name + suffix 22 | setattr(self, fn_name, _gen_VF_wrapper(fn_name)) 23 | 24 | def has_old_rnns(): 25 | try: 26 | torch.nn.backends.thnn.backend.LSTMCell 27 | return True 28 | except: 29 | return False 30 | 31 | def whitelist_rnn_cells(handle, verbose): 32 | # Different module + function names in old/new RNN cases 33 | if has_old_rnns(): 34 | fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell'] 35 | mod = torch.nn.backends.thnn.backend 36 | else: 37 | fn_names = [x + '_cell' for x in RNN_NAMES] 38 | mod = torch.nn.modules.rnn._VF 39 | assert isinstance(mod, VariableFunctionsShim) 40 | 41 | # Insert casts on cell functions 42 | for fn in fn_names: 43 | wrap.cached_cast(mod, fn, utils.maybe_half, handle, 44 | try_caching=True, verbose=verbose) 45 | 46 | if has_old_rnns(): 47 | # Special handling of `backward` for fused gru / lstm: 48 | # The `backward` method calls Tensor.sum() (blacklist) internally, 49 | # and then the resulting grad_input has the wrong type. 50 | # TODO: where else is this a problem? 51 | for rnn_type in ['GRUFused', 'LSTMFused']: 52 | mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type) 53 | wrap.disable_casts(mod, 'backward', handle) 54 | -------------------------------------------------------------------------------- /apex/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/__init__.py -------------------------------------------------------------------------------- /apex/contrib/bottleneck/__init__.py: -------------------------------------------------------------------------------- 1 | from .bottleneck import Bottleneck 2 | -------------------------------------------------------------------------------- /apex/contrib/bottleneck/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bottleneck import Bottleneck 3 | torch.manual_seed(23337) 4 | 5 | # use True to print layerwise sum for all outputs in reference code path 6 | DEBUG = False#True 7 | 8 | for stride, o_channel in [(1,32), (1,128), (2,32)]: 9 | print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel) 10 | a_ = torch.randn(17,32,28,28) 11 | 12 | a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_() 13 | model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last) 14 | 15 | # test model 16 | b = model(a) 17 | b.mean().backward() 18 | d_grad = a.grad.float() 19 | a.grad = None 20 | torch.cuda.synchronize() 21 | 22 | if DEBUG: 23 | print("[DEBUG] ref dx :", d_grad.sum().item()) 24 | # print wgrad. 
we don't need to reset since later cpp print before accumulation 25 | for i, w in enumerate(model.w_conv): 26 | print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item()) 27 | 28 | wgrads = [] 29 | for w in model.w_conv: 30 | wgrads.append(w.grad.float()) 31 | 32 | model.use_cudnn = True 33 | model.zero_grad() 34 | c = model(a) 35 | c.mean().backward() 36 | 37 | torch.cuda.synchronize() 38 | print("comparing native and channels_last:") 39 | print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item()) 40 | print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item()) 41 | for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)): 42 | print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item()) 43 | 44 | nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_() 45 | nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half() 46 | for p,q in zip(model.parameters(), nhwc_model.parameters()): 47 | # model's storage is already in nhwc, we clone and assign to explicit nhwc model 48 | q.data.copy_(p.data.permute(0,2,3,1).contiguous()) 49 | for p,q in zip(model.buffers(), nhwc_model.buffers()): 50 | q.data.copy_(p.data) 51 | 52 | d = nhwc_model(nhwc_a) 53 | d.mean().backward() 54 | torch.cuda.synchronize() 55 | 56 | # reset reference to cudnn channels_last permute 57 | #c_s = c.storage().tolist() 58 | #d_s = d.storage().tolist() 59 | #print(max([x-y for x,y in zip(c_s,d_s)])) 60 | c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous() 61 | d_grad = a.grad.float().permute(0,2,3,1).contiguous() 62 | wgrads = [] 63 | for w in model.w_conv: 64 | wgrads.append(w.grad.float().permute(0,2,3,1).contiguous()) 65 | 66 | torch.cuda.synchronize() 67 | print("comparing nhwc and channels_last:") 68 | print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item()) 69 | print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item()) 70 | for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)): 71 | print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item()) 72 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha/mask.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | namespace fmha { 31 | 32 | 33 | template 34 | struct Mask { 35 | using Mma_tile = fmha::Hmma_tile; 36 | 37 | template 38 | __device__ Mask(const Params ¶ms, const BInfo &blockInfo, int tidx) { 39 | 40 | actual_seqlen = blockInfo.actual_seqlen; 41 | 42 | const int warp = tidx / Cta_tile::THREADS_PER_WARP; 43 | const int lane = tidx % Cta_tile::THREADS_PER_WARP; 44 | 45 | static_assert(Cta_tile::WARPS_K == 1, ""); 46 | 47 | // find the warp in the Cta tile 48 | const int warp_n = (warp / Cta_tile::WARPS_M); 49 | const int warp_m = (warp % Cta_tile::WARPS_M); 50 | // decompose warp into 8x4 tile 51 | const int quad = lane / 4; 52 | const int tid = (lane % 4) * 2; 53 | row = warp_m * 16 + quad; 54 | col = warp_n * 16 + tid; 55 | } 56 | 57 | inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const { 58 | 59 | // ii and jj iterate over the 2x4 fragment 60 | const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen; 61 | //&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen; 62 | return col_valid; 63 | // return row_valid && col_valid; 64 | } 65 | 66 | inline __device__ void load(int it) { 67 | row_offset = it * Cta_tile::M + row; 68 | } 69 | int row_offset; 70 | 71 | int row; 72 | int col; 73 | int actual_seqlen; 74 | }; 75 | 76 | } // namespace fmha 77 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_dgrad_kernel_1xN_reload.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 128, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_dgrad_fp16_128_64_sm80_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::compute_dv_1xN(params); 35 | fmha::compute_dq_dk_1xN(params); 36 | } 37 | 38 | void run_fmha_dgrad_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream) { 39 | 40 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 41 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 42 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 43 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 44 | 45 | using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>; 46 | constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE; 47 | static_assert(smem_size_s == 16 * 128 * 2); 48 | static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N); 49 | 50 | constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax; 51 | constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v; 52 | constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk); 53 | 54 | if( smem_size >= 48 * 1024 ) { 55 | FMHA_CHECK_CUDA(cudaFuncSetAttribute( 56 | fmha_dgrad_fp16_128_64_sm80_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 57 | } 58 | dim3 grid(params.h, params.b); 59 | fmha_dgrad_fp16_128_64_sm80_kernel<<>>(params); 60 | } 61 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_dgrad_kernel_1xN_reload.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 256, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_dgrad_fp16_256_64_sm80_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::compute_dv_1xN(params); 35 | fmha::compute_dq_dk_1xN(params); 36 | } 37 | 38 | void run_fmha_dgrad_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream) { 39 | 40 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 41 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 42 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 43 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 44 | 45 | using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>; 46 | constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE; 47 | static_assert(smem_size_s == 16 * 256 * 2); 48 | static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N); 49 | 50 | constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax; 51 | constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v; 52 | constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk); 53 | 54 | if( smem_size >= 48 * 1024 ) { 55 | FMHA_CHECK_CUDA(cudaFuncSetAttribute( 56 | fmha_dgrad_fp16_256_64_sm80_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 57 | } 58 | dim3 grid(params.h, params.b); 59 | fmha_dgrad_fp16_256_64_sm80_kernel<<>>(params); 60 | } 61 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_dgrad_kernel_1xN_reload.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 384, 64, 16, 1, 8, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_dgrad_fp16_384_64_sm80_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::compute_dv_1xN(params); 35 | fmha::compute_dq_dk_1xN(params); 36 | } 37 | 38 | void run_fmha_dgrad_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream) { 39 | 40 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 41 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 42 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 43 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 44 | 45 | using Smem_tile_s = fmha::Smem_tile_mma_transposed< Kernel_traits::Cta_tile_p>; 46 | constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE; 47 | static_assert(smem_size_s == 16 * 384 * 2); 48 | static_assert(smem_size_o == 16 * 64 * 4 * Kernel_traits::Cta_tile_p::WARPS_N); 49 | 50 | constexpr int smem_size_dv = smem_size_s + 2 * smem_size_q + smem_size_v + smem_size_softmax; 51 | constexpr int smem_size_dq_dk = smem_size_s + smem_size_o + smem_size_q + smem_size_v; 52 | constexpr int smem_size = std::max(smem_size_dv, smem_size_dq_dk); 53 | 54 | if( smem_size >= 48 * 1024 ) { 55 | FMHA_CHECK_CUDA(cudaFuncSetAttribute( 56 | fmha_dgrad_fp16_384_64_sm80_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 57 | } 58 | dim3 grid(params.h, params.b); 59 | fmha_dgrad_fp16_384_64_sm80_kernel<<>>(params); 60 | } 61 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_fprop_kernel_1xN.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 128, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_fprop_fp16_128_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::device_1xN<Kernel_traits, true>(params); 35 | } 36 | 37 | extern "C" __global__ void fmha_fprop_fp16_128_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) { 38 | fmha::device_1xN<Kernel_traits, false>(params); 39 | } 40 | 41 | void run_fmha_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) { 42 | 43 | auto kernel = is_training ? &fmha_fprop_fp16_128_64_sm80_train_kernel : &fmha_fprop_fp16_128_64_sm80_predict_kernel; 44 | 45 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 46 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 47 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 48 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 49 | 50 | constexpr int smem_size = smem_size_q + std::max(smem_size_v, smem_size_o + smem_size_softmax); 51 | 52 | if( smem_size >= 48 * 1024 ) { 53 | FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 54 | } 55 | 56 | dim3 grid(params.h, params.b); 57 | kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params); 58 | } 59 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_fprop_kernel_1xN.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 256, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_fprop_fp16_256_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::device_1xN<Kernel_traits, true>(params); 35 | } 36 | 37 | extern "C" __global__ void fmha_fprop_fp16_256_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) { 38 | fmha::device_1xN<Kernel_traits, false>(params); 39 | } 40 | 41 | void run_fmha_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) { 42 | 43 | auto kernel = is_training ? &fmha_fprop_fp16_256_64_sm80_train_kernel : &fmha_fprop_fp16_256_64_sm80_predict_kernel; 44 | 45 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 46 | constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE; 47 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 48 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 49 | 50 | constexpr int smem_size = smem_size_q + std::max(smem_size_v, smem_size_o + smem_size_softmax); 51 | 52 | if( smem_size >= 48 * 1024 ) { 53 | FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 54 | } 55 | 56 | dim3 grid(params.h, params.b); 57 | kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params); 58 | } 59 | -------------------------------------------------------------------------------- /apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include "fmha.h" 29 | #include "fmha_fprop_kernel_1xN_reload_v.h" 30 | 31 | using Kernel_traits = FMHA_kernel_traits< 384, 64, 16, 1, 4, 0x08u>; 32 | 33 | extern "C" __global__ void fmha_fprop_fp16_384_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) { 34 | fmha::device_1xN<Kernel_traits, true>(params); 35 | } 36 | 37 | extern "C" __global__ void fmha_fprop_fp16_384_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) { 38 | fmha::device_1xN<Kernel_traits, false>(params); 39 | } 40 | 41 | void run_fmha_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) { 42 | 43 | auto kernel = is_training ?
&fmha_fprop_fp16_384_64_sm80_train_kernel : &fmha_fprop_fp16_384_64_sm80_predict_kernel; 44 | 45 | constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float); 46 | constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE; 47 | constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE; 48 | 49 | constexpr int smem_size = smem_size_v + smem_size_o + smem_size_softmax; 50 | 51 | if( smem_size >= 48 * 1024 ) { 52 | FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); 53 | } 54 | 55 | dim3 grid(params.h, params.b); 56 | kernel<<>>(params); 57 | } 58 | -------------------------------------------------------------------------------- /apex/contrib/csrc/groupbn/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #ifndef CUDA_UTILS_H 3 | #define CUDA_UTILS_H 4 | 5 | namespace at { 6 | namespace cuda { 7 | 8 | namespace utils { 9 | 10 | static inline int MaxSharedMemoryPerMultiprocessor(int device_id) { 11 | return getDeviceProperties(device_id)->sharedMemPerMultiprocessor; 12 | } 13 | 14 | 15 | } 16 | } 17 | } 18 | 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /apex/contrib/csrc/groupbn/ipc.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "THC/THC.h" 6 | 7 | #include 8 | 9 | #include "compat.h" 10 | 11 | 12 | #define cudaCheckErrors(msg) \ 13 | do { \ 14 | cudaError_t __err = cudaGetLastError(); \ 15 | if (__err != cudaSuccess) { \ 16 | fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 17 | msg, cudaGetErrorString(__err), \ 18 | __FILE__, __LINE__); \ 19 | fprintf(stderr, "*** FAILED - ABORTING\n"); \ 20 | exit(1); \ 21 | } \ 22 | } while (0) 23 | 24 | template<> 25 | struct std::hash { 26 | size_t operator() (const cudaIpcMemHandle_t& handle) const { 27 | size_t hash = 0; 28 | uint8_t* ptr = (uint8_t*)&handle; 29 | assert(sizeof(uint8_t) == 1); 30 | for (int i=0; i 39 | struct std::equal_to { 40 | bool operator() (const cudaIpcMemHandle_t &lhs, 41 | const cudaIpcMemHandle_t &rhs) const { 42 | return (std::memcmp((void*) &lhs, 43 | (void*) &rhs, 44 | sizeof(cudaIpcMemHandle_t)) == 0); 45 | } 46 | }; 47 | 48 | namespace { 49 | 50 | namespace gpuipc { 51 | //from: src/operator/nn/cudnn/nhwc_batch_norm_kernel.h 52 | // The number of threads per pixel. 53 | const int THREADS_PER_PIXEL = 16; 54 | // The number of elements per ldg. 
55 | const int ELEMENTS_PER_LDG = 4; 56 | // The number of reducing ops, each uses its own space : mean, var, dscale, dbias 57 | const int REDUCE_OPS = 4; 58 | // Maximum block.y supported - limited due to buffer allocation 59 | const int MAX_BLOCK_Y = 256; 60 | const int MAX_OFFSET = REDUCE_OPS*MAX_BLOCK_Y; 61 | const int BYTES_PER_ELEM = 4; 62 | // Buffer size per sync step 63 | const int SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET*THREADS_PER_PIXEL*2*ELEMENTS_PER_LDG*BYTES_PER_ELEM; 64 | }; 65 | 66 | class IpcMemHandleRegistry { 67 | public: 68 | void* getPtr(const cudaIpcMemHandle_t& handle, int64_t offset) { 69 | if (registry_.count(handle) == 0) { 70 | registry_.insert(std::make_pair(handle, RegistryEntry())); 71 | registry_[handle].dev_ptr = ipcOpenMem(handle); 72 | } 73 | registry_[handle].ref_count++; 74 | return (((uint8_t*)registry_[handle].dev_ptr) + offset); 75 | } 76 | 77 | void releasePtr(const cudaIpcMemHandle_t& handle) { 78 | if (registry_.count(handle) == 0) { 79 | } 80 | if (--registry_[handle].ref_count == 0) { 81 | ipcCloseMem(registry_[handle].dev_ptr); 82 | registry_.erase(handle); 83 | } 84 | } 85 | 86 | struct RegistryEntry { 87 | void* dev_ptr; 88 | int ref_count; 89 | RegistryEntry() : dev_ptr(NULL) , ref_count(0) {} 90 | }; 91 | 92 | protected: 93 | std::unordered_map registry_; 94 | 95 | void* ipcOpenMem(const cudaIpcMemHandle_t& handle) { 96 | void *data; 97 | cudaIpcOpenMemHandle(&data, handle, cudaIpcMemLazyEnablePeerAccess); 98 | cudaCheckErrors("ipc init"); 99 | return data; 100 | } 101 | 102 | void ipcCloseMem(void* dev_ptr) { 103 | cudaIpcCloseMemHandle(dev_ptr); 104 | cudaCheckErrors("ipc close"); 105 | } 106 | 107 | }; 108 | 109 | } 110 | 111 | static IpcMemHandleRegistry ipc_mem_registry; 112 | 113 | int64_t get_buffer_size(const int bn_sync_steps) { 114 | return bn_sync_steps * gpuipc::SINGLE_SYNC_BUFFER_BYTES; 115 | } 116 | 117 | void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) { 118 | cudaIpcMemHandle_t my_handle; 119 | memcpy((unsigned char *)(&my_handle), handle.DATA_PTR(), sizeof(my_handle)); 120 | return ipc_mem_registry.getPtr(my_handle, offset); 121 | } 122 | 123 | void close_remote_data(const at::Tensor& handle) { 124 | cudaIpcMemHandle_t my_handle; 125 | memcpy((unsigned char *)(&my_handle), handle.DATA_PTR(), sizeof(my_handle)); 126 | ipc_mem_registry.releasePtr(my_handle); 127 | } 128 | 129 | void* get_data_ptr( 130 | const at::Tensor& data) { 131 | return data.DATA_PTR(); 132 | } 133 | -------------------------------------------------------------------------------- /apex/contrib/csrc/layer_norm/ln_api.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ATen/cuda/CUDAContext.h" 3 | 4 | void ln_fwd_cuda(at::Tensor &y, at::Tensor &mu, at::Tensor &rsigma, 5 | const at::Tensor &x, const at::Tensor &gamma, 6 | const at::Tensor &beta, const float epsilon, const int rows, const int cols, 7 | cudaStream_t stream); 8 | 9 | void ln_bwd_cuda(at::Tensor &dx, at::Tensor &dgamma, at::Tensor &dbeta, 10 | const at::Tensor &dw, const at::Tensor &x, 11 | const at::Tensor &mu, const at::Tensor &rsigma, 12 | const at::Tensor &gamma, const int rows, const int cols, cudaStream_t stream); 13 | 14 | 15 | std::vector ln_fwd(const at::Tensor &x, // BxSxhidden_size 16 | const at::Tensor &gamma, // hidden_size 17 | const at::Tensor &beta, // hidden_size 18 | const float epsilon 19 | ) { 20 | 21 | TORCH_CHECK(x.is_cuda()) 22 | TORCH_CHECK(gamma.is_cuda()) 23 | TORCH_CHECK(beta.is_cuda()) 
24 | 25 | TORCH_CHECK(x.is_contiguous()); 26 | auto sizes = x.sizes(); 27 | TORCH_CHECK(sizes.size() == 2); 28 | 29 | const int rows = sizes[0]; 30 | const int cols = sizes[1]; 31 | 32 | auto dtype = x.scalar_type(); 33 | 34 | TORCH_CHECK(gamma.dtype() == dtype); 35 | TORCH_CHECK(beta.dtype() == dtype); 36 | 37 | TORCH_CHECK(gamma.sizes() == beta.sizes()); 38 | TORCH_CHECK(gamma.numel() == cols); 39 | 40 | TORCH_CHECK(epsilon >= 0.f); 41 | 42 | auto stream = at::cuda::getCurrentCUDAStream().stream(); 43 | 44 | auto y = torch::empty_like(x); 45 | 46 | auto opts = x.options(); 47 | 48 | auto mu = torch::empty({rows}, opts.dtype(torch::kFloat32)); 49 | auto rsigma = torch::empty({rows}, opts.dtype(torch::kFloat32)); 50 | 51 | ln_fwd_cuda(y, mu, rsigma, x, gamma, beta, epsilon, rows, cols, stream); 52 | 53 | return {y, mu, rsigma}; 54 | } 55 | 56 | 57 | 58 | std::vector ln_bwd(const at::Tensor &dw, // BxSxhidden_size 59 | const at::Tensor &x, // BxSxhidden_size 60 | const at::Tensor &mu, // BxS, FP32! 61 | const at::Tensor &rsigma, // BxS, FP32! 62 | const at::Tensor &gamma // hidden_size 63 | ) { 64 | 65 | TORCH_CHECK(x.is_cuda()); 66 | TORCH_CHECK(dw.is_cuda()); 67 | TORCH_CHECK(mu.is_cuda()); 68 | TORCH_CHECK(rsigma.is_cuda()); 69 | TORCH_CHECK(gamma.is_cuda()); 70 | 71 | TORCH_CHECK(x.is_contiguous()); 72 | TORCH_CHECK(dw.is_contiguous()); 73 | 74 | auto sizes = x.sizes(); 75 | TORCH_CHECK(sizes.size() == 2); 76 | TORCH_CHECK(dw.sizes() == sizes); 77 | auto rows = sizes[0]; 78 | auto cols = sizes[1]; 79 | 80 | auto dtype = x.scalar_type(); 81 | TORCH_CHECK(dw.dtype() == dtype); 82 | TORCH_CHECK(gamma.dtype() == dtype); 83 | TORCH_CHECK(mu.dtype() == torch::kFloat32); 84 | TORCH_CHECK(rsigma.dtype() == torch::kFloat32); 85 | TORCH_CHECK(mu.sizes() == rsigma.sizes()); 86 | TORCH_CHECK(mu.numel() == rows); 87 | 88 | TORCH_CHECK(gamma.numel() == cols); 89 | 90 | 91 | auto stream = at::cuda::getCurrentCUDAStream().stream(); 92 | 93 | auto dx = torch::empty_like(x); 94 | auto dgamma = torch::empty_like(gamma); 95 | auto dbeta = torch::empty_like(gamma); 96 | 97 | ln_bwd_cuda(dx, dgamma, dbeta, dw, x, mu, rsigma, gamma, rows, cols, stream); 98 | 99 | return {dx, dgamma, dbeta}; 100 | } 101 | 102 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 103 | m.doc() = "CUDA LayerNorm"; // optional module docstring 104 | m.def("ln_fwd", &ln_fwd, "Run LayerNorm forward kernel"); 105 | m.def("ln_bwd", &ln_bwd, "Run LayerNorm backward kernel"); 106 | } 107 | -------------------------------------------------------------------------------- /apex/contrib/csrc/layer_norm/ln_kernel_traits.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | constexpr uint32_t THREADS_PER_WARP = 32; 4 | 5 | template 7 | struct Kernel_traits { 8 | enum { WARPS_M = WARPS_M_ }; 9 | enum { WARPS_N = WARPS_N_ }; 10 | enum { COLS = COLS_ }; 11 | enum { BYTES_PER_LDG = BYTES_PER_LDG_ }; 12 | 13 | using Vec = Vec; 14 | 15 | using vec_t = typename Vec::vec_t; 16 | using base_t = typename Vec::base_t; 17 | using packed_t = typename Vec::packed_t; 18 | using compute_t = typename Vec::compute_t; 19 | using packed_compute_t = typename Vec::packed_compute_t; 20 | 21 | enum { THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP }; 22 | enum { THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW }; 23 | enum { ROWS_PER_CTA = WARPS_M }; 24 | 25 | enum { BYTES_PER_ROW = COLS * sizeof(base_t) }; 26 | enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG }; 27 | enum {SMEM_BYTES = ROWS_PER_CTA * COLS * 
sizeof(compute_t)}; 28 | }; 29 | -------------------------------------------------------------------------------- /apex/contrib/csrc/layer_norm/utils.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "torch/extension.h" 4 | #include // for CUDNN_CHECK 5 | 6 | #define DIVUP(x, y) (((x) + ((y)-1)) / (y)) 7 | 8 | #define DISPATCH_FLOAT_AND_HALF(TYPE, NAME, ...) \ 9 | [&] { \ 10 | const auto &the_type = TYPE; \ 11 | /* don't use TYPE again in case it is an expensive or side-effect op */ \ 12 | at::ScalarType _st = ::detail::scalar_type(the_type); \ 13 | switch (_st) { \ 14 | AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ 15 | AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ 16 | default: \ 17 | AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ 18 | } \ 19 | }() 20 | 21 | template struct Vec_type {}; 22 | 23 | template <> struct Vec_type<16> { 24 | using Type = uint4; 25 | static __device__ inline Type zero() { return make_uint4(0, 0, 0, 0); } 26 | }; 27 | template <> struct Vec_type<8> { 28 | using Type = uint2; 29 | static __device__ inline Type zero() { return make_uint2(0, 0); } 30 | }; 31 | 32 | template <> struct Vec_type<4> { 33 | using Type = uint32_t; 34 | static __device__ inline Type zero() { return 0; } 35 | }; 36 | 37 | template <> struct Vec_type<2> { 38 | using Type = uint16_t; 39 | static __device__ inline Type zero() { return 0; } 40 | }; 41 | 42 | template struct TypeInfo { 43 | using base_t = T; 44 | using packed_t = T; 45 | using compute_t = float; 46 | using packed_compute_t = float; 47 | }; 48 | 49 | template <> struct TypeInfo { 50 | using base_t = half; 51 | using packed_t = half2; 52 | using compute_t = float; 53 | using packed_compute_t = float2; 54 | }; 55 | 56 | template struct Vec { 57 | 58 | using base_t = typename TypeInfo::base_t; 59 | using packed_t = typename TypeInfo::packed_t; 60 | using compute_t = typename TypeInfo::compute_t; 61 | using packed_compute_t = typename TypeInfo::packed_compute_t; 62 | 63 | static_assert(Bytes % sizeof(base_t) == 0, ""); 64 | static_assert(Bytes % sizeof(packed_t) == 0, ""); 65 | enum { BYTES_PER_THREAD = Bytes }; 66 | enum { NUM_ELTS = Bytes / sizeof(base_t) }; 67 | enum { NUM_PACKED = Bytes / sizeof(packed_t) }; 68 | using vec_t = typename Vec_type::Type; 69 | using store_t = union { 70 | vec_t raw; 71 | base_t elt[NUM_ELTS]; 72 | packed_t packed[NUM_PACKED]; 73 | }; 74 | store_t data; 75 | 76 | __device__ Vec() { data.raw = Vec_type::zero(); } 77 | 78 | __device__ inline void load_from(const char *ptr) { 79 | data.raw = *reinterpret_cast(ptr); 80 | } 81 | 82 | __device__ inline void load_or_zero(const char *ptr, const bool is_valid) { 83 | data.raw = is_valid ? 
*reinterpret_cast(ptr) 84 | : Vec_type::zero(); 85 | } 86 | 87 | __device__ inline void store_to(char *ptr) const { 88 | *reinterpret_cast(ptr) = data.raw; 89 | } 90 | 91 | __device__ inline void store_valid(char *ptr, const bool is_valid) const { 92 | if (is_valid) 93 | *reinterpret_cast(ptr) = data.raw; 94 | } 95 | }; 96 | -------------------------------------------------------------------------------- /apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace multihead_attn { 6 | namespace fused_softmax { 7 | namespace additive_mask_softmax_dropout { 8 | 9 | std::vector fwd_cuda( 10 | bool is_training, 11 | int heads, 12 | torch::Tensor const& input, 13 | const half* pad_mask, 14 | float dropout_prob 15 | ); 16 | 17 | torch::Tensor bwd_cuda( 18 | int heads, 19 | torch::Tensor const& output_grads, 20 | torch::Tensor const& softmax_results, 21 | torch::Tensor const& dropout_mask, 22 | float dropout_prob 23 | ); 24 | 25 | // C++ interface 26 | 27 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 28 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 29 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 30 | 31 | std::vector fwd( 32 | bool use_mask, 33 | bool is_training, 34 | int heads, 35 | torch::Tensor const& input, 36 | torch::Tensor const& pad_mask, 37 | float dropout_prob 38 | ) 39 | { 40 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 41 | AT_ASSERTM(input.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 42 | 43 | if (use_mask) { 44 | AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor"); 45 | AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Half, "Only BYTE is supported"); 46 | } 47 | 48 | return fwd_cuda( 49 | is_training, 50 | heads, 51 | input, 52 | use_mask ? 
static_cast(pad_mask.data_ptr()) : nullptr, 53 | dropout_prob 54 | ); 55 | } 56 | 57 | torch::Tensor bwd( 58 | bool use_mask, 59 | int heads, 60 | torch::Tensor const& output_grads, 61 | torch::Tensor const& softmax_results, 62 | torch::Tensor const& dropout_mask, 63 | float dropout_prob 64 | ) 65 | { 66 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 67 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 68 | AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor"); 69 | 70 | AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 71 | AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 72 | // AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte, "Only BYTE is supported"); 73 | 74 | return bwd_cuda( 75 | heads, 76 | output_grads, 77 | softmax_results, 78 | dropout_mask, 79 | dropout_prob 80 | ); 81 | } 82 | 83 | } // end namespace mask_softmax_dropout 84 | } // end namespace fused_softmax 85 | } // end namespace multihead_attn 86 | 87 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 88 | m.def("forward", &multihead_attn::fused_softmax::additive_mask_softmax_dropout::fwd, "Self Multihead Attention masked softmax dropout -- Forward."); 89 | m.def("backward", &multihead_attn::fused_softmax::additive_mask_softmax_dropout::bwd, "Self Multihead Attention masked softmax dropout -- Backward."); 90 | } 91 | 92 | -------------------------------------------------------------------------------- /apex/contrib/csrc/multihead_attn/philox.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //Philox CUDA. 3 | 4 | class Philox { 5 | public: 6 | __device__ inline Philox(unsigned long long seed, 7 | unsigned long long subsequence, 8 | unsigned long long offset) { 9 | key.x = (unsigned int)seed; 10 | key.y = (unsigned int)(seed >> 32); 11 | counter = make_uint4(0, 0, 0, 0); 12 | counter.z = (unsigned int)(subsequence); 13 | counter.w = (unsigned int)(subsequence >> 32); 14 | STATE = 0; 15 | incr_n(offset / 4); 16 | } 17 | __device__ inline uint4 operator()() { 18 | if(STATE == 0) { 19 | uint4 counter_ = counter; 20 | uint2 key_ = key; 21 | //7-round philox 22 | for(int i = 0; i < 6; i++) { 23 | counter_ = single_round(counter_, key_); 24 | key_.x += (kPhilox10A); key_.y += (kPhilox10B); 25 | } 26 | output = single_round(counter_, key_); 27 | incr(); 28 | } 29 | //return a float4 directly 30 | //unsigned long ret; 31 | //switch(STATE) { 32 | // case 0: ret = output.x; break; 33 | // case 1: ret = output.y; break; 34 | // case 2: ret = output.z; break; 35 | // case 3: ret = output.w; break; 36 | //} 37 | //STATE = (STATE + 1) % 4; 38 | return output; 39 | } 40 | private: 41 | uint4 counter; 42 | uint4 output; 43 | uint2 key; 44 | unsigned int STATE; 45 | __device__ inline void incr_n(unsigned long long n) { 46 | unsigned int nlo = (unsigned int)(n); 47 | unsigned int nhi = (unsigned int)(n >> 32); 48 | counter.x += nlo; 49 | if (counter.x < nlo) 50 | nhi++; 51 | counter.y += nhi; 52 | if (nhi <= counter.y) 53 | return; 54 | if (++counter.z) 55 | return; 56 | ++counter.w; 57 | } 58 | __device__ inline void incr() { 59 | if (++counter.x) 60 | return; 61 | if (++counter.y) 62 | return; 63 | if (++counter.z) 64 | return; 65 | ++counter.w; 66 | } 67 | __device__ unsigned int mulhilo32(unsigned int a, unsigned int b, 68 | unsigned int *result_high) { 69 | *result_high = __umulhi(a, b); 70 | return a*b; 71 | } 72 | __device__ inline uint4 
single_round(uint4 ctr, uint2 key) { 73 | unsigned int hi0; 74 | unsigned int hi1; 75 | unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); 76 | unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); 77 | uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; 78 | return ret; 79 | } 80 | static const unsigned long kPhilox10A = 0x9E3779B9; 81 | static const unsigned long kPhilox10B = 0xBB67AE85; 82 | static const unsigned long kPhiloxSA = 0xD2511F53; 83 | static const unsigned long kPhiloxSB = 0xCD9E8D57; 84 | }; 85 | // Inverse of 2^32. 86 | #define M_RAN_INVM32 2.3283064e-10f 87 | __device__ __inline__ float4 uniform4(uint4 x) { 88 | return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,x.w * M_RAN_INVM32); 89 | 90 | } 91 | -------------------------------------------------------------------------------- /apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | const float lr, 8 | const float beta1, 9 | const float beta2, 10 | const float epsilon, 11 | const int step, 12 | const int bias_correction, 13 | const float weight_decay, 14 | const int grad_averaging, 15 | const int mode, 16 | const float global_grad_norm, 17 | const float max_grad_norm); 18 | 19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 20 | m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer"); 21 | } 22 | -------------------------------------------------------------------------------- /apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_fused_adam_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | at::Tensor per_tensor_beta1, 8 | at::Tensor per_tensor_beta2, 9 | at::Tensor per_tensor_bias_correction, 10 | at::Tensor per_tensor_eps, 11 | at::Tensor per_tensor_weight_decay, 12 | float lr, 13 | float grad_scale, 14 | int step, 15 | int mode); 16 | 17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 18 | m.def("multi_tensor_fused_adam", &multi_tensor_fused_adam_cuda, 19 | "Multi tensor Adam optimized CUDA implementation."); 20 | } 21 | -------------------------------------------------------------------------------- /apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_compute_update_term_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector> tensor_lists, 7 | at::Tensor per_tensor_beta1, 8 | at::Tensor per_tensor_beta2, 9 | at::Tensor per_tensor_beta3, 10 | at::Tensor per_tensor_bias_correction, 11 | at::Tensor step, 12 | at::Tensor per_tensor_epsilon, 13 | const int mode, 14 | at::Tensor per_tensor_decay, 15 | at::Tensor global_scale, 16 | at::Tensor global_grad_norm, 17 | const float max_grad_norm); 18 | 19 | void multi_tensor_lamb_update_weights_cuda( 20 | int chunk_size, 21 | at::Tensor noop_flag, 22 | std::vector> tensor_lists, 23 | at::Tensor per_tensor_param_norm, 24 | at::Tensor per_tensor_update_norm, 25 | at::Tensor update_norm_offset, 26 | at::Tensor learning_rate, 27 | at::Tensor per_tensor_decay, 28 | at::Tensor global_grad_norm, 29 | bool use_nvlamb); 30 | 31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 32 | 
m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda, 33 | "Computes update term for LAMB optimizer"); 34 | m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda, 35 | "Applies update term for LAMB optimizer"); 36 | } 37 | -------------------------------------------------------------------------------- /apex/contrib/csrc/transducer/transducer_joint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 7 | 8 | std::vector transducer_joint_cuda_forward( 9 | torch::Tensor f, 10 | torch::Tensor g, 11 | torch::Tensor fLen, 12 | torch::Tensor gLen, 13 | torch::Tensor batchOffset, 14 | int64_t packedBatch, 15 | int opt, 16 | bool packOutput, 17 | bool relu, 18 | bool dropout, 19 | float dropoutProb, 20 | int tileSize); 21 | 22 | 23 | std::vector transducer_joint_cuda_backward( 24 | std::vector in, 25 | torch::Tensor fLen, 26 | torch::Tensor gLen, 27 | torch::Tensor batchOffset, 28 | int maxFLen, 29 | int maxGLen, 30 | bool packOutput, 31 | float scale); 32 | 33 | std::vector transducer_joint_forward( 34 | torch::Tensor f, 35 | torch::Tensor g, 36 | torch::Tensor fLen, 37 | torch::Tensor gLen, 38 | torch::Tensor batchOffset, 39 | int64_t packedBatch, 40 | int opt, 41 | bool packOutput, 42 | bool relu, 43 | bool dropout, 44 | float dropoutProb, 45 | int tileSize) { 46 | CHECK_INPUT(f); 47 | CHECK_INPUT(g); 48 | CHECK_INPUT(fLen); 49 | CHECK_INPUT(gLen); 50 | if (packOutput) 51 | CHECK_INPUT(batchOffset); 52 | return transducer_joint_cuda_forward( 53 | f, 54 | g, 55 | fLen, 56 | gLen, 57 | batchOffset, 58 | packedBatch, 59 | opt, 60 | packOutput, 61 | relu, 62 | dropout, 63 | dropoutProb, 64 | tileSize); 65 | } 66 | 67 | std::vector transducer_joint_backward( 68 | std::vector in, 69 | torch::Tensor fLen, 70 | torch::Tensor gLen, 71 | torch::Tensor batchOffset, 72 | int maxFLen, 73 | int maxGLen, 74 | bool packOutput, 75 | float scale) { 76 | for (auto t : in){ 77 | CHECK_INPUT(t); 78 | } 79 | CHECK_INPUT(fLen); 80 | CHECK_INPUT(gLen); 81 | if (packOutput) 82 | CHECK_INPUT(batchOffset); 83 | return transducer_joint_cuda_backward( 84 | in, 85 | fLen, 86 | gLen, 87 | batchOffset, 88 | maxFLen, 89 | maxGLen, 90 | packOutput, 91 | scale); 92 | } 93 | 94 | 95 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 96 | m.def("forward", &transducer_joint_forward, "transducer joint forward (CUDA)"); 97 | m.def("backward", &transducer_joint_backward, "transducer joint backward (CUDA)"); 98 | } -------------------------------------------------------------------------------- /apex/contrib/csrc/transducer/transducer_loss.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 7 | 8 | std::vector transducer_loss_cuda_forward( 9 | torch::Tensor x, 10 | torch::Tensor label, 11 | torch::Tensor audLen, 12 | torch::Tensor txtLen, 13 | torch::Tensor batchOffset, 14 | int maxFLen, 15 | int blankIdx, 16 | int opt, 17 | bool packedInput); 18 | 19 | torch::Tensor transducer_loss_cuda_backward( 20 
| torch::Tensor x, 21 | torch::Tensor lossGrad, 22 | torch::Tensor alpha, 23 | torch::Tensor beta, 24 | torch::Tensor audLen, 25 | torch::Tensor txtLen, 26 | torch::Tensor label, 27 | torch::Tensor batchOffset, 28 | int maxFLen, 29 | int blankIdx, 30 | int opt, 31 | bool fuseSoftmaxBackward, 32 | bool packedInput); 33 | 34 | 35 | std::vector transducer_loss_forward( 36 | torch::Tensor x, 37 | torch::Tensor label, 38 | torch::Tensor fLen, 39 | torch::Tensor yLen, 40 | torch::Tensor batchOffset, 41 | int maxFLen, 42 | int blankIdx, 43 | int opt, 44 | bool packedInput 45 | ) { 46 | 47 | CHECK_INPUT(x); 48 | CHECK_INPUT(label); 49 | CHECK_INPUT(fLen); 50 | CHECK_INPUT(yLen); 51 | if (packedInput) 52 | CHECK_INPUT(batchOffset); 53 | return transducer_loss_cuda_forward( 54 | x, 55 | label, 56 | fLen, 57 | yLen, 58 | batchOffset, 59 | maxFLen, 60 | blankIdx, 61 | opt, 62 | packedInput); 63 | } 64 | 65 | torch::Tensor transducer_loss_backward( 66 | torch::Tensor x, 67 | torch::Tensor lossGrad, 68 | torch::Tensor alpha, 69 | torch::Tensor beta, 70 | torch::Tensor fLen, 71 | torch::Tensor yLen, 72 | torch::Tensor label, 73 | torch::Tensor batchOffset, 74 | int maxFLen, 75 | int blankIdx, 76 | int opt, 77 | bool fuseSoftmaxBackward, 78 | bool packedInput){ 79 | 80 | CHECK_INPUT(x); 81 | CHECK_INPUT(label); 82 | CHECK_INPUT(lossGrad); 83 | CHECK_INPUT(alpha); 84 | CHECK_INPUT(beta); 85 | CHECK_INPUT(fLen); 86 | CHECK_INPUT(yLen); 87 | if (packedInput) 88 | CHECK_INPUT(batchOffset); 89 | 90 | return transducer_loss_cuda_backward( 91 | x, 92 | lossGrad, 93 | alpha, 94 | beta, 95 | fLen, 96 | yLen, 97 | label, 98 | batchOffset, 99 | maxFLen, 100 | blankIdx, 101 | opt, 102 | fuseSoftmaxBackward, 103 | packedInput); 104 | } 105 | 106 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 107 | m.def("forward", &transducer_loss_forward, "transducer loss forward (CUDA)"); 108 | m.def("backward", &transducer_loss_backward, "transducer loss backward (CUDA)"); 109 | } 110 | -------------------------------------------------------------------------------- /apex/contrib/csrc/xentropy/interface.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // CUDA forward declarations 4 | 5 | std::vector softmax_xentropy_cuda( 6 | const at::Tensor &input, 7 | const at::Tensor &labels, 8 | const float smoothing, 9 | const bool half_to_float); 10 | 11 | at::Tensor softmax_xentropy_backward_cuda( 12 | const at::Tensor &grad_loss, 13 | const at::Tensor &logits, 14 | const at::Tensor &max_log_sum_exp, 15 | const at::Tensor &labels, 16 | const float smoothing); 17 | 18 | // C++ interface 19 | 20 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 21 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 22 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 23 | 24 | std::vector softmax_xentropy_forward( 25 | const at::Tensor &input, 26 | const at::Tensor &labels, 27 | const float smoothing, 28 | const bool half_to_float) { 29 | CHECK_CUDA(input); 30 | CHECK_INPUT(labels); 31 | 32 | return softmax_xentropy_cuda(input, labels, smoothing, half_to_float); 33 | } 34 | 35 | at::Tensor softmax_xentropy_backward( 36 | const at::Tensor &grad_loss, 37 | const at::Tensor &logits, 38 | const at::Tensor &max_log_sum_exp, 39 | const at::Tensor &labels, 40 | const float smoothing) { 41 | CHECK_CUDA(grad_loss); 42 | CHECK_CUDA(logits); 43 | CHECK_INPUT(max_log_sum_exp); 44 | CHECK_INPUT(labels); 45 | 46 | return 
softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing); 47 | } 48 | 49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 50 | m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)"); 51 | m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)"); 52 | } 53 | -------------------------------------------------------------------------------- /apex/contrib/fmha/__init__.py: -------------------------------------------------------------------------------- 1 | from .fmha import FMHAFun 2 | -------------------------------------------------------------------------------- /apex/contrib/fmha/fmha.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | ############################################################################### 27 | 28 | 29 | import torch 30 | import torch.nn.functional as F 31 | import fmhalib as mha 32 | 33 | class FMHAFun(torch.autograd.Function): 34 | @staticmethod 35 | def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training): 36 | batch_size = cu_seqlens.numel() - 1 37 | if batch_size < 4: 38 | context, S_dmask = mha.fwd_nl(qkv, cu_seqlens, p_dropout, max_s, is_training, None) 39 | else: 40 | context, S_dmask = mha.fwd(qkv, cu_seqlens, p_dropout, max_s, is_training, None) 41 | ctx.save_for_backward(qkv, S_dmask) 42 | ctx.cu_seqlens = cu_seqlens 43 | ctx.p_dropout = p_dropout 44 | ctx.max_s = max_s 45 | return context 46 | 47 | @staticmethod 48 | def backward(ctx, dout): 49 | qkv, S_dmask = ctx.saved_tensors 50 | batch_size = ctx.cu_seqlens.numel() - 1 51 | if batch_size < 4: 52 | dqkv, dp, _ = mha.bwd_nl(dout, qkv, S_dmask, ctx.cu_seqlens, ctx.p_dropout, ctx.max_s) 53 | else: 54 | dqkv, dp = mha.bwd(dout, qkv, S_dmask, ctx.cu_seqlens, ctx.p_dropout, ctx.max_s) 55 | 56 | return dqkv, None, None, None, None, None, None 57 | 58 | class FMHA(torch.nn.Module): 59 | 60 | def __init__(self, config): 61 | 62 | super(FMHA, self).__init__() 63 | 64 | self.p_dropout = config.attention_probs_dropout_prob 65 | self.h = config.num_attention_heads 66 | self.hidden_size = config.hidden_size 67 | self.d = self.hidden_size // self.h 68 | assert self.d * self.h == self.hidden_size, "Invalid hidden size/num_heads" 69 | 70 | def forward(self, qkv, cu_seqlens, max_s, is_training=True): 71 | 72 | ctx = FMHAFun.apply(qkv.view(-1, 3, self.h, self.d), cu_seqlens, self.p_dropout, max_s, is_training) 73 | 74 | return ctx.view(-1, self.hidden_size) 75 | -------------------------------------------------------------------------------- /apex/contrib/groupbn/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | import bnp 4 | from .batch_norm import BatchNorm2d_NHWC 5 | del torch 6 | del bnp 7 | del batch_norm 8 | except ImportError as err: 9 | print("apex was installed without --bnp flag, contrib.groupbn is not available") 10 | -------------------------------------------------------------------------------- /apex/contrib/layer_norm/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_norm import FastLayerNorm 2 | -------------------------------------------------------------------------------- /apex/contrib/layer_norm/layer_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import init 3 | 4 | import fast_layer_norm 5 | 6 | class FastLayerNormFN(torch.autograd.Function): 7 | @staticmethod 8 | def forward(ctx, x, gamma, beta, epsilon): 9 | x = x.contiguous() 10 | gamma = gamma.contiguous() 11 | beta = beta.contiguous() 12 | hidden_size = gamma.numel() 13 | xmat = x.view((-1, hidden_size)) 14 | ymat, mu, rsigma = fast_layer_norm.ln_fwd(xmat, gamma, beta, epsilon) 15 | ctx.save_for_backward(x, gamma, mu, rsigma) 16 | return ymat.view(x.shape) 17 | 18 | @staticmethod 19 | def backward(ctx, dy): 20 | #assert dy.is_contiguous() 21 | dy = dy.contiguous() # this happens! 
22 | x, gamma, mu, rsigma = ctx.saved_tensors 23 | 24 | hidden_size = gamma.numel() 25 | xmat = x.view((-1, hidden_size)) 26 | dymat = dy.view(xmat.shape) 27 | dxmat, dgamma, dbeta = fast_layer_norm.ln_bwd(dymat, xmat, mu, rsigma, gamma) 28 | dx = dxmat.view(x.shape) 29 | return dx, dgamma, dbeta, None 30 | 31 | class FastLayerNorm(torch.nn.Module): 32 | def __init__(self, hidden_size, eps=1e-5): 33 | super(FastLayerNorm, self).__init__() 34 | self.epsilon = eps 35 | self.weight = torch.nn.Parameter(torch.Tensor(hidden_size)) 36 | self.bias = torch.nn.Parameter(torch.Tensor(hidden_size)) 37 | self.reset_parameters() 38 | 39 | def reset_parameters(self): 40 | init.ones_(self.weight) 41 | init.zeros_(self.bias) 42 | 43 | def forward(self, x): 44 | return FastLayerNormFN.apply(x, self.weight, self.bias, self.epsilon) 45 | -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/MHA_bwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/multihead_attn/MHA_bwd.png -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/MHA_fwd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/multihead_attn/MHA_fwd.png -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/README.md: -------------------------------------------------------------------------------- 1 | # Fast Multihead Attention 2 | 3 | This implementation has two main features : 4 | * A C++ implementation to avoid the CPU overheads of Pytorch found with smaller batch sizes. 5 | * The removal of all copies and transposes found in standard implementations of Multihead Attention. 6 | 7 | | | Python Version | C++ Version | 8 | | :----------------------------------------- | :------------: | :---------: | 9 | | Layer Norm and Residual Add Variant | X | X | 10 | | Includes Linear Biases | X | | 11 | | Reduces CPU Overheads | | X | 12 | | Fuses masking with Softmax | | X | 13 | | Removes Transposes and Copies | X | X | 14 | | Includes Self and Encoder/Decoder Variants | X | X | 15 | 16 | ## How to Instantiate 17 | 18 | `SelfMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)` 19 | `EncdecMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)` 20 | 21 | `impl` has two options: 22 | * `fast` uses C++ Version 23 | * `default` uses Python Version 24 | 25 | ## Instructions to build on Linux 26 | 27 | ``` 28 | $ git clone https://github.com/NVIDIA/apex 29 | $ cd apex 30 | $ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" ./ 31 | ``` 32 | ## Try Performance Tests Yourself! 33 | Perf test script is found here! 
34 | ``` 35 | cd contrib/examples/multihead_attn 36 | ``` 37 | #### Fast Multihead Attention 38 | ``` 39 | python perf_test_multihead_attn.py --ref 40 | ``` 41 | #### Fast Multihead Attention with C++ Implementation 42 | ``` 43 | python perf_test_multihead_attn.py 44 | ``` 45 | #### Compare with `torch.nn.MultiheadAttn` 46 | ``` 47 | python perf_test_multihead_attn.py --native 48 | ``` 49 | #### Test your own range! 50 | ``` 51 | python perf_test_multihead_attn.py --seq-length 64 --num-seqs-start 10 --num-seqs-stop 120 --num-seqs-inc 5 52 | ``` 53 | 54 | ## Performance Comparisons 55 | 56 | * Performance was measured with 64 token sequence lengths on an NVIDIA TitanV card. 57 | * Time is measured across multiple layers to simulate an in model scenario. 58 | 59 | ![Multihead Attention Forward](MHA_fwd.png) 60 | ![Multihead Attention Backward](MHA_bwd.png) 61 | -------------------------------------------------------------------------------- /apex/contrib/multihead_attn/__init__.py: -------------------------------------------------------------------------------- 1 | from .self_multihead_attn import SelfMultiheadAttn 2 | from .encdec_multihead_attn import EncdecMultiheadAttn 3 | from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func 4 | -------------------------------------------------------------------------------- /apex/contrib/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16_optimizer import FP16_Optimizer 2 | from .fused_adam import FusedAdam 3 | from .fused_lamb import FusedLAMB 4 | -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/fp16_optimizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/fp16_optimizer.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/fused_adam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/fused_adam.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/optimizers/__pycache__/fused_lamb.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/contrib/optimizers/__pycache__/fused_lamb.cpython-37.pyc -------------------------------------------------------------------------------- /apex/contrib/sparsity/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to ASP 2 | 3 | This serves as a quick-start for ASP (Automatic SParsity), a tool that enables sparse training and inference for PyTorch models by adding 
2 lines of Python. 4 | 5 | ## Importing ASP 6 | ``` 7 | from apex.contrib.sparsity import ASP 8 | ``` 9 | 10 | ## Initializing ASP 11 | 12 | Apart from the import statement, it is sufficient to add just the following line of code before the training phase to augment the model and the optimizer for sparse training/inference: 13 | ``` 14 | ASP.prune_trained_model(model, optimizer) 15 | ``` 16 | 17 | In the context of a typical PyTorch training loop, it might look like this: 18 | ``` 19 | ASP.prune_trained_model(model, optimizer) 20 | 21 | x, y = DataLoader(args) 22 | for epoch in range(epochs): 23 | y_pred = model(x) 24 | loss = loss_function(y_pred, y) 25 | loss.backward() 26 | optimizer.step() 27 | 28 | torch.save(...) 29 | ``` 30 | The `prune_trained_model` step calculates the sparse mask and applies it to the weights. This is done once, i.e., sparse locations in the weights matrix remain fixed after this step. 31 | 32 | ## Generate a Sparse Network 33 | 34 | The following approach serves as a guiding example of how to generate a pruned model that can use Sparse Tensor Cores in the NVIDIA Ampere Architecture. This approach generates a model for deployment, i.e. inference mode. 35 | 36 | ``` 37 | (1) Given a fully trained (dense) network, prune parameter values in a 2:4 sparse pattern. 38 | (2) Fine-tune the pruned model using the same optimization method and hyper-parameters (learning rate, schedule, number of epochs, etc.) as those used to obtain the trained model. 39 | (3) (If required) Quantize the model. 40 | ``` 41 | 42 | In code, below is a sketch of how to use ASP for this approach (steps 1 and 2 above). 43 | 44 | ``` 45 | 46 | model = define_model(..., pretrained=True) # define model architecture and load parameter tensors with trained values (by reading a trained checkpoint) 47 | criterion = ... # compare ground truth with model prediction; use the same criterion as used to generate the dense trained model 48 | optimizer = ... # optimize model parameters; use the same optimizer as used to generate the dense trained model 49 | lr_scheduler = ... # learning rate scheduler; use the same schedule as used to generate the dense trained model 50 | 51 | from apex.contrib.sparsity import ASP 52 | ASP.prune_trained_model(model, optimizer) # prune the trained model 53 | 54 | x, y = DataLoader(args) 55 | for epoch in range(epochs): # train the pruned model for the same number of epochs as used to generate the dense trained model 56 | y_pred = model(x) 57 | loss = criterion(y_pred, y) 58 | lr_scheduler.step() 59 | loss.backward() 60 | optimizer.step() 61 | 62 | torch.save(...) # saves the pruned checkpoint with sparsity masks 63 | ``` 64 | 65 | ## Non-Standard Usage 66 | 67 | If your goal is to easily prepare a network for accelerated inference, please follow the recipe above. However, ASP can also be used to perform experiments with advanced techniques such as training with sparsity from initialization. For example, in order to recompute the sparse mask in between training steps, use the following method: 68 | 69 | ``` 70 | ASP.compute_sparse_masks() 71 | ``` 72 | 73 | A more thorough example can be found in `./test/toy_problem.py`.
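As a condensed companion to that test, the following is a minimal sketch of the non-standard flow, built only from the calls used elsewhere in this package (`init_model_for_pruning`, `init_optimizer_for_pruning`, `compute_sparse_masks`). The toy model, the SGD optimizer, the random data, the `"m4n2_2d_best"` pattern string, and the recompute schedule are illustrative assumptions, not a prescribed recipe:

```
import torch
from apex.contrib.sparsity import ASP

# Toy two-layer model; any model whose prunable layers appear in the whitelist works.
model = torch.nn.Sequential(
    torch.nn.Linear(64, 256),
    torch.nn.Linear(256, 64),
).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Augment the model and optimizer once, before training starts.
ASP.init_model_for_pruning(model, "m4n2_2d_best",
                           whitelist=[torch.nn.Linear],
                           allow_recompute_mask=True)
ASP.init_optimizer_for_pruning(optimizer)

# Compute the initial 2:4 masks, then keep refreshing them as the weights evolve.
ASP.compute_sparse_masks()
for step in range(1000):
    x = torch.randn(32, 64).cuda()
    y = torch.randn(32, 64).cuda()
    loss = ((model(x) - y) ** 2).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if (step + 1) % 250 == 0:
        # Recompute masks between training steps (requires allow_recompute_mask=True).
        ASP.compute_sparse_masks()
```

Roughly speaking, `prune_trained_model(model, optimizer)` bundles the two `init_*` calls and a single `compute_sparse_masks()` into one step; the sketch above only exposes those pieces so the mask can be recomputed during training.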
74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_masklib import create_mask 2 | from .asp import ASP 3 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/checkpointing_test_part1.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | def build_model(args): 8 | od = OrderedDict() 9 | for i in range(args.num_layers): 10 | if i == 0: 11 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 12 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 13 | elif i == args.num_layers-1: 14 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 15 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 16 | else: 17 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 18 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 19 | return torch.nn.Sequential(od) 20 | 21 | def train_step(args, model, optimizer, input_batch, target_batch, step): 22 | predicted_target = model(input_batch) 23 | loss = ((predicted_target-target_batch)**2).sum() 24 | loss.backward() 25 | optimizer.step() 26 | optimizer.zero_grad() 27 | step = step + 1 28 | #print("Step %d :: loss=%e" % (step, loss.item())) 29 | return step 30 | 31 | def train_loop(args, model, optimizer, step, num_steps): 32 | for i in range(num_steps): 33 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 34 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 35 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 36 | return step 37 | 38 | def main(args): 39 | # 40 | # PART1 41 | # 42 | 43 | torch.manual_seed(args.seed) 44 | 45 | model = build_model(args).cuda() 46 | one_ll = next(model.children()).weight 47 | optimizer = FusedAdam(model.parameters()) 48 | ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask) 49 | ASP.init_optimizer_for_pruning(optimizer) 50 | 51 | step = 0 52 | 53 | # train for a few steps with dense weights 54 | print("DENSE :: ",one_ll) 55 | step = train_loop(args, model, optimizer, step, args.num_dense_steps) 56 | 57 | # simulate sparsity by inserting zeros into existing dense weights 58 | ASP.enable_sparsity() 59 | 60 | # train for a few steps with sparse weights 61 | print("SPARSE :: ",one_ll) 62 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps) 63 | 64 | torch.save({ 65 | 'step': step, 66 | 'verbosity': args.verbosity, 67 | 'seed2': args.seed2, 68 | 'pattern': args.pattern, 69 | 'whitelist': args.whitelist, 70 | 'allow_recompute_mask': args.allow_recompute_mask, 71 | 'model_state_dict': model.state_dict(), 72 | 'optimizer_state_dict': optimizer.state_dict(), 73 | }, args.checkpoint_path) 74 | 75 | if __name__ == '__main__': 76 | class Args: 77 | verbosity=3 78 | seed = 4873 79 | seed2 = 99875 80 | pattern = "m4n2_2d_best" 81 | whitelist = [torch.nn.Linear] 82 | allow_recompute_mask = True 83 
| batch_size = 32 84 | input_features = 8 85 | output_features = 8 86 | hidden_features = 32 87 | num_layers = 4 88 | num_dense_steps = 2000 89 | num_sparse_steps = 3000 90 | num_sparse_steps_2 = 1000 91 | checkpoint_path = "part1.chkp" 92 | args = Args() 93 | 94 | main(args) 95 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/checkpointing_test_part2.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | def build_model(args): 8 | od = OrderedDict() 9 | for i in range(args.num_layers): 10 | if i == 0: 11 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 12 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 13 | elif i == args.num_layers-1: 14 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 15 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 16 | else: 17 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 18 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 19 | return torch.nn.Sequential(od) 20 | 21 | def train_step(args, model, optimizer, input_batch, target_batch, step): 22 | predicted_target = model(input_batch) 23 | loss = ((predicted_target-target_batch)**2).sum() 24 | loss.backward() 25 | optimizer.step() 26 | optimizer.zero_grad() 27 | step = step + 1 28 | #print("Step %d :: loss=%e" % (step, loss.item())) 29 | return step 30 | 31 | def train_loop(args, model, optimizer, step, num_steps): 32 | for i in range(num_steps): 33 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 34 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 35 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 36 | return step 37 | 38 | def main(step, args, model_state_dict, optimizer_state_dict): 39 | # 40 | # PART2 41 | # 42 | 43 | model = build_model(args).cuda() 44 | one_ll = next(model.children()).weight 45 | optimizer = FusedAdam(model.parameters()) 46 | ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask) 47 | ASP.init_optimizer_for_pruning(optimizer) 48 | 49 | torch.manual_seed(args.seed2) 50 | model.load_state_dict(model_state_dict) 51 | optimizer.load_state_dict(optimizer_state_dict) 52 | 53 | print("Model sparsity is %s" % ("enabled" if ASP.sparsity_is_enabled() else "disabled")) 54 | 55 | # train for a few steps with sparse weights 56 | print("SPARSE :: ",one_ll) 57 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 58 | 59 | if __name__ == '__main__': 60 | checkpoint = torch.load("part1.chkp") 61 | class Args: 62 | verbosity = checkpoint['verbosity'] 63 | seed = 4873 64 | seed2 = checkpoint['seed2'] 65 | pattern = checkpoint['pattern'] 66 | whitelist = checkpoint['whitelist'] 67 | allow_recompute_mask = checkpoint['allow_recompute_mask'] 68 | batch_size = 32 69 | input_features = 8 70 | output_features = 8 71 | hidden_features = 32 72 | num_layers = 4 73 | num_dense_steps = 2000 74 | num_sparse_steps = 3000 75 | num_sparse_steps_2 = 1000 76 | checkpoint_path = "part1.chkp" 77 | args = Args() 78 | 79 | 
main(checkpoint['step'], args, checkpoint['model_state_dict'], checkpoint['optimizer_state_dict']) 80 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/checkpointing_test_reference.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | # 8 | # Reference run for checkpointing test (part1 + part2) 9 | # 10 | 11 | def build_model(args): 12 | od = OrderedDict() 13 | for i in range(args.num_layers): 14 | if i == 0: 15 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 16 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 17 | elif i == args.num_layers-1: 18 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 19 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 20 | else: 21 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 22 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 23 | return torch.nn.Sequential(od) 24 | 25 | def train_step(args, model, optimizer, input_batch, target_batch, step): 26 | predicted_target = model(input_batch) 27 | loss = ((predicted_target-target_batch)**2).sum() 28 | loss.backward() 29 | optimizer.step() 30 | optimizer.zero_grad() 31 | step = step + 1 32 | #print("Step %d :: loss=%e" % (step, loss.item())) 33 | return step 34 | 35 | def train_loop(args, model, optimizer, step, num_steps): 36 | for i in range(num_steps): 37 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 38 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 39 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 40 | return step 41 | 42 | def main(args): 43 | # 44 | # PART1 45 | # 46 | 47 | torch.manual_seed(args.seed) 48 | 49 | model = build_model(args).cuda() 50 | one_ll = next(model.children()).weight 51 | optimizer = FusedAdam(model.parameters()) 52 | ASP.init_model_for_pruning(model, args.pattern, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask) 53 | ASP.init_optimizer_for_pruning(optimizer) 54 | 55 | step = 0 56 | 57 | # train for a few steps with dense weights 58 | print("DENSE :: ",one_ll) 59 | step = train_loop(args, model, optimizer, step, args.num_dense_steps) 60 | 61 | # simulate sparsity by inserting zeros into existing dense weights 62 | ASP.enable_sparsity() 63 | 64 | # train for a few steps with sparse weights 65 | print("SPARSE :: ",one_ll) 66 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps) 67 | 68 | # 69 | # PART 2 70 | # 71 | 72 | torch.manual_seed(args.seed2) 73 | 74 | # train for a few steps with sparse weights 75 | print("SPARSE :: ",one_ll) 76 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 77 | 78 | if __name__ == '__main__': 79 | class Args: 80 | seed = 4873 81 | seed2 = 99875 82 | pattern = "m4n2_2d_best" 83 | whitelist = [torch.nn.Linear] 84 | allow_recompute_mask = True 85 | batch_size = 32 86 | input_features = 8 87 | output_features = 8 88 | hidden_features = 32 89 | num_layers = 4 90 | num_dense_steps = 2000 91 | num_sparse_steps = 3000 92 | num_sparse_steps_2 = 1000 93 | checkpoint_path = "part1.chkp" 94 | args = Args() 95 | 96 | 
main(args) 97 | -------------------------------------------------------------------------------- /apex/contrib/sparsity/test/toy_problem.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from apex.optimizers import FusedAdam 5 | from apex.contrib.sparsity import ASP 6 | 7 | def build_model(args): 8 | od = OrderedDict() 9 | for i in range(args.num_layers): 10 | if i == 0: 11 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.input_features, args.hidden_features) 12 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 13 | elif i == args.num_layers-1: 14 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.output_features) 15 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.output_features]) 16 | else: 17 | od['linear_layer_%d' % (i+1)] = torch.nn.Linear(args.hidden_features, args.hidden_features) 18 | od['layer_norm_%d' % (i+1)] = torch.nn.LayerNorm([args.batch_size, args.hidden_features]) 19 | return torch.nn.Sequential(od) 20 | 21 | def train_step(args, model, optimizer, input_batch, target_batch, step): 22 | predicted_target = model(input_batch) 23 | loss = ((predicted_target-target_batch)**2).sum() 24 | loss.backward() 25 | optimizer.step() 26 | optimizer.zero_grad() 27 | step = step + 1 28 | #print("Step %d :: loss=%e" % (step, loss.item())) 29 | return step 30 | 31 | def train_loop(args, model, optimizer, step, num_steps): 32 | for i in range(num_steps): 33 | input_batch = torch.randn([args.batch_size, args.input_features]).cuda() 34 | target_batch = torch.randn([args.batch_size, args.output_features]).cuda() 35 | step = train_step(args, model, optimizer, input_batch, target_batch, step) 36 | return step 37 | 38 | def main(args): 39 | model = build_model(args).cuda() 40 | one_ll = next(model.children()).weight 41 | optimizer = FusedAdam(model.parameters()) 42 | # only prune linear layers, even though we also support conv1d, conv2d and conv3d 43 | ASP.init_model_for_pruning(model, "m4n2_1d", whitelist=[torch.nn.Linear], allow_recompute_mask=True) 44 | ASP.init_optimizer_for_pruning(optimizer) 45 | 46 | step = 0 47 | 48 | # train for a few steps with dense weights 49 | print("DENSE :: ",one_ll) 50 | step = train_loop(args, model, optimizer, step, args.num_dense_steps) 51 | 52 | # simulate sparsity by inserting zeros into existing dense weights 53 | ASP.compute_sparse_masks() 54 | 55 | # train for a few steps with sparse weights 56 | print("SPARSE :: ",one_ll) 57 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps) 58 | 59 | # recompute sparse masks 60 | ASP.compute_sparse_masks() 61 | 62 | # train for a few steps with sparse weights 63 | print("SPARSE :: ",one_ll) 64 | step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 65 | 66 | # turn off sparsity 67 | print("SPARSE :: ",one_ll) 68 | ASP.restore_pruned_weights() 69 | 70 | # train for a few steps with dense weights 71 | print("DENSE :: ",one_ll) 72 | step = train_loop(args, model, optimizer, step, args.num_dense_steps_2) 73 | 74 | if __name__ == '__main__': 75 | class Args: 76 | batch_size = 32 77 | input_features = 16 78 | output_features = 8 79 | hidden_features = 40 80 | num_layers = 4 81 | num_dense_steps = 2000 82 | num_sparse_steps = 3000 83 | num_sparse_steps_2 = 1000 84 | num_dense_steps_2 = 1500 85 | args = Args() 86 | 87 | main(args) 88 | 
-------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import unittest 4 | 5 | from apex.contrib.multihead_attn import EncdecMultiheadAttn 6 | 7 | class EncdecMultiheadAttnNormAddTest(unittest.TestCase): 8 | def setUp(self, seed=1234): 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | 12 | self.seq_length = 80 13 | self.sequences = 10 14 | self.hidden_dim = 1024 15 | self.heads = 16 16 | self.dropout_prob = 0.0 17 | 18 | self.ref_layer = EncdecMultiheadAttn(self.hidden_dim, 19 | self.heads, 20 | dropout=self.dropout_prob, 21 | bias=False, 22 | include_norm_add=True, 23 | impl='default') 24 | self.ref_layer.cuda().half() 25 | self.ref_layer.reset_parameters() 26 | self.ref_inputs_q = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 27 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 28 | self.ref_inputs_k = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 29 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 30 | 31 | # Reset seed so parameters are identical 32 | torch.manual_seed(seed) 33 | torch.cuda.manual_seed_all(seed) 34 | 35 | self.tst_layer = EncdecMultiheadAttn(self.hidden_dim, 36 | self.heads, 37 | dropout=self.dropout_prob, 38 | bias=False, 39 | include_norm_add=True, 40 | impl='fast') 41 | self.tst_layer.cuda().half() 42 | self.tst_layer.reset_parameters() 43 | 44 | self.tst_inputs_q = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 45 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 46 | self.tst_inputs_k = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 47 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 48 | 49 | def test_encdec_multihead_attn_norm_add(self) : 50 | grads = torch.randn_like(self.tst_inputs_q) 51 | 52 | ref_outputs,_ = self.ref_layer.forward(self.ref_inputs_q, 53 | self.ref_inputs_k, 54 | self.ref_inputs_k, 55 | key_padding_mask=None, 56 | need_weights=False, 57 | attn_mask=None, 58 | is_training=True) 59 | 60 | tst_outputs,_ = self.tst_layer.forward(self.tst_inputs_q, 61 | self.tst_inputs_k, 62 | self.tst_inputs_k, 63 | key_padding_mask=None, 64 | need_weights=False, 65 | attn_mask=None, 66 | is_training=True) 67 | 68 | self.ref_inputs_q.backward(grads) 69 | self.tst_inputs_q.backward(grads) 70 | 71 | self.assertTrue(torch.allclose(self.ref_inputs_q, self.tst_inputs_q, atol=1e-5, rtol=1e-5)) 72 | self.assertTrue(torch.allclose(self.ref_inputs_k, self.tst_inputs_k, atol=1e-5, rtol=1e-5)) 73 | self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)) 74 | self.assertTrue(torch.allclose(self.ref_inputs_q.grad, self.tst_inputs_q.grad, atol=1e-3, rtol=1e-3)) 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import unittest 4 | 5 | from apex.contrib.multihead_attn import SelfMultiheadAttn 6 | 7 | class SelfMultiheadAttnTest(unittest.TestCase): 8 | def setUp(self, seed=1234): 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | 12 | self.seq_length = 80 13 | self.sequences = 10 14 | 
self.hidden_dim = 1024 15 | self.heads = 16 16 | self.dropout_prob = 0.0 17 | 18 | self.ref_layer = SelfMultiheadAttn(self.hidden_dim, 19 | self.heads, 20 | dropout=self.dropout_prob, 21 | bias=True, 22 | include_norm_add=False, 23 | separate_qkv_params=True, 24 | mask_additive=True, 25 | impl='default') 26 | self.ref_layer.cuda().half() 27 | self.ref_layer.reset_parameters() 28 | self.ref_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 29 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 30 | # Reset seed so parameters are identical 31 | torch.manual_seed(seed) 32 | torch.cuda.manual_seed_all(seed) 33 | 34 | self.tst_layer = SelfMultiheadAttn(self.hidden_dim, 35 | self.heads, 36 | dropout=self.dropout_prob, 37 | bias=True, 38 | include_norm_add=False, 39 | separate_qkv_params=True, 40 | mask_additive=True, 41 | impl='fast') 42 | self.tst_layer.cuda().half() 43 | self.tst_layer.reset_parameters() 44 | 45 | self.tst_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 46 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 47 | 48 | def test_self_multihead_attn_additive_mask(self) : 49 | grads = torch.randn_like(self.tst_inputs) 50 | mask = ((torch.randn(self.sequences, self.seq_length) > 0) * -10000.0).half().cuda() 51 | 52 | ref_outputs,_ = self.ref_layer.forward(self.ref_inputs, 53 | self.ref_inputs, 54 | self.ref_inputs, 55 | key_padding_mask=mask, 56 | need_weights=False, 57 | attn_mask=None, 58 | is_training=True) 59 | 60 | tst_outputs,_ = self.tst_layer.forward(self.tst_inputs, 61 | self.tst_inputs, 62 | self.tst_inputs, 63 | key_padding_mask=mask, 64 | need_weights=False, 65 | attn_mask=None, 66 | is_training=True) 67 | 68 | 69 | self.ref_inputs.backward(grads) 70 | self.tst_inputs.backward(grads) 71 | 72 | self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)) 73 | self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)) 74 | self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)) 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_mha_fused_softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import unittest 3 | import torch.nn.functional as F 4 | from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func 5 | 6 | class FusedSoftmaxTest(unittest.TestCase): 7 | def setUp(self, seed=1234): 8 | torch.manual_seed(seed) 9 | torch.cuda.manual_seed_all(seed) 10 | 11 | self.seq_length = 80 12 | self.sequences = 10 13 | self.hidden_dim = 1024 14 | self.heads = 16 15 | self.dropout_prob = 0.0 16 | 17 | self.mask = (torch.randn(self.sequences,self.seq_length)>0).cuda() 18 | self.mask = self.mask.half()*-10000 19 | self.ref_inputs = torch.randn(self.heads * self.sequences, self.seq_length, self.seq_length, 20 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 21 | 22 | self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True) 23 | 24 | def test_fused_softmax(self) : 25 | grads = torch.randn_like(self.tst_inputs) 26 | y_ref = self.ref_inputs.view(self.sequences, self.heads, self.seq_length, self.seq_length) 27 | y_ref = y_ref + self.mask.unsqueeze(1).unsqueeze(2) 28 | y_ref = y_ref.view(self.sequences*self.heads, self.seq_length, self.seq_length) 29 | y_ref = F.softmax(y_ref, 
dim=-1) 30 | y_ref = torch._fused_dropout(y_ref, 1.0) 31 | 32 | y_tst = fast_mask_softmax_dropout_func(True, self.heads, self.tst_inputs, self.mask, True, 0.0) 33 | y_ref[0].backward(grads) 34 | y_tst.backward(grads) 35 | 36 | self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)) 37 | self.assertTrue(torch.allclose(y_ref[0], y_tst, atol=1e-3, rtol=1e-3)) 38 | self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import unittest 4 | 5 | from apex.contrib.multihead_attn import SelfMultiheadAttn 6 | 7 | class SelfMultiheadAttnNormAddTest(unittest.TestCase): 8 | def setUp(self, seed=1234): 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | 12 | self.seq_length = 80 13 | self.sequences = 10 14 | self.hidden_dim = 1024 15 | self.heads = 16 16 | self.dropout_prob = 0.0 17 | 18 | self.ref_layer = SelfMultiheadAttn(self.hidden_dim, 19 | self.heads, 20 | dropout=self.dropout_prob, 21 | bias=False, 22 | include_norm_add=True, 23 | impl='default') 24 | self.ref_layer.cuda().half() 25 | self.ref_layer.reset_parameters() 26 | self.ref_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 27 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 28 | 29 | # Reset seed so parameters are identical 30 | torch.manual_seed(seed) 31 | torch.cuda.manual_seed_all(seed) 32 | 33 | self.tst_layer = SelfMultiheadAttn(self.hidden_dim, 34 | self.heads, 35 | dropout=self.dropout_prob, 36 | bias=False, 37 | include_norm_add=True, 38 | impl='fast') 39 | self.tst_layer.cuda().half() 40 | self.tst_layer.reset_parameters() 41 | 42 | self.tst_inputs = torch.randn(self.seq_length, self.sequences, self.hidden_dim, 43 | dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True) 44 | 45 | def test_self_multihead_attn_norm_add(self) : 46 | grads = torch.randn_like(self.tst_inputs) 47 | 48 | ref_outputs,_ = self.ref_layer.forward(self.ref_inputs, 49 | self.ref_inputs, 50 | self.ref_inputs, 51 | key_padding_mask=None, 52 | need_weights=False, 53 | attn_mask=None, 54 | is_training=True) 55 | 56 | tst_outputs,_ = self.tst_layer.forward(self.tst_inputs, 57 | self.tst_inputs, 58 | self.tst_inputs, 59 | key_padding_mask=None, 60 | need_weights=False, 61 | attn_mask=None, 62 | is_training=True) 63 | 64 | self.ref_inputs.backward(grads) 65 | self.tst_inputs.backward(grads) 66 | 67 | self.assertTrue(torch.allclose(self.ref_inputs, self.tst_inputs, atol=1e-5, rtol=1e-5)) 68 | self.assertTrue(torch.allclose(ref_outputs, tst_outputs, atol=1e-3, rtol=1e-3)) 69 | self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3)) 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /apex/contrib/transducer/__init__.py: -------------------------------------------------------------------------------- 1 | from .transducer import TransducerJoint 2 | from .transducer import TransducerLoss -------------------------------------------------------------------------------- /apex/contrib/xentropy/__init__.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | import xentropy_cuda 4 | from .softmax_xentropy import SoftmaxCrossEntropyLoss 5 | del torch 6 | del xentropy_cuda 7 | del softmax_xentropy 8 | except ImportError as err: 9 | print("apex was installed without --xentropy flag, contrib.xentropy is not available") 10 | -------------------------------------------------------------------------------- /apex/contrib/xentropy/softmax_xentropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import xentropy_cuda 3 | 4 | class SoftmaxCrossEntropyLoss(torch.autograd.Function): 5 | @staticmethod 6 | def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False): 7 | losses, max_log_sum_exp = xentropy_cuda.forward( 8 | logits, labels, smoothing, half_to_float) 9 | losses.masked_fill_(labels==padding_idx, 0) 10 | 11 | ctx.save_for_backward(logits, max_log_sum_exp, labels, 12 | torch.FloatTensor([smoothing]), 13 | torch.LongTensor([padding_idx])) 14 | 15 | return losses 16 | 17 | @staticmethod 18 | def backward(ctx, grad_loss): 19 | logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors 20 | 21 | if not grad_loss.is_contiguous(): 22 | grad_loss = grad_loss.contiguous() 23 | grad_loss.masked_fill_(labels==padding_idx.item(), 0) 24 | grad_logits = xentropy_cuda.backward( 25 | grad_loss.contiguous(), logits, max_log_sum_exp, 26 | labels, smoothing.item()) 27 | 28 | return grad_logits, None, None, None, None 29 | -------------------------------------------------------------------------------- /apex/fp16_utils/README.md: -------------------------------------------------------------------------------- 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user. To use `FP16_Optimizer`, only two lines of one's Python model need to change. 2 | 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling) 4 | 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple) 6 | 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 8 | 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) 10 | 11 | 12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses. 13 | 14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management) 15 | 16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling. These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically. 
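As a rough illustration of the two-line change described above (a minimal sketch; constructor options such as `static_loss_scale` and `dynamic_loss_scale` are covered in the API documentation linked above):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(512, 512).cuda().half()                 # placeholder fp16 model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)  # change 1: wrap the existing optimizer

x = torch.randn(64, 512, dtype=torch.float16, device="cuda")
loss = model(x).float().sum()
optimizer.zero_grad()
optimizer.backward(loss)                                        # change 2: instead of loss.backward()
optimizer.step()
```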
17 | -------------------------------------------------------------------------------- /apex/fp16_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .fp16util import ( 2 | BN_convert_float, 3 | network_to_half, 4 | prep_param_lists, 5 | model_grads_to_master_grads, 6 | master_params_to_model_params, 7 | tofp16, 8 | to_python_float, 9 | clip_grad_norm, 10 | convert_module, 11 | convert_network, 12 | FP16Model, 13 | ) 14 | 15 | from .fp16_optimizer import FP16_Optimizer 16 | from .loss_scaler import LossScaler, DynamicLossScaler 17 | -------------------------------------------------------------------------------- /apex/mlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import * 2 | -------------------------------------------------------------------------------- /apex/mlp/mlp.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | import math 3 | import torch 4 | from torch import nn 5 | import mlp_cuda 6 | from .. import amp 7 | 8 | class MlpFunction(torch.autograd.Function): 9 | @staticmethod 10 | def forward(ctx, bias, activation, *args): 11 | output = mlp_cuda.forward(bias, activation, args) 12 | ctx.save_for_backward(*args) 13 | ctx.outputs = output 14 | ctx.bias = bias 15 | ctx.activation = activation 16 | return output[0] 17 | 18 | @staticmethod 19 | def backward(ctx, grad_o): 20 | grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors) 21 | del ctx.outputs 22 | return (None, None, *grads) 23 | 24 | mlp_function = amp.half_function(MlpFunction.apply) 25 | 26 | class MLP(torch.nn.Module): 27 | """Launch MLP in C++ 28 | 29 | Args: 30 | mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024 31 | bias (bool): Default True 32 | activation (str): 'none', 'relu' or 'sigmoid'. Default 'relu' 33 | """ 34 | def __init__(self, mlp_sizes, bias=True, activation='relu'): 35 | super(MLP, self).__init__() 36 | self.num_layers = len(mlp_sizes) - 1 37 | self.mlp_sizes = copy(mlp_sizes) 38 | self.bias = 1 if bias else 0 39 | 40 | if activation == 'none': 41 | self.activation = 0 42 | elif activation == 'relu': 43 | self.activation = 1 44 | elif activation == 'sigmoid': 45 | self.activation = 2 46 | else: 47 | raise TypeError("activation must be 'none', 'relu' or 'sigmoid'.") 48 | 49 | self.weights = [] 50 | self.biases = [] 51 | for i in range(self.num_layers): 52 | w = torch.nn.Parameter(torch.empty(mlp_sizes[i+1], mlp_sizes[i])) 53 | self.weights.append(w) 54 | name = 'weight_{}'.format(i) 55 | setattr(self, name, w) 56 | if self.bias: 57 | b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1])) 58 | self.biases.append(b) 59 | name = 'bias_{}'.format(i) 60 | setattr(self, name, b) 61 | 62 | self.reset_parameters() 63 | 64 | def reset_parameters(self): 65 | for weight in self.weights: 66 | dimsum = weight.size(0) + weight.size(1) 67 | std = math.sqrt(2. / float(dimsum)) 68 | nn.init.normal_(weight, 0., std) 69 | if self.bias: 70 | for bias in self.biases: 71 | std = math.sqrt(1.
/ float(bias.size(0))) 72 | nn.init.normal_(bias, 0., std) 73 | 74 | def forward(self, input): 75 | return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases) 76 | 77 | def extra_repr(self): 78 | s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}" 79 | return s 80 | -------------------------------------------------------------------------------- /apex/multi_tensor_apply/__init__.py: -------------------------------------------------------------------------------- 1 | from .multi_tensor_apply import MultiTensorApply 2 | 3 | multi_tensor_applier = MultiTensorApply(2048*32) 4 | 5 | -------------------------------------------------------------------------------- /apex/multi_tensor_apply/multi_tensor_apply.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class MultiTensorApply(object): 4 | available = False 5 | warned = False 6 | 7 | def __init__(self, chunk_size): 8 | try: 9 | import amp_C 10 | MultiTensorApply.available = True 11 | self.chunk_size = chunk_size 12 | except ImportError as err: 13 | MultiTensorApply.available = False 14 | MultiTensorApply.import_err = err 15 | 16 | def check_avail(self): 17 | if MultiTensorApply.available == False: 18 | raise RuntimeError( 19 | "Attempted to call MultiTensorApply method, but MultiTensorApply " 20 | "is not available, possibly because Apex was installed without " 21 | "--cpp_ext --cuda_ext. Original import error message:", 22 | MultiTensorApply.import_err) 23 | 24 | def __call__(self, op, noop_flag_buffer, tensor_lists, *args): 25 | self.check_avail() 26 | 27 | return op(self.chunk_size, 28 | noop_flag_buffer, 29 | tensor_lists, 30 | *args) 31 | -------------------------------------------------------------------------------- /apex/normalization/__init__.py: -------------------------------------------------------------------------------- 1 | from .fused_layer_norm import FusedLayerNorm 2 | -------------------------------------------------------------------------------- /apex/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .fused_sgd import FusedSGD 2 | from .fused_adam import FusedAdam 3 | from .fused_novograd import FusedNovoGrad 4 | from .fused_lamb import FusedLAMB 5 | from .fused_adagrad import FusedAdagrad -------------------------------------------------------------------------------- /apex/parallel/README.md: -------------------------------------------------------------------------------- 1 | ## Distributed Data Parallel 2 | 3 | distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library. 4 | 5 | `apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with 6 | computation in the backward pass and bucketing smaller transfers to reduce the total number of 7 | transfers required. 8 | 9 | multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs. 
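A minimal sketch of how a training script might use the wrapper together with the launcher (the `--rank`/`--world-size` arguments are the ones `multiproc.py` appends; the rendezvous address below is a placeholder):

```python
# main.py -- launch with: python -m apex.parallel.multiproc main.py ...
import argparse
import torch
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--rank", type=int, default=0)         # set per process by multiproc
parser.add_argument("--world-size", type=int, default=1)   # set by multiproc
args = parser.parse_args()

torch.cuda.set_device(args.rank)
torch.distributed.init_process_group(
    backend="nccl", init_method="tcp://127.0.0.1:23456",   # placeholder rendezvous
    world_size=args.world_size, rank=args.rank)

model = torch.nn.Linear(10, 10).cuda()                     # placeholder model
model = DDP(model)   # gradients are all-reduced and overlapped with the backward pass
```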
10 |  11 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html) 12 |  13 | #### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed) 14 |  15 | #### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) 16 |  17 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex) 18 |  19 | ### Synchronized Batch Normalization 20 |  21 | `apex.parallel.SyncBatchNorm` has an API similar to `torch.nn.BatchNorm*N*d`. 22 | It reduces stats on the first (channel) dimension of the Tensor and accepts 23 | arbitrary spatial dimensions. 24 |  25 | #### Installation 26 |  27 | Apex provides two sync BN implementations: 28 |  29 | 1. A Python-only implementation, which is the default 30 | when installing with `python setup.py install`. 31 | It uses PyTorch primitive operations and the distributed communication package from 32 | `torch.distributed`. 33 |  34 | - _The Python-only implementation requires the input tensor to be of the same data type as 35 | the layer._ 36 |  37 | 2. An implementation with custom kernels through a CUDA/C++ extension, with 38 | improved performance. We are experimenting with Welford and Kahan summation for the reduction, 39 | hoping to get better accuracy. 40 | To use the kernel implementation, users need to install Apex with the CUDA extension 41 | enabled: `python setup.py install --cuda_ext`. 42 |  43 | - _The custom kernel implementation supports fp16 input with an fp32 layer, as cuDNN does. 44 | This is required to run the imagenet example in fp16._ 45 |  46 | - _Currently the kernel implementation only supports GPU._ 47 |  48 | #### HowTo 49 |  50 | 1. Users can use `apex.parallel.SyncBatchNorm` by building their module with 51 | the layer explicitly. 52 |  53 | ``` 54 | import apex 55 | input_t = torch.randn(3, 5, 20).cuda() 56 | sbn = apex.parallel.SyncBatchNorm(5).cuda() 57 | output_t = sbn(input_t) 58 | ``` 59 |  60 | 2. Users can also take a constructed `torch.nn.Module` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through the utility function `apex.parallel.convert_syncbn_model`. 61 |  62 | ``` 63 | # model is an instance of torch.nn.Module 64 | import apex 65 | sync_bn_model = apex.parallel.convert_syncbn_model(model) 66 | ``` 67 | -------------------------------------------------------------------------------- /apex/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 |  3 | if hasattr(torch.distributed, 'ReduceOp'): 4 | ReduceOp = torch.distributed.ReduceOp 5 | elif hasattr(torch.distributed, 'reduce_op'): 6 | ReduceOp = torch.distributed.reduce_op 7 | else: 8 | ReduceOp = torch.distributed.deprecated.reduce_op 9 |  10 | from .distributed import DistributedDataParallel, Reducer 11 | # This is tricky because I'd like SyncBatchNorm to be exposed the same way 12 | # for both the cuda-enabled and python-fallback versions, and I don't want 13 | # to suppress the error information. 14 | try: 15 | import syncbn 16 | from .optimized_sync_batchnorm import SyncBatchNorm 17 | except ImportError as err: 18 | from .sync_batchnorm import SyncBatchNorm 19 | SyncBatchNorm.syncbn_import_error = err 20 |  21 | def convert_syncbn_model(module, process_group=None, channel_last=False): 22 | ''' 23 | Recursively traverse module and its children to replace all instances of 24 | ``torch.nn.modules.batchnorm._BatchNorm`` with :class:`apex.parallel.SyncBatchNorm`.
25 | 26 | All ``torch.nn.BatchNorm*N*d`` wrap around 27 | ``torch.nn.modules.batchnorm._BatchNorm``, so this function lets you easily switch 28 | to use sync BN. 29 | 30 | Args: 31 | module (torch.nn.Module): input module 32 | 33 | Example:: 34 | 35 | >>> # model is an instance of torch.nn.Module 36 | >>> import apex 37 | >>> sync_bn_model = apex.parallel.convert_syncbn_model(model) 38 | ''' 39 | mod = module 40 | if isinstance(module, torch.nn.modules.instancenorm._InstanceNorm): 41 | return module 42 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 43 | mod = SyncBatchNorm(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats, process_group, channel_last=channel_last) 44 | mod.running_mean = module.running_mean 45 | mod.running_var = module.running_var 46 | mod.num_batches_tracked = module.num_batches_tracked 47 | if module.affine: 48 | mod.weight.data = module.weight.data.clone().detach() 49 | mod.bias.data = module.bias.data.clone().detach() 50 | for name, child in module.named_children(): 51 | mod.add_module(name, convert_syncbn_model(child, 52 | process_group=process_group, 53 | channel_last=channel_last)) 54 | # TODO(jie) should I delete model explicitly? 55 | del module 56 | return mod 57 | 58 | def create_syncbn_process_group(group_size): 59 | ''' 60 | Creates process groups to be used for syncbn of a give ``group_size`` and returns 61 | process group that current GPU participates in. 62 | 63 | ``group_size`` must divide the total number of GPUs (world_size). 64 | 65 | ``group_size`` of 0 would be considered as =world_size. In this case ``None`` will be returned. 66 | 67 | ``group_size`` of 1 would be equivalent to using non-sync bn, but will still carry the overhead. 68 | 69 | Args: 70 | group_size (int): number of GPU's to collaborate for sync bn 71 | 72 | Example:: 73 | 74 | >>> # model is an instance of torch.nn.Module 75 | >>> import apex 76 | >>> group = apex.parallel.create_syncbn_process_group(group_size) 77 | ''' 78 | 79 | if group_size==0: 80 | return None 81 | 82 | world_size = torch.distributed.get_world_size() 83 | assert(world_size >= group_size) 84 | assert(world_size % group_size == 0) 85 | 86 | group=None 87 | for group_num in (range(world_size//group_size)): 88 | group_ids = range(group_num*group_size, (group_num+1)*group_size) 89 | cur_group = torch.distributed.new_group(ranks=group_ids) 90 | if (torch.distributed.get_rank()//group_size == group_num): 91 | group = cur_group 92 | #can not drop out and return here, every process must go through creation of all subgroups 93 | 94 | assert(group is not None) 95 | return group 96 | -------------------------------------------------------------------------------- /apex/parallel/multiproc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import subprocess 4 | 5 | def docstring_hack(): 6 | """ 7 | Multiproc file which will launch a set of processes locally for multi-gpu 8 | usage: python -m apex.parallel.multiproc main.py ... 
9 | """ 10 | pass 11 | 12 | argslist = list(sys.argv)[1:] 13 | world_size = torch.cuda.device_count() 14 | 15 | if '--world-size' in argslist: 16 | world_size = int(argslist[argslist.index('--world-size')+1]) 17 | else: 18 | argslist.append('--world-size') 19 | argslist.append(str(world_size)) 20 | 21 | workers = [] 22 | 23 | for i in range(world_size): 24 | if '--rank' in argslist: 25 | argslist[argslist.index('--rank')+1] = str(i) 26 | else: 27 | argslist.append('--rank') 28 | argslist.append(str(i)) 29 | stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w") 30 | print(argslist) 31 | p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) 32 | workers.append(p) 33 | 34 | for p in workers: 35 | p.wait() 36 | -------------------------------------------------------------------------------- /apex/parallel/sync_batchnorm_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd.function import Function 3 | 4 | from apex.parallel import ReduceOp 5 | 6 | 7 | class SyncBatchnormFunction(Function): 8 | 9 | @staticmethod 10 | def forward(ctx, input, weight, bias, running_mean, running_variance, eps, process_group, world_size): 11 | torch.cuda.nvtx.range_push("sync_BN_fw") 12 | # transpose it to channel last to support broadcasting for input with different rank 13 | c_last_input = input.transpose(1, -1).contiguous().clone() 14 | 15 | ctx.save_for_backward(c_last_input, weight, bias, 16 | running_mean, running_variance) 17 | ctx.eps = eps 18 | ctx.process_group = process_group 19 | ctx.world_size = world_size 20 | 21 | c_last_input = (c_last_input - running_mean) / \ 22 | torch.sqrt(running_variance + eps) 23 | 24 | if weight is not None: 25 | c_last_input = c_last_input * weight 26 | if bias is not None: 27 | c_last_input = c_last_input + bias 28 | 29 | torch.cuda.nvtx.range_pop() 30 | return c_last_input.transpose(1, -1).contiguous().clone() 31 | 32 | @staticmethod 33 | def backward(ctx, grad_output): 34 | torch.cuda.nvtx.range_push("sync_BN_bw") 35 | # mini batch mean & var are calculated by forward path. 36 | # mu = 1./N*np.sum(h, axis = 0) 37 | # var = 1./N*np.sum((h-mu)**2, axis = 0) 38 | c_last_input, weight, bias, running_mean, running_variance = ctx.saved_tensors 39 | 40 | eps = ctx.eps 41 | process_group = ctx.process_group 42 | world_size = ctx.world_size 43 | grad_input = grad_weight = grad_bias = None 44 | num_features = running_mean.size()[0] 45 | 46 | # transpose it to channel last to support broadcasting for input with different rank 47 | torch.cuda.nvtx.range_push("carilli field") 48 | c_last_grad = grad_output.transpose(1, -1).contiguous() 49 | # squash non-channel dimension so we can easily calculate mean 50 | c_grad = c_last_grad.view(-1, num_features).contiguous() 51 | torch.cuda.nvtx.range_pop() 52 | 53 | # calculate grad_input 54 | if ctx.needs_input_grad[0]: 55 | # dh = gamma * (var + eps)**(-1. / 2.) 
* (dy - np.mean(dy, axis=0) 56 | # - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0)) 57 | mean_dy = c_grad.mean(0) 58 | mean_dy_xmu = (c_last_grad * (c_last_input - 59 | running_mean)).view(-1, num_features).mean(0) 60 | if torch.distributed.is_initialized(): 61 | torch.distributed.all_reduce( 62 | mean_dy, ReduceOp.SUM, process_group) 63 | mean_dy = mean_dy / world_size 64 | torch.distributed.all_reduce( 65 | mean_dy_xmu, ReduceOp.SUM, process_group) 66 | mean_dy_xmu = mean_dy_xmu / world_size 67 | c_last_grad_input = (c_last_grad - mean_dy - (c_last_input - running_mean) / ( 68 | running_variance + eps) * mean_dy_xmu) / torch.sqrt(running_variance + eps) 69 | if weight is not None: 70 | c_last_grad_input.mul_(weight) 71 | grad_input = c_last_grad_input.transpose(1, -1).contiguous() 72 | 73 | # calculate grad_weight 74 | grad_weight = None 75 | if weight is not None and ctx.needs_input_grad[1]: 76 | # dgamma = np.sum((h - mu) * (var + eps)**(-1. / 2.) * dy, axis=0) 77 | grad_weight = ((c_last_input - running_mean) / torch.sqrt( 78 | running_variance + eps) * c_last_grad).view(-1, num_features).sum(0) 79 | 80 | # calculate grad_bias 81 | grad_bias = None 82 | if bias is not None and ctx.needs_input_grad[2]: 83 | # dbeta = np.sum(dy, axis=0) 84 | grad_bias = c_grad.sum(0) 85 | 86 | torch.cuda.nvtx.range_pop() 87 | return grad_input, grad_weight, grad_bias, None, None, None, None, None 88 | -------------------------------------------------------------------------------- /apex/pyprof/FAQs.md: -------------------------------------------------------------------------------- 1 | 1. How do I intercept the Adam optimizer in APEX ? 2 | 3 | ```python 4 | from apex import pyprof 5 | import fused_adam_cuda 6 | pyprof.nvtx.wrap(fused_adam_cuda, 'adam') 7 | ``` 8 | 9 | 2. If you are using JIT and/or AMP, the correct initialization sequence is 10 | 1. Let any JIT to finish. 11 | 2. Initlialize pyprof `pyprof.nvtx.init()`. 12 | 3. Initialize AMP. 13 | 14 | 3. How do I profile with `torch.distributed.launch` ? 15 | 16 | ```python 17 | nvprof -f -o net%p.sql \ 18 | --profile-from-start off \ 19 | --profile-child-processes \ 20 | python -m torch.distributed.launch net.py 21 | ``` 22 | -------------------------------------------------------------------------------- /apex/pyprof/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from . import nvtx, prof 4 | -------------------------------------------------------------------------------- /apex/pyprof/examples/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.sql 3 | *.dict 4 | *.csv 5 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/README.md: -------------------------------------------------------------------------------- 1 | This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`. 
2 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/fused_adam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import fused_adam_cuda 3 | from apex.optimizers import FusedAdam, FP16_Optimizer 4 | from apex import pyprof 5 | 6 | pyprof.nvtx.init() 7 | pyprof.nvtx.wrap(fused_adam_cuda, 'adam') 8 | 9 | model = torch.nn.Linear(10, 20).cuda().half() 10 | criterion = torch.nn.CrossEntropyLoss().cuda() 11 | optimizer = FusedAdam(model.parameters()) 12 | optimizer = FP16_Optimizer(optimizer) 13 | 14 | x = torch.ones(32, 10).cuda().half() 15 | target = torch.empty(32, dtype=torch.long).random_(20).cuda() 16 | y = model(x) 17 | loss = criterion(y, target) 18 | optimizer.zero_grad() 19 | loss.backward() 20 | optimizer.step() 21 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import fused_layer_norm_cuda 3 | from apex.normalization import FusedLayerNorm 4 | from apex import pyprof 5 | 6 | pyprof.nvtx.init() 7 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward') 8 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward') 9 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine') 10 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine') 11 | 12 | input = torch.randn(20, 5, 10, 10).cuda() 13 | 14 | # With Learnable Parameters 15 | m = FusedLayerNorm(input.size()[1:]).cuda() 16 | output = m(input) 17 | 18 | # Without Learnable Parameters 19 | m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda() 20 | output = m(input) 21 | 22 | # Normalize over last two dimensions 23 | m = FusedLayerNorm([10, 10]).cuda() 24 | output = m(input) 25 | 26 | # Normalize over last dimension of size 10 27 | m = FusedLayerNorm(10).cuda() 28 | output = m(input) 29 | -------------------------------------------------------------------------------- /apex/pyprof/examples/apex/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql python $f" 20 | nvprof -fo $sql python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | $prof -w 130 $dict 29 | \rm $sql $dict 30 | done 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/README.md: -------------------------------------------------------------------------------- 1 | This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`, however, users can add support for bytes and flops calculation for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class. 
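A rough sketch of what such an extension might look like. The import path and the method set below (`op`, `mod`, `tc`, `params`, `flops`, `bytes`) are assumptions about the `OperatorLayerBase` interface; check `pyprof/prof/base.py` for the actual abstract methods before relying on this:

```python
from apex.pyprof.prof.base import OperatorLayerBase   # assumed import path

class FooProf(OperatorLayerBase):
    """Hypothetical flop/byte accounting for a custom 'foo' op: elementwise add of two n-by-n fp32 tensors."""

    def __init__(self, n):
        self.n = n

    def op(self):      return "foo"
    def mod(self):     return "Foo"
    def tc(self):      return "-"                       # no tensor cores used
    def params(self):  return {"shape": (self.n, self.n), "dtype": "float32"}
    def flops(self):   return self.n * self.n           # one add per element
    def bytes(self):   return 3 * 4 * self.n * self.n   # read two fp32 tensors, write one
```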
2 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/custom_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | #Initialize pyprof 7 | pyprof.nvtx.init() 8 | 9 | class Foo(torch.autograd.Function): 10 | @staticmethod 11 | def forward(ctx, in1, in2): 12 | out = in1 + in2 #This could be a custom C/C++ function. 13 | return out 14 | 15 | @staticmethod 16 | def backward(ctx, grad): 17 | in1_grad = grad #This could be a custom C/C++ function. 18 | in2_grad = grad #This could be a custom C/C++ function. 19 | return in1_grad, in2_grad 20 | 21 | #Hook the forward and backward functions to pyprof 22 | pyprof.nvtx.wrap(Foo, 'forward') 23 | pyprof.nvtx.wrap(Foo, 'backward') 24 | 25 | foo = Foo.apply 26 | 27 | x = torch.ones(4,4).cuda() 28 | y = torch.ones(4,4).cuda() 29 | 30 | with torch.autograd.profiler.emit_nvtx(): 31 | profiler.start() 32 | z = foo(x,y) 33 | profiler.stop() 34 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/custom_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | pyprof.nvtx.init() 7 | 8 | class Foo(torch.nn.Module): 9 | def __init__(self, size): 10 | super(Foo, self).__init__() 11 | self.n = torch.nn.Parameter(torch.ones(size)) 12 | self.m = torch.nn.Parameter(torch.ones(size)) 13 | 14 | def forward(self, input): 15 | return self.n*input + self.m 16 | 17 | #Hook the forward function to pyprof 18 | pyprof.nvtx.wrap(Foo, 'forward') 19 | 20 | foo = Foo(4) 21 | foo.cuda() 22 | x = torch.ones(4).cuda() 23 | 24 | with torch.autograd.profiler.emit_nvtx(): 25 | profiler.start() 26 | z = foo(x) 27 | profiler.stop() 28 | -------------------------------------------------------------------------------- /apex/pyprof/examples/custom_func_module/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql python $f" 20 | nvprof -fo $sql python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | $prof -w 130 $dict 29 | \rm $sql $dict 30 | done 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/imagenet/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 
8 | 9 | parse="python -m apex.pyprof.parse" 10 | prof="python -m apex.pyprof.prof" 11 | 12 | for net in "resnet50" 13 | do 14 | for optim in adam sgd 15 | do 16 | for batch in 32 64 17 | do 18 | base="torchvision".$net.$optim.$batch 19 | sql=$base.sql 20 | dict=$base.dict 21 | 22 | #NVprof 23 | echo "nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch" 24 | nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch 25 | 26 | #Parse 27 | echo $parse $sql 28 | $parse $sql > $dict 29 | 30 | #Prof 31 | echo $prof $dict 32 | $prof -w 130 $dict 33 | # \rm $sql $dict 34 | done 35 | done 36 | done 37 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/README.md: -------------------------------------------------------------------------------- 1 | *As of this writing, these examples do not work 2 | because of changes being proposed in PyTorch.* 3 | 4 | There are two ways to use PyTorch JIT 5 | - Scripting 6 | - Tracing 7 | 8 | In addition, we can JIT a 9 | - Stand alone function 10 | - Class / class method 11 | 12 | This directory has an example for each of the 4 cases. 13 | Intercepting (monkey patching) JITted code has a few extra steps, 14 | which are explained through comments. 15 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_script_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | #The following creates an object "foo" of type ScriptModule 8 | #The new object has a function called "forward" 9 | 10 | @torch.jit.script 11 | def foo(x, y): 12 | return torch.sigmoid(x) + y 13 | 14 | #Initialize pyprof after the JIT step 15 | pyprof.nvtx.init() 16 | 17 | #Assign a name to the object "foo" 18 | foo.__name__ = "foo" 19 | 20 | #Hook up the forward function to pyprof 21 | pyprof.nvtx.wrap(foo, 'forward') 22 | 23 | x = torch.zeros(4,4).cuda() 24 | y = torch.ones(4,4).cuda() 25 | 26 | with torch.autograd.profiler.emit_nvtx(): 27 | profiler.start() 28 | z = foo(x, y) 29 | profiler.stop() 30 | print(z) 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_script_method.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | class Foo(torch.jit.ScriptModule): 8 | def __init__(self, size): 9 | super(Foo, self).__init__() 10 | self.n = torch.nn.Parameter(torch.ones(size)) 11 | self.m = torch.nn.Parameter(torch.ones(size)) 12 | 13 | @torch.jit.script_method 14 | def forward(self, input): 15 | return self.n*input + self.m 16 | 17 | #Initialize pyprof after the JIT step 18 | pyprof.nvtx.init() 19 | 20 | #Hook up the forward function to pyprof 21 | pyprof.nvtx.wrap(Foo, 'forward') 22 | 23 | foo = Foo(4) 24 | foo.cuda() 25 | x = torch.ones(4).cuda() 26 | 27 | with torch.autograd.profiler.emit_nvtx(): 28 | profiler.start() 29 | z = foo(x) 30 | profiler.stop() 31 | print(z) 32 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_trace_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import 
torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | def foo(x, y): 8 | return torch.sigmoid(x) + y 9 | 10 | x = torch.zeros(4,4).cuda() 11 | y = torch.ones(4,4).cuda() 12 | 13 | #JIT the function using tracing 14 | #This returns an object of type ScriptModule with a forward method. 15 | traced_foo = torch.jit.trace(foo, (x,y)) 16 | 17 | #Initialize pyprof after the JIT step 18 | pyprof.nvtx.init() 19 | 20 | #Assign a name to the object "traced_foo" 21 | traced_foo.__dict__['__name__'] = "foo" 22 | 23 | #Hook up the forward function to pyprof 24 | pyprof.nvtx.wrap(traced_foo, 'forward') 25 | 26 | with torch.autograd.profiler.emit_nvtx(): 27 | profiler.start() 28 | z = traced_foo(x, y) 29 | profiler.stop() 30 | print(z) 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/jit_trace_method.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.cuda.profiler as profiler 5 | from apex import pyprof 6 | 7 | class Foo(torch.nn.Module): 8 | def __init__(self, size): 9 | super(Foo, self).__init__() 10 | self.n = torch.nn.Parameter(torch.ones(size)) 11 | self.m = torch.nn.Parameter(torch.ones(size)) 12 | 13 | def forward(self, input): 14 | return self.n*input + self.m 15 | 16 | foo = Foo(4) 17 | foo.cuda() 18 | x = torch.ones(4).cuda() 19 | 20 | #JIT the class using tracing 21 | traced_foo = torch.jit.trace(foo, x) 22 | 23 | #Initialize pyprof after the JIT step 24 | pyprof.nvtx.init() 25 | 26 | #Assign a name to the object "traced_foo" 27 | traced_foo.__dict__['__name__'] = "foo" 28 | 29 | #Hook up the forward function to pyprof 30 | pyprof.nvtx.wrap(traced_foo, 'forward') 31 | 32 | with torch.autograd.profiler.emit_nvtx(): 33 | profiler.start() 34 | z = traced_foo(x) 35 | profiler.stop() 36 | print(z) 37 | -------------------------------------------------------------------------------- /apex/pyprof/examples/jit/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 
8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql python $f" 20 | nvprof -fo $sql python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | $prof -w 130 $dict 29 | \rm $sql $dict 30 | done 31 | -------------------------------------------------------------------------------- /apex/pyprof/examples/lenet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.cuda.profiler as profiler 7 | import torch.optim as optim 8 | 9 | from apex import pyprof 10 | pyprof.nvtx.init() 11 | 12 | class LeNet5(nn.Module): 13 | def __init__(self): 14 | super(LeNet5, self).__init__() 15 | # 1 input image channel, 6 output channels, 5x5 square convolution 16 | # kernel 17 | self.conv1 = nn.Conv2d(1, 6, 5) 18 | self.conv2 = nn.Conv2d(6, 16, 5) 19 | # an affine operation: y = Wx + b 20 | self.fc1 = nn.Linear(16 * 5 * 5, 120) 21 | self.fc2 = nn.Linear(120, 84) 22 | self.fc3 = nn.Linear(84, 10) 23 | 24 | def forward(self, x): 25 | # Max pooling over a (2, 2) window 26 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) 27 | # If the size is a square you can only specify a single number 28 | x = F.max_pool2d(F.relu(self.conv2(x)), 2) 29 | x = x.view(-1, self.num_flat_features(x)) 30 | x = F.relu(self.fc1(x)) 31 | x = F.relu(self.fc2(x)) 32 | x = self.fc3(x) 33 | return x 34 | 35 | def num_flat_features(self, x): 36 | size = x.size()[1:] # all dimensions except the batch dimension 37 | num_features = 1 38 | for s in size: 39 | num_features *= s 40 | return num_features 41 | 42 | with torch.autograd.profiler.emit_nvtx(): 43 | 44 | net = LeNet5().cuda() 45 | 46 | input = torch.randn(1, 1, 32, 32).cuda() 47 | out = net(input) 48 | 49 | target = torch.randn(10) # a dummy target, for example 50 | target = target.view(1, -1).cuda() # make it the same shape as output 51 | criterion = nn.MSELoss() 52 | 53 | # create your optimizer 54 | optimizer = optim.SGD(net.parameters(), lr=0.01) 55 | 56 | # in your training loop: 57 | optimizer.zero_grad() # zero the gradient buffers 58 | 59 | profiler.start() 60 | output = net(input) 61 | loss = criterion(output, target) 62 | loss.backward() 63 | optimizer.step() # Does the update 64 | profiler.stop() 65 | 66 | -------------------------------------------------------------------------------- /apex/pyprof/examples/operators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This file checks all Python operators. 
5 | """ 6 | 7 | import sys 8 | import torch 9 | import torch.cuda.profiler as profiler 10 | import operator 11 | import inspect 12 | 13 | #Import and initialize pyprof 14 | from apex import pyprof 15 | pyprof.nvtx.init() 16 | 17 | X = 1024 18 | Y = 1024 19 | 20 | fa = torch.rand(X, Y).cuda() 21 | fb = torch.rand(X, Y).cuda() 22 | fc = torch.rand(X, Y).cuda() 23 | 24 | ia = torch.randint(0, 100, (X, Y)).cuda() 25 | ib = torch.randint(0, 100, (X, Y)).cuda() 26 | 27 | sa = torch.ones(1,1).cuda() 28 | sb = torch.ones(1,1).cuda() 29 | 30 | ba = fa.byte() 31 | 32 | unaryOps = ["abs", "__abs__", "neg", "__neg__",] 33 | invertOps = ["inv", "invert", "__inv__", "__invert__",] #imlemented only for byte tensors 34 | #pos, __pos__ is not implemented for tensors 35 | 36 | binaryOps = [] 37 | binaryOps += [ "lt", "__lt__", "le", "__le__", "eq", "__eq__", "ne", "__ne__", "ge", "__ge__", "gt", "__gt__" ] 38 | binaryOps += [ "add", "__add__", "sub", "__sub__", "mul", "__mul__", "floordiv", "__floordiv__", "truediv", "__truediv__", "pow", "__pow__", "mod", "__mod__"] 39 | binaryOps += [ "and_", "__and__", "or_", "__or__", "xor", "__xor__", "lshift", "__lshift__", "rshift", "__rshift__"] 40 | 41 | inplaceOps = [] 42 | inplaceOps += ["iadd", "__iadd__", "isub", "__isub__", "imul", "__imul__", "ifloordiv", "__ifloordiv__", "itruediv", "__itruediv__", "imod", "__imod__",] 43 | #ipow, __ipow__ is not implemented in pytorch 44 | inplaceOps += [ "iand", "__iand__", "ior", "__ior__", "ixor", "__ixor__", "ilshift", "__ilshift__", "irshift", "__irshift__",] 45 | 46 | matmulOps = [ "matmul", "__matmul__" ] 47 | inplacematmulOps = [ "imatmul", "__imatmul__" ] 48 | 49 | reverseIntBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rfloordiv__", "__rpow__",] 50 | reverseFloatBinaryOps = ["__radd__", "__rsub__", "__rmul__", "__rdiv__", "__rtruediv__", "__rfloordiv__", "__rpow__",] 51 | 52 | ''' 53 | TODO 54 | .concat(a, b) 55 | .__concat__(a, b) 56 | .contains(a, b) 57 | .__contains__(a, b) 58 | .countOf(a, b) 59 | .delitem(a, b) 60 | .__delitem__(a, b) 61 | .getitem(a, b) 62 | .__getitem__(a, b) 63 | .indexOf(a, b) 64 | .setitem(a, b, c) 65 | .__setitem__(a, b, c) 66 | .length_hint(obj, default=0) 67 | .iconcat(a, b) 68 | .__iconcat__(a, b) 69 | .index(a) 70 | .__index__(a) 71 | ''' 72 | 73 | #Context manager 74 | with torch.autograd.profiler.emit_nvtx(): 75 | 76 | #Start profiler 77 | profiler.start() 78 | 79 | for op in unaryOps: 80 | assert hasattr(operator, op) 81 | f = getattr(operator, op) 82 | assert inspect.isbuiltin(f) 83 | c = f(ia) 84 | 85 | for op in invertOps: 86 | assert hasattr(operator, op) 87 | f = getattr(operator, op) 88 | assert inspect.isbuiltin(f) 89 | c = f(ba) 90 | 91 | for op in binaryOps: 92 | assert hasattr(operator, op) 93 | f = getattr(operator, op) 94 | assert inspect.isbuiltin(f) 95 | c = f(ia, ib) 96 | c = f(ia, 2) 97 | 98 | for op in inplaceOps: 99 | assert hasattr(operator, op) 100 | f = getattr(operator, op) 101 | assert inspect.isbuiltin(f) 102 | ia = f(ia, ib) 103 | ia = f(ia, 2) 104 | 105 | for op in matmulOps: 106 | assert hasattr(operator, op) 107 | f = getattr(operator, op) 108 | assert inspect.isbuiltin(f) 109 | c = f(fa, fb) 110 | 111 | for op in inplacematmulOps: 112 | assert hasattr(operator, op) 113 | f = getattr(operator, op) 114 | assert inspect.isbuiltin(f) 115 | fa = f(fa, fb) 116 | 117 | for op in reverseIntBinaryOps: 118 | assert hasattr(torch.Tensor, op) 119 | f = getattr(torch.Tensor, op) 120 | ia = f(ia, ib) 121 | 122 | for op in reverseFloatBinaryOps: 123 | assert 
hasattr(torch.Tensor, op) 124 | f = getattr(torch.Tensor, op) 125 | fa = f(fa, fb) 126 | 127 | ''' 128 | #c = fa[3] 129 | #c = fa[3][3] 130 | #c = torch.min(fa, 3) 131 | c = torch.sum(fa) 132 | c = torch.max(fa) 133 | c = -fa 134 | #fc[2][2] = fa[2][2] 135 | 136 | c = a_scalar and b_scalar 137 | c = a_scalar or b_scalar 138 | c = not a_scalar 139 | 140 | c = a is b 141 | c = a is not b 142 | ''' 143 | 144 | #Stop profiler 145 | profiler.stop() 146 | -------------------------------------------------------------------------------- /apex/pyprof/examples/simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This simple file provides an example of how to 5 | - import the pyprof library and initialize it 6 | - use the emit_nvtx context manager 7 | - start and stop the profiler 8 | 9 | Only kernels within profiler.start and profiler.stop calls are profiled. 10 | To profile 11 | $ nvprof -f -o simple.sql --profile-from-start off ./simple.py 12 | """ 13 | 14 | import sys 15 | import torch 16 | import torch.cuda.profiler as profiler 17 | 18 | #Import and initialize pyprof 19 | from apex import pyprof 20 | pyprof.nvtx.init() 21 | 22 | a = torch.randn(5, 5).cuda() 23 | b = torch.randn(5, 5).cuda() 24 | 25 | #Context manager 26 | with torch.autograd.profiler.emit_nvtx(): 27 | 28 | #Start profiler 29 | profiler.start() 30 | 31 | c = a + b 32 | c = torch.mul(a,b) 33 | c = torch.matmul(a,b) 34 | c = torch.argmax(a, dim=1) 35 | c = torch.nn.functional.pad(a, (1,1)) 36 | 37 | #Stop profiler 38 | profiler.stop() 39 | -------------------------------------------------------------------------------- /apex/pyprof/examples/user_annotation/README.md: -------------------------------------------------------------------------------- 1 | Nvidia NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm) 2 | are a useful tool to capture and observe events and code ranges etc. 3 | Using PyTorch APIs e.g, `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()` users can easily add their own NVTX range markers. These markers can then be observed in the Nvidia Visual Profiler (NVVP). 4 | 5 | While inserting NVTX markers (strings), if the users follow a specific string pattern `"layer:your_string_here"` e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention`, then `pyprof` will display the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when used with the `-c layer` option. 6 | 7 | NVTX range markers can be nested and if users follow the above string pattern, the output of `prof.py` will show all the markers associated with a kernel. 8 | 9 | The file `resnet.py` (a simplified version of the torchvision model) shows an example of how users can add (nested) NVTX markers with information which can greatly aid in understanding and analysis of networks. 10 | 11 | Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`. The tool will work seamlessly even if there are other markers or no markers at all. 12 | 13 | ### To run 14 | 15 | ```sh 16 | nvprof -fo resnet.sql --profile-from-start off python resnet.py 17 | parse.py resnet.sql > resnet.dict 18 | prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict 19 | ``` 20 | 21 | The file `resnet.sql` can also be opened with NVVP as usual. 
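As a minimal sketch of the `"layer:..."` pattern described above (the module and marker names below are made up for illustration and are not taken from `resnet.py`), nested NVTX ranges can be pushed and popped around sub-modules like this:

```python
import torch
import torch.cuda.nvtx as nvtx

class Block(torch.nn.Module):
    # Toy module; "block1" and the nested marker names are illustrative only.
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        nvtx.range_push("layer:block1")        # outer marker for the whole block
        nvtx.range_push("layer:block1_conv")   # nested marker for the convolution
        x = self.conv(x)
        nvtx.range_pop()
        nvtx.range_push("layer:block1_relu")   # nested marker for the activation
        x = self.relu(x)
        nvtx.range_pop()
        nvtx.range_pop()
        return x
```

Kernels launched inside these ranges should then show `block1`, `block1_conv` and `block1_relu` in the `layer` column when `prof.py` is run with a `-c` list that includes `layer`, as in the command above.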
22 | -------------------------------------------------------------------------------- /apex/pyprof/examples/user_annotation/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT=`realpath $0` 6 | SCRIPTPATH=`dirname $SCRIPT` 7 | PYPROF="$SCRIPTPATH/../.." 8 | 9 | parse="python $PYPROF/parse/parse.py" 10 | prof="python $PYPROF/prof/prof.py" 11 | 12 | for f in *.py 13 | do 14 | base=`basename $f .py` 15 | sql=$base.sql 16 | dict=$base.dict 17 | 18 | #NVprof 19 | echo "nvprof -fo $sql --profile-from-start off python $f" 20 | nvprof -fo $sql --profile-from-start off python $f 21 | 22 | #Parse 23 | echo $parse $sql 24 | $parse $sql > $dict 25 | 26 | #Prof 27 | echo $prof $dict 28 | #$prof -w 130 $dict 29 | $prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict 30 | \rm $sql $dict 31 | done 32 | -------------------------------------------------------------------------------- /apex/pyprof/nvtx/__init__.py: -------------------------------------------------------------------------------- 1 | from .nvmarker import init 2 | from .nvmarker import add_wrapper as wrap 3 | -------------------------------------------------------------------------------- /apex/pyprof/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/apex/pyprof/parse/__init__.py -------------------------------------------------------------------------------- /apex/pyprof/parse/__main__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | try: 4 | from .parse import main 5 | except ImportError as e: 6 | warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?") 7 | raise e 8 | 9 | if __name__ == '__main__': 10 | main() 11 | -------------------------------------------------------------------------------- /apex/pyprof/parse/db.py: -------------------------------------------------------------------------------- 1 | import sys, sqlite3 2 | 3 | class DB(object): 4 | """ 5 | This class provides functions for DB operations 6 | with exception handling.
7 | """ 8 | 9 | def __init__(self, dbFile): 10 | try: 11 | conn = sqlite3.connect(dbFile) 12 | conn.row_factory = sqlite3.Row 13 | c = conn.cursor() 14 | except: 15 | print("Error opening {}".format(dbFile)) 16 | sys.exit(1) 17 | 18 | self.conn = conn 19 | self.c = c 20 | 21 | def select(self, cmd): 22 | try: 23 | self.c.execute(cmd) 24 | #rows = self.c.fetchall() 25 | rows = [dict(row) for row in self.c.fetchall()] 26 | except sqlite3.Error as e: 27 | print(e) 28 | sys.exit(1) 29 | except: 30 | print("Uncaught error in SQLite access while executing {}".format(cmd)) 31 | sys.exit(1) 32 | 33 | #print(rows) 34 | return rows 35 | 36 | def insert(self, cmd, data): 37 | try: 38 | self.c.execute(cmd, data) 39 | except sqlite3.Error as e: 40 | print(e) 41 | sys.exit(1) 42 | except: 43 | print("Uncaught error in SQLite access while executing {}".format(cmd)) 44 | sys.exit(1) 45 | 46 | def execute(self, cmd): 47 | try: 48 | self.c.execute(cmd) 49 | except sqlite3.Error as e: 50 | print(e) 51 | sys.exit(1) 52 | except: 53 | print("Uncaught error in SQLite access while executing {}".format(cmd)) 54 | sys.exit(1) 55 | 56 | def commit(self): 57 | self.conn.commit() 58 | 59 | def close(self): 60 | self.c.close() 61 | self.conn.close() 62 | -------------------------------------------------------------------------------- /apex/pyprof/parse/parse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Parse the SQL db and print a dictionary for every kernel. 5 | """ 6 | 7 | import sys 8 | import argparse 9 | from tqdm import tqdm 10 | 11 | from .db import DB 12 | from .kernel import Kernel 13 | from .nvvp import NVVP 14 | 15 | def parseArgs(): 16 | parser = argparse.ArgumentParser(prog=sys.argv[0], description="Parse SQL (nvvp) db.") 17 | parser.add_argument("file", 18 | type=str, 19 | default=None, 20 | help="SQL db (nvvp) file.") 21 | 22 | args = parser.parse_args() 23 | return args 24 | 25 | def main(): 26 | args = parseArgs() 27 | 28 | db = DB(args.file) 29 | nvvp = NVVP(db) 30 | 31 | kInfo = nvvp.getKernelInfo() 32 | if len(kInfo) == 0: 33 | print("Found 0 kernels. Exiting.", file=sys.stderr) 34 | db.close() 35 | sys.exit(0) 36 | else: 37 | print("Found {} kernels. Getting info for each kernel.".format(len(kInfo)), file=sys.stderr) 38 | 39 | nvvp.createMarkerTable() 40 | 41 | prevSeqId = -1 42 | prevSubSeqId = -1 43 | prevOp = "na" 44 | 45 | Kernel.profStart = nvvp.getProfileStart() 46 | 47 | for i in tqdm(range(len(kInfo)), ascii=True): 48 | info = kInfo[i] 49 | k = Kernel() 50 | 51 | #Set kernel info 52 | k.setKernelInfo(info) 53 | 54 | #Get, set kernel name 55 | name = nvvp.getString(k.kNameId) 56 | k.setKernelName(name) 57 | 58 | #Get runtime info 59 | info = nvvp.getCPUInfo(k.corrId) 60 | k.setRunTimeInfo(info) 61 | 62 | #Get and set marker and seqid info 63 | info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime) 64 | k.setMarkerInfo(info) 65 | 66 | #If the seqId contains both 0 and non zero integers, remove 0. 67 | if any(seq != 0 for seq in k.seqId) and (0 in k.seqId): 68 | k.seqId.remove(0) 69 | 70 | #Set direction (it uses seq id) 71 | k.setDirection() 72 | 73 | #Set op 74 | k.setOp() 75 | 76 | #The following code is based on heuristics. 77 | #TODO: Refactor. 78 | #Assign subSeqId, adjust seqId and altSeqId 79 | #seqId can be 0. 80 | #A kernel can have multiple seqIds both in fprop and bprop. 81 | #In bprop, seqIds might not decrease monotonically. I have observed a few blips. 
82 | if len(k.seqId): 83 | assert (k.dir in ["fprop", "bprop"]) 84 | if (k.dir == "fprop"): 85 | #Check if there is a sequence id larger than the previous 86 | inc = (k.seqId[-1] > prevSeqId) 87 | if inc: 88 | currSeqId = [x for x in k.seqId if x > prevSeqId][0] 89 | else: 90 | currSeqId = prevSeqId 91 | else: 92 | currSeqId = k.seqId[0] 93 | 94 | #if ((currSeqId == prevSeqId) and (k.op == prevOp)): 95 | if ((currSeqId == prevSeqId) and (k.op == prevOp)) or ((k.op[0] == "forward") and (k.op == prevOp) and (k.mod[0] in ["LSTMCell", "GRUCell", "RNNCell"])): 96 | #The second condition is to trap cases when pytorch does not use cudnn for a LSTMCell. 97 | k.subSeqId = prevSubSeqId + 1 98 | 99 | prevSeqId = currSeqId 100 | prevSubSeqId = k.subSeqId 101 | prevOp = k.op 102 | 103 | #Keep currSeqId in k.seqId, move everything else to k.altSeqId 104 | for s in k.seqId: 105 | if s != currSeqId: 106 | k.seqId.remove(s) 107 | k.altSeqId.append(s) 108 | 109 | for s in k.altSeqId: 110 | if s == currSeqId: 111 | k.altSeqId.remove(s) 112 | 113 | k.altSeqId = list(set(k.altSeqId)) 114 | if (len(k.altSeqId)): 115 | (k.altSeqId).sort() 116 | 117 | k.print() 118 | 119 | db.close() 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /apex/pyprof/prof/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data, prof 2 | -------------------------------------------------------------------------------- /apex/pyprof/prof/__main__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | try: 4 | from .prof import main 5 | except ImportError as e: 6 | warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?") 7 | raise e 8 | 9 | if __name__ == '__main__': 10 | main() 11 | -------------------------------------------------------------------------------- /apex/pyprof/prof/activation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Activation(OperatorLayerBase): 6 | """ 7 | This class handles the various activation functions. 
8 | """ 9 | 10 | ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"] 11 | 12 | def __init__(self, d): 13 | marker = eval(d.argMarker[0]) 14 | mod = marker['mod'] 15 | op = marker['op'] 16 | args = marker['args'] 17 | 18 | self.marker = marker 19 | self.mod_ = mod 20 | self.op_ = op 21 | self.args = args 22 | 23 | assert (mod in ["torch.nn.functional", "torch", "Tensor"]) 24 | 25 | #Filter out named parameters 26 | args = list(filter(lambda x : x['name'] == '', args)) 27 | 28 | assert (len(args) >= 1) 29 | arg = args[0] 30 | assert (arg['type'] == "tensor") 31 | 32 | self.i = arg 33 | self.dir = d.dir 34 | 35 | def params(self): 36 | p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])]) 37 | return p 38 | 39 | def flops(self): 40 | direction = self.dir 41 | tensor = self.i['shape'] 42 | t = self.i['dtype'] 43 | 44 | # TODO: revise 45 | elems = Utility.numElems(tensor) 46 | return elems 47 | 48 | def bytes(self): 49 | direction = self.dir 50 | tensor = self.i['shape'] 51 | t = self.i['dtype'] 52 | 53 | elems = Utility.numElems(tensor) 54 | elems = elems * (2 if direction == "fprop" else 3) 55 | 56 | return elems * Utility.typeToBytes(t) 57 | 58 | def tc(self): 59 | return "-" 60 | 61 | def op(self): 62 | return self.op_ 63 | 64 | def mod(self): 65 | return self.mod_ 66 | -------------------------------------------------------------------------------- /apex/pyprof/prof/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | class OperatorLayerBase(ABC): 4 | """ 5 | Base class for all layers and operators. 6 | Every derived class should have the following functions. 7 | """ 8 | 9 | @abstractmethod 10 | def tc(self): 11 | """ 12 | Tensor core usage by the kernel. 13 | Return "1" (yes), "0" (no, but possible), "-" (not applicable) 14 | """ 15 | pass 16 | 17 | @abstractmethod 18 | def params(self): 19 | """ 20 | Kernel parameters to be printed. 21 | """ 22 | pass 23 | 24 | @abstractmethod 25 | def flops(self): 26 | """ 27 | Note that 1 FMA = 2 flops. 28 | """ 29 | pass 30 | 31 | @abstractmethod 32 | def bytes(self): 33 | pass 34 | 35 | @abstractmethod 36 | def mod(self): 37 | """ 38 | Name of the module/class e.g. torch.nn.functional. 39 | """ 40 | pass 41 | 42 | @abstractmethod 43 | def op(self): 44 | """ 45 | Name of the operator e.g. sigmoid. 46 | """ 47 | pass 48 | -------------------------------------------------------------------------------- /apex/pyprof/prof/convert.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Convert(OperatorLayerBase): 6 | """ 7 | Class to handle convert operations. 
8 | """ 9 | ops = ["byte", "char", "double", "float", "half", "int", "long", "short", "to"] 10 | 11 | def __init__(self, d): 12 | marker = eval(d.argMarker[0]) 13 | mod = marker['mod'] 14 | op = marker['op'] 15 | args = marker['args'] 16 | 17 | self.marker = marker 18 | self.mod_ = mod 19 | self.op_ = op 20 | self.args = args 21 | 22 | assert (mod == "Tensor") 23 | assert (op in Convert.ops) 24 | assert (len(args) == 1) 25 | 26 | #The argument could be a tensor or scalar 27 | t = args[0] 28 | if t['type'] == "tensor": 29 | shape = t['shape'] 30 | stype = t['dtype'] 31 | else: 32 | shape = (1,) 33 | stype = t['type'] 34 | if self.op_ == "to": 35 | op = stype 36 | 37 | self.shape = shape 38 | self.stype = stype 39 | self.dtype = op 40 | 41 | def params(self): 42 | p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)]) 43 | return p 44 | 45 | def op(self): 46 | return self.op_ 47 | 48 | def mod(self): 49 | return self.mod_ 50 | 51 | def tc(self): 52 | return "-" 53 | 54 | def elems(self): 55 | return Utility.numElems(self.shape) 56 | 57 | def flops(self): 58 | return 0 59 | 60 | def bytes(self): 61 | b = self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype)) 62 | return b 63 | -------------------------------------------------------------------------------- /apex/pyprof/prof/data.py: -------------------------------------------------------------------------------- 1 | from .utility import Utility 2 | 3 | class Data(object): 4 | """ 5 | Class to store all the data for every kernel e.g. name, bytes, flops, device, stream etc. 6 | """ 7 | def __init__(self, kernel): 8 | #Available from NVprof 9 | self.tid = kernel['tid'] 10 | self.device = kernel['device'] 11 | self.stream = kernel['stream'] 12 | self.grid = str(kernel['grid']).replace(" ","").replace("(","").replace(")","") 13 | self.block = str(kernel['block']).replace(" ","").replace("(","").replace(")","") 14 | self.name = kernel['kShortName'].replace(" ","_") 15 | self.lName = kernel['kLongName'] 16 | self.sil = kernel['kDuration'] #units ns 17 | 18 | self.index = None 19 | 20 | #Markers 21 | self.argMarker = kernel['marker'] 22 | self.modMarker = kernel['reprMarkers'] 23 | self.seqMarker = kernel['seqMarker'] 24 | 25 | self.layer = kernel['layer'] 26 | self.trace = kernel['trace'] 27 | 28 | self.seqId = kernel['seqId'] 29 | self.altSeqId = kernel['altSeqId'] 30 | 31 | self.dir = kernel['dir'] 32 | self.sub = kernel['subSeqId'] 33 | 34 | self.mod = "na" 35 | self.op = "na" 36 | self.params = {"na":"na"} 37 | self.tc = "na" 38 | self.flops = 0 39 | self.bytes = 0 40 | 41 | def setParams(self, params): 42 | #Remove space from params 43 | qaz = "" 44 | for key,value in params.items(): 45 | if "type" not in key: 46 | qaz += "{}={},".format(key,value) 47 | else: 48 | if type(value) is str: 49 | qaz += "{},".format(Utility.typeToString(value)) 50 | else: 51 | qaz += "{}".format(value) 52 | 53 | self.params = qaz.replace(" ", "") 54 | 55 | -------------------------------------------------------------------------------- /apex/pyprof/prof/dropout.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Dropout(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | 
self.args = args 17 | 18 | assert (mod == "torch.nn.functional") 19 | assert (op == "dropout") 20 | #assert (len(args) == 1) 21 | 22 | self.shape = args[0]['shape'] 23 | self.type = args[0]['dtype'] 24 | self.dir = d.dir 25 | 26 | return 27 | 28 | def params(self): 29 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 30 | return p 31 | 32 | def op(self): 33 | return self.op_ 34 | 35 | def mod(self): 36 | return self.mod_ 37 | 38 | def tc(self): 39 | return "-" 40 | 41 | def elems(self): 42 | return Utility.numElems(self.shape) 43 | 44 | def bytes(self): 45 | #Ignoring the cost of writing and reading the mask 46 | return Utility.typeToBytes(self.type) * self.elems() * 2 47 | 48 | def flops(self): 49 | # Note: This is approximate and depends on the RNG 50 | return 5*self.elems() 51 | -------------------------------------------------------------------------------- /apex/pyprof/prof/embedding.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Embedding(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod == "torch.nn.functional") 19 | assert (op == "embedding") 20 | 21 | self.ishape = args[0]['shape'] 22 | self.itype = args[0]['dtype'] 23 | 24 | self.eshape = args[1]['shape'] 25 | self.etype = args[1]['dtype'] 26 | 27 | assert (len(self.eshape) == 2) 28 | 29 | self.dir = d.dir 30 | self.sub = d.sub 31 | return 32 | 33 | def params(self): 34 | p = OrderedDict([('I', self.ishape), ('itype', self.itype), ('E', self.eshape), ('etype', self.etype)]) 35 | return p 36 | 37 | def op(self): 38 | return self.op_ 39 | 40 | def mod(self): 41 | return self.mod_ 42 | 43 | def tc(self): 44 | return "-" 45 | 46 | def bytes(self): 47 | ishape = self.ishape 48 | itype = self.itype 49 | eshape = self.eshape 50 | etype = self.etype 51 | 52 | ielems = Utility.numElems(ishape) 53 | 54 | b = 0 55 | if self.dir == "fprop": 56 | #indices 57 | b += ielems * Utility.typeToBytes(itype) 58 | #read and write the embedding matrix 59 | b += ielems * eshape[1] * 2 * Utility.typeToBytes(etype) 60 | else: 61 | #3 times the size of the incoming gradient 62 | b = ielems * eshape[1] * 3 * Utility.typeToBytes(etype) 63 | 64 | if self.sub > 0: 65 | b = 0 66 | 67 | return b 68 | 69 | def flops(self): 70 | # Note: not implemented yet 71 | return 0 72 | -------------------------------------------------------------------------------- /apex/pyprof/prof/loss.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | #TODO: Add support for additional loss functions. 
6 | 7 | class MSELoss(OperatorLayerBase): 8 | 9 | def __init__(self, d): 10 | marker = eval(d.argMarker[0]) 11 | mod = marker['mod'] 12 | op = marker['op'] 13 | args = marker['args'] 14 | 15 | self.marker = marker 16 | self.mod_ = mod 17 | self.op_ = op 18 | self.args = args 19 | 20 | assert (mod == "torch.nn.functional") 21 | assert (op == "mse_loss") 22 | assert (len(args) == 3) 23 | 24 | #Get input, target and reduction 25 | if (args[0]['name'] == ""): 26 | x = args[0] 27 | else: 28 | x = list(filter(lambda x : x['name'] == "input", args))[0] 29 | 30 | if (args[1]['name'] == ""): 31 | y = args[1] 32 | else: 33 | y = list(filter(lambda x : x['name'] == "target", args))[0] 34 | 35 | if (args[2]['name'] == ""): 36 | r = args[2] 37 | else: 38 | r = list(filter(lambda x : x['name'] == "reduction", args))[0] 39 | 40 | assert (x['type'] == y['type'] == "tensor") 41 | assert (x['shape'] == y['shape']) 42 | assert (x['dtype'] == y['dtype']) 43 | assert (r['type'] == "str") 44 | assert (r['value'] in ["none", "mean", "sum"]) 45 | 46 | self.shape = x['shape'] 47 | self.type = x['dtype'] 48 | self.red = r['value'] 49 | self.dir = d.dir 50 | 51 | def params(self): 52 | p = OrderedDict([('T', self.shape), ('type', self.type), ('red', self.red)]) 53 | return p 54 | 55 | def elems(self): 56 | red = self.red 57 | e = Utility.numElems(self.shape) 58 | 59 | if self.dir == "fprop": 60 | if red == "none": 61 | e *= 3 62 | else: 63 | e *= 2 64 | else: 65 | if red == "none": 66 | e *= 4 67 | else: 68 | e *= 3 69 | return e 70 | 71 | def bytes(self): 72 | return self.elems() * Utility.typeToBytes(self.type) 73 | 74 | def flops(self): 75 | return self.elems() * 2 + 1 76 | 77 | def tc(self): 78 | return "-" 79 | 80 | def op(self): 81 | return self.op_ 82 | 83 | def mod(self): 84 | return self.mod_ 85 | -------------------------------------------------------------------------------- /apex/pyprof/prof/normalization.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class BatchNorm(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (op == "batch_norm") 19 | assert (len(args) == 8) 20 | i = args[0] 21 | assert (i['type'] == "tensor") 22 | 23 | self.shape = i['shape'] 24 | self.type = i['dtype'] 25 | self.dir = d.dir 26 | 27 | def params(self): 28 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 29 | return p 30 | 31 | def tc(self): 32 | return "-" 33 | 34 | def op(self): 35 | return self.op_ 36 | 37 | def mod(self): 38 | return self.mod_ 39 | 40 | def elems(self): 41 | return Utility.numElems(self.shape) 42 | 43 | def flops(self): 44 | # Variance algo-dependent, but this is a reasonable value. 45 | return self.elems() * 8 46 | 47 | def bytes(self): 48 | e = self.elems() 49 | if self.dir == "fprop": 50 | e *= 4 51 | else: 52 | e *= 5 53 | 54 | return e * Utility.typeToBytes(self.type) 55 | -------------------------------------------------------------------------------- /apex/pyprof/prof/optim.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | #TODO: Add support for other optimizers. 
6 | 7 | class Adam(OperatorLayerBase): 8 | 9 | def __init__(self, d): 10 | marker = eval(d.argMarker[0]) 11 | mod = marker['mod'] 12 | op = marker['op'] 13 | args = marker['args'] 14 | 15 | self.marker = marker 16 | self.mod_ = mod 17 | self.op_ = op 18 | self.args = args 19 | 20 | assert(op == "adam") 21 | assert (len(args) == 12) or (len(args) == 14) 22 | w, hw, m, v, g = args[0:5] 23 | assert (w['shape'] == m['shape'] == v['shape'] == g['shape']) 24 | assert (hw['shape'] == w['shape']) or (hw['shape'] == (0,)) #hw could be null 25 | assert (w['type'] == m['type'] == v['type'] == g['type'] == hw['type'] == "tensor") 26 | assert (w['dtype'] == m['dtype'] == v['dtype'] == "float32") 27 | 28 | self.w = w 29 | self.g = g 30 | 31 | def params(self): 32 | p = OrderedDict([('T',self.w['shape']), ('wtype',self.w['dtype']), ('gtype',self.g['dtype'])]) 33 | return p 34 | 35 | def flops(self): 36 | return 0 37 | 38 | def bytes(self): 39 | wshape = self.w['shape'] 40 | wtype = self.w['dtype'] 41 | gtype = self.g['dtype'] 42 | b = 0 43 | 44 | elems = Utility.numElems(wshape) 45 | 46 | #Get time to stream read/write w, m, v 47 | b += 6 * elems * Utility.typeToBytes(wtype) 48 | 49 | #Get time to read "g" 50 | b += elems * Utility.typeToBytes(gtype) 51 | 52 | if wtype != gtype: #mixed precision 53 | #Get time to write "hw" 54 | b += elems * Utility.typeToBytes(gtype) 55 | 56 | return b 57 | 58 | def tc(self): 59 | return "-" 60 | 61 | def op(self): 62 | return self.op_ 63 | 64 | def mod(self): 65 | return self.mod_ 66 | -------------------------------------------------------------------------------- /apex/pyprof/prof/pooling.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | 4 | # Work in progress.
5 | 6 | #poolFuncs = ["max_pool2d_with_indices_forward", "max_pool2d_with_indices"] 7 | class MaxPool2d(object): 8 | 9 | def parse(marker): 10 | 11 | def convert2Tuple(arg): 12 | assert (arg['type'] in ["int", "tuple"]) 13 | if arg['type'] == "int": 14 | return (arg['value'], arg['value']) 15 | else: 16 | return arg['value'] 17 | 18 | mod = marker['mod'] 19 | op = marker['op'] 20 | args = marker['args'] 21 | assert (mod == "torch.nn.functional") 22 | assert (op == "max_pool2d") 23 | assert (len(args) >= 2) 24 | 25 | #input 26 | assert (args[0]['name'] == "") 27 | inp = args[0] 28 | assert (inp['type'] == "tensor") 29 | i = inp['shape'] 30 | t = inp['dtype'] 31 | assert (len(i) == 4) #nchw tensor 32 | 33 | #kernel 34 | if (args[1]['name'] == ""): 35 | k = args[1] 36 | else: 37 | k = list(filter(lambda x : x['name'] == "kernel_size", args))[0] 38 | k = convert2Tuple(k) 39 | 40 | #stride 41 | s = k #default value 42 | if ((len(args) >= 3) and args[2] == ""): 43 | s = args[2] 44 | s = convert2Tuple(s) 45 | elif any(x['name'] == "stride" for x in args): 46 | s = list(filter(lambda x : x['name'] == "stride", args))[0] 47 | s = convert2Tuple(s) 48 | 49 | #padding 50 | p = (0,0) 51 | if ((len(args) >= 4) and args[3] == ""): 52 | p = args[3] 53 | p = convert2Tuple(p) 54 | elif any(x['name'] == "padding" for x in args): 55 | p = list(filter(lambda x : x['name'] == "padding", args))[0] 56 | p = convert2Tuple(p) 57 | 58 | params = OrderedDict([('T', i), ('K', k), ('s',s), ('p',p), ('type', t)]) 59 | return params 60 | -------------------------------------------------------------------------------- /apex/pyprof/prof/randomSample.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class RandPerm(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod == "torch") 19 | assert (op == "randperm") 20 | assert (len(args) == 1) 21 | n = args[0] 22 | assert n['type'] == "int" 23 | self.n = n['value'] 24 | 25 | def params(self): 26 | p = OrderedDict([('N', self.n)]) 27 | return p 28 | 29 | def tc(self): 30 | return "-" 31 | 32 | def op(self): 33 | return self.op_ 34 | 35 | def mod(self): 36 | return self.mod_ 37 | 38 | def bytes(self): 39 | return self.n * Utility.typeToBytes("int64") 40 | 41 | def flops(self): 42 | # Depends on RNG but this is probably a reasonable assumption. 
43 | return self.n * 3 44 | -------------------------------------------------------------------------------- /apex/pyprof/prof/reduction.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Mean(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod in ["torch", "Tensor"]) 19 | assert (op == "mean") 20 | 21 | #Filter out named parameters 22 | args = list(filter(lambda x : x['name'] == '', args)) 23 | 24 | assert (len(args) <= 2) 25 | i = args[0] 26 | 27 | self.shape = i['shape'] 28 | self.type = i['dtype'] 29 | self.dir = d.dir 30 | self.sub = d.sub 31 | 32 | def params(self): 33 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 34 | return p 35 | 36 | def tc(self): 37 | return "-" 38 | 39 | def op(self): 40 | return self.op_ 41 | 42 | def mod(self): 43 | return self.mod_ 44 | 45 | def elems(self): 46 | return Utility.numElems(self.shape) 47 | 48 | def bytes(self): 49 | if self.sub == 0: 50 | return self.elems() * Utility.typeToBytes(self.type) 51 | else: 52 | return 0 53 | 54 | def flops(self): 55 | if self.sub == 0: 56 | return self.elems() + 1 57 | else: 58 | return 0 59 | 60 | class Sum(OperatorLayerBase): 61 | 62 | def __init__(self, d): 63 | marker = eval(d.argMarker[0]) 64 | mod = marker['mod'] 65 | op = marker['op'] 66 | args = marker['args'] 67 | 68 | self.marker = marker 69 | self.mod_ = mod 70 | self.op_ = op 71 | self.args = args 72 | 73 | assert (mod in ["torch", "Tensor"]) 74 | assert (op == "sum") 75 | assert (len(args) >= 1) 76 | 77 | #Get input 78 | if (args[0]['name'] == ""): 79 | i = args[0] 80 | else: 81 | i = list(filter(lambda x : x['name'] == "input", args))[0] 82 | 83 | self.shape = i['shape'] 84 | self.type = i['dtype'] 85 | 86 | def params(self): 87 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 88 | return p 89 | 90 | def tc(self): 91 | return "-" 92 | 93 | def op(self): 94 | return self.op_ 95 | 96 | def mod(self): 97 | return self.mod_ 98 | 99 | def elems(self): 100 | return Utility.numElems(self.shape) 101 | 102 | def flops(self): 103 | # Note: This is incorrect, need to calculate actual flops (say via nvprof) 104 | return self.elems() 105 | 106 | def bytes(self): 107 | return self.elems() * Utility.typeToBytes(self.type) 108 | 109 | class Norm(OperatorLayerBase): 110 | 111 | def __init__(self, d): 112 | marker = eval(d.argMarker[0]) 113 | mod = marker['mod'] 114 | op = marker['op'] 115 | args = marker['args'] 116 | 117 | self.marker = marker 118 | self.mod_ = mod 119 | self.op_ = op 120 | self.args = args 121 | 122 | assert (mod in ["torch", "Tensor"]) 123 | assert (op == "norm") 124 | #assert (len(args) == 1) 125 | i = args[0] 126 | self.shape = i['shape'] 127 | self.type = i['dtype'] 128 | 129 | def params(self): 130 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 131 | return p 132 | 133 | def elems(self): 134 | return Utility.numElems(self.shape) 135 | 136 | def bytes(self): 137 | return self.elems() * Utility.typeToBytes(self.type) 138 | 139 | def flops(self): 140 | # square and add plus sqrt 141 | return 2 * self.elems() + 1 142 | 143 | def tc(self): 144 | return "-" 145 | 146 | def op(self): 147 | return self.op_ 148 | 149 | def mod(self): 150 | 
return self.mod_ 151 | -------------------------------------------------------------------------------- /apex/pyprof/prof/softmax.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from .utility import Utility 3 | from .base import OperatorLayerBase 4 | 5 | class Softmax(OperatorLayerBase): 6 | 7 | def __init__(self, d): 8 | marker = eval(d.argMarker[0]) 9 | mod = marker['mod'] 10 | op = marker['op'] 11 | args = marker['args'] 12 | 13 | self.marker = marker 14 | self.mod_ = mod 15 | self.op_ = op 16 | self.args = args 17 | 18 | assert (mod == "torch.nn.functional") 19 | assert (op == "softmax") 20 | 21 | #Filter out named parameters 22 | args = list(filter(lambda x : x['name'] == '', args)) 23 | 24 | assert (len(args) <= 2) 25 | self.shape = args[0]['shape'] 26 | self.type = args[0]['dtype'] 27 | self.dir = d.dir 28 | 29 | return 30 | 31 | def op(self): 32 | return self.op_ 33 | 34 | def mod(self): 35 | return self.mod_ 36 | 37 | def tc(self): 38 | return "-" 39 | 40 | def params(self): 41 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 42 | return p 43 | 44 | def elems(self): 45 | return Utility.numElems(self.shape) 46 | 47 | def flops(self): 48 | # Note: exp, sum-reduce, divide 49 | #flops = elems * 3 50 | return 0 51 | 52 | def bytes(self): 53 | b = self.elems() * Utility.typeToBytes(self.type) 54 | b *= 3 if self.dir == "fprop" else 5 #verify 55 | return b 56 | 57 | class LogSoftmax(OperatorLayerBase): 58 | 59 | def __init__(self, d): 60 | marker = eval(d.argMarker[0]) 61 | mod = marker['mod'] 62 | op = marker['op'] 63 | args = marker['args'] 64 | 65 | self.marker = marker 66 | self.mod_ = mod 67 | self.op_ = op 68 | self.args = args 69 | 70 | assert (mod == "torch.nn.functional") 71 | assert (op == "log_softmax") 72 | 73 | #Filter out named parameters 74 | args = list(filter(lambda x : x['name'] == '', args)) 75 | 76 | assert (len(args) <= 2) 77 | 78 | #Get input 79 | if (args[0]['name'] == ""): 80 | i = args[0] 81 | else: 82 | i = list(filter(lambda x : x['name'] == "input", args))[0] 83 | 84 | t = i['dtype'] 85 | 86 | self.shape = i['shape'] 87 | self.type = i['dtype'] 88 | self.dir = d.dir 89 | return 90 | 91 | def op(self): 92 | return self.op_ 93 | 94 | def mod(self): 95 | return self.mod_ 96 | 97 | def tc(self): 98 | return "-" 99 | 100 | def params(self): 101 | p = OrderedDict([('T', self.shape), ('type', self.type)]) 102 | return p 103 | 104 | def elems(self): 105 | return Utility.numElems(self.shape) 106 | 107 | def flops(self): 108 | # Note: exp, sum-reduce, divide, log 109 | #flops = elems * 4 110 | return 0 111 | 112 | def bytes(self): 113 | b = self.elems() * Utility.typeToBytes(self.type) 114 | b *= 3 if self.dir == "fprop" else 5 #verify 115 | return b 116 | -------------------------------------------------------------------------------- /apex/pyprof/prof/usage.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | def parseArgs(): 5 | """ 6 | Print usage and parse arguments. 7 | """ 8 | 9 | def check_cols(value): 10 | valid = ["idx", "seq", "altseq", "tid", "layer", "trace", "dir", "sub", "mod", "op", "kernel", "params", "sil", "tc", "device", "stream", "grid", "block", "flops", "bytes"] 11 | cols = value.split(",") 12 | for col in cols: 13 | if col not in valid: 14 | raise argparse.ArgumentTypeError("{} is not a valid column name. 
Valid column names are {}.".format(col, ",".join(valid))) 15 | return cols 16 | 17 | def openFile(f): 18 | try: 19 | d = open(f, "r") 20 | return d 21 | except IOError: 22 | print("Error opening file {}. Exiting.".format(f), file=sys.stderr) 23 | sys.exit(1) 24 | 25 | parser = argparse.ArgumentParser(prog=sys.argv[0], description="PyTorch Profiler", formatter_class=argparse.RawTextHelpFormatter) 26 | parser.add_argument("file", 27 | nargs='?', 28 | type=str, 29 | default=None, 30 | help="Output of parse.py (Python dictionary).") 31 | 32 | parser.add_argument("-c", 33 | type=check_cols, 34 | default="idx,dir,sub,mod,op,kernel,params,sil", 35 | help='''Comma separated names of columns to print. 36 | idx: Index 37 | seq: PyTorch Sequence Id 38 | altseq: PyTorch Alternate Sequence Id 39 | tid: Thread Id 40 | layer: User annotated NVTX string (can be nested) 41 | trace: Function Call Trace 42 | dir: Direction 43 | sub: Sub Sequence Id 44 | mod: Module 45 | op: Operation 46 | kernel: Kernel Name 47 | params: Parameters 48 | sil: Silicon Time (in ns) 49 | tc: Tensor Core Usage 50 | device: GPU Device Id 51 | stream: Stream Id 52 | grid: Grid Dimensions 53 | block: Block Dimensions 54 | flops: Floating point ops (FMA = 2 FLOPs) 55 | bytes: Number of bytes in and out of DRAM 56 | e.g. -c idx,kernel,sil''') 57 | 58 | group = parser.add_mutually_exclusive_group() 59 | group.add_argument("--csv", 60 | action="store_true", 61 | default=False, 62 | help="Print a CSV output.") 63 | group.add_argument("-w", 64 | type=int, 65 | default=0, 66 | help="Width of columnated output.") 67 | 68 | args = parser.parse_args() 69 | if args.file is None: 70 | args.file = sys.stdin 71 | else: 72 | args.file = openFile(args.file) 73 | return args 74 | -------------------------------------------------------------------------------- /apex/pyprof/prof/utility.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | class Utility(object): 4 | 5 | @staticmethod 6 | def numElems(shape): 7 | assert (type(shape) == tuple) 8 | return reduce(lambda x,y: x*y, shape, 1) 9 | 10 | @staticmethod 11 | def typeToBytes(t): 12 | if (t in ["uint8", "int8", "byte", "char", "bool"]): 13 | return 1 14 | elif (t in ["float16", "half", "int16", "short"]): 15 | return 2 16 | elif (t in ["float32", "float", "int32", "int"]): 17 | return 4 18 | elif (t in ["int64", "long", "float64", "double"]): 19 | return 8 20 | assert False 21 | 22 | @staticmethod 23 | def typeToString(t): 24 | if (t in ["uint8", "byte", "char",]): 25 | return "uint8" 26 | elif (t in ["int8",]): 27 | return "int8" 28 | elif (t in ["int16", "short",]): 29 | return "int16" 30 | elif (t in ["float16", "half"]): 31 | return "fp16" 32 | elif (t in ["float32", "float"]): 33 | return "fp32" 34 | elif (t in ["int32", "int",]): 35 | return "int32" 36 | elif (t in ["int64", "long"]): 37 | return "int64" 38 | elif (t in ["float64", "double",]): 39 | return "fp64" 40 | elif (t in ["bool",]): 41 | return "bool" 42 | assert False 43 | 44 | @staticmethod 45 | def hasNVTX(marker): 46 | if type(marker) is str: 47 | try: 48 | marker = eval(marker) 49 | except: 50 | return False 51 | 52 | if type(marker) is dict: 53 | keys = marker.keys() 54 | return ("mod" in keys) and ("op" in keys) and ("args" in keys) 55 | else: 56 | return False 57 | 58 | @staticmethod 59 | def isscalar(t): 60 | return (t in ["float", "int"]) 61 | --------------------------------------------------------------------------------
/apex/reparameterization/README.md: -------------------------------------------------------------------------------- 1 | Under construction... 2 | -------------------------------------------------------------------------------- /apex/reparameterization/weight_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parameter import Parameter 3 | from ..fp16_utils import Fused_Weight_Norm 4 | import time 5 | 6 | from .reparameterization import Reparameterization 7 | 8 | def _norm(p, dim): 9 | """Computes the norm over all dimensions except dim""" 10 | if dim is None: 11 | return p.norm() 12 | elif dim == 0: 13 | output_size = (p.size(0),) + (1,) * (p.dim() - 1) 14 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) 15 | elif dim == p.dim() - 1: 16 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),) 17 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) 18 | return _norm(p.transpose(0, dim), 0).transpose(0, dim) 19 | 20 | HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor) 21 | 22 | class WeightNorm(Reparameterization): 23 | r""" 24 | Weight normalization is a reparameterization that decouples the magnitude 25 | of a weight tensor from its direction. This replaces the parameter specified 26 | by `name` (e.g. "weight") with two parameters: one specifying the magnitude 27 | (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). 28 | Weight normalization is implemented via a hook that recomputes the weight 29 | tensor from the magnitude and direction before every :meth:`~Module.forward` 30 | call. 31 | 32 | .. math:: 33 | \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} 34 | 35 | By default, with `dim=0`, the norm is computed independently per output 36 | channel/plane. To compute a norm over the entire weight tensor, use 37 | `dim=None`. 38 | """ 39 | def compute_weight(self, module=None, name=None): 40 | """ 41 | Computes weight normalized weight value to assign value to module attribute 42 | with name `name`. 43 | Arguments: 44 | module (nn.Module): module with weight we'd like to reparameterize 45 | Returns: 46 | w (Tensor): Tensor object containing value of reparameterized weight 47 | """ 48 | if module is None: 49 | module = self.module 50 | if name is None: 51 | name = self.name 52 | module, name = Reparameterization.get_module_and_name(module, name) 53 | g = getattr(module, name + '_g') 54 | v = getattr(module, name + '_v') 55 | 56 | fused_weight_norm = Fused_Weight_Norm.apply 57 | v = v.contiguous() 58 | w = fused_weight_norm(v, g, self.dim) 59 | 60 | return w 61 | 62 | def reparameterize(self, name, weight, dim): 63 | """ 64 | Creates Parameters v and gto be used for weight normalization 65 | and creates names that for attributes for the module these Parameters 66 | will correspond to. The parameters will be registered according to the names 67 | provided. 
68 | Arguments: 69 | module (nn.Module): module with weight we'd like to reparameterize 70 | name (str, optional): name of weight parameter 71 | dim (int, optional): dimension over which to compute parameterization 72 | Returns: 73 | names (list, str): names of Parameters to be used for reparameterization 74 | params (list, Parameter): Parameters to be used for reparameterization 75 | """ 76 | names = [name + '_g', name + '_v'] 77 | params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)] 78 | return names, params 79 | -------------------------------------------------------------------------------- /data/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BM-K/KoBART-summarization-pytorch/0945030ea49450b91d711b3fcd19af71762c5628/data/dataset.zip -------------------------------------------------------------------------------- /model/KoBART/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from kobart import get_pytorch_kobart_model 4 | from transformers import BartForConditionalGeneration 5 | 6 | 7 | class KoBARTConditionalGeneration(nn.Module): 8 | def __init__(self, args, tokenizer): 9 | super(KoBARTConditionalGeneration, self).__init__() 10 | self.model = BartForConditionalGeneration.from_pretrained(get_pytorch_kobart_model(), 11 | output_attentions=True, 12 | output_hidden_states=True) 13 | 14 | self.vocab_size = self.model.config.vocab_size 15 | 16 | self.args = args 17 | self.linear_copy = nn.Linear(768, 1) 18 | self.tokenizer = tokenizer 19 | self.loss_fct = nn.CrossEntropyLoss() 20 | 21 | def forward(self, inputs, mode): 22 | if mode != 'test': 23 | outs = self.model(input_ids=inputs['input_ids'], 24 | attention_mask=inputs['attention_mask'], 25 | decoder_input_ids=inputs['decoder_input_ids'], 26 | decoder_attention_mask=inputs['decoder_attention_mask'], 27 | labels=inputs['labels'], return_dict=True) 28 | 29 | encoder_input_ids = inputs['input_ids'] 30 | 31 | logits = outs.logits 32 | last_hidden_state = outs.decoder_hidden_states[-1] 33 | last_attention_weight = torch.softmax(outs.cross_attentions[-1], dim=-1) 34 | 35 | p_copy = torch.sigmoid(self.linear_copy(last_hidden_state)) 36 | previous_word_pro = torch.softmax(logits, dim=-1) * (1 - p_copy) 37 | 38 | encoder_word_attention = p_copy * torch.mean(last_attention_weight, dim=1) 39 | 40 | mask = torch.where(encoder_input_ids == self.tokenizer.pad_token_id, 41 | encoder_word_attention.new_zeros(encoder_input_ids.shape), 42 | encoder_word_attention.new_ones(encoder_input_ids.shape)) 43 | 44 | encoder_word_attention = encoder_word_attention * mask.unsqueeze(1) 45 | personal_words = encoder_input_ids.unsqueeze(1).repeat(1, encoder_word_attention.shape[1], 1) 46 | word_pro = torch.scatter_add(previous_word_pro, 2, personal_words, encoder_word_attention) 47 | 48 | loss = self.loss_fct(word_pro.view(-1, self.vocab_size), inputs['labels'].view(-1)) 49 | 50 | return loss 51 | 52 | else: 53 | outputs = self.model.generate(inputs['input_ids'], 54 | max_length=self.args.max_len, 55 | num_beams=5, 56 | linear_copy=self.linear_copy) 57 | 58 | return outputs 59 | -------------------------------------------------------------------------------- /model/setting.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import logging 4 | import numpy as np 5 | from argparse import ArgumentParser 6 | 7 | 8 | 
class Arguments(): 9 | 10 | def __init__(self): 11 | self.parser = ArgumentParser() 12 | 13 | def add_type_of_processing(self): 14 | self.add_argument('--opt_level', type=str, default='O1') 15 | self.add_argument('--fp16', type=str, default='True') 16 | self.add_argument('--train', type=str, default='True') 17 | self.add_argument('--test', type=str, default='True') 18 | self.add_argument('--device', type=str, default=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')) 19 | 20 | def add_hyper_parameters(self): 21 | self.add_argument('--patient', type=int, default=5) 22 | self.add_argument('--dropout', type=int, default=0.1) 23 | self.add_argument('--max_len', type=int, default=256) 24 | self.add_argument('--batch_size', type=int, default=32) 25 | self.add_argument('--epochs', type=int, default=10) 26 | self.add_argument('--seed', type=int, default=1) 27 | self.add_argument('--lr', type=float, default=0.00003) 28 | self.add_argument('--warmup_ratio', type=float, default=0.1) 29 | 30 | def add_data_parameters(self): 31 | self.add_argument('--train_data', type=str, default='train.tsv') 32 | self.add_argument('--test_data', type=str, default='test.tsv') 33 | self.add_argument('--valid_data', type=str, default='valid.tsv') 34 | self.add_argument('--path_to_data', type=str, default='./data/') 35 | self.add_argument('--path_to_save', type=str, default='./output/') 36 | self.add_argument('--ckpt', type=str, default='best_ckpt.pt') 37 | 38 | def print_args(self, args): 39 | for idx, (key, value) in enumerate(args.__dict__.items()): 40 | if idx == 0:print("argparse{\n", "\t", key, ":", value) 41 | elif idx == len(args.__dict__) - 1:print("\t", key, ":", value, "\n}") 42 | else:print("\t", key, ":", value) 43 | 44 | def add_argument(self, *args, **kw_args): 45 | return self.parser.add_argument(*args, **kw_args) 46 | 47 | def parse(self): 48 | args = self.parser.parse_args() 49 | self.print_args(args) 50 | 51 | return args 52 | 53 | 54 | class Setting(): 55 | 56 | def set_logger(self): 57 | 58 | _logger = logging.getLogger() 59 | formatter = logging.Formatter( 60 | '[%(levelname)s] %(asctime)s [ %(message)s ] | file::%(filename)s | line::%(lineno)s') 61 | 62 | stream_handler = logging.StreamHandler() 63 | stream_handler.setFormatter(formatter) 64 | 65 | _logger.addHandler(stream_handler) 66 | _logger.setLevel(logging.DEBUG) 67 | 68 | return _logger 69 | 70 | def set_seed(self, args): 71 | 72 | seed = args.seed 73 | 74 | random.seed(seed) 75 | np.random.seed(seed) 76 | 77 | torch.manual_seed(seed) 78 | torch.backends.cudnn.deterministic = True 79 | torch.backends.cudnn.benchmark = False 80 | 81 | torch.cuda.manual_seed(seed) 82 | torch.cuda.manual_seed_all(seed) 83 | 84 | def run(self): 85 | 86 | parser = Arguments() 87 | parser.add_type_of_processing() 88 | parser.add_hyper_parameters() 89 | parser.add_data_parameters() 90 | 91 | args = parser.parse() 92 | logger = self.set_logger() 93 | self.set_seed(args) 94 | 95 | return args, logger 96 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | from model.setting import Setting, Arguments 4 | from model.KoBART.processor import Processor 5 | 6 | 7 | def main(args, logger) -> None: 8 | 9 | processor = Processor(args) 10 | config = processor.model_setting() 11 | logger.info('Model Setting Complete') 12 | 13 | if args.train == 'True': 14 | logger.info('Start Training') 15 | 16 | 
for epoch in range(args.epochs): 17 | start_time = time.time() 18 | 19 | train_loss = processor.train() 20 | valid_loss = processor.valid() 21 | 22 | end_time = time.time() 23 | epoch_mins, epoch_secs = processor.metric.cal_time(start_time, end_time) 24 | 25 | performance = {'tl': train_loss, 'vl': valid_loss, 26 | 'ep': epoch, 'epm': epoch_mins, 'eps': epoch_secs} 27 | 28 | processor.metric.save_model(config, performance, processor.model_checker) 29 | 30 | if processor.model_checker['early_stop']: 31 | logger.info('Early Stopping') 32 | break 33 | 34 | if args.test == 'True': 35 | logger.info("Start Test") 36 | 37 | rouge_score = processor.test() 38 | print(f'\n{rouge_score}') 39 | 40 | processor.metric.print_size_of_model(config['model']) 41 | 42 | 43 | if __name__ == '__main__': 44 | args, logger = Setting().run() 45 | main(args, logger) 46 | -------------------------------------------------------------------------------- /train_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES=1 python train.py --train True --test False --batch_size 14 --max_len 512 --lr 5e-05 --epochs 10 3 | CUDA_VISIBLE_DEVICES=1 python train.py --train False --test True --batch_size 14 --max_len 512 4 | --------------------------------------------------------------------------------