├── examples ├── cpp │ ├── gptj │ │ ├── bad_words.csv │ │ ├── stop_words.csv │ │ ├── start_ids.csv │ │ ├── gptj_config.ini │ │ └── CMakeLists.txt │ ├── gpt │ │ ├── start_ids.csv │ │ ├── CMakeLists.txt │ │ └── gpt_config.ini │ ├── multi_gpu_gpt │ │ ├── start_ids.csv │ │ ├── gpt_example_utils.h │ │ ├── CMakeLists.txt │ │ └── gpt_config.ini │ ├── bert_int8 │ │ └── CMakeLists.txt │ ├── vit │ │ └── CMakeLists.txt │ ├── vit_int8 │ │ └── CMakeLists.txt │ ├── bert │ │ └── CMakeLists.txt │ ├── decoding │ │ └── CMakeLists.txt │ ├── swin_int8 │ │ └── CMakeLists.txt │ ├── swin │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── xlnet │ │ └── CMakeLists.txt ├── pytorch │ ├── bert │ │ └── bert-quantization-sparsity │ │ │ ├── checkpoints │ │ │ └── .keep │ │ │ ├── processors │ │ │ └── __init__.py │ │ │ ├── apex_sparsity │ │ │ └── __init__.py │ │ │ ├── scripts │ │ │ ├── docker │ │ │ │ ├── build.sh │ │ │ │ └── launch.sh │ │ │ └── data_download.sh │ │ │ ├── images │ │ │ ├── model.png │ │ │ ├── nvlamb.png │ │ │ └── loss_curves.png │ │ │ ├── NOTICE │ │ │ ├── requirements.txt │ │ │ ├── bert_config.json │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── NVIDIAPretrainedWeightDownloader.py │ │ │ ├── BooksDownloader.py │ │ │ ├── BookscorpusTextFormatting.py │ │ │ └── GLUEDownloader.py │ │ │ ├── .dockerignore │ │ │ ├── Dockerfile │ │ │ └── utils.py │ ├── gpt │ │ ├── requirement.txt │ │ └── utils │ │ │ ├── generate_start_ids.py │ │ │ ├── gpt_token_converter.py │ │ │ └── parallel_gpt.py │ ├── swin │ │ ├── Swin-Transformer-Quantization │ │ │ ├── __init__.py │ │ │ ├── run.sh │ │ │ ├── calib.sh │ │ │ └── qat.sh │ │ ├── run_test.sh │ │ ├── run_test_int8.sh │ │ └── run_test_int8_accuracy.sh │ ├── vit │ │ ├── requirement.txt │ │ ├── run.sh │ │ ├── run2.sh │ │ └── ViT-quantization │ │ │ ├── calib.sh │ │ │ └── qat.sh │ ├── t5 │ │ └── requirement.txt │ ├── requirement.txt │ ├── utils.py │ ├── codegeex │ │ └── utils │ │ │ └── tokenizer │ │ │ ├── special_tokens_map.json │ │ │ └── tokenizer_config.json │ └── decoding │ │ └── utils │ │ ├── __init__.py │ │ ├── download_model.sh │ │ └── recover_bpe.py ├── tensorflow │ ├── requirement.txt │ ├── bert │ │ ├── bert-quantization │ │ │ ├── NOTICE │ │ │ ├── __init__.py │ │ │ ├── .dockerignore │ │ │ ├── ft-tensorflow-quantization │ │ │ │ ├── ft_tensorflow_quantization │ │ │ │ │ ├── python │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── calib │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ ├── ops │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ └── layers │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── __init__.py │ │ │ │ └── setup.py │ │ │ ├── Dockerfile │ │ │ ├── CONTRIBUTING.md │ │ │ ├── fp16_utils.py │ │ │ └── gpu_environment.py │ │ ├── utils │ │ │ └── __init__.py │ │ └── tensorflow_bert │ │ │ └── __init__.py │ ├── xlnet │ │ └── downloadModel.sh │ ├── decoding │ │ └── utils │ │ │ ├── translation │ │ │ └── download_model_data.sh │ │ │ └── bleu_score.py │ └── gpt │ │ └── utils │ │ └── gpt_token_converter.py ├── tensorrt │ ├── swin │ │ ├── run_builder_fp16.sh │ │ ├── run_builder_fp32.sh │ │ ├── run_builder_int8.sh │ │ ├── run_infer_fp32.sh │ │ ├── run_infer_fp16.sh │ │ └── run_infer_int8.sh │ └── t5 │ │ └── createT5TestData.py └── CMakeLists.txt ├── .gitignore ├── docs └── images │ └── inference_performance.png ├── make_all.sh ├── src ├── CMakeLists.txt └── fastertransformer │ ├── tensorrt_plugin │ ├── t5 │ │ ├── T5PluginGemm.h │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ ├── vit │ │ └── CMakeLists.txt │ └── swin │ │ └── CMakeLists.txt │ ├── utils │ ├── 
cuda_bf16_wrapper.h │ ├── word_list.h │ ├── nvtx_utils.cc │ ├── convert_data_type.h │ ├── gemm_test │ │ ├── swin_gemm_func.h │ │ ├── encoder_gemm_func.h │ │ ├── xlnet_gemm_func.h │ │ └── swin_igemm_func.h │ ├── word_list.cc │ ├── mpi_utils.h │ └── memory_utils.h │ ├── th_op │ ├── gpt │ │ └── CMakeLists.txt │ ├── encoder │ │ └── CMakeLists.txt │ ├── decoder │ │ └── CMakeLists.txt │ ├── codegeex │ │ └── CMakeLists.txt │ ├── longformer │ │ └── CMakeLists.txt │ ├── t5 │ │ └── CMakeLists.txt │ ├── bert │ │ └── CMakeLists.txt │ ├── multi_gpu_gpt │ │ ├── CMakeLists.txt │ │ └── WeightTransposeCalibrateQuantizeOp.h │ ├── multi_gpu_codegeex │ │ ├── CMakeLists.txt │ │ └── WeightTransposeCalibrateQuantizeOp.h │ ├── decoding │ │ ├── CMakeLists.txt │ │ └── GatherTreeOp.h │ ├── vit │ │ └── CMakeLists.txt │ ├── th_traits.h │ ├── swin │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── th_utils.cu │ ├── tf_op │ ├── gpt │ │ └── CMakeLists.txt │ ├── decoding │ │ └── CMakeLists.txt │ ├── encoder │ │ └── CMakeLists.txt │ ├── bert │ │ └── CMakeLists.txt │ ├── decoder │ │ └── CMakeLists.txt │ └── CMakeLists.txt │ ├── triton_backend │ ├── CMakeLists.txt │ ├── t5 │ │ └── CMakeLists.txt │ ├── gptj │ │ └── CMakeLists.txt │ ├── multi_gpu_gpt │ │ └── CMakeLists.txt │ └── triton_utils.hpp │ ├── layers │ ├── FfnWeight.h │ ├── FfnINT8Weight.h │ ├── xlnet_attention_layers │ │ ├── CMakeLists.txt │ │ └── XlnetAttentionWeight.h │ ├── attention_layers │ │ └── AttentionWeight.h │ ├── attention_layers_int8 │ │ └── AttentionINT8Weight.h │ ├── DenseWeight.h │ ├── beam_search_layers │ │ └── CMakeLists.txt │ ├── DynamicDecodeBaseLayer.h │ └── sampling_layers │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ ├── kernels │ ├── transform_mask_kernels.h │ ├── int8_utils.cuh │ ├── matrix_transpose_kernels.h │ ├── quantization_int8_kernels.h │ ├── stop_criteria_kernels.h │ ├── quantize_weight.h │ ├── gen_relative_pos_bias.h │ ├── dequantize_kernels.h │ ├── activation_kernels.h │ ├── calibrate_quantize_weight_kernels.h │ ├── ban_bad_words.h │ ├── logprob_kernels.h │ ├── online_softmax_beamsearch_kernels.h │ ├── xlnet_preprocess_kernels.h │ ├── layout_transformer_int8_kernels.h │ ├── add_bias_transpose_kernels.h │ ├── reverse_roll_kernels.h │ ├── vit_kernels.h │ ├── beam_search_topk_kernels.h │ ├── custom_ar_kernels.h │ └── beam_search_penalty_kernels.h │ └── models │ ├── decoder │ └── CMakeLists.txt │ ├── CMakeLists.txt │ ├── longformer │ └── CMakeLists.txt │ ├── xlnet │ └── CMakeLists.txt │ ├── decoding │ └── CMakeLists.txt │ ├── bert │ └── CMakeLists.txt │ ├── vit_int8 │ └── CMakeLists.txt │ ├── vit │ └── CMakeLists.txt │ ├── bert_int8 │ └── CMakeLists.txt │ ├── swin_int8 │ └── CMakeLists.txt │ └── swin │ ├── CMakeLists.txt │ └── SwinWeight.h ├── tests ├── CMakeLists.txt └── unittests │ └── CMakeLists.txt ├── .gitmodules ├── 3rdparty ├── CMakeLists.txt └── trt_fused_multihead_attention │ ├── fused_multihead_attention_common.h │ └── CMakeLists.txt ├── post.py ├── README.md └── cmake └── FasterTransformerConfig.cmake.in /examples/cpp/gptj/bad_words.csv: -------------------------------------------------------------------------------- 1 | 7768,3908 2 | 1,2 3 | -------------------------------------------------------------------------------- /examples/cpp/gptj/stop_words.csv: -------------------------------------------------------------------------------- 1 | 287, 4346, 12 2 | 3, -1, -1 3 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/checkpoints/.keep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/processors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/pytorch/gpt/requirement.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | fire 3 | rouge_score 4 | transformers -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from SwinTransformer.config import get_config -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *build*/ 4 | ./models/ 5 | __pycache__/ 6 | .vscode 7 | ./translation 8 | .cache 9 | -------------------------------------------------------------------------------- /examples/pytorch/vit/requirement.txt: -------------------------------------------------------------------------------- 1 | ml_collections 2 | pytorch-quantization 3 | timm==0.4.12 4 | termcolor==1.1.0 5 | yacs -------------------------------------------------------------------------------- /examples/pytorch/t5/requirement.txt: -------------------------------------------------------------------------------- 1 | transformers==4.10.0 2 | tokenizers==0.10.1 3 | omegaconf 4 | SentencePiece 5 | sacrebleu 6 | -------------------------------------------------------------------------------- /examples/tensorflow/requirement.txt: -------------------------------------------------------------------------------- 1 | fire>=0.1.3 2 | regex==2017.4.5 3 | requests==2.21.0 4 | tqdm==4.31.1 5 | opennmt-tf==1.25.1 # for tf -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/apex_sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_masklib import create_mask 2 | from .asp import ASP 3 | -------------------------------------------------------------------------------- /docs/images/inference_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/docs/images/inference_performance.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build --network=host . 
--rm --pull --no-cache -t bert 3 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/examples/pytorch/bert/bert-quantization-sparsity/images/model.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/images/nvlamb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/examples/pytorch/bert/bert-quantization-sparsity/images/nvlamb.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/images/loss_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/examples/pytorch/bert/bert-quantization-sparsity/images/loss_curves.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/NOTICE: -------------------------------------------------------------------------------- 1 | BERT PyTorch 2 | 3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT 4 | licensed under the Apache License 2.0. 5 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/NOTICE: -------------------------------------------------------------------------------- 1 | BERT TensorFlow 2 | 3 | This repository includes software from https://github.com/google-research/bert 4 | licensed under the Apache License, Version 2.0 (the "License") -------------------------------------------------------------------------------- /examples/pytorch/requirement.txt: -------------------------------------------------------------------------------- 1 | fire>=0.1.3 2 | regex==2017.4.5 3 | requests==2.21.0 4 | tqdm==4.31.1 5 | opennmt-py==1.1.1 # for pytorch 6 | transformers==2.5.1 # for pytorch 7 | ml_collections # for pytorch 8 | sacrebleu # for pytorch -------------------------------------------------------------------------------- /examples/pytorch/vit/run.sh: -------------------------------------------------------------------------------- 1 | python infer_visiontransformer_int8_op.py \ 2 | --th-path=../../../build/lib/libpyt_vit.so \ 3 | --calibrated_dir /workspace/checkpoint/ViT-B_16_ft1_99.999_82.846.pth \ 4 | --img_size 384 \ 5 | --quant-mode ft1 6 | -------------------------------------------------------------------------------- /examples/pytorch/vit/run2.sh: -------------------------------------------------------------------------------- 1 | python infer_visiontransformer_int8_op.py \ 2 | --th-path=../../../build/lib/libpyt_vit.so \ 3 | --calibrated_dir /workspace/checkpoint/ViT-B_16_ft2_99.99_81.948.pth \ 4 | --img_size 384 \ 5 | --quant-mode ft2 6 | -------------------------------------------------------------------------------- /make_all.sh: -------------------------------------------------------------------------------- 1 | cd /workspace/FasterTransformer/ 2 | mkdir build 3 | cd build 4 | # Change DSM to the corresponding version of GPUs (e.g.
80 for A100, RTX 3090; 75 for RTX TITAN) 5 | cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. 6 | make -j12 7 | cd .. 8 | ./build/bin/codegeex_gemm 1 1 32 64 64 16348 50048 1 1 -------------------------------------------------------------------------------- /examples/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def print_memory_usage(info=""): 4 | t = torch.cuda.get_device_properties(0).total_memory / 1024**2 5 | r = torch.cuda.memory_reserved(0) / 1024**2 6 | a = torch.cuda.memory_allocated(0) / 1024**2 7 | f = r-a # free inside reserved 8 | print(f"[INFO][{info}] total_memory: {t}, reserved: {r}, allocated: {a}") 9 | -------------------------------------------------------------------------------- /examples/cpp/gpt/start_ids.csv: -------------------------------------------------------------------------------- 1 | 818, 262, 938, 3155, 286, 1528, 11, 257 2 | 198, 464, 968, 8221, 2732, 286, 15198, 318 3 | 464, 968, 1971, 12056, 423, 257, 649, 1182 4 | 464, 968, 1971, 3782, 468, 3199, 663, 5079 5 | 818, 257, 1445, 326, 481, 1884, 787, 340 6 | 464, 968, 1971, 12056, 6, 5859, 41683, 423 7 | 198, 198, 464, 5398, 4332, 628, 628, 198 8 | 464, 717, 640, 314, 2497, 262, 3807, 11 9 | -------------------------------------------------------------------------------- /examples/cpp/gptj/start_ids.csv: -------------------------------------------------------------------------------- 1 | 818, 262, 938, 3155, 286, 1528, 11, 257 2 | 198, 464, 968, 8221, 2732, 286, 15198, 318 3 | 464, 968, 1971, 12056, 423, 257, 649, 1182 4 | 464, 968, 1971, 3782, 468, 3199, 663, 5079 5 | 818, 257, 1445, 326, 481, 1884, 787, 340 6 | 464, 968, 1971, 12056, 6, 5859, 41683, 423 7 | 198, 198, 464, 5398, 4332, 628, 628, 198 8 | 464, 717, 640, 314, 2497, 262, 3807, 11 9 | -------------------------------------------------------------------------------- /examples/pytorch/swin/run_test.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_op.py \ 2 | --eval \ 3 | --data-path /workspace \ 4 | --cfg Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --batch-size $1 8 | -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/start_ids.csv: -------------------------------------------------------------------------------- 1 | 818, 262, 938, 3155, 286, 1528, 11, 257 2 | 198, 464, 968, 8221, 2732, 286, 15198, 318 3 | 464, 968, 1971, 12056, 423, 257, 649, 1182 4 | 464, 968, 1971, 3782, 468, 3199, 663, 5079 5 | 818, 257, 1445, 326, 481, 1884, 787, 340 6 | 464, 968, 1971, 12056, 6, 5859, 41683, 423 7 | 198, 198, 464, 5398, 4332, 628, 628, 198 8 | 464, 717, 640, 314, 2497, 262, 3807, 11 9 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | tqdm 3 | # Accessing files from S3 directly.
4 | boto3 5 | # Used for downloading models over HTTP 6 | requests 7 | six 8 | ipdb 9 | #Data processing 10 | h5py 11 | html2text 12 | nltk 13 | progressbar 14 | #Others 15 | onnxruntime 16 | git+https://github.com/NVIDIA/dllogger 17 | -------------------------------------------------------------------------------- /examples/pytorch/codegeex/utils/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}} 2 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_builder_fp16.sh: -------------------------------------------------------------------------------- 1 | python builder_fp16.py \ 2 | --batch-size 32 \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --output swin_transformer_fp16.engine 7 | 8 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_builder_fp32.sh: -------------------------------------------------------------------------------- 1 | python builder_fp32.py \ 2 | --batch-size 32 \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --output swin_transformer_fp32.engine 7 | 8 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_builder_int8.sh: -------------------------------------------------------------------------------- 1 | python builder_int8.py \ 2 | --batch-size 32 \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --output swin_transformer_int8.engine -------------------------------------------------------------------------------- /examples/pytorch/swin/run_test_int8.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_int8_op.py \ 2 | --profile \ 3 | --data-path /workspace \ 4 | --cfg 
Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --int8-mode 1\ 8 | --batch-size $1 9 | -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/run.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node 1 \ 3 | --master_port 12346 main.py \ 4 | --eval \ 5 | --cfg SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 6 | --resume ./calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 7 | --data-path /data/datasets/ILSVRC2012/ \ 8 | --quant-mode ft2\ 9 | --int8-mode 1\ 10 | --batch-size 128 11 | -------------------------------------------------------------------------------- /examples/pytorch/swin/run_test_int8_accuracy.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_int8_op.py \ 2 | --eval \ 3 | --data-path /workspace \ 4 | --cfg Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --int8-mode 1\ 8 | --batch-size $1 9 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_infer_fp32.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_plugin.py \ 2 | --eval \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --engine swin_transformer_fp32.engine \ 7 | --batch-size $1 8 | -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/calib.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 \ 2 | --master_port 12345 main.py \ 3 | --calib \ 4 | --cfg SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume swin_tiny_patch4_window7_224.pth \ 6 | --data-path /data/datasets/ILSVRC2012 \ 7 | --num-calib-batch 10 \ 8 | --calib-batchsz 8\ 9 | --int8-mode 1\ 10 | --calib-output-path calib-checkpoint 11 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_infer_fp16.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_plugin.py \ 2 | --eval \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --engine swin_transformer_fp16.engine \ 7 | --batch-size $1 \ 8 | --use-fp16 9 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_infer_int8.sh: -------------------------------------------------------------------------------- 1 | python 
infer_swintransformer_plugin_int8.py \ 2 | --eval \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 5 | --int8-mode 1 \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --engine swin_transformer_int8.engine \ 8 | --batch-size $1 9 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/scripts/docker/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD=${1:-/bin/bash} 4 | NV_VISIBLE_DEVICES=${2:-"all"} 5 | DOCKER_BRIDGE=${3:-"host"} 6 | 7 | docker run -it --rm \ 8 | --gpus device=$NV_VISIBLE_DEVICES \ 9 | --net=$DOCKER_BRIDGE \ 10 | --shm-size=1g \ 11 | --ulimit memlock=-1 \ 12 | --ulimit stack=67108864 \ 13 | -e LD_LIBRARY_PATH='/workspace/install/lib/' \ 14 | -v $PWD:/workspace/bert \ 15 | -v $PWD/results:/results \ 16 | bert $CMD 17 | -------------------------------------------------------------------------------- /examples/pytorch/vit/ViT-quantization/calib.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 \ 2 | --master_port 12345 main.py \ 3 | --calib \ 4 | --name vit \ 5 | --pretrained_dir $CKPT_DIR/ViT-B_16.npz \ 6 | --data-path $DATA_DIR \ 7 | --model_type ViT-B_16 \ 8 | --img_size 384 \ 9 | --num-calib-batch 10 \ 10 | --calib-batchsz 8 \ 11 | --quant-mode ft2 \ 12 | --calibrator percentile \ 13 | --percentile 99.99 \ 14 | --calib-output-path calib-checkpoint 15 | -------------------------------------------------------------------------------- /examples/pytorch/vit/ViT-quantization/qat.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 \ 2 | --master_port 12345 main.py \ 3 | --train \ 4 | --name vit \ 5 | --pretrained_dir calib-checkpoint/ViT-B_16_calib.pth \ 6 | --data-path $DATA_DIR \ 7 | --model_type ViT-B_16 \ 8 | --quant-mode ft2 \ 9 | --img_size 384 \ 10 | --distill \ 11 | --teacher $CKPT_DIR/ViT-B_16.npz \ 12 | --output qat_output \ 13 | --quant-mode ft2\ 14 | --batch-size 16 \ 15 | --num-epochs 5 \ 16 | --qat-lr 1e-4 -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/qat.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node 1 --master_port 12346 main.py \ 3 | --train \ 4 | --cfg SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume ./calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 6 | --data-path /data/datasets/ILSVRC2012 \ 7 | --quant-mode ft2 \ 8 | --teacher swin_tiny_patch4_window7_224.pth \ 9 | --output qat-output \ 10 | --distill \ 11 | --int8-mode 1\ 12 | --batch-size 32\ 13 | --num-epochs 3 \ 14 | --qat-lr 1e-5 15 | -------------------------------------------------------------------------------- /examples/pytorch/codegeex/utils/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", 
"single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"} 2 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/tensorflow_bert/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /examples/pytorch/decoding/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(cpp) -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(fastertransformer) -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(unittests) 16 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/Megatron-LM"] 2 | path = 3rdparty/Megatron-LM 3 | url = https://github.com/NVIDIA/Megatron-LM.git 4 | branch = v2.6 5 | [submodule "examples/tensorflow/bert/tensorflow_bert/bert"] 6 | path = examples/tensorflow/bert/tensorflow_bert/bert 7 | url = https://github.com/google-research/bert.git 8 | [submodule "examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer"] 9 | path = examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer 10 | url = https://github.com/microsoft/Swin-Transformer 11 | [submodule "examples/pytorch/vit/ViT-quantization/ViT-pytorch"] 12 | path = examples/pytorch/vit/ViT-quantization/ViT-pytorch 13 | url = https://github.com/jeonsworld/ViT-pytorch 14 | -------------------------------------------------------------------------------- /3rdparty/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(trt_fused_multihead_attention) -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/t5/T5PluginGemm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | int t5_gemm(int argv[16]); 18 | -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(swin) 16 | add_subdirectory(t5) 17 | add_subdirectory(vit) 18 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/cuda_bf16_wrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #ifdef ENABLE_BF16 20 | #include 21 | #endif -------------------------------------------------------------------------------- /src/fastertransformer/th_op/gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_gpt SHARED GptOp.cc) 16 | target_link_libraries(th_gpt PRIVATE "${TORCH_LIBRARIES}" ParallelGpt th_utils) 17 | -------------------------------------------------------------------------------- /examples/cpp/gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(gpt_example gpt_example.cc) 16 | target_link_libraries(gpt_example PUBLIC -lcublas -lcublasLt -lcudart ParallelGpt nvtx_utils) 17 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/.dockerignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | data/download 15 | data/extracted 16 | data/formatted_one_article_per_line 17 | data/sharded 18 | data/hdf5 19 | vocab/ 20 | results/ -------------------------------------------------------------------------------- /src/fastertransformer/th_op/encoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_encoder SHARED EncoderOp.cc) 16 | target_link_libraries(th_encoder PRIVATE "${TORCH_LIBRARIES}" Bert th_utils) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_library(tf_gpt SHARED GptOp.cc) 16 | target_link_libraries(tf_gpt PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart ParallelGpt) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_decoder SHARED DecoderOp.cc) 16 | target_link_libraries(th_decoder PRIVATE "${TORCH_LIBRARIES}" Decoder th_utils) 17 | -------------------------------------------------------------------------------- /examples/cpp/bert_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(bert_int8_example bert_int8_example.cc) 16 | target_link_libraries(bert_int8_example PUBLIC -lcublas -lcublasLt -lcudart BertINT8) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_decoding SHARED DecodingOp.cc) 16 | target_link_libraries(tf_decoding PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart Decoding) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/codegeex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_codegeex SHARED CodegeexOp.cc) 16 | target_link_libraries(th_codegeex PRIVATE "${TORCH_LIBRARIES}" ParallelCodegeex th_utils) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/t5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(trt_t5 SHARED T5PluginGemm.cc T5Plugin.cu) 16 | target_link_libraries(trt_t5 PRIVATE T5Encoder T5Decoding t5_gemm_func -lnvinfer) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/encoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_encoder SHARED EncoderOp.cc) 16 | target_link_libraries(tf_encoder PRIVATE Bert ${tf_link} -lcublas -lcublasLt -lcudart cublasAlgoMap) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/longformer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_longformer SHARED LongformerEncoderOp.cc) 16 | target_link_libraries(th_longformer PRIVATE "${TORCH_LIBRARIES}" LongformerEncoder th_utils) -------------------------------------------------------------------------------- /src/fastertransformer/th_op/t5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_t5 SHARED T5EncoderOp.cc T5DecoderOp.cc T5DecodingOp.cc) 16 | target_link_libraries(th_t5 PRIVATE "${TORCH_LIBRARIES}" T5Encoder T5Decoder T5Decoding th_utils) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_bert SHARED BertOp.cc BertINT8Op.cc WeightQuantizeOp.cc) 16 | target_link_libraries(th_bert PRIVATE "${TORCH_LIBRARIES}" Bert BertINT8 th_utils quantize_weight) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | if(BUILD_MULTI_GPU) 18 | add_subdirectory(gptj) 19 | add_subdirectory(t5) 20 | add_subdirectory(multi_gpu_gpt) 21 | endif() -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_bert SHARED BertOp.cc BertINT8Op.cc weight_quantize_op.cc) 16 | target_link_libraries(tf_bert PRIVATE Bert BertINT8 ${tf_link} -lcublas -lcublasLt -lcudart cublasAlgoMap) 17 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/.dockerignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | .idea/ 15 | .git/ 16 | __pycache__/ 17 | results/ 18 | data/binary 19 | data/download 20 | data/extracted 21 | data/formatted_one_article_per_line 22 | data/sharded 23 | data/hdf5* 24 | data/tfrecord* 25 | checkpoints/ 26 | -------------------------------------------------------------------------------- /examples/cpp/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | add_executable(vit_example vit_example.cc) 17 | target_link_libraries(vit_example PUBLIC ViT trt_fused_multi_head_attention vit_kernels 18 | cublasMMWrapper -lcublas -lcublasLt -lcudart -lcudnn -lm) 19 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_parallel_gpt SHARED ParallelGptOp.cc WeightTransposeCalibrateQuantizeOp.cc) 16 | target_link_libraries(th_parallel_gpt PRIVATE "${TORCH_LIBRARIES}" ParallelGpt th_utils calibrate_quantize_weight_kernels) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/word_list.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "Tensor.h" 20 | #include "stdlib.h" 21 | 22 | namespace fastertransformer { 23 | 24 | int read_word_list(const std::string& filename, std::vector<int>& tensor_data); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/calib/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/ops/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/layers/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/cpp/gptj/gptj_config.ini: -------------------------------------------------------------------------------- 1 | [ft_instance_hyperparameter] 2 | max_batch_size=8 ; Use for allocate the buffer 3 | max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter 4 | beam_width=1 ; beam width for beam search 5 | top_k=0 ; k value for top k sampling 6 | top_p=0.5 ; p value for top p sampling 7 | temperature=1.0 ; Use for sampling 8 | repetition_penalty=2.0 ; Use for sampling 9 | len_penalty=1.0 10 | beam_search_diversity_rate=0.0 11 | is_half=0 12 | enable_custom_all_reduce=0 13 | 14 | tensor_para_size=8 15 | pipeline_para_size=1 16 | 17 | model_name=gptj_6B 18 | model_dir=../models/j6b_ckpt/ 19 | 20 | [request] 21 | request_batch_size=8 # determine by the request 22 | request_output_len=32 # determine by the request 23 | 24 | [gptj_6B] 25 | head_num=16 26 | size_per_head=256 27 | vocab_size=50400 28 | decoder_layers=28 29 | rotary_embedding=64 30 | start_id=50256 31 | end_id=50256 32 | inter_size=16384 33 | -------------------------------------------------------------------------------- /examples/cpp/vit_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | add_executable(vit_int8_example vit_int8_example.cc) 17 | target_link_libraries(vit_int8_example PUBLIC ViTINT8 trt_fused_multi_head_attention vit_kernels 18 | cublasMMWrapper -lcublas -lcublasLt -lcudart -lcudnn -lm) 19 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_codegeex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_parallel_codegeex SHARED ParallelCodegeexOp.cc WeightTransposeCalibrateQuantizeOp.cc) 16 | target_link_libraries(th_parallel_codegeex PRIVATE "${TORCH_LIBRARIES}" ParallelCodegeex th_utils calibrate_quantize_weight_kernels) 17 | -------------------------------------------------------------------------------- /examples/cpp/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(bert_example bert_example.cc) 16 | if (SPARSITY_SUPPORT) 17 | target_link_libraries(bert_example PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt Bert) 18 | else() 19 | target_link_libraries(bert_example PUBLIC -lcublas -lcublasLt -lcudart Bert) 20 | endif() 21 | -------------------------------------------------------------------------------- /examples/cpp/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_executable(decoding_example decoding_example.cc) 16 | target_link_libraries(decoding_example PUBLIC -lcublasLt Decoding nvtx_utils) 17 | 18 | add_executable(layernorm_test layernorm_test.cc) 19 | target_link_libraries(layernorm_test PUBLIC -lcublasLt layernorm_kernels memory_utils) 20 | -------------------------------------------------------------------------------- /examples/cpp/swin_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | add_executable(swin_int8_example swin_int8_example.cc) 17 | target_link_libraries(swin_int8_example PUBLIC trt_fused_multi_head_attention SwinINT8 cublasAlgoMap 18 | cublasINT8MMWrapper quantize_weight memory_utils -lcublasLt -lcublas -lcudart -lcudnn) 19 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_decoding SHARED DecodingOp.cc) 16 | target_link_libraries(th_decoding PRIVATE "${TORCH_LIBRARIES}" Decoding th_utils) 17 | 18 | add_library(th_gather_tree SHARED GatherTreeOp.cc) 19 | target_link_libraries(th_gather_tree PRIVATE "${TORCH_LIBRARIES}" decoding_kernels th_utils) -------------------------------------------------------------------------------- /examples/cpp/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | set(swin_transformer_nv_files 17 | swin_example.cc 18 | ) 19 | add_executable(swin_example ${swin_transformer_nv_files}) 20 | target_link_libraries(swin_example PUBLIC trt_fused_multi_head_attention Swin 21 | cublasMMWrapper memory_utils -lcublas -lcublasLt -lcudart -lcudnn) 22 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/FfnWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "DenseWeight.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct FfnWeight { 25 | DenseWeight<T> intermediate_weight; 26 | DenseWeight<T> output_weight; 27 | }; 28 | 29 | } // namespace fastertransformer 30 | -------------------------------------------------------------------------------- /src/fastertransformer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(utils) 16 | add_subdirectory(kernels) 17 | add_subdirectory(layers) 18 | add_subdirectory(models) 19 | if(BUILD_TF) 20 | add_subdirectory(tf_op) 21 | endif() 22 | if(BUILD_PYT) 23 | add_subdirectory(th_op) 24 | endif() 25 | add_subdirectory(triton_backend) 26 | if(BUILD_TRT) 27 | add_subdirectory(tensorrt_plugin) 28 | endif() 29 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_decoder SHARED DecoderOp.cc) 16 | target_link_libraries(tf_decoder PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart Decoder) 17 | 18 | add_library(tf_fused_self_attention SHARED FusedSelfAttentionOp.cc) 19 | target_link_libraries(tf_fused_self_attention PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart DecoderSelfAttentionLayer) 20 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/FfnINT8Weight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "FfnWeight.h" 20 | #include "src/fastertransformer/utils/ScaleList.h" 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct FfnINT8Weight: FfnWeight<T> { 25 | ScaleList* scale_list_ptr; 26 | }; 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /examples/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(bert) 16 | add_subdirectory(bert_int8) 17 | add_subdirectory(decoding) 18 | add_subdirectory(gpt) 19 | add_subdirectory(xlnet) 20 | add_subdirectory(swin) 21 | add_subdirectory(swin_int8) 22 | add_subdirectory(vit) 23 | add_subdirectory(vit_int8) 24 | 25 | if(BUILD_MULTI_GPU) 26 | add_subdirectory(gptj) 27 | add_subdirectory(multi_gpu_gpt) 28 | endif() 29 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/t5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | set(t5_triton_backend_files 18 | T5TritonModel.cc 19 | T5TritonModelInstance.cc 20 | ) 21 | 22 | add_library(T5TritonBackend SHARED ${t5_triton_backend_files}) 23 | target_link_libraries(T5TritonBackend PRIVATE T5Encoder T5Decoding) 24 | target_compile_features(T5TritonBackend PRIVATE cxx_std_14) 25 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_gpt/WeightTransposeCalibrateQuantizeOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "src/fastertransformer/kernels/calibrate_quantize_weight_kernels.h" 18 | #include "src/fastertransformer/th_op/th_utils.h" 19 | 20 | namespace torch_ext { 21 | using torch::Tensor; 22 | 23 | std::vector<Tensor> weight_transpose_calibrate_quantize(Tensor weight); 24 | 25 | } // namespace torch_ext 26 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_codegeex/WeightTransposeCalibrateQuantizeOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #include "src/fastertransformer/kernels/calibrate_quantize_weight_kernels.h" 18 | #include "src/fastertransformer/th_op/th_utils.h" 19 | 20 | namespace torch_ext { 21 | using torch::Tensor; 22 | 23 | std::vector<Tensor> weight_transpose_calibrate_quantize(Tensor weight); 24 | 25 | } // namespace torch_ext 26 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/gptj/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | set(parallel_gpt_triton_backend_files 18 | GptJTritonModel.cc 19 | GptJTritonModelInstance.cc 20 | ) 21 | 22 | add_library(GptJTritonBackend SHARED ${parallel_gpt_triton_backend_files}) 23 | target_link_libraries(GptJTritonBackend PRIVATE GptJ) 24 | target_compile_features(GptJTritonBackend PRIVATE cxx_std_14) 25 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/transform_mask_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include <cuda_fp16.h> 19 | #include <cuda_runtime.h> 20 | #include <stdint.h> 21 | #include <stdlib.h> 22 | 23 | namespace fastertransformer { 24 | 25 | void invokeTransformMask( 26 | half* tranformed_mask, const half* mask, const uint32_t B, const uint32_t S, cudaStream_t stream); 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/decoding/GatherTreeOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "src/fastertransformer/kernels/decoding_kernels.h" 18 | #include "src/fastertransformer/th_op/th_utils.h" 19 | 20 | namespace th = torch; 21 | 22 | namespace torch_ext { 23 | 24 | th::Tensor 25 | gather_tree(th::Tensor step_ids, th::Tensor parent_ids, th::Tensor max_sequence_lengths, th::Tensor end_tokens); 26 | 27 | } // namespace torch_ext -------------------------------------------------------------------------------- /src/fastertransformer/kernels/int8_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include <cuda_fp16.h> 19 | #include <cuda_runtime.h> 20 | #include <stdint.h> 21 | 22 | static inline __device__ int8_t float_to_int8_rn(float x) 23 | { 24 | uint32_t dst; 25 | asm volatile("cvt.rni.sat.s8.f32 %0, %1;" 26 | : "=r"(dst) 27 | : "f"(x)); 28 | return reinterpret_cast<const int8_t&>(dst); 29 | } 30 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/matrix_transpose_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cuda_fp16.h> 20 | #include <cuda_runtime.h> 21 | #include <stdlib.h> 22 | 23 | namespace fastertransformer { 24 | 25 | template<typename T> 26 | void invokeMatrixTranspose(T* dst, const T* src, const int m, const int n, cudaStream_t stream); 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/quantization_int8_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include <cuda_fp16.h> 21 | #include <cuda_runtime.h> 22 | 23 | namespace fastertransformer { 24 | 25 | template<typename T> 26 | void invokeQuantization(int8_t* dst, const T* src, const int size, const float* scale_ptr, cudaStream_t stream); 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /examples/pytorch/decoding/utils/download_model.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MAIN_PATH=$PWD 17 | 18 | mkdir -p $MAIN_PATH/pytorch/translation/models/ 19 | 20 | cd $MAIN_PATH/pytorch/translation/models/ 21 | if [ ! -f "sentencepiece.model" ] || [ ! -f "averaged-10-epoch.pt" ]; then 22 | wget -c https://s3.amazonaws.com/opennmt-models/transformer-ende-wmt-pyOnmt.tar.gz 23 | tar -xzvf transformer-ende-wmt-pyOnmt.tar.gz 24 | rm transformer-ende-wmt-pyOnmt.tar.gz 25 | fi 26 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/multi_gpu_gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | set(parallel_gpt_triton_backend_files 18 | ParallelGptTritonModel.cc 19 | ParallelGptTritonModelInstance.cc 20 | ) 21 | 22 | add_library(ParallelGptTritonBackend SHARED ${parallel_gpt_triton_backend_files}) 23 | target_link_libraries(ParallelGptTritonBackend PRIVATE ParallelGpt) 24 | target_compile_features(ParallelGptTritonBackend PRIVATE cxx_std_14) -------------------------------------------------------------------------------- /src/fastertransformer/models/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Decoder STATIC Decoder.cc) 18 | set_property(TARGET Decoder PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Decoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Decoder PUBLIC -lcudart cublasMMWrapper DecoderSelfAttentionLayer 21 | DecoderCrossAttentionLayer FfnLayer layernorm_kernels add_residual_kernels) -------------------------------------------------------------------------------- /src/fastertransformer/layers/xlnet_attention_layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(XlnetAttentionLayer STATIC XlnetAttentionLayer.cc) 18 | set_property(TARGET XlnetAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET XlnetAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(XlnetAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils xlnet_attention_kernels) 21 | 22 | -------------------------------------------------------------------------------- /src/fastertransformer/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(bert) 16 | add_subdirectory(bert_int8) 17 | add_subdirectory(decoder) 18 | add_subdirectory(longformer) 19 | add_subdirectory(decoding) 20 | add_subdirectory(xlnet) 21 | 22 | add_subdirectory(t5) 23 | add_subdirectory(gptj) 24 | add_subdirectory(multi_gpu_gpt) 25 | add_subdirectory(multi_gpu_codegeex) 26 | add_subdirectory(swin) 27 | add_subdirectory(swin_int8) 28 | add_subdirectory(vit) 29 | add_subdirectory(vit_int8) 30 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/nvtx_utils.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "nvtx_utils.h" 18 | 19 | namespace nvtx { 20 | std::string getScope() 21 | { 22 | return scope; 23 | } 24 | void addScope(std::string name) 25 | { 26 | scope = scope + name + "/"; 27 | return; 28 | } 29 | void setScope(std::string name) 30 | { 31 | scope = name + "/"; 32 | return; 33 | } 34 | void resetScope() 35 | { 36 | scope = ""; 37 | return; 38 | } 39 | } // namespace nvtx 40 | -------------------------------------------------------------------------------- /examples/cpp/gptj/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_executable(gptj_example gptj_example.cc) 16 | target_link_libraries(gptj_example PUBLIC -lcublas -lcublasLt -lcudart 17 | GptJ nvtx_utils -lmpi gpt_example_utils word_list) 18 | 19 | add_executable(gptj_triton_example gptj_triton_example.cc) 20 | target_link_libraries(gptj_triton_example PUBLIC -lcublas -lcublasLt -lcudart 21 | GptJTritonBackend custom_ar_comm -lmpi gpt_example_utils word_list -lpthread) 22 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/attention_layers/AttentionWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/layers/DenseWeight.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct AttentionWeight { 25 | DenseWeight<T> query_weight; 26 | DenseWeight<T> key_weight; 27 | DenseWeight<T> value_weight; 28 | DenseWeight<T> attention_output_weight; 29 | }; 30 | 31 | } // namespace fastertransformer 32 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/NVIDIAPretrainedWeightDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import os 15 | 16 | class NVIDIAPretrainedWeightDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path + '/nvidia_pretrained_weights' 19 | 20 | if not os.path.exists(self.save_path): 21 | os.makedirs(self.save_path) 22 | 23 | pass 24 | 25 | 26 | def download(self): 27 | assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' -------------------------------------------------------------------------------- /src/fastertransformer/layers/attention_layers_int8/AttentionINT8Weight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" 20 | #include "src/fastertransformer/utils/ScaleList.h" 21 | 22 | namespace fastertransformer { 23 | 24 | template<typename T> 25 | struct AttentionINT8Weight: AttentionWeight<T> { 26 | ScaleList* scale_list_ptr; 27 | }; 28 | 29 | } // namespace fastertransformer 30 | -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | set(vit_trt_files 17 | ViTPlugin.cpp 18 | ) 19 | 20 | 21 | if(BUILD_TRT) 22 | set(LIB_NAME "vit_plugin") 23 | add_library(${LIB_NAME} SHARED ${vit_trt_files}) 24 | set_target_properties(${LIB_NAME} PROPERTIES 25 | CUDA_RESOLVE_DEVICE_SYMBOLS ON) 26 | target_link_libraries(${LIB_NAME} trt_fused_multi_head_attention ViT -lcudnn -lcublas -lcudart -lnvinfer) 27 | endif() 28 | -------------------------------------------------------------------------------- /examples/tensorflow/xlnet/downloadModel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | data_dir=./data/ 17 | if [[ ! 
-e $data_dir ]]; then 18 | mkdir $data_dir 19 | fi 20 | 21 | wget https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip 22 | unzip cased_L-12_H-768_A-12.zip 23 | mv xlnet_cased_L-12_H-768_A-12 ${data_dir} 24 | mv cased_L-12_H-768_A-12.zip ${data_dir} 25 | 26 | wget https://dl.fbaipublicfiles.com/glue/data/STS-B.zip 27 | unzip STS-B.zip 28 | mv STS-B ${data_dir} 29 | mv STS-B.zip ${data_dir} 30 | 31 | -------------------------------------------------------------------------------- /examples/cpp/xlnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(cnpy STATIC cnpy.cpp) 16 | target_link_libraries(cnpy PUBLIC -lz) 17 | set_property(TARGET cnpy PROPERTY POSITION_INDEPENDENT_CODE ON) 18 | 19 | add_executable(xlnet_example xlnet_example.cc) 20 | target_link_libraries(xlnet_example PUBLIC -lcublas -lcublasLt -lcudart -lz cnpy Xlnet) 21 | 22 | add_executable(xlnet_correctness_example xlnet_correctness_example.cc) 23 | target_link_libraries(xlnet_correctness_example PUBLIC -lcublas -lcublasLt -lcudart -lz cnpy Xlnet) 24 | 25 | -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/gpt_example_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <string> 18 | #include <vector> 19 | 20 | namespace fastertransformer { 21 | 22 | int read_start_ids(int batch_size, 23 | std::vector<int>* v_start_lengths, 24 | std::vector<int>* v_start_ids, 25 | int& max_input_len, 26 | const int end_id, 27 | const int beam_width, 28 | std::string file_name); 29 | 30 | } // namespace fastertransformer 31 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/DenseWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "stdlib.h" 19 | namespace fastertransformer { 20 | 21 | template<typename T> 22 | struct DenseWeight { 23 | const T* kernel = nullptr; 24 | const T* bias = nullptr; 25 | const T* sp_kernel = nullptr; 26 | // for int8 kernel 27 | const int8_t* int8_kernel = nullptr; 28 | const int8_t* int4_kernel = nullptr; 29 | const T* quant_scale = nullptr; 30 | const float* scale = nullptr; 31 | }; 32 | 33 | } // namespace fastertransformer -------------------------------------------------------------------------------- /src/fastertransformer/layers/xlnet_attention_layers/XlnetAttentionWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/layers/DenseWeight.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct XlnetAttentionWeight { 25 | T* attr_kernel_Q; 26 | T* attr_kernel_K; 27 | T* attr_kernel_V; 28 | T* attr_bias_Q_w; 29 | T* attr_bias_Q_r; 30 | T* attr_bias_Q_s; 31 | 32 | T* attr_pos_emb; 33 | T* attr_seg_embed; 34 | T* attr_proj_o; 35 | }; 36 | 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /src/fastertransformer/models/longformer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(LongformerEncoder STATIC LongformerEncoder.cc) 18 | set_property(TARGET LongformerEncoder PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET LongformerEncoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(LongformerEncoder PUBLIC -lcublas -lcudart -lcurand cublasMMWrapper 21 | LongformerAttentionLayer longformer_kernels add_bias_transpose_kernels 22 | activation_kernels layernorm_kernels FfnLayer) -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | set(swintransformer_trt_files 17 | swinTransformerPlugin.cpp 18 | swinTransformerINT8Plugin.cpp 19 | ) 20 | 21 | 22 | if(BUILD_TRT) 23 | set(LIB_NAME "swinTransformer_plugin") 24 | add_library(${LIB_NAME} SHARED ${swintransformer_trt_files}) 25 | set_target_properties(${LIB_NAME} PROPERTIES 26 | CUDA_RESOLVE_DEVICE_SYMBOLS ON) 27 | target_link_libraries(${LIB_NAME} trt_fused_multi_head_attention Swin SwinINT8 -lcudnn -lcublas -lcudart -lnvinfer) 28 | endif() 29 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.13) 15 | 16 | set(vit_ths_files 17 | ViTOp.cc 18 | ViTINT8Op.cc 19 | WeightQuantizeOp.cc 20 | ) 21 | 22 | add_definitions(-DTORCH_CUDA=1) 23 | 24 | if(BUILD_PYT) 25 | set(LIB_NAME "pyt_vit") 26 | add_library(${LIB_NAME} SHARED ${vit_ths_files}) 27 | set_target_properties(${LIB_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) 28 | target_link_libraries(${LIB_NAME} ViT ViTINT8 quantize_weight cublasMMWrapper trt_fused_multi_head_attention 29 | -lcudnn -lcublas -lcudart "${TORCH_LIBRARIES}") 30 | endif() 31 | -------------------------------------------------------------------------------- /src/fastertransformer/models/xlnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Xlnet STATIC Xlnet.cc) 18 | set_property(TARGET Xlnet PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Xlnet PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Xlnet PUBLIC -lcudart xlnet_preprocess_kernels cublasMMWrapper 21 | XlnetAttentionLayer FfnLayer layernorm_kernels) 22 | 23 | add_executable(xlnet_gemm xlnet_gemm.cc) 24 | target_link_libraries(xlnet_gemm PUBLIC -lcublas -lcublasLt -lcudart xlnet_gemm_func xlnet_gemm_func memory_utils) 25 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.03-tf1-py3 2 | 3 | FROM ${FROM_IMAGE_NAME} 4 | 5 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl 6 | 7 | RUN pip install toposort networkx pytest nltk tqdm html2text progressbar 8 | 9 | WORKDIR /workspace 10 | RUN git clone https://github.com/openai/gradient-checkpointing.git 11 | RUN git clone https://github.com/attardi/wikiextractor.git 12 | RUN git clone https://github.com/soskek/bookcorpus.git 13 | RUN git clone https://github.com/titipata/pubmed_parser 14 | 15 | RUN pip3 install /workspace/pubmed_parser 16 | 17 | #Copy the perf_client over 18 | ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/tensorrt-inference-server/releases/download/v1.5.0/v1.5.0_ubuntu1804.clients.tar.gz 19 | RUN mkdir -p /workspace/install \ 20 | && curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install 21 | 22 | #Install the python wheel with pip 23 | RUN pip install /workspace/install/python/tensorrtserver*.whl 24 | 25 | WORKDIR /workspace/bert 26 | COPY . . 
27 | 28 | ENV PYTHONPATH /workspace/bert 29 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data 30 | ENV PATH //workspace/install/bin:${PATH} 31 | ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH} -------------------------------------------------------------------------------- /src/fastertransformer/models/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Decoding STATIC Decoding.cc) 18 | set_property(TARGET Decoding PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Decoding PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Decoding PUBLIC -lcublas -lcudart -lcurand Decoder decoding_kernels 21 | BeamSearchLayer DynamicDecodeLayer) 22 | 23 | add_executable(decoding_gemm decoding_gemm.cc) 24 | target_link_libraries(decoding_gemm PUBLIC -lcublas -lcublasLt -lcudart decoding_gemm_func memory_utils) 25 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/convert_data_type.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "stdio.h" 19 | #include "stdlib.h" 20 | 21 | // be consistent with FasterTransformer 22 | int8_t float_to_int8_rn_host(float x) 23 | { 24 | int8_t res; 25 | int32_t tmp; 26 | if (x >= 0) { 27 | tmp = int(x + 0.5); 28 | tmp = tmp > 127 ? 127 : tmp; 29 | res = int8_t(tmp); 30 | } 31 | else { 32 | tmp = int(x - 0.5); 33 | tmp = tmp < -127 ? -127 : tmp; 34 | res = int8_t(tmp); 35 | } 36 | return res; 37 | } -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_definitions(-DGOOGLE_CUDA=1) 16 | 17 | if(EXISTS ${TF_PATH}libtensorflow_framework.so) 18 | set(tf_link 19 | -ltensorflow_framework 20 | ) 21 | elseif(EXISTS ${TF_PATH}libtensorflow_framework.so.1) 22 | set(tf_link 23 | -l:libtensorflow_framework.so.1 24 | ) 25 | elseif(EXISTS ${TF_PATH}libtensorflow_framework.so.2) 26 | set(tf_link 27 | -l:libtensorflow_framework.so.2 28 | ) 29 | endif() 30 | 31 | add_subdirectory(bert) 32 | add_subdirectory(encoder) 33 | add_subdirectory(decoder) 34 | add_subdirectory(decoding) 35 | add_subdirectory(gpt) -------------------------------------------------------------------------------- /3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | namespace fastertransformer 21 | { 22 | enum Data_type 23 | { 24 | DATA_TYPE_BOOL, 25 | DATA_TYPE_E8M10, 26 | DATA_TYPE_E8M7, 27 | DATA_TYPE_FP16, 28 | DATA_TYPE_FP32, 29 | DATA_TYPE_INT4, 30 | DATA_TYPE_INT8, 31 | DATA_TYPE_INT32 32 | }; 33 | 34 | constexpr int32_t kSM_70 = 70; 35 | constexpr int32_t kSM_72 = 72; 36 | constexpr int32_t kSM_75 = 75; 37 | constexpr int32_t kSM_80 = 80; 38 | constexpr int32_t kSM_86 = 86; 39 | } // namespace fastertransformer 40 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/BooksDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import subprocess 15 | 16 | class BooksDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path 19 | pass 20 | 21 | 22 | def download(self): 23 | bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' 24 | bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' 25 | bookscorpus_download_command += ' --trash-bad-count' 26 | bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) 27 | -------------------------------------------------------------------------------- /post.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import requests 4 | 5 | url = 'http://0.0.0.0:5000/code' 6 | 7 | headers = { 8 | "Content-Type": "application/json; charset=UTF-8", 9 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36", 10 | } 11 | 12 | sentence = "# language: python\n# write a quick sort function\ndef" 13 | 14 | results=[] 15 | for i in range(1): 16 | data = json.dumps({'ability': 'seo_article_creation', 17 | 'context': sentence, 18 | 'temperature':1.0 , 19 | 'top_k': 1, 20 | 'top_p': 0.0, 21 | 'max_seq_len': 256, 22 | 'len_penalty': 1.0, 23 | 'repetition_penalty': 1.0, 24 | 'presence_penalty': 1.0, 25 | 'frequency_penalty': 1.0, 26 | 'end_tokens': [], 27 | }) 28 | time1=time.time() 29 | r = requests.post(url, data, headers=headers) 30 | time2=time.time() 31 | print("time used",time2-time1) 32 | print(r.json()['generated']) 33 | rdict=json.loads(r.text) 34 | result={"sentence":sentence,"result":rdict['generated']} 35 | results.append(result) -------------------------------------------------------------------------------- /src/fastertransformer/models/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Bert STATIC Bert.cc) 18 | set_property(TARGET Bert PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Bert PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Bert PUBLIC -lcudart bert_preprocess_kernels cublasMMWrapper 21 | UnfusedAttentionLayer FusedAttentionLayer FfnLayer layernorm_kernels 22 | add_residual_kernels) 23 | 24 | add_executable(bert_gemm bert_gemm.cc) 25 | target_link_libraries(bert_gemm PUBLIC -lcublas -lcublasLt -lcudart encoder_gemm_func encoder_igemm_func memory_utils) -------------------------------------------------------------------------------- /src/fastertransformer/models/vit_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(ViTINT8 STATIC ViTINT8.cc) 18 | set_property(TARGET ViTINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET ViTINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(ViTINT8 PUBLIC -lcudart -lcublasLt -lcublas cublasINT8MMWrapper 21 | UnfusedAttentionLayerINT8 FusedAttentionLayerINT8 FfnLayerINT8 layernorm_kernels 22 | layernorm_int8_kernels add_residual_kernels activation_kernels layout_transformer_int8_kernels 23 | vit_kernels bert_preprocess_kernels) 24 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/th_traits.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #ifndef TORCH_TRAITS_H_ 19 | #define TORCH_TRAITS_H_ 20 | 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include <cuda_fp16.h> 23 | 24 | using namespace fastertransformer; 25 | namespace torch_ext { 26 | template<typename T> 27 | class THTraits; 28 | 29 | template<> 30 | class THTraits<float> { 31 | public: 32 | static const OperationType OpType = OperationType::FP32; 33 | }; 34 | 35 | template<> 36 | class THTraits<half> { 37 | public: 38 | static const OperationType OpType = OperationType::FP16; 39 | }; 40 | 41 | } // namespace torch_ext 42 | #endif 43 | -------------------------------------------------------------------------------- /3rdparty/trt_fused_multihead_attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | cmake_minimum_required(VERSION 3.8) 17 | 18 | set(trt_fused_multi_head_attention_files 19 | cudaDriverWrapper.cpp 20 | qkvToContext.cu 21 | ) 22 | 23 | file(GLOB trt_fused_multi_head_attention_files ${trt_fused_multi_head_attention_files} *.sm*.cpp) 24 | 25 | add_library(trt_fused_multi_head_attention STATIC ${trt_fused_multi_head_attention_files}) 26 | target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) 27 | set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) 28 | set_property(TARGET trt_fused_multi_head_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 29 | -------------------------------------------------------------------------------- /examples/tensorflow/decoding/utils/translation/download_model_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Install the OpenNMT-tf v1 16 | pip install opennmt-tf==1.25.1 17 | 18 | # Download the vocabulary and test data 19 | # wget https://s3.amazonaws.com/opennmt-trainingdata/wmt_ende_sp.tar.gz 20 | 21 | # Download the pretrained model 22 | wget --progress=dot:giga https://s3.amazonaws.com/opennmt-models/averaged-ende-ckpt500k.tar.gz 23 | 24 | mkdir ../translation 25 | mkdir ../translation/ckpt 26 | tar xf averaged-ende-ckpt500k.tar.gz -C ../translation/ckpt 27 | rm averaged-ende-ckpt500k.tar.gz 28 | 29 | # convert the pretrained model to fit our model structure 30 | # python tensorflow/utils/dump_model.py translation/ckpt/model.ckpt-500000 31 | -------------------------------------------------------------------------------- /src/fastertransformer/models/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(ViT STATIC ViT.cc) 18 | set_property(TARGET ViT PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET ViT PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(ViT PUBLIC -lcudart -lcublasLt -lcublas cublasMMWrapper 21 | UnfusedAttentionLayer FusedAttentionLayer FfnLayer layernorm_kernels 22 | add_residual_kernels activation_kernels vit_kernels bert_preprocess_kernels) 23 | 24 | add_executable(vit_gemm vit_gemm.cc) 25 | target_link_libraries(vit_gemm PUBLIC -lcublas -lcublasLt -lcudart encoder_gemm_func encoder_igemm_func memory_utils) -------------------------------------------------------------------------------- /tests/unittests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(test_gemm test_gemm.cu) 16 | target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper) 17 | 18 | add_executable(test_sampling test_sampling.cu) 19 | target_link_libraries(test_sampling PUBLIC 20 | -lcublas -lcublasLt -lcudart 21 | cublasMMWrapper memory_utils 22 | DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer TopKTopPSamplingLayer) 23 | 24 | add_executable(test_logprob_kernels test_logprob_kernels.cu) 25 | target_link_libraries(test_logprob_kernels PUBLIC 26 | -lcublas -lcublasLt -lcudart 27 | logprob_kernels memory_utils) 28 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.13) 15 | 16 | set(swintransformer_ths_files 17 | SwinOp.cc 18 | SwinINT8Op.cc 19 | WeightQuantizeOp.cc 20 | ) 21 | 22 | add_definitions(-DTORCH_CUDA=1) 23 | 24 | if(BUILD_PYT) 25 | set(LIB_NAME "pyt_swintransformer") 26 | add_library(${LIB_NAME} SHARED ${swintransformer_ths_files}) 27 | set_target_properties(${LIB_NAME} PROPERTIES 28 | CUDA_RESOLVE_DEVICE_SYMBOLS ON) 29 | target_link_libraries(${LIB_NAME} "${TORCH_LIBRARIES}" Swin SwinINT8 30 | cublasINT8MMWrapper cublasAlgoMap trt_fused_multi_head_attention 31 | gen_relative_pos_bias quantize_weight -lcudnn -lcublas -lcudart) 32 | endif() 33 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/setup.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | """Setup script""" 19 | 20 | from setuptools import setup, find_packages 21 | 22 | setup(name="TensorFlow_FastTransformer_Quantization", 23 | package=["ft_tensorflow_quantization"], 24 | package_dir={'ft_tensorflow_quantization': 'ft_tensorflow_quantization'}, 25 | version="0.1.0", 26 | description="TensorFlow FasterTransformer Quantization", 27 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 28 | zip_safe=False) 29 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | """TensorFlow Quantization""" 19 | 20 | from ft_tensorflow_quantization.python.ops.fake_quantize import * 21 | 22 | from ft_tensorflow_quantization.python.layers.tensor_quantizer import * 23 | from ft_tensorflow_quantization.python.layers.dense import * 24 | 25 | from ft_tensorflow_quantization.python.calib.max import * 26 | from ft_tensorflow_quantization.python.calib.histogram import * 27 | from ft_tensorflow_quantization.python.calib.calibrator import * 28 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/stop_criteria_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | 20 | namespace fastertransformer { 21 | 22 | void invokeStopWordsCriterion(const int* output_ids, 23 | const int* parent_ids, 24 | const int* stop_words, 25 | bool* finished, 26 | size_t id_offset, 27 | size_t stop_words_len, 28 | int batch_size, 29 | int beam_width, 30 | int step, 31 | cudaStream_t stream); 32 | 33 | } // namespace fastertransformer 34 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_definitions(-DTORCH_CUDA=1) 16 | 17 | add_library(th_utils STATIC th_utils.cu) 18 | set_property(TARGET th_utils PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET th_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(th_utils PUBLIC "${TORCH_LIBRARIES}" -lcublas -lcudart -lcurand) 21 | 22 | add_subdirectory(bert) 23 | add_subdirectory(encoder) 24 | add_subdirectory(decoder) 25 | add_subdirectory(decoding) 26 | add_subdirectory(gpt) 27 | add_subdirectory(codegeex) 28 | add_subdirectory(longformer) 29 | add_subdirectory(swin) 30 | add_subdirectory(vit) 31 | 32 | if(BUILD_MULTI_GPU) 33 | add_subdirectory(multi_gpu_gpt) 34 | add_subdirectory(multi_gpu_codegeex) 35 | add_subdirectory(t5) 36 | endif() 37 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/quantize_weight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | // format: 26 | // 0: row major 27 | // 1: CUBLASLT_ORDER_COL32_2R_4R4 28 | // 2: CUBLASLT_ORDER_COL4_4R2_8C 29 | template 30 | void invokeQuantizeWeight(int8_t* dst, 31 | const T* src, 32 | const float* amax, 33 | const int n, 34 | const int k, 35 | const int format, 36 | cudaStream_t stream, 37 | const int scale_is_vector = 1); 38 | 39 | } // namespace fastertransformer 40 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/gen_relative_pos_bias.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | enum class PositionEmbeddingType { 26 | relative, 27 | absolute, 28 | }; 29 | 30 | template 31 | void invokeGenRelativePosBias(T* relative_position_bias, 32 | const T* relative_position_bias_table, 33 | const Tindex* relative_position_bias_index, 34 | const int window_size, 35 | const int head_num, 36 | cudaStream_t stream); 37 | 38 | } // namespace fastertransformer 39 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/dequantize_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "src/fastertransformer/utils/cuda_utils.h" 19 | #include 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | template 26 | void invokeDequantization(T* dst, const int8_t* src, const int size, const float* scale_ptr, cudaStream_t stream); 27 | 28 | template 29 | void invokeDequantization_INT32(T* dst, 30 | const int32_t* src, 31 | const int size, 32 | cudaStream_t stream, 33 | const float* input_amax_ptr, 34 | const float* weight_amax_ptr); 35 | 36 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | re-factoring and documentation. To submit contributes, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 
27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 32 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/swin_gemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include "src/fastertransformer/utils/gemm_test/gemm_func.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace fastertransformer { 35 | 36 | template 37 | void generate_swin_gemm_config( 38 | int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); 39 | 40 | } // namespace fastertransformer 41 | -------------------------------------------------------------------------------- /examples/tensorrt/t5/createT5TestData.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import numpy as np 18 | 19 | np.random.seed(97) 20 | 21 | data = {} 22 | 23 | fpList = [32,16] 24 | bsList = [1,8,32,128] 25 | slList = [32,128,384] 26 | 27 | for bs in bsList: 28 | for sl in slList: 29 | for fp in fpList: 30 | name = '-fp'+str(fp)+'-bs'+str(bs)+'-sl'+str(sl) 31 | data['encoder'+name] = np.random.randint(0,32128,[bs,sl]).astype(np.int32) 32 | data['decoding'+name] = np.random.rand(bs,sl,512).astype([np.float32,np.float16][int(fp==16)])*2-1 33 | data['seqLen'+name] = np.full([bs],sl,dtype=np.int32) 34 | 35 | np.savez("T5PluginTestIO.npz",**data) 36 | 37 | #for k in data.keys(): 38 | # print(k,data[k].shape,data[k].dtype,data[k].reshape(-1)[:10]) 39 | print("create T5 test data finish!") 40 | 41 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/activation_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | template 26 | void invokeAddBiasGelu(T* out, const T* bias, const int m, const int n, cudaStream_t stream); 27 | 28 | template 29 | void invokeAddBiasFastGelu(T* out, const T* bias, const int m, const int n, cudaStream_t stream); 30 | 31 | template 32 | void invokeAddBiasRelu(T* out, const T* bias, const int m, const int n, cudaStream_t stream); 33 | 34 | template 35 | void invokeAddBias(F_T* out, const B_T* bias, const int m, const int n, cudaStream_t stream); 36 | 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/encoder_gemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include "src/fastertransformer/utils/gemm_test/gemm_func.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace fastertransformer { 35 | 36 | template 37 | void generate_encoder_gemm_config( 38 | int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); 39 | 40 | } // namespace fastertransformer 41 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/calibrate_quantize_weight_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include 22 | #include 23 | 24 | namespace fastertransformer { 25 | 26 | template 27 | void invokeLdnCalibrateWeightPerChannel(float* scale, const T* src, const int k, const int n, cudaStream_t stream); 28 | 29 | template 30 | void invokeLdkCalibrateQuantizeWeightPerChannel( 31 | int8_t* dst, float* scale, const T* src, const int n, const int k, cudaStream_t stream); 32 | 33 | template 34 | void invokeLdnTransposeQuantizeWeightPerChannel( 35 | int8_t* dst, const float* scale, const T* src, const int k, const int n, cudaStream_t stream); 36 | 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/ban_bad_words.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cuda_runtime.h> 20 | #include <cstddef> 21 | 22 | namespace fastertransformer { 23 | 24 | template<typename T> 25 | void invokeBanBadWords(T* logits, 26 | const int* output_ids_buf, 27 | const int* parent_ids_buf, 28 | int batch_size, 29 | int local_batch_size, 30 | int beam_width, 31 | const int* bad_words, 32 | bool share_words, 33 | size_t bad_words_len, 34 | int id_offset, 35 | int vocab_size_padded, 36 | size_t step, 37 | cudaStream_t stream); 38 | 39 | } // namespace fastertransformer 40 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/BookscorpusTextFormatting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import glob 15 | import os 16 | 17 | class BookscorpusTextFormatting: 18 | def __init__(self, books_path, output_filename, recursive = False): 19 | self.books_path = books_path 20 | self.recursive = recursive 21 | self.output_filename = output_filename 22 | 23 | 24 | # This puts one book per line 25 | def merge(self): 26 | with open(self.output_filename, mode='w', newline='\n') as ofile: 27 | for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True): 28 | with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file: 29 | for line in file: 30 | if line.strip() != '': 31 | ofile.write(line.strip() + ' ') 32 | ofile.write("\n\n") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeGeeX FasterTransformer 2 | 3 | This repository provides the FasterTransformer implementation of the [CodeGeeX](https://github.com/THUDM/CodeGeeX) model. 4 | 5 | ## Get Started 6 | First, download and set up the following docker environment, replacing `<dir_of_this_repo>` with the directory of this repo: 7 | ``` 8 | docker pull nvcr.io/nvidia/pytorch:21.11-py3 9 | docker run -p 9114:5000 --cpus 12 --gpus '"device=0"' -it -v <dir_of_this_repo>:/workspace/codegeex-fastertransformer --ipc=host --name=test nvcr.io/nvidia/pytorch:21.11-py3 10 | ``` 11 | Second, install the following packages in the docker: 12 | ``` 13 | pip3 install transformers 14 | pip3 install sentencepiece 15 | cd codegeex-fastertransformer 16 | sh make_all.sh # Remember to specify the DSM version according to the GPU. 17 | ``` 18 | Then, convert the initial checkpoint (download [here](https://models.aminer.cn/codegeex/download/request)) to the FT version using ```get_ckpt_ft.py```. 19 | 20 | Finally, run ```api.py``` to start the server and run ```post.py``` to send requests: 21 | ``` 22 | nohup python3 api.py > test.log 2>&1 & 23 | python3 post.py 24 | ``` 25 | ## Inference performance 26 | 27 | The following figure compares the performance of pure PyTorch, Megatron and FasterTransformer under INT8 and FP16. 
28 | The fastest implementation is INT8 + FastTrans, and the average time of generating a token is less than 15 ms. 29 | 30 | 
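A quick way to sanity-check the per-request latency on a running server is to time a single call from the host. The snippet below is only an illustrative sketch: it assumes the `/code` endpoint and JSON fields used by `post.py` above, and the `-p 9114:5000` port mapping from the Get Started step (from inside the container, use port 5000 as `post.py` does):
```
# Minimal latency probe against the api.py server (fields mirror post.py).
# Assumes the container was started with -p 9114:5000 as shown above.
curl -s -o /dev/null -w 'total time: %{time_total}s\n' \
  -H 'Content-Type: application/json' \
  -X POST http://localhost:9114/code \
  -d '{
        "ability": "seo_article_creation",
        "context": "# language: python\n# write a quick sort function\ndef",
        "temperature": 1.0,
        "top_k": 1,
        "top_p": 0.0,
        "max_seq_len": 256,
        "len_penalty": 1.0,
        "repetition_penalty": 1.0,
        "presence_penalty": 1.0,
        "frequency_penalty": 1.0,
        "end_tokens": []
      }'
```
Dividing the reported wall-clock time by the number of generated tokens (at most `max_seq_len`) gives a rough per-token latency to compare against the numbers above.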
31 | 32 | ## License 33 | 34 | Our code is licensed under the [Apache-2.0 license](LICENSE). 35 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/logprob_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include <cuda_runtime.h> 19 | namespace fastertransformer { 20 | 21 | template<typename T> 22 | void invokeLogProbFromLogits(float* cum_log_probs, 23 | const T* logits, 24 | const int* input_ids, 25 | const int* input_lengths, 26 | const size_t max_input_length, 27 | const size_t batch_size, 28 | const size_t vocab_size, 29 | const size_t vocab_size_padded, 30 | void* workspace, 31 | const size_t workspace_size, 32 | cudaStream_t stream, 33 | const bool batch_first = false); 34 | } // namespace fastertransformer 35 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #pragma once 17 | 18 | namespace fastertransformer { 19 | 20 | template 21 | void invokeTopkSoftMax(const T* log_probs, 22 | const T* bias, 23 | const bool* finished, 24 | float* cum_log_probs, 25 | float* output_log_probs, 26 | int* ids, 27 | void* tmp_storage, 28 | const int temp_storage_size, 29 | const int batch_size, 30 | const int beam_width, 31 | const int vocab_size, 32 | const int* end_ids, 33 | const float diversity_rate, 34 | cudaStream_t stream); 35 | 36 | } // namespace fastertransformer 37 | -------------------------------------------------------------------------------- /examples/cpp/gpt/gpt_config.ini: -------------------------------------------------------------------------------- 1 | [ft_instance_hyperparameter] 2 | max_batch_size=8 ; Use for allocate the buffer 3 | max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter 4 | beam_width=1 ; beam width for beam search 5 | top_k=0 ; k value for top k sampling 6 | top_p=0.5 ; p value for top p sampling 7 | temperature=1.0 ; Use for sampling 8 | repetition_penalty=2.0 ; Use for sampling 9 | data_type=fp16 10 | sparse=0 11 | model_name=gpt_124M 12 | ; model_name=megatron_345M 13 | ; model_name=megatron_6.7B 14 | ; model_name=gpt_175B 15 | ; model_name=self_defined 16 | ; model_dir=./models/megatron-models/c-model/6.7b/ 17 | model_dir=models/openai-gpt-models/c-model/124m/1-gpu/ 18 | 19 | [request] 20 | request_batch_size=8 ; determine by the request 21 | request_output_len=32 ; determine by the request 22 | return_log_probs=false ; return the output log probs and cumulative log probs. 23 | context_log_probs=false ; include input contexts in the cumulative log probability computation. 24 | 25 | [gpt_124M] 26 | head_num=12 27 | size_per_head=64 28 | vocab_size=50257 29 | decoder_layers=12 30 | 31 | [gpt_175B] 32 | head_num=96 33 | size_per_head=128 34 | vocab_size=51200 35 | decoder_layers=96 36 | 37 | [self_defined] 38 | head_num=16 39 | size_per_head=64 40 | vocab_size=30000 41 | decoder_layers=12 42 | 43 | [megatron_345M] 44 | head_num=16 45 | size_per_head=64 46 | vocab_size=50304 47 | decoder_layers=24 48 | 49 | [megatron_6.7B] 50 | head_num=32 51 | size_per_head=128 52 | vocab_size=51200 53 | decoder_layers=32 54 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/xlnet_preprocess_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | #include "src/fastertransformer/utils/cuda_utils.h" 19 | #include 20 | #include 21 | 22 | namespace fastertransformer { 23 | 24 | template 25 | void blockAttnMask(dim3& grid, dim3& block, int batch_size, int seq_len); 26 | 27 | template 28 | void genWordEmdK( 29 | int batch_size, int seq_len, int hidden_dim, T* word_emb_k, T* params_word_emb_k, int* inp_k, cudaStream_t stream); 30 | 31 | template 32 | void preProcess(int batch_size, 33 | int seq_len, 34 | int hidden_dim, 35 | T* attn_mask, 36 | float* input_mask, 37 | T* seg_mat, 38 | int* seg_id, 39 | T* attr_k_head_r, 40 | cudaStream_t stream); 41 | } // namespace fastertransformer 42 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/layout_transformer_int8_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include 21 | #include 22 | #include 23 | 24 | namespace fastertransformer { 25 | 26 | template 27 | void invokeTransposeMatrixCOL32ToColMajor(T* dst, const T* src, const int m, const int n, cudaStream_t stream); 28 | 29 | template 30 | void invokeTransposeMatrixColMajorToCOL32(T* dst, const T* src, const int m, const int n, cudaStream_t stream); 31 | 32 | template 33 | void invokeTransposeMatrixColMajorToCOL32Quantize( 34 | int8_t* dst, const T* src, const int m, const int n, const float* scale_ptr, cudaStream_t stream); 35 | 36 | void invokeRowMajorToCOL32(int8_t* dst, const int8_t* src, const int m, const int n, cudaStream_t stream); 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_library(gpt_example_utils STATIC gpt_example_utils.cc) 16 | target_link_libraries(gpt_example_utils PUBLIC -lcudart) 17 | 18 | add_executable(multi_gpu_gpt_example multi_gpu_gpt_example.cc) 19 | target_link_libraries(multi_gpu_gpt_example PUBLIC -lcublas -lcublasLt -lcudart 20 | ParallelGpt nvtx_utils -lmpi gpt_example_utils) 21 | 22 | add_executable(multi_gpu_gpt_async_example multi_gpu_gpt_async_example.cc) 23 | target_link_libraries(multi_gpu_gpt_async_example PUBLIC -lcublas -lcublasLt -lcudart 24 | ParallelGpt nvtx_utils -lmpi gpt_example_utils) 25 | 26 | add_executable(multi_gpu_gpt_triton_example multi_gpu_gpt_triton_example.cc) 27 | target_link_libraries(multi_gpu_gpt_triton_example PUBLIC -lcublas -lcublasLt -lcudart 28 | ParallelGptTritonBackend memory_utils custom_ar_comm -lmpi gpt_example_utils -lpthread) 29 | -------------------------------------------------------------------------------- /src/fastertransformer/models/bert_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(BertLayerINT8 STATIC BertLayerINT8.cc) 18 | set_property(TARGET BertLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET BertLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(BertLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart -lcurand cublasMMWrapper 21 | cublasINT8MMWrapper UnfusedAttentionLayerINT8 FusedAttentionLayerINT8 22 | FfnLayerINT8 layernorm_int8_kernels 23 | layout_transformer_int8_kernels quantization_int8_kernels) 24 | 25 | add_library(BertINT8 STATIC BertINT8.cc) 26 | set_property(TARGET BertINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 27 | set_property(TARGET BertINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 28 | target_link_libraries(BertINT8 PUBLIC -lcublasLt -lcublas -lcudart -lcurand BertLayerINT8 bert_preprocess_kernels) 29 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/word_list.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "word_list.h" 18 | #include "memory_utils.h" 19 | 20 | #include "assert.h" 21 | 22 | namespace fastertransformer { 23 | 24 | int read_word_list(const std::string& filename, std::vector& file_data) 25 | { 26 | std::ifstream word_list_file(filename, std::ios::in); 27 | 28 | std::string line_buf; 29 | int line_count = 0; 30 | size_t id_counts[2] = {0, 0}; 31 | while (std::getline(word_list_file, line_buf)) { 32 | 33 | std::stringstream line_stream(line_buf); 34 | std::string vals; 35 | while (std::getline(line_stream, vals, ',')) { 36 | file_data.push_back(std::stoi(vals)); 37 | id_counts[line_count]++; 38 | } 39 | line_count++; 40 | 41 | if (line_count > 1) { 42 | break; 43 | } 44 | } 45 | assert(id_counts[0] == id_counts[1]); 46 | 47 | return 0; 48 | } 49 | 50 | } // namespace fastertransformer 51 | -------------------------------------------------------------------------------- /examples/pytorch/decoding/utils/recover_bpe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | 17 | def recover_bpe(src): 18 | dst = [] 19 | for line in src: 20 | line = line.strip().split() 21 | if line[-1] == '': 22 | line.pop() 23 | if line[0][0] == '▁': 24 | s = line[0][1:] 25 | else: 26 | s = line[0] 27 | for w in line[1:]: 28 | if w[0] == '▁': 29 | s += ' ' + w[1:] 30 | else: 31 | s += w 32 | s += '\n' 33 | dst.append(s) 34 | return dst 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('infile', type=str) 39 | parser.add_argument('outfile', type=str) 40 | args = parser.parse_args() 41 | 42 | with open(args.infile, 'r') as infile: 43 | with open(args.outfile, 'w') as outfile: 44 | dst = recover_bpe(infile.readlines()) 45 | for line in dst: 46 | outfile.write(line) 47 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/fp16_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
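read_word_list in word_list.cc above reads a two-row CSV (the format used for bad-word and stop-word lists) into one flat vector and asserts that both rows have the same number of entries. A small Python sketch of producing such a file; the interpretation that the first row holds the flattened token ids and the second row the running end offset of each word, padded with -1, is an assumption here, since the reader only enforces equal row lengths:

    def write_word_list(path, words):
        """words: list of token-id lists, e.g. [[287, 4346], [12]]."""
        ids, offsets = [], []
        for w in words:
            ids.extend(w)
            offsets.append(len(ids))                   # running end offset of each word
        offsets += [-1] * (len(ids) - len(offsets))    # pad so both rows match in length
        with open(path, 'w') as f:
            f.write(', '.join(str(i) for i in ids) + '\n')
            f.write(', '.join(str(o) for o in offsets) + '\n')

    write_word_list('stop_words.csv', [[287, 4346], [12]])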
16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | 20 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None, 21 | initializer=None, regularizer=None, 22 | trainable=True, 23 | *args, **kwargs): 24 | """Custom variable getter that forces trainable variables to be stored in 25 | float32 precision and then casts them to the training precision. 26 | """ 27 | storage_dtype = tf.float32 if trainable else dtype 28 | variable = getter(name, shape, dtype=storage_dtype, 29 | initializer=initializer, regularizer=regularizer, 30 | trainable=trainable, 31 | *args, **kwargs) 32 | if trainable and dtype != tf.float32: 33 | variable = tf.cast(variable, dtype) 34 | return variable 35 | 36 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/triton_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" 20 | #include "src/fastertransformer/utils/Tensor.h" 21 | 22 | namespace ft = fastertransformer; 23 | 24 | template 25 | void move_tensor_H2D(const triton::Tensor &tensor, T* &d_ptr) 26 | { 27 | if (tensor.where == triton::MEMORY_GPU) { 28 | return; 29 | } 30 | 31 | size_t tensor_size = 1; 32 | for (auto t : tensor.shape) { 33 | tensor_size *= t; 34 | } 35 | ft::deviceMalloc(&d_ptr, tensor_size, false); 36 | ft::cudaH2Dcpy(d_ptr, (T*) tensor.data, tensor_size); 37 | } 38 | 39 | template 40 | ft::Tensor as_GPU_tensor(const triton::Tensor &tensor, T* d_ptr) 41 | { 42 | return ft::Tensor {ft::MEMORY_GPU, 43 | triton::Tensor::convertTritonTypeToFt(tensor.type), 44 | tensor.shape, 45 | tensor.where == triton::MEMORY_CPU ? d_ptr : tensor.data}; 46 | } 47 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/th_utils.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "src/fastertransformer/th_op/th_utils.h" 18 | 19 | namespace torch_ext { 20 | 21 | std::vector convert_shape(torch::Tensor tensor) 22 | { 23 | std::vector v_shape; 24 | for (int i = 0; i < tensor.dim(); i++) { 25 | v_shape.push_back(tensor.size(i)); 26 | } 27 | return v_shape; 28 | } 29 | 30 | template 31 | fastertransformer::Tensor convert_tensor(torch::Tensor tensor) 32 | { 33 | return fastertransformer::Tensor{fastertransformer::MEMORY_GPU, 34 | fastertransformer::getTensorType(), 35 | convert_shape(tensor), 36 | get_ptr(tensor)}; 37 | } 38 | 39 | template fastertransformer::Tensor convert_tensor(torch::Tensor tensor); 40 | template fastertransformer::Tensor convert_tensor(torch::Tensor tensor); 41 | template fastertransformer::Tensor convert_tensor(torch::Tensor tensor); 42 | 43 | } // namespace torch_ext 44 | -------------------------------------------------------------------------------- /examples/pytorch/gpt/utils/generate_start_ids.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import configparser 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-max_batch_size', '--max_batch_size', type=int, required=True, metavar='NUMBER', 21 | help='batch size') 22 | parser.add_argument('-max_input_length', '--max_input_length', type=int, required=True, metavar='NUMBER', 23 | help='max input length') 24 | args = parser.parse_args() 25 | args_dict = vars(args) 26 | 27 | batch_size = args_dict["max_batch_size"] 28 | max_input_length = args_dict["max_input_length"] 29 | path = f"../examples/cpp/multi_gpu_gpt/start_ids.csv" 30 | 31 | with open(path, 'w') as f: 32 | ids = "" 33 | for i in range(batch_size): 34 | for j in range(max_input_length): 35 | if j == 0: 36 | ids = f"{ids}198" 37 | else: 38 | ids = f"{ids}, 198" 39 | ids = f"{ids}\n" 40 | f.write(ids) 41 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/gpu_environment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
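generate_start_ids.py above writes one CSV row per batch entry, each holding max_input_length copies of token id 198 (the newline token in the GPT-2 vocabulary, as far as I can tell). A quick sanity check of the generated file, assuming it is run from the same directory as the generator so the relative path matches:

    import numpy as np

    start_ids = np.loadtxt('../examples/cpp/multi_gpu_gpt/start_ids.csv',
                           delimiter=',', dtype=np.int32, ndmin=2)
    assert (start_ids == 198).all()
    print('batch size:', start_ids.shape[0], ' input length:', start_ids.shape[1])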
15 | 16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None, 20 | initializer=None, regularizer=None, 21 | trainable=True, 22 | *args, **kwargs): 23 | """Custom variable getter that forces trainable variables to be stored in 24 | float32 precision and then casts them to the training precision. 25 | """ 26 | storage_dtype = tf.float32 if trainable else dtype 27 | variable = getter(name, shape, dtype=storage_dtype, 28 | initializer=initializer, regularizer=regularizer, 29 | trainable=trainable, 30 | *args, **kwargs) 31 | if trainable and dtype != tf.float32: 32 | variable = tf.cast(variable, dtype) 33 | return variable 34 | 35 | def get_custom_getter(compute_type): 36 | return float32_variable_storage_getter if compute_type == tf.float16 else None 37 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/mpi_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "mpi.h" 20 | #include 21 | 22 | namespace fastertransformer { 23 | 24 | #define MPICHECK(cmd) \ 25 | do { \ 26 | int e = cmd; \ 27 | if (e != MPI_SUCCESS) { \ 28 | printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ 29 | exit(EXIT_FAILURE); \ 30 | } \ 31 | } while (0) 32 | 33 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/tensorflow/decoding/utils/bleu_score.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
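get_custom_getter in gpu_environment.py above returns the fp32-master-weight getter only when computing in fp16, so callers can pass its result straight to a variable scope. A minimal TF1-style usage sketch (tf.compat.v1 under TensorFlow 2); the import path and the layer shapes are assumptions:

    import tensorflow as tf
    from gpu_environment import get_custom_getter   # import path is an assumption

    compute_type = tf.float16
    x = tf.placeholder(compute_type, shape=[None, 1024])
    with tf.variable_scope('dense', custom_getter=get_custom_getter(compute_type)):
        # variables are stored as fp32 masters and cast to fp16 for the matmul
        w = tf.get_variable('w', [1024, 4096], dtype=compute_type)
        b = tf.get_variable('b', [4096], dtype=compute_type,
                            initializer=tf.zeros_initializer())
        y = tf.matmul(x, w) + b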
14 | 15 | import sys 16 | import tensorflow as tf 17 | from sacrebleu import corpus_bleu 18 | 19 | def bleu_score(pred_file, ref_file): 20 | with tf.io.gfile.GFile(pred_file) as pred_stream, tf.io.gfile.GFile(ref_file) as ref_stream: 21 | pred_stream_txt = pred_stream.readlines() 22 | ref_stream_txt = ref_stream.readlines() 23 | bleu = corpus_bleu(pred_stream_txt, [ref_stream_txt], force=True) 24 | print(" bleu score: {:6.2f}".format(bleu.score)) 25 | print(" bleu counts: {}".format(bleu.counts)) 26 | print(" bleu totals: {}".format(bleu.totals)) 27 | print(" bleu precisions: {}".format(bleu.precisions)) 28 | print(" bleu sys_len: {}; ref_len: {}".format(bleu.sys_len, bleu.ref_len)) 29 | return bleu 30 | 31 | if __name__ == "__main__": 32 | if len(sys.argv) != 3: 33 | print("[ERROR] bleu_score.py needs a result file and a solution file. \n e.g. python bleu_score.py f1.txt f2.txt") 34 | sys.exit(0) 35 | bleu_score(sys.argv[1], sys.argv[2]) 36 | -------------------------------------------------------------------------------- /src/fastertransformer/models/swin_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(SwinBlockINT8 STATIC SwinBlockINT8.cc) 18 | set_property(TARGET SwinBlockINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET SwinBlockINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(SwinBlockINT8 PUBLIC -lcublasLt -lcublas -lcudart 21 | WindowAttentionINT8 activation_int8_kernels add_residual_kernels) 22 | 23 | add_library(SwinBasicLayerINT8 STATIC SwinBasicLayerINT8.cc) 24 | set_property(TARGET SwinBasicLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 25 | set_property(TARGET SwinBasicLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 26 | target_link_libraries(SwinBasicLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart SwinBlockINT8 dequantize_kernels) 27 | 28 | add_library(SwinINT8 STATIC SwinINT8.cc) 29 | set_property(TARGET SwinINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 30 | set_property(TARGET SwinINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 31 | target_link_libraries(SwinINT8 PUBLIC -lcudart SwinBasicLayerINT8 activation_kernels memory_utils) 32 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/xlnet_gemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include "src/fastertransformer/utils/gemm_test/gemm_func.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace fastertransformer { 35 | 36 | template 37 | void generate_xlnet_gemm_config(int batch_size, 38 | int seq_len, 39 | int head_num, 40 | int size_per_head, 41 | int hidden_units_, 42 | int inter_size_, 43 | void* buffer_in, 44 | bool isAppend = true); 45 | 46 | } // namespace fastertransformer 47 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/beam_search_layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu) 18 | set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels) 21 | 22 | add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu) 23 | set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 24 | set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 25 | target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels) 26 | 27 | add_library(BeamSearchLayer STATIC BeamSearchLayer.cu) 28 | set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 29 | set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 30 | target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels) 31 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/GLUEDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import sys 15 | import wget 16 | 17 | from pathlib import Path 18 | 19 | 20 | def mkdir(path): 21 | Path(path).mkdir(parents=True, exist_ok=True) 22 | 23 | 24 | class GLUEDownloader: 25 | 26 | def __init__(self, save_path): 27 | self.save_path = save_path + '/glue' 28 | 29 | def download(self, task_name): 30 | mkdir(self.save_path) 31 | if task_name in {'mrpc', 'mnli'}: 32 | task_name = task_name.upper() 33 | elif task_name == 'cola': 34 | task_name = 'CoLA' 35 | else: # SST-2 36 | assert task_name == 'sst-2' 37 | task_name = 'SST' 38 | wget.download( 39 | 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/1502038877f6a88c225a34450793fbc3ea87eaba/download_glue_data.py', 40 | out=self.save_path, 41 | ) 42 | sys.path.append(self.save_path) 43 | import download_glue_data 44 | download_glue_data.main( 45 | ['--data_dir', self.save_path, '--tasks', task_name]) 46 | sys.path.pop() 47 | -------------------------------------------------------------------------------- /src/fastertransformer/models/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
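GLUEDownloader.py above normalises the task name (mrpc/mnli are upper-cased, cola becomes CoLA, sst-2 becomes SST), fetches the community download_glue_data.py helper, and runs it into <save_path>/glue. A minimal usage sketch; the import path and data directory are placeholders:

    from GLUEDownloader import GLUEDownloader   # import path is an assumption

    downloader = GLUEDownloader('/workspace/bert/data/download')   # placeholder path
    downloader.download('mrpc')   # MRPC lands under /workspace/bert/data/download/glue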
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(SwinBlock STATIC SwinBlock.cc) 18 | set_property(TARGET SwinBlock PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET SwinBlock PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(SwinBlock PUBLIC -lcublasLt -lcublas -lcudart WindowAttention 21 | activation_kernels add_residual_kernels layernorm_kernels) 22 | 23 | add_library(SwinBasicLayer STATIC SwinBasicLayer.cc) 24 | set_property(TARGET SwinBasicLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 25 | set_property(TARGET SwinBasicLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 26 | target_link_libraries(SwinBasicLayer PUBLIC -lcublasLt -lcublas -lcudart SwinBlock) 27 | 28 | add_library(Swin STATIC Swin.cc) 29 | set_property(TARGET Swin PROPERTY POSITION_INDEPENDENT_CODE ON) 30 | set_property(TARGET Swin PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 31 | target_link_libraries(Swin PUBLIC -lcudart SwinBasicLayer memory_utils) 32 | 33 | add_executable(swin_gemm swin_gemm.cc) 34 | target_link_libraries(swin_gemm PUBLIC -lcublas -lcublasLt -lcudart swin_igemm_func swin_gemm_func memory_utils) -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.07-py3 15 | FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt 16 | FROM ${FROM_IMAGE_NAME} 17 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract 18 | 19 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data 20 | 21 | WORKDIR /workspace 22 | RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. 23 | RUN git clone https://github.com/soskek/bookcorpus.git 24 | 25 | # Copy the perf_client over 26 | COPY --from=trt /workspace/install/ /workspace/install/ 27 | ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH} 28 | 29 | # Install trt python api 30 | RUN apt-get install libb64-0d 31 | RUN pip install /workspace/install/python/tensorrtserver*.whl 32 | 33 | WORKDIR /workspace/bert 34 | RUN pip install --upgrade --no-cache-dir pip \ 35 | && pip install --no-cache-dir \ 36 | tqdm boto3 requests six ipdb h5py html2text nltk progressbar onnxruntime \ 37 | git+https://github.com/NVIDIA/dllogger wget 38 | 39 | RUN apt-get install -y iputils-ping 40 | 41 | COPY . . 42 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/swin_igemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_utils.h" 21 | #include "src/fastertransformer/utils/gemm_test/encoder_igemm_func.h" 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace fastertransformer { 34 | 35 | /* CAUTION : must match cublasLtMatmulTile_t */ 36 | // const char* const matmulTileName[] = { 37 | // "UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", "8x64", "16x32", 38 | // "32x16", "64x8", "32x32", "32x64", "64x32", "32x128", "64x64", "128x32", "64x128", 39 | // "128x64", "64x256", "128x128", "256x64", "64x512", "128x256", "256x128", "512x64", 40 | // }; 41 | 42 | int generate_swin_igemm_config( 43 | int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); 44 | 45 | } // namespace fastertransformer 46 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/add_bias_transpose_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | namespace fastertransformer { 18 | 19 | template 20 | void invokeAddBiasTransposeToMultiHead(const T* matrices, 21 | const T* biases, 22 | T* output, 23 | const int batch_size, 24 | const int head_num, 25 | const int size_per_head, 26 | const int seq_len, 27 | const int matrices_num, 28 | const cudaStream_t stream); 29 | 30 | template 31 | void invokeTransposeMultiHeadToSingle(T* dst, 32 | T* src, 33 | const int batch_size, 34 | const int seq_len, 35 | const int head_num, 36 | const int size_per_head, 37 | cudaStream_t stream); 38 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/pytorch/gpt/utils/gpt_token_converter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
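The add_bias_transpose_kernels.h declarations above fuse a bias add with the usual split-heads / merge-heads layout changes. The NumPy sketch below only illustrates the head-split round trip in familiar terms; the exact tensor layouts and the fused bias handling of the CUDA kernels are not reproduced here:

    import numpy as np

    B, S, H, D = 2, 8, 12, 64                                # batch, seq, heads, head size
    x = np.random.rand(B, S, H * D).astype(np.float32)       # [batch, seq, hidden]
    multi = x.reshape(B, S, H, D).transpose(0, 2, 1, 3)      # [batch, head, seq, head size]
    single = multi.transpose(0, 2, 1, 3).reshape(B, S, H * D)
    assert np.array_equal(x, single)                         # the round trip is lossless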
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import sys 16 | import os 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | sys.path.append(dir_path + "/../../../..") 19 | from examples.tensorflow.gpt.utils import gpt_token_encoder as encoder 20 | import fire 21 | import numpy as np 22 | 23 | def convert_token( 24 | vocab_file="../models/gpt2-vocab.json", 25 | bpe_file="../models/gpt2-merges.txt", 26 | out_file="out", 27 | max_input_length=-1 28 | ): 29 | enc = encoder.get_encoder(vocab_file, bpe_file) 30 | tokens_batch = np.loadtxt(out_file, dtype=np.int32) 31 | end_id = 50256 32 | if(tokens_batch.ndim == 1): 33 | tokens_batch = tokens_batch.reshape([1, -1]) 34 | for batch_num, tokens in enumerate(tokens_batch): 35 | if max_input_length > -1: 36 | end_index = np.where(tokens[max_input_length:] == end_id)[0] 37 | else: 38 | end_index = [] 39 | end_pos = len(tokens) 40 | if len(end_index) > 0: 41 | end_pos = end_index[0] 42 | print(f"[INFO] batch {batch_num}: {enc.decode(tokens[:end_pos])}") 43 | return tokens_batch 44 | 45 | if __name__ == "__main__": 46 | fire.Fire(convert_token) -------------------------------------------------------------------------------- /examples/tensorflow/gpt/utils/gpt_token_converter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
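convert_token in the PyTorch gpt_token_converter.py above can also be called directly instead of through the fire CLI. All paths below are placeholders and the repository root is assumed to be on PYTHONPATH:

    from examples.pytorch.gpt.utils.gpt_token_converter import convert_token

    tokens = convert_token(vocab_file='../models/gpt2-vocab.json',
                           bpe_file='../models/gpt2-merges.txt',
                           out_file='out',          # token-id matrix written by the GPT example
                           max_input_length=8)      # prints the decoded text per batch entry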
14 | 15 | import sys 16 | import os 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | sys.path.append(dir_path + "/../../../..") 19 | from examples.tensorflow.gpt.utils import gpt_token_encoder as encoder 20 | import fire 21 | import numpy as np 22 | 23 | def convert_token( 24 | vocab_file="../models/gpt2-vocab.json", 25 | bpe_file="../models/gpt2-merges.txt", 26 | out_file="out", 27 | max_input_length=-1 28 | ): 29 | enc = encoder.get_encoder(vocab_file, bpe_file) 30 | tokens_batch = np.loadtxt(out_file, dtype=np.int32) 31 | end_id = 50256 32 | if(tokens_batch.ndim == 1): 33 | tokens_batch = tokens_batch.reshape([1, -1]) 34 | for batch_num, tokens in enumerate(tokens_batch): 35 | if max_input_length > -1: 36 | end_index = np.where(tokens[max_input_length:] == end_id)[0] 37 | else: 38 | end_index = [] 39 | end_pos = -1 40 | if len(end_index) > 0: 41 | end_pos = end_index[0] 42 | print("[INFO] batch {}: {}".format(batch_num, enc.decode(tokens[:end_pos]))) 43 | return tokens_batch 44 | 45 | if __name__ == "__main__": 46 | fire.Fire(convert_token) -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/scripts/data_download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | DATA_DIR=${1:-/workspace/bert/data} 17 | 18 | # Download vocab files from pretrained model 19 | cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.* 20 | 21 | # Download SQUAD 22 | cd $DATA_DIR/squad && . squad_download.sh 23 | 24 | # Download SWAG 25 | git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag 26 | 27 | # Download GLUE 28 | cd $DATA_DIR/glue && . download_mrpc.sh 29 | 30 | # WIKI Download 31 | cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh 32 | 33 | # Bookcorpus Download 34 | cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh 35 | 36 | cd $DATA_DIR 37 | # Create HDF5 files for WIKI 38 | bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \ 39 | && rm -r ./wikipedia_corpus/final_* \ 40 | 41 | # Create HDF5 files for Bookcorpus 42 | bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \ 43 | && rm -r ./bookcorpus/final_* \ 44 | 45 | # Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus 46 | bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024 47 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/reverse_roll_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | #include 21 | 22 | namespace fastertransformer { 23 | 24 | void invokeReverseRollCol32(int8_t* dst, 25 | const int8_t* src, 26 | int batch, 27 | int window_num, 28 | int window_len, 29 | int window_size, 30 | int H, 31 | int W, 32 | int dim, 33 | int shift_size, 34 | cudaStream_t stream); 35 | 36 | template 37 | void invokeReverseRoll(T* dst, 38 | const T* src, 39 | int batch, 40 | int window_num, 41 | int window_len, 42 | int window_size, 43 | int H, 44 | int W, 45 | int dim, 46 | int shift_size, 47 | cudaStream_t stream); 48 | 49 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import torch 15 | import torch.distributed as dist 16 | 17 | from pathlib import Path 18 | 19 | 20 | def get_rank(): 21 | if not dist.is_available(): 22 | return 0 23 | if not dist.is_initialized(): 24 | return 0 25 | return dist.get_rank() 26 | 27 | 28 | def get_world_size(): 29 | if not dist.is_available(): 30 | return 1 31 | if not dist.is_initialized(): 32 | return 1 33 | return dist.get_world_size() 34 | 35 | 36 | def is_main_process(): 37 | return get_rank() == 0 38 | 39 | 40 | def barrier(): 41 | if dist.is_available() and dist.is_initialized(): 42 | dist.barrier() 43 | 44 | 45 | def format_step(step): 46 | if isinstance(step, str): 47 | return step 48 | s = "" 49 | if len(step) > 0: 50 | s += "Training Epoch: {} ".format(step[0]) 51 | if len(step) > 1: 52 | s += "Training Iteration: {} ".format(step[1]) 53 | if len(step) > 2: 54 | s += "Validation Iteration: {} ".format(step[2]) 55 | return s 56 | 57 | 58 | def mkdir(path): 59 | Path(path).mkdir(parents=True, exist_ok=True) 60 | 61 | 62 | def mkdir_by_main_process(path): 63 | if is_main_process(): 64 | mkdir(path) 65 | barrier() 66 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/vit_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
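invokeReverseRoll in reverse_roll_kernels.h above undoes the cyclic shift that Swin applies before shifted-window attention (it also restores the [batch, H, W, dim] image layout from the window layout, which the sketch below does not model). In NumPy terms, the shift/unshift pair is just np.roll with opposite signs:

    import numpy as np

    H = W = 8
    shift = 2
    x = np.arange(H * W).reshape(H, W)
    shifted = np.roll(x, shift=(-shift, -shift), axis=(0, 1))       # cyclic shift before attention
    restored = np.roll(shifted, shift=(shift, shift), axis=(0, 1))  # the "reverse roll"
    assert np.array_equal(x, restored)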
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace fastertransformer { 22 | 23 | template 24 | void invokeAddBiasSlice(T* in, T* out, const T* bias, const int m, const int n, const int s, cudaStream_t stream); 25 | 26 | template 27 | void invokeAddBiasConcatClsTokenAddPosEmbed(const T* in, 28 | T* out, 29 | const T* bias, 30 | const T* cls_token, 31 | const T* pos_embed, 32 | const int m, 33 | const int n, 34 | const int s, 35 | cudaStream_t stream); 36 | 37 | template 38 | void invokeSliceCopy( 39 | const T* in, T* out, const int m, const int n, const int s, const int offset_s, cudaStream_t stream); 40 | 41 | template 42 | void invokeAddBiasAddPosEmbed( 43 | T* out, const T* bias, const T* pos_embed, const int m, const int n, const int s, cudaStream_t stream); 44 | 45 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/gpt_config.ini: -------------------------------------------------------------------------------- 1 | [ft_instance_hyperparameter] 2 | max_batch_size=8 ; Use for allocate the buffer 3 | max_seq_len=1024 ; The sequence length of position embedding table, should move to model hyper-parameter 4 | beam_width=1 ; beam width for beam search 5 | top_k=1 ; k value for top k sampling 6 | top_p=0 ; p value for top p sampling 7 | temperature=1.0 ; Use for sampling 8 | repetition_penalty=1.0 ; Use for sampling 9 | tensor_para_size=1 10 | pipeline_para_size=1 11 | data_type=fp16 12 | sparse=0 13 | int8_mode=0 14 | enable_custom_all_reduce=0 15 | ; model_name=gpt_124M 16 | model_name=megatron_345M 17 | ; model_name=megatron_6.7B 18 | ; model_name=gpt_175B 19 | ; model_name=self_defined 20 | ; model_dir=./models/megatron-models/c-model/6.7b/ 21 | model_dir=../models/megatron-models/c-model/345m/8-gpu/ 22 | len_penalty=1.0 23 | beam_search_diversity_rate=0.0 24 | 25 | [request] 26 | request_batch_size=8 ; determine by the request 27 | request_output_len=32 ; determine by the request 28 | return_log_probs=false ; return the output log probs and cumulative log probs. 29 | context_log_probs=false ; include input contexts in the cumulative log probability computation. 
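The vit_kernels.h declarations above fuse the patch-projection bias add with prepending the class token and adding position embeddings. A NumPy sketch of that standard ViT preprocessing, with shapes chosen for a 224x224 image and 16x16 patches; the exact argument semantics of the CUDA kernel are an interpretation, not taken from the header:

    import numpy as np

    batch, seq, hidden = 2, 196, 768                    # 14x14 patches
    patches = np.random.rand(batch, seq, hidden).astype(np.float32)
    bias = np.random.rand(hidden).astype(np.float32)
    cls_token = np.random.rand(1, 1, hidden).astype(np.float32)
    pos_embed = np.random.rand(1, seq + 1, hidden).astype(np.float32)

    x = patches + bias                                                  # projection bias
    x = np.concatenate([np.tile(cls_token, (batch, 1, 1)), x], axis=1)  # prepend CLS token
    x = x + pos_embed                                                   # position embeddings
    assert x.shape == (batch, seq + 1, hidden)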
30 | 31 | [gpt_124M] 32 | head_num=12 33 | size_per_head=64 34 | vocab_size=50257 35 | decoder_layers=12 36 | start_id=50256 37 | end_id=50256 38 | inter_size=3072 39 | 40 | [megatron_345M] 41 | head_num=16 42 | size_per_head=64 43 | vocab_size=50304 44 | decoder_layers=24 45 | start_id=50256 46 | end_id=50256 47 | inter_size=4096 48 | 49 | [megatron_6.7B] 50 | head_num=32 51 | size_per_head=128 52 | vocab_size=51200 53 | decoder_layers=32 54 | start_id=50256 55 | end_id=50256 56 | inter_size=16384 57 | 58 | [gpt_175B] 59 | head_num=96 60 | size_per_head=128 61 | vocab_size=51200 62 | decoder_layers=96 63 | start_id=50256 64 | end_id=50256 65 | inter_size=49152 66 | 67 | [self_defined] 68 | head_num=16 69 | size_per_head=64 70 | vocab_size=30000 71 | decoder_layers=12 72 | start_id=50256 73 | end_id=50256 74 | inter_size=4096 75 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/memory_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cuda_utils.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template 24 | void deviceMalloc(T** ptr, int size, bool is_random_initialize = true); 25 | 26 | template 27 | void deviceMemSetZero(T* ptr, int size); 28 | 29 | template 30 | void deviceFree(T*& ptr); 31 | 32 | template 33 | void deviceFill(T* devptr, int size, T value); 34 | 35 | template 36 | void cudaD2Hcpy(T* tgt, const T* src, const int size); 37 | 38 | template 39 | void cudaH2Dcpy(T* tgt, const T* src, const int size); 40 | 41 | template 42 | void cudaD2Dcpy(T* tgt, const T* src, const int size); 43 | 44 | template 45 | void cudaRandomUniform(T* buffer, const int size); 46 | 47 | template 48 | int loadWeightFromBin(T* ptr, 49 | std::vector shape, 50 | std::string filename, 51 | FtCudaDataType model_file_type = FtCudaDataType::FP32); 52 | 53 | void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const int size, cudaStream_t stream); 54 | void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const int size, cudaStream_t stream); 55 | 56 | } // namespace fastertransformer 57 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/beam_search_topk_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
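Compared with the single-GPU config earlier, the multi_gpu_gpt gpt_config.ini above adds tensor_para_size, pipeline_para_size, int8_mode, and per-model start/end ids and inter_size. A short sketch of the sizing arithmetic; the divisibility checks reflect my understanding of the parallelism constraints and are assumptions, not taken from this file:

    import configparser

    cfg = configparser.ConfigParser(inline_comment_prefixes=(';',))
    cfg.read('examples/cpp/multi_gpu_gpt/gpt_config.ini')
    hyper = cfg['ft_instance_hyperparameter']
    model = cfg[hyper['model_name']]                     # e.g. [megatron_345M]

    tp = hyper.getint('tensor_para_size')
    pp = hyper.getint('pipeline_para_size')
    print('GPUs needed (tensor x pipeline):', tp * pp)
    assert model.getint('head_num') % tp == 0            # heads are split across tensor ranks
    assert model.getint('decoder_layers') % pp == 0      # layers are split across pipeline ranks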
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | 19 | #pragma once 20 | 21 | namespace fastertransformer { 22 | 23 | template 24 | void invokeTopkBeamSearch(void* workspace, 25 | size_t& workspace_size, 26 | T* log_probs, 27 | int* ids, 28 | const bool* finished, 29 | const int batch_size, 30 | const int beam_width, 31 | const int vocab_size_padded_, 32 | const T diversity_rate, 33 | const int* end_ids, 34 | cudaStream_t stream); 35 | 36 | template 37 | void invokeTileEncoderResults(T* tiled_encoder_output, 38 | int* tiled_encoder_sequence_length, 39 | const T* encoder_output, 40 | const int* encoder_sequence_length, 41 | const size_t batch_size, 42 | const size_t beam_width, 43 | const size_t mem_max_seq_len, 44 | const size_t d_model, 45 | cudaStream_t stream); 46 | 47 | } // namespace fastertransformer 48 | -------------------------------------------------------------------------------- /cmake/FasterTransformerConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
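invokeTopkBeamSearch in beam_search_topk_kernels.h above performs the per-step candidate selection of beam search on the GPU. The NumPy sketch below shows only the core idea of that step (add the running scores, flatten beams and vocabulary, keep the best beam_width continuations); diversity_rate, the finished mask, end ids and the workspace layout are all omitted:

    import numpy as np

    batch, beam, vocab = 1, 4, 10
    log_probs = np.log(np.random.dirichlet(np.ones(vocab), size=(batch, beam)))
    cum_scores = np.random.rand(batch, beam, 1)             # scores accumulated so far
    scores = (cum_scores + log_probs).reshape(batch, beam * vocab)
    topk = np.argsort(-scores, axis=-1)[:, :beam]           # best `beam` continuations
    parent_beam = topk // vocab                             # which beam each winner extends
    token_id = topk % vocab                                 # which token it appends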
26 | 27 | include(CMakeFindDependencyMacro) 28 | 29 | get_filename_component( 30 | FASTERTRANSFORMER_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 31 | ) 32 | 33 | list(APPEND CMAKE_MODULE_PATH ${FASTERTRANSFORMER_CMAKE_DIR}) 34 | 35 | if(NOT TARGET transformer-shared) 36 | include("${FASTERTRANSFORMER_CMAKE_DIR}/FasterTransformerTargets.cmake") 37 | endif() 38 | 39 | set(FASTERTRANSFORMER_LIBRARIES transformer-shared) 40 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/custom_ar_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "src/fastertransformer/utils/cuda_utils.h" 25 | 26 | #define CUSTOM_AR_SIZE_THRESHOLD 50331648 27 | #define MAX_ALL_REDUCE_BLOCKS 24 28 | #define FLAG(a) ((uint32_t)((a) % 0x146)) 29 | #define RANKS_PER_NODE 8 30 | #define WARP_SIZE 32 31 | #define DEFAULT_BLOCK_SIZE 1024 32 | #define DEFALUT_ALGO_AR_SIZE_THRESHOLD 196608 33 | 34 | namespace fastertransformer { 35 | 36 | #ifdef ENABLE_BF16 37 | typedef struct bf168 { 38 | __nv_bfloat162 x; 39 | __nv_bfloat162 y; 40 | __nv_bfloat162 z; 41 | __nv_bfloat162 w; 42 | } bf168; 43 | #endif 44 | 45 | template 46 | struct AllReduceParams { 47 | size_t elts_total; 48 | size_t elts_per_rank; 49 | size_t elts_per_block; 50 | size_t rank_offset; 51 | size_t rank, local_rank, node_id; 52 | uint32_t barrier_flag; 53 | uint32_t* peer_barrier_ptrs[RANKS_PER_NODE]; 54 | T* peer_comm_buffer_ptrs[RANKS_PER_NODE]; 55 | T* local_output_buffer_ptr; 56 | }; 57 | 58 | template 59 | void invokeOneOrTwoShotAllReduceKernel(AllReduceParams& param, cudaStream_t stream); 60 | 61 | void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo); 62 | 63 | } // namespace fastertransformer -------------------------------------------------------------------------------- /src/fastertransformer/layers/DynamicDecodeBaseLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
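custom_ar_kernels.h above backs the custom all-reduce path; the size-threshold macros suggest it picks between a one-shot and a two-shot (reduce-scatter then all-gather) strategy depending on message size. The NumPy sketch below only illustrates the data movement of the two strategies, not the CUDA barriers or peer-to-peer buffers:

    import numpy as np

    ranks = 4
    bufs = [np.random.rand(8).astype(np.float32) for _ in range(ranks)]   # one buffer per GPU

    # One-shot: every rank reads all peers and reduces the full vector itself.
    one_shot = [sum(bufs) for _ in range(ranks)]

    # Two-shot: each rank reduces only its slice, then gathers the other slices.
    parts = [np.array_split(b, ranks) for b in bufs]
    reduced = [sum(parts[r][s] for r in range(ranks)) for s in range(ranks)]
    two_shot = [np.concatenate(reduced) for _ in range(ranks)]

    assert all(np.allclose(a, b) for a, b in zip(one_shot, two_shot))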
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include "src/fastertransformer/layers/BaseLayer.h" 23 | 24 | namespace fastertransformer { 25 | 26 | class DynamicDecodeBaseLayer: public BaseLayer { 27 | protected: 28 | virtual void allocateBuffer() = 0; 29 | virtual void freeBuffer() = 0; 30 | 31 | public: 32 | DynamicDecodeBaseLayer(cudaStream_t stream, 33 | cublasMMWrapper* cublas_wrapper, 34 | IAllocator* allocator, 35 | bool is_free_buffer_after_forward, 36 | cudaDeviceProp* cuda_device_prop): 37 | BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop){}; 38 | ~DynamicDecodeBaseLayer() = default; 39 | DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){}; 40 | 41 | virtual void forward(std::vector* output_tensors, 42 | const std::vector* input_tensors) = 0; 43 | virtual void forward(std::unordered_map* output_tensors, 44 | const std::unordered_map* input_tensors) = 0; 45 | }; 46 | 47 | } // namespace fastertransformer 48 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/beam_search_penalty_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | 20 | #include "src/fastertransformer/utils/cuda_utils.h" 21 | 22 | namespace fastertransformer { 23 | 24 | template 25 | void invokeAddBiasApplyPenalties(int step, 26 | T* logits, 27 | const int* current_ids, 28 | const int* previous_ids, 29 | const int* parent_ids, 30 | const int* input_lengths, 31 | const T* bias, 32 | const int ite, 33 | const int max_input_length, 34 | const int local_batch_size, 35 | const int batch_size, 36 | const int beam_width, 37 | const int vocab_size, 38 | const int vocab_size_padded, 39 | const int* end_ids, 40 | const float temerature, 41 | const float len_penalty, 42 | const float repeat_penalty, 43 | cudaStream_t stream); 44 | 45 | } // namespace fastertransformer 46 | -------------------------------------------------------------------------------- /src/fastertransformer/models/swin/SwinWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
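invokeAddBiasApplyPenalties in beam_search_penalty_kernels.h above folds the bias add, temperature scaling, repetition and length penalties, and end-id handling into one kernel. The Python sketch below shows only temperature scaling and a CTRL-style repetition penalty as I understand them; the exact formulas used by the kernel are not visible in this header, so treat this as an assumption:

    import numpy as np

    def apply_penalties(logits, previous_ids, temperature=1.0, repetition_penalty=1.0):
        logits = logits / temperature                 # temperature scaling
        for t in set(previous_ids):                   # penalise tokens generated before
            logits[t] = np.where(logits[t] > 0,
                                 logits[t] / repetition_penalty,
                                 logits[t] * repetition_penalty)
        return logits

    logits = np.random.randn(8).astype(np.float32)
    out = apply_penalties(logits, previous_ids=[2, 5],
                          temperature=0.7, repetition_penalty=2.0)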
--------------------------------------------------------------------------------
/src/fastertransformer/models/swin/SwinWeight.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include "src/fastertransformer/kernels/layernorm_kernels.h"
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include <vector>

namespace fastertransformer {

template<typename T>
class SwinTransformerBlockWeight {
public:
    AttentionWeight<T> attention_weights;
    FfnWeight<T>       ffn_weights;
    LayerNormWeight<T> attn_layernorm_weights;
    LayerNormWeight<T> ffn_layernorm_weights;
    const T*           attention_relative_pos_bias = nullptr;
};  // SwinTransformerBlockWeight

template<typename T>
class SwinTransformerBasicLayerWeight {
public:
    LayerNormWeight<T> merge_layernorm_weights;
    DenseWeight<T>     merge_linear_weights;
    const T*           attn_mask = nullptr;
    std::vector<SwinTransformerBlockWeight<T>> block_weight_list;
};  // SwinTransformerBasicLayerWeight

template<typename T>
class SwinTransformerWeight {
public:
    DenseWeight<T>     patchEmbed_linear_weights;
    LayerNormWeight<T> patchEmbed_norm_weights;
    LayerNormWeight<T> norm_weights;
    std::vector<SwinTransformerBasicLayerWeight<T>> basic_layer_weight_list;
};  // class SwinTransformerWeight

}  // namespace fastertransformer
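To make the nesting of these weight containers concrete, here is a small hypothetical sketch (not part of the repository) that assembles a SwinTransformerWeight<float> holding one basic layer with two blocks. Pointer members stay at their nullptr defaults; a real weight loader would point them at device buffers.

// Hypothetical helper, not repository code. Assumes the repository root is on the include path.
#include "src/fastertransformer/models/swin/SwinWeight.h"

using namespace fastertransformer;

// Builds an empty weight hierarchy: one basic layer containing two transformer blocks.
SwinTransformerWeight<float> buildToySwinWeight()
{
    SwinTransformerWeight<float> model_weight;

    SwinTransformerBasicLayerWeight<float> basic_layer;
    for (int block = 0; block < 2; block++) {
        SwinTransformerBlockWeight<float> block_weight;  // members default-initialized
        basic_layer.block_weight_list.push_back(block_weight);
    }

    model_weight.basic_layer_weight_list.push_back(basic_layer);
    return model_weight;
}

The vector-of-vectors layout mirrors the Swin architecture itself: transformer blocks nest inside basic layers, which nest inside the full model.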
--------------------------------------------------------------------------------
/examples/pytorch/gpt/utils/parallel_gpt.py:
--------------------------------------------------------------------------------
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import torch

from examples.pytorch.gpt.utils.gpt import GPT


class ParallelGPT(GPT):
    def __init__(self, head_num, size_per_head, vocab_size, start_id, end_id, layer_num, max_seq_len,
                 tensor_para_size, pipeline_para_size, lib_path, int8_mode):
        super().__init__(head_num, size_per_head, vocab_size, start_id, end_id, layer_num, max_seq_len,
                         tensor_para_size, pipeline_para_size, lib_path, int8_mode)

    def cuda(self):
        self.weights._map(lambda w: w.cuda(self.device))
        if self.int8_mode != 0:
            self.weights._map_int8(lambda w: w.cuda(self.device))

        if self.build_model:
            del self.model
            self.build_model = False
        self.model = torch.classes.FasterTransformer.ParallelGptOp(
            self.head_num, self.size_per_head, 4 * self.head_num * self.size_per_head,
            self.layer_num, self.vocab_size, self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size, self.int8_mode,
            self.weights.w, self.weights.int8_w, self.weights.scale)
        self.build_model = True

--------------------------------------------------------------------------------
/src/fastertransformer/layers/sampling_layers/CMakeLists.txt:
--------------------------------------------------------------------------------
# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.8)

add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels)

add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels)

add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)

add_library(TopKTopPSamplingLayer STATIC TopKTopPSamplingLayer.cu)
set_property(TARGET TopKTopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKTopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKTopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)

--------------------------------------------------------------------------------