├── examples ├── cpp │ ├── gptj │ │ ├── bad_words.csv │ │ ├── stop_words.csv │ │ ├── start_ids.csv │ │ ├── gptj_config.ini │ │ └── CMakeLists.txt │ ├── gpt │ │ ├── start_ids.csv │ │ ├── CMakeLists.txt │ │ └── gpt_config.ini │ ├── multi_gpu_gpt │ │ ├── start_ids.csv │ │ ├── gpt_example_utils.h │ │ ├── CMakeLists.txt │ │ └── gpt_config.ini │ ├── bert_int8 │ │ └── CMakeLists.txt │ ├── vit │ │ └── CMakeLists.txt │ ├── vit_int8 │ │ └── CMakeLists.txt │ ├── bert │ │ └── CMakeLists.txt │ ├── decoding │ │ └── CMakeLists.txt │ ├── swin_int8 │ │ └── CMakeLists.txt │ ├── swin │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── xlnet │ │ └── CMakeLists.txt ├── pytorch │ ├── bert │ │ └── bert-quantization-sparsity │ │ │ ├── checkpoints │ │ │ └── .keep │ │ │ ├── processors │ │ │ └── __init__.py │ │ │ ├── apex_sparsity │ │ │ └── __init__.py │ │ │ ├── scripts │ │ │ ├── docker │ │ │ │ ├── build.sh │ │ │ │ └── launch.sh │ │ │ └── data_download.sh │ │ │ ├── images │ │ │ ├── model.png │ │ │ ├── nvlamb.png │ │ │ └── loss_curves.png │ │ │ ├── NOTICE │ │ │ ├── requirements.txt │ │ │ ├── bert_config.json │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── NVIDIAPretrainedWeightDownloader.py │ │ │ ├── BooksDownloader.py │ │ │ ├── BookscorpusTextFormatting.py │ │ │ └── GLUEDownloader.py │ │ │ ├── .dockerignore │ │ │ ├── Dockerfile │ │ │ └── utils.py │ ├── gpt │ │ ├── requirement.txt │ │ └── utils │ │ │ ├── generate_start_ids.py │ │ │ ├── gpt_token_converter.py │ │ │ └── parallel_gpt.py │ ├── swin │ │ ├── Swin-Transformer-Quantization │ │ │ ├── __init__.py │ │ │ ├── run.sh │ │ │ ├── calib.sh │ │ │ └── qat.sh │ │ ├── run_test.sh │ │ ├── run_test_int8.sh │ │ └── run_test_int8_accuracy.sh │ ├── vit │ │ ├── requirement.txt │ │ ├── run.sh │ │ ├── run2.sh │ │ └── ViT-quantization │ │ │ ├── calib.sh │ │ │ └── qat.sh │ ├── t5 │ │ └── requirement.txt │ ├── requirement.txt │ ├── utils.py │ ├── codegeex │ │ └── utils │ │ │ └── tokenizer │ │ │ ├── special_tokens_map.json │ │ │ └── tokenizer_config.json │ └── decoding │ │ └── utils │ │ ├── __init__.py │ │ ├── download_model.sh │ │ └── recover_bpe.py ├── tensorflow │ ├── requirement.txt │ ├── bert │ │ ├── bert-quantization │ │ │ ├── NOTICE │ │ │ ├── __init__.py │ │ │ ├── .dockerignore │ │ │ ├── ft-tensorflow-quantization │ │ │ │ ├── ft_tensorflow_quantization │ │ │ │ │ ├── python │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── calib │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ ├── ops │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ │ └── layers │ │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── __init__.py │ │ │ │ └── setup.py │ │ │ ├── Dockerfile │ │ │ ├── CONTRIBUTING.md │ │ │ ├── fp16_utils.py │ │ │ └── gpu_environment.py │ │ ├── utils │ │ │ └── __init__.py │ │ └── tensorflow_bert │ │ │ └── __init__.py │ ├── xlnet │ │ └── downloadModel.sh │ ├── decoding │ │ └── utils │ │ │ ├── translation │ │ │ └── download_model_data.sh │ │ │ └── bleu_score.py │ └── gpt │ │ └── utils │ │ └── gpt_token_converter.py ├── tensorrt │ ├── swin │ │ ├── run_builder_fp16.sh │ │ ├── run_builder_fp32.sh │ │ ├── run_builder_int8.sh │ │ ├── run_infer_fp32.sh │ │ ├── run_infer_fp16.sh │ │ └── run_infer_int8.sh │ └── t5 │ │ └── createT5TestData.py └── CMakeLists.txt ├── .gitignore ├── docs └── images │ └── inference_performance.png ├── make_all.sh ├── src ├── CMakeLists.txt └── fastertransformer │ ├── tensorrt_plugin │ ├── t5 │ │ ├── T5PluginGemm.h │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ ├── vit │ │ └── CMakeLists.txt │ └── swin │ │ └── CMakeLists.txt │ ├── utils │ ├── 
cuda_bf16_wrapper.h │ ├── word_list.h │ ├── nvtx_utils.cc │ ├── convert_data_type.h │ ├── gemm_test │ │ ├── swin_gemm_func.h │ │ ├── encoder_gemm_func.h │ │ ├── xlnet_gemm_func.h │ │ └── swin_igemm_func.h │ ├── word_list.cc │ ├── mpi_utils.h │ └── memory_utils.h │ ├── th_op │ ├── gpt │ │ └── CMakeLists.txt │ ├── encoder │ │ └── CMakeLists.txt │ ├── decoder │ │ └── CMakeLists.txt │ ├── codegeex │ │ └── CMakeLists.txt │ ├── longformer │ │ └── CMakeLists.txt │ ├── t5 │ │ └── CMakeLists.txt │ ├── bert │ │ └── CMakeLists.txt │ ├── multi_gpu_gpt │ │ ├── CMakeLists.txt │ │ └── WeightTransposeCalibrateQuantizeOp.h │ ├── multi_gpu_codegeex │ │ ├── CMakeLists.txt │ │ └── WeightTransposeCalibrateQuantizeOp.h │ ├── decoding │ │ ├── CMakeLists.txt │ │ └── GatherTreeOp.h │ ├── vit │ │ └── CMakeLists.txt │ ├── th_traits.h │ ├── swin │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── th_utils.cu │ ├── tf_op │ ├── gpt │ │ └── CMakeLists.txt │ ├── decoding │ │ └── CMakeLists.txt │ ├── encoder │ │ └── CMakeLists.txt │ ├── bert │ │ └── CMakeLists.txt │ ├── decoder │ │ └── CMakeLists.txt │ └── CMakeLists.txt │ ├── triton_backend │ ├── CMakeLists.txt │ ├── t5 │ │ └── CMakeLists.txt │ ├── gptj │ │ └── CMakeLists.txt │ ├── multi_gpu_gpt │ │ └── CMakeLists.txt │ └── triton_utils.hpp │ ├── layers │ ├── FfnWeight.h │ ├── FfnINT8Weight.h │ ├── xlnet_attention_layers │ │ ├── CMakeLists.txt │ │ └── XlnetAttentionWeight.h │ ├── attention_layers │ │ └── AttentionWeight.h │ ├── attention_layers_int8 │ │ └── AttentionINT8Weight.h │ ├── DenseWeight.h │ ├── beam_search_layers │ │ └── CMakeLists.txt │ ├── DynamicDecodeBaseLayer.h │ └── sampling_layers │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ ├── kernels │ ├── transform_mask_kernels.h │ ├── int8_utils.cuh │ ├── matrix_transpose_kernels.h │ ├── quantization_int8_kernels.h │ ├── stop_criteria_kernels.h │ ├── quantize_weight.h │ ├── gen_relative_pos_bias.h │ ├── dequantize_kernels.h │ ├── activation_kernels.h │ ├── calibrate_quantize_weight_kernels.h │ ├── ban_bad_words.h │ ├── logprob_kernels.h │ ├── online_softmax_beamsearch_kernels.h │ ├── xlnet_preprocess_kernels.h │ ├── layout_transformer_int8_kernels.h │ ├── add_bias_transpose_kernels.h │ ├── reverse_roll_kernels.h │ ├── vit_kernels.h │ ├── beam_search_topk_kernels.h │ ├── custom_ar_kernels.h │ └── beam_search_penalty_kernels.h │ └── models │ ├── decoder │ └── CMakeLists.txt │ ├── CMakeLists.txt │ ├── longformer │ └── CMakeLists.txt │ ├── xlnet │ └── CMakeLists.txt │ ├── decoding │ └── CMakeLists.txt │ ├── bert │ └── CMakeLists.txt │ ├── vit_int8 │ └── CMakeLists.txt │ ├── vit │ └── CMakeLists.txt │ ├── bert_int8 │ └── CMakeLists.txt │ ├── swin_int8 │ └── CMakeLists.txt │ └── swin │ ├── CMakeLists.txt │ └── SwinWeight.h ├── tests ├── CMakeLists.txt └── unittests │ └── CMakeLists.txt ├── .gitmodules ├── 3rdparty ├── CMakeLists.txt └── trt_fused_multihead_attention │ ├── fused_multihead_attention_common.h │ └── CMakeLists.txt ├── post.py ├── README.md └── cmake └── FasterTransformerConfig.cmake.in /examples/cpp/gptj/bad_words.csv: -------------------------------------------------------------------------------- 1 | 7768,3908 2 | 1,2 3 | -------------------------------------------------------------------------------- /examples/cpp/gptj/stop_words.csv: -------------------------------------------------------------------------------- 1 | 287, 4346, 12 2 | 3, -1, -1 3 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/checkpoints/.keep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/processors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/pytorch/gpt/requirement.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | fire 3 | rouge_score 4 | transformers -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from SwinTransformer.config import get_config -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *build*/ 4 | ./models/ 5 | __pycache__/ 6 | .vscode 7 | ./translation 8 | .cache 9 | -------------------------------------------------------------------------------- /examples/pytorch/vit/requirement.txt: -------------------------------------------------------------------------------- 1 | ml_collections 2 | pytorch-quantization 3 | timm==0.4.12 4 | termcolor==1.1.0 5 | yacs -------------------------------------------------------------------------------- /examples/pytorch/t5/requirement.txt: -------------------------------------------------------------------------------- 1 | transformers==4.10.0 2 | tokenizers==0.10.1 3 | omegaconf 4 | SentencePiece 5 | sacrebleu 6 | -------------------------------------------------------------------------------- /examples/tensorflow/requirement.txt: -------------------------------------------------------------------------------- 1 | fire>=0.1.3 2 | regex==2017.4.5 3 | requests==2.21.0 4 | tqdm==4.31.1 5 | opennmt-tf==1.25.1 # for tf -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/apex_sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_masklib import create_mask 2 | from .asp import ASP 3 | -------------------------------------------------------------------------------- /docs/images/inference_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/docs/images/inference_performance.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build --network=host . 
--rm --pull --no-cache -t bert 3 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/examples/pytorch/bert/bert-quantization-sparsity/images/model.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/images/nvlamb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/examples/pytorch/bert/bert-quantization-sparsity/images/nvlamb.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/images/loss_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeGeeX/codegeex-fastertransformer/HEAD/examples/pytorch/bert/bert-quantization-sparsity/images/loss_curves.png -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/NOTICE: -------------------------------------------------------------------------------- 1 | BERT PyTorch 2 | 3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT 4 | licensed under the Apache License 2.0. 5 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/NOTICE: -------------------------------------------------------------------------------- 1 | BERT TensorFlow 2 | 3 | This repository includes software from https://github.com/google-research/bert 4 | licensed under the Apache License, Version 2.0 (the "License") -------------------------------------------------------------------------------- /examples/pytorch/requirement.txt: -------------------------------------------------------------------------------- 1 | fire>=0.1.3 2 | regex==2017.4.5 3 | requests==2.21.0 4 | tqdm==4.31.1 5 | opennmt-py==1.1.1 # for pytorch 6 | transformers==2.5.1 # for pytorch 7 | ml_collections # for pytorch 8 | sacrebleu # for pytorch -------------------------------------------------------------------------------- /examples/pytorch/vit/run.sh: -------------------------------------------------------------------------------- 1 | python infer_visiontransformer_int8_op.py \ 2 | --th-path=../../../build/lib/libpyt_vit.so \ 3 | --calibrated_dir /workspace/checkpoint/ViT-B_16_ft1_99.999_82.846.pth \ 4 | --img_size 384 \ 5 | --quant-mode ft1 6 | -------------------------------------------------------------------------------- /examples/pytorch/vit/run2.sh: -------------------------------------------------------------------------------- 1 | python infer_visiontransformer_int8_op.py \ 2 | --th-path=../../../build/lib/libpyt_vit.so \ 3 | --calibrated_dir /workspace/checkpoint/ViT-B_16_ft2_99.99_81.948.pth \ 4 | --img_size 384 \ 5 | --quant-mode ft2 6 | -------------------------------------------------------------------------------- /make_all.sh: -------------------------------------------------------------------------------- 1 | cd /workspace/FasterTransformer/ 2 | mkdir build 3 | cd build 4 | # Change DSM to the corresponding version of GPUs (e.g.
80 for A100, RTX 3090; 75 for RTX TITAN) 5 | cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. 6 | make -j12 7 | cd .. 8 | ./build/bin/codegeex_gemm 1 1 32 64 64 16348 50048 1 1 -------------------------------------------------------------------------------- /examples/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def print_memory_usage(info=""): 4 | t = torch.cuda.get_device_properties(0).total_memory / 1024**2 5 | r = torch.cuda.memory_reserved(0) / 1024**2 6 | a = torch.cuda.memory_allocated(0) / 1024**2 7 | f = r-a # free inside reserved 8 | print(f"[INFO][{info}] total_memory: {t}, reserved: {r}, allocated: {a}") 9 | -------------------------------------------------------------------------------- /examples/cpp/gpt/start_ids.csv: -------------------------------------------------------------------------------- 1 | 818, 262, 938, 3155, 286, 1528, 11, 257 2 | 198, 464, 968, 8221, 2732, 286, 15198, 318 3 | 464, 968, 1971, 12056, 423, 257, 649, 1182 4 | 464, 968, 1971, 3782, 468, 3199, 663, 5079 5 | 818, 257, 1445, 326, 481, 1884, 787, 340 6 | 464, 968, 1971, 12056, 6, 5859, 41683, 423 7 | 198, 198, 464, 5398, 4332, 628, 628, 198 8 | 464, 717, 640, 314, 2497, 262, 3807, 11 9 | -------------------------------------------------------------------------------- /examples/cpp/gptj/start_ids.csv: -------------------------------------------------------------------------------- 1 | 818, 262, 938, 3155, 286, 1528, 11, 257 2 | 198, 464, 968, 8221, 2732, 286, 15198, 318 3 | 464, 968, 1971, 12056, 423, 257, 649, 1182 4 | 464, 968, 1971, 3782, 468, 3199, 663, 5079 5 | 818, 257, 1445, 326, 481, 1884, 787, 340 6 | 464, 968, 1971, 12056, 6, 5859, 41683, 423 7 | 198, 198, 464, 5398, 4332, 628, 628, 198 8 | 464, 717, 640, 314, 2497, 262, 3807, 11 9 | -------------------------------------------------------------------------------- /examples/pytorch/swin/run_test.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_op.py \ 2 | --eval \ 3 | --data-path /workspace \ 4 | --cfg Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --batch-size $1 8 | -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/start_ids.csv: -------------------------------------------------------------------------------- 1 | 818, 262, 938, 3155, 286, 1528, 11, 257 2 | 198, 464, 968, 8221, 2732, 286, 15198, 318 3 | 464, 968, 1971, 12056, 423, 257, 649, 1182 4 | 464, 968, 1971, 3782, 468, 3199, 663, 5079 5 | 818, 257, 1445, 326, 481, 1884, 787, 340 6 | 464, 968, 1971, 12056, 6, 5859, 41683, 423 7 | 198, 198, 464, 5398, 4332, 628, 628, 198 8 | 464, 717, 640, 314, 2497, 262, 3807, 11 9 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | tqdm 3 | # Accessing files from S3 directly.
4 | boto3 5 | # Used for downloading models over HTTP 6 | requests 7 | six 8 | ipdb 9 | #Data processing 10 | h5py 11 | html2text 12 | nltk 13 | progressbar 14 | #Others 15 | onnxruntime 16 | git+https://github.com/NVIDIA/dllogger 17 | -------------------------------------------------------------------------------- /examples/pytorch/codegeex/utils/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}} 2 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_builder_fp16.sh: -------------------------------------------------------------------------------- 1 | python builder_fp16.py \ 2 | --batch-size 32 \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --output swin_transformer_fp16.engine 7 | 8 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_builder_fp32.sh: -------------------------------------------------------------------------------- 1 | python builder_fp32.py \ 2 | --batch-size 32 \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --output swin_transformer_fp32.engine 7 | 8 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_builder_int8.sh: -------------------------------------------------------------------------------- 1 | python builder_int8.py \ 2 | --batch-size 32 \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --output swin_transformer_int8.engine -------------------------------------------------------------------------------- /examples/pytorch/swin/run_test_int8.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_int8_op.py \ 2 | --profile \ 3 | --data-path /workspace \ 4 | --cfg 
Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --int8-mode 1\ 8 | --batch-size $1 9 | -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/run.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node 1 \ 3 | --master_port 12346 main.py \ 4 | --eval \ 5 | --cfg SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 6 | --resume ./calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 7 | --data-path /data/datasets/ILSVRC2012/ \ 8 | --quant-mode ft2\ 9 | --int8-mode 1\ 10 | --batch-size 128 11 | -------------------------------------------------------------------------------- /examples/pytorch/swin/run_test_int8_accuracy.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_int8_op.py \ 2 | --eval \ 3 | --data-path /workspace \ 4 | --cfg Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --int8-mode 1\ 8 | --batch-size $1 9 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_infer_fp32.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_plugin.py \ 2 | --eval \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --engine swin_transformer_fp32.engine \ 7 | --batch-size $1 8 | -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/calib.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 \ 2 | --master_port 12345 main.py \ 3 | --calib \ 4 | --cfg SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume swin_tiny_patch4_window7_224.pth \ 6 | --data-path /data/datasets/ILSVRC2012 \ 7 | --num-calib-batch 10 \ 8 | --calib-batchsz 8\ 9 | --int8-mode 1\ 10 | --calib-output-path calib-checkpoint 11 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_infer_fp16.sh: -------------------------------------------------------------------------------- 1 | python infer_swintransformer_plugin.py \ 2 | --eval \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/swin_tiny_patch4_window7_224.pth \ 5 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 6 | --engine swin_transformer_fp16.engine \ 7 | --batch-size $1 \ 8 | --use-fp16 9 | -------------------------------------------------------------------------------- /examples/tensorrt/swin/run_infer_int8.sh: -------------------------------------------------------------------------------- 1 | python 
infer_swintransformer_plugin_int8.py \ 2 | --eval \ 3 | --cfg ../../pytorch/swin/Swin-Transformer-Quantization/SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 4 | --resume ../../pytorch/swin/Swin-Transformer-Quantization/calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 5 | --int8-mode 1 \ 6 | --th-path ../../../build/lib/libpyt_swintransformer.so \ 7 | --engine swin_transformer_int8.engine \ 8 | --batch-size $1 9 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/scripts/docker/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD=${1:-/bin/bash} 4 | NV_VISIBLE_DEVICES=${2:-"all"} 5 | DOCKER_BRIDGE=${3:-"host"} 6 | 7 | docker run -it --rm \ 8 | --gpus device=$NV_VISIBLE_DEVICES \ 9 | --net=$DOCKER_BRIDGE \ 10 | --shm-size=1g \ 11 | --ulimit memlock=-1 \ 12 | --ulimit stack=67108864 \ 13 | -e LD_LIBRARY_PATH='/workspace/install/lib/' \ 14 | -v $PWD:/workspace/bert \ 15 | -v $PWD/results:/results \ 16 | bert $CMD 17 | -------------------------------------------------------------------------------- /examples/pytorch/vit/ViT-quantization/calib.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 \ 2 | --master_port 12345 main.py \ 3 | --calib \ 4 | --name vit \ 5 | --pretrained_dir $CKPT_DIR/ViT-B_16.npz \ 6 | --data-path $DATA_DIR \ 7 | --model_type ViT-B_16 \ 8 | --img_size 384 \ 9 | --num-calib-batch 10 \ 10 | --calib-batchsz 8 \ 11 | --quant-mode ft2 \ 12 | --calibrator percentile \ 13 | --percentile 99.99 \ 14 | --calib-output-path calib-checkpoint 15 | -------------------------------------------------------------------------------- /examples/pytorch/vit/ViT-quantization/qat.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node 1 \ 2 | --master_port 12345 main.py \ 3 | --train \ 4 | --name vit \ 5 | --pretrained_dir calib-checkpoint/ViT-B_16_calib.pth \ 6 | --data-path $DATA_DIR \ 7 | --model_type ViT-B_16 \ 8 | --quant-mode ft2 \ 9 | --img_size 384 \ 10 | --distill \ 11 | --teacher $CKPT_DIR/ViT-B_16.npz \ 12 | --output qat_output \ 13 | --quant-mode ft2\ 14 | --batch-size 16 \ 15 | --num-epochs 5 \ 16 | --qat-lr 1e-4 -------------------------------------------------------------------------------- /examples/pytorch/swin/Swin-Transformer-Quantization/qat.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch \ 2 | --nproc_per_node 1 --master_port 12346 main.py \ 3 | --train \ 4 | --cfg SwinTransformer/configs/swin_tiny_patch4_window7_224.yaml \ 5 | --resume ./calib-checkpoint/swin_tiny_patch4_window7_224_calib.pth \ 6 | --data-path /data/datasets/ILSVRC2012 \ 7 | --quant-mode ft2 \ 8 | --teacher swin_tiny_patch4_window7_224.pth \ 9 | --output qat-output \ 10 | --distill \ 11 | --int8-mode 1\ 12 | --batch-size 32\ 13 | --num-epochs 3 \ 14 | --qat-lr 1e-5 15 | -------------------------------------------------------------------------------- /examples/pytorch/codegeex/utils/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", 
"single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"} 2 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/tensorflow_bert/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /examples/pytorch/decoding/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(cpp) -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(fastertransformer) -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(unittests) 16 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/Megatron-LM"] 2 | path = 3rdparty/Megatron-LM 3 | url = https://github.com/NVIDIA/Megatron-LM.git 4 | branch = v2.6 5 | [submodule "examples/tensorflow/bert/tensorflow_bert/bert"] 6 | path = examples/tensorflow/bert/tensorflow_bert/bert 7 | url = https://github.com/google-research/bert.git 8 | [submodule "examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer"] 9 | path = examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer 10 | url = https://github.com/microsoft/Swin-Transformer 11 | [submodule "examples/pytorch/vit/ViT-quantization/ViT-pytorch"] 12 | path = examples/pytorch/vit/ViT-quantization/ViT-pytorch 13 | url = https://github.com/jeonsworld/ViT-pytorch 14 | -------------------------------------------------------------------------------- /3rdparty/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(trt_fused_multihead_attention) -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/t5/T5PluginGemm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | int t5_gemm(int argv[16]); 18 | -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(swin) 16 | add_subdirectory(t5) 17 | add_subdirectory(vit) 18 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/cuda_bf16_wrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #ifdef ENABLE_BF16 20 | #include 21 | #endif -------------------------------------------------------------------------------- /src/fastertransformer/th_op/gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_gpt SHARED GptOp.cc) 16 | target_link_libraries(th_gpt PRIVATE "${TORCH_LIBRARIES}" ParallelGpt th_utils) 17 | -------------------------------------------------------------------------------- /examples/cpp/gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(gpt_example gpt_example.cc) 16 | target_link_libraries(gpt_example PUBLIC -lcublas -lcublasLt -lcudart ParallelGpt nvtx_utils) 17 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/.dockerignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | data/download 15 | data/extracted 16 | data/formatted_one_article_per_line 17 | data/sharded 18 | data/hdf5 19 | vocab/ 20 | results/ -------------------------------------------------------------------------------- /src/fastertransformer/th_op/encoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_encoder SHARED EncoderOp.cc) 16 | target_link_libraries(th_encoder PRIVATE "${TORCH_LIBRARIES}" Bert th_utils) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_library(tf_gpt SHARED GptOp.cc) 16 | target_link_libraries(tf_gpt PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart ParallelGpt) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_decoder SHARED DecoderOp.cc) 16 | target_link_libraries(th_decoder PRIVATE "${TORCH_LIBRARIES}" Decoder th_utils) 17 | -------------------------------------------------------------------------------- /examples/cpp/bert_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(bert_int8_example bert_int8_example.cc) 16 | target_link_libraries(bert_int8_example PUBLIC -lcublas -lcublasLt -lcudart BertINT8) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_decoding SHARED DecodingOp.cc) 16 | target_link_libraries(tf_decoding PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart Decoding) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/codegeex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_codegeex SHARED CodegeexOp.cc) 16 | target_link_libraries(th_codegeex PRIVATE "${TORCH_LIBRARIES}" ParallelCodegeex th_utils) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/t5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(trt_t5 SHARED T5PluginGemm.cc T5Plugin.cu) 16 | target_link_libraries(trt_t5 PRIVATE T5Encoder T5Decoding t5_gemm_func -lnvinfer) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/encoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_encoder SHARED EncoderOp.cc) 16 | target_link_libraries(tf_encoder PRIVATE Bert ${tf_link} -lcublas -lcublasLt -lcudart cublasAlgoMap) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/longformer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_longformer SHARED LongformerEncoderOp.cc) 16 | target_link_libraries(th_longformer PRIVATE "${TORCH_LIBRARIES}" LongformerEncoder th_utils) -------------------------------------------------------------------------------- /src/fastertransformer/th_op/t5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_t5 SHARED T5EncoderOp.cc T5DecoderOp.cc T5DecodingOp.cc) 16 | target_link_libraries(th_t5 PRIVATE "${TORCH_LIBRARIES}" T5Encoder T5Decoder T5Decoding th_utils) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_bert SHARED BertOp.cc BertINT8Op.cc WeightQuantizeOp.cc) 16 | target_link_libraries(th_bert PRIVATE "${TORCH_LIBRARIES}" Bert BertINT8 th_utils quantize_weight) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | if(BUILD_MULTI_GPU) 18 | add_subdirectory(gptj) 19 | add_subdirectory(t5) 20 | add_subdirectory(multi_gpu_gpt) 21 | endif() -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_bert SHARED BertOp.cc BertINT8Op.cc weight_quantize_op.cc) 16 | target_link_libraries(tf_bert PRIVATE Bert BertINT8 ${tf_link} -lcublas -lcublasLt -lcudart cublasAlgoMap) 17 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/.dockerignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | .idea/ 15 | .git/ 16 | __pycache__/ 17 | results/ 18 | data/binary 19 | data/download 20 | data/extracted 21 | data/formatted_one_article_per_line 22 | data/sharded 23 | data/hdf5* 24 | data/tfrecord* 25 | checkpoints/ 26 | -------------------------------------------------------------------------------- /examples/cpp/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | add_executable(vit_example vit_example.cc) 17 | target_link_libraries(vit_example PUBLIC ViT trt_fused_multi_head_attention vit_kernels 18 | cublasMMWrapper -lcublas -lcublasLt -lcudart -lcudnn -lm) 19 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_parallel_gpt SHARED ParallelGptOp.cc WeightTransposeCalibrateQuantizeOp.cc) 16 | target_link_libraries(th_parallel_gpt PRIVATE "${TORCH_LIBRARIES}" ParallelGpt th_utils calibrate_quantize_weight_kernels) 17 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/word_list.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "Tensor.h" 20 | #include "stdlib.h" 21 | 22 | namespace fastertransformer { 23 | 24 | int read_word_list(const std::string& filename, std::vector<int>& tensor_data); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/calib/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/ops/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/python/layers/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | -------------------------------------------------------------------------------- /examples/cpp/gptj/gptj_config.ini: -------------------------------------------------------------------------------- 1 | [ft_instance_hyperparameter] 2 | max_batch_size=8 ; Use for allocate the buffer 3 | max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter 4 | beam_width=1 ; beam width for beam search 5 | top_k=0 ; k value for top k sampling 6 | top_p=0.5 ; p value for top p sampling 7 | temperature=1.0 ; Use for sampling 8 | repetition_penalty=2.0 ; Use for sampling 9 | len_penalty=1.0 10 | beam_search_diversity_rate=0.0 11 | is_half=0 12 | enable_custom_all_reduce=0 13 | 14 | tensor_para_size=8 15 | pipeline_para_size=1 16 | 17 | model_name=gptj_6B 18 | model_dir=../models/j6b_ckpt/ 19 | 20 | [request] 21 | request_batch_size=8 # determine by the request 22 | request_output_len=32 # determine by the request 23 | 24 | [gptj_6B] 25 | head_num=16 26 | size_per_head=256 27 | vocab_size=50400 28 | decoder_layers=28 29 | rotary_embedding=64 30 | start_id=50256 31 | end_id=50256 32 | inter_size=16384 33 | -------------------------------------------------------------------------------- /examples/cpp/vit_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | add_executable(vit_int8_example vit_int8_example.cc) 17 | target_link_libraries(vit_int8_example PUBLIC ViTINT8 trt_fused_multi_head_attention vit_kernels 18 | cublasMMWrapper -lcublas -lcublasLt -lcudart -lcudnn -lm) 19 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_codegeex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_parallel_codegeex SHARED ParallelCodegeexOp.cc WeightTransposeCalibrateQuantizeOp.cc) 16 | target_link_libraries(th_parallel_codegeex PRIVATE "${TORCH_LIBRARIES}" ParallelCodegeex th_utils calibrate_quantize_weight_kernels) 17 | -------------------------------------------------------------------------------- /examples/cpp/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(bert_example bert_example.cc) 16 | if (SPARSITY_SUPPORT) 17 | target_link_libraries(bert_example PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt Bert) 18 | else() 19 | target_link_libraries(bert_example PUBLIC -lcublas -lcublasLt -lcudart Bert) 20 | endif() 21 | -------------------------------------------------------------------------------- /examples/cpp/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_executable(decoding_example decoding_example.cc) 16 | target_link_libraries(decoding_example PUBLIC -lcublasLt Decoding nvtx_utils) 17 | 18 | add_executable(layernorm_test layernorm_test.cc) 19 | target_link_libraries(layernorm_test PUBLIC -lcublasLt layernorm_kernels memory_utils) 20 | -------------------------------------------------------------------------------- /examples/cpp/swin_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | add_executable(swin_int8_example swin_int8_example.cc) 17 | target_link_libraries(swin_int8_example PUBLIC trt_fused_multi_head_attention SwinINT8 cublasAlgoMap 18 | cublasINT8MMWrapper quantize_weight memory_utils -lcublasLt -lcublas -lcudart -lcudnn) 19 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(th_decoding SHARED DecodingOp.cc) 16 | target_link_libraries(th_decoding PRIVATE "${TORCH_LIBRARIES}" Decoding th_utils) 17 | 18 | add_library(th_gather_tree SHARED GatherTreeOp.cc) 19 | target_link_libraries(th_gather_tree PRIVATE "${TORCH_LIBRARIES}" decoding_kernels th_utils) -------------------------------------------------------------------------------- /examples/cpp/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | set(swin_transformer_nv_files 17 | swin_example.cc 18 | ) 19 | add_executable(swin_example ${swin_transformer_nv_files}) 20 | target_link_libraries(swin_example PUBLIC trt_fused_multi_head_attention Swin 21 | cublasMMWrapper memory_utils -lcublas -lcublasLt -lcudart -lcudnn) 22 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/FfnWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "DenseWeight.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct FfnWeight { 25 | DenseWeight<T> intermediate_weight; 26 | DenseWeight<T> output_weight; 27 | }; 28 | 29 | } // namespace fastertransformer 30 | -------------------------------------------------------------------------------- /src/fastertransformer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(utils) 16 | add_subdirectory(kernels) 17 | add_subdirectory(layers) 18 | add_subdirectory(models) 19 | if(BUILD_TF) 20 | add_subdirectory(tf_op) 21 | endif() 22 | if(BUILD_PYT) 23 | add_subdirectory(th_op) 24 | endif() 25 | add_subdirectory(triton_backend) 26 | if(BUILD_TRT) 27 | add_subdirectory(tensorrt_plugin) 28 | endif() 29 | -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(tf_decoder SHARED DecoderOp.cc) 16 | target_link_libraries(tf_decoder PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart Decoder) 17 | 18 | add_library(tf_fused_self_attention SHARED FusedSelfAttentionOp.cc) 19 | target_link_libraries(tf_fused_self_attention PRIVATE ${tf_link} -lcublas -lcublasLt -lcudart DecoderSelfAttentionLayer) 20 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/FfnINT8Weight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "FfnWeight.h" 20 | #include "src/fastertransformer/utils/ScaleList.h" 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct FfnINT8Weight: FfnWeight<T> { 25 | ScaleList* scale_list_ptr; 26 | }; 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /examples/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(bert) 16 | add_subdirectory(bert_int8) 17 | add_subdirectory(decoding) 18 | add_subdirectory(gpt) 19 | add_subdirectory(xlnet) 20 | add_subdirectory(swin) 21 | add_subdirectory(swin_int8) 22 | add_subdirectory(vit) 23 | add_subdirectory(vit_int8) 24 | 25 | if(BUILD_MULTI_GPU) 26 | add_subdirectory(gptj) 27 | add_subdirectory(multi_gpu_gpt) 28 | endif() 29 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/t5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | set(t5_triton_backend_files 18 | T5TritonModel.cc 19 | T5TritonModelInstance.cc 20 | ) 21 | 22 | add_library(T5TritonBackend SHARED ${t5_triton_backend_files}) 23 | target_link_libraries(T5TritonBackend PRIVATE T5Encoder T5Decoding) 24 | target_compile_features(T5TritonBackend PRIVATE cxx_std_14) 25 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_gpt/WeightTransposeCalibrateQuantizeOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "src/fastertransformer/kernels/calibrate_quantize_weight_kernels.h" 18 | #include "src/fastertransformer/th_op/th_utils.h" 19 | 20 | namespace torch_ext { 21 | using torch::Tensor; 22 | 23 | std::vector<Tensor> weight_transpose_calibrate_quantize(Tensor weight); 24 | 25 | } // namespace torch_ext 26 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/multi_gpu_codegeex/WeightTransposeCalibrateQuantizeOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #include "src/fastertransformer/kernels/calibrate_quantize_weight_kernels.h" 18 | #include "src/fastertransformer/th_op/th_utils.h" 19 | 20 | namespace torch_ext { 21 | using torch::Tensor; 22 | 23 | std::vector<Tensor> weight_transpose_calibrate_quantize(Tensor weight); 24 | 25 | } // namespace torch_ext 26 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/gptj/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | set(parallel_gpt_triton_backend_files 18 | GptJTritonModel.cc 19 | GptJTritonModelInstance.cc 20 | ) 21 | 22 | add_library(GptJTritonBackend SHARED ${parallel_gpt_triton_backend_files}) 23 | target_link_libraries(GptJTritonBackend PRIVATE GptJ) 24 | target_compile_features(GptJTritonBackend PRIVATE cxx_std_14) 25 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/transform_mask_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include <cuda_fp16.h> 19 | #include <cuda_runtime.h> 20 | #include <stdint.h> 21 | #include <stdlib.h> 22 | 23 | namespace fastertransformer { 24 | 25 | void invokeTransformMask( 26 | half* tranformed_mask, const half* mask, const uint32_t B, const uint32_t S, cudaStream_t stream); 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/decoding/GatherTreeOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "src/fastertransformer/kernels/decoding_kernels.h" 18 | #include "src/fastertransformer/th_op/th_utils.h" 19 | 20 | namespace th = torch; 21 | 22 | namespace torch_ext { 23 | 24 | th::Tensor 25 | gather_tree(th::Tensor step_ids, th::Tensor parent_ids, th::Tensor max_sequence_lengths, th::Tensor end_tokens); 26 | 27 | } // namespace torch_ext -------------------------------------------------------------------------------- /src/fastertransformer/kernels/int8_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include <cuda_fp16.h> 19 | #include <cuda_runtime.h> 20 | #include <stdint.h> 21 | 22 | static inline __device__ int8_t float_to_int8_rn(float x) 23 | { 24 | uint32_t dst; 25 | asm volatile("cvt.rni.sat.s8.f32 %0, %1;" 26 | : "=r"(dst) 27 | : "f"(x)); 28 | return reinterpret_cast<const int8_t&>(dst); 29 | } 30 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/matrix_transpose_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cuda_fp16.h> 20 | #include <cuda_runtime.h> 21 | #include <stdlib.h> 22 | 23 | namespace fastertransformer { 24 | 25 | template<typename T> 26 | void invokeMatrixTranspose(T* dst, const T* src, const int m, const int n, cudaStream_t stream); 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/quantization_int8_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include <cuda_fp16.h> 21 | #include <cuda_runtime.h> 22 | 23 | namespace fastertransformer { 24 | 25 | template<typename T> 26 | void invokeQuantization(int8_t* dst, const T* src, const int size, const float* scale_ptr, cudaStream_t stream); 27 | 28 | } // namespace fastertransformer 29 | -------------------------------------------------------------------------------- /examples/pytorch/decoding/utils/download_model.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MAIN_PATH=$PWD 17 | 18 | mkdir -p $MAIN_PATH/pytorch/translation/models/ 19 | 20 | cd $MAIN_PATH/pytorch/translation/models/ 21 | if [ ! -f "sentencepiece.model" ] || [ ! -f "averaged-10-epoch.pt" ]; then 22 | wget -c https://s3.amazonaws.com/opennmt-models/transformer-ende-wmt-pyOnmt.tar.gz 23 | tar -xzvf transformer-ende-wmt-pyOnmt.tar.gz 24 | rm transformer-ende-wmt-pyOnmt.tar.gz 25 | fi 26 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/multi_gpu_gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | set(parallel_gpt_triton_backend_files 18 | ParallelGptTritonModel.cc 19 | ParallelGptTritonModelInstance.cc 20 | ) 21 | 22 | add_library(ParallelGptTritonBackend SHARED ${parallel_gpt_triton_backend_files}) 23 | target_link_libraries(ParallelGptTritonBackend PRIVATE ParallelGpt) 24 | target_compile_features(ParallelGptTritonBackend PRIVATE cxx_std_14) -------------------------------------------------------------------------------- /src/fastertransformer/models/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Decoder STATIC Decoder.cc) 18 | set_property(TARGET Decoder PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Decoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Decoder PUBLIC -lcudart cublasMMWrapper DecoderSelfAttentionLayer 21 | DecoderCrossAttentionLayer FfnLayer layernorm_kernels add_residual_kernels) -------------------------------------------------------------------------------- /src/fastertransformer/layers/xlnet_attention_layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(XlnetAttentionLayer STATIC XlnetAttentionLayer.cc) 18 | set_property(TARGET XlnetAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET XlnetAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(XlnetAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils xlnet_attention_kernels) 21 | 22 | -------------------------------------------------------------------------------- /src/fastertransformer/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(bert) 16 | add_subdirectory(bert_int8) 17 | add_subdirectory(decoder) 18 | add_subdirectory(longformer) 19 | add_subdirectory(decoding) 20 | add_subdirectory(xlnet) 21 | 22 | add_subdirectory(t5) 23 | add_subdirectory(gptj) 24 | add_subdirectory(multi_gpu_gpt) 25 | add_subdirectory(multi_gpu_codegeex) 26 | add_subdirectory(swin) 27 | add_subdirectory(swin_int8) 28 | add_subdirectory(vit) 29 | add_subdirectory(vit_int8) 30 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/nvtx_utils.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "nvtx_utils.h" 18 | 19 | namespace nvtx { 20 | std::string getScope() 21 | { 22 | return scope; 23 | } 24 | void addScope(std::string name) 25 | { 26 | scope = scope + name + "/"; 27 | return; 28 | } 29 | void setScope(std::string name) 30 | { 31 | scope = name + "/"; 32 | return; 33 | } 34 | void resetScope() 35 | { 36 | scope = ""; 37 | return; 38 | } 39 | } // namespace nvtx 40 | -------------------------------------------------------------------------------- /examples/cpp/gptj/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_executable(gptj_example gptj_example.cc) 16 | target_link_libraries(gptj_example PUBLIC -lcublas -lcublasLt -lcudart 17 | GptJ nvtx_utils -lmpi gpt_example_utils word_list) 18 | 19 | add_executable(gptj_triton_example gptj_triton_example.cc) 20 | target_link_libraries(gptj_triton_example PUBLIC -lcublas -lcublasLt -lcudart 21 | GptJTritonBackend custom_ar_comm -lmpi gpt_example_utils word_list -lpthread) 22 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/attention_layers/AttentionWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/layers/DenseWeight.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct AttentionWeight { 25 | DenseWeight<T> query_weight; 26 | DenseWeight<T> key_weight; 27 | DenseWeight<T> value_weight; 28 | DenseWeight<T> attention_output_weight; 29 | }; 30 | 31 | } // namespace fastertransformer 32 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/NVIDIAPretrainedWeightDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import os 15 | 16 | class NVIDIAPretrainedWeightDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path + '/nvidia_pretrained_weights' 19 | 20 | if not os.path.exists(self.save_path): 21 | os.makedirs(self.save_path) 22 | 23 | pass 24 | 25 | 26 | def download(self): 27 | assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' -------------------------------------------------------------------------------- /src/fastertransformer/layers/attention_layers_int8/AttentionINT8Weight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" 20 | #include "src/fastertransformer/utils/ScaleList.h" 21 | 22 | namespace fastertransformer { 23 | 24 | template<typename T> 25 | struct AttentionINT8Weight: AttentionWeight<T> { 26 | ScaleList* scale_list_ptr; 27 | }; 28 | 29 | } // namespace fastertransformer 30 | -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | set(vit_trt_files 17 | ViTPlugin.cpp 18 | ) 19 | 20 | 21 | if(BUILD_TRT) 22 | set(LIB_NAME "vit_plugin") 23 | add_library(${LIB_NAME} SHARED ${vit_trt_files}) 24 | set_target_properties(${LIB_NAME} PROPERTIES 25 | CUDA_RESOLVE_DEVICE_SYMBOLS ON) 26 | target_link_libraries(${LIB_NAME} trt_fused_multi_head_attention ViT -lcudnn -lcublas -lcudart -lnvinfer) 27 | endif() 28 | -------------------------------------------------------------------------------- /examples/tensorflow/xlnet/downloadModel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | data_dir=./data/ 17 | if [[ ! 
-e $data_dir ]]; then 18 | mkdir $data_dir 19 | fi 20 | 21 | wget https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip 22 | unzip cased_L-12_H-768_A-12.zip 23 | mv xlnet_cased_L-12_H-768_A-12 ${data_dir} 24 | mv cased_L-12_H-768_A-12.zip ${data_dir} 25 | 26 | wget https://dl.fbaipublicfiles.com/glue/data/STS-B.zip 27 | unzip STS-B.zip 28 | mv STS-B ${data_dir} 29 | mv STS-B.zip ${data_dir} 30 | 31 | -------------------------------------------------------------------------------- /examples/cpp/xlnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_library(cnpy STATIC cnpy.cpp) 16 | target_link_libraries(cnpy PUBLIC -lz) 17 | set_property(TARGET cnpy PROPERTY POSITION_INDEPENDENT_CODE ON) 18 | 19 | add_executable(xlnet_example xlnet_example.cc) 20 | target_link_libraries(xlnet_example PUBLIC -lcublas -lcublasLt -lcudart -lz cnpy Xlnet) 21 | 22 | add_executable(xlnet_correctness_example xlnet_correctness_example.cc) 23 | target_link_libraries(xlnet_correctness_example PUBLIC -lcublas -lcublasLt -lcudart -lz cnpy Xlnet) 24 | 25 | -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/gpt_example_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <string> 18 | #include <vector> 19 | 20 | namespace fastertransformer { 21 | 22 | int read_start_ids(int batch_size, 23 | std::vector<int>* v_start_lengths, 24 | std::vector<int>* v_start_ids, 25 | int& max_input_len, 26 | const int end_id, 27 | const int beam_width, 28 | std::string file_name); 29 | 30 | } // namespace fastertransformer 31 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/DenseWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "stdlib.h" 19 | namespace fastertransformer { 20 | 21 | template<typename T> 22 | struct DenseWeight { 23 | const T* kernel = nullptr; 24 | const T* bias = nullptr; 25 | const T* sp_kernel = nullptr; 26 | // for int8 kernel 27 | const int8_t* int8_kernel = nullptr; 28 | const int8_t* int4_kernel = nullptr; 29 | const T* quant_scale = nullptr; 30 | const float* scale = nullptr; 31 | }; 32 | 33 | } // namespace fastertransformer -------------------------------------------------------------------------------- /src/fastertransformer/layers/xlnet_attention_layers/XlnetAttentionWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/layers/DenseWeight.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template<typename T> 24 | struct XlnetAttentionWeight { 25 | T* attr_kernel_Q; 26 | T* attr_kernel_K; 27 | T* attr_kernel_V; 28 | T* attr_bias_Q_w; 29 | T* attr_bias_Q_r; 30 | T* attr_bias_Q_s; 31 | 32 | T* attr_pos_emb; 33 | T* attr_seg_embed; 34 | T* attr_proj_o; 35 | }; 36 | 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /src/fastertransformer/models/longformer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(LongformerEncoder STATIC LongformerEncoder.cc) 18 | set_property(TARGET LongformerEncoder PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET LongformerEncoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(LongformerEncoder PUBLIC -lcublas -lcudart -lcurand cublasMMWrapper 21 | LongformerAttentionLayer longformer_kernels add_bias_transpose_kernels 22 | activation_kernels layernorm_kernels FfnLayer) -------------------------------------------------------------------------------- /src/fastertransformer/tensorrt_plugin/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.8) 15 | 16 | set(swintransformer_trt_files 17 | swinTransformerPlugin.cpp 18 | swinTransformerINT8Plugin.cpp 19 | ) 20 | 21 | 22 | if(BUILD_TRT) 23 | set(LIB_NAME "swinTransformer_plugin") 24 | add_library(${LIB_NAME} SHARED ${swintransformer_trt_files}) 25 | set_target_properties(${LIB_NAME} PROPERTIES 26 | CUDA_RESOLVE_DEVICE_SYMBOLS ON) 27 | target_link_libraries(${LIB_NAME} trt_fused_multi_head_attention Swin SwinINT8 -lcudnn -lcublas -lcudart -lnvinfer) 28 | endif() 29 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.13) 15 | 16 | set(vit_ths_files 17 | ViTOp.cc 18 | ViTINT8Op.cc 19 | WeightQuantizeOp.cc 20 | ) 21 | 22 | add_definitions(-DTORCH_CUDA=1) 23 | 24 | if(BUILD_PYT) 25 | set(LIB_NAME "pyt_vit") 26 | add_library(${LIB_NAME} SHARED ${vit_ths_files}) 27 | set_target_properties(${LIB_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) 28 | target_link_libraries(${LIB_NAME} ViT ViTINT8 quantize_weight cublasMMWrapper trt_fused_multi_head_attention 29 | -lcudnn -lcublas -lcudart "${TORCH_LIBRARIES}") 30 | endif() 31 | -------------------------------------------------------------------------------- /src/fastertransformer/models/xlnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Xlnet STATIC Xlnet.cc) 18 | set_property(TARGET Xlnet PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Xlnet PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Xlnet PUBLIC -lcudart xlnet_preprocess_kernels cublasMMWrapper 21 | XlnetAttentionLayer FfnLayer layernorm_kernels) 22 | 23 | add_executable(xlnet_gemm xlnet_gemm.cc) 24 | target_link_libraries(xlnet_gemm PUBLIC -lcublas -lcublasLt -lcudart xlnet_gemm_func xlnet_gemm_func memory_utils) 25 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.03-tf1-py3 2 | 3 | FROM ${FROM_IMAGE_NAME} 4 | 5 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl 6 | 7 | RUN pip install toposort networkx pytest nltk tqdm html2text progressbar 8 | 9 | WORKDIR /workspace 10 | RUN git clone https://github.com/openai/gradient-checkpointing.git 11 | RUN git clone https://github.com/attardi/wikiextractor.git 12 | RUN git clone https://github.com/soskek/bookcorpus.git 13 | RUN git clone https://github.com/titipata/pubmed_parser 14 | 15 | RUN pip3 install /workspace/pubmed_parser 16 | 17 | #Copy the perf_client over 18 | ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/tensorrt-inference-server/releases/download/v1.5.0/v1.5.0_ubuntu1804.clients.tar.gz 19 | RUN mkdir -p /workspace/install \ 20 | && curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install 21 | 22 | #Install the python wheel with pip 23 | RUN pip install /workspace/install/python/tensorrtserver*.whl 24 | 25 | WORKDIR /workspace/bert 26 | COPY . . 
27 | 28 | ENV PYTHONPATH /workspace/bert 29 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data 30 | ENV PATH //workspace/install/bin:${PATH} 31 | ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH} -------------------------------------------------------------------------------- /src/fastertransformer/models/decoding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Decoding STATIC Decoding.cc) 18 | set_property(TARGET Decoding PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Decoding PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Decoding PUBLIC -lcublas -lcudart -lcurand Decoder decoding_kernels 21 | BeamSearchLayer DynamicDecodeLayer) 22 | 23 | add_executable(decoding_gemm decoding_gemm.cc) 24 | target_link_libraries(decoding_gemm PUBLIC -lcublas -lcublasLt -lcudart decoding_gemm_func memory_utils) 25 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/convert_data_type.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "stdio.h" 19 | #include "stdlib.h" 20 | 21 | // be consistent with FasterTransformer 22 | int8_t float_to_int8_rn_host(float x) 23 | { 24 | int8_t res; 25 | int32_t tmp; 26 | if (x >= 0) { 27 | tmp = int(x + 0.5); 28 | tmp = tmp > 127 ? 127 : tmp; 29 | res = int8_t(tmp); 30 | } 31 | else { 32 | tmp = int(x - 0.5); 33 | tmp = tmp < -127 ? -127 : tmp; 34 | res = int8_t(tmp); 35 | } 36 | return res; 37 | } -------------------------------------------------------------------------------- /src/fastertransformer/tf_op/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_definitions(-DGOOGLE_CUDA=1) 16 | 17 | if(EXISTS ${TF_PATH}libtensorflow_framework.so) 18 | set(tf_link 19 | -ltensorflow_framework 20 | ) 21 | elseif(EXISTS ${TF_PATH}libtensorflow_framework.so.1) 22 | set(tf_link 23 | -l:libtensorflow_framework.so.1 24 | ) 25 | elseif(EXISTS ${TF_PATH}libtensorflow_framework.so.2) 26 | set(tf_link 27 | -l:libtensorflow_framework.so.2 28 | ) 29 | endif() 30 | 31 | add_subdirectory(bert) 32 | add_subdirectory(encoder) 33 | add_subdirectory(decoder) 34 | add_subdirectory(decoding) 35 | add_subdirectory(gpt) -------------------------------------------------------------------------------- /3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | namespace fastertransformer 21 | { 22 | enum Data_type 23 | { 24 | DATA_TYPE_BOOL, 25 | DATA_TYPE_E8M10, 26 | DATA_TYPE_E8M7, 27 | DATA_TYPE_FP16, 28 | DATA_TYPE_FP32, 29 | DATA_TYPE_INT4, 30 | DATA_TYPE_INT8, 31 | DATA_TYPE_INT32 32 | }; 33 | 34 | constexpr int32_t kSM_70 = 70; 35 | constexpr int32_t kSM_72 = 72; 36 | constexpr int32_t kSM_75 = 75; 37 | constexpr int32_t kSM_80 = 80; 38 | constexpr int32_t kSM_86 = 86; 39 | } // namespace fastertransformer 40 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/BooksDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import subprocess 15 | 16 | class BooksDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path 19 | pass 20 | 21 | 22 | def download(self): 23 | bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' 24 | bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' 25 | bookscorpus_download_command += ' --trash-bad-count' 26 | bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) 27 | -------------------------------------------------------------------------------- /post.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import requests 4 | 5 | url = 'http://0.0.0.0:5000/code' 6 | 7 | headers = { 8 | "Content-Type": "application/json; charset=UTF-8", 9 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36", 10 | } 11 | 12 | sentence = "# language: python\n# write a quick sort function\ndef" 13 | 14 | results=[] 15 | for i in range(1): 16 | data = json.dumps({'ability': 'seo_article_creation', 17 | 'context': sentence, 18 | 'temperature':1.0 , 19 | 'top_k': 1, 20 | 'top_p': 0.0, 21 | 'max_seq_len': 256, 22 | 'len_penalty': 1.0, 23 | 'repetition_penalty': 1.0, 24 | 'presence_penalty': 1.0, 25 | 'frequency_penalty': 1.0, 26 | 'end_tokens': [], 27 | }) 28 | time1=time.time() 29 | r = requests.post(url, data, headers=headers) 30 | time2=time.time() 31 | print("time used",time2-time1) 32 | print(r.json()['generated']) 33 | rdict=json.loads(r.text) 34 | result={"sentence":sentence,"result":rdict['generated']} 35 | results.append(result) -------------------------------------------------------------------------------- /src/fastertransformer/models/bert/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(Bert STATIC Bert.cc) 18 | set_property(TARGET Bert PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Bert PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Bert PUBLIC -lcudart bert_preprocess_kernels cublasMMWrapper 21 | UnfusedAttentionLayer FusedAttentionLayer FfnLayer layernorm_kernels 22 | add_residual_kernels) 23 | 24 | add_executable(bert_gemm bert_gemm.cc) 25 | target_link_libraries(bert_gemm PUBLIC -lcublas -lcublasLt -lcudart encoder_gemm_func encoder_igemm_func memory_utils) -------------------------------------------------------------------------------- /src/fastertransformer/models/vit_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(ViTINT8 STATIC ViTINT8.cc) 18 | set_property(TARGET ViTINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET ViTINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(ViTINT8 PUBLIC -lcudart -lcublasLt -lcublas cublasINT8MMWrapper 21 | UnfusedAttentionLayerINT8 FusedAttentionLayerINT8 FfnLayerINT8 layernorm_kernels 22 | layernorm_int8_kernels add_residual_kernels activation_kernels layout_transformer_int8_kernels 23 | vit_kernels bert_preprocess_kernels) 24 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/th_traits.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #ifndef TORCH_TRAITS_H_ 19 | #define TORCH_TRAITS_H_ 20 | 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include <cuda_fp16.h> 23 | 24 | using namespace fastertransformer; 25 | namespace torch_ext { 26 | template<typename T> 27 | class THTraits; 28 | 29 | template<> 30 | class THTraits<float> { 31 | public: 32 | static const OperationType OpType = OperationType::FP32; 33 | }; 34 | 35 | template<> 36 | class THTraits<half> { 37 | public: 38 | static const OperationType OpType = OperationType::FP16; 39 | }; 40 | 41 | } // namespace torch_ext 42 | #endif 43 | -------------------------------------------------------------------------------- /3rdparty/trt_fused_multihead_attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | cmake_minimum_required(VERSION 3.8) 17 | 18 | set(trt_fused_multi_head_attention_files 19 | cudaDriverWrapper.cpp 20 | qkvToContext.cu 21 | ) 22 | 23 | file(GLOB trt_fused_multi_head_attention_files ${trt_fused_multi_head_attention_files} *.sm*.cpp) 24 | 25 | add_library(trt_fused_multi_head_attention STATIC ${trt_fused_multi_head_attention_files}) 26 | target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) 27 | set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) 28 | set_property(TARGET trt_fused_multi_head_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 29 | -------------------------------------------------------------------------------- /examples/tensorflow/decoding/utils/translation/download_model_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Install the OpenNMT-tf v1 16 | pip install opennmt-tf==1.25.1 17 | 18 | # Download the vocabulary and test data 19 | # wget https://s3.amazonaws.com/opennmt-trainingdata/wmt_ende_sp.tar.gz 20 | 21 | # Download the pretrained model 22 | wget --progress=dot:giga https://s3.amazonaws.com/opennmt-models/averaged-ende-ckpt500k.tar.gz 23 | 24 | mkdir ../translation 25 | mkdir ../translation/ckpt 26 | tar xf averaged-ende-ckpt500k.tar.gz -C ../translation/ckpt 27 | rm averaged-ende-ckpt500k.tar.gz 28 | 29 | # convert the pretrained model to fit our model structure 30 | # python tensorflow/utils/dump_model.py translation/ckpt/model.ckpt-500000 31 | -------------------------------------------------------------------------------- /src/fastertransformer/models/vit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(ViT STATIC ViT.cc) 18 | set_property(TARGET ViT PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET ViT PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(ViT PUBLIC -lcudart -lcublasLt -lcublas cublasMMWrapper 21 | UnfusedAttentionLayer FusedAttentionLayer FfnLayer layernorm_kernels 22 | add_residual_kernels activation_kernels vit_kernels bert_preprocess_kernels) 23 | 24 | add_executable(vit_gemm vit_gemm.cc) 25 | target_link_libraries(vit_gemm PUBLIC -lcublas -lcublasLt -lcudart encoder_gemm_func encoder_igemm_func memory_utils) -------------------------------------------------------------------------------- /tests/unittests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_executable(test_gemm test_gemm.cu) 16 | target_link_libraries(test_gemm PUBLIC -lcublas -lcudart -lcurand gemm cublasMMWrapper) 17 | 18 | add_executable(test_sampling test_sampling.cu) 19 | target_link_libraries(test_sampling PUBLIC 20 | -lcublas -lcublasLt -lcudart 21 | cublasMMWrapper memory_utils 22 | DynamicDecodeLayer TopKSamplingLayer TopPSamplingLayer TopKTopPSamplingLayer) 23 | 24 | add_executable(test_logprob_kernels test_logprob_kernels.cu) 25 | target_link_libraries(test_logprob_kernels PUBLIC 26 | -lcublas -lcublasLt -lcudart 27 | logprob_kernels memory_utils) 28 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | cmake_minimum_required(VERSION 3.13) 15 | 16 | set(swintransformer_ths_files 17 | SwinOp.cc 18 | SwinINT8Op.cc 19 | WeightQuantizeOp.cc 20 | ) 21 | 22 | add_definitions(-DTORCH_CUDA=1) 23 | 24 | if(BUILD_PYT) 25 | set(LIB_NAME "pyt_swintransformer") 26 | add_library(${LIB_NAME} SHARED ${swintransformer_ths_files}) 27 | set_target_properties(${LIB_NAME} PROPERTIES 28 | CUDA_RESOLVE_DEVICE_SYMBOLS ON) 29 | target_link_libraries(${LIB_NAME} "${TORCH_LIBRARIES}" Swin SwinINT8 30 | cublasINT8MMWrapper cublasAlgoMap trt_fused_multi_head_attention 31 | gen_relative_pos_bias quantize_weight -lcudnn -lcublas -lcudart) 32 | endif() 33 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/setup.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | """Setup script""" 19 | 20 | from setuptools import setup, find_packages 21 | 22 | setup(name="TensorFlow_FastTransformer_Quantization", 23 | package=["ft_tensorflow_quantization"], 24 | package_dir={'ft_tensorflow_quantization': 'ft_tensorflow_quantization'}, 25 | version="0.1.0", 26 | description="TensorFlow FasterTransformer Quantization", 27 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 28 | zip_safe=False) 29 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/ft-tensorflow-quantization/ft_tensorflow_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | """TensorFlow Quantization""" 19 | 20 | from ft_tensorflow_quantization.python.ops.fake_quantize import * 21 | 22 | from ft_tensorflow_quantization.python.layers.tensor_quantizer import * 23 | from ft_tensorflow_quantization.python.layers.dense import * 24 | 25 | from ft_tensorflow_quantization.python.calib.max import * 26 | from ft_tensorflow_quantization.python.calib.histogram import * 27 | from ft_tensorflow_quantization.python.calib.calibrator import * 28 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/stop_criteria_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | 20 | namespace fastertransformer { 21 | 22 | void invokeStopWordsCriterion(const int* output_ids, 23 | const int* parent_ids, 24 | const int* stop_words, 25 | bool* finished, 26 | size_t id_offset, 27 | size_t stop_words_len, 28 | int batch_size, 29 | int beam_width, 30 | int step, 31 | cudaStream_t stream); 32 | 33 | } // namespace fastertransformer 34 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_definitions(-DTORCH_CUDA=1) 16 | 17 | add_library(th_utils STATIC th_utils.cu) 18 | set_property(TARGET th_utils PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET th_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(th_utils PUBLIC "${TORCH_LIBRARIES}" -lcublas -lcudart -lcurand) 21 | 22 | add_subdirectory(bert) 23 | add_subdirectory(encoder) 24 | add_subdirectory(decoder) 25 | add_subdirectory(decoding) 26 | add_subdirectory(gpt) 27 | add_subdirectory(codegeex) 28 | add_subdirectory(longformer) 29 | add_subdirectory(swin) 30 | add_subdirectory(vit) 31 | 32 | if(BUILD_MULTI_GPU) 33 | add_subdirectory(multi_gpu_gpt) 34 | add_subdirectory(multi_gpu_codegeex) 35 | add_subdirectory(t5) 36 | endif() 37 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/quantize_weight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | // format: 26 | // 0: row major 27 | // 1: CUBLASLT_ORDER_COL32_2R_4R4 28 | // 2: CUBLASLT_ORDER_COL4_4R2_8C 29 | template 30 | void invokeQuantizeWeight(int8_t* dst, 31 | const T* src, 32 | const float* amax, 33 | const int n, 34 | const int k, 35 | const int format, 36 | cudaStream_t stream, 37 | const int scale_is_vector = 1); 38 | 39 | } // namespace fastertransformer 40 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/gen_relative_pos_bias.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | enum class PositionEmbeddingType { 26 | relative, 27 | absolute, 28 | }; 29 | 30 | template 31 | void invokeGenRelativePosBias(T* relative_position_bias, 32 | const T* relative_position_bias_table, 33 | const Tindex* relative_position_bias_index, 34 | const int window_size, 35 | const int head_num, 36 | cudaStream_t stream); 37 | 38 | } // namespace fastertransformer 39 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/dequantize_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include "src/fastertransformer/utils/cuda_utils.h" 19 | #include 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | template 26 | void invokeDequantization(T* dst, const int8_t* src, const int size, const float* scale_ptr, cudaStream_t stream); 27 | 28 | template 29 | void invokeDequantization_INT32(T* dst, 30 | const int32_t* src, 31 | const int size, 32 | cudaStream_t stream, 33 | const float* input_amax_ptr, 34 | const float* weight_amax_ptr); 35 | 36 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | re-factoring and documentation. To submit contributes, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 
27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 32 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/swin_gemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include "src/fastertransformer/utils/gemm_test/gemm_func.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace fastertransformer { 35 | 36 | template 37 | void generate_swin_gemm_config( 38 | int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); 39 | 40 | } // namespace fastertransformer 41 | -------------------------------------------------------------------------------- /examples/tensorrt/t5/createT5TestData.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import numpy as np 18 | 19 | np.random.seed(97) 20 | 21 | data = {} 22 | 23 | fpList = [32,16] 24 | bsList = [1,8,32,128] 25 | slList = [32,128,384] 26 | 27 | for bs in bsList: 28 | for sl in slList: 29 | for fp in fpList: 30 | name = '-fp'+str(fp)+'-bs'+str(bs)+'-sl'+str(sl) 31 | data['encoder'+name] = np.random.randint(0,32128,[bs,sl]).astype(np.int32) 32 | data['decoding'+name] = np.random.rand(bs,sl,512).astype([np.float32,np.float16][int(fp==16)])*2-1 33 | data['seqLen'+name] = np.full([bs],sl,dtype=np.int32) 34 | 35 | np.savez("T5PluginTestIO.npz",**data) 36 | 37 | #for k in data.keys(): 38 | # print(k,data[k].shape,data[k].dtype,data[k].reshape(-1)[:10]) 39 | print("create T5 test data finish!") 40 | 41 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/activation_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 20 | #include 21 | #include 22 | 23 | namespace fastertransformer { 24 | 25 | template 26 | void invokeAddBiasGelu(T* out, const T* bias, const int m, const int n, cudaStream_t stream); 27 | 28 | template 29 | void invokeAddBiasFastGelu(T* out, const T* bias, const int m, const int n, cudaStream_t stream); 30 | 31 | template 32 | void invokeAddBiasRelu(T* out, const T* bias, const int m, const int n, cudaStream_t stream); 33 | 34 | template 35 | void invokeAddBias(F_T* out, const B_T* bias, const int m, const int n, cudaStream_t stream); 36 | 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/encoder_gemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include "src/fastertransformer/utils/gemm_test/gemm_func.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace fastertransformer { 35 | 36 | template 37 | void generate_encoder_gemm_config( 38 | int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); 39 | 40 | } // namespace fastertransformer 41 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/calibrate_quantize_weight_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include 22 | #include 23 | 24 | namespace fastertransformer { 25 | 26 | template 27 | void invokeLdnCalibrateWeightPerChannel(float* scale, const T* src, const int k, const int n, cudaStream_t stream); 28 | 29 | template 30 | void invokeLdkCalibrateQuantizeWeightPerChannel( 31 | int8_t* dst, float* scale, const T* src, const int n, const int k, cudaStream_t stream); 32 | 33 | template 34 | void invokeLdnTransposeQuantizeWeightPerChannel( 35 | int8_t* dst, const float* scale, const T* src, const int k, const int n, cudaStream_t stream); 36 | 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/ban_bad_words.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cuda_runtime.h> 20 | #include <cstddef> 21 | 22 | namespace fastertransformer { 23 | 24 | template<typename T> 25 | void invokeBanBadWords(T* logits, 26 | const int* output_ids_buf, 27 | const int* parent_ids_buf, 28 | int batch_size, 29 | int local_batch_size, 30 | int beam_width, 31 | const int* bad_words, 32 | bool share_words, 33 | size_t bad_words_len, 34 | int id_offset, 35 | int vocab_size_padded, 36 | size_t step, 37 | cudaStream_t stream); 38 | 39 | } // namespace fastertransformer 40 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/BookscorpusTextFormatting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import glob 15 | import os 16 | 17 | class BookscorpusTextFormatting: 18 | def __init__(self, books_path, output_filename, recursive = False): 19 | self.books_path = books_path 20 | self.recursive = recursive 21 | self.output_filename = output_filename 22 | 23 | 24 | # This puts one book per line 25 | def merge(self): 26 | with open(self.output_filename, mode='w', newline='\n') as ofile: 27 | for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True): 28 | with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file: 29 | for line in file: 30 | if line.strip() != '': 31 | ofile.write(line.strip() + ' ') 32 | ofile.write("\n\n") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeGeeX FasterTransformer 2 | 3 | This repository provides the FasterTransformer implementation of the [CodeGeeX](https://github.com/THUDM/CodeGeeX) model. 4 | 5 | ## Get Started 6 | First, download and set up the following docker environment, replacing `<dir_of_this_repo>` with the directory of this repo: 7 | ``` 8 | docker pull nvcr.io/nvidia/pytorch:21.11-py3 9 | docker run -p 9114:5000 --cpus 12 --gpus '"device=0"' -it -v <dir_of_this_repo>:/workspace/codegeex-fastertransformer --ipc=host --name=test nvcr.io/nvidia/pytorch:21.11-py3 10 | ``` 11 | Second, install the following packages in the docker: 12 | ``` 13 | pip3 install transformers 14 | pip3 install sentencepiece 15 | cd codegeex-fastertransformer 16 | sh make_all.sh # Remember to specify the DSM version according to the GPU. 17 | ``` 18 | Then, convert the initial checkpoint (download [here](https://models.aminer.cn/codegeex/download/request)) to the FT version using ```get_ckpt_ft.py```. 19 | 20 | Finally, run ```api.py``` to start the server and run ```post.py``` to send requests: 21 | ``` 22 | nohup python3 api.py > test.log 2>&1 & 23 | python3 post.py 24 | ``` 25 | ## Inference performance 26 | 27 | The following figure compares the performance of pure PyTorch, Megatron and FasterTransformer under INT8 and FP16. 
28 | The fastest implementation is INT8 + FastTrans, and the average time of generating a token is less than 15 ms. 29 | 30 | 
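A quick way to sanity-check the per-request latency on a running server is to time a single call from the host. The snippet below is only an illustrative sketch: it assumes the `/code` endpoint and JSON fields used by `post.py` above, and the `-p 9114:5000` port mapping from the Get Started step (from inside the container, use port 5000 as `post.py` does):
```
# Minimal latency probe against the api.py server (fields mirror post.py).
# Assumes the container was started with -p 9114:5000 as shown above.
curl -s -o /dev/null -w 'total time: %{time_total}s\n' \
  -H 'Content-Type: application/json' \
  -X POST http://localhost:9114/code \
  -d '{
        "ability": "seo_article_creation",
        "context": "# language: python\n# write a quick sort function\ndef",
        "temperature": 1.0,
        "top_k": 1,
        "top_p": 0.0,
        "max_seq_len": 256,
        "len_penalty": 1.0,
        "repetition_penalty": 1.0,
        "presence_penalty": 1.0,
        "frequency_penalty": 1.0,
        "end_tokens": []
      }'
```
Dividing the reported wall-clock time by the number of generated tokens (at most `max_seq_len`) gives a rough per-token latency to compare against the numbers above.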
31 | 32 | ## License 33 | 34 | Our code is licensed under the [Apache-2.0 license](LICENSE). 35 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/logprob_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include <cuda_runtime.h> 19 | namespace fastertransformer { 20 | 21 | template<typename T> 22 | void invokeLogProbFromLogits(float* cum_log_probs, 23 | const T* logits, 24 | const int* input_ids, 25 | const int* input_lengths, 26 | const size_t max_input_length, 27 | const size_t batch_size, 28 | const size_t vocab_size, 29 | const size_t vocab_size_padded, 30 | void* workspace, 31 | const size_t workspace_size, 32 | cudaStream_t stream, 33 | const bool batch_first = false); 34 | } // namespace fastertransformer 35 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #pragma once 17 | 18 | namespace fastertransformer { 19 | 20 | template 21 | void invokeTopkSoftMax(const T* log_probs, 22 | const T* bias, 23 | const bool* finished, 24 | float* cum_log_probs, 25 | float* output_log_probs, 26 | int* ids, 27 | void* tmp_storage, 28 | const int temp_storage_size, 29 | const int batch_size, 30 | const int beam_width, 31 | const int vocab_size, 32 | const int* end_ids, 33 | const float diversity_rate, 34 | cudaStream_t stream); 35 | 36 | } // namespace fastertransformer 37 | -------------------------------------------------------------------------------- /examples/cpp/gpt/gpt_config.ini: -------------------------------------------------------------------------------- 1 | [ft_instance_hyperparameter] 2 | max_batch_size=8 ; Use for allocate the buffer 3 | max_seq_len=128 ; The sequence length of position embedding table, should move to model hyper-parameter 4 | beam_width=1 ; beam width for beam search 5 | top_k=0 ; k value for top k sampling 6 | top_p=0.5 ; p value for top p sampling 7 | temperature=1.0 ; Use for sampling 8 | repetition_penalty=2.0 ; Use for sampling 9 | data_type=fp16 10 | sparse=0 11 | model_name=gpt_124M 12 | ; model_name=megatron_345M 13 | ; model_name=megatron_6.7B 14 | ; model_name=gpt_175B 15 | ; model_name=self_defined 16 | ; model_dir=./models/megatron-models/c-model/6.7b/ 17 | model_dir=models/openai-gpt-models/c-model/124m/1-gpu/ 18 | 19 | [request] 20 | request_batch_size=8 ; determine by the request 21 | request_output_len=32 ; determine by the request 22 | return_log_probs=false ; return the output log probs and cumulative log probs. 23 | context_log_probs=false ; include input contexts in the cumulative log probability computation. 24 | 25 | [gpt_124M] 26 | head_num=12 27 | size_per_head=64 28 | vocab_size=50257 29 | decoder_layers=12 30 | 31 | [gpt_175B] 32 | head_num=96 33 | size_per_head=128 34 | vocab_size=51200 35 | decoder_layers=96 36 | 37 | [self_defined] 38 | head_num=16 39 | size_per_head=64 40 | vocab_size=30000 41 | decoder_layers=12 42 | 43 | [megatron_345M] 44 | head_num=16 45 | size_per_head=64 46 | vocab_size=50304 47 | decoder_layers=24 48 | 49 | [megatron_6.7B] 50 | head_num=32 51 | size_per_head=128 52 | vocab_size=51200 53 | decoder_layers=32 54 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/xlnet_preprocess_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | #include "src/fastertransformer/utils/cuda_utils.h" 19 | #include 20 | #include 21 | 22 | namespace fastertransformer { 23 | 24 | template 25 | void blockAttnMask(dim3& grid, dim3& block, int batch_size, int seq_len); 26 | 27 | template 28 | void genWordEmdK( 29 | int batch_size, int seq_len, int hidden_dim, T* word_emb_k, T* params_word_emb_k, int* inp_k, cudaStream_t stream); 30 | 31 | template 32 | void preProcess(int batch_size, 33 | int seq_len, 34 | int hidden_dim, 35 | T* attn_mask, 36 | float* input_mask, 37 | T* seg_mat, 38 | int* seg_id, 39 | T* attr_k_head_r, 40 | cudaStream_t stream); 41 | } // namespace fastertransformer 42 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/layout_transformer_int8_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "int8_utils.cuh" 20 | #include 21 | #include 22 | #include 23 | 24 | namespace fastertransformer { 25 | 26 | template 27 | void invokeTransposeMatrixCOL32ToColMajor(T* dst, const T* src, const int m, const int n, cudaStream_t stream); 28 | 29 | template 30 | void invokeTransposeMatrixColMajorToCOL32(T* dst, const T* src, const int m, const int n, cudaStream_t stream); 31 | 32 | template 33 | void invokeTransposeMatrixColMajorToCOL32Quantize( 34 | int8_t* dst, const T* src, const int m, const int n, const float* scale_ptr, cudaStream_t stream); 35 | 36 | void invokeRowMajorToCOL32(int8_t* dst, const int8_t* src, const int m, const int n, cudaStream_t stream); 37 | } // namespace fastertransformer 38 | -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_library(gpt_example_utils STATIC gpt_example_utils.cc) 16 | target_link_libraries(gpt_example_utils PUBLIC -lcudart) 17 | 18 | add_executable(multi_gpu_gpt_example multi_gpu_gpt_example.cc) 19 | target_link_libraries(multi_gpu_gpt_example PUBLIC -lcublas -lcublasLt -lcudart 20 | ParallelGpt nvtx_utils -lmpi gpt_example_utils) 21 | 22 | add_executable(multi_gpu_gpt_async_example multi_gpu_gpt_async_example.cc) 23 | target_link_libraries(multi_gpu_gpt_async_example PUBLIC -lcublas -lcublasLt -lcudart 24 | ParallelGpt nvtx_utils -lmpi gpt_example_utils) 25 | 26 | add_executable(multi_gpu_gpt_triton_example multi_gpu_gpt_triton_example.cc) 27 | target_link_libraries(multi_gpu_gpt_triton_example PUBLIC -lcublas -lcublasLt -lcudart 28 | ParallelGptTritonBackend memory_utils custom_ar_comm -lmpi gpt_example_utils -lpthread) 29 | -------------------------------------------------------------------------------- /src/fastertransformer/models/bert_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(BertLayerINT8 STATIC BertLayerINT8.cc) 18 | set_property(TARGET BertLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET BertLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(BertLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart -lcurand cublasMMWrapper 21 | cublasINT8MMWrapper UnfusedAttentionLayerINT8 FusedAttentionLayerINT8 22 | FfnLayerINT8 layernorm_int8_kernels 23 | layout_transformer_int8_kernels quantization_int8_kernels) 24 | 25 | add_library(BertINT8 STATIC BertINT8.cc) 26 | set_property(TARGET BertINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 27 | set_property(TARGET BertINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 28 | target_link_libraries(BertINT8 PUBLIC -lcublasLt -lcublas -lcudart -lcurand BertLayerINT8 bert_preprocess_kernels) 29 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/word_list.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "word_list.h" 18 | #include "memory_utils.h" 19 | 20 | #include "assert.h" 21 | 22 | namespace fastertransformer { 23 | 24 | int read_word_list(const std::string& filename, std::vector& file_data) 25 | { 26 | std::ifstream word_list_file(filename, std::ios::in); 27 | 28 | std::string line_buf; 29 | int line_count = 0; 30 | size_t id_counts[2] = {0, 0}; 31 | while (std::getline(word_list_file, line_buf)) { 32 | 33 | std::stringstream line_stream(line_buf); 34 | std::string vals; 35 | while (std::getline(line_stream, vals, ',')) { 36 | file_data.push_back(std::stoi(vals)); 37 | id_counts[line_count]++; 38 | } 39 | line_count++; 40 | 41 | if (line_count > 1) { 42 | break; 43 | } 44 | } 45 | assert(id_counts[0] == id_counts[1]); 46 | 47 | return 0; 48 | } 49 | 50 | } // namespace fastertransformer 51 | -------------------------------------------------------------------------------- /examples/pytorch/decoding/utils/recover_bpe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | 17 | def recover_bpe(src): 18 | dst = [] 19 | for line in src: 20 | line = line.strip().split() 21 | if line[-1] == '': 22 | line.pop() 23 | if line[0][0] == '▁': 24 | s = line[0][1:] 25 | else: 26 | s = line[0] 27 | for w in line[1:]: 28 | if w[0] == '▁': 29 | s += ' ' + w[1:] 30 | else: 31 | s += w 32 | s += '\n' 33 | dst.append(s) 34 | return dst 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('infile', type=str) 39 | parser.add_argument('outfile', type=str) 40 | args = parser.parse_args() 41 | 42 | with open(args.infile, 'r') as infile: 43 | with open(args.outfile, 'w') as outfile: 44 | dst = recover_bpe(infile.readlines()) 45 | for line in dst: 46 | outfile.write(line) 47 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/fp16_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
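read_word_list in word_list.cc above reads a two-row CSV (the format used for bad-word and stop-word lists) into one flat vector and asserts that both rows have the same number of entries. A small Python sketch of producing such a file; the interpretation that the first row holds the flattened token ids and the second row the running end offset of each word, padded with -1, is an assumption here, since the reader only enforces equal row lengths:

    def write_word_list(path, words):
        """words: list of token-id lists, e.g. [[287, 4346], [12]]."""
        ids, offsets = [], []
        for w in words:
            ids.extend(w)
            offsets.append(len(ids))                   # running end offset of each word
        offsets += [-1] * (len(ids) - len(offsets))    # pad so both rows match in length
        with open(path, 'w') as f:
            f.write(', '.join(str(i) for i in ids) + '\n')
            f.write(', '.join(str(o) for o in offsets) + '\n')

    write_word_list('stop_words.csv', [[287, 4346], [12]])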
16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | 20 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None, 21 | initializer=None, regularizer=None, 22 | trainable=True, 23 | *args, **kwargs): 24 | """Custom variable getter that forces trainable variables to be stored in 25 | float32 precision and then casts them to the training precision. 26 | """ 27 | storage_dtype = tf.float32 if trainable else dtype 28 | variable = getter(name, shape, dtype=storage_dtype, 29 | initializer=initializer, regularizer=regularizer, 30 | trainable=trainable, 31 | *args, **kwargs) 32 | if trainable and dtype != tf.float32: 33 | variable = tf.cast(variable, dtype) 34 | return variable 35 | 36 | -------------------------------------------------------------------------------- /src/fastertransformer/triton_backend/triton_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" 20 | #include "src/fastertransformer/utils/Tensor.h" 21 | 22 | namespace ft = fastertransformer; 23 | 24 | template 25 | void move_tensor_H2D(const triton::Tensor &tensor, T* &d_ptr) 26 | { 27 | if (tensor.where == triton::MEMORY_GPU) { 28 | return; 29 | } 30 | 31 | size_t tensor_size = 1; 32 | for (auto t : tensor.shape) { 33 | tensor_size *= t; 34 | } 35 | ft::deviceMalloc(&d_ptr, tensor_size, false); 36 | ft::cudaH2Dcpy(d_ptr, (T*) tensor.data, tensor_size); 37 | } 38 | 39 | template 40 | ft::Tensor as_GPU_tensor(const triton::Tensor &tensor, T* d_ptr) 41 | { 42 | return ft::Tensor {ft::MEMORY_GPU, 43 | triton::Tensor::convertTritonTypeToFt(tensor.type), 44 | tensor.shape, 45 | tensor.where == triton::MEMORY_CPU ? d_ptr : tensor.data}; 46 | } 47 | -------------------------------------------------------------------------------- /src/fastertransformer/th_op/th_utils.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "src/fastertransformer/th_op/th_utils.h" 18 | 19 | namespace torch_ext { 20 | 21 | std::vector convert_shape(torch::Tensor tensor) 22 | { 23 | std::vector v_shape; 24 | for (int i = 0; i < tensor.dim(); i++) { 25 | v_shape.push_back(tensor.size(i)); 26 | } 27 | return v_shape; 28 | } 29 | 30 | template 31 | fastertransformer::Tensor convert_tensor(torch::Tensor tensor) 32 | { 33 | return fastertransformer::Tensor{fastertransformer::MEMORY_GPU, 34 | fastertransformer::getTensorType(), 35 | convert_shape(tensor), 36 | get_ptr(tensor)}; 37 | } 38 | 39 | template fastertransformer::Tensor convert_tensor(torch::Tensor tensor); 40 | template fastertransformer::Tensor convert_tensor(torch::Tensor tensor); 41 | template fastertransformer::Tensor convert_tensor(torch::Tensor tensor); 42 | 43 | } // namespace torch_ext 44 | -------------------------------------------------------------------------------- /examples/pytorch/gpt/utils/generate_start_ids.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import configparser 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-max_batch_size', '--max_batch_size', type=int, required=True, metavar='NUMBER', 21 | help='batch size') 22 | parser.add_argument('-max_input_length', '--max_input_length', type=int, required=True, metavar='NUMBER', 23 | help='max input length') 24 | args = parser.parse_args() 25 | args_dict = vars(args) 26 | 27 | batch_size = args_dict["max_batch_size"] 28 | max_input_length = args_dict["max_input_length"] 29 | path = f"../examples/cpp/multi_gpu_gpt/start_ids.csv" 30 | 31 | with open(path, 'w') as f: 32 | ids = "" 33 | for i in range(batch_size): 34 | for j in range(max_input_length): 35 | if j == 0: 36 | ids = f"{ids}198" 37 | else: 38 | ids = f"{ids}, 198" 39 | ids = f"{ids}\n" 40 | f.write(ids) 41 | -------------------------------------------------------------------------------- /examples/tensorflow/bert/bert-quantization/gpu_environment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
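generate_start_ids.py above writes one CSV row per batch entry, each holding max_input_length copies of token id 198 (the newline token in the GPT-2 vocabulary, as far as I can tell). A quick sanity check of the generated file, assuming it is run from the same directory as the generator so the relative path matches:

    import numpy as np

    start_ids = np.loadtxt('../examples/cpp/multi_gpu_gpt/start_ids.csv',
                           delimiter=',', dtype=np.int32, ndmin=2)
    assert (start_ids == 198).all()
    print('batch size:', start_ids.shape[0], ' input length:', start_ids.shape[1])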
15 | 16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None, 20 | initializer=None, regularizer=None, 21 | trainable=True, 22 | *args, **kwargs): 23 | """Custom variable getter that forces trainable variables to be stored in 24 | float32 precision and then casts them to the training precision. 25 | """ 26 | storage_dtype = tf.float32 if trainable else dtype 27 | variable = getter(name, shape, dtype=storage_dtype, 28 | initializer=initializer, regularizer=regularizer, 29 | trainable=trainable, 30 | *args, **kwargs) 31 | if trainable and dtype != tf.float32: 32 | variable = tf.cast(variable, dtype) 33 | return variable 34 | 35 | def get_custom_getter(compute_type): 36 | return float32_variable_storage_getter if compute_type == tf.float16 else None 37 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/mpi_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "mpi.h" 20 | #include 21 | 22 | namespace fastertransformer { 23 | 24 | #define MPICHECK(cmd) \ 25 | do { \ 26 | int e = cmd; \ 27 | if (e != MPI_SUCCESS) { \ 28 | printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ 29 | exit(EXIT_FAILURE); \ 30 | } \ 31 | } while (0) 32 | 33 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/tensorflow/decoding/utils/bleu_score.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
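get_custom_getter in gpu_environment.py above returns the fp32-master-weight getter only when computing in fp16, so callers can pass its result straight to a variable scope. A minimal TF1-style usage sketch (tf.compat.v1 under TensorFlow 2); the import path and the layer shapes are assumptions:

    import tensorflow as tf
    from gpu_environment import get_custom_getter   # import path is an assumption

    compute_type = tf.float16
    x = tf.placeholder(compute_type, shape=[None, 1024])
    with tf.variable_scope('dense', custom_getter=get_custom_getter(compute_type)):
        # variables are stored as fp32 masters and cast to fp16 for the matmul
        w = tf.get_variable('w', [1024, 4096], dtype=compute_type)
        b = tf.get_variable('b', [4096], dtype=compute_type,
                            initializer=tf.zeros_initializer())
        y = tf.matmul(x, w) + b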
14 | 15 | import sys 16 | import tensorflow as tf 17 | from sacrebleu import corpus_bleu 18 | 19 | def bleu_score(pred_file, ref_file): 20 | with tf.io.gfile.GFile(pred_file) as pred_stream, tf.io.gfile.GFile(ref_file) as ref_stream: 21 | pred_stream_txt = pred_stream.readlines() 22 | ref_stream_txt = ref_stream.readlines() 23 | bleu = corpus_bleu(pred_stream_txt, [ref_stream_txt], force=True) 24 | print(" bleu score: {:6.2f}".format(bleu.score)) 25 | print(" bleu counts: {}".format(bleu.counts)) 26 | print(" bleu totals: {}".format(bleu.totals)) 27 | print(" bleu precisions: {}".format(bleu.precisions)) 28 | print(" bleu sys_len: {}; ref_len: {}".format(bleu.sys_len, bleu.ref_len)) 29 | return bleu 30 | 31 | if __name__ == "__main__": 32 | if len(sys.argv) != 3: 33 | print("[ERROR] bleu_score.py needs a result file and a solution file. \n e.g. python bleu_score.py f1.txt f2.txt") 34 | sys.exit(0) 35 | bleu_score(sys.argv[1], sys.argv[2]) 36 | -------------------------------------------------------------------------------- /src/fastertransformer/models/swin_int8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(SwinBlockINT8 STATIC SwinBlockINT8.cc) 18 | set_property(TARGET SwinBlockINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET SwinBlockINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(SwinBlockINT8 PUBLIC -lcublasLt -lcublas -lcudart 21 | WindowAttentionINT8 activation_int8_kernels add_residual_kernels) 22 | 23 | add_library(SwinBasicLayerINT8 STATIC SwinBasicLayerINT8.cc) 24 | set_property(TARGET SwinBasicLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 25 | set_property(TARGET SwinBasicLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 26 | target_link_libraries(SwinBasicLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart SwinBlockINT8 dequantize_kernels) 27 | 28 | add_library(SwinINT8 STATIC SwinINT8.cc) 29 | set_property(TARGET SwinINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) 30 | set_property(TARGET SwinINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 31 | target_link_libraries(SwinINT8 PUBLIC -lcudart SwinBasicLayerINT8 activation_kernels memory_utils) 32 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/xlnet_gemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_bf16_wrapper.h" 21 | #include "src/fastertransformer/utils/cuda_utils.h" 22 | #include "src/fastertransformer/utils/gemm_test/gemm_func.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace fastertransformer { 35 | 36 | template 37 | void generate_xlnet_gemm_config(int batch_size, 38 | int seq_len, 39 | int head_num, 40 | int size_per_head, 41 | int hidden_units_, 42 | int inter_size_, 43 | void* buffer_in, 44 | bool isAppend = true); 45 | 46 | } // namespace fastertransformer 47 | -------------------------------------------------------------------------------- /src/fastertransformer/layers/beam_search_layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu) 18 | set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels) 21 | 22 | add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu) 23 | set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 24 | set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 25 | target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels) 26 | 27 | add_library(BeamSearchLayer STATIC BeamSearchLayer.cu) 28 | set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 29 | set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 30 | target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels) 31 | -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/data/GLUEDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import sys 15 | import wget 16 | 17 | from pathlib import Path 18 | 19 | 20 | def mkdir(path): 21 | Path(path).mkdir(parents=True, exist_ok=True) 22 | 23 | 24 | class GLUEDownloader: 25 | 26 | def __init__(self, save_path): 27 | self.save_path = save_path + '/glue' 28 | 29 | def download(self, task_name): 30 | mkdir(self.save_path) 31 | if task_name in {'mrpc', 'mnli'}: 32 | task_name = task_name.upper() 33 | elif task_name == 'cola': 34 | task_name = 'CoLA' 35 | else: # SST-2 36 | assert task_name == 'sst-2' 37 | task_name = 'SST' 38 | wget.download( 39 | 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/1502038877f6a88c225a34450793fbc3ea87eaba/download_glue_data.py', 40 | out=self.save_path, 41 | ) 42 | sys.path.append(self.save_path) 43 | import download_glue_data 44 | download_glue_data.main( 45 | ['--data_dir', self.save_path, '--tasks', task_name]) 46 | sys.path.pop() 47 | -------------------------------------------------------------------------------- /src/fastertransformer/models/swin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
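GLUEDownloader.py above normalises the task name (mrpc/mnli are upper-cased, cola becomes CoLA, sst-2 becomes SST), fetches the community download_glue_data.py helper, and runs it into <save_path>/glue. A minimal usage sketch; the import path and data directory are placeholders:

    from GLUEDownloader import GLUEDownloader   # import path is an assumption

    downloader = GLUEDownloader('/workspace/bert/data/download')   # placeholder path
    downloader.download('mrpc')   # MRPC lands under /workspace/bert/data/download/glue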
14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_library(SwinBlock STATIC SwinBlock.cc) 18 | set_property(TARGET SwinBlock PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET SwinBlock PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(SwinBlock PUBLIC -lcublasLt -lcublas -lcudart WindowAttention 21 | activation_kernels add_residual_kernels layernorm_kernels) 22 | 23 | add_library(SwinBasicLayer STATIC SwinBasicLayer.cc) 24 | set_property(TARGET SwinBasicLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 25 | set_property(TARGET SwinBasicLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 26 | target_link_libraries(SwinBasicLayer PUBLIC -lcublasLt -lcublas -lcudart SwinBlock) 27 | 28 | add_library(Swin STATIC Swin.cc) 29 | set_property(TARGET Swin PROPERTY POSITION_INDEPENDENT_CODE ON) 30 | set_property(TARGET Swin PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 31 | target_link_libraries(Swin PUBLIC -lcudart SwinBasicLayer memory_utils) 32 | 33 | add_executable(swin_gemm swin_gemm.cc) 34 | target_link_libraries(swin_gemm PUBLIC -lcublas -lcublasLt -lcudart swin_igemm_func swin_gemm_func memory_utils) -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.07-py3 15 | FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt 16 | FROM ${FROM_IMAGE_NAME} 17 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract 18 | 19 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data 20 | 21 | WORKDIR /workspace 22 | RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. 23 | RUN git clone https://github.com/soskek/bookcorpus.git 24 | 25 | # Copy the perf_client over 26 | COPY --from=trt /workspace/install/ /workspace/install/ 27 | ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH} 28 | 29 | # Install trt python api 30 | RUN apt-get install libb64-0d 31 | RUN pip install /workspace/install/python/tensorrtserver*.whl 32 | 33 | WORKDIR /workspace/bert 34 | RUN pip install --upgrade --no-cache-dir pip \ 35 | && pip install --no-cache-dir \ 36 | tqdm boto3 requests six ipdb h5py html2text nltk progressbar onnxruntime \ 37 | git+https://github.com/NVIDIA/dllogger wget 38 | 39 | RUN apt-get install -y iputils-ping 40 | 41 | COPY . . 42 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/gemm_test/swin_igemm_func.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cublasAlgoMap.h" 20 | #include "src/fastertransformer/utils/cuda_utils.h" 21 | #include "src/fastertransformer/utils/gemm_test/encoder_igemm_func.h" 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace fastertransformer { 34 | 35 | /* CAUTION : must match cublasLtMatmulTile_t */ 36 | // const char* const matmulTileName[] = { 37 | // "UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", "8x64", "16x32", 38 | // "32x16", "64x8", "32x32", "32x64", "64x32", "32x128", "64x64", "128x32", "64x128", 39 | // "128x64", "64x256", "128x128", "256x64", "64x512", "128x256", "256x128", "512x64", 40 | // }; 41 | 42 | int generate_swin_igemm_config( 43 | int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); 44 | 45 | } // namespace fastertransformer 46 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/add_bias_transpose_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | namespace fastertransformer { 18 | 19 | template 20 | void invokeAddBiasTransposeToMultiHead(const T* matrices, 21 | const T* biases, 22 | T* output, 23 | const int batch_size, 24 | const int head_num, 25 | const int size_per_head, 26 | const int seq_len, 27 | const int matrices_num, 28 | const cudaStream_t stream); 29 | 30 | template 31 | void invokeTransposeMultiHeadToSingle(T* dst, 32 | T* src, 33 | const int batch_size, 34 | const int seq_len, 35 | const int head_num, 36 | const int size_per_head, 37 | cudaStream_t stream); 38 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/pytorch/gpt/utils/gpt_token_converter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
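The add_bias_transpose_kernels.h declarations above fuse a bias add with the usual split-heads / merge-heads layout changes. The NumPy sketch below only illustrates the head-split round trip in familiar terms; the exact tensor layouts and the fused bias handling of the CUDA kernels are not reproduced here:

    import numpy as np

    B, S, H, D = 2, 8, 12, 64                                # batch, seq, heads, head size
    x = np.random.rand(B, S, H * D).astype(np.float32)       # [batch, seq, hidden]
    multi = x.reshape(B, S, H, D).transpose(0, 2, 1, 3)      # [batch, head, seq, head size]
    single = multi.transpose(0, 2, 1, 3).reshape(B, S, H * D)
    assert np.array_equal(x, single)                         # the round trip is lossless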
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import sys 16 | import os 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | sys.path.append(dir_path + "/../../../..") 19 | from examples.tensorflow.gpt.utils import gpt_token_encoder as encoder 20 | import fire 21 | import numpy as np 22 | 23 | def convert_token( 24 | vocab_file="../models/gpt2-vocab.json", 25 | bpe_file="../models/gpt2-merges.txt", 26 | out_file="out", 27 | max_input_length=-1 28 | ): 29 | enc = encoder.get_encoder(vocab_file, bpe_file) 30 | tokens_batch = np.loadtxt(out_file, dtype=np.int32) 31 | end_id = 50256 32 | if(tokens_batch.ndim == 1): 33 | tokens_batch = tokens_batch.reshape([1, -1]) 34 | for batch_num, tokens in enumerate(tokens_batch): 35 | if max_input_length > -1: 36 | end_index = np.where(tokens[max_input_length:] == end_id)[0] 37 | else: 38 | end_index = [] 39 | end_pos = len(tokens) 40 | if len(end_index) > 0: 41 | end_pos = end_index[0] 42 | print(f"[INFO] batch {batch_num}: {enc.decode(tokens[:end_pos])}") 43 | return tokens_batch 44 | 45 | if __name__ == "__main__": 46 | fire.Fire(convert_token) -------------------------------------------------------------------------------- /examples/tensorflow/gpt/utils/gpt_token_converter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
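convert_token in the PyTorch gpt_token_converter.py above can also be called directly instead of through the fire CLI. All paths below are placeholders and the repository root is assumed to be on PYTHONPATH:

    from examples.pytorch.gpt.utils.gpt_token_converter import convert_token

    tokens = convert_token(vocab_file='../models/gpt2-vocab.json',
                           bpe_file='../models/gpt2-merges.txt',
                           out_file='out',          # token-id matrix written by the GPT example
                           max_input_length=8)      # prints the decoded text per batch entry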
14 | 15 | import sys 16 | import os 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | sys.path.append(dir_path + "/../../../..") 19 | from examples.tensorflow.gpt.utils import gpt_token_encoder as encoder 20 | import fire 21 | import numpy as np 22 | 23 | def convert_token( 24 | vocab_file="../models/gpt2-vocab.json", 25 | bpe_file="../models/gpt2-merges.txt", 26 | out_file="out", 27 | max_input_length=-1 28 | ): 29 | enc = encoder.get_encoder(vocab_file, bpe_file) 30 | tokens_batch = np.loadtxt(out_file, dtype=np.int32) 31 | end_id = 50256 32 | if(tokens_batch.ndim == 1): 33 | tokens_batch = tokens_batch.reshape([1, -1]) 34 | for batch_num, tokens in enumerate(tokens_batch): 35 | if max_input_length > -1: 36 | end_index = np.where(tokens[max_input_length:] == end_id)[0] 37 | else: 38 | end_index = [] 39 | end_pos = -1 40 | if len(end_index) > 0: 41 | end_pos = end_index[0] 42 | print("[INFO] batch {}: {}".format(batch_num, enc.decode(tokens[:end_pos]))) 43 | return tokens_batch 44 | 45 | if __name__ == "__main__": 46 | fire.Fire(convert_token) -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/scripts/data_download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | DATA_DIR=${1:-/workspace/bert/data} 17 | 18 | # Download vocab files from pretrained model 19 | cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.* 20 | 21 | # Download SQUAD 22 | cd $DATA_DIR/squad && . squad_download.sh 23 | 24 | # Download SWAG 25 | git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag 26 | 27 | # Download GLUE 28 | cd $DATA_DIR/glue && . download_mrpc.sh 29 | 30 | # WIKI Download 31 | cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh 32 | 33 | # Bookcorpus Download 34 | cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh 35 | 36 | cd $DATA_DIR 37 | # Create HDF5 files for WIKI 38 | bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \ 39 | && rm -r ./wikipedia_corpus/final_* \ 40 | 41 | # Create HDF5 files for Bookcorpus 42 | bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \ 43 | && rm -r ./bookcorpus/final_* \ 44 | 45 | # Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus 46 | bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024 47 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/reverse_roll_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include 19 | #include 20 | #include 21 | 22 | namespace fastertransformer { 23 | 24 | void invokeReverseRollCol32(int8_t* dst, 25 | const int8_t* src, 26 | int batch, 27 | int window_num, 28 | int window_len, 29 | int window_size, 30 | int H, 31 | int W, 32 | int dim, 33 | int shift_size, 34 | cudaStream_t stream); 35 | 36 | template 37 | void invokeReverseRoll(T* dst, 38 | const T* src, 39 | int batch, 40 | int window_num, 41 | int window_len, 42 | int window_size, 43 | int H, 44 | int W, 45 | int dim, 46 | int shift_size, 47 | cudaStream_t stream); 48 | 49 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/pytorch/bert/bert-quantization-sparsity/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import torch 15 | import torch.distributed as dist 16 | 17 | from pathlib import Path 18 | 19 | 20 | def get_rank(): 21 | if not dist.is_available(): 22 | return 0 23 | if not dist.is_initialized(): 24 | return 0 25 | return dist.get_rank() 26 | 27 | 28 | def get_world_size(): 29 | if not dist.is_available(): 30 | return 1 31 | if not dist.is_initialized(): 32 | return 1 33 | return dist.get_world_size() 34 | 35 | 36 | def is_main_process(): 37 | return get_rank() == 0 38 | 39 | 40 | def barrier(): 41 | if dist.is_available() and dist.is_initialized(): 42 | dist.barrier() 43 | 44 | 45 | def format_step(step): 46 | if isinstance(step, str): 47 | return step 48 | s = "" 49 | if len(step) > 0: 50 | s += "Training Epoch: {} ".format(step[0]) 51 | if len(step) > 1: 52 | s += "Training Iteration: {} ".format(step[1]) 53 | if len(step) > 2: 54 | s += "Validation Iteration: {} ".format(step[2]) 55 | return s 56 | 57 | 58 | def mkdir(path): 59 | Path(path).mkdir(parents=True, exist_ok=True) 60 | 61 | 62 | def mkdir_by_main_process(path): 63 | if is_main_process(): 64 | mkdir(path) 65 | barrier() 66 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/vit_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
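invokeReverseRoll in reverse_roll_kernels.h above undoes the cyclic shift that Swin applies before shifted-window attention (it also restores the [batch, H, W, dim] image layout from the window layout, which the sketch below does not model). In NumPy terms, the shift/unshift pair is just np.roll with opposite signs:

    import numpy as np

    H = W = 8
    shift = 2
    x = np.arange(H * W).reshape(H, W)
    shifted = np.roll(x, shift=(-shift, -shift), axis=(0, 1))       # cyclic shift before attention
    restored = np.roll(shifted, shift=(shift, shift), axis=(0, 1))  # the "reverse roll"
    assert np.array_equal(x, restored)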
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | #include 20 | 21 | namespace fastertransformer { 22 | 23 | template 24 | void invokeAddBiasSlice(T* in, T* out, const T* bias, const int m, const int n, const int s, cudaStream_t stream); 25 | 26 | template 27 | void invokeAddBiasConcatClsTokenAddPosEmbed(const T* in, 28 | T* out, 29 | const T* bias, 30 | const T* cls_token, 31 | const T* pos_embed, 32 | const int m, 33 | const int n, 34 | const int s, 35 | cudaStream_t stream); 36 | 37 | template 38 | void invokeSliceCopy( 39 | const T* in, T* out, const int m, const int n, const int s, const int offset_s, cudaStream_t stream); 40 | 41 | template 42 | void invokeAddBiasAddPosEmbed( 43 | T* out, const T* bias, const T* pos_embed, const int m, const int n, const int s, cudaStream_t stream); 44 | 45 | } // namespace fastertransformer -------------------------------------------------------------------------------- /examples/cpp/multi_gpu_gpt/gpt_config.ini: -------------------------------------------------------------------------------- 1 | [ft_instance_hyperparameter] 2 | max_batch_size=8 ; Use for allocate the buffer 3 | max_seq_len=1024 ; The sequence length of position embedding table, should move to model hyper-parameter 4 | beam_width=1 ; beam width for beam search 5 | top_k=1 ; k value for top k sampling 6 | top_p=0 ; p value for top p sampling 7 | temperature=1.0 ; Use for sampling 8 | repetition_penalty=1.0 ; Use for sampling 9 | tensor_para_size=1 10 | pipeline_para_size=1 11 | data_type=fp16 12 | sparse=0 13 | int8_mode=0 14 | enable_custom_all_reduce=0 15 | ; model_name=gpt_124M 16 | model_name=megatron_345M 17 | ; model_name=megatron_6.7B 18 | ; model_name=gpt_175B 19 | ; model_name=self_defined 20 | ; model_dir=./models/megatron-models/c-model/6.7b/ 21 | model_dir=../models/megatron-models/c-model/345m/8-gpu/ 22 | len_penalty=1.0 23 | beam_search_diversity_rate=0.0 24 | 25 | [request] 26 | request_batch_size=8 ; determine by the request 27 | request_output_len=32 ; determine by the request 28 | return_log_probs=false ; return the output log probs and cumulative log probs. 29 | context_log_probs=false ; include input contexts in the cumulative log probability computation. 
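The vit_kernels.h declarations above fuse the patch-projection bias add with prepending the class token and adding position embeddings. A NumPy sketch of that standard ViT preprocessing, with shapes chosen for a 224x224 image and 16x16 patches; the exact argument semantics of the CUDA kernel are an interpretation, not taken from the header:

    import numpy as np

    batch, seq, hidden = 2, 196, 768                    # 14x14 patches
    patches = np.random.rand(batch, seq, hidden).astype(np.float32)
    bias = np.random.rand(hidden).astype(np.float32)
    cls_token = np.random.rand(1, 1, hidden).astype(np.float32)
    pos_embed = np.random.rand(1, seq + 1, hidden).astype(np.float32)

    x = patches + bias                                                  # projection bias
    x = np.concatenate([np.tile(cls_token, (batch, 1, 1)), x], axis=1)  # prepend CLS token
    x = x + pos_embed                                                   # position embeddings
    assert x.shape == (batch, seq + 1, hidden)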
30 | 31 | [gpt_124M] 32 | head_num=12 33 | size_per_head=64 34 | vocab_size=50257 35 | decoder_layers=12 36 | start_id=50256 37 | end_id=50256 38 | inter_size=3072 39 | 40 | [megatron_345M] 41 | head_num=16 42 | size_per_head=64 43 | vocab_size=50304 44 | decoder_layers=24 45 | start_id=50256 46 | end_id=50256 47 | inter_size=4096 48 | 49 | [megatron_6.7B] 50 | head_num=32 51 | size_per_head=128 52 | vocab_size=51200 53 | decoder_layers=32 54 | start_id=50256 55 | end_id=50256 56 | inter_size=16384 57 | 58 | [gpt_175B] 59 | head_num=96 60 | size_per_head=128 61 | vocab_size=51200 62 | decoder_layers=96 63 | start_id=50256 64 | end_id=50256 65 | inter_size=49152 66 | 67 | [self_defined] 68 | head_num=16 69 | size_per_head=64 70 | vocab_size=30000 71 | decoder_layers=12 72 | start_id=50256 73 | end_id=50256 74 | inter_size=4096 75 | -------------------------------------------------------------------------------- /src/fastertransformer/utils/memory_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include "src/fastertransformer/utils/cuda_utils.h" 20 | 21 | namespace fastertransformer { 22 | 23 | template 24 | void deviceMalloc(T** ptr, int size, bool is_random_initialize = true); 25 | 26 | template 27 | void deviceMemSetZero(T* ptr, int size); 28 | 29 | template 30 | void deviceFree(T*& ptr); 31 | 32 | template 33 | void deviceFill(T* devptr, int size, T value); 34 | 35 | template 36 | void cudaD2Hcpy(T* tgt, const T* src, const int size); 37 | 38 | template 39 | void cudaH2Dcpy(T* tgt, const T* src, const int size); 40 | 41 | template 42 | void cudaD2Dcpy(T* tgt, const T* src, const int size); 43 | 44 | template 45 | void cudaRandomUniform(T* buffer, const int size); 46 | 47 | template 48 | int loadWeightFromBin(T* ptr, 49 | std::vector shape, 50 | std::string filename, 51 | FtCudaDataType model_file_type = FtCudaDataType::FP32); 52 | 53 | void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const int size, cudaStream_t stream); 54 | void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const int size, cudaStream_t stream); 55 | 56 | } // namespace fastertransformer 57 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/beam_search_topk_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
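Compared with the single-GPU config earlier, the multi_gpu_gpt gpt_config.ini above adds tensor_para_size, pipeline_para_size, int8_mode, and per-model start/end ids and inter_size. A short sketch of the sizing arithmetic; the divisibility checks reflect my understanding of the parallelism constraints and are assumptions, not taken from this file:

    import configparser

    cfg = configparser.ConfigParser(inline_comment_prefixes=(';',))
    cfg.read('examples/cpp/multi_gpu_gpt/gpt_config.ini')
    hyper = cfg['ft_instance_hyperparameter']
    model = cfg[hyper['model_name']]                     # e.g. [megatron_345M]

    tp = hyper.getint('tensor_para_size')
    pp = hyper.getint('pipeline_para_size')
    print('GPUs needed (tensor x pipeline):', tp * pp)
    assert model.getint('head_num') % tp == 0            # heads are split across tensor ranks
    assert model.getint('decoder_layers') % pp == 0      # layers are split across pipeline ranks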
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | 19 | #pragma once 20 | 21 | namespace fastertransformer { 22 | 23 | template 24 | void invokeTopkBeamSearch(void* workspace, 25 | size_t& workspace_size, 26 | T* log_probs, 27 | int* ids, 28 | const bool* finished, 29 | const int batch_size, 30 | const int beam_width, 31 | const int vocab_size_padded_, 32 | const T diversity_rate, 33 | const int* end_ids, 34 | cudaStream_t stream); 35 | 36 | template 37 | void invokeTileEncoderResults(T* tiled_encoder_output, 38 | int* tiled_encoder_sequence_length, 39 | const T* encoder_output, 40 | const int* encoder_sequence_length, 41 | const size_t batch_size, 42 | const size_t beam_width, 43 | const size_t mem_max_seq_len, 44 | const size_t d_model, 45 | cudaStream_t stream); 46 | 47 | } // namespace fastertransformer 48 | -------------------------------------------------------------------------------- /cmake/FasterTransformerConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
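invokeTopkBeamSearch in beam_search_topk_kernels.h above performs the per-step candidate selection of beam search on the GPU. The NumPy sketch below shows only the core idea of that step (add the running scores, flatten beams and vocabulary, keep the best beam_width continuations); diversity_rate, the finished mask, end ids and the workspace layout are all omitted:

    import numpy as np

    batch, beam, vocab = 1, 4, 10
    log_probs = np.log(np.random.dirichlet(np.ones(vocab), size=(batch, beam)))
    cum_scores = np.random.rand(batch, beam, 1)             # scores accumulated so far
    scores = (cum_scores + log_probs).reshape(batch, beam * vocab)
    topk = np.argsort(-scores, axis=-1)[:, :beam]           # best `beam` continuations
    parent_beam = topk // vocab                             # which beam each winner extends
    token_id = topk % vocab                                 # which token it appends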
26 | 27 | include(CMakeFindDependencyMacro) 28 | 29 | get_filename_component( 30 | FASTERTRANSFORMER_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 31 | ) 32 | 33 | list(APPEND CMAKE_MODULE_PATH ${FASTERTRANSFORMER_CMAKE_DIR}) 34 | 35 | if(NOT TARGET transformer-shared) 36 | include("${FASTERTRANSFORMER_CMAKE_DIR}/FasterTransformerTargets.cmake") 37 | endif() 38 | 39 | set(FASTERTRANSFORMER_LIBRARIES transformer-shared) 40 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/custom_ar_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "src/fastertransformer/utils/cuda_utils.h" 25 | 26 | #define CUSTOM_AR_SIZE_THRESHOLD 50331648 27 | #define MAX_ALL_REDUCE_BLOCKS 24 28 | #define FLAG(a) ((uint32_t)((a) % 0x146)) 29 | #define RANKS_PER_NODE 8 30 | #define WARP_SIZE 32 31 | #define DEFAULT_BLOCK_SIZE 1024 32 | #define DEFALUT_ALGO_AR_SIZE_THRESHOLD 196608 33 | 34 | namespace fastertransformer { 35 | 36 | #ifdef ENABLE_BF16 37 | typedef struct bf168 { 38 | __nv_bfloat162 x; 39 | __nv_bfloat162 y; 40 | __nv_bfloat162 z; 41 | __nv_bfloat162 w; 42 | } bf168; 43 | #endif 44 | 45 | template 46 | struct AllReduceParams { 47 | size_t elts_total; 48 | size_t elts_per_rank; 49 | size_t elts_per_block; 50 | size_t rank_offset; 51 | size_t rank, local_rank, node_id; 52 | uint32_t barrier_flag; 53 | uint32_t* peer_barrier_ptrs[RANKS_PER_NODE]; 54 | T* peer_comm_buffer_ptrs[RANKS_PER_NODE]; 55 | T* local_output_buffer_ptr; 56 | }; 57 | 58 | template 59 | void invokeOneOrTwoShotAllReduceKernel(AllReduceParams& param, cudaStream_t stream); 60 | 61 | void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo); 62 | 63 | } // namespace fastertransformer -------------------------------------------------------------------------------- /src/fastertransformer/layers/DynamicDecodeBaseLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
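custom_ar_kernels.h above backs the custom all-reduce path; the size-threshold macros suggest it picks between a one-shot and a two-shot (reduce-scatter then all-gather) strategy depending on message size. The NumPy sketch below only illustrates the data movement of the two strategies, not the CUDA barriers or peer-to-peer buffers:

    import numpy as np

    ranks = 4
    bufs = [np.random.rand(8).astype(np.float32) for _ in range(ranks)]   # one buffer per GPU

    # One-shot: every rank reads all peers and reduces the full vector itself.
    one_shot = [sum(bufs) for _ in range(ranks)]

    # Two-shot: each rank reduces only its slice, then gathers the other slices.
    parts = [np.array_split(b, ranks) for b in bufs]
    reduced = [sum(parts[r][s] for r in range(ranks)) for s in range(ranks)]
    two_shot = [np.concatenate(reduced) for _ in range(ranks)]

    assert all(np.allclose(a, b) for a, b in zip(one_shot, two_shot))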
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include "src/fastertransformer/layers/BaseLayer.h" 23 | 24 | namespace fastertransformer { 25 | 26 | class DynamicDecodeBaseLayer: public BaseLayer { 27 | protected: 28 | virtual void allocateBuffer() = 0; 29 | virtual void freeBuffer() = 0; 30 | 31 | public: 32 | DynamicDecodeBaseLayer(cudaStream_t stream, 33 | cublasMMWrapper* cublas_wrapper, 34 | IAllocator* allocator, 35 | bool is_free_buffer_after_forward, 36 | cudaDeviceProp* cuda_device_prop): 37 | BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop){}; 38 | ~DynamicDecodeBaseLayer() = default; 39 | DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){}; 40 | 41 | virtual void forward(std::vector* output_tensors, 42 | const std::vector* input_tensors) = 0; 43 | virtual void forward(std::unordered_map* output_tensors, 44 | const std::unordered_map* input_tensors) = 0; 45 | }; 46 | 47 | } // namespace fastertransformer 48 | -------------------------------------------------------------------------------- /src/fastertransformer/kernels/beam_search_penalty_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | 20 | #include "src/fastertransformer/utils/cuda_utils.h" 21 | 22 | namespace fastertransformer { 23 | 24 | template 25 | void invokeAddBiasApplyPenalties(int step, 26 | T* logits, 27 | const int* current_ids, 28 | const int* previous_ids, 29 | const int* parent_ids, 30 | const int* input_lengths, 31 | const T* bias, 32 | const int ite, 33 | const int max_input_length, 34 | const int local_batch_size, 35 | const int batch_size, 36 | const int beam_width, 37 | const int vocab_size, 38 | const int vocab_size_padded, 39 | const int* end_ids, 40 | const float temerature, 41 | const float len_penalty, 42 | const float repeat_penalty, 43 | cudaStream_t stream); 44 | 45 | } // namespace fastertransformer 46 | -------------------------------------------------------------------------------- /src/fastertransformer/models/swin/SwinWeight.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
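invokeAddBiasApplyPenalties in beam_search_penalty_kernels.h above folds the bias add, temperature scaling, repetition and length penalties, and end-id handling into one kernel. The Python sketch below shows only temperature scaling and a CTRL-style repetition penalty as I understand them; the exact formulas used by the kernel are not visible in this header, so treat this as an assumption:

    import numpy as np

    def apply_penalties(logits, previous_ids, temperature=1.0, repetition_penalty=1.0):
        logits = logits / temperature                 # temperature scaling
        for t in set(previous_ids):                   # penalise tokens generated before
            logits[t] = np.where(logits[t] > 0,
                                 logits[t] / repetition_penalty,
                                 logits[t] * repetition_penalty)
        return logits

    logits = np.random.randn(8).astype(np.float32)
    out = apply_penalties(logits, previous_ids=[2, 5],
                          temperature=0.7, repetition_penalty=2.0)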
--------------------------------------------------------------------------------
/src/fastertransformer/models/swin/SwinWeight.h:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include "src/fastertransformer/kernels/layernorm_kernels.h"
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include <vector>

namespace fastertransformer {

template<typename T>
class SwinTransformerBlockWeight {
public:
    AttentionWeight<T> attention_weights;
    FfnWeight<T>       ffn_weights;
    LayerNormWeight<T> attn_layernorm_weights;
    LayerNormWeight<T> ffn_layernorm_weights;
    const T*           attention_relative_pos_bias = nullptr;
};  // SwinTransformerBlockWeight

template<typename T>
class SwinTransformerBasicLayerWeight {
public:
    LayerNormWeight<T> merge_layernorm_weights;
    DenseWeight<T>     merge_linear_weights;
    const T*           attn_mask = nullptr;
    std::vector<SwinTransformerBlockWeight<T>> block_weight_list;
};  // SwinTransformerBasicLayerWeight

template<typename T>
class SwinTransformerWeight {
public:
    DenseWeight<T>     patchEmbed_linear_weights;
    LayerNormWeight<T> patchEmbed_norm_weights;
    LayerNormWeight<T> norm_weights;
    std::vector<SwinTransformerBasicLayerWeight<T>> basic_layer_weight_list;
};  // class SwinTransformerWeight

}  // namespace fastertransformer
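To make the nesting of these weight containers concrete, here is a small hypothetical sketch (not part of the repository) that assembles a SwinTransformerWeight<float> holding one basic layer with two blocks. Pointer members stay at their nullptr defaults; a real weight loader would point them at device buffers.

// Hypothetical helper, not repository code. Assumes the repository root is on the include path.
#include "src/fastertransformer/models/swin/SwinWeight.h"

using namespace fastertransformer;

// Builds an empty weight hierarchy: one basic layer containing two transformer blocks.
SwinTransformerWeight<float> buildToySwinWeight()
{
    SwinTransformerWeight<float> model_weight;

    SwinTransformerBasicLayerWeight<float> basic_layer;
    for (int block = 0; block < 2; block++) {
        SwinTransformerBlockWeight<float> block_weight;  // members default-initialized
        basic_layer.block_weight_list.push_back(block_weight);
    }

    model_weight.basic_layer_weight_list.push_back(basic_layer);
    return model_weight;
}

The vector-of-vectors layout mirrors the Swin architecture itself: transformer blocks nest inside basic layers, which nest inside the full model.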
--------------------------------------------------------------------------------
/examples/pytorch/gpt/utils/parallel_gpt.py:
--------------------------------------------------------------------------------
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import torch

from examples.pytorch.gpt.utils.gpt import GPT


class ParallelGPT(GPT):
    def __init__(self, head_num, size_per_head, vocab_size, start_id, end_id, layer_num, max_seq_len,
                 tensor_para_size, pipeline_para_size, lib_path, int8_mode):
        super().__init__(head_num, size_per_head, vocab_size, start_id, end_id, layer_num, max_seq_len,
                         tensor_para_size, pipeline_para_size, lib_path, int8_mode)

    def cuda(self):
        self.weights._map(lambda w: w.cuda(self.device))
        if self.int8_mode != 0:
            self.weights._map_int8(lambda w: w.cuda(self.device))

        if self.build_model:
            del self.model
            self.build_model = False
        self.model = torch.classes.FasterTransformer.ParallelGptOp(
            self.head_num, self.size_per_head, 4 * self.head_num * self.size_per_head,
            self.layer_num, self.vocab_size, self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size, self.int8_mode,
            self.weights.w, self.weights.int8_w, self.weights.scale)
        self.build_model = True

--------------------------------------------------------------------------------
/src/fastertransformer/layers/sampling_layers/CMakeLists.txt:
--------------------------------------------------------------------------------
# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.8)

add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels)

add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels)

add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)

add_library(TopKTopPSamplingLayer STATIC TopKTopPSamplingLayer.cu)
set_property(TARGET TopKTopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKTopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKTopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)

--------------------------------------------------------------------------------