├── tests ├── __init__.py ├── cublas │ └── build.sh ├── gemm_test │ ├── gemm_test.sh │ └── gemm_test.py └── huggingface │ └── test_gpt.py ├── lightseq ├── csrc │ ├── export │ │ └── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── cuda │ │ │ └── __init__.py │ ├── .gitignore │ ├── layers_new │ │ ├── launch_gpt_emb_layer.cpp │ │ ├── CMakeLists.txt │ │ ├── includes │ │ │ ├── linear_layer.h │ │ │ ├── llama_layer.h │ │ │ ├── crf_layer.h │ │ │ ├── sample_layer.h │ │ │ ├── encdec_kv_layer.h │ │ │ ├── llama_mlp_layer.h │ │ │ ├── sdpa_layer.h │ │ │ ├── gpt_layer.h │ │ │ ├── transformer_encoder_layer.h │ │ │ ├── generator_layer.h │ │ │ └── rms_norm_layer.h │ │ └── llama_layer.cpp │ ├── lsflow │ │ ├── README.md │ │ ├── includes │ │ │ ├── allocator.h │ │ │ ├── lsflow_util.h │ │ │ └── shape.h │ │ ├── CMakeLists.txt │ │ ├── operator.cpp │ │ ├── shape.cpp │ │ ├── lsflow_util.cpp │ │ └── allocator.cpp │ ├── kernels │ │ ├── x86 │ │ │ ├── includes │ │ │ │ ├── kernel_headers.h │ │ │ │ └── kernels.h │ │ │ ├── CMakeLists.txt │ │ │ ├── util.cc │ │ │ └── gemm.cpp │ │ ├── arm │ │ │ ├── gemm.cc │ │ │ ├── includes │ │ │ │ ├── utils.h │ │ │ │ └── kernel_headers.h │ │ │ ├── CMakeLists.txt │ │ │ └── utils.cc │ │ └── cuda │ │ │ ├── includes │ │ │ ├── ls_cub.cuh │ │ │ ├── kernel_headers.h │ │ │ ├── embKernels.h │ │ │ ├── cuda_util.h │ │ │ └── llama_kernels.h │ │ │ └── CMakeLists.txt │ ├── tensorflow │ │ └── README.md │ ├── models │ │ ├── includes │ │ │ ├── model_util.h │ │ │ ├── bert.h │ │ │ └── bert_crf.h │ │ ├── model_util.cc │ │ └── CMakeLists.txt │ ├── pybind │ │ └── CMakeLists.txt │ ├── proto │ │ ├── includes │ │ │ ├── proto_headers.h │ │ │ ├── test_model_weight.h │ │ │ └── hdf5_util.h │ │ └── CMakeLists.txt │ ├── pytorch │ │ ├── builder │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── pytorch_quantization │ │ │ ├── optim │ │ │ │ └── __init__.py │ │ │ ├── nn │ │ │ │ ├── modules │ │ │ │ │ └── __init__.py │ │ │ │ ├── _functions │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ ├── version.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── quant_logging.py │ │ │ ├── __init__.py │ │ │ └── calib │ │ │ │ └── __init__.py │ │ └── sdpa_layers.py │ ├── example │ │ └── CMakeLists.txt │ ├── ops_new │ │ ├── CMakeLists.txt │ │ ├── includes │ │ │ ├── fuse_add2_op.h │ │ │ ├── transform_0213.h │ │ │ ├── dropout.h │ │ │ ├── act_elewise_product.h │ │ │ ├── crf.h │ │ │ ├── layer_normalize.h │ │ │ ├── rms_layer_norm.h │ │ │ ├── bias_add_transform_20314.h │ │ │ ├── bias_dropout_residual.h │ │ │ ├── bias_act_dropout.h │ │ │ ├── softmax.h │ │ │ ├── concat3_dim1.h │ │ │ ├── launch_enc_emb.h │ │ │ ├── launch_dec_emb_op.h │ │ │ ├── sampling.h │ │ │ ├── launch_llama_emb.h │ │ │ ├── linear.h │ │ │ └── launch_gpt_emb.h │ │ ├── fuse_add2_op.cpp │ │ ├── act_elewise_product.cpp │ │ ├── fuse_rotary_position_qkv.cpp │ │ ├── transform_0213.cpp │ │ ├── dropout.cpp │ │ ├── launch_dec_emb_op.cpp │ │ └── launch_enc_emb.cpp │ ├── ops │ │ └── includes │ │ │ ├── context.h │ │ │ └── softmax.h │ ├── layers │ │ └── includes │ │ │ └── cross_entropy_layer.h │ └── triton_backend │ │ └── src │ │ ├── triton_utils.h │ │ └── libtriton_minimal.ldscript ├── training │ ├── cli │ │ ├── __init__.py │ │ ├── fs_modules │ │ │ └── __init__.py │ │ ├── lightseq_deepspeed_cli.py │ │ ├── lightseq_fairseq_generate_cli.py │ │ ├── lightseq_fairseq_validate_cli.py │ │ └── lightseq_fairseq_train_cli.py │ ├── ops │ │ ├── __init__.py │ │ ├── tensorflow │ │ │ ├── __init__.py │ │ │ └── README.md │ │ └── pytorch │ │ │ ├── __init__.py │ │ │ └── builder │ │ │ ├── __init__.py │ │ 
│ └── adam_builder.py │ ├── gcq │ │ ├── __init__.py │ │ └── ls_fs_gcq_trainer.py │ ├── pytorch_quantization │ │ ├── optim │ │ │ └── __init__.py │ │ ├── nn │ │ │ ├── modules │ │ │ │ └── __init__.py │ │ │ ├── _functions │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── version.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── quant_logging.py │ │ ├── __init__.py │ │ └── calib │ │ │ └── __init__.py │ └── __init__.py ├── __init__.py └── inference │ ├── server │ └── libserver.ldscript │ ├── tools │ └── CMakeLists.txt │ ├── kernels │ ├── CMakeLists.txt │ ├── t5EmbKernels.h │ ├── embKernels_int8.h │ ├── t5Kernels.h │ ├── embKernels.h │ └── multilgKernels.h │ ├── pywrapper │ ├── vit.h │ ├── bert.h │ ├── quant_vit.h │ ├── quant_bert.h │ ├── gpt.h │ └── quant_gpt.h │ └── triton_backend │ └── src │ ├── triton_utils.h │ └── libtriton_minimal.ldscript ├── examples ├── inference │ ├── python │ │ ├── __init__.py │ │ ├── export │ │ │ ├── __init__.py │ │ │ ├── fairseq │ │ │ │ └── __init__.py │ │ │ ├── proto │ │ │ │ └── __init__.py │ │ │ ├── huggingface │ │ │ │ └── __init__.py │ │ │ └── util.py │ │ └── test │ │ │ └── ls_fairseq.sh │ ├── cpp │ │ └── CMakeLists.txt │ ├── benchmark_gpt.sh │ ├── benchmark_quant_gpt.sh │ ├── benchmark_bart.sh │ └── benchmark_quant_bart.sh ├── training │ ├── neurst │ │ ├── __init__.py │ │ └── README.md │ ├── deepspeed │ │ ├── __init__.py │ │ ├── deepspeed_config.json │ │ ├── README.md │ │ ├── ds_fairseq_argument.py │ │ └── ds_fairseq_wmt14en2de.sh │ ├── huggingface │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── task_ner │ │ │ │ ├── run_ner.sh │ │ │ │ ├── run_quant_ner.sh │ │ │ │ └── run_gcq_ner.sh │ │ │ ├── task_qa │ │ │ │ ├── run_qa.sh │ │ │ │ ├── run_quant_qa.sh │ │ │ │ └── run_gcq_qa.sh │ │ │ └── task_glue │ │ │ │ ├── run_glue.sh │ │ │ │ └── run_quant_glue.sh │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── run_clm.sh │ │ │ ├── run_quant_clm.sh │ │ │ └── run_gcq_clm.sh │ │ ├── vit │ │ │ ├── __init__.py │ │ │ ├── run_vit.sh │ │ │ └── run_quant_vit.sh │ │ ├── gcq │ │ │ ├── __init__.py │ │ │ ├── cli_utils.py │ │ │ └── ls_hf_gcq_trainer.py │ │ └── bart │ │ │ └── summarization │ │ │ ├── requirements.txt │ │ │ └── run_summarization.sh │ ├── custom │ │ ├── run.sh │ │ ├── run_quant.sh │ │ └── README.md │ └── fairseq │ │ ├── requirements.txt │ │ ├── ls_finetune_bart │ │ ├── convert_lightseq_to_huggingface.sh │ │ └── ls_fairseq_summarization_cnn_dm.sh │ │ ├── native_fairseq_wmt14en2de.sh │ │ ├── ls_fairseq_wmt14en2de.sh │ │ ├── ls_torch_fairseq_wmt14en2de.sh │ │ ├── ls_fairseq_quant_wmt14en2de.sh │ │ ├── ls_torch_fairseq_quant_wmt14en2de.sh │ │ └── ls_fairseq_gcq_wmt14en2de.sh └── triton_backend │ ├── model_repo │ ├── bert_example │ │ ├── 1 │ │ │ └── .gitignore │ │ └── config.pbtxt │ ├── gpt_example │ │ ├── 1 │ │ │ └── .gitignore │ │ └── config.pbtxt │ └── transformer_example │ │ ├── 1 │ │ └── .gitignore │ │ └── config.pbtxt │ └── transformer_client_example.py ├── CODEOWNERS ├── docs ├── images │ ├── nmt.png │ ├── logo.png │ ├── features.png │ ├── support.png │ ├── generation.png │ ├── single_step.png │ └── total_time.png └── examples.md ├── .clang-format ├── docker └── README.md ├── MANIFEST.in ├── .gitmodules ├── CONTRIBUTING.md ├── .pre-commit-config.yaml └── .github └── workflows └── build_check.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/export/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/training/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/training/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/neurst/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | -------------------------------------------------------------------------------- /examples/training/deepspeed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/export/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/huggingface/vit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.0.1" 2 | -------------------------------------------------------------------------------- /lightseq/training/ops/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/export/fairseq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/export/proto/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/launch_gpt_emb_layer.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/examples/inference/python/export/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/bert_example/1/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/gpt_example/1/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/transformer_example/1/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @neopro12 @godweiyang @Taka152 @hexisyztem @zjersey 2 | -------------------------------------------------------------------------------- /examples/training/custom/run.sh: -------------------------------------------------------------------------------- 1 | python3 examples/training/custom/run.py 2 | -------------------------------------------------------------------------------- /lightseq/training/gcq/__init__.py: -------------------------------------------------------------------------------- 1 | from .gcq import GCQ, GCQState, encode_and_decode 2 | -------------------------------------------------------------------------------- /docs/images/nmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/nmt.png -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/logo.png -------------------------------------------------------------------------------- /examples/training/custom/run_quant.sh: -------------------------------------------------------------------------------- 1 | python3 examples/training/custom/run.py --enable_quant 2 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/README.md: -------------------------------------------------------------------------------- 1 | LsFlow is an extremely clean implementation of a computation graph.
2 | -------------------------------------------------------------------------------- /docs/images/features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/features.png -------------------------------------------------------------------------------- /docs/images/support.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/support.png -------------------------------------------------------------------------------- /docs/images/generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/generation.png -------------------------------------------------------------------------------- /docs/images/single_step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/single_step.png -------------------------------------------------------------------------------- /docs/images/total_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/total_time.png -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | --- 4 | Language: Cpp 5 | ColumnLimit: 80 6 | SortIncludes: false 7 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.h" 3 | #include "kernels.h" 4 | -------------------------------------------------------------------------------- /examples/training/huggingface/gcq/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli_utils import GCQArguments 2 | from .ls_hf_gcq_trainer import LSTrainer 3 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | ## Dockerfiles of lightseq 2 | 3 | PyPI: for publishing the Python package. 4 | 5 | Tritonserver: for publishing the Triton server. 6 | -------------------------------------------------------------------------------- /examples/training/fairseq/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.19.5 2 | sacrebleu==1.5.1 3 | sacremoses 4 | fairseq==0.10.2 5 | lightseq 6 | ninja 7 | -------------------------------------------------------------------------------- /lightseq/csrc/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | ## Please refer to [NeurST](https://github.com/bytedance/neurst/tree/lightseq) for more information. 2 | -------------------------------------------------------------------------------- /lightseq/training/ops/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | ## Please refer to [NeurST](https://github.com/bytedance/neurst/tree/lightseq) for more information.
2 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/requirements.txt: -------------------------------------------------------------------------------- 1 | torch >= 1.3 2 | datasets >= 1.8.0 3 | sentencepiece != 0.1.92 4 | protobuf 5 | transformers == 4.16.2 6 | -------------------------------------------------------------------------------- /tests/cublas/build.sh: -------------------------------------------------------------------------------- 1 | nvcc -c gemm.cu -o gemm.cuda.o 2 | nvcc gemm.cuda.o test.cpp -o test -L/usr/local/cuda/lib64 -lcudart -lcuda -lcublas -lcublasLt 3 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/gemm.cc: -------------------------------------------------------------------------------- 1 | #include "kernel_headers.h" 2 | 3 | namespace lightseq { 4 | namespace arm {} // namespace arm 5 | } // namespace lightseq 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.txt 2 | global-include *.cu *.cpp *.cc *.cuh *.h *.ldscript *.proto *.cmake 3 | prune dist 4 | prune build 5 | prune tests 6 | prune examples 7 | -------------------------------------------------------------------------------- /lightseq/inference/server/libserver.ldscript: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | CustomErrorString; 4 | CustomExecute; 5 | CustomFinalize; 6 | CustomInitialize; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /examples/training/huggingface/bart/summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.8.0 3 | sentencepiece != 0.1.92 4 | protobuf 5 | rouge-score 6 | nltk 7 | py7zr 8 | torch >= 1.3 9 | -------------------------------------------------------------------------------- /lightseq/csrc/models/includes/model_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | 4 | namespace lightseq { 5 | 6 | GenerateMethod get_generate_method(std::string method_); 7 | 8 | } // namespace lightseq 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/pybind11"] 2 | path = 3rdparty/pybind11 3 | url = https://github.com/pybind/pybind11.git 4 | [submodule "3rdparty/cub"] 5 | path = 3rdparty/cub 6 | url = https://github.com/NVIDIA/cub.git 7 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/includes/utils.h: -------------------------------------------------------------------------------- 1 | #include "cstdio" 2 | #include "iostream" 3 | 4 | namespace lightseq { 5 | 6 | template 7 | void print_vec(const T *outv, std::string outn, int num_output_ele); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /lightseq/training/cli/fs_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .ls_adam import LSFSAdam 2 | from .ls_label_smoothed_cross_entropy import LSLabelSmoothedCrossEntropyCriterion 3 | from .ls_transformer import 
LSTransformerModel 4 | from .ls_bart import LSBARTModel 5 | from .ls_translation import LSTranslationTask 6 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | set(lightseq_kernel_files gemm.cc utils.cc) 5 | 6 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 7 | target_include_directories(lightseq_kernels INTERFACE includes) 8 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "utils.h" 14 | -------------------------------------------------------------------------------- /lightseq/csrc/pybind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(LS_PYBIND_KERNEL_FILES pybind_model.cpp) 4 | pybind11_add_module(lightseq MODULE ${LS_PYBIND_KERNEL_FILES}) 5 | target_link_libraries(lightseq PUBLIC liblightseq lightseq_kernels) 6 | set_target_properties(lightseq PROPERTIES OUTPUT_NAME inference) 7 | -------------------------------------------------------------------------------- /tests/gemm_test/gemm_test.sh: -------------------------------------------------------------------------------- 1 | python3 gemm_test.py -hd 1024 -id 4096 -minb 1 -maxb 10000 -d configs 2 | python3 gemm_test.py -hd 512 -id 2048 -minb 1 -maxb 10000 -d configs 3 | python3 gemm_test.py -hd 768 -id 3072 -minb 1 -maxb 10000 -d configs 4 | 5 | mkdir -p $HOME/.lightseq/igemm_configs 6 | cp configs/* $HOME/.lightseq/igemm_configs 7 | -------------------------------------------------------------------------------- /lightseq/csrc/tests/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | par_dir = os.path.dirname(cur_dir) 5 | csrc_dir = os.path.dirname(par_dir) 6 | lightseq_dir = os.path.dirname(csrc_dir) 7 | 8 | sys.path.insert(0, lightseq_dir) 9 | sys.path.insert(0, os.path.dirname(lightseq_dir)) 10 | -------------------------------------------------------------------------------- /lightseq/inference/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | 6 | add_library(utils STATIC util.cc.cu) 7 | target_include_directories(utils PUBLIC ${HDF5_INCLUDE_DIRS}) 8 | target_include_directories(utils INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 9 | target_link_libraries(utils PRIVATE ${HDF5_LIBRARIES}) 10 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_deepspeed_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from deepspeed.launcher.runner import main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 
10 | main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_fairseq_generate_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from fairseq_cli.generate import cli_main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 10 | cli_main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_fairseq_validate_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from fairseq_cli.validate import cli_main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 10 | cli_main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_finetune_bart/convert_lightseq_to_huggingface.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | # The model's directory should contain both source and target vocabulary files 4 | fairseq_path=/path/to/model.pt 5 | save_dir=/path/to/save_dir 6 | 7 | python3 convert_lightseq_to_huggingface.py \ 8 | --fairseq_path $fairseq_path \ 9 | --pytorch_dump_folder_path $save_dir \ 10 | --hf_config facebook/bart-base 11 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/includes/proto_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "hdf5.h" 14 | 15 | #include "declaration.h" 16 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_fairseq_train_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from lightseq.training.gcq.ls_fs_gcq_train import cli_main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 10 | cli_main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /lightseq/training/ops/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch_transformer_layers import ( 2 | TransformerEncoderLayer, 3 | TransformerDecoderLayer, 4 | TransformerEmbeddingLayer, 5 | ) 6 | from .quantization import TensorQuantizer, act_quant_config, QuantLinear 7 | from .builder.transformer_builder import TransformerBuilder 8 | from .builder.operator_builder import OperatorBuilder 9 | from .builder.layer_builder import LayerBuilder 10 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/ls_cub.cuh: 
-------------------------------------------------------------------------------- 1 | // copied from https://github.com/dmlc/dgl/pull/2758 2 | #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_ 3 | #define DGL_ARRAY_CUDA_DGL_CUB_CUH_ 4 | 5 | #define CUB_NS_PREFIX namespace ls { 6 | #define CUB_NS_POSTFIX } 7 | #define CUB_NS_QUALIFIER ::ls::cub 8 | #include "cub/cub.cuh" 9 | #include "cub/util_allocator.cuh" 10 | #undef CUB_NS_POSTFIX 11 | #undef CUB_NS_PREFIX 12 | #undef CUB_NS_QUALIFIER 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/includes/allocator.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | #pragma once 5 | #include "declaration.h" 6 | 7 | namespace lightseq { 8 | 9 | class Allocator { 10 | private: 11 | std::unordered_set _ptr_set; 12 | 13 | public: 14 | Allocator(); 15 | virtual ~Allocator(); 16 | char* malloc_mem(size_t size); 17 | void free_mem(char* ptr); 18 | }; 19 | 20 | } // namespace lightseq 21 | -------------------------------------------------------------------------------- /lightseq/csrc/models/model_util.cc: -------------------------------------------------------------------------------- 1 | #include "model_util.h" 2 | 3 | namespace lightseq { 4 | 5 | GenerateMethod get_generate_method(std::string method_) { 6 | if (method_ == "topk") return GenerateMethod::Topk; 7 | if (method_ == "topp") return GenerateMethod::Topp; 8 | if (method_ == "beam_search") return GenerateMethod::BeamSearch; 9 | 10 | printf("Error!\n"); 11 | return GenerateMethod::UnDefined; 12 | } 13 | 14 | } // namespace lightseq 15 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | 5 | set(lightseq_kernel_files util.cc gemm.cpp) 6 | 7 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 8 | target_include_directories(lightseq_kernels PUBLIC ${HDF5_INCLUDE_DIRS}) 9 | target_include_directories(lightseq_kernels INTERFACE includes) 10 | target_link_libraries(lightseq_kernels PRIVATE ${HDF5_LIBRARIES}) 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | First of all, thanks for taking the time to contribute! 4 | 5 | Please refer to the following guidelines to contribute new functionality or bug fixes: 6 | 7 | 1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code. 8 | 2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to LightSeq C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). 
9 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(LightseqProtoType LANGUAGES CXX CUDA) 4 | 5 | find_package(Threads REQUIRED) 6 | 7 | set(CMAKE_CXX_STANDARD 14) 8 | 9 | add_library( 10 | lsflow STATIC 11 | context.cpp 12 | node.cpp 13 | manager.cpp 14 | layer.cpp 15 | tensor.cpp 16 | allocator.cpp 17 | lsflow_util.cpp 18 | operator.cpp 19 | shape.cpp 20 | variable.cpp) 21 | 22 | target_link_libraries(lsflow PUBLIC lightseq_kernels) 23 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import CUDAOpBuilder 2 | from .cuda_kernel_builder import CudaKernelBuilder 3 | from .x86_kernel_builder import X86KernelBuilder 4 | from .cuda_layer_builder import CudaLayerBuilder 5 | 6 | # TODO: infer this list instead of hard coded 7 | # List of all available ops 8 | __op_builders__ = [ 9 | CudaKernelBuilder(), 10 | CudaLayerBuilder(), 11 | X86KernelBuilder(), 12 | ] 13 | 14 | ALL_OPS = {op.name: op for op in __op_builders__} 15 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/includes/lsflow_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | 5 | #pragma once 6 | #include "declaration.h" 7 | 8 | namespace lightseq { 9 | 10 | /* Print run time, for debug */ 11 | void print_time_duration( 12 | const std::chrono::high_resolution_clock::time_point &start, 13 | std::string duration_name); 14 | 15 | #ifdef LIGHTSEQ_cuda 16 | cublasOperation_t op_from_custom(MATRIX_OP op_type); 17 | #endif 18 | 19 | } // namespace lightseq 20 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | csrc_dir = os.path.dirname(cur_dir) 5 | lightseq_dir = os.path.dirname(csrc_dir) 6 | sys.path.insert(0, lightseq_dir) 7 | 8 | from .builder.cuda_kernel_builder import CudaKernelBuilder 9 | from .builder.x86_kernel_builder import X86KernelBuilder 10 | from .builder.cuda_layer_builder import CudaLayerBuilder 11 | 12 | from .torch_transformer_layers import TransformerEncoderLayer, TransformerDecoderLayer 13 | -------------------------------------------------------------------------------- /examples/inference/python/test/ls_fairseq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | until [[ -z "$1" ]] 4 | do 5 | case $1 in 6 | -m) 7 | shift; MODEL=$1; 8 | shift;; 9 | *) 10 | shift;; 11 | esac 12 | done 13 | 14 | lightseq-infer /tmp/wmt14_en_de/ \ 15 | --gen-subset test \ 16 | --path ${MODEL} \ 17 | --task translation \ 18 | --batch-size 128 \ 19 | --beam 4 \ 20 | --lenpen 0.6 \ 21 | --fp16 \ 22 | --quiet \ 23 | --scoring sacrebleu 24 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | 
set(cuda_kernel_files 4 | gptKernels.cc.cu 5 | gptKernels_int8.cc.cu 6 | transformerKernels.cc.cu 7 | multilgKernels.cc.cu 8 | embKernels.cc.cu 9 | embKernels_int8.cc.cu 10 | transformerKernels_int8.cc.cu 11 | moeKernels.cc.cu 12 | t5Kernels.cc.cu 13 | t5EmbKernels.cc.cu) 14 | 15 | add_library(cuda_kernels STATIC ${cuda_kernel_files}) 16 | target_include_directories(cuda_kernels INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 17 | -------------------------------------------------------------------------------- /lightseq/csrc/example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | add_executable(bert_example bert_example.cc) 4 | target_link_libraries(bert_example PUBLIC liblightseq) 5 | 6 | add_executable(transformer_example transformer_example.cc) 7 | target_link_libraries(transformer_example PUBLIC liblightseq) 8 | 9 | add_executable(gpt_example gpt_example.cc) 10 | target_link_libraries(gpt_example PUBLIC liblightseq) 11 | 12 | add_executable(llama_example llama_example.cc) 13 | target_link_libraries(llama_example PUBLIC liblightseq) 14 | -------------------------------------------------------------------------------- /examples/training/huggingface/gcq/cli_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class GCQArguments: 6 | """ 7 | Arguments Gradient Communication Quantization. 8 | """ 9 | 10 | enable_GCQ: bool = field(default=False, metadata={"help": "Whether to enable GCQ"}) 11 | GCQ_quantile: float = field( 12 | default=0.99, metadata={"help": "GCQ quantile value, between 0.0-1.0"} 13 | ) 14 | hidden_size: int = field( 15 | default=1024, metadata={"help": "The hidden size of model"} 16 | ) 17 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/includes/test_model_weight.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bert.pb.h" 3 | #include "proto_headers.h" 4 | #include "proto_util.h" 5 | 6 | namespace lightseq { 7 | template 8 | class TestModelWeight { 9 | private: 10 | const T* _p_d_weight_emb; 11 | std::vector _d_weight_emb; 12 | 13 | public: 14 | TestModelWeight(int weight_size) { 15 | _d_weight_emb.clear(); 16 | for (int i = 0; i < weight_size; i++) { 17 | _d_weight_emb.push_back(rand() % 100); 18 | } 19 | } 20 | const T*& weight_emb() const { return _p_d_weight_emb; } 21 | }; 22 | } // namespace lightseq 23 | -------------------------------------------------------------------------------- /lightseq/training/ops/pytorch/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import CUDAOpBuilder 2 | from .kernel_builder import KernelBuilder 3 | from .transformer_builder import TransformerBuilder 4 | from .operator_builder import OperatorBuilder 5 | from .adam_builder import AdamBuilder 6 | from .layer_builder import LayerBuilder 7 | 8 | # TODO: infer this list instead of hard coded 9 | # List of all available ops 10 | __op_builders__ = [ 11 | LayerBuilder(), 12 | KernelBuilder(), 13 | OperatorBuilder(), 14 | TransformerBuilder(), 15 | AdamBuilder(), 16 | ] 17 | ALL_OPS = {op.name: op for op in __op_builders__} 18 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/gpt_example/config.pbtxt: 
-------------------------------------------------------------------------------- 1 | name: "gpt_example" 2 | backend: "lightseq" 3 | max_batch_size: 8 4 | input [ 5 | { 6 | name: "token_ids" 7 | data_type: TYPE_INT32 8 | dims: [ -1 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "result" 14 | data_type: TYPE_INT32 15 | dims: [ -1 ] 16 | } 17 | ] 18 | instance_group [ 19 | { 20 | count: 1 21 | kind: KIND_GPU 22 | } 23 | ] 24 | default_model_filename: "lightseq_gpt2_base.hdf5" 25 | parameters: [ 26 | { 27 | key: "model_type" 28 | value: { 29 | string_value: "Gpt" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/bert_example/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "bert_example" 2 | backend: "lightseq" 3 | max_batch_size: 8 4 | input [ 5 | { 6 | name: "token_ids" 7 | data_type: TYPE_INT32 8 | dims: [ -1 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "encoder_output" 14 | data_type: TYPE_INT32 15 | dims: [ -1 ] 16 | } 17 | ] 18 | instance_group [ 19 | { 20 | count: 1 21 | kind: KIND_GPU 22 | } 23 | ] 24 | default_model_filename: "lightseq_bert_base_uncased.hdf5" 25 | parameters: [ 26 | { 27 | key: "model_type" 28 | value: { 29 | string_value: "Bert" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /lightseq/csrc/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(liblightseq SHARED bert.cc bert_crf.cc transformer.cu gpt.cc 2 | llama.cc model_util.cc) 3 | 4 | target_link_libraries(liblightseq PUBLIC lightseq_layers) 5 | 6 | target_link_libraries(liblightseq PUBLIC weight_lib) 7 | 8 | target_link_options(liblightseq PUBLIC $) 10 | 11 | target_include_directories(liblightseq PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 12 | 13 | set_target_properties(liblightseq PROPERTIES OUTPUT_NAME lightseq) 14 | 15 | # add_executable(test_example test_layer.cc) target_link_libraries(test_example 16 | # PUBLIC liblightseq) 17 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/run_clm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | 5 | python3 -m torch.distributed.launch \ 6 | --nproc_per_node=1 \ 7 | $THIS_DIR/run_clm.py \ 8 | --model_name_or_path gpt2 \ 9 | --dataset_name wikitext \ 10 | --dataset_config_name wikitext-103-raw-v1 \ 11 | --per_device_train_batch_size 16 \ 12 | --per_device_eval_batch_size 8 \ 13 | --num_train_epochs 1 \ 14 | --do_train \ 15 | --do_eval \ 16 | --output_dir /tmp/test-clm \ 17 | --overwrite_output_dir \ 18 | --fp16 \ 19 | --logging_steps 10 \ 20 | --block_size 512 \ 21 | --module_type 1 \ 22 | --enable_quant false 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | 8 | - repo: https://gitlab.com/daverona/pre-commit-cpp 9 | rev: 0.8.0 10 | hooks: 11 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 12 | args: [-style=file] 13 | 14 | - repo: https://github.com/psf/black 15 | rev: 22.3.0 16 | hooks: 17 | - id: black 18 | 19 | - repo: https://github.com/cheshirekow/cmake-format-precommit 20 | rev: v0.6.10 21 | hooks: 22 | - id: cmake-format 23 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "kernels.h" 18 | #include "embKernels.h" 19 | #include "gptKernels.h" 20 | #include "transformerKernels.h" 21 | #include "cuda_util.h" 22 | #include "cublas_wrappers.h" 23 | #include "llama_kernels.h" 24 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(cuda_kernel_files 4 | util.cc.cu 5 | cross_entropy.cu 6 | cublas_wrappers.cu 7 | cuda_util.cu 8 | dropout_kernels.cu 9 | embedding_kernels.cu 10 | embKernels.cc.cu 11 | # fused_adam_kernel.cu 12 | general_kernels.cu 13 | gptKernels.cc.cu 14 | llama_kernels.cu 15 | normalize_kernels.cu 16 | softmax_kernels.cu 17 | softmax_kernels_new.cu 18 | transform_kernels.cu 19 | transform_kernels_new.cu 20 | crf.cu 21 | transformerKernels.cc.cu) 22 | 23 | add_library(lightseq_kernels STATIC ${cuda_kernel_files}) 24 | target_link_libraries(lightseq_kernels PUBLIC -lcublas) 25 | -------------------------------------------------------------------------------- /examples/training/custom/README.md: -------------------------------------------------------------------------------- 1 | # Build models from scratch 2 | This repo contains an example of how to use LightSeq to build a model from scratch. In this example, we train a Transformer model using the LightSeq Transformer model, cross entropy layer and Adam optimizer. 3 | 4 | The source inputs of the encoder are batches of sentences and the target outputs of the decoder are their corresponding replies. We use the Hugging Face tokenizer to obtain the token indexes of the sentences.
5 | 6 | You can run the example simply by: 7 | ```shell 8 | sh examples/training/custom/run.sh 9 | ``` 10 | 11 | (Optional) You can also train the model using int8 mixed-precision: 12 | ```shell 13 | sh examples/training/custom/run_quant.sh 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/run_quant_clm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | 5 | python3 -m torch.distributed.launch \ 6 | --nproc_per_node=1 \ 7 | $THIS_DIR/run_clm.py \ 8 | --model_name_or_path gpt2 \ 9 | --dataset_name wikitext \ 10 | --dataset_config_name wikitext-103-raw-v1 \ 11 | --per_device_train_batch_size 16 \ 12 | --per_device_eval_batch_size 8 \ 13 | --num_train_epochs 2 \ 14 | --do_train \ 15 | --do_eval \ 16 | --output_dir /tmp/quant/test-clm \ 17 | --overwrite_output_dir \ 18 | --resume_from_checkpoint /tmp/test-clm \ 19 | --fp16 \ 20 | --logging_steps 10 \ 21 | --block_size 512 \ 22 | --module_type 1 \ 23 | --enable_quant true 24 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/includes/kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cstdio" 3 | #include "util.h" 4 | 5 | namespace lightseq { 6 | namespace x86 { 7 | 8 | template 9 | void matrix_gemm(const InType* inpA, const InType* inpB, OutType* outC, int m, 10 | int n, int k); 11 | 12 | template 13 | void gemm(bool a_is_packed, bool b_is_packed, bool transpose_a, 14 | bool transpose_b, int64_t m, int64_t n, int64_t k, float alpha, 15 | const AType* a, int64_t lda, const BType* b, int64_t ldb, float beta, 16 | CType* c, int64_t ldc, const CType* a_shift_compensation = nullptr); 17 | 18 | } // namespace x86 19 | } // namespace lightseq 20 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/operator.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | namespace lightseq { 3 | 4 | Operator::Operator(std::string name) : Node(name, NodeType::Operator) { 5 | _context_ptr->add_op(this); 6 | } 7 | 8 | void Operator::check_override_grad() { 9 | for (Node* p : this->_parents) { 10 | Variable* rp = static_cast(p); 11 | if (!rp->enable_override_grad()) { 12 | printf("can not override"); 13 | exit(-1); 14 | } 15 | } 16 | return; 17 | } 18 | 19 | void Operator::set_children(std::vector children) { 20 | if (!this->_children.empty()) { 21 | printf("children not empty!"); 22 | exit(-1); 23 | } 24 | for (Node* iter : children) { 25 | iter->set_parents({this}); 26 | } 27 | } 28 | } // namespace lightseq 29 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/transformer_example/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "transformer_example" 2 | backend: "lightseq" 3 | max_batch_size: 8 4 | input [ 5 | { 6 | name: "source_ids" 7 | data_type: TYPE_INT32 8 | dims: [ -1 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "target_ids" 14 | data_type: TYPE_INT32 15 | dims: [ -1 ] 16 | }, 17 | { 18 | name: "target_scores" 19 | data_type: TYPE_FP32 20 | dims: [ -1 ] 21 | } 22 | ] 23 | instance_group [ 24 | { 25 | count: 1 26 | kind: KIND_GPU 27 | } 28 | ] 29 | default_model_filename: "lightseq_bart_base.hdf5" 30 |
parameters: [ 31 | { 32 | key: "model_type" 33 | value: { 34 | string_value: "Transformer" 35 | } 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(layers_files 2 | feed_forward_layer.cpp 3 | linear_layer.cpp 4 | llama_attention_layer.cpp 5 | llama_mlp_layer.cpp 6 | llama_layer.cpp 7 | generator_layer.cpp 8 | gpt_attention_layer.cpp 9 | gpt_layer.cpp 10 | multihead_attention_layer.cpp 11 | transformer_encoder_layer.cpp 12 | dec_enc_attention_layer.cpp 13 | dec_self_attention_layer.cpp 14 | transformer_decoder_layer.cpp 15 | crf_layer.cpp 16 | encdec_kv_layer.cpp 17 | sample_layer.cpp 18 | sdpa_layer.cpp) 19 | 20 | add_library(lightseq_layers STATIC ${layers_files}) 21 | target_link_libraries(lightseq_layers PUBLIC lightseq_operators lsflow) 22 | target_include_directories(lightseq_layers PUBLIC includes) 23 | -------------------------------------------------------------------------------- /examples/training/deepspeed/deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 8192, 3 | "optimizer": { 4 | "type": "AdamW", 5 | "params": { 6 | "lr": 5e-4, 7 | "betas": [ 8 | 0.9, 9 | 0.98 10 | ], 11 | "eps": 1e-8, 12 | "weight_decay": 0.0001, 13 | "torch_adam": false 14 | } 15 | }, 16 | "scheduler": { 17 | "type": "WarmupDecayLR", 18 | "params": { 19 | "warmup_num_steps": 4000, 20 | "warmup_min_lr": 0, 21 | "warmup_max_lr": 5e-4, 22 | "total_num_steps": 1000000 23 | } 24 | }, 25 | "gradient_clipping": 0.0, 26 | "wall_clock_breakdown": false, 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 7 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | __version__ = "2.1.2" 18 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | __version__ = "2.1.2" 18 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(operator_files 2 | act_elewise_product.cpp 3 | beam_search_topk.cu 4 | bias_act_dropout.cpp 5 | bias_add_transform_20314.cpp 6 | bias_dropout_residual.cpp 7 | concat3_dim1.cpp 8 | crf.cpp 9 | dropout.cpp 10 | fuse_add2_op.cpp 11 | launch_dec_emb_op.cpp 12 | launch_enc_emb.cpp 13 | launch_gpt_emb.cpp 14 | launch_llama_emb.cpp 15 | layer_normalize.cpp 16 | split_head_op.cpp 17 | linear.cpp 18 | rms_layer_norm.cpp 19 | fuse_rotary_position_qkv.cpp 20 | sampling.cc.cu 21 | softmax.cpp 22 | strided_batch_gemm.cpp 23 | transform_0213.cpp) 24 | 25 | add_library(lightseq_operators STATIC ${operator_files}) 26 | target_link_libraries(lightseq_operators PUBLIC lsflow) 27 | target_include_directories(lightseq_operators PUBLIC includes) 28 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/sdpa_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from csrc.pytorch.builder.cuda_layer_builder import CudaLayerBuilder 3 | 4 | cuda_layer_module = CudaLayerBuilder().load() 5 | 6 | 7 | class SdpaLayerFunc(torch.autograd.Function): 8 | @staticmethod 9 | def forward( 10 | ctx, 11 | input, 12 | input_mask, 13 | config, 14 | ): 15 | cuda_module = cuda_layer_module 16 | forward_func = ( 17 | cuda_module.transformer_encoder_layer_fw_fp16 18 | if config.fp16 19 | else cuda_module.transformer_encoder_layer_fw_fp32 20 | ) 21 | if config.fp16: 22 | input = input.to(torch.half) 23 | input_mask = input_mask.to(torch.half) 24 | 25 | (output,) = forward_func(config.layer_id, input, input_mask) 26 | 27 | return output 28 | -------------------------------------------------------------------------------- /examples/training/deepspeed/README.md: -------------------------------------------------------------------------------- 1 | # LightSeq for Fairseq+DeepSpeed 2 | This repo contains an example of how to use LightSeq to accelerate the training of a translation task in [Fairseq](https://github.com/pytorch/fairseq), together with [DeepSpeed](https://github.com/microsoft/DeepSpeed) for distributed strategies and optimizers. We provide a new trainer for the translation task to connect Fairseq and DeepSpeed. 3 | 4 | First, you should install these requirements. 5 | ```shell 6 | pip install torch ninja fairseq deepspeed 7 | ``` 8 | 9 | Then you can train a translation task on the wmt14 en2de dataset by running the following script: 10 | ```shell 11 | sh examples/training/deepspeed/ds_fairseq_wmt14en2de.sh 12 | ``` 13 | 14 | This script first downloads the dataset and then runs the native Fairseq training script using the DeepSpeed launcher, without any other parameter modifications. 15 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/t5EmbKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void t5_launch_enc_emb(const T *token_emb, const int *tokens, T *output, 10 | int *pad_mask, int pad_id, int batch_size, int seq_len, 11 | int hidden_dim, cudaStream_t stream, const T *lang_emb, 12 | const int *lang_id); 13 | 14 | template 15 | void t5_launch_dec_emb(const T *token_emb, int *tokens, const T *lang_emb, 16 | const int *lang_id, T *output, int batch_size, 17 | int beam_size, int hidden_dim, int vocab_size, int step, 18 | int max_step, int multilg_type, cudaStream_t stream); 19 | 20 | } // namespace cuda 21 | } // namespace lightseq 22 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/fuse_add2_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class FuseAdd2Op : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | size_t _batch_tokens; 12 | size_t _batch_size; 13 | size_t _seq_len; 14 | size_t _hidden_dim; 15 | 16 | Variable* _result; 17 | 18 | public: 19 | FuseAdd2Op(size_t max_batch_tokens, size_t hidden_dim) 20 | : Operator("FuseAdd2"), 21 | _max_batch_tokens(max_batch_tokens), 22 | _hidden_dim(hidden_dim) {} 23 | 24 | ~FuseAdd2Op() {} 25 | 26 | Variable* operator()(Variable* inpA, Variable* inpB); 27 | 28 | void forward() override; 29 | 30 | void before_forward(size_t batch_size, size_t seq_len) { 31 | _batch_size = batch_size; 32 | _seq_len = seq_len; 33 | _result->set_shape({batch_size, seq_len, _hidden_dim}); 34 | } 35 | 36 | void backward() override {} 37 | }; 38 | 39 | } // namespace lightseq 40 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``csrc.pytorch.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 
21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``lightseq.training.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/includes/hdf5_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "proto_headers.h" 3 | #include "proto_util.h" 4 | #include "util.h" 5 | 6 | template 7 | void convert_dtype_by_gpu(float* source_addr, float* source_buffer, 8 | T* target_buffer, T* target_addr, size_t size, 9 | cudaStream_t stream) { 10 | if (std::is_same::value) { 11 | cudaMemcpyAsync(source_buffer, source_addr, size * sizeof(float), 12 | cudaMemcpyDefault, stream); 13 | lightseq::cuda::launch_convert_dtype(source_buffer, (__half*)target_addr, 14 | size, 1024, stream); 15 | } else if (std::is_same::value) { 16 | cudaMemcpyAsync(target_addr, source_addr, size * sizeof(float), 17 | cudaMemcpyDefault, stream); 18 | } 19 | } 20 | 21 | template 22 | T* malloc_memory(size_t size) { 23 | T* buffer_addr = nullptr; 24 | cudaMalloc(&buffer_addr, size * sizeof(T)); 25 | return buffer_addr; 26 | } 27 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/shape.cpp: -------------------------------------------------------------------------------- 1 | #include "shape.h" 2 | 3 | namespace lightseq { 4 | 5 | size_t Shape::element_size() { 6 | if (_shape_vec.size() == 1 && _shape_vec[0] == 0) { 7 | printf("this tensor without shape\n"); 8 | return 0; 9 | } 10 | if (_is_calculated) { 11 | return _element_size; 12 | } 13 | size_t product = 1; 14 | for (int iter : _shape_vec) { 15 | // if (iter <= 0) { 16 | // throw std::runtime_error("this tensor with invalid shape"); 17 | // return 0; 18 | // } 19 | product *= iter; 20 | } 21 | _is_calculated = true; 22 | _element_size = product; 23 | return _element_size; 24 | } 25 | 26 | void Shape::print_shape() { 27 | printf("shape dim: %zu, element size: %d, each dimension: ", 28 | _shape_vec.size(), element_size()); 29 | for (int i = 0; i < _shape_vec.size(); i++) { 30 | printf("%zu", _shape_vec[i]); 31 | if (i == _shape_vec.size() - 1) { 32 | printf("\n"); 33 | } else { 34 | printf(", "); 35 | } 36 | } 37 | } 38 | 39 | } // namespace lightseq 40 | 
-------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/transform_0213.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // [sz0, sz1, sz2, sz3] -> [sz0, sz2, sz1, sz3] 8 | template 9 | class Transform0213OP : public Operator { 10 | private: 11 | size_t _max_numel; 12 | size_t _sz0; 13 | size_t _sz1; 14 | size_t _sz2; 15 | size_t _sz3; 16 | 17 | Variable* _result; 18 | 19 | public: 20 | Transform0213OP(size_t max_numel) 21 | : Operator("Transform0213"), _max_numel(max_numel) {} 22 | 23 | virtual ~Transform0213OP() {} 24 | 25 | Variable* operator()(Variable* inp); 26 | 27 | void before_forward(size_t sz0, size_t sz1, size_t sz2, size_t sz3) { 28 | _sz0 = sz0, _sz1 = sz1, _sz2 = sz2, _sz3 = sz3; 29 | _result->set_shape({_sz0, _sz2, _sz1, _sz3}); 30 | } 31 | 32 | void forward() override; 33 | 34 | void before_backward(int sz0, int sz1, int sz2, int sz3) { 35 | _sz0 = sz0, _sz1 = sz1, _sz2 = sz2, _sz3 = sz3; 36 | } 37 | 38 | void backward() override; 39 | }; 40 | } // namespace lightseq 41 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/includes/shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "initializer_list" 4 | 5 | namespace lightseq { 6 | 7 | // This class records the shape information of the tensor and encapsulates some 8 | // methods that may be commonly used. 9 | class Shape { 10 | private: 11 | std::vector _shape_vec; 12 | size_t _element_size; 13 | bool _is_calculated; 14 | 15 | public: 16 | // Default constructor, not part of expected usage. 17 | Shape() : _shape_vec({0}), _element_size(0), _is_calculated(false) {} 18 | Shape(std::vector shape) 19 | : _shape_vec(shape), _element_size(0), _is_calculated(false) {} 20 | Shape(std::initializer_list list) 21 | : Shape(std::vector(list)) {} 22 | Shape(const Shape& lx) = default; 23 | virtual ~Shape() = default; 24 | const std::vector& view() const { return _shape_vec; } 25 | 26 | // Returns the product of each dimension of shape. 27 | size_t element_size(); 28 | 29 | // Print shape information. 
30 | void print_shape(); 31 | }; 32 | 33 | } // namespace lightseq 34 | -------------------------------------------------------------------------------- /lightseq/csrc/ops/includes/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | 13 | class Context { 14 | public: 15 | Context() : _stream(nullptr) { 16 | CHECK_GPU_ERROR(cublasCreate(&_cublasHandle)); 17 | CHECK_GPU_ERROR(cublasLtCreate(&_cublasLtHandle)); 18 | } 19 | 20 | virtual ~Context() {} 21 | 22 | static Context &Instance() { 23 | static Context _ctx; 24 | return _ctx; 25 | } 26 | 27 | void set_stream(cudaStream_t stream) { 28 | _stream = stream; 29 | CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream)); 30 | } 31 | 32 | cudaStream_t get_stream() { return _stream; } 33 | 34 | cublasHandle_t get_cublashandle() { return _cublasHandle; } 35 | cublasLtHandle_t get_cublaslthandle() { return _cublasLtHandle; } 36 | 37 | private: 38 | cudaStream_t _stream; 39 | cublasHandle_t _cublasHandle; 40 | cublasLtHandle_t _cublasLtHandle; 41 | }; 42 | 43 | } // namespace cuda 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // after attention softmax 8 | template 9 | class DropoutOp : public Operator { 10 | private: 11 | float ratio; 12 | size_t _max_ele_num; 13 | size_t _count; 14 | bool _is_skip; 15 | 16 | TensorPtr _mask; 17 | Variable* _result = nullptr; 18 | 19 | public: 20 | float RATIO() const { return _context_ptr->is_training() ? ratio : 0.0; } 21 | 22 | DropoutOp(float r, size_t max_ele_num) 23 | : Operator("Dropout"), ratio(r), _max_ele_num(max_ele_num) { 24 | _mask.reset(new Tensor("mask", g_dtype(), max_ele_num)); 25 | } 26 | 27 | virtual ~DropoutOp() {} 28 | 29 | Variable* operator()(Variable* inp); 30 | 31 | void before_forward(size_t count) { 32 | _count = count; 33 | if (_result) _result->set_shape({count}); 34 | } 35 | 36 | void forward() override; 37 | 38 | void before_backward(int count) { _count = count; } 39 | 40 | void backward() override; 41 | }; 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | from .modules.tensor_quantizer import * 20 | from .modules.quant_conv import * 21 | from .modules.quant_linear import * 22 | from .modules.quant_pooling import * 23 | from .modules.clip import * 24 | from .modules.quant_rnn import * 25 | from .modules.quant_bert import * 26 | from .modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """A WAR (workaround) for code that messes up the logging format""" 20 | 21 | import logging 22 | 23 | 24 | def reset_logger_handler(): 25 | """Remove all handlers from the root logger""" 26 | root_logger = logging.getLogger() 27 | while root_logger.handlers: 28 | root_logger.removeHandler(root_logger.handlers[0]) 29 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | 19 | """A WAR (workaround) for code that messes up the logging format""" 20 | 21 | import logging 22 | 23 | 24 | def reset_logger_handler(): 25 | """Remove all handlers from the root logger""" 26 | root_logger = logging.getLogger() 27 | while root_logger.handlers: 28 | root_logger.removeHandler(root_logger.handlers[0]) 29 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/lsflow_util.cpp: -------------------------------------------------------------------------------- 1 | #include "lsflow_util.h" 2 | 3 | namespace lightseq { 4 | 5 | void print_time_duration( 6 | const std::chrono::high_resolution_clock::time_point &start, 7 | std::string duration_name) { 8 | #ifdef LIGHTSEQ_cuda 9 | CHECK_GPU_ERROR(cudaStreamSynchronize(0)); 10 | #endif 11 | auto finish = std::chrono::high_resolution_clock::now(); 12 | std::chrono::duration elapsed = finish - start; 13 | std::cout << duration_name 14 | << " duration time is: " << (elapsed).count() * 1000 << " ms" 15 | << std::endl; 16 | return; 17 | } 18 | 19 | #ifdef LIGHTSEQ_cuda 20 | cublasOperation_t op_from_custom(MATRIX_OP op_type) { 21 | switch (op_type) { 22 | case MATRIX_OP::Transpose: 23 | return CUBLAS_OP_T; 24 | case MATRIX_OP::NonTranspose: 25 | return CUBLAS_OP_N; 26 | default: { 27 | std::string error_message = "undefined custom MATRIX_OP\n"; 28 | printf("%s", error_message.c_str()); 29 | throw std::runtime_error("undefined custom MATRIX_OP"); 30 | } 31 | } 32 | exit(-1); 33 | } 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/act_elewise_product.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class ActElewiseProductOp : public Operator { 9 | private: 10 | size_t _inner_size; 11 | size_t _max_batch_tokens; 12 | size_t _batch_tokens; 13 | size_t _batch_size; 14 | size_t _seq_len; 15 | 16 | Variable* _result; 17 | 18 | public: 19 | ActElewiseProductOp(size_t max_batch_tokens, size_t inner_size) 20 | : Operator("ActElewiseProductOp"), 21 | _max_batch_tokens(max_batch_tokens), 22 | _inner_size(inner_size) {} 23 | 24 | virtual ~ActElewiseProductOp() {} 25 | 26 | Variable* operator()(Variable* inp); 27 | 28 | void forward() override; 29 | 30 | void before_forward(size_t batch_size, size_t seq_len) { 31 | _batch_size = batch_size; 32 | _seq_len = seq_len; 33 | _batch_tokens = batch_size * seq_len; 34 | _result->set_shape({_batch_tokens, _inner_size}); 35 | } 36 | 37 | void backward() override {} 38 | 39 | void before_backward() {} 40 | }; 41 | 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /examples/training/fairseq/native_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ !
-d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | fairseq-train /tmp/wmt14_en_de/ \ 13 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 14 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 15 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 16 | --weight-decay 0.0001 \ 17 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 18 | --max-tokens 8192 \ 19 | --eval-bleu \ 20 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 21 | --eval-bleu-detok moses \ 22 | --eval-bleu-remove-bpe \ 23 | --eval-bleu-print-samples \ 24 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \ 25 | --fp16 26 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/embKernels_int8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb, 10 | const int *tokens, T *output, int *pad_mask, int pad_id, 11 | int batch_size, int seq_len, int hidden_dim, 12 | cudaStream_t stream, const T *lang_emb, 13 | const int *lang_id, int multilg_type, 14 | float dequant_scale, bool scaled = true); 15 | 16 | template 17 | void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens, 18 | const T *lang_emb, const int *lang_id, T *output, 19 | int batch_size, int beam_size, int hidden_dim, 20 | int vocab_size, int step, int max_step, 21 | int multilg_type, cudaStream_t stream, 22 | float dequant_scale, bool scaled = true); 23 | 24 | } // namespace cuda 25 | } // namespace lightseq 26 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/fuse_add2_op.cpp: -------------------------------------------------------------------------------- 1 | #include "fuse_add2_op.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* FuseAdd2Op::operator()(Variable* inpA, Variable* inpB) { 7 | _result = new Variable("FuseAdd2Op_out", _max_batch_tokens * _hidden_dim, 8 | g_dtype(), g_dtype()); 9 | set_parents({inpA, inpB}); 10 | this->set_children({_result}); 11 | return _result; 12 | } 13 | 14 | template 15 | void FuseAdd2Op::forward() { 16 | T1* inpA_ptr = (T1*)parent(0)->value(); 17 | T1* inpB_ptr = (T1*)parent(1)->value(); 18 | T1* out_ptr = (T1*)child(0)->value(); 19 | 20 | if (!_context_ptr->is_built()) { 21 | return; 22 | } 23 | 24 | #ifdef LIGHTSEQ_cuda 25 | cudaStream_t stream = _context_ptr->get_stream(); 26 | cuda::launch_fused_add2(out_ptr, inpA_ptr, inpB_ptr, _batch_size, _seq_len, 27 | _hidden_dim, stream); 28 | #endif 29 | } 30 | 31 | template class FuseAdd2Op; 32 | #ifdef LIGHTSEQ_cuda 33 | template class FuseAdd2Op<__half, __half>; 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /examples/training/deepspeed/ds_fairseq_argument.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from fairseq import options 4 | from deepspeed.runtime.config_utils import dict_raise_error_on_duplicate_keys 5 | 6 | 7 | def gen_ds_fairseq_arg(): 8 | parser = 
options.get_training_parser() 9 | parser.add_argument( 10 | "--deepspeed_config", 11 | default=None, 12 | type=str, 13 | required=True, 14 | help="DeepSpeed json configuration file.", 15 | ) 16 | fs_args = options.parse_args_and_arch(parser, modify_parser=None) 17 | 18 | ds_config = gen_ds_config(fs_args) 19 | delattr(fs_args, "deepspeed_config") 20 | return fs_args, ds_config 21 | 22 | 23 | def gen_ds_config(fs_args): 24 | ds_config = json.load( 25 | open(fs_args.deepspeed_config), 26 | object_pairs_hook=dict_raise_error_on_duplicate_keys, 27 | ) 28 | 29 | # Different parameters in fairseq and deepspeed have the same effect. 30 | # For these parameters, we extract it from fairseq arguments and put it 31 | # int the deepspeed config file 32 | ds_config["steps_per_print"] = fs_args.log_interval 33 | return ds_config 34 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | include_directories(${HDF5_INCLUDE_DIRS}) 6 | 7 | find_package(Protobuf REQUIRED) 8 | include_directories(${Protobuf_INCLUDE_DIRS}) 9 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 10 | 11 | set(PROTO_FILES bert.proto bert_crf.proto transformer.proto gpt.proto) 12 | 13 | set(WEIGHT_FILES bert_weight.cc bert_crf_weight.cc transformer_weight.cc 14 | gpt_weight.cc llama_weight.cc) 15 | 16 | protobuf_generate_cpp(PROTO_SRC PROTO_HEADER ${PROTO_FILES}) 17 | add_library(weight_lib STATIC ${WEIGHT_FILES} ${PROTO_SRC} ${PROTO_HEADER} 18 | proto_util.cc) 19 | target_link_libraries(weight_lib PRIVATE ${HDF5_LIBRARIES}) 20 | target_link_libraries(weight_lib PUBLIC ${Protobuf_LIBRARIES}) 21 | target_link_libraries(weight_lib PUBLIC lightseq_kernels) 22 | 23 | target_include_directories(weight_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 24 | target_include_directories(weight_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 25 | target_include_directories(weight_lib PUBLIC ${HDF5_INCLUDE_DIRS}) 26 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/crf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // linear crf 8 | template 9 | class CRFOP : public Operator { 10 | private: 11 | size_t _num_tags; 12 | size_t _seq_len; 13 | size_t _batch_size; 14 | size_t _max_batch_tokens; 15 | size_t _max_batch_size; 16 | 17 | bool _forward_or_decode; // true for forward, false for decode 18 | bool _output_decode_score; 19 | TensorPtr _history; 20 | 21 | Variable* _best_tags; 22 | 23 | public: 24 | CRFOP(size_t max_batch_tokens, size_t max_batch_size, size_t num_tags); 25 | 26 | virtual ~CRFOP() {} 27 | 28 | Variable* operator()(Variable* start_transition, Variable* end_transition, 29 | Variable* transition, Variable* emission, Variable* mask, 30 | Variable* bias); 31 | 32 | void before_forward(size_t batch_size, size_t seq_len, bool forward_or_decode, 33 | bool output_decode_score); 34 | 35 | void forward() override; 36 | 37 | void before_backward(); 38 | 39 | void backward() override; 40 | }; 41 | 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/act_elewise_product.cpp: 
-------------------------------------------------------------------------------- 1 | #include "act_elewise_product.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* ActElewiseProductOp::operator()(Variable* inp) { 7 | size_t max_size = _max_batch_tokens * _inner_size; 8 | _result = new Variable("ActElewiseProductOp_out", max_size, g_dtype(), 9 | g_dtype()); 10 | set_parents({inp}); 11 | this->set_children({_result}); 12 | return _result; 13 | } 14 | 15 | template 16 | void ActElewiseProductOp::forward() { 17 | T1* inp_val = (T1*)parent(0)->value(); 18 | T1* out_val = (T1*)child(0)->value(); 19 | 20 | if (!_context_ptr->is_built()) { 21 | return; 22 | } 23 | 24 | #ifdef LIGHTSEQ_cuda 25 | cudaStream_t stream = _context_ptr->get_stream(); 26 | cuda::launch_silu_elewise_product(inp_val, out_val, _batch_size, _seq_len, 27 | _inner_size, stream); 28 | #endif 29 | } 30 | 31 | template class ActElewiseProductOp; 32 | #ifdef LIGHTSEQ_cuda 33 | template class ActElewiseProductOp<__half, __half>; 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --find-unused-parameters 29 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_ner/run_ner.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_ner.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name conll2003 \ 22 | --do_train \ 23 | --do_eval \ 24 | --per_device_train_batch_size 16 \ 25 | --num_train_epochs 10 \ 26 | --output_dir /tmp/test-ner \ 27 | --overwrite_output_dir \ 28 | --fp16 \ 29 | --seed 1234 \ 30 | --logging_steps 10 \ 31 | --module_type 1 \ 32 | --enable_quant false 33 | -------------------------------------------------------------------------------- /examples/inference/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | add_executable(transformer_example transformer_example.cc) 4 | target_link_libraries(transformer_example PUBLIC liblightseq) 5 | 6 | add_executable(quant_transformer_example quant_transformer_example.cc) 7 | target_link_libraries(quant_transformer_example PUBLIC liblightseq) 8 | 9 | add_executable(bert_example bert_example.cc) 10 | target_link_libraries(bert_example PUBLIC liblightseq) 11 | 12 | add_executable(quant_bert_example quant_bert_example.cc) 13 | target_link_libraries(quant_bert_example PUBLIC liblightseq) 14 | 15 | add_executable(gpt_example gpt_example.cc) 16 | target_link_libraries(gpt_example PUBLIC liblightseq) 17 | 18 | add_executable(quant_gpt_example quant_gpt_example.cc) 19 | target_link_libraries(quant_gpt_example PUBLIC liblightseq) 20 | 21 | add_executable(transformer_decoder_example decoder_example.cc.cu) 22 | target_link_libraries(transformer_decoder_example PUBLIC transformer_model) 23 | 24 | add_executable(vit_example vit_example.cc) 25 | target_link_libraries(vit_example PUBLIC liblightseq) 26 | 27 | add_executable(quant_vit_example quant_vit_example.cc) 28 | target_link_libraries(quant_vit_example PUBLIC liblightseq) 29 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/layer_normalize.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class LayerNormalizeOp : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | size_t _hidden_dim; 12 | size_t _batch_tokens; 13 | 14 | bool _use_mean; 15 | 16 | TensorPtr means_; 17 | TensorPtr vars_; 18 | 19 | Variable* _result; 20 | 21 | public: 22 | LayerNormalizeOp(size_t max_batch_tokens, size_t hidden_dim, 23 | bool use_mean = false) 24 | : Operator("LayerNormalizeOp"), 25 | _max_batch_tokens(max_batch_tokens), 26 | _hidden_dim(hidden_dim), 27 | _use_mean(use_mean) { 28 | vars_.reset(new Tensor("vars", g_dtype(), max_batch_tokens)); 29 | if (use_mean) 30 | means_.reset(new Tensor("means", g_dtype(), max_batch_tokens)); 31 | } 32 | 33 | Variable* operator()(Variable* inp, Variable* gamma, Variable* betta); 34 | 35 | virtual ~LayerNormalizeOp(); 36 | 37 | void before_forward(size_t batch_size, size_t seq_len); 38 | 39 | void forward() override; 40 | 41 | void backward() override; 42 | }; 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_torch_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 
5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --use-torch-layer \ 29 | --find-unused-parameters 30 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/run_gcq_clm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | 5 | # You can use multiple NICs in NCCL communication. 6 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 7 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 8 | 9 | # Set your environment variables according to your training environment, 10 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 11 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 12 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 13 | --master_port=$WORKER_0_PORT \ 14 | $THIS_DIR/run_gcq_clm.py \ 15 | --model_name_or_path gpt2 \ 16 | --dataset_name wikitext \ 17 | --dataset_config_name wikitext-103-raw-v1 \ 18 | --per_device_train_batch_size 16 \ 19 | --per_device_eval_batch_size 8 \ 20 | --num_train_epochs 1 \ 21 | --do_train \ 22 | --do_eval \ 23 | --output_dir /tmp/test-clm \ 24 | --overwrite_output_dir \ 25 | --fp16 \ 26 | --logging_steps 10 \ 27 | --block_size 512 \ 28 | --module_type 2 \ 29 | --enable_quant false \ 30 | --enable_GCQ true \ 31 | --GCQ_quantile 0.99 32 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/rms_layer_norm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class RMSLayerNormalizeOp : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | size_t _hidden_dim; 12 | size_t _batch_tokens; 13 | float _epsilon; 14 | 15 | bool _use_mean; 16 | bool _use_residual; 17 | 18 | TensorPtr _rms_vars; 19 | Variable* _result; 20 | Variable* _residual; 21 | 22 | public: 23 | RMSLayerNormalizeOp(size_t max_batch_tokens, size_t hidden_dim, 24 | bool use_residual = true, float epsilon = 1e-6) 25 | : Operator("RMSLayerNormalizeOp"), 26 | _max_batch_tokens(max_batch_tokens), 27 | _hidden_dim(hidden_dim), 28 | _use_residual(use_residual), 29 | _epsilon(epsilon) { 30 | _rms_vars.reset(new Tensor("rms_vars", g_dtype(), max_batch_tokens)); 31 | } 32 | 33 | std::tuple operator()(Variable* inp, Variable* scale); 34 | 35 | virtual ~RMSLayerNormalizeOp(); 
36 | 37 | void before_forward(size_t batch_size, size_t seq_len); 38 | 39 | void forward() override; 40 | 41 | void backward() override {} 42 | }; 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_ner/run_quant_ner.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_ner.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name conll2003 \ 22 | --do_train \ 23 | --do_eval \ 24 | --per_device_train_batch_size 16 \ 25 | --num_train_epochs 20 \ 26 | --output_dir /tmp/quant/test-ner \ 27 | --overwrite_output_dir \ 28 | --resume_from_checkpoint /tmp/test-ner/ \ 29 | --fp16 \ 30 | --seed 1234 \ 31 | --logging_steps 10 \ 32 | --module_type 1 \ 33 | --enable_quant true 34 | -------------------------------------------------------------------------------- /examples/training/neurst/README.md: -------------------------------------------------------------------------------- 1 | # LightSeq for NeurST 2 | This repo contains an example of how to use LightSeq to accelerate the training of a translation task. 3 | 4 | First you should install these requirements. 5 | ```shell 6 | $ pip install subword-nmt pyyaml sacrebleu sacremoses 7 | $ git clone https://github.com/moses-smt/mosesdecoder.git 8 | ``` 9 | Then clone NeurST and switch to the lightseq branch. 10 | ```shell 11 | $ git clone https://github.com/bytedance/neurst.git 12 | $ cd neurst/ 13 | $ git checkout lightseq 14 | $ pip install -e . 15 | ``` 16 | Install LightSeq 17 | ```shell 18 | $ pip install http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/tensorflow/lightseq_tf-2.0.1-cp37-cp37m-linux_x86_64.whl 19 | ``` 20 | Download and preprocess the data 21 | ```shell 22 | $ ./examples/translation/prepare-wmt14en2de-bpe.sh ../mosesdecoder 23 | ``` 24 | Train the model 25 | ```shell 26 | $ python3 -m neurst.cli.run_exp \ 27 | --config_paths wmt14_en_de/training_args.yml,wmt14_en_de/translation_bpe.yml \ 28 | --hparams_set transformer_base \ 29 | --model_dir wmt14_en_de/benchmark_base \ 30 | --enable_xla 31 | ``` 32 | 33 | 34 | LightSeq can achieve about a 1.33x speedup using batch size 4096 on 8 V100 GPUs, 35 | compared with the original TensorFlow implementation. 36 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_qa/run_qa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_qa.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name squad \ 22 | --do_train \ 23 | --do_eval \ 24 | --max_seq_length 256 \ 25 | --per_device_train_batch_size 16 \ 26 | --doc_stride 128 \ 27 | --learning_rate 3e-5 \ 28 | --num_train_epochs 10 \ 29 | --output_dir /tmp/squad \ 30 | --overwrite_output_dir \ 31 | --fp16 \ 32 | --seed 1234 \ 33 | --logging_steps 10 \ 34 | --module_type 1 \ 35 | --enable_quant false 36 | -------------------------------------------------------------------------------- /lightseq/training/ops/pytorch/builder/adam_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright Microsoft DeepSpeed 3 | # This builder is adapted from Microsoft DeepSpeed 4 | 5 | import torch 6 | from .builder import CUDAOpBuilder 7 | 8 | 9 | class AdamBuilder(CUDAOpBuilder): 10 | NAME = "adam" 11 | 12 | def __init__(self, name=None): 13 | name = self.NAME if name is None else name 14 | super().__init__(name=name) 15 | 16 | def absolute_name(self): 17 | return f"op_builder.{self.NAME}_op" 18 | 19 | def sources(self): 20 | return [ 21 | "csrc/kernels/fused_adam_kernel.cu", 22 | "csrc/pybind/pybind_adam.cpp", 23 | ] 24 | 25 | def include_paths(self): 26 | return ["csrc/kernels/includes", "csrc/ops/includes", "csrc/layers/includes"] 27 | 28 | def nvcc_args(self): 29 | args = [ 30 | "-O3", 31 | "--use_fast_math", 32 | "-std=c++14", 33 | "-U__CUDA_NO_HALF_OPERATORS__", 34 | "-U__CUDA_NO_HALF_CONVERSIONS__", 35 | "-U__CUDA_NO_HALF2_OPERATORS__", 36 | ] 37 | 38 | return args + self.compute_capability_args() 39 | 40 | def cxx_args(self): 41 | return ["-O3", "-std=c++14", "-g", "-Wno-reorder"] 42 | -------------------------------------------------------------------------------- /tests/huggingface/test_gpt.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import torch 3 | import numpy as np 4 | from transformers import GPT2Tokenizer, GPT2Model, AutoConfig 5 | from lightseq.training import LSGptEncoderLayer 6 | 7 | 8 | @dataclass 9 | class TrainingArguments: 10 | fp16: bool = True 11 | local_rank: int = -1 12 | 13 | 14 | def test_gpt_layer(): 15 | # text = "Replace me by any text you'd like." 
16 | # tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 17 | # encoded_input = tokenizer(text, return_tensors="pt") 18 | torch.random.manual_seed(1234) 19 | test_input = torch.empty(4, 64, 768).normal_().cuda() 20 | training_args = TrainingArguments() 21 | model = GPT2Model.from_pretrained("gpt2") 22 | config = AutoConfig.from_pretrained("gpt2") 23 | layer = model.h[0].cuda().train(False) 24 | base_output = layer(test_input)[0] 25 | ls_layer = LSGptEncoderLayer.from_huggingface(layer, training_args, config).train( 26 | False 27 | ) 28 | ls_output = ls_layer(test_input)[0] 29 | np.testing.assert_allclose( 30 | base_output.detach().cpu().numpy(), 31 | ls_output.detach().cpu().numpy(), 32 | rtol=1e-2, 33 | atol=2e-1, 34 | ) 35 | 36 | 37 | if __name__ == "__main__": 38 | test_gpt_layer() 39 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/bias_add_transform_20314.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // add bias and transform 20314, execute after qkv_linear 9 | template 10 | class BiasAddTrans20314 : public Operator { 11 | private: 12 | size_t _max_batch_tokens; 13 | size_t _batch; 14 | size_t _seq_len; 15 | size_t _heads; 16 | size_t _hidden_size; 17 | size_t _trans_count; 18 | 19 | Variable* _res; 20 | 21 | public: 22 | BiasAddTrans20314(size_t max_batch_tokens, size_t heads, size_t hidden_size, 23 | size_t trans_count) 24 | : Operator("BiasAddTrans20314"), 25 | _max_batch_tokens(max_batch_tokens), 26 | _heads(heads), 27 | _hidden_size(hidden_size), 28 | _trans_count(trans_count) {} 29 | 30 | virtual ~BiasAddTrans20314() {} 31 | 32 | Variable* operator()(Variable* inp, Variable* bias); 33 | 34 | void before_forward(size_t batch, size_t seq_len) { 35 | _batch = batch, _seq_len = seq_len; 36 | _res->set_shape( 37 | {_trans_count, _batch, _heads, _seq_len, _hidden_size / _heads}); 38 | } 39 | 40 | void forward() override; 41 | 42 | void backward() override; 43 | }; 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/t5Kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace lightseq { 8 | namespace cuda { 9 | 10 | const float t5_epsilon = 1e-6; 11 | template 12 | void t5_ker_norm_layer_launcher(int token_num, int hidden_size, 13 | cudaStream_t stream, T* matrix, T* out, 14 | const T* scale, const T* bias, 15 | int max_thread_per_block); 16 | 17 | template 18 | void t5_ker_correlation_softmax_encself_launcher( 19 | int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, 20 | T* correlation, const int* src_padding_mask, const T* pos_emb); 21 | 22 | template 23 | void t5_ker_correlation_softmax_decself_launcher( 24 | int batch_head_num, int step_num, cudaStream_t stream, T* correlation, 25 | const T* pos_emb, int head_num); 26 | 27 | template 28 | void ker_gelu_first_elementmul_launcher(int batch_token_num, int block_dim, 29 | cudaStream_t stream, T* input, 30 | const T* input2, int feature_dim); 31 | } // namespace cuda 32 | } // namespace lightseq 33 | -------------------------------------------------------------------------------- /examples/training/huggingface/vit/run_vit.sh: 
-------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=1 \ 20 | $THIS_DIR/run_vit.py \ 21 | --dataset_name beans \ 22 | --output_dir /tmp/beans_outputs \ 23 | --overwrite_output_dir \ 24 | --remove_unused_columns False \ 25 | --do_train \ 26 | --do_eval \ 27 | --learning_rate 2e-5 \ 28 | --num_train_epochs 30 \ 29 | --per_device_train_batch_size 8 \ 30 | --per_device_eval_batch_size 8 \ 31 | --logging_steps 10 \ 32 | --seed 1337 \ 33 | --fp16 \ 34 | --module_type 1 \ 35 | --enable_quant false 36 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/bias_dropout_residual.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // transformer layer's postprocessing dropout, after attn or ffn module, 8 | // before residual add. 9 | template 10 | class BiasDropoutResOp : public Operator { 11 | private: 12 | float ratio; 13 | 14 | size_t _max_rows; 15 | size_t _max_cols; 16 | size_t _rows; 17 | size_t _cols; 18 | 19 | TensorPtr _mask; 20 | Variable* _result; 21 | 22 | public: 23 | float RATIO() const { return _context_ptr->is_training() ? 
ratio : 0.0; } 24 | 25 | BiasDropoutResOp(float r, size_t max_rows, size_t max_cols) 26 | : Operator("BiasDropoutResOp"), 27 | ratio(r), 28 | _max_rows(max_rows), 29 | _max_cols(max_cols) { 30 | _mask.reset(new Tensor("mask", g_dtype(), _max_rows * _max_cols)); 31 | } 32 | 33 | virtual ~BiasDropoutResOp() {} 34 | 35 | Variable* operator()(Variable* inp, Variable* bias, Variable* residual); 36 | 37 | void before_forward(size_t rows, size_t cols) { 38 | _rows = rows, _cols = cols; 39 | _result->set_shape({_rows, _cols}); 40 | } 41 | 42 | void forward() override; 43 | 44 | void backward() override; 45 | }; 46 | } // namespace lightseq 47 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/linear_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "linear.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class LinearLayer : public Layer { 10 | private: 11 | // operators 12 | LinearOp* _linear = nullptr; 13 | 14 | // parameters 15 | Variable* _linear_w; 16 | 17 | // shape related 18 | int _max_batch_tokens; 19 | size_t _input_size; 20 | size_t _output_size; 21 | 22 | public: 23 | LinearLayer(int max_batch_tokens, int input_size, int output_size, 24 | MATRIX_OP opA = MATRIX_OP::Transpose, 25 | MATRIX_OP opB = MATRIX_OP::NonTranspose, float alpha = float(1.)); 26 | 27 | virtual ~LinearLayer() {} 28 | 29 | Variable* operator()(Variable* inp); 30 | 31 | void before_forward(int batch_size, int seq_len); 32 | 33 | void before_backward(); 34 | 35 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 36 | 37 | int load_params(const std::vector& para_vec, int offset); 38 | }; 39 | 40 | template class LinearLayer; 41 | #ifdef LIGHTSEQ_cuda 42 | template class LinearLayer<__half, __half>; 43 | #endif 44 | 45 | template 46 | using LinearLayerPtr = std::shared_ptr>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /examples/training/deepspeed/ds_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! 
-d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-deepspeed ${THIS_DIR}/ds_fairseq.py \ 13 | /tmp/wmt14_en_de/ \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 16 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 17 | --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --log-interval 200 \ 21 | --validate-interval-updates 2000 \ 22 | --eval-bleu \ 23 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 24 | --eval-bleu-detok moses \ 25 | --eval-bleu-remove-bpe \ 26 | --eval-bleu-print-samples \ 27 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \ 28 | --fp16 \ 29 | --deepspeed_config ${THIS_DIR}/deepspeed_config.json 30 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_fairseq_quant_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 1e-6 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --enable-quant \ 29 | --finetune-from-model checkpoints/checkpoint_best.pt \ 30 | --save-dir checkpoints/quant 31 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/bias_act_dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template 9 | class BiasActDropoutOp : public Operator { 10 | private: 11 | float ratio; 12 | 13 | size_t _mx_cols; 14 | size_t _mx_rows; 15 | size_t _cols; 16 | size_t _rows; 17 | 18 | Variable* _result; 19 | 20 | std::string _activation_fn; 21 | 22 | TensorPtr _mask; 23 | 24 | public: 25 | float RATIO() const { return _context_ptr->is_training() ? 
ratio : 0.0; } 26 | 27 | BiasActDropoutOp(float r, size_t mx_rows, size_t mx_cols, 28 | std::string activation_fn) 29 | : Operator("BiasActDropoutOp"), 30 | ratio(r), 31 | _activation_fn(activation_fn), 32 | _mx_rows(mx_rows), 33 | _mx_cols(mx_cols) { 34 | _mask.reset(new Tensor("_mask", g_dtype(), _mx_rows * _mx_cols)); 35 | } 36 | 37 | virtual ~BiasActDropoutOp() {} 38 | 39 | Variable* operator()(Variable* inp, Variable* bias); 40 | 41 | void before_forward(size_t rows, size_t cols) { 42 | _rows = rows, _cols = cols; 43 | _result->set_shape({rows, cols}); 44 | } 45 | 46 | void forward() override; 47 | 48 | void backward() override; 49 | }; 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_glue/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | export TASK_NAME=sst2 19 | 20 | python3 -m torch.distributed.launch \ 21 | --nproc_per_node=1 \ 22 | $THIS_DIR/run_glue.py \ 23 | --model_name_or_path bert-base-cased \ 24 | --task_name $TASK_NAME \ 25 | --do_train \ 26 | --do_eval \ 27 | --max_seq_length 128 \ 28 | --per_device_train_batch_size 32 \ 29 | --learning_rate 2e-5 \ 30 | --num_train_epochs 10 \ 31 | --output_dir /tmp/$TASK_NAME/ \ 32 | --overwrite_output_dir \ 33 | --fp16 \ 34 | --seed 1234 \ 35 | --logging_steps 10 \ 36 | --module_type 1 \ 37 | --enable_quant false 38 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_qa/run_quant_qa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_qa.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name squad \ 22 | --do_train \ 23 | --do_eval \ 24 | --max_seq_length 256 \ 25 | --per_device_train_batch_size 16 \ 26 | --doc_stride 128 \ 27 | --learning_rate 1e-5 \ 28 | --num_train_epochs 16 \ 29 | --output_dir /tmp/quant/squad \ 30 | --overwrite_output_dir \ 31 | --resume_from_checkpoint /tmp/squad/ \ 32 | --fp16 \ 33 | --seed 1234 \ 34 | --logging_steps 10 \ 35 | --module_type 1 \ 36 | --enable_quant true 37 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/llama_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | #include "llama_attention_layer.h" 4 | #include "llama_mlp_layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class LlamaLayer : public Layer { 10 | private: 11 | LlamaAttentionLayerPtr _attn_layer; 12 | LlamaMLPLayerPtr _mlp_layer; 13 | 14 | int _layer_id; 15 | 16 | public: 17 | LlamaLayer(int max_batch_size, int max_seq_len, int hidden_size, 18 | int inner_dim, int num_heads, int beam_size); 19 | virtual ~LlamaLayer() {} 20 | 21 | Variable* operator()(Variable* inp, Variable* cache_k, Variable* cache_v, 22 | Variable* pad_mask); 23 | 24 | void before_forward(int batch_size, int seq_len, int prompt_len) { 25 | _attn_layer->before_forward(batch_size, seq_len, prompt_len); 26 | _mlp_layer->before_forward(batch_size, seq_len); 27 | } 28 | 29 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 30 | 31 | int load_params(const std::vector& para_vec, int offset); 32 | }; 33 | 34 | template class LlamaLayer; 35 | #ifdef LIGHTSEQ_cuda 36 | template class LlamaLayer<__half, __half>; 37 | #endif 38 | 39 | template 40 | using LlamaLayerPtr = std::shared_ptr>; 41 | 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /lightseq/training/__init__.py: -------------------------------------------------------------------------------- 1 | from lightseq.training.ops.pytorch.transformer_embedding_layer import ( 2 | LSTransformerEmbeddingLayer, 3 | ) 4 | from lightseq.training.ops.pytorch.transformer_encoder_layer import ( 5 | LSTransformerEncoderLayer, 6 | ) 7 | from lightseq.training.ops.pytorch.transformer_decoder_layer import ( 8 | LSTransformerDecoderLayer, 9 | ) 10 | from lightseq.training.ops.pytorch.gpt_layer import ( 11 | LSGptEncoderLayer, 12 | ls_hf_gpt_enc_convert, 13 | ) 14 | from lightseq.training.ops.pytorch.transformer import ( 15 | LSTransformer, 16 | LSTransformerEncoder, 17 | LSTransformerDecoder, 18 | ) 19 | 20 | from lightseq.training.ops.pytorch.cross_entropy_layer import LSCrossEntropyLayer 21 | from lightseq.training.ops.pytorch.adam import LSAdam 22 | from lightseq.training.ops.pytorch.export import ( 23 | export_ls_config, 24 | export_ls_embedding, 25 | export_ls_encoder, 26 | export_ls_decoder, 27 | export_pb2hdf5, 28 | ) 29 | 30 | from lightseq.training.ops.pytorch.export_quant import ( 31 | export_ls_embedding_ptq, 32 | export_ls_encoder_ptq, 33 | export_ls_decoder_ptq, 34 | export_ls_quant_embedding, 35 | export_ls_quant_encoder, 36 | export_ls_quant_decoder, 37 | export_quant_pb2hdf5, 38 | ) 39 | 40 | from lightseq.training.ops.pytorch.gemm_test import gemm_test 41 | 
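Note: the imports above define the public lightseq.training API. The following is a minimal, hypothetical usage sketch of the fused encoder layer; the exact get_config fields and mask convention are assumptions and should be verified against lightseq/training/ops/pytorch/transformer_encoder_layer.py before use.

import torch
from lightseq.training import LSTransformerEncoderLayer

# All sizes and config fields below are illustrative assumptions.
config = LSTransformerEncoderLayer.get_config(
    max_batch_tokens=4096,
    max_seq_len=256,
    hidden_size=1024,
    intermediate_size=4096,
    nhead=16,
    attn_prob_dropout_ratio=0.1,
    activation_dropout_ratio=0.1,
    hidden_dropout_ratio=0.1,
    pre_layer_norm=True,
    activation_fn="relu",
    fp16=True,
    local_rank=0,
)
layer = LSTransformerEncoderLayer(config).cuda()

# hidden_states: [batch, seq_len, hidden]; padding_mask marks padded positions with 1.
hidden_states = torch.randn(8, 256, 1024, device="cuda", dtype=torch.half)
padding_mask = torch.zeros(8, 256, device="cuda", dtype=torch.half)
out = layer(hidden_states, padding_mask)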
-------------------------------------------------------------------------------- /lightseq/csrc/layers/includes/cross_entropy_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | template 13 | class CrossEntropyLayer { 14 | public: 15 | CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens); 16 | 17 | virtual ~CrossEntropyLayer(); 18 | 19 | void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr, 20 | float *nll_loss_ptr); 21 | 22 | void Backward(const float *grad_outputs_ptr, const T *inputs_ptr, 23 | const int *targets_ptr, T *grad_inputs_ptr); 24 | 25 | void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size); 26 | 27 | private: 28 | void allocate_mem_buffer() { 29 | // allocate local gpu memory 30 | _loss_buffer = cuda_malloc(_max_batch_tokens * 2); 31 | } 32 | 33 | void free_mem_buffer() { 34 | // free local gpu memory 35 | cuda_free(_loss_buffer); 36 | } 37 | 38 | const int _padding_idx; 39 | const float _epsilon; 40 | const int _max_batch_tokens; 41 | 42 | size_t _batch_size; 43 | size_t _seq_len; 44 | size_t _vocab_size; 45 | 46 | float *_loss_buffer; 47 | }; 48 | } // namespace cuda 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/crf_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "crf.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class CRFLayer : public Layer { 10 | private: 11 | // operators 12 | CRFOP* _crf_op = nullptr; 13 | 14 | // parameters 15 | Variable* _linear_b; 16 | Variable* _start_transition; 17 | Variable* _end_transition; 18 | Variable* _transition; 19 | 20 | // shape related 21 | int _num_tags; 22 | int _max_batch_tokens; 23 | int _max_batch_size; 24 | 25 | int _seq_len; 26 | int _batch_size; 27 | bool _forward_or_decode; // true for forward, false for decode 28 | bool _output_decode_score; // true for output decode score 29 | 30 | public: 31 | CRFLayer(int num_tags, int max_batch_tokens, int max_batch_size); 32 | 33 | virtual ~CRFLayer() {} 34 | 35 | Variable* operator()(Variable* emission, Variable* mask); 36 | 37 | void before_forward(int batch_size, int seq_len, bool forward_or_decode, 38 | bool output_decode_score); 39 | 40 | int load_params(const std::vector& para_vec, int offset); 41 | }; 42 | 43 | template class CRFLayer; 44 | #ifdef LIGHTSEQ_cuda 45 | template class CRFLayer<__half>; 46 | #endif 47 | 48 | template 49 | using CRFLayerPtr = std::shared_ptr>; 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /examples/training/huggingface/vit/run_quant_vit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=1 \ 20 | $THIS_DIR/run_vit.py \ 21 | --dataset_name beans \ 22 | --output_dir /tmp/quant/beans_outputs \ 23 | --resume_from_checkpoint /tmp/beans_outputs/ \ 24 | --overwrite_output_dir \ 25 | --remove_unused_columns False \ 26 | --do_train \ 27 | --do_eval \ 28 | --learning_rate 2e-6 \ 29 | --num_train_epochs 45 \ 30 | --per_device_train_batch_size 8 \ 31 | --per_device_eval_batch_size 8 \ 32 | --logging_steps 10 \ 33 | --seed 1337 \ 34 | --fp16 \ 35 | --module_type 1 \ 36 | --enable_quant true 37 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/sample_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "beam_search_topk.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class SampleLayer : public Layer { 10 | private: 11 | // operators 12 | BeamSearchTopOp* _beam_search = nullptr; 13 | 14 | // parameters 15 | Variable* _logit_bias; 16 | size_t _trg_vocab_size; 17 | 18 | public: 19 | SampleLayer(int nshared_layer, int max_batch_size, int max_step, 20 | int trg_vocab_size, int hidden_size, int max_thread_per_block, 21 | int beam_size, int diverse_lambda, int dim_per_head, int end_id, 22 | int head_num, 23 | float length_penalty); // for beam_search 24 | 25 | virtual ~SampleLayer() {} 26 | 27 | std::tuple operator()(Variable* logits, 28 | Variable* alive_seq); 29 | 30 | void before_forward(int batch_size, int cur_step); 31 | 32 | int load_params(const std::vector& para_vec, int offset); 33 | 34 | bool is_stop() { return _beam_search->is_stop(); } 35 | }; 36 | 37 | template class SampleLayer; 38 | #ifdef LIGHTSEQ_cuda 39 | template class SampleLayer<__half>; 40 | #endif 41 | 42 | template 43 | using SampleLayerPtr = std::shared_ptr>; 44 | 45 | } // namespace lightseq 46 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_torch_fairseq_quant_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! 
-d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 1e-6 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --use-torch-layer \ 29 | --enable-quant \ 30 | --quant-mode qat \ 31 | --finetune-from-model checkpoints/checkpoint_best.pt \ 32 | --save-dir checkpoints/quant 33 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/encdec_kv_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bias_add_transform_20314.h" 3 | #include "linear.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class EncDecKvLayer : public Layer { 10 | private: 11 | LinearOp* _kv_linear = nullptr; 12 | BiasAddTrans20314* _bias_add_transform_20314 = nullptr; 13 | 14 | // parameters 15 | Variable* _enc_kvw; 16 | Variable* _enc_kvb; 17 | 18 | // shape related 19 | size_t _layer_id; 20 | size_t _nshared_layer; 21 | size_t _batch_tokens; 22 | size_t _max_batch_tokens; 23 | size_t _hidden_size; 24 | size_t _heads; 25 | 26 | public: 27 | EncDecKvLayer(size_t nshared_layer, size_t max_batch_tokens, 28 | size_t hidden_size, size_t num_heads); 29 | 30 | virtual ~EncDecKvLayer() {} 31 | 32 | Variable* operator()(Variable* enc_out); 33 | 34 | void before_forward(size_t batch_size, size_t seq_len); 35 | 36 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 37 | 38 | int load_params(const std::vector& para_vec, int offset); 39 | }; 40 | 41 | template class EncDecKvLayer; 42 | #ifdef LIGHTSEQ_cuda 43 | template class EncDecKvLayer<__half, __half>; 44 | #endif 45 | 46 | template 47 | using EncDecKvLayerPtr = std::shared_ptr>; 48 | 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /examples/training/huggingface/bart/summarization/run_summarization.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | export TASK_NAME=summarization 19 | 20 | python3 -m torch.distributed.launch \ 21 | --nproc_per_node=1 \ 22 | $THIS_DIR/run_summarization.py \ 23 | --model_name_or_path facebook/bart-base \ 24 | --do_train \ 25 | --do_eval \ 26 | --dataset_name cnn_dailymail \ 27 | --dataset_config "3.0.0" \ 28 | --output_dir /tmp/$TASK_NAME \ 29 | --max_source_length 128 \ 30 | --per_device_train_batch_size 32 \ 31 | --per_device_eval_batch_size 32 \ 32 | --overwrite_output_dir \ 33 | --seed 1234 \ 34 | --logging_steps 10 \ 35 | --fp16 \ 36 | --predict_with_generate 37 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_glue/run_quant_glue.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | export TASK_NAME=sst2 19 | 20 | python3 -m torch.distributed.launch \ 21 | --nproc_per_node=1 \ 22 | $THIS_DIR/run_glue.py \ 23 | --model_name_or_path bert-base-cased \ 24 | --task_name $TASK_NAME \ 25 | --do_train \ 26 | --do_eval \ 27 | --max_seq_length 128 \ 28 | --per_device_train_batch_size 32 \ 29 | --learning_rate 2e-6 \ 30 | --num_train_epochs 20 \ 31 | --output_dir /tmp/quant/$TASK_NAME/ \ 32 | --overwrite_output_dir \ 33 | --resume_from_checkpoint /tmp/$TASK_NAME/ \ 34 | --fp16 \ 35 | --seed 1234 \ 36 | --logging_steps 10 \ 37 | --module_type 1 \ 38 | --enable_quant true 39 | -------------------------------------------------------------------------------- /lightseq/csrc/ops/includes/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "kernels.h" 10 | 11 | using namespace std; 12 | namespace lightseq { 13 | namespace cuda { 14 | 15 | template 16 | class Softmax { 17 | public: 18 | struct Config { 19 | size_t nhead; 20 | bool mask_future; 21 | Config(size_t nhead, bool mask_future = false) 22 | : nhead(nhead), mask_future(mask_future) {} 23 | }; 24 | 25 | Softmax(Config config) : config_(config) {} 26 | 27 | ~Softmax() {} 28 | 29 | void Forward(T *vals, const T *attn_mask, int batch_size, int from_len, 30 | int to_len, cudaStream_t &stream, bool mask_future = false) { 31 | launch_attn_softmax(vals, attn_mask, batch_size, config_.nhead, from_len, 32 | to_len, config_.mask_future | mask_future, stream); 33 | } 34 | 35 | void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len, 36 | int to_len, cudaStream_t stream) { 37 | launch_attn_softmax_bw(out_grad, soft_out, 38 | batch_size * config_.nhead * from_len, to_len, 39 | stream); 40 | } 41 | 42 | private: 43 | Config config_; 44 | }; 45 | } // namespace cuda 46 | } // namespace lightseq 47 | 
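For reference, a minimal sketch of driving the legacy lightseq::cuda::Softmax wrapper declared above. The head count, shapes, and device buffers are illustrative assumptions; d_scores is expected to already hold batch_size * nhead * from_len * to_len attention logits on the GPU.

#include <cuda_runtime.h>
#include "softmax.h"

// Hypothetical driver: runs the fused attention softmax in place on d_scores.
void run_attn_softmax(float* d_scores, const float* d_attn_mask, int batch_size,
                      int from_len, int to_len, cudaStream_t stream) {
  using lightseq::cuda::Softmax;
  // nhead = 8 is an assumed model dimension; mask_future = false keeps full attention.
  Softmax<float>::Config cfg(/*nhead=*/8, /*mask_future=*/false);
  Softmax<float> softmax(cfg);
  softmax.Forward(d_scores, d_attn_mask, batch_size, from_len, to_len, stream);
}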
-------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | from lightseq.training.pytorch_quantization.nn.modules.tensor_quantizer import * 20 | from lightseq.training.pytorch_quantization.nn.modules.quant_conv import * 21 | from lightseq.training.pytorch_quantization.nn.modules.quant_linear import * 22 | from lightseq.training.pytorch_quantization.nn.modules.quant_pooling import * 23 | from lightseq.training.pytorch_quantization.nn.modules.clip import * 24 | from lightseq.training.pytorch_quantization.nn.modules.quant_rnn import * 25 | from lightseq.training.pytorch_quantization.nn.modules.quant_bert import * 26 | from lightseq.training.pytorch_quantization.nn.modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /examples/inference/benchmark_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=gpt2 7 | model_name=$model_full_name 8 | all_log=$CUR_DIR/${model_name}_bench.log 9 | res_log=$CUR_DIR/${model_name}_bench.txt 10 | if [ -f $res_log ]; then 11 | rm $res_log 12 | fi 13 | if [ -f $all_log ]; then 14 | rm $all_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len topk latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for topk in 1 4 32; do 20 | for input_seq_len in 118 86 22; do 21 | output_seq_len=$((150 - $input_seq_len)) 22 | cd $CUR_DIR/python 23 | 24 | python3 generate_model.py --model_name $model_full_name --sampling_method topk \ 25 | --topk $topk --input_seq_len $input_seq_len --output_seq_len=$output_seq_len 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/gpt_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >>$all_log 33 | latency=$(tail -n 3 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $topk $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | pip3 install tabulate 40 | tabulate --header $res_log 41 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class SoftmaxOp : public Operator { 9 | private: 10 | size_t _nhead; 11 | size_t _max_batch_tokens; 12 | size_t _max_seq_len; 13 | size_t _batchs; 14 
| size_t _from_len; 15 | size_t _to_len; 16 | int _kv_size; 17 | 18 | bool _config_mask_future; 19 | bool _mask_future; 20 | 21 | Variable* _result; 22 | 23 | public: 24 | SoftmaxOp(size_t max_batch_tokens, size_t max_seq_len, size_t nhead, 25 | bool mask_future = false) 26 | : Operator("SoftmaxOp"), 27 | _max_batch_tokens(max_batch_tokens), 28 | _max_seq_len(max_seq_len), 29 | _nhead(nhead), 30 | _config_mask_future(mask_future) {} 31 | 32 | virtual ~SoftmaxOp() {} 33 | 34 | Variable* operator()(Variable* inp, Variable* mask = nullptr); 35 | 36 | void forward() override; 37 | 38 | void before_forward(size_t batchs, size_t from_len, size_t to_len, 39 | int kv_size = -1, bool mask_future = false) { 40 | _batchs = batchs; 41 | _from_len = from_len; 42 | _to_len = to_len; 43 | _kv_size = (kv_size == -1 ? to_len : kv_size); 44 | _mask_future = mask_future; 45 | _result->set_shape({_batchs, _nhead, _from_len, _to_len}); 46 | } 47 | 48 | void backward() override; 49 | }; 50 | 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/allocator.cpp: -------------------------------------------------------------------------------- 1 | #include "allocator.h" 2 | 3 | namespace lightseq { 4 | 5 | Allocator::Allocator() { _ptr_set.clear(); } 6 | 7 | Allocator::~Allocator() { 8 | auto _tmp_ptr_set = _ptr_set; 9 | for (auto iter : _tmp_ptr_set) { 10 | try { 11 | free_mem(iter); 12 | } catch (...) { 13 | // printf("execute ~Allocator() free_mem %p failed!\n", iter); 14 | } 15 | } 16 | _ptr_set.clear(); 17 | } 18 | 19 | char* Allocator::malloc_mem(size_t size) { 20 | char* ptr = nullptr; 21 | 22 | try { 23 | #ifdef LIGHTSEQ_cuda 24 | ptr = cuda::cuda_malloc(size); 25 | #else 26 | ptr = (char*)malloc(size); 27 | #endif 28 | } catch (...) { 29 | std::string error_message = 30 | "allocate memory failed! 
size is: " + std::to_string((size / MB_SIZE)) + 31 | " MB\n"; 32 | printf("%s", error_message.c_str()); 33 | throw std::runtime_error(error_message); 34 | } 35 | if (_ptr_set.find(ptr) != _ptr_set.end()) { 36 | printf("allocate same address with twice.\n"); 37 | throw std::runtime_error("allocate same address with twice.\n"); 38 | } 39 | _ptr_set.insert(ptr); 40 | return ptr; 41 | } 42 | 43 | void Allocator::free_mem(char* ptr) { 44 | if (_ptr_set.find(ptr) == _ptr_set.end() || ptr == nullptr) { 45 | return; 46 | } 47 | _ptr_set.erase(ptr); 48 | #ifdef LIGHTSEQ_cuda 49 | cuda::cuda_free(ptr); 50 | #else 51 | free(ptr); 52 | #endif 53 | } 54 | 55 | } // namespace lightseq 56 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/llama_mlp_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rms_layer_norm.h" 4 | #include "linear.h" 5 | #include "act_elewise_product.h" 6 | #include "fuse_add2_op.h" 7 | #include "layer.h" 8 | 9 | namespace lightseq { 10 | 11 | template 12 | class LlamaMLPLayer : public Layer { 13 | private: 14 | // operators 15 | RMSLayerNormalizeOp* _mlp_ln = nullptr; 16 | LinearOp* _gate_up_linear = nullptr; 17 | LinearOp* _down_linear = nullptr; 18 | ActElewiseProductOp* _act_product = nullptr; 19 | FuseAdd2Op* _add_residual = nullptr; 20 | 21 | // parameters 22 | Variable* _norm_scale; 23 | Variable* _gate_up_linear_weight; 24 | Variable* _down_linear_weight; 25 | 26 | // shape related 27 | int _max_batch_tokens; 28 | size_t _hidden_dim; 29 | size_t _inner_dim; 30 | 31 | public: 32 | LlamaMLPLayer(int max_batch_tokens, int hidden_dim, int inner_dim); 33 | 34 | virtual ~LlamaMLPLayer() {} 35 | 36 | Variable* operator()(Variable* inp); 37 | 38 | void before_forward(int batch_size, int seq_len); 39 | 40 | int load_params(const std::vector& para_vec, int offset); 41 | }; 42 | 43 | template class LlamaMLPLayer; 44 | #ifdef LIGHTSEQ_cuda 45 | template class LlamaMLPLayer<__half, __half>; 46 | #endif 47 | 48 | template 49 | using LlamaMLPLayerPtr = std::shared_ptr>; 50 | 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/fuse_rotary_position_qkv.cpp: -------------------------------------------------------------------------------- 1 | #include "fuse_rotary_position_qkv.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* RotaryPositionQk::operator()(Variable* inp, Variable* cache_k, 7 | Variable* cache_v) { 8 | size_t max_size = _max_batch_size * _max_step * _head_num * _head_dim; 9 | _result = new Variable("RotaryPositionQk_out", max_size, g_dtype(), 10 | g_dtype()); 11 | set_parents({inp, cache_k, cache_v}); 12 | this->set_children({_result}); 13 | return _result; 14 | } 15 | 16 | template 17 | void RotaryPositionQk::forward() { 18 | T1* inp_val = (T1*)parent(0)->value(); 19 | T1* cache_k_val = (T1*)parent(1)->value(); 20 | T1* cache_v_val = (T1*)parent(2)->value(); 21 | 22 | T1* out_val = (T1*)child(0)->value(); 23 | 24 | if (!_context_ptr->is_built()) { 25 | return; 26 | } 27 | 28 | #ifdef LIGHTSEQ_cuda 29 | cudaStream_t stream = _context_ptr->get_stream(); 30 | cuda::launch_split_rotary_position_qkv( 31 | inp_val, _device_sin_ptr, _device_cos_ptr, out_val, cache_k_val, 32 | cache_v_val, _max_step, _batch_size, _head_num, _offset_seq_len, 33 | _query_len, _head_dim, stream); 34 | #endif 35 | } 36 | 37 | template class RotaryPositionQk; 38 | #ifdef 
LIGHTSEQ_cuda 39 | template class RotaryPositionQk<__half, __half>; 40 | #endif 41 | } // namespace lightseq 42 | -------------------------------------------------------------------------------- /examples/inference/benchmark_quant_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=/tmp/quant/test-clm/pytorch_model.bin 7 | model_name=quant_gpt2 8 | all_log=$CUR_DIR/${model_name}_bench.log 9 | res_log=$CUR_DIR/${model_name}_bench.txt 10 | if [ -f $res_log ]; then 11 | rm $res_log 12 | fi 13 | if [ -f $all_log ]; then 14 | rm $all_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len topk latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for topk in 1 4 32; do 20 | for input_seq_len in 118 86 22; do 21 | output_seq_len=$((150 - $input_seq_len)) 22 | cd $CUR_DIR/python 23 | 24 | python3 generate_model.py --model_name $model_full_name --sampling_method topk \ 25 | --topk $topk --input_seq_len $input_seq_len --output_seq_len=$output_seq_len --enable_quant true 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/quant_gpt_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >>$all_log 33 | latency=$(tail -n 3 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $topk $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | pip3 install tabulate 40 | tabulate --header $res_log 41 | -------------------------------------------------------------------------------- /examples/inference/benchmark_bart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=facebook/bart-base 7 | model_name=$(echo $model_full_name | cut -d "/" -f 2) 8 | all_log=$CUR_DIR/${model_name}_bench.log 9 | res_log=$CUR_DIR/${model_name}_bench.txt 10 | if [ -f $res_log ]; then 11 | rm $res_log 12 | fi 13 | if [ -f $all_log ]; then 14 | rm $all_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len beam_size latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for beam_size in 1 4 32; do 20 | for input_seq_len in 8 16 32 64; do 21 | output_seq_len=$input_seq_len 22 | cd $CUR_DIR/python 23 | 24 | python3 generate_model.py --model_name $model_full_name --sampling_method beam_search \ 25 | --beam_size $beam_size --input_seq_len $input_seq_len --output_seq_len=$output_seq_len 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/transformer_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >>$all_log 33 | latency=$(tail -n 5 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $beam_size $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | pip3 install tabulate 40 | tabulate --header $res_log 41 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/sdpa_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "dropout.h" 3 | #include "softmax.h" 4 | #include "strided_batch_gemm.h" 5 | #include "layer.h" 6 | 7 | namespace lightseq { 8 | 9 | /* 10 | Scaled 
Dot Product Attention 11 | See paper "Attention is all you need" for details. 12 | */ 13 | template <typename T1, typename T2> 14 | class SDPALayer : public Layer { 15 | private: 16 | // operators 17 | StridedBatchGemmOp<T1, T2>* _attn_scores = nullptr; 18 | SoftmaxOp<T1, T2>* _softmax = nullptr; 19 | DropoutOp<T1, T2>* _attn_prob_dropout = nullptr; 20 | StridedBatchGemmOp<T1, T2>* _attn_context = nullptr; 21 | 22 | // shape related 23 | int _max_batch_tokens; 24 | int _max_seq_len; 25 | int _nhead; 26 | int _head_dim; 27 | 28 | public: 29 | SDPALayer(size_t max_batch_tokens, size_t max_seq_len, size_t head_dim, 30 | size_t num_heads, float attn_prob_dropout_ratio); 31 | 32 | virtual ~SDPALayer() {} 33 | 34 | // mask is for enc-self attention and enc-dec-cross attention 35 | Variable* operator()(Variable* query, Variable* key, Variable* value, 36 | Variable* mask = nullptr); 37 | 38 | void before_forward(int batch_size, int query_len, int kv_len, int kv_size, 39 | bool mask_future); 40 | }; 41 | 42 | template class SDPALayer<__half, __half>; 43 | template class SDPALayer<float, float>; 44 | 45 | template <typename T1, typename T2> 46 | using SDPALayerPtr = std::shared_ptr<SDPALayer<T1, T2>>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/concat3_dim1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | template <typename T1, typename T2> 9 | class Concat3Dim1 : public Operator { 10 | private: 11 | bool _is_skip = false; 12 | bool _is_continuous_cache; 13 | 14 | size_t _mx_sz0; 15 | size_t _mx_sz1; 16 | size_t _mx_sz2; 17 | 18 | size_t _sz0; 19 | size_t _sz1_0; 20 | size_t _sz1_1; 21 | size_t _layer_id; 22 | 23 | Variable* _new_cache; 24 | 25 | public: 26 | Concat3Dim1(size_t mx_sz0, size_t mx_sz1, size_t mx_sz2, size_t layer_id, 27 | bool is_continuous_cache) 28 | : Operator("Concat3Dim1"), 29 | _mx_sz0(mx_sz0), 30 | _mx_sz1(mx_sz1), 31 | _mx_sz2(mx_sz2), 32 | _layer_id(layer_id), 33 | _is_continuous_cache(is_continuous_cache) {} 34 | 35 | virtual ~Concat3Dim1() {} 36 | 37 | Variable* operator()(Variable* inp, Variable* cache); 38 | 39 | void before_forward(size_t sz0, size_t sz1_0, size_t sz1_1, 40 | bool is_skip = false) { 41 | _sz0 = sz0, _sz1_0 = sz1_0, _sz1_1 = sz1_1, _is_skip = is_skip; 42 | if (_is_continuous_cache) { 43 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 44 | } else { 45 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 46 | } 47 | } 48 | 49 | void forward() override; 50 | 51 | void before_backward() {} 52 | 53 | void backward() override; 54 | }; 55 | } // namespace lightseq 56 | -------------------------------------------------------------------------------- /examples/inference/benchmark_quant_bart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=facebook/bart-base 7 | model_name=$(echo $model_full_name | cut -d "/" -f 2) 8 | all_log=$CUR_DIR/quant_${model_name}_bench.log 9 | res_log=$CUR_DIR/quant_${model_name}_bench.txt 10 | if [ -f $all_log ]; then 11 | rm $all_log 12 | fi 13 | if [ -f $res_log ]; then 14 | rm $res_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len beam_size latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for beam_size in 1 4 32; do 20 | for input_seq_len in 16 32 64; do 21 | output_seq_len=$input_seq_len 22 | cd $CUR_DIR/python 23 | 24 | python3 
generate_model.py --model_name $model_full_name --sampling_method beam_search \ 25 | --beam_size $beam_size --input_seq_len $input_seq_len --output_seq_len=$output_seq_len 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/quant_transformer_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >> $all_log 33 | latency=$(tail -n 5 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $beam_size $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | 40 | pip3 install tabulate 41 | tabulate --header $res_log 42 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_enc_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template 9 | class LaunchEncEmbOp : public Operator { 10 | private: 11 | size_t _max_batch_tokens; 12 | int _pad_id; 13 | size_t _hidden_dim; 14 | size_t _multilg_type; 15 | 16 | size_t _batch_size; 17 | size_t _seq_len; 18 | 19 | Variable* _result; 20 | Variable* _pad_mask; 21 | 22 | public: 23 | LaunchEncEmbOp(size_t max_batch_tokens, int pad_id, size_t hidden_dim, 24 | size_t multilg_type) 25 | : Operator("LaunchEncEmbOp"), 26 | _max_batch_tokens(max_batch_tokens), 27 | _pad_id(pad_id), 28 | _hidden_dim(hidden_dim), 29 | _multilg_type(multilg_type) {} 30 | 31 | virtual ~LaunchEncEmbOp() {} 32 | 33 | std::tuple operator()(Variable* inp_tokens, 34 | Variable* token_emb, 35 | Variable* pos_emb, 36 | Variable* lang_emb, 37 | Variable* lang_id); 38 | 39 | void before_forward(size_t batch_size, size_t seq_len) { 40 | _batch_size = batch_size, _seq_len = seq_len; 41 | } 42 | 43 | void forward() override; 44 | 45 | void backward() override { 46 | printf("ERROR! 
LaunchEncEmbOp can't cal backward()\n"); 47 | exit(-1); 48 | } 49 | }; 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/gpt_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | #include "feed_forward_layer.h" 4 | #include "gpt_attention_layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class GptLayer : public Layer { 10 | private: 11 | GptAttentionLayerPtr _attn_layer; 12 | FeedForwardLayerPtr _ffn_layer; 13 | 14 | int _layer_id; 15 | 16 | public: 17 | GptLayer(int layer_id, int max_batch_tokens, int max_seq_len, int hidden_size, 18 | int num_heads, int intermediate_size, float attn_prob_dropout_ratio, 19 | float activation_dropout_ratio, float hidden_output_dropout_ratio, 20 | std::string activation_fn, bool mask_future_tokens, 21 | int beam_size = 1); 22 | virtual ~GptLayer() {} 23 | 24 | Variable* operator()(Variable* inp, Variable* cache_k, Variable* cache_v, 25 | Variable* pad_mask); 26 | 27 | void before_forward(int batch_size, int seq_len, int steps) { 28 | _attn_layer->before_forward(batch_size, seq_len, steps); 29 | _ffn_layer->before_forward(batch_size, seq_len); 30 | } 31 | 32 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 33 | 34 | int load_params(const std::vector& para_vec, int offset); 35 | }; 36 | 37 | template class GptLayer; 38 | #ifdef LIGHTSEQ_cuda 39 | template class GptLayer<__half, __half>; 40 | #endif 41 | 42 | template 43 | using GptLayerPtr = std::shared_ptr>; 44 | 45 | } // namespace lightseq 46 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/transform_0213.cpp: -------------------------------------------------------------------------------- 1 | #include "transform_0213.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* Transform0213OP::operator()(Variable* inp) { 7 | _result = new Variable("Transform0213_res", _max_numel, g_dtype(), 8 | g_dtype()); 9 | set_parents({inp}); 10 | this->set_children({_result}); 11 | return _result; 12 | } 13 | 14 | template 15 | void Transform0213OP::forward() { 16 | T1* inp_ptr = (T1*)parent(0)->value(); 17 | T1* res_ptr = (T1*)child(0)->value(); 18 | 19 | if (!_context_ptr->is_built()) { 20 | return; 21 | } 22 | #ifdef LIGHTSEQ_cuda 23 | cudaStream_t _stream = _context_ptr->get_stream(); 24 | cuda::launch_transform_0213(inp_ptr, res_ptr, _sz0, _sz1, _sz2, _sz3, 25 | _stream); 26 | #endif 27 | } 28 | 29 | template 30 | void Transform0213OP::backward() { 31 | T2* inp_grad = (T1*)parent(0)->grad(); 32 | T2* out_grad = (T1*)child(0)->grad(); 33 | 34 | if (!_context_ptr->is_built()) { 35 | return; 36 | } 37 | 38 | #ifdef LIGHTSEQ_cuda 39 | cudaStream_t _stream = _context_ptr->get_stream(); 40 | cuda::launch_transform_0213(out_grad, inp_grad, _sz0, _sz1, _sz2, _sz3, 41 | _stream); 42 | #endif 43 | } 44 | 45 | template class Transform0213OP; 46 | #ifdef LIGHTSEQ_cuda 47 | template class Transform0213OP<__half, __half>; 48 | #endif 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/vit.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/vit_encoder.h" 4 | #include "../proto/vit_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType 
vit_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType vit_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Vit : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | float *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | void *d_buf_; 29 | VitWeight tw_; 30 | 31 | public: 32 | Vit(const std::string weight_path, const int max_batch_size); 33 | 34 | ~Vit(); 35 | 36 | void Infer() override; 37 | void set_input_ptr(int index, void *input_ptr) override; 38 | void set_output_ptr(int index, void *output_ptr) override; 39 | const void *get_output_ptr(int index) override; 40 | std::vector get_input_max_shape(int index) override; 41 | std::vector get_output_max_shape(int index) override; 42 | DataType get_input_dtype(int index) override; 43 | DataType get_output_dtype(int index) override; 44 | void benchmark_mode(bool is_benchmark) override{}; 45 | }; 46 | 47 | LSMODEL_REGISTER(Vit); 48 | 49 | } // namespace cuda 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/bert.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/bert_encoder.h" 4 | #include "../proto/bert_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType bert_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType bert_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Bert : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | int *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | void *d_buf_; 29 | BertWeight tw_; 30 | 31 | public: 32 | Bert(const std::string weight_path, const int max_batch_size); 33 | 34 | ~Bert(); 35 | 36 | void Infer() override; 37 | void set_input_ptr(int index, void *input_ptr) override; 38 | void set_output_ptr(int index, void *output_ptr) override; 39 | const void *get_output_ptr(int index) override; 40 | std::vector get_input_max_shape(int index) override; 41 | std::vector get_output_max_shape(int index) override; 42 | DataType get_input_dtype(int index) override; 43 | DataType get_output_dtype(int index) override; 44 | void benchmark_mode(bool is_benchmark) override{}; 45 | }; 46 | 47 | LSMODEL_REGISTER(Bert); 48 | 49 | } // namespace cuda 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/embKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | void launch_split_multilg_request(const int *req, int *src_lang_id, 9 | int *trg_lang_id, int *src_token_id, 10 | int batch_size, int req_len, 11 | cudaStream_t &stream); 12 | 13 | template 14 | void launch_enc_emb(const T *token_emb, const T *pos_emb, const int *tokens, 15 | T *output, int *pad_mask, int pad_id, int 
batch_size, 16 | int seq_len, int hidden_dim, cudaStream_t stream, 17 | const T *lang_emb, const int *lang_id, int multilg_type); 18 | 19 | template 20 | void launch_dec_emb(const T *token_emb, const T *pos_emb, int *tokens, 21 | const T *lang_emb, const int *lang_id, T *output, 22 | int batch_size, int beam_size, int hidden_dim, 23 | int vocab_size, int step, int max_step, int multilg_type, 24 | cudaStream_t stream); 25 | 26 | template 27 | void launch_patch_emb(const T *conv_weight, const T *conv_bias, 28 | const T *pos_emb, const T *cls_emb, const float *input, 29 | T *output, int patch_size, int image_size, int batch_size, 30 | int max_step, int hidden_dim, int channel_input, 31 | cudaStream_t stream); 32 | 33 | } // namespace cuda 34 | } // namespace lightseq 35 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/embKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | void launch_split_multilg_request(const int *req, int *src_lang_id, 9 | int *trg_lang_id, int *src_token_id, 10 | int batch_size, int req_len, 11 | cudaStream_t &stream); 12 | 13 | template 14 | void launch_enc_emb(const T *token_emb, const T *pos_emb, const int *tokens, 15 | T *output, T *pad_mask, int pad_id, int batch_size, 16 | int seq_len, int hidden_dim, cudaStream_t stream, 17 | const T *lang_emb, const int *lang_id, int multilg_type); 18 | 19 | template 20 | void launch_dec_emb(const T *token_emb, const T *pos_emb, int *tokens, 21 | const T *lang_emb, const int *lang_id, T *output, 22 | int batch_size, int beam_size, int hidden_dim, 23 | int vocab_size, int step, int max_step, int multilg_type, 24 | cudaStream_t stream); 25 | 26 | template 27 | void launch_patch_emb(const T *conv_weight, const T *conv_bias, 28 | const T *pos_emb, const T *cls_emb, const float *input, 29 | T *output, int patch_size, int image_size, int batch_size, 30 | int max_step, int hidden_dim, int channel_input, 31 | cudaStream_t stream); 32 | 33 | } // namespace cuda 34 | } // namespace lightseq 35 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/quant_vit.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/quant_vit_encoder.h" 4 | #include "../proto/quant_vit_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType vit_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType vit_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class QuantVit : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | float *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | QuantVitWeight tw_; 29 | 30 | public: 31 | QuantVit(const std::string weight_path, const int max_batch_size); 32 | 33 | ~QuantVit(); 34 | 35 | void Infer() override; 36 | void set_input_ptr(int index, void *input_ptr) override; 37 | void set_output_ptr(int index, void *output_ptr) override; 38 | const void *get_output_ptr(int index) override; 39 | std::vector get_input_max_shape(int index) override; 40 | 
std::vector get_output_max_shape(int index) override; 41 | DataType get_input_dtype(int index) override; 42 | DataType get_output_dtype(int index) override; 43 | void benchmark_mode(bool is_benchmark) override{}; 44 | }; 45 | 46 | LSMODEL_REGISTER(QuantVit); 47 | 48 | } // namespace cuda 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/quant_bert.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/quant_bert_encoder.h" 4 | #include "../proto/quant_bert_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType bert_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType bert_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class QuantBert : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | int *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | QuantBertWeight tw_; 29 | 30 | public: 31 | QuantBert(const std::string weight_path, const int max_batch_size); 32 | 33 | ~QuantBert(); 34 | 35 | void Infer() override; 36 | void set_input_ptr(int index, void *input_ptr) override; 37 | void set_output_ptr(int index, void *output_ptr) override; 38 | const void *get_output_ptr(int index) override; 39 | std::vector get_input_max_shape(int index) override; 40 | std::vector get_output_max_shape(int index) override; 41 | DataType get_input_dtype(int index) override; 42 | DataType get_output_dtype(int index) override; 43 | void benchmark_mode(bool is_benchmark) override{}; 44 | }; 45 | 46 | LSMODEL_REGISTER(QuantBert); 47 | 48 | } // namespace cuda 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_finetune_bart/ls_fairseq_summarization_cnn_dm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/cnn_dm-bin" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/cnn_dm_data/databin_cnn_dm.tar.gz -P /tmp 9 | tar -xvf /tmp/databin_cnn_dm.tar.gz -C /tmp && rm /tmp/databin_cnn_dm.tar.gz 10 | fi 11 | 12 | if [ ! 
-d "/tmp/bart.large" ]; then 13 | echo "Downloading pretrained model" 14 | wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz -P /tmp 15 | tar -zxvf /tmp/bart.large.tar.gz -C /tmp && rm /tmp/bart.large.tar.gz 16 | fi 17 | 18 | lightseq-train /tmp/cnn_dm-bin \ 19 | --restore-file /tmp/bart.large/model.pt \ 20 | --max-tokens 2048 \ 21 | --task translation \ 22 | --source-lang source --target-lang target \ 23 | --truncate-source \ 24 | --layernorm-embedding \ 25 | --share-all-embeddings \ 26 | --reset-optimizer --reset-dataloader --reset-meters \ 27 | --required-batch-size-multiple 1 \ 28 | --arch ls_bart_large \ 29 | --criterion ls_label_smoothed_cross_entropy \ 30 | --label-smoothing 0.1 \ 31 | --dropout 0.1 --attention-dropout 0.1 \ 32 | --weight-decay 0.01 --optimizer ls_adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \ 33 | --clip-norm 0.1 \ 34 | --lr-scheduler polynomial_decay --lr 3e-05 --total-num-update 20000 --warmup-updates 500 \ 35 | --fp16 --update-freq 1 \ 36 | --skip-invalid-size-inputs-valid-test \ 37 | --find-unused-parameters 38 | -------------------------------------------------------------------------------- /lightseq/csrc/triton_backend/src/triton_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #include "triton/backend/backend_common.h" 3 | #include "triton/core/tritonserver.h" 4 | #include "model_base.h" 5 | 6 | TRITONSERVER_DataType transform_triton_datatype_to_lightseq( 7 | ::lightseq::cuda::DataType data_type_) { 8 | switch (data_type_) { 9 | case ::lightseq::cuda::DataType::kNotSupported: 10 | return TRITONSERVER_TYPE_INVALID; 11 | case ::lightseq::cuda::DataType::kFloat32: 12 | return TRITONSERVER_TYPE_FP32; 13 | case ::lightseq::cuda::DataType::kInt32: 14 | return TRITONSERVER_TYPE_INT32; 15 | case ::lightseq::cuda::DataType::kInt64: 16 | return TRITONSERVER_TYPE_INT64; 17 | case ::lightseq::cuda::DataType::kFloat16: 18 | return TRITONSERVER_TYPE_FP16; 19 | case ::lightseq::cuda::DataType::kInt8: 20 | return TRITONSERVER_TYPE_INT8; 21 | case ::lightseq::cuda::DataType::kInt16: 22 | return TRITONSERVER_TYPE_INT16; 23 | case ::lightseq::cuda::DataType::kByte: 24 | return TRITONSERVER_TYPE_BYTES; 25 | case ::lightseq::cuda::DataType::kUInt8: 26 | return TRITONSERVER_TYPE_UINT8; 27 | case ::lightseq::cuda::DataType::kUInt16: 28 | return TRITONSERVER_TYPE_UINT16; 29 | case ::lightseq::cuda::DataType::kUInt32: 30 | return TRITONSERVER_TYPE_UINT32; 31 | case ::lightseq::cuda::DataType::kUInt64: 32 | return TRITONSERVER_TYPE_UINT64; 33 | case ::lightseq::cuda::DataType::kFloat64: 34 | return TRITONSERVER_TYPE_FP64; 35 | default: 36 | return TRITONSERVER_TYPE_INVALID; 37 | } 38 | return TRITONSERVER_TYPE_INVALID; 39 | } 40 | -------------------------------------------------------------------------------- /lightseq/inference/triton_backend/src/triton_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #include "triton/backend/backend_common.h" 3 | #include "triton/core/tritonserver.h" 4 | #include "model_base.h" 5 | 6 | TRITONSERVER_DataType transform_triton_datatype_to_lightseq( 7 | ::lightseq::cuda::DataType data_type_) { 8 | switch (data_type_) { 9 | case ::lightseq::cuda::DataType::kNotSupported: 10 | return TRITONSERVER_TYPE_INVALID; 11 | case ::lightseq::cuda::DataType::kFloat32: 12 | return TRITONSERVER_TYPE_FP32; 13 | case ::lightseq::cuda::DataType::kInt32: 14 | return TRITONSERVER_TYPE_INT32; 15 | case ::lightseq::cuda::DataType::kInt64: 16 | return 
TRITONSERVER_TYPE_INT64; 17 | case ::lightseq::cuda::DataType::kFloat16: 18 | return TRITONSERVER_TYPE_FP16; 19 | case ::lightseq::cuda::DataType::kInt8: 20 | return TRITONSERVER_TYPE_INT8; 21 | case ::lightseq::cuda::DataType::kInt16: 22 | return TRITONSERVER_TYPE_INT16; 23 | case ::lightseq::cuda::DataType::kByte: 24 | return TRITONSERVER_TYPE_BYTES; 25 | case ::lightseq::cuda::DataType::kUInt8: 26 | return TRITONSERVER_TYPE_UINT8; 27 | case ::lightseq::cuda::DataType::kUInt16: 28 | return TRITONSERVER_TYPE_UINT16; 29 | case ::lightseq::cuda::DataType::kUInt32: 30 | return TRITONSERVER_TYPE_UINT32; 31 | case ::lightseq::cuda::DataType::kUInt64: 32 | return TRITONSERVER_TYPE_UINT64; 33 | case ::lightseq::cuda::DataType::kFloat64: 34 | return TRITONSERVER_TYPE_FP64; 35 | default: 36 | return TRITONSERVER_TYPE_INVALID; 37 | } 38 | return TRITONSERVER_TYPE_INVALID; 39 | } 40 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_dec_emb_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template 9 | class LaunchDecEmbOp : public Operator { 10 | private: 11 | size_t _max_batch_tokens; 12 | size_t _beam_size; 13 | size_t _hidden_size; 14 | size_t _trg_vocab_size; 15 | size_t _max_step; 16 | size_t _multilg_type; 17 | 18 | size_t _batch_size; 19 | int _cur_step; 20 | 21 | Variable* _result; 22 | 23 | public: 24 | LaunchDecEmbOp(int max_batch_tokens, size_t beam_size, size_t hidden_size, 25 | size_t trg_vocab_size, size_t max_step, size_t multilg_type) 26 | : Operator("LaunchDecEmbOp"), 27 | _max_batch_tokens(max_batch_tokens), 28 | _beam_size(beam_size), 29 | _hidden_size(hidden_size), 30 | _trg_vocab_size(trg_vocab_size), 31 | _max_step(max_step), 32 | _multilg_type(multilg_type) {} 33 | 34 | virtual ~LaunchDecEmbOp() {} 35 | 36 | Variable* operator()(Variable* inp_tokens, Variable* token_emb, 37 | Variable* pos_emb, Variable* lang_emb, 38 | Variable* lang_id); 39 | 40 | void before_forward(size_t batch_size, int cur_step) { 41 | _batch_size = batch_size, _cur_step = cur_step; 42 | _result->set_shape( 43 | {batch_size, size_t(cur_step + 1), _beam_size, _hidden_size}); 44 | } 45 | 46 | void forward() override; 47 | 48 | void backward() override { 49 | printf("ERROR! 
LaunchDecEmbOp can't cal backward()\n"); 50 | exit(-1); 51 | } 52 | }; 53 | } // namespace lightseq 54 | -------------------------------------------------------------------------------- /lightseq/csrc/models/includes/bert.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model_base.h" 3 | 4 | #include "bert_weight.h" 5 | 6 | #include "launch_enc_emb_layer.h" 7 | #include "transformer_encoder_layer.h" 8 | #include "lyr_normalize_layer.h" 9 | 10 | namespace lightseq { 11 | namespace cuda { 12 | 13 | class Bert : public LSModel { 14 | private: 15 | BertWeight tw_; 16 | std::shared_ptr _context_ptr; 17 | 18 | LaunchEncEmbLayerPtr launch_enc_emb_layer; 19 | std::vector > enc_layer_vec; 20 | LyrNormalizeLayerPtr lyr_norm_layer; 21 | 22 | ContextPtr context_ptr; 23 | 24 | Variable* inp_tokens; // need to allocate 25 | Variable* token_emb; 26 | Variable* pos_emb; 27 | Variable* lang_emb; 28 | Variable* lang_id; 29 | 30 | Variable* bert_out; 31 | 32 | int _max_batch_size; 33 | 34 | public: 35 | Bert(const std::string weight_path, const int max_batch_size); 36 | ~Bert(); 37 | 38 | void before_forward(int batch_size, int seq_len); 39 | 40 | void Infer() override; 41 | void set_input_ptr(int index, void* input_ptr) override; 42 | void set_output_ptr(int index, void* output_ptr) override; 43 | const void* get_output_ptr(int index) override; 44 | std::vector get_input_max_shape(int index) override; 45 | std::vector get_output_max_shape(int index) override; 46 | DataType get_input_dtype(int index) override; 47 | DataType get_output_dtype(int index) override; 48 | void benchmark_mode(bool is_benchmark) override {} 49 | }; 50 | 51 | LSMODEL_REGISTER(Bert); 52 | 53 | } // namespace cuda 54 | } // namespace lightseq 55 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/llama_layer.cpp: -------------------------------------------------------------------------------- 1 | #include "llama_layer.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | LlamaLayer::LlamaLayer(int max_batch_size, int max_seq_len, 7 | int hidden_size, int inner_dim, int num_heads, 8 | int beam_size) 9 | : Layer("LlamaLayer") { 10 | _attn_layer.reset(new LlamaAttentionLayer( 11 | max_batch_size, max_seq_len, hidden_size, num_heads, beam_size)); 12 | _mlp_layer.reset(new LlamaMLPLayer(max_batch_size * max_seq_len, 13 | hidden_size, inner_dim)); 14 | 15 | this->_context_ptr->exit_layer(); // necessary 16 | } 17 | 18 | template 19 | Variable* LlamaLayer::operator()(Variable* inp, Variable* cache_k, 20 | Variable* cache_v, 21 | Variable* pad_mask) { 22 | set_inputs({inp, cache_k, cache_v, pad_mask}); 23 | 24 | Variable* attn_out = (*_attn_layer)(inp, cache_k, cache_v, pad_mask); 25 | 26 | Variable* ffn_out = (*_mlp_layer)(attn_out); 27 | 28 | set_outputs({ffn_out}); 29 | return ffn_out; 30 | } 31 | 32 | template 33 | int LlamaLayer::load_params(const std::vector& para_vec, 34 | int offset) { // for inference 35 | int size = 0; 36 | 37 | size += _attn_layer->load_params(para_vec, offset + size); 38 | 39 | size += _mlp_layer->load_params(para_vec, offset + size); 40 | 41 | return size; 42 | } 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /.github/workflows/build_check.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: build 4 | 5 | on: 
6 | pull_request: 7 | branches: [master] 8 | push: 9 | paths-ignore: 10 | - 'docs/**' 11 | branches: [master] 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | format: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | # Steps represent a sequence of tasks that will be executed as part of the job 21 | steps: 22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 23 | - uses: actions/checkout@v2 24 | 25 | # Runs a single command using the runners shell 26 | - name: install pre-commit 27 | run: | 28 | pip install pre-commit 29 | sudo apt-get install -y --no-install-recommends clang-format 30 | clang-format --version 31 | 32 | # Runs a set of commands using the runners shell 33 | - name: check format 34 | run: | 35 | pre-commit run -a --show-diff-on-failure 36 | 37 | build_wheel: 38 | runs-on: ubuntu-latest 39 | container: taka23/lightseq:build-linux 40 | 41 | steps: 42 | - uses: actions/checkout@v1 43 | with: 44 | submodules: 'recursive' 45 | - name: env check 46 | run: | 47 | /opt/python/cp38-cp38/bin/python -V 48 | - name: build wheel 49 | run: | 50 | /opt/python/cp38-cp38/bin/python -m pip install -U build 51 | /opt/python/cp38-cp38/bin/python -m build 52 | -------------------------------------------------------------------------------- /tests/gemm_test/gemm_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from lightseq.training import gemm_test 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser( 8 | description="search for the best int8 gemm algorithm", 9 | usage="python gemm_test.py -hd 1024 -id 4096 -v 32000 -minb 1 -maxb 10 -d configs", 10 | ) 11 | parser.add_argument( 12 | "--hidden_dim", 13 | "-hd", 14 | type=int, 15 | help="hidden dimension of the model", 16 | ) 17 | parser.add_argument( 18 | "--inner_dim", 19 | "-id", 20 | type=int, 21 | help="inner dimension of the ffn layer", 22 | ) 23 | parser.add_argument( 24 | "--vocab_size", 25 | "-v", 26 | type=int, 27 | help="vocabulary size of the model", 28 | ) 29 | parser.add_argument( 30 | "--min_bsz", 31 | "-minb", 32 | type=int, 33 | default=1, 34 | help="minimal batch token size", 35 | ) 36 | parser.add_argument( 37 | "--max_bsz", 38 | "-maxb", 39 | type=int, 40 | default=10000, 41 | help="maximal batch token size", 42 | ) 43 | parser.add_argument( 44 | "--dir", 45 | "-d", 46 | type=str, 47 | default="/tmp/igemm_configs", 48 | help="path of the saved configs", 49 | ) 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | if __name__ == "__main__": 55 | args = parse_args() 56 | 57 | gemm_test( 58 | args.hidden_dim, 59 | args.inner_dim, 60 | args.vocab_size, 61 | args.min_bsz, 62 | args.max_bsz, 63 | args.dir, 64 | ) 65 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/dropout.cpp: -------------------------------------------------------------------------------- 1 | #include "dropout.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* DropoutOp::operator()(Variable* inp) { 7 | _result = 8 | new Variable("DropoutOp_out", _max_ele_num, g_dtype(), g_dtype()); 9 | set_parents({inp}); 10 | this->set_children({_result}); 11 | return _result; 12 | } 13 | 14 | template 15 | void DropoutOp::forward() { 16 | T1* input = parent(0)->value(); 17 | T1* output = child(0)->value(); 18 | uint8_t* mask_ptr = 
_mask->tensor(); 19 | 20 | if (!_context_ptr->is_built()) { 21 | return; 22 | } 23 | 24 | #ifdef LIGHTSEQ_cuda 25 | cudaStream_t stream = _context_ptr->get_stream(); 26 | cuda::launch_ls_dropout(output, input, mask_ptr, _count, RATIO(), stream, 27 | false); 28 | #elif defined LIGHTSEQ_x86 29 | //..... 30 | #endif 31 | } 32 | 33 | template 34 | void DropoutOp::backward() { 35 | T2* input_grad = (T2*)parent(0)->grad(); 36 | T2* output_grad = (T2*)child(0)->grad(); 37 | uint8_t* mask_ptr = (uint8_t*)_mask->tensor(); 38 | 39 | if (!_context_ptr->is_built()) { 40 | return; 41 | } 42 | 43 | if (_is_skip) { 44 | return; 45 | } 46 | 47 | #ifdef LIGHTSEQ_cuda 48 | cudaStream_t stream = _context_ptr->get_stream(); 49 | cuda::launch_ls_dropout(input_grad, output_grad, mask_ptr, _count, 50 | RATIO(), stream, true); 51 | #endif 52 | } 53 | 54 | template class DropoutOp; 55 | #ifdef LIGHTSEQ_cuda 56 | template class DropoutOp<__half, __half>; 57 | #endif 58 | } // namespace lightseq 59 | -------------------------------------------------------------------------------- /lightseq/csrc/models/includes/bert_crf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model_base.h" 3 | 4 | #include "bert_crf_weight.h" 5 | 6 | #include "launch_enc_emb_layer.h" 7 | #include "transformer_encoder_layer.h" 8 | #include "lyr_normalize_layer.h" 9 | #include "linear_layer.h" 10 | #include "crf_layer.h" 11 | 12 | namespace lightseq { 13 | namespace cuda { 14 | 15 | class BertCrf : public LSModel { 16 | private: 17 | BertCrfWeight tw_; 18 | std::shared_ptr _context_ptr; 19 | 20 | LaunchEncEmbLayerPtr launch_enc_emb_layer; 21 | std::vector > enc_layer_vec; 22 | LyrNormalizeLayerPtr lyr_norm_layer; 23 | LinearLayerPtr linear_layer; 24 | CRFLayerPtr crf_layer; 25 | 26 | ContextPtr context_ptr; 27 | 28 | Variable* inp_tokens; // need to allocate 29 | Variable* bert_out; 30 | 31 | int _max_batch_size; 32 | 33 | public: 34 | BertCrf(const std::string weight_path, const int max_batch_size); 35 | ~BertCrf(); 36 | 37 | void before_forward(int batch_size, int seq_len); 38 | 39 | void Infer() override; 40 | void set_input_ptr(int index, void* input_ptr) override; 41 | void set_output_ptr(int index, void* output_ptr) override; 42 | const void* get_output_ptr(int index) override; 43 | std::vector get_input_max_shape(int index) override; 44 | std::vector get_output_max_shape(int index) override; 45 | DataType get_input_dtype(int index) override; 46 | DataType get_output_dtype(int index) override; 47 | void benchmark_mode(bool is_benchmark) override {} 48 | }; 49 | 50 | LSMODEL_REGISTER(BertCrf); 51 | 52 | } // namespace cuda 53 | } // namespace lightseq 54 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/launch_dec_emb_op.cpp: -------------------------------------------------------------------------------- 1 | #include "launch_dec_emb_op.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* LaunchDecEmbOp::operator()(Variable* inp_tokens, 7 | Variable* token_emb, Variable* pos_emb, 8 | Variable* lang_emb, Variable* lang_id) { 9 | size_t max_size = _max_batch_tokens * _hidden_size * _beam_size; 10 | 11 | _result = 12 | new Variable("LaunchDecEmbOp_out", 13 | _max_batch_tokens * _hidden_size * _beam_size, g_dtype()); 14 | 15 | set_parents({inp_tokens, token_emb, pos_emb, lang_emb, lang_id}); 16 | 17 | this->set_children({_result}); 18 | return _result; 19 | } 20 | 21 | template 22 | void LaunchDecEmbOp::forward() 
{ 23 | int* inp_tokens = (int*)parent(0)->value(); 24 | const T* token_emb = (const T*)parent(1)->value(); 25 | const T* pos_emb = (const T* const)parent(2)->value(); 26 | T* lang_emb = (T*)parent(3)->value(); 27 | int* lang_id = (int*)parent(4)->value(); 28 | 29 | T* output_ptr = (T*)child(0)->value(); 30 | 31 | if (!_context_ptr->is_built()) { 32 | return; 33 | } 34 | 35 | #ifdef LIGHTSEQ_cuda 36 | cudaStream_t _stream = _context_ptr->get_stream(); 37 | cuda::launch_dec_emb(token_emb, pos_emb, inp_tokens, lang_emb, lang_id, 38 | output_ptr, _batch_size, _beam_size, _hidden_size, 39 | _trg_vocab_size, _cur_step, _max_step, _multilg_type, 40 | _stream); 41 | #endif 42 | } 43 | 44 | template class LaunchDecEmbOp; 45 | #ifdef LIGHTSEQ_cuda 46 | template class LaunchDecEmbOp<__half>; 47 | #endif 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/launch_enc_emb.cpp: -------------------------------------------------------------------------------- 1 | #include "launch_enc_emb.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | std::tuple LaunchEncEmbOp::operator()( 7 | Variable* inp_tokens, Variable* token_emb, Variable* pos_emb, 8 | Variable* lang_emb, Variable* lang_id) { 9 | size_t max_size = _max_batch_tokens * _hidden_dim; 10 | 11 | _result = new Variable("LaunchEncEmbOp_out", _max_batch_tokens * _hidden_dim, 12 | g_dtype()); 13 | _pad_mask = new Variable("pad_mask", _max_batch_tokens, g_dtype()); 14 | set_parents({inp_tokens, token_emb, pos_emb, lang_emb, lang_id}); 15 | this->set_children({_result, _pad_mask}); 16 | return std::make_tuple(_result, _pad_mask); 17 | } 18 | 19 | template 20 | void LaunchEncEmbOp::forward() { 21 | int* inp_tokens = (int*)parent(0)->value(); 22 | const T* token_emb = (const T*)parent(1)->value(); 23 | const T* pos_emb = (const T*)parent(2)->value(); 24 | T* lang_emb = (T*)parent(3)->value(); 25 | int* lang_id = (int*)parent(4)->value(); 26 | 27 | T* output_ptr = (T*)child(0)->value(); 28 | T* pad_mask = (T*)child(1)->value(); 29 | 30 | if (!_context_ptr->is_built()) { 31 | return; 32 | } 33 | 34 | #ifdef LIGHTSEQ_cuda 35 | cudaStream_t _stream = _context_ptr->get_stream(); 36 | cuda::launch_enc_emb(token_emb, pos_emb, inp_tokens, output_ptr, pad_mask, 37 | _pad_id, _batch_size, _seq_len, _hidden_dim, _stream, 38 | lang_emb, lang_id, _multilg_type); 39 | #endif 40 | } 41 | 42 | template class LaunchEncEmbOp; 43 | #ifdef LIGHTSEQ_cuda 44 | template class LaunchEncEmbOp<__half>; 45 | #endif 46 | } // namespace lightseq 47 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/transformer_encoder_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | #include "feed_forward_layer.h" 4 | #include "multihead_attention_layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class TransformerEncoderLayer : public Layer { 10 | private: 11 | MultiheadAttentionLayerPtr _attn_layer; 12 | FeedForwardLayerPtr _ffn_layer; 13 | 14 | int _layer_id; 15 | 16 | public: 17 | TransformerEncoderLayer(int layer_id, int max_batch_tokens, int max_seq_len, 18 | int hidden_size, int num_heads, int intermediate_size, 19 | float attn_prob_dropout_ratio, 20 | float activation_dropout_ratio, 21 | float hidden_output_dropout_ratio, bool is_pre_ln, 22 | std::string activation_fn, bool mask_future_tokens); 23 | virtual ~TransformerEncoderLayer() {} 24 | 25 | 
Variable* operator()(Variable* inp, Variable* inp_mask); 26 | 27 | void before_forward(int batch_size, int seq_len) { 28 | _attn_layer->before_forward(batch_size, seq_len); 29 | _ffn_layer->before_forward(batch_size, seq_len); 30 | } 31 | 32 | void before_backward() { return; } 33 | 34 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 35 | 36 | int load_params(const std::vector& para_vec, int offset); 37 | }; 38 | 39 | template class TransformerEncoderLayer; 40 | #ifdef LIGHTSEQ_cuda 41 | template class TransformerEncoderLayer<__half, __half>; 42 | #endif 43 | 44 | template 45 | using TransformerEncoderLayerPtr = 46 | std::shared_ptr>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /lightseq/csrc/triton_backend/src/libtriton_minimal.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /examples/inference/python/export/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tensorflow as tf 3 | import h5py 4 | 5 | from export.proto.transformer_pb2 import Transformer 6 | from lightseq.training import export_pb2hdf5 7 | from lightseq.training import export_quant_pb2hdf5 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="") 12 | parser.add_argument( 13 | "--model", 14 | "-m", 15 | type=str, 16 | default="checkpoint_best.pt", 17 | help="path of fairseq checkpoint", 18 | ) 19 | parser.add_argument( 20 | "--hdf5", 21 | "-hdf5", 22 | action="store_true", 23 | help="whether to store hdf5", 24 | ) 25 | parser.add_argument( 26 | "--generation_method", 27 | "-g", 28 | type=str, 29 | default="beam_search", 30 | choices=["beam_search", "topk_greedy", "topk", "topp", "ppl"], 31 | help="generation method", 32 | ) 33 | args = parser.parse_args() 34 | return args 35 | 36 | 37 | def save_model(transformer, pb_path, hdf5_path, hdf5): 38 | if not hdf5: 39 | try: 40 | str_model = transformer.SerializeToString() 41 | print("Writing to {0}".format(pb_path)) 42 | with tf.io.gfile.GFile(pb_path, "wb") as fout: 43 | fout.write(str_model) 44 | return pb_path 45 | except: 46 | pass 47 | 48 | print("Writing to {0}".format(hdf5_path)) 49 | f = h5py.File(hdf5_path, "w") 50 | if isinstance(transformer, Transformer): 51 | export_pb2hdf5(transformer, f) 52 | else: 53 | export_quant_pb2hdf5(transformer, f) 54 | f.close() 55 | return hdf5_path 56 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/util.cc: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | namespace lightseq { 4 | namespace x86 {} // namespace x86 5 | 6 | template 7 | void print_vec(const T* outv, std::string outn, int num_output_ele) { 8 | std::cout << outn << " address: " << outv << std::endl; 9 | printf("value: "); 10 | for (int i = 0; i < num_output_ele; i++) { 11 | std::cout << outv[i] << ", "; 12 | } 13 | std::cout << std::endl; 14 | } 15 | 16 | template <> 17 | void print_vec(const int8_t* outv, std::string outn, 18 | int num_output_ele) { 19 | std::cout << outn << " address: " << outv << std::endl; 20 | printf("value: "); 21 | for (int i = 0; i < num_output_ele; i++) { 22 | std::cout << static_cast(outv[i]) << ", "; 23 | } 24 | std::cout << std::endl; 25 | } 26 | 27 | template <> 28 | void print_vec(const uint8_t* outv, std::string outn, 29 | int num_output_ele) { 30 | std::cout << outn << " address: " << outv << std::endl; 31 | printf("value: "); 32 | for (int i = 0; i < num_output_ele; i++) { 33 | std::cout << static_cast(outv[i]) << ", "; 34 | } 35 | std::cout << std::endl; 36 | } 37 | 38 | template void print_vec(const float* outv, std::string outn, 39 | int num_output_ele); 40 | 41 | template void print_vec(const int* outv, std::string outn, 42 | int num_output_ele); 43 | 44 | template void print_vec(const int8_t* outv, std::string outn, 45 | int num_output_ele); 46 | 47 | template void print_vec(const uint8_t* outv, std::string outn, 48 | int num_output_ele); 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- 
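The `save_model` helper in `examples/inference/python/export/util.py` above first tries to serialize the `Transformer` protobuf and silently falls back to HDF5 if serialization fails. A minimal usage sketch, assuming the `examples/inference/python` directory is importable and that the message has already been populated from a trained checkpoint (the import path and file names below are illustrative, not verified):

```python
# Hypothetical sketch of driving the save_model helper shown above.
# The import path and output file names are assumptions for illustration.
from export.proto.transformer_pb2 import Transformer
from export.util import save_model  # assumed location of the helper above

transformer = Transformer()
# ... copy weights from a trained checkpoint into the protobuf message here ...

saved_path = save_model(
    transformer,
    pb_path="transformer.pb",      # protobuf output, tried first
    hdf5_path="transformer.hdf5",  # fallback if protobuf serialization fails
    hdf5=False,                    # set True to force HDF5 export
)
print("exported to", saved_path)
```

The helper returns whichever path it actually wrote, so the caller can log it or pass it straight to the inference test scripts.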
/lightseq/inference/triton_backend/src/libtriton_minimal.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/cuda_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace lightseq { 16 | 17 | /* Print vector stored in GPU memory, for debug */ 18 | template 19 | void print_vec(const T *outv, std::string outn, int num_output_ele); 20 | 21 | template 22 | void print_vec(const T *outv, std::string outn, int start, int end); 23 | 24 | namespace cuda { 25 | template 26 | void check_gpu_error(T result, char const *const func, const char *const file, 27 | int const line); 28 | 29 | #define CHECK_GPU_ERROR(val) \ 30 | ::lightseq::cuda::check_gpu_error((val), #val, __FILE__, __LINE__) 31 | 32 | template 33 | T *cuda_malloc(size_t ele_num); 34 | 35 | void cuda_free(void *pdata); 36 | 37 | template 38 | void cuda_set(T *pdata, int value, size_t ele_num); 39 | 40 | template 41 | void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf, 42 | std::string file, int line, cudaStream_t stream); 43 | 44 | #define CHECK_NAN_INF(ptr, size, stream) \ 45 | check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \ 46 | check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream)) 47 | 48 | template 49 | void check_2norm(const T *data_ptr, std::string tensor_name, int dsize, 50 | cudaStream_t stream); 51 | 52 | int getSMVersion(); 53 | 54 | std::string getGPUName(); 55 | } // namespace cuda 56 | } // namespace lightseq 57 | -------------------------------------------------------------------------------- /examples/triton_backend/transformer_client_example.py: -------------------------------------------------------------------------------- 1 | import tritonclient.http as httpclient 2 | import os, sys 3 | from tritonclient.utils import InferenceServerException 4 | import numpy as np 5 | 6 | 7 | if __name__ == "__main__": 8 | port = os.environ["HTTP_PORT"] 9 | http_url = "localhost:{}".format(port) 10 | triton_client = httpclient.InferenceServerClient(url=http_url, concurrency=2) 11 | 12 | model_name = "transformer_example" 13 | array_list = np.array( 14 | [ 15 | [324, 423, 5413, 1314, 1451, 4134, 946, 1467], 16 | [324, 423, 5413, 1314, 1451, 4134, 946, 1467], 17 | ], 18 | np.int32, 19 | copy=True, 20 | ) 21 | 22 | async_requests = [] 23 | 24 | for _ in range(10): 25 | inputs = [] # type: httpclient.InferInput 26 | outputs = [] 27 | inputs.append(httpclient.InferInput("source_ids", array_list.shape, "INT32")) 28 | inputs[0].set_data_from_numpy(array_list) 29 | 30 | outputs.append(httpclient.InferRequestedOutput("target_ids")) 31 | outputs.append(httpclient.InferRequestedOutput("target_scores")) 32 | 33 | async_requests.append( 34 | triton_client.async_infer( 35 | model_name=model_name, inputs=inputs, outputs=outputs 36 | ) 37 | ) 38 | 39 | for request in async_requests: 40 | result = request.get_result(block=True) 41 | 42 | if type(result) == InferenceServerException: 43 | print("error") 44 | sys.exit(0) 45 | 46 | target_ids = result.as_numpy("target_ids") 47 | target_scores = result.as_numpy("target_scores") 48 | 49 | print("target_ids: ", target_ids) 50 | print("target_scores: ", target_scores) 51 | -------------------------------------------------------------------------------- 
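`transformer_client_example.py` above issues ten asynchronous requests and then blocks on each result. For a quick connectivity check, a single blocking call is often enough; a minimal sketch using the same `tritonclient` HTTP API, where the URL, model name, and token IDs are placeholders:

```python
# Minimal synchronous variant of the async Triton client example above.
# URL, model name, and token IDs are illustrative placeholders.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

source_ids = np.array([[324, 423, 5413, 1314, 1451, 4134, 946, 1467]], dtype=np.int32)
inp = httpclient.InferInput("source_ids", list(source_ids.shape), "INT32")
inp.set_data_from_numpy(source_ids)

outputs = [
    httpclient.InferRequestedOutput("target_ids"),
    httpclient.InferRequestedOutput("target_scores"),
]

# Blocking call; returns an InferResult once the server responds.
result = client.infer(model_name="transformer_example", inputs=[inp], outputs=outputs)
print("target_ids:", result.as_numpy("target_ids"))
print("target_scores:", result.as_numpy("target_scores"))
```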
/lightseq/csrc/ops_new/includes/sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class SamplingOp : public Operator { 9 | private: 10 | GenerateMethod _generate_method; 11 | int _max_batch_size; 12 | int _max_step; 13 | int _max_thread_per_block; 14 | int _trg_vocab_size; 15 | int _topk; 16 | float _topp; 17 | int _eos_id; 18 | bool _has_logits_bias; 19 | int* _p_d_unfinished; 20 | 21 | int _batch_size; 22 | int _seq_len; 23 | int _logits_seq_len; 24 | int _prompt_len; 25 | int _cur_step; 26 | 27 | int _h_unfinished; 28 | 29 | #ifdef LIGHTSEQ_cuda 30 | curandState* _p_d_curandstate; //[batch_size] 31 | #endif 32 | 33 | Variable* _out_token_ids; 34 | Variable* _seq_score; 35 | 36 | public: 37 | SamplingOp(GenerateMethod gm, int max_batch_size, int max_step, 38 | int max_thread_per_block, int trg_vocab_size, int topk, float topp, 39 | int eos_id); 40 | 41 | virtual ~SamplingOp() {} 42 | 43 | // output: new_token_ids 44 | std::tuple operator()(Variable* logits, 45 | Variable* logit_bias, 46 | Variable* token_ids); 47 | 48 | void before_forward(int batch_size, int prompt_len, int cur_step, 49 | int logits_seq_len) { 50 | _batch_size = batch_size; 51 | _prompt_len = prompt_len; 52 | _cur_step = cur_step; 53 | _seq_len = prompt_len + cur_step; 54 | _logits_seq_len = logits_seq_len; 55 | } 56 | 57 | void forward() override; 58 | 59 | void backward() override {} 60 | 61 | bool is_stop() { return _h_unfinished == 0; } 62 | }; 63 | 64 | } // namespace lightseq 65 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/generator_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "beam_search_topk.h" 4 | #include "sampling.h" 5 | #include "layer.h" 6 | 7 | namespace lightseq { 8 | 9 | template 10 | class GeneratorLayer : public Layer { 11 | private: 12 | // operators 13 | BeamSearchTopOp* _beam_search = nullptr; 14 | SamplingOp* _sampling = nullptr; 15 | 16 | // parameters 17 | Variable* _logit_bias; 18 | size_t _trg_vocab_size; 19 | bool _has_logits_bias; 20 | 21 | GenerateMethod _generate_method; 22 | 23 | public: 24 | // this construct method is for beam_search generate method. 
25 | GeneratorLayer(GenerateMethod gm, int nshared_dec_layer, int max_batch_size, 26 | int max_step, int trg_vocab_size, int hidden_size, 27 | int max_thread_per_block, int beam_size = 0, 28 | float diverse_lambda = 0., int dim_per_head = 0, 29 | int end_id = 0, int head_num = 0, float length_penalty = 0., 30 | int topk = 0, float topp = 0, bool has_logits_bias = false); 31 | 32 | virtual ~GeneratorLayer() {} 33 | 34 | std::tuple operator()(Variable* logits, 35 | Variable* alive_seq); 36 | 37 | void before_forward(int batch_size, int prompt_len, int cur_step); 38 | 39 | void refresh_cache(Variable* caches_k, Variable* caches_v); 40 | 41 | int load_params(const std::vector& para_vec, int offset); 42 | 43 | bool is_stop(); 44 | }; 45 | 46 | template class GeneratorLayer; 47 | #ifdef LIGHTSEQ_cuda 48 | template class GeneratorLayer<__half>; 49 | #endif 50 | 51 | template 52 | using GeneratorLayerPtr = std::shared_ptr>; 53 | 54 | } // namespace lightseq 55 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_ner/run_gcq_ner.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | # You can use multiple NICs in NCCL communication. 18 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 19 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 20 | 21 | # Set your environment variables according to your training environment, 22 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 23 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 24 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 25 | --master_port=$WORKER_0_PORT \ 26 | $THIS_DIR/run_gcq_ner.py \ 27 | --model_name_or_path bert-base-uncased \ 28 | --dataset_name conll2003 \ 29 | --do_train \ 30 | --do_eval \ 31 | --per_device_train_batch_size 16 \ 32 | --num_train_epochs 10 \ 33 | --output_dir /tmp/test-ner \ 34 | --overwrite_output_dir \ 35 | --fp16 \ 36 | --seed 1234 \ 37 | --logging_steps 10 \ 38 | --module_type 2 \ 39 | --enable_quant false \ 40 | --enable_GCQ true \ 41 | --GCQ_quantile 0.99 42 | -------------------------------------------------------------------------------- /lightseq/training/gcq/ls_fs_gcq_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.distributed as dist 4 | from torch.nn.parallel import DistributedDataParallel 5 | from fairseq.trainer import Trainer 6 | from packaging import version 7 | from .gcq import GCQState, encode_and_decode 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class LSTrainer(Trainer): 13 | """ 14 | Main class for data parallel. 
15 | 16 | This class supports GCQ (Gradient Communication Quantization) for 17 | distributed multi-machine training based on fairseq.trainer.Trainer. 18 | """ 19 | 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | 23 | @property 24 | def model(self): 25 | if self._wrapped_model is None: 26 | super().model 27 | if isinstance(self._wrapped_model, DistributedDataParallel) and getattr( 28 | self.args, "enable_GCQ", False 29 | ): 30 | assert version.parse(torch.__version__) >= version.parse( 31 | "1.10" 32 | ), "Training with GCQ requires that the version of torch has to be greater than or equal to 1.10!" 33 | state = GCQState( 34 | process_group=dist.group.WORLD if dist.is_initialized() else None, 35 | hidden_size=self.args.encoder_embed_dim, 36 | quantile_value=self.args.GCQ_quantile, 37 | ) 38 | # Register the communication hook. 39 | self._wrapped_model.register_comm_hook( 40 | state=state, hook=encode_and_decode 41 | ) 42 | logger.info("############ register communication hook done ###########") 43 | return self._wrapped_model 44 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_llama_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // dropout inside ffn. 9 | template 10 | class LaunchLlamaEmbOp : public Operator { 11 | private: 12 | size_t _max_batch_tokens; 13 | int _pad_id; 14 | size_t _hidden_dim; 15 | 16 | size_t _batch_size; 17 | size_t _seq_len; 18 | int _max_step; 19 | int _beam_size; 20 | int _offset; 21 | int _max_batch_size; 22 | 23 | Variable* _result; 24 | Variable* _pad_mask; 25 | Variable* _left_pad_len; 26 | 27 | public: 28 | LaunchLlamaEmbOp(size_t max_batch_tokens, int max_step, int max_batch_size, 29 | int beam_size, int pad_id, size_t hidden_dim) 30 | : Operator("LaunchLlamaEmbOp"), 31 | _max_batch_tokens(max_batch_tokens), 32 | _max_batch_size(max_batch_size), 33 | _pad_id(pad_id), 34 | _max_step(max_step), 35 | _beam_size(beam_size), 36 | _hidden_dim(hidden_dim) {} 37 | 38 | virtual ~LaunchLlamaEmbOp() {} 39 | 40 | std::tuple operator()(Variable* inp_tokens, 41 | Variable* token_emb); 42 | 43 | void before_forward(size_t batch_size, size_t seq_len, int offset) { 44 | _batch_size = batch_size, _seq_len = seq_len, _offset = offset; 45 | _result->set_shape({batch_size * seq_len, _hidden_dim}); 46 | _pad_mask->set_shape({batch_size, seq_len + offset}); 47 | _left_pad_len->set_shape({_batch_size, size_t(_beam_size)}); 48 | } 49 | 50 | void forward() override; 51 | 52 | void backward() override { 53 | printf("ERROR! 
LaunchLlamaEmbOp can't cal backward()\n"); 54 | exit(-1); 55 | } 56 | }; 57 | } // namespace lightseq 58 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/llama_kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "kernels.h" 6 | #include 7 | 8 | namespace lightseq { 9 | namespace cuda { 10 | 11 | template 12 | void launch_llama_embedding(const T *token_emb, const int *tokens, T *output, 13 | T *pad_mask_ptr, int *left_pad_len_ptr, 14 | int batch_size, int beam_size, int hidden_dim, 15 | int step_offset, int seq_len, int max_step, 16 | int padding_id, cudaStream_t stream); 17 | 18 | template 19 | void launch_split_rotary_position_qkv(const T *input_ptr, const T *sin_ptr, 20 | const T *cos_ptr, T *q_out, 21 | T *cache_k_out, T *cache_v_out, 22 | size_t max_step, size_t batch_size, 23 | size_t nhead, size_t offset_seq_len, 24 | size_t query_len, size_t head_dim, 25 | cudaStream_t stream); 26 | 27 | template 28 | void launch_silu_elewise_product(const T *inp_ptr, T *out_ptr, 29 | size_t batch_size, size_t seq_len, 30 | size_t inner_size, cudaStream_t stream); 31 | 32 | template 33 | void launch_rms_layer_norm(const T *inp_ptr, const T *scale_ptr, T *out_ptr, 34 | T *res_ptr, T *rms_ptr, size_t batch_tokens, 35 | size_t hidden_dim, cudaStream_t stream, 36 | const float ln_epsilon = 1e-6f); 37 | 38 | } // namespace cuda 39 | } // namespace lightseq 40 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/gpt.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/gpt_encoder.h" 4 | #include "../proto/gpt_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType gpt_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType gpt_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Gpt : public LSModel { 18 | private: 19 | typedef lightseq::cuda::OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | int* d_input_; 23 | int* d_sample_id; 24 | float* d_ppl; 25 | void* d_buf_; 26 | 27 | int _max_batch_size; 28 | cudaStream_t stream_; 29 | cudaStream_t cache_stream_; 30 | cublasHandle_t hd_; 31 | lightseq::cuda::GptWeight tw_; 32 | std::set available_sampling_methods = {"topk", "topp"}; 33 | 34 | public: 35 | Gpt(const std::string weight_path, const int max_batch_size); 36 | 37 | ~Gpt(); 38 | 39 | const int* get_result_ptr(); 40 | const float* get_score_ptr(); 41 | int get_max_step() { return tw_._max_step; } 42 | 43 | void Infer() override; 44 | void set_input_ptr(int index, void* input_ptr) override; 45 | void set_output_ptr(int index, void* output_ptr) override; 46 | const void* get_output_ptr(int index) override; 47 | std::vector get_input_max_shape(int index) override; 48 | std::vector get_output_max_shape(int index) override; 49 | DataType get_input_dtype(int index) override; 50 | DataType get_output_dtype(int index) override; 51 | void benchmark_mode(bool is_benchmark) override; 52 | }; 53 | 54 | LSMODEL_REGISTER(Gpt); 55 | 56 | } // namespace cuda 57 | } // namespace lightseq 58 | -------------------------------------------------------------------------------- 
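The `Gpt` class in `pywrapper/gpt.h` above is registered via `LSMODEL_REGISTER` and is what the `lightseq.inference` Python extension wraps. A rough usage sketch, assuming the Python wrapper mirrors the C++ constructor (`weight_path`, `max_batch_size`) and exposes a `sample` entry point; the method names and weight file are assumptions, not a verified API surface:

```python
# Hedged sketch of calling the GPT inference wrapper from Python.
# Module/method names are assumptions based on gpt.h above; the weight
# file path and token IDs are placeholders.
import numpy as np
import lightseq.inference as lsi

model = lsi.Gpt("lightseq_gpt2.hdf5", 8)  # weight_path, max_batch_size

input_ids = np.array([[1, 2, 3, 4]], dtype=np.int32)  # placeholder prompt tokens
generated = model.sample(input_ids)  # assumed generation entry point
print(generated)
```

On the C++ side the same flow corresponds to `set_input_ptr`, `Infer`, and `get_output_ptr` on the `LSModel` interface declared in the header above.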
/examples/training/fairseq/ls_fairseq_gcq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | # You can use multiple NICs in NCCL communication. 13 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 14 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 15 | 16 | # Set your environment variables according to your training environment, 17 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 18 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 19 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 20 | --master_port=$WORKER_0_PORT \ 21 | $(which lightseq-train) /tmp/wmt14_en_de/ \ 22 | --task translation \ 23 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 24 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 25 | --clip-norm 0.0 \ 26 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 27 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 28 | --max-tokens 8192 \ 29 | --eval-bleu \ 30 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 31 | --eval-bleu-detok moses \ 32 | --eval-bleu-remove-bpe \ 33 | --eval-bleu-print-samples \ 34 | --best-checkpoint-metric bleu \ 35 | --maximize-best-checkpoint-metric \ 36 | --fp16 \ 37 | --enable_GCQ \ 38 | --GCQ_quantile 0.99 39 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # LightSeq Examples 2 | 3 | ## Table of Contents 4 | - [Cpp Examples](#cpp-examples) 5 | - [Python Examples](#python-examples) 6 | - [Train the models](#train-the-models) 7 | - [Export and infer the models](#export-and-infer-the-models) 8 | - [Deploy using Tritonbackend](#deploy-using-tritonbackend) 9 | 10 | ## Cpp Examples 11 | We provide multiple cpp examples of LightSeq inference. 12 | 13 | First you should use the training examples in the following to train a model, and then export it to protobuf or HDF5 format. 14 | 15 | Then use the cpp examples to infer the models: 16 | 1. Uncomment the `add_subdirectory(examples/inference/cpp)` in the [CMakeLists.txt](../CMakeLists.txt). 17 | 2. Build the LightSeq. Refer to [build.md](./build.md) for more details. 18 | 3. Switch to `build/temp.linux-xxx/examples/inference/cpp`, and then run `sudo make` to compile the cpp example. 19 | 4. Run the cpp examples by `./xxx_example MODEL_PATH`. 20 | 21 | ## Python Examples 22 | We provide a series of Python examples to show how to use LightSeq to do model training and inference. 23 | 24 | ### Train the models 25 | Currently, LightSeq supports training from [Fairseq](../examples/training/fairseq/README.md), [Hugging Face](../examples/training/huggingface/README.md), [DeepSpeed](../examples/training/deepspeed/README.md) and [from scratch](../examples/training/custom/README.md). For more training details, please refer to the respective README. 
26 | 27 | ### Export and infer the models 28 | First export the models training by Fairseq, Hugging Face or LightSeq to protobuf or HDF5 format. Then test the results and speeds using the testing scripts. 29 | 30 | Refer to [here](../examples/inference/python/README.md) for more details. 31 | 32 | ## Deploy using Tritonbackend 33 | Refer to [here](../examples/triton_backend/README.md) for more details. 34 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/utils.cc: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | void print_vec(const T *outv, std::string outn, int num_output_ele) { 7 | std::cout << outn << " address: " << outv << std::endl; 8 | printf("value: "); 9 | for (int i = 0; i < num_output_ele; i++) { 10 | std::cout << outv[i] << ", "; 11 | } 12 | std::cout << std::endl; 13 | } 14 | 15 | template <> 16 | void print_vec(const int8_t *outv, std::string outn, 17 | int num_output_ele) { 18 | std::cout << outn << " address: " << outv << std::endl; 19 | printf("value: "); 20 | for (int i = 0; i < num_output_ele; i++) { 21 | std::cout << static_cast(outv[i]) << ", "; 22 | } 23 | std::cout << std::endl; 24 | } 25 | 26 | template <> 27 | void print_vec(const uint8_t *outv, std::string outn, 28 | int num_output_ele) { 29 | std::cout << outn << " address: " << outv << std::endl; 30 | printf("value: "); 31 | for (int i = 0; i < num_output_ele; i++) { 32 | std::cout << static_cast(outv[i]) << ", "; 33 | } 34 | std::cout << std::endl; 35 | } 36 | 37 | template void print_vec(const float *outv, std::string outn, 38 | int num_output_ele); 39 | 40 | template void print_vec(const int *outv, std::string outn, 41 | int num_output_ele); 42 | 43 | template void print_vec(const int8_t *outv, std::string outn, 44 | int num_output_ele); 45 | 46 | template void print_vec(const int8_t *outv, std::string outn, 47 | int num_output_ele); 48 | 49 | template void print_vec(const uint8_t *outv, std::string outn, 50 | int num_output_ele); 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/linear.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class LinearOp : public Operator { 9 | private: 10 | size_t _output_size; 11 | size_t _input_size; 12 | size_t _max_batch_tokens; 13 | size_t _batch_tokens; 14 | std::array _gemm_algos; 15 | 16 | float _alpha; 17 | float _beta; 18 | MATRIX_OP _opA; 19 | MATRIX_OP _opB; 20 | bool _use_residual = false; 21 | 22 | Variable* _result; 23 | 24 | #ifdef PYBIND_INTERFACE 25 | #define weight_op MATRIX_OP::Transpose 26 | #else 27 | #define weight_op MATRIX_OP::NonTranspose 28 | #endif 29 | 30 | public: 31 | LinearOp(size_t max_batch_tokens, size_t output_size, size_t input_size, 32 | MATRIX_OP opA = weight_op, MATRIX_OP opB = MATRIX_OP::NonTranspose, 33 | float alpha = float(1.), float beta = float(0.)) 34 | : Operator("LinearOp"), 35 | _max_batch_tokens(max_batch_tokens), 36 | _output_size(output_size), 37 | _input_size(input_size), 38 | _opA(opA), 39 | _opB(opB), 40 | _gemm_algos(std::array({99, 99, 99})), 41 | _alpha(alpha), 42 | _beta(beta) {} 43 | 44 | ~LinearOp() {} 45 | 46 | Variable* operator()(Variable* inp, Variable* weight); 47 | Variable* operator()(Variable* inp, 
Variable* weight, Variable* residual); 48 | 49 | void forward() override; 50 | 51 | void before_forward(size_t batch_tokens) { 52 | _batch_tokens = batch_tokens; 53 | if (_use_residual) { 54 | _result->set_offset(0, {batch_tokens, _output_size}); 55 | } else { 56 | _result->set_shape({batch_tokens, _output_size}); 57 | } 58 | } 59 | 60 | void backward() override; 61 | 62 | void before_backward() {} 63 | }; 64 | 65 | } // namespace lightseq 66 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/multilgKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void ker_multilg_enc_emb_launcher(int batch_size, int batch_seq_len, 10 | int hidden_size, cudaStream_t stream, 11 | const T* token_emb, const T* pos_emb, 12 | const T* src_lang_emb, const int* token_id, 13 | T* output, int* padding_mask, int padding_id, 14 | int max_thread_per_block); 15 | 16 | template 17 | void ker_multilg_dec_emb_launcher(int step_token_num, int hidden_size, 18 | cudaStream_t stream, const T* token_emb, 19 | const T* pos_emb, const T* src_lang_emb, 20 | const T* trg_lang_emb, 21 | const int* src_token_id, const int* token_id, 22 | T* output, int step, int max_step, 23 | int vocab_size, int beam_size, 24 | int src_seq_len, int max_thread_per_block); 25 | 26 | template 27 | void select_beam_rough_topk_multilg_launcher( 28 | const T* logits, const T* logit_bias, const float* seq_probs, 29 | const float* seq_score, const int* alive_seq, const int* vocab_mask, 30 | const int* src_token_id, int* can_idx, float* can_score, int* num_beam_can, 31 | int vocab_size, int max_step, float length_norm, int cur_step, 32 | int step_token_num, int max_thread_per_block, cudaStream_t stream, 33 | int beam_size, float diverse_lambda, int end_id, int src_seq_len); 34 | 35 | } // namespace cuda 36 | } // namespace lightseq 37 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_qa/run_gcq_qa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | # You can use multiple NICs in NCCL communication. 18 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 
19 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 20 | 21 | # Set your environment variables according to your training environment, 22 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 23 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 24 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 25 | --master_port=$WORKER_0_PORT \ 26 | $THIS_DIR/run_gcq_qa.py \ 27 | --model_name_or_path bert-base-uncased \ 28 | --dataset_name squad \ 29 | --do_train \ 30 | --do_eval \ 31 | --max_seq_length 256 \ 32 | --per_device_train_batch_size 16 \ 33 | --doc_stride 128 \ 34 | --learning_rate 3e-5 \ 35 | --num_train_epochs 10 \ 36 | --output_dir /tmp/squad \ 37 | --overwrite_output_dir \ 38 | --fp16 \ 39 | --seed 1234 \ 40 | --logging_steps 10 \ 41 | --module_type 1 \ 42 | --enable_quant false \ 43 | --enable_GCQ true \ 44 | --GCQ_quantile 0.99 45 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/quant_gpt.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/quant_gpt_encoder.h" 4 | #include "../proto/quant_gpt_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType gpt_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType gpt_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class QuantGpt : public LSModel { 18 | private: 19 | typedef lightseq::cuda::OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | int* d_input_; 23 | int* d_sample_id; 24 | float* d_ppl; 25 | 26 | int _max_batch_size; 27 | cudaStream_t stream_; 28 | cudaStream_t cache_stream_; 29 | cublasHandle_t hd_; 30 | lightseq::cuda::QuantGptWeight tw_; 31 | std::set available_sampling_methods = {"topk", "topp"}; 32 | 33 | public: 34 | QuantGpt(const std::string weight_path, const int max_batch_size); 35 | 36 | ~QuantGpt(); 37 | 38 | const int* get_result_ptr(); 39 | const float* get_score_ptr(); 40 | const int get_max_step() { return tw_._max_step; } 41 | 42 | void Infer() override; 43 | void set_input_ptr(int index, void* input_ptr) override; 44 | void set_output_ptr(int index, void* output_ptr) override; 45 | const void* get_output_ptr(int index) override; 46 | std::vector get_input_max_shape(int index) override; 47 | std::vector get_output_max_shape(int index) override; 48 | DataType get_input_dtype(int index) override; 49 | DataType get_output_dtype(int index) override; 50 | void benchmark_mode(bool is_benchmark) override{}; 51 | }; 52 | 53 | LSMODEL_REGISTER(QuantGpt); 54 | 55 | } // namespace cuda 56 | } // namespace lightseq 57 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_gpt_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // dropout inside ffn. 
9 | template 10 | class LaunchGptEmbOp : public Operator { 11 | private: 12 | size_t _max_batch_tokens; 13 | int _pad_id; 14 | size_t _hidden_dim; 15 | 16 | size_t _batch_size; 17 | size_t _seq_len; 18 | int _max_step; 19 | int _beam_size; 20 | int _offset; 21 | int _max_batch_size; 22 | 23 | Variable* _result; 24 | Variable* _pad_mask; 25 | Variable* _left_pad_len; 26 | 27 | public: 28 | LaunchGptEmbOp(size_t max_batch_tokens, int max_step, int max_batch_size, 29 | int beam_size, int pad_id, size_t hidden_dim) 30 | : Operator("LaunchGptEmbOp"), 31 | _max_batch_tokens(max_batch_tokens), 32 | _max_batch_size(max_batch_size), 33 | _pad_id(pad_id), 34 | _max_step(max_step), 35 | _beam_size(beam_size), 36 | _hidden_dim(hidden_dim) {} 37 | 38 | virtual ~LaunchGptEmbOp() {} 39 | 40 | std::tuple operator()(Variable* inp_tokens, 41 | Variable* token_emb, 42 | Variable* pos_emb); 43 | 44 | void before_forward(size_t batch_size, size_t seq_len, int offset) { 45 | _batch_size = batch_size, _seq_len = seq_len, _offset = offset; 46 | _result->set_shape({batch_size * seq_len, _hidden_dim}); 47 | _pad_mask->set_shape({batch_size, seq_len + offset}); 48 | _left_pad_len->set_shape({_batch_size, size_t(_beam_size)}); 49 | } 50 | 51 | void forward() override; 52 | 53 | void backward() override { 54 | printf("ERROR! LaunchGptEmbOp can't cal backward()\n"); 55 | exit(-1); 56 | } 57 | }; 58 | } // namespace lightseq 59 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/gemm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "kernels.h" 5 | 6 | namespace lightseq { 7 | namespace x86 { 8 | 9 | // means inpA * inpB 10 | template <> 11 | void matrix_gemm(const float* inpA, const float* inpB, float* outC, int m, 12 | int n, int k) { 13 | const int64_t lda = k; 14 | const int64_t ldb = n; 15 | const int64_t ldc = n; 16 | 17 | CBLAS_TRANSPOSE trans_a = CblasNoTrans; 18 | CBLAS_TRANSPOSE trans_b = CblasNoTrans; 19 | 20 | cblas_sgemm(CblasRowMajor, trans_a, trans_b, m, n, k, 1, inpA, lda, inpB, ldb, 21 | 0, outC, ldc); 22 | return; 23 | } 24 | 25 | template <> 26 | void gemm(bool a_is_packed, bool b_is_packed, bool transpose_a, 27 | bool transpose_b, int64_t m, int64_t n, int64_t k, float alpha, 28 | const uint8_t* a, int64_t lda, const int8_t* b, int64_t ldb, 29 | float beta, int32_t* c, int64_t ldc, 30 | const int32_t* a_shift_compensation) { 31 | const bool use_packed_api = a_is_packed || b_is_packed; 32 | 33 | const CBLAS_TRANSPOSE trans_a = transpose_a ? CblasTrans : CblasNoTrans; 34 | const CBLAS_TRANSPOSE trans_b = transpose_b ? CblasTrans : CblasNoTrans; 35 | 36 | // if (use_packed_api) { 37 | // cblas_gemm_s8u8s32_compute( 38 | // CblasRowMajor, a_is_packed ? (MKL_INT)CblasPacked : (MKL_INT)trans_a, 39 | // b_is_packed ? 
(MKL_INT)CblasPacked : (MKL_INT)trans_b, 40 | // CblasRowOffset, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 41 | // a_shift_compensation); 42 | // } else { 43 | 44 | cblas_gemm_s8u8s32(CblasRowMajor, trans_a, trans_b, CblasRowOffset, m, n, k, 45 | alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 46 | a_shift_compensation); 47 | // } 48 | 49 | return; 50 | } 51 | 52 | } // namespace x86 53 | } // namespace lightseq 54 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/rms_norm_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "rms_layer_norm.h" 3 | #include "layer.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class RMSNormLayer : public Layer { 9 | private: 10 | int _hidden_size; 11 | int _max_batch_tokens; 12 | 13 | // operators 14 | RMSLayerNormalizeOp* _rms_norm = nullptr; 15 | 16 | // parameters 17 | Variable* _norm_scale; 18 | 19 | public: 20 | RMSNormLayer(int max_batch_tokens, int hidden_size) 21 | : Layer("RMSNormLayer"), 22 | _hidden_size(hidden_size), 23 | _max_batch_tokens(max_batch_tokens), 24 | _rms_norm(new RMSLayerNormalizeOp(max_batch_tokens, hidden_size, 25 | false)) { 26 | _norm_scale = new Variable("_norm_scale", g_dtype(), g_dtype()); 27 | 28 | this->_context_ptr->exit_layer(); // necessary 29 | } 30 | 31 | virtual ~RMSNormLayer() {} 32 | 33 | Variable* operator()(Variable* inp) { 34 | set_inputs({inp}); 35 | 36 | Variable* out = std::get<0>((*_rms_norm)(inp, _norm_scale)); 37 | 38 | set_outputs({out}); 39 | return out; 40 | } 41 | 42 | void before_forward(int batch_size, int seq_len) { 43 | _rms_norm->before_forward(batch_size, seq_len); 44 | } 45 | 46 | void before_backward() {} 47 | 48 | int load_params(const std::vector& para_vec, int offset) { 49 | int size = 0; 50 | _norm_scale->set_value((char*)para_vec[offset + size]), size++; 51 | _norm_scale->set_shape({size_t(_hidden_size)}); 52 | return size; 53 | } 54 | }; 55 | 56 | template class RMSNormLayer; 57 | #ifdef LIGHTSEQ_cuda 58 | template class RMSNormLayer<__half, __half>; 59 | #endif 60 | 61 | template 62 | using RMSNormLayerPtr = std::shared_ptr>; 63 | 64 | } // namespace lightseq 65 | -------------------------------------------------------------------------------- /examples/training/huggingface/gcq/ls_hf_gcq_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.distributed as dist 4 | from transformers import Trainer 5 | from packaging import version 6 | from lightseq.training.gcq import ( 7 | GCQState, 8 | encode_and_decode, 9 | ) 10 | from examples.training.huggingface.gcq import GCQArguments 11 | 12 | logger = logging.getLogger("lightseq_hf_trainer") 13 | 14 | 15 | class LSTrainer(Trainer): 16 | """ 17 | LSTrainer supports GCQ (Gradient Communication Quantization) for distributed multi-machine training 18 | based on transformers.Trainer. 19 | """ 20 | 21 | def __init__(self, gcq_args: GCQArguments = None, *args, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | logger.setLevel(logging.INFO if self.args.should_log else logging.WARN) 24 | self.gcq_args = gcq_args 25 | 26 | def _wrap_model(self, model, training=True, dataloader=None): 27 | model = super()._wrap_model(model, training, dataloader) 28 | # Enable GCQ. 
29 | if isinstance(model, torch.nn.parallel.DistributedDataParallel) and getattr( 30 | self.gcq_args, "enable_GCQ", False 31 | ): 32 | assert version.parse(torch.__version__) >= version.parse( 33 | "1.10" 34 | ), "Training with GCQ requires that the version of torch has to be greater than or equal to 1.10!" 35 | state = GCQState( 36 | process_group=dist.group.WORLD if dist.is_initialized() else None, 37 | hidden_size=self.gcq_args.hidden_size, 38 | quantile_value=self.gcq_args.GCQ_quantile, 39 | ) 40 | # Register the communication hook. 41 | model.register_comm_hook(state=state, hook=encode_and_decode) 42 | logger.info("############ register communication hook done ###########") 43 | 44 | return model 45 | --------------------------------------------------------------------------------
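`LSTrainer` above differs from `transformers.Trainer` only in `_wrap_model`, where it registers the GCQ communication hook on the DDP-wrapped model. A rough wiring sketch, assuming `GCQArguments` is a dataclass exposing the `enable_GCQ`, `hidden_size`, and `GCQ_quantile` fields read above, and that the model and dataset objects are prepared elsewhere (the import paths are illustrative):

```python
# Hypothetical launch sketch for the GCQ-enabled Hugging Face trainer above.
# GCQArguments construction, import paths, and the model/dataset objects are
# assumptions; only fields actually read by LSTrainer are set here.
from transformers import TrainingArguments

from examples.training.huggingface.gcq import GCQArguments
from ls_hf_gcq_trainer import LSTrainer  # the class defined above

training_args = TrainingArguments(
    output_dir="/tmp/test-ner",
    per_device_train_batch_size=16,
    num_train_epochs=10,
    fp16=True,
)
gcq_args = GCQArguments(
    enable_GCQ=True,   # register the gradient-quantization comm hook
    hidden_size=768,   # passed through to GCQState
    GCQ_quantile=0.99,
)

trainer = LSTrainer(
    gcq_args=gcq_args,
    model=model,                  # assumed: a BERT-style model built elsewhere
    args=training_args,
    train_dataset=train_dataset,  # assumed: preprocessed dataset
)
trainer.train()  # the hook is attached when the Trainer wraps the model for DDP
```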