├── tests ├── __init__.py ├── cublas │ └── build.sh ├── gemm_test │ ├── gemm_test.sh │ └── gemm_test.py └── huggingface │ └── test_gpt.py ├── lightseq ├── csrc │ ├── export │ │ └── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── cuda │ │ │ └── __init__.py │ ├── .gitignore │ ├── layers_new │ │ ├── launch_gpt_emb_layer.cpp │ │ ├── CMakeLists.txt │ │ ├── includes │ │ │ ├── linear_layer.h │ │ │ ├── llama_layer.h │ │ │ ├── crf_layer.h │ │ │ ├── sample_layer.h │ │ │ ├── encdec_kv_layer.h │ │ │ ├── llama_mlp_layer.h │ │ │ ├── sdpa_layer.h │ │ │ ├── gpt_layer.h │ │ │ ├── transformer_encoder_layer.h │ │ │ ├── generator_layer.h │ │ │ └── rms_norm_layer.h │ │ └── llama_layer.cpp │ ├── lsflow │ │ ├── README.md │ │ ├── includes │ │ │ ├── allocator.h │ │ │ ├── lsflow_util.h │ │ │ └── shape.h │ │ ├── CMakeLists.txt │ │ ├── operator.cpp │ │ ├── shape.cpp │ │ ├── lsflow_util.cpp │ │ └── allocator.cpp │ ├── kernels │ │ ├── x86 │ │ │ ├── includes │ │ │ │ ├── kernel_headers.h │ │ │ │ └── kernels.h │ │ │ ├── CMakeLists.txt │ │ │ ├── util.cc │ │ │ └── gemm.cpp │ │ ├── arm │ │ │ ├── gemm.cc │ │ │ ├── includes │ │ │ │ ├── utils.h │ │ │ │ └── kernel_headers.h │ │ │ ├── CMakeLists.txt │ │ │ └── utils.cc │ │ └── cuda │ │ │ ├── includes │ │ │ ├── ls_cub.cuh │ │ │ ├── kernel_headers.h │ │ │ ├── embKernels.h │ │ │ ├── cuda_util.h │ │ │ └── llama_kernels.h │ │ │ └── CMakeLists.txt │ ├── tensorflow │ │ └── README.md │ ├── models │ │ ├── includes │ │ │ ├── model_util.h │ │ │ ├── bert.h │ │ │ └── bert_crf.h │ │ ├── model_util.cc │ │ └── CMakeLists.txt │ ├── pybind │ │ └── CMakeLists.txt │ ├── proto │ │ ├── includes │ │ │ ├── proto_headers.h │ │ │ ├── test_model_weight.h │ │ │ └── hdf5_util.h │ │ └── CMakeLists.txt │ ├── pytorch │ │ ├── builder │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── pytorch_quantization │ │ │ ├── optim │ │ │ │ └── __init__.py │ │ │ ├── nn │ │ │ │ ├── modules │ │ │ │ │ └── __init__.py │ │ │ │ ├── _functions │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ ├── version.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── quant_logging.py │ │ │ ├── __init__.py │ │ │ └── calib │ │ │ │ └── __init__.py │ │ └── sdpa_layers.py │ ├── example │ │ └── CMakeLists.txt │ ├── ops_new │ │ ├── CMakeLists.txt │ │ ├── includes │ │ │ ├── fuse_add2_op.h │ │ │ ├── transform_0213.h │ │ │ ├── dropout.h │ │ │ ├── act_elewise_product.h │ │ │ ├── crf.h │ │ │ ├── layer_normalize.h │ │ │ ├── rms_layer_norm.h │ │ │ ├── bias_add_transform_20314.h │ │ │ ├── bias_dropout_residual.h │ │ │ ├── bias_act_dropout.h │ │ │ ├── softmax.h │ │ │ ├── concat3_dim1.h │ │ │ ├── launch_enc_emb.h │ │ │ ├── launch_dec_emb_op.h │ │ │ ├── sampling.h │ │ │ ├── launch_llama_emb.h │ │ │ ├── linear.h │ │ │ └── launch_gpt_emb.h │ │ ├── fuse_add2_op.cpp │ │ ├── act_elewise_product.cpp │ │ ├── fuse_rotary_position_qkv.cpp │ │ ├── transform_0213.cpp │ │ ├── dropout.cpp │ │ ├── launch_dec_emb_op.cpp │ │ └── launch_enc_emb.cpp │ ├── ops │ │ └── includes │ │ │ ├── context.h │ │ │ └── softmax.h │ ├── layers │ │ └── includes │ │ │ └── cross_entropy_layer.h │ └── triton_backend │ │ └── src │ │ ├── triton_utils.h │ │ └── libtriton_minimal.ldscript ├── training │ ├── cli │ │ ├── __init__.py │ │ ├── fs_modules │ │ │ └── __init__.py │ │ ├── lightseq_deepspeed_cli.py │ │ ├── lightseq_fairseq_generate_cli.py │ │ ├── lightseq_fairseq_validate_cli.py │ │ └── lightseq_fairseq_train_cli.py │ ├── ops │ │ ├── __init__.py │ │ ├── tensorflow │ │ │ ├── __init__.py │ │ │ └── README.md │ │ └── pytorch │ │ │ ├── __init__.py │ │ │ └── builder │ │ │ ├── __init__.py │ │ 
│ └── adam_builder.py │ ├── gcq │ │ ├── __init__.py │ │ └── ls_fs_gcq_trainer.py │ ├── pytorch_quantization │ │ ├── optim │ │ │ └── __init__.py │ │ ├── nn │ │ │ ├── modules │ │ │ │ └── __init__.py │ │ │ ├── _functions │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── version.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── quant_logging.py │ │ ├── __init__.py │ │ └── calib │ │ │ └── __init__.py │ └── __init__.py ├── __init__.py └── inference │ ├── server │ └── libserver.ldscript │ ├── tools │ └── CMakeLists.txt │ ├── kernels │ ├── CMakeLists.txt │ ├── t5EmbKernels.h │ ├── embKernels_int8.h │ ├── t5Kernels.h │ ├── embKernels.h │ └── multilgKernels.h │ ├── pywrapper │ ├── vit.h │ ├── bert.h │ ├── quant_vit.h │ ├── quant_bert.h │ ├── gpt.h │ └── quant_gpt.h │ └── triton_backend │ └── src │ ├── triton_utils.h │ └── libtriton_minimal.ldscript ├── examples ├── inference │ ├── python │ │ ├── __init__.py │ │ ├── export │ │ │ ├── __init__.py │ │ │ ├── fairseq │ │ │ │ └── __init__.py │ │ │ ├── proto │ │ │ │ └── __init__.py │ │ │ ├── huggingface │ │ │ │ └── __init__.py │ │ │ └── util.py │ │ └── test │ │ │ └── ls_fairseq.sh │ ├── cpp │ │ └── CMakeLists.txt │ ├── benchmark_gpt.sh │ ├── benchmark_quant_gpt.sh │ ├── benchmark_bart.sh │ └── benchmark_quant_bart.sh ├── training │ ├── neurst │ │ ├── __init__.py │ │ └── README.md │ ├── deepspeed │ │ ├── __init__.py │ │ ├── deepspeed_config.json │ │ ├── README.md │ │ ├── ds_fairseq_argument.py │ │ └── ds_fairseq_wmt14en2de.sh │ ├── huggingface │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── task_ner │ │ │ │ ├── run_ner.sh │ │ │ │ ├── run_quant_ner.sh │ │ │ │ └── run_gcq_ner.sh │ │ │ ├── task_qa │ │ │ │ ├── run_qa.sh │ │ │ │ ├── run_quant_qa.sh │ │ │ │ └── run_gcq_qa.sh │ │ │ └── task_glue │ │ │ │ ├── run_glue.sh │ │ │ │ └── run_quant_glue.sh │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── run_clm.sh │ │ │ ├── run_quant_clm.sh │ │ │ └── run_gcq_clm.sh │ │ ├── vit │ │ │ ├── __init__.py │ │ │ ├── run_vit.sh │ │ │ └── run_quant_vit.sh │ │ ├── gcq │ │ │ ├── __init__.py │ │ │ ├── cli_utils.py │ │ │ └── ls_hf_gcq_trainer.py │ │ └── bart │ │ │ └── summarization │ │ │ ├── requirements.txt │ │ │ └── run_summarization.sh │ ├── custom │ │ ├── run.sh │ │ ├── run_quant.sh │ │ └── README.md │ └── fairseq │ │ ├── requirements.txt │ │ ├── ls_finetune_bart │ │ ├── convert_lightseq_to_huggingface.sh │ │ └── ls_fairseq_summarization_cnn_dm.sh │ │ ├── native_fairseq_wmt14en2de.sh │ │ ├── ls_fairseq_wmt14en2de.sh │ │ ├── ls_torch_fairseq_wmt14en2de.sh │ │ ├── ls_fairseq_quant_wmt14en2de.sh │ │ ├── ls_torch_fairseq_quant_wmt14en2de.sh │ │ └── ls_fairseq_gcq_wmt14en2de.sh └── triton_backend │ ├── model_repo │ ├── bert_example │ │ ├── 1 │ │ │ └── .gitignore │ │ └── config.pbtxt │ ├── gpt_example │ │ ├── 1 │ │ │ └── .gitignore │ │ └── config.pbtxt │ └── transformer_example │ │ ├── 1 │ │ └── .gitignore │ │ └── config.pbtxt │ └── transformer_client_example.py ├── CODEOWNERS ├── docs ├── images │ ├── nmt.png │ ├── logo.png │ ├── features.png │ ├── support.png │ ├── generation.png │ ├── single_step.png │ └── total_time.png └── examples.md ├── .clang-format ├── docker └── README.md ├── MANIFEST.in ├── .gitmodules ├── CONTRIBUTING.md ├── .pre-commit-config.yaml └── .github └── workflows └── build_check.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/export/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/training/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/training/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/neurst/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | -------------------------------------------------------------------------------- /examples/training/deepspeed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/export/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/training/huggingface/vit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.0.1" 2 | -------------------------------------------------------------------------------- /lightseq/training/ops/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/export/fairseq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/inference/python/export/proto/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/launch_gpt_emb_layer.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/examples/inference/python/export/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/bert_example/1/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/gpt_example/1/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/transformer_example/1/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @neopro12 @godweiyang @Taka152 @hexisyztem @zjersey 2 | -------------------------------------------------------------------------------- /examples/training/custom/run.sh: -------------------------------------------------------------------------------- 1 | python3 examples/training/custom/run.py 2 | -------------------------------------------------------------------------------- /lightseq/training/gcq/__init__.py: -------------------------------------------------------------------------------- 1 | from .gcq import GCQ, GCQState, encode_and_decode 2 | -------------------------------------------------------------------------------- /docs/images/nmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/nmt.png -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/logo.png -------------------------------------------------------------------------------- /examples/training/custom/run_quant.sh: -------------------------------------------------------------------------------- 1 | python3 examples/training/custom/run.py --enable_quant 2 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/README.md: -------------------------------------------------------------------------------- 1 | LsFlow is an extremely clean implementation of a computation graph.
2 | -------------------------------------------------------------------------------- /docs/images/features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/features.png -------------------------------------------------------------------------------- /docs/images/support.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/support.png -------------------------------------------------------------------------------- /docs/images/generation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/generation.png -------------------------------------------------------------------------------- /docs/images/single_step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/single_step.png -------------------------------------------------------------------------------- /docs/images/total_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/lightseq/HEAD/docs/images/total_time.png -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | --- 4 | Language: Cpp 5 | ColumnLimit: 80 6 | SortIncludes: false 7 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.h" 3 | #include "kernels.h" 4 | -------------------------------------------------------------------------------- /examples/training/huggingface/gcq/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli_utils import GCQArguments 2 | from .ls_hf_gcq_trainer import LSTrainer 3 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | ## Dockerfiles of lightseq 2 | 3 | PyPI: for publishing the Python package. 4 | 5 | Tritonserver: for publishing the Triton server. 6 | -------------------------------------------------------------------------------- /examples/training/fairseq/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.19.5 2 | sacrebleu==1.5.1 3 | sacremoses 4 | fairseq==0.10.2 5 | lightseq 6 | ninja 7 | -------------------------------------------------------------------------------- /lightseq/csrc/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | ## Please refer to [NeurST](https://github.com/bytedance/neurst/tree/lightseq) for more information. 2 | -------------------------------------------------------------------------------- /lightseq/training/ops/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | ## Please refer to [NeurST](https://github.com/bytedance/neurst/tree/lightseq) for more information.
2 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/requirements.txt: -------------------------------------------------------------------------------- 1 | torch >= 1.3 2 | datasets >= 1.8.0 3 | sentencepiece != 0.1.92 4 | protobuf 5 | transformers == 4.16.2 6 | -------------------------------------------------------------------------------- /tests/cublas/build.sh: -------------------------------------------------------------------------------- 1 | nvcc -c gemm.cu -o gemm.cuda.o 2 | nvcc gemm.cuda.o test.cpp -o test -L/usr/local/cuda/lib64 -lcudart -lcuda -lcublas -lcublasLt 3 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/gemm.cc: -------------------------------------------------------------------------------- 1 | #include "kernel_headers.h" 2 | 3 | namespace lightseq { 4 | namespace arm {} // namespace arm 5 | } // namespace lightseq 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.txt 2 | global-include *.cu *.cpp *.cc *.cuh *.h *.ldscript *.proto *.cmake 3 | prune dist 4 | prune build 5 | prune tests 6 | prune examples 7 | -------------------------------------------------------------------------------- /lightseq/inference/server/libserver.ldscript: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | CustomErrorString; 4 | CustomExecute; 5 | CustomFinalize; 6 | CustomInitialize; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /examples/training/huggingface/bart/summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.8.0 3 | sentencepiece != 0.1.92 4 | protobuf 5 | rouge-score 6 | nltk 7 | py7zr 8 | torch >= 1.3 9 | -------------------------------------------------------------------------------- /lightseq/csrc/models/includes/model_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | 4 | namespace lightseq { 5 | 6 | GenerateMethod get_generate_method(std::string method_); 7 | 8 | } // namespace lightseq 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/pybind11"] 2 | path = 3rdparty/pybind11 3 | url = https://github.com/pybind/pybind11.git 4 | [submodule "3rdparty/cub"] 5 | path = 3rdparty/cub 6 | url = https://github.com/NVIDIA/cub.git 7 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/includes/utils.h: -------------------------------------------------------------------------------- 1 | #include "cstdio" 2 | #include "iostream" 3 | 4 | namespace lightseq { 5 | 6 | template 7 | void print_vec(const T *outv, std::string outn, int num_output_ele); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /lightseq/training/cli/fs_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .ls_adam import LSFSAdam 2 | from .ls_label_smoothed_cross_entropy import LSLabelSmoothedCrossEntropyCriterion 3 | from .ls_transformer import 
LSTransformerModel 4 | from .ls_bart import LSBARTModel 5 | from .ls_translation import LSTranslationTask 6 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | set(lightseq_kernel_files gemm.cc utils.cc) 5 | 6 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 7 | target_include_directories(lightseq_kernels INTERFACE includes) 8 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "utils.h" 14 | -------------------------------------------------------------------------------- /lightseq/csrc/pybind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(LS_PYBIND_KERNEL_FILES pybind_model.cpp) 4 | pybind11_add_module(lightseq MODULE ${LS_PYBIND_KERNEL_FILES}) 5 | target_link_libraries(lightseq PUBLIC liblightseq lightseq_kernels) 6 | set_target_properties(lightseq PROPERTIES OUTPUT_NAME inference) 7 | -------------------------------------------------------------------------------- /tests/gemm_test/gemm_test.sh: -------------------------------------------------------------------------------- 1 | python3 gemm_test.py -hd 1024 -id 4096 -minb 1 -maxb 10000 -d configs 2 | python3 gemm_test.py -hd 512 -id 2048 -minb 1 -maxb 10000 -d configs 3 | python3 gemm_test.py -hd 768 -id 3072 -minb 1 -maxb 10000 -d configs 4 | 5 | mkdir -p $HOME/.lightseq/igemm_configs 6 | cp configs/* $HOME/.lightseq/igemm_configs 7 | -------------------------------------------------------------------------------- /lightseq/csrc/tests/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | par_dir = os.path.dirname(cur_dir) 5 | csrc_dir = os.path.dirname(par_dir) 6 | lightseq_dir = os.path.dirname(csrc_dir) 7 | 8 | sys.path.insert(0, lightseq_dir) 9 | sys.path.insert(0, os.path.dirname(lightseq_dir)) 10 | -------------------------------------------------------------------------------- /lightseq/inference/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | 6 | add_library(utils STATIC util.cc.cu) 7 | target_include_directories(utils PUBLIC ${HDF5_INCLUDE_DIRS}) 8 | target_include_directories(utils INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 9 | target_link_libraries(utils PRIVATE ${HDF5_LIBRARIES}) 10 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_deepspeed_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from deepspeed.launcher.runner import main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 
10 | main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_fairseq_generate_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from fairseq_cli.generate import cli_main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 10 | cli_main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_fairseq_validate_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from fairseq_cli.validate import cli_main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 10 | cli_main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_finetune_bart/convert_lightseq_to_huggingface.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | # The model's directory should contain both source and target vocabulary files 4 | fairseq_path=/path/to/model.pt 5 | save_dir=/path/to/save_dir 6 | 7 | python3 convert_lightseq_to_huggingface.py \ 8 | --fairseq_path $fairseq_path \ 9 | --pytorch_dump_folder_path $save_dir \ 10 | --hf_config facebook/bart-base 11 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/includes/proto_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "hdf5.h" 14 | 15 | #include "declaration.h" 16 | -------------------------------------------------------------------------------- /lightseq/training/cli/lightseq_fairseq_train_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | from lightseq.training.gcq.ls_fs_gcq_train import cli_main 5 | 6 | 7 | def ls_cli_main(*args, **kwargs): 8 | user_path = pathlib.Path(__file__).parent.joinpath("fs_modules") 9 | sys.argv.extend(["--user-dir", str(user_path)]) 10 | cli_main(*args, **kwargs) 11 | 12 | 13 | if __name__ == "__main__": 14 | ls_cli_main() 15 | -------------------------------------------------------------------------------- /lightseq/training/ops/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch_transformer_layers import ( 2 | TransformerEncoderLayer, 3 | TransformerDecoderLayer, 4 | TransformerEmbeddingLayer, 5 | ) 6 | from .quantization import TensorQuantizer, act_quant_config, QuantLinear 7 | from .builder.transformer_builder import TransformerBuilder 8 | from .builder.operator_builder import OperatorBuilder 9 | from .builder.layer_builder import LayerBuilder 10 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/ls_cub.cuh: 
-------------------------------------------------------------------------------- 1 | // copied from https://github.com/dmlc/dgl/pull/2758 2 | #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_ 3 | #define DGL_ARRAY_CUDA_DGL_CUB_CUH_ 4 | 5 | #define CUB_NS_PREFIX namespace ls { 6 | #define CUB_NS_POSTFIX } 7 | #define CUB_NS_QUALIFIER ::ls::cub 8 | #include "cub/cub.cuh" 9 | #include "cub/util_allocator.cuh" 10 | #undef CUB_NS_POSTFIX 11 | #undef CUB_NS_PREFIX 12 | #undef CUB_NS_QUALIFIER 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/includes/allocator.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | #pragma once 5 | #include "declaration.h" 6 | 7 | namespace lightseq { 8 | 9 | class Allocator { 10 | private: 11 | std::unordered_set _ptr_set; 12 | 13 | public: 14 | Allocator(); 15 | virtual ~Allocator(); 16 | char* malloc_mem(size_t size); 17 | void free_mem(char* ptr); 18 | }; 19 | 20 | } // namespace lightseq 21 | -------------------------------------------------------------------------------- /lightseq/csrc/models/model_util.cc: -------------------------------------------------------------------------------- 1 | #include "model_util.h" 2 | 3 | namespace lightseq { 4 | 5 | GenerateMethod get_generate_method(std::string method_) { 6 | if (method_ == "topk") return GenerateMethod::Topk; 7 | if (method_ == "topp") return GenerateMethod::Topp; 8 | if (method_ == "beam_search") return GenerateMethod::BeamSearch; 9 | 10 | printf("Error!\n"); 11 | return GenerateMethod::UnDefined; 12 | } 13 | 14 | } // namespace lightseq 15 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | 5 | set(lightseq_kernel_files util.cc gemm.cpp) 6 | 7 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 8 | target_include_directories(lightseq_kernels PUBLIC ${HDF5_INCLUDE_DIRS}) 9 | target_include_directories(lightseq_kernels INTERFACE includes) 10 | target_link_libraries(lightseq_kernels PRIVATE ${HDF5_LIBRARIES}) 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | First of all, thanks for taking the time to contribute! 4 | 5 | Please refer to the following guidelines to contribute new functionality or bug fixes: 6 | 7 | 1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code. 8 | 2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to LightSeq C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). 
9 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(LightseqProtoType LANGUAGES CXX CUDA) 4 | 5 | find_package(Threads REQUIRED) 6 | 7 | set(CMAKE_CXX_STANDARD 14) 8 | 9 | add_library( 10 | lsflow STATIC 11 | context.cpp 12 | node.cpp 13 | manager.cpp 14 | layer.cpp 15 | tensor.cpp 16 | allocator.cpp 17 | lsflow_util.cpp 18 | operator.cpp 19 | shape.cpp 20 | variable.cpp) 21 | 22 | target_link_libraries(lsflow PUBLIC lightseq_kernels) 23 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import CUDAOpBuilder 2 | from .cuda_kernel_builder import CudaKernelBuilder 3 | from .x86_kernel_builder import X86KernelBuilder 4 | from .cuda_layer_builder import CudaLayerBuilder 5 | 6 | # TODO: infer this list instead of hard coded 7 | # List of all available ops 8 | __op_builders__ = [ 9 | CudaKernelBuilder(), 10 | CudaLayerBuilder(), 11 | X86KernelBuilder(), 12 | ] 13 | 14 | ALL_OPS = {op.name: op for op in __op_builders__} 15 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/includes/lsflow_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | 5 | #pragma once 6 | #include "declaration.h" 7 | 8 | namespace lightseq { 9 | 10 | /* Print run time, for debug */ 11 | void print_time_duration( 12 | const std::chrono::high_resolution_clock::time_point &start, 13 | std::string duration_name); 14 | 15 | #ifdef LIGHTSEQ_cuda 16 | cublasOperation_t op_from_custom(MATRIX_OP op_type); 17 | #endif 18 | 19 | } // namespace lightseq 20 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | csrc_dir = os.path.dirname(cur_dir) 5 | lightseq_dir = os.path.dirname(csrc_dir) 6 | sys.path.insert(0, lightseq_dir) 7 | 8 | from .builder.cuda_kernel_builder import CudaKernelBuilder 9 | from .builder.x86_kernel_builder import X86KernelBuilder 10 | from .builder.cuda_layer_builder import CudaLayerBuilder 11 | 12 | from .torch_transformer_layers import TransformerEncoderLayer, TransformerDecoderLayer 13 | -------------------------------------------------------------------------------- /examples/inference/python/test/ls_fairseq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | until [[ -z "$1" ]] 4 | do 5 | case $1 in 6 | -m) 7 | shift; MODEL=$1; 8 | shift;; 9 | *) 10 | shift;; 11 | esac 12 | done 13 | 14 | lightseq-infer /tmp/wmt14_en_de/ \ 15 | --gen-subset test \ 16 | --path ${MODEL} \ 17 | --task translation \ 18 | --batch-size 128 \ 19 | --beam 4 \ 20 | --lenpen 0.6 \ 21 | --fp16 \ 22 | --quiet \ 23 | --scoring sacrebleu 24 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | 
set(cuda_kernel_files 4 | gptKernels.cc.cu 5 | gptKernels_int8.cc.cu 6 | transformerKernels.cc.cu 7 | multilgKernels.cc.cu 8 | embKernels.cc.cu 9 | embKernels_int8.cc.cu 10 | transformerKernels_int8.cc.cu 11 | moeKernels.cc.cu 12 | t5Kernels.cc.cu 13 | t5EmbKernels.cc.cu) 14 | 15 | add_library(cuda_kernels STATIC ${cuda_kernel_files}) 16 | target_include_directories(cuda_kernels INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 17 | -------------------------------------------------------------------------------- /lightseq/csrc/example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | add_executable(bert_example bert_example.cc) 4 | target_link_libraries(bert_example PUBLIC liblightseq) 5 | 6 | add_executable(transformer_example transformer_example.cc) 7 | target_link_libraries(transformer_example PUBLIC liblightseq) 8 | 9 | add_executable(gpt_example gpt_example.cc) 10 | target_link_libraries(gpt_example PUBLIC liblightseq) 11 | 12 | add_executable(llama_example llama_example.cc) 13 | target_link_libraries(llama_example PUBLIC liblightseq) 14 | -------------------------------------------------------------------------------- /examples/training/huggingface/gcq/cli_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class GCQArguments: 6 | """ 7 | Arguments Gradient Communication Quantization. 8 | """ 9 | 10 | enable_GCQ: bool = field(default=False, metadata={"help": "Whether to enable GCQ"}) 11 | GCQ_quantile: float = field( 12 | default=0.99, metadata={"help": "GCQ quantile value, between 0.0-1.0"} 13 | ) 14 | hidden_size: int = field( 15 | default=1024, metadata={"help": "The hidden size of model"} 16 | ) 17 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/includes/test_model_weight.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bert.pb.h" 3 | #include "proto_headers.h" 4 | #include "proto_util.h" 5 | 6 | namespace lightseq { 7 | template 8 | class TestModelWeight { 9 | private: 10 | const T* _p_d_weight_emb; 11 | std::vector _d_weight_emb; 12 | 13 | public: 14 | TestModelWeight(int weight_size) { 15 | _d_weight_emb.clear(); 16 | for (int i = 0; i < weight_size; i++) { 17 | _d_weight_emb.push_back(rand() % 100); 18 | } 19 | } 20 | const T*& weight_emb() const { return _p_d_weight_emb; } 21 | }; 22 | } // namespace lightseq 23 | -------------------------------------------------------------------------------- /lightseq/training/ops/pytorch/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import CUDAOpBuilder 2 | from .kernel_builder import KernelBuilder 3 | from .transformer_builder import TransformerBuilder 4 | from .operator_builder import OperatorBuilder 5 | from .adam_builder import AdamBuilder 6 | from .layer_builder import LayerBuilder 7 | 8 | # TODO: infer this list instead of hard coded 9 | # List of all available ops 10 | __op_builders__ = [ 11 | LayerBuilder(), 12 | KernelBuilder(), 13 | OperatorBuilder(), 14 | TransformerBuilder(), 15 | AdamBuilder(), 16 | ] 17 | ALL_OPS = {op.name: op for op in __op_builders__} 18 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/gpt_example/config.pbtxt: 
-------------------------------------------------------------------------------- 1 | name: "gpt_example" 2 | backend: "lightseq" 3 | max_batch_size: 8 4 | input [ 5 | { 6 | name: "token_ids" 7 | data_type: TYPE_INT32 8 | dims: [ -1 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "result" 14 | data_type: TYPE_INT32 15 | dims: [ -1 ] 16 | } 17 | ] 18 | instance_group [ 19 | { 20 | count: 1 21 | kind: KIND_GPU 22 | } 23 | ] 24 | default_model_filename: "lightseq_gpt2_base.hdf5" 25 | parameters: [ 26 | { 27 | key: "model_type" 28 | value: { 29 | string_value: "Gpt" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/bert_example/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "bert_example" 2 | backend: "lightseq" 3 | max_batch_size: 8 4 | input [ 5 | { 6 | name: "token_ids" 7 | data_type: TYPE_INT32 8 | dims: [ -1 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "encoder_output" 14 | data_type: TYPE_INT32 15 | dims: [ -1 ] 16 | } 17 | ] 18 | instance_group [ 19 | { 20 | count: 1 21 | kind: KIND_GPU 22 | } 23 | ] 24 | default_model_filename: "lightseq_bert_base_uncased.hdf5" 25 | parameters: [ 26 | { 27 | key: "model_type" 28 | value: { 29 | string_value: "Bert" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /lightseq/csrc/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(liblightseq SHARED bert.cc bert_crf.cc transformer.cu gpt.cc 2 | llama.cc model_util.cc) 3 | 4 | target_link_libraries(liblightseq PUBLIC lightseq_layers) 5 | 6 | target_link_libraries(liblightseq PUBLIC weight_lib) 7 | 8 | target_link_options(liblightseq PUBLIC $) 10 | 11 | target_include_directories(liblightseq PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 12 | 13 | set_target_properties(liblightseq PROPERTIES OUTPUT_NAME lightseq) 14 | 15 | # add_executable(test_example test_layer.cc) target_link_libraries(test_example 16 | # PUBLIC liblightseq) 17 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/run_clm.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | 5 | python3 -m torch.distributed.launch \ 6 | --nproc_per_node=1 \ 7 | $THIS_DIR/run_clm.py \ 8 | --model_name_or_path gpt2 \ 9 | --dataset_name wikitext \ 10 | --dataset_config_name wikitext-103-raw-v1 \ 11 | --per_device_train_batch_size 16 \ 12 | --per_device_eval_batch_size 8 \ 13 | --num_train_epochs 1 \ 14 | --do_train \ 15 | --do_eval \ 16 | --output_dir /tmp/test-clm \ 17 | --overwrite_output_dir \ 18 | --fp16 \ 19 | --logging_steps 10 \ 20 | --block_size 512 \ 21 | --module_type 1 \ 22 | --enable_quant false 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | 8 | - repo: https://gitlab.com/daverona/pre-commit-cpp 9 | rev: 0.8.0 10 | hooks: 11 | - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available 12 | args: [-style=file] 13 | 14 | - repo: https://github.com/psf/black 15 | rev: 22.3.0 16 | hooks: 17 | - id: black 18 | 19 | - repo: https://github.com/cheshirekow/cmake-format-precommit 20 | rev: v0.6.10 21 | hooks: 22 | - id: cmake-format 23 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "kernels.h" 18 | #include "embKernels.h" 19 | #include "gptKernels.h" 20 | #include "transformerKernels.h" 21 | #include "cuda_util.h" 22 | #include "cublas_wrappers.h" 23 | #include "llama_kernels.h" 24 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(cuda_kernel_files 4 | util.cc.cu 5 | cross_entropy.cu 6 | cublas_wrappers.cu 7 | cuda_util.cu 8 | dropout_kernels.cu 9 | embedding_kernels.cu 10 | embKernels.cc.cu 11 | # fused_adam_kernel.cu 12 | general_kernels.cu 13 | gptKernels.cc.cu 14 | llama_kernels.cu 15 | normalize_kernels.cu 16 | softmax_kernels.cu 17 | softmax_kernels_new.cu 18 | transform_kernels.cu 19 | transform_kernels_new.cu 20 | crf.cu 21 | transformerKernels.cc.cu) 22 | 23 | add_library(lightseq_kernels STATIC ${cuda_kernel_files}) 24 | target_link_libraries(lightseq_kernels PUBLIC -lcublas) 25 | -------------------------------------------------------------------------------- /examples/training/custom/README.md: -------------------------------------------------------------------------------- 1 | # Build models from scratch 2 | This repo contains an example of how to use LightSeq to build a model from scratch. In this example, we train a Transformer model using the LightSeq Transformer model, cross entropy layer and Adam optimizer. 3 | 4 | The source inputs of the encoder are batches of sentences and the target outputs of the decoder are their corresponding replies. We use the Hugging Face tokenizer to obtain the token indexes of the sentences.
5 | 6 | You can run the example simply by: 7 | ```shell 8 | sh examples/training/custom/run.sh 9 | ``` 10 | 11 | (Optional) You can also train the model using int8 mixed-precision: 12 | ```shell 13 | sh examples/training/custom/run_quant.sh 14 | ``` 15 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/run_quant_clm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | 5 | python3 -m torch.distributed.launch \ 6 | --nproc_per_node=1 \ 7 | $THIS_DIR/run_clm.py \ 8 | --model_name_or_path gpt2 \ 9 | --dataset_name wikitext \ 10 | --dataset_config_name wikitext-103-raw-v1 \ 11 | --per_device_train_batch_size 16 \ 12 | --per_device_eval_batch_size 8 \ 13 | --num_train_epochs 2 \ 14 | --do_train \ 15 | --do_eval \ 16 | --output_dir /tmp/quant/test-clm \ 17 | --overwrite_output_dir \ 18 | --resume_from_checkpoint /tmp/test-clm \ 19 | --fp16 \ 20 | --logging_steps 10 \ 21 | --block_size 512 \ 22 | --module_type 1 \ 23 | --enable_quant true 24 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/includes/kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cstdio" 3 | #include "util.h" 4 | 5 | namespace lightseq { 6 | namespace x86 { 7 | 8 | template 9 | void matrix_gemm(const InType* inpA, const InType* inpB, OutType* outC, int m, 10 | int n, int k); 11 | 12 | template 13 | void gemm(bool a_is_packed, bool b_is_packed, bool transpose_a, 14 | bool transpose_b, int64_t m, int64_t n, int64_t k, float alpha, 15 | const AType* a, int64_t lda, const BType* b, int64_t ldb, float beta, 16 | CType* c, int64_t ldc, const CType* a_shift_compensation = nullptr); 17 | 18 | } // namespace x86 19 | } // namespace lightseq 20 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/operator.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | namespace lightseq { 3 | 4 | Operator::Operator(std::string name) : Node(name, NodeType::Operator) { 5 | _context_ptr->add_op(this); 6 | } 7 | 8 | void Operator::check_override_grad() { 9 | for (Node* p : this->_parents) { 10 | Variable* rp = static_cast(p); 11 | if (!rp->enable_override_grad()) { 12 | printf("can not override"); 13 | exit(-1); 14 | } 15 | } 16 | return; 17 | } 18 | 19 | void Operator::set_children(std::vector children) { 20 | if (!this->_children.empty()) { 21 | printf("children not empty!"); 22 | exit(-1); 23 | } 24 | for (Node* iter : children) { 25 | iter->set_parents({this}); 26 | } 27 | } 28 | } // namespace lightseq 29 | -------------------------------------------------------------------------------- /examples/triton_backend/model_repo/transformer_example/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "transformer_example" 2 | backend: "lightseq" 3 | max_batch_size: 8 4 | input [ 5 | { 6 | name: "source_ids" 7 | data_type: TYPE_INT32 8 | dims: [ -1 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "target_ids" 14 | data_type: TYPE_INT32 15 | dims: [ -1 ] 16 | }, 17 | { 18 | name: "target_scores" 19 | data_type: TYPE_FP32 20 | dims: [ -1 ] 21 | } 22 | ] 23 | instance_group [ 24 | { 25 | count: 1 26 | kind: KIND_GPU 27 | } 28 | ] 29 | default_model_filename: "lightseq_bart_base.hdf5" 30 |
parameters: [ 31 | { 32 | key: "model_type" 33 | value: { 34 | string_value: "Transformer" 35 | } 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(layers_files 2 | feed_forward_layer.cpp 3 | linear_layer.cpp 4 | llama_attention_layer.cpp 5 | llama_mlp_layer.cpp 6 | llama_layer.cpp 7 | generator_layer.cpp 8 | gpt_attention_layer.cpp 9 | gpt_layer.cpp 10 | multihead_attention_layer.cpp 11 | transformer_encoder_layer.cpp 12 | dec_enc_attention_layer.cpp 13 | dec_self_attention_layer.cpp 14 | transformer_decoder_layer.cpp 15 | crf_layer.cpp 16 | encdec_kv_layer.cpp 17 | sample_layer.cpp 18 | sdpa_layer.cpp) 19 | 20 | add_library(lightseq_layers STATIC ${layers_files}) 21 | target_link_libraries(lightseq_layers PUBLIC lightseq_operators lsflow) 22 | target_include_directories(lightseq_layers PUBLIC includes) 23 | -------------------------------------------------------------------------------- /examples/training/deepspeed/deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 8192, 3 | "optimizer": { 4 | "type": "AdamW", 5 | "params": { 6 | "lr": 5e-4, 7 | "betas": [ 8 | 0.9, 9 | 0.98 10 | ], 11 | "eps": 1e-8, 12 | "weight_decay": 0.0001, 13 | "torch_adam": false 14 | } 15 | }, 16 | "scheduler": { 17 | "type": "WarmupDecayLR", 18 | "params": { 19 | "warmup_num_steps": 4000, 20 | "warmup_min_lr": 0, 21 | "warmup_max_lr": 5e-4, 22 | "total_num_steps": 1000000 23 | } 24 | }, 25 | "gradient_clipping": 0.0, 26 | "wall_clock_breakdown": false, 27 | "fp16": { 28 | "enabled": true, 29 | "loss_scale": 0, 30 | "initial_scale_power": 7 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | __version__ = "2.1.2" 18 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | __version__ = "2.1.2" 18 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(operator_files 2 | act_elewise_product.cpp 3 | beam_search_topk.cu 4 | bias_act_dropout.cpp 5 | bias_add_transform_20314.cpp 6 | bias_dropout_residual.cpp 7 | concat3_dim1.cpp 8 | crf.cpp 9 | dropout.cpp 10 | fuse_add2_op.cpp 11 | launch_dec_emb_op.cpp 12 | launch_enc_emb.cpp 13 | launch_gpt_emb.cpp 14 | launch_llama_emb.cpp 15 | layer_normalize.cpp 16 | split_head_op.cpp 17 | linear.cpp 18 | rms_layer_norm.cpp 19 | fuse_rotary_position_qkv.cpp 20 | sampling.cc.cu 21 | softmax.cpp 22 | strided_batch_gemm.cpp 23 | transform_0213.cpp) 24 | 25 | add_library(lightseq_operators STATIC ${operator_files}) 26 | target_link_libraries(lightseq_operators PUBLIC lsflow) 27 | target_include_directories(lightseq_operators PUBLIC includes) 28 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/sdpa_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from csrc.pytorch.builder.cuda_layer_builder import CudaLayerBuilder 3 | 4 | cuda_layer_module = CudaLayerBuilder().load() 5 | 6 | 7 | class SdpaLayerFunc(torch.autograd.Function): 8 | @staticmethod 9 | def forward( 10 | ctx, 11 | input, 12 | input_mask, 13 | config, 14 | ): 15 | cuda_module = cuda_layer_module 16 | forward_func = ( 17 | cuda_module.transformer_encoder_layer_fw_fp16 18 | if config.fp16 19 | else cuda_module.transformer_encoder_layer_fw_fp32 20 | ) 21 | if config.fp16: 22 | input = input.to(torch.half) 23 | input_mask = input_mask.to(torch.half) 24 | 25 | (output,) = forward_func(config.layer_id, input, input_mask) 26 | 27 | return output 28 | -------------------------------------------------------------------------------- /examples/training/deepspeed/README.md: -------------------------------------------------------------------------------- 1 | # LightSeq for Fairseq+DeepSpeed 2 | This repo contains an example of how to use LightSeq to accelerate the training of a translation task in [Fairseq](https://github.com/pytorch/fairseq), together with [DeepSpeed](https://github.com/microsoft/DeepSpeed) for distributed strategies and optimizers. We provide a new trainer for the translation task to connect Fairseq and DeepSpeed. 3 | 4 | First, you should install these requirements. 5 | ```shell 6 | pip install torch ninja fairseq deepspeed 7 | ``` 8 | 9 | Then you can train a translation task on the wmt14 en2de dataset by running the following script: 10 | ```shell 11 | sh examples/training/deepspeed/ds_fairseq_wmt14en2de.sh 12 | ``` 13 | 14 | This script first downloads the dataset and then runs the native Fairseq training script using the DeepSpeed launcher, without any other parameter modifications. 15 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/t5EmbKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void t5_launch_enc_emb(const T *token_emb, const int *tokens, T *output, 10 | int *pad_mask, int pad_id, int batch_size, int seq_len, 11 | int hidden_dim, cudaStream_t stream, const T *lang_emb, 12 | const int *lang_id); 13 | 14 | template 15 | void t5_launch_dec_emb(const T *token_emb, int *tokens, const T *lang_emb, 16 | const int *lang_id, T *output, int batch_size, 17 | int beam_size, int hidden_dim, int vocab_size, int step, 18 | int max_step, int multilg_type, cudaStream_t stream); 19 | 20 | } // namespace cuda 21 | } // namespace lightseq 22 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/fuse_add2_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class FuseAdd2Op : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | size_t _batch_tokens; 12 | size_t _batch_size; 13 | size_t _seq_len; 14 | size_t _hidden_dim; 15 | 16 | Variable* _result; 17 | 18 | public: 19 | FuseAdd2Op(size_t max_batch_tokens, size_t hidden_dim) 20 | : Operator("FuseAdd2"), 21 | _max_batch_tokens(max_batch_tokens), 22 | _hidden_dim(hidden_dim) {} 23 | 24 | ~FuseAdd2Op() {} 25 | 26 | Variable* operator()(Variable* inpA, Variable* inpB); 27 | 28 | void forward() override; 29 | 30 | void before_forward(size_t batch_size, size_t seq_len) { 31 | _batch_size = batch_size; 32 | _seq_len = seq_len; 33 | _result->set_shape({batch_size, seq_len, _hidden_dim}); 34 | } 35 | 36 | void backward() override {} 37 | }; 38 | 39 | } // namespace lightseq 40 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``csrc.pytorch.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 
21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``lightseq.training.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/includes/hdf5_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "proto_headers.h" 3 | #include "proto_util.h" 4 | #include "util.h" 5 | 6 | template 7 | void convert_dtype_by_gpu(float* source_addr, float* source_buffer, 8 | T* target_buffer, T* target_addr, size_t size, 9 | cudaStream_t stream) { 10 | if (std::is_same::value) { 11 | cudaMemcpyAsync(source_buffer, source_addr, size * sizeof(float), 12 | cudaMemcpyDefault, stream); 13 | lightseq::cuda::launch_convert_dtype(source_buffer, (__half*)target_addr, 14 | size, 1024, stream); 15 | } else if (std::is_same::value) { 16 | cudaMemcpyAsync(target_addr, source_addr, size * sizeof(float), 17 | cudaMemcpyDefault, stream); 18 | } 19 | } 20 | 21 | template 22 | T* malloc_memory(size_t size) { 23 | T* buffer_addr = nullptr; 24 | cudaMalloc(&buffer_addr, size * sizeof(T)); 25 | return buffer_addr; 26 | } 27 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/shape.cpp: -------------------------------------------------------------------------------- 1 | #include "shape.h" 2 | 3 | namespace lightseq { 4 | 5 | size_t Shape::element_size() { 6 | if (_shape_vec.size() == 1 && _shape_vec[0] == 0) { 7 | printf("this tensor without shape\n"); 8 | return 0; 9 | } 10 | if (_is_calculated) { 11 | return _element_size; 12 | } 13 | size_t product = 1; 14 | for (int iter : _shape_vec) { 15 | // if (iter <= 0) { 16 | // throw std::runtime_error("this tensor with invalid shape"); 17 | // return 0; 18 | // } 19 | product *= iter; 20 | } 21 | _is_calculated = true; 22 | _element_size = product; 23 | return _element_size; 24 | } 25 | 26 | void Shape::print_shape() { 27 | printf("shape dim: %zu, element size: %d, each dimension: ", 28 | _shape_vec.size(), element_size()); 29 | for (int i = 0; i < _shape_vec.size(); i++) { 30 | printf("%zu", _shape_vec[i]); 31 | if (i == _shape_vec.size() - 1) { 32 | printf("\n"); 33 | } else { 34 | printf(", "); 35 | } 36 | } 37 | } 38 | 39 | } // namespace lightseq 40 | 
-------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/transform_0213.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // [sz0, sz1, sz2, sz3] -> [sz0, sz2, sz1, sz3] 8 | template 9 | class Transform0213OP : public Operator { 10 | private: 11 | size_t _max_numel; 12 | size_t _sz0; 13 | size_t _sz1; 14 | size_t _sz2; 15 | size_t _sz3; 16 | 17 | Variable* _result; 18 | 19 | public: 20 | Transform0213OP(size_t max_numel) 21 | : Operator("Transform0213"), _max_numel(max_numel) {} 22 | 23 | virtual ~Transform0213OP() {} 24 | 25 | Variable* operator()(Variable* inp); 26 | 27 | void before_forward(size_t sz0, size_t sz1, size_t sz2, size_t sz3) { 28 | _sz0 = sz0, _sz1 = sz1, _sz2 = sz2, _sz3 = sz3; 29 | _result->set_shape({_sz0, _sz2, _sz1, _sz3}); 30 | } 31 | 32 | void forward() override; 33 | 34 | void before_backward(int sz0, int sz1, int sz2, int sz3) { 35 | _sz0 = sz0, _sz1 = sz1, _sz2 = sz2, _sz3 = sz3; 36 | } 37 | 38 | void backward() override; 39 | }; 40 | } // namespace lightseq 41 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/includes/shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "initializer_list" 4 | 5 | namespace lightseq { 6 | 7 | // This class records the shape information of the tensor and encapsulates some 8 | // methods that may be commonly used. 9 | class Shape { 10 | private: 11 | std::vector _shape_vec; 12 | size_t _element_size; 13 | bool _is_calculated; 14 | 15 | public: 16 | // Default constructor, not part of expected usage. 17 | Shape() : _shape_vec({0}), _element_size(0), _is_calculated(false) {} 18 | Shape(std::vector shape) 19 | : _shape_vec(shape), _element_size(0), _is_calculated(false) {} 20 | Shape(std::initializer_list list) 21 | : Shape(std::vector(list)) {} 22 | Shape(const Shape& lx) = default; 23 | virtual ~Shape() = default; 24 | const std::vector& view() const { return _shape_vec; } 25 | 26 | // Returns the product of each dimension of shape. 27 | size_t element_size(); 28 | 29 | // Print shape information. 
30 | void print_shape(); 31 | }; 32 | 33 | } // namespace lightseq 34 | -------------------------------------------------------------------------------- /lightseq/csrc/ops/includes/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | 13 | class Context { 14 | public: 15 | Context() : _stream(nullptr) { 16 | CHECK_GPU_ERROR(cublasCreate(&_cublasHandle)); 17 | CHECK_GPU_ERROR(cublasLtCreate(&_cublasLtHandle)); 18 | } 19 | 20 | virtual ~Context() {} 21 | 22 | static Context &Instance() { 23 | static Context _ctx; 24 | return _ctx; 25 | } 26 | 27 | void set_stream(cudaStream_t stream) { 28 | _stream = stream; 29 | CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream)); 30 | } 31 | 32 | cudaStream_t get_stream() { return _stream; } 33 | 34 | cublasHandle_t get_cublashandle() { return _cublasHandle; } 35 | cublasLtHandle_t get_cublaslthandle() { return _cublasLtHandle; } 36 | 37 | private: 38 | cudaStream_t _stream; 39 | cublasHandle_t _cublasHandle; 40 | cublasLtHandle_t _cublasLtHandle; 41 | }; 42 | 43 | } // namespace cuda 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // after attention softmax 8 | template 9 | class DropoutOp : public Operator { 10 | private: 11 | float ratio; 12 | size_t _max_ele_num; 13 | size_t _count; 14 | bool _is_skip; 15 | 16 | TensorPtr _mask; 17 | Variable* _result = nullptr; 18 | 19 | public: 20 | float RATIO() const { return _context_ptr->is_training() ? ratio : 0.0; } 21 | 22 | DropoutOp(float r, size_t max_ele_num) 23 | : Operator("Dropout"), ratio(r), _max_ele_num(max_ele_num) { 24 | _mask.reset(new Tensor("mask", g_dtype(), max_ele_num)); 25 | } 26 | 27 | virtual ~DropoutOp() {} 28 | 29 | Variable* operator()(Variable* inp); 30 | 31 | void before_forward(size_t count) { 32 | _count = count; 33 | if (_result) _result->set_shape({count}); 34 | } 35 | 36 | void forward() override; 37 | 38 | void before_backward(int count) { _count = count; } 39 | 40 | void backward() override; 41 | }; 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | from .modules.tensor_quantizer import * 20 | from .modules.quant_conv import * 21 | from .modules.quant_linear import * 22 | from .modules.quant_pooling import * 23 | from .modules.clip import * 24 | from .modules.quant_rnn import * 25 | from .modules.quant_bert import * 26 | from .modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /lightseq/csrc/pytorch/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """A WAR (workaround) for code that messes up the logging format""" 20 | 21 | import logging 22 | 23 | 24 | def reset_logger_handler(): 25 | """Remove all handlers from the root logger""" 26 | root_logger = logging.getLogger() 27 | while root_logger.handlers: 28 | root_logger.removeHandler(root_logger.handlers[0]) 29 | -------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | 19 | """A WAR (workaround) for code that messes up the logging format""" 20 | 21 | import logging 22 | 23 | 24 | def reset_logger_handler(): 25 | """Remove all handlers from the root logger""" 26 | root_logger = logging.getLogger() 27 | while root_logger.handlers: 28 | root_logger.removeHandler(root_logger.handlers[0]) 29 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/lsflow_util.cpp: -------------------------------------------------------------------------------- 1 | #include "lsflow_util.h" 2 | 3 | namespace lightseq { 4 | 5 | void print_time_duration( 6 | const std::chrono::high_resolution_clock::time_point &start, 7 | std::string duration_name) { 8 | #ifdef LIGHTSEQ_cuda 9 | CHECK_GPU_ERROR(cudaStreamSynchronize(0)); 10 | #endif 11 | auto finish = std::chrono::high_resolution_clock::now(); 12 | std::chrono::duration elapsed = finish - start; 13 | std::cout << duration_name 14 | << " duration time is: " << (elapsed).count() * 1000 << " ms" 15 | << std::endl; 16 | return; 17 | } 18 | 19 | #ifdef LIGHTSEQ_cuda 20 | cublasOperation_t op_from_custom(MATRIX_OP op_type) { 21 | switch (op_type) { 22 | case MATRIX_OP::Transpose: 23 | return CUBLAS_OP_T; 24 | case MATRIX_OP::NonTranspose: 25 | return CUBLAS_OP_N; 26 | default: { 27 | std::string error_message = "undefined custom MATRIX_OP\n"; 28 | printf("%s", error_message.c_str()); 29 | throw std::runtime_error("undefined custom MATRIX_OP"); 30 | } 31 | } 32 | exit(-1); 33 | } 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/act_elewise_product.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class ActElewiseProductOp : public Operator { 9 | private: 10 | size_t _inner_size; 11 | size_t _max_batch_tokens; 12 | size_t _batch_tokens; 13 | size_t _batch_size; 14 | size_t _seq_len; 15 | 16 | Variable* _result; 17 | 18 | public: 19 | ActElewiseProductOp(size_t max_batch_tokens, size_t inner_size) 20 | : Operator("ActElewiseProductOp"), 21 | _max_batch_tokens(max_batch_tokens), 22 | _inner_size(inner_size) {} 23 | 24 | virtual ~ActElewiseProductOp() {} 25 | 26 | Variable* operator()(Variable* inp); 27 | 28 | void forward() override; 29 | 30 | void before_forward(size_t batch_size, size_t seq_len) { 31 | _batch_size = batch_size; 32 | _seq_len = seq_len; 33 | _batch_tokens = batch_size * seq_len; 34 | _result->set_shape({_batch_tokens, _inner_size}); 35 | } 36 | 37 | void backward() override {} 38 | 39 | void before_backward() {} 40 | }; 41 | 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /examples/training/fairseq/native_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ !
-d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | fairseq-train /tmp/wmt14_en_de/ \ 13 | --arch transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 14 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 15 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 16 | --weight-decay 0.0001 \ 17 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 18 | --max-tokens 8192 \ 19 | --eval-bleu \ 20 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 21 | --eval-bleu-detok moses \ 22 | --eval-bleu-remove-bpe \ 23 | --eval-bleu-print-samples \ 24 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \ 25 | --fp16 26 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/embKernels_int8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb, 10 | const int *tokens, T *output, int *pad_mask, int pad_id, 11 | int batch_size, int seq_len, int hidden_dim, 12 | cudaStream_t stream, const T *lang_emb, 13 | const int *lang_id, int multilg_type, 14 | float dequant_scale, bool scaled = true); 15 | 16 | template 17 | void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens, 18 | const T *lang_emb, const int *lang_id, T *output, 19 | int batch_size, int beam_size, int hidden_dim, 20 | int vocab_size, int step, int max_step, 21 | int multilg_type, cudaStream_t stream, 22 | float dequant_scale, bool scaled = true); 23 | 24 | } // namespace cuda 25 | } // namespace lightseq 26 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/fuse_add2_op.cpp: -------------------------------------------------------------------------------- 1 | #include "fuse_add2_op.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* FuseAdd2Op::operator()(Variable* inpA, Variable* inpB) { 7 | _result = new Variable("FuseAdd2Op_out", _max_batch_tokens * _hidden_dim, 8 | g_dtype(), g_dtype()); 9 | set_parents({inpA, inpB}); 10 | this->set_children({_result}); 11 | return _result; 12 | } 13 | 14 | template 15 | void FuseAdd2Op::forward() { 16 | T1* inpA_ptr = (T1*)parent(0)->value(); 17 | T1* inpB_ptr = (T1*)parent(1)->value(); 18 | T1* out_ptr = (T1*)child(0)->value(); 19 | 20 | if (!_context_ptr->is_built()) { 21 | return; 22 | } 23 | 24 | #ifdef LIGHTSEQ_cuda 25 | cudaStream_t stream = _context_ptr->get_stream(); 26 | cuda::launch_fused_add2(out_ptr, inpA_ptr, inpB_ptr, _batch_size, _seq_len, 27 | _hidden_dim, stream); 28 | #endif 29 | } 30 | 31 | template class FuseAdd2Op; 32 | #ifdef LIGHTSEQ_cuda 33 | template class FuseAdd2Op<__half, __half>; 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /examples/training/deepspeed/ds_fairseq_argument.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from fairseq import options 4 | from deepspeed.runtime.config_utils import dict_raise_error_on_duplicate_keys 5 | 6 | 7 | def gen_ds_fairseq_arg(): 8 | parser = 
options.get_training_parser() 9 | parser.add_argument( 10 | "--deepspeed_config", 11 | default=None, 12 | type=str, 13 | required=True, 14 | help="DeepSpeed json configuration file.", 15 | ) 16 | fs_args = options.parse_args_and_arch(parser, modify_parser=None) 17 | 18 | ds_config = gen_ds_config(fs_args) 19 | delattr(fs_args, "deepspeed_config") 20 | return fs_args, ds_config 21 | 22 | 23 | def gen_ds_config(fs_args): 24 | ds_config = json.load( 25 | open(fs_args.deepspeed_config), 26 | object_pairs_hook=dict_raise_error_on_duplicate_keys, 27 | ) 28 | 29 | # Different parameters in fairseq and deepspeed have the same effect. 30 | # For these parameters, we extract it from fairseq arguments and put it 31 | # int the deepspeed config file 32 | ds_config["steps_per_print"] = fs_args.log_interval 33 | return ds_config 34 | -------------------------------------------------------------------------------- /lightseq/csrc/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | include_directories(${HDF5_INCLUDE_DIRS}) 6 | 7 | find_package(Protobuf REQUIRED) 8 | include_directories(${Protobuf_INCLUDE_DIRS}) 9 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 10 | 11 | set(PROTO_FILES bert.proto bert_crf.proto transformer.proto gpt.proto) 12 | 13 | set(WEIGHT_FILES bert_weight.cc bert_crf_weight.cc transformer_weight.cc 14 | gpt_weight.cc llama_weight.cc) 15 | 16 | protobuf_generate_cpp(PROTO_SRC PROTO_HEADER ${PROTO_FILES}) 17 | add_library(weight_lib STATIC ${WEIGHT_FILES} ${PROTO_SRC} ${PROTO_HEADER} 18 | proto_util.cc) 19 | target_link_libraries(weight_lib PRIVATE ${HDF5_LIBRARIES}) 20 | target_link_libraries(weight_lib PUBLIC ${Protobuf_LIBRARIES}) 21 | target_link_libraries(weight_lib PUBLIC lightseq_kernels) 22 | 23 | target_include_directories(weight_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 24 | target_include_directories(weight_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 25 | target_include_directories(weight_lib PUBLIC ${HDF5_INCLUDE_DIRS}) 26 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/crf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // linear crf 8 | template 9 | class CRFOP : public Operator { 10 | private: 11 | size_t _num_tags; 12 | size_t _seq_len; 13 | size_t _batch_size; 14 | size_t _max_batch_tokens; 15 | size_t _max_batch_size; 16 | 17 | bool _forward_or_decode; // true for forward, false for decode 18 | bool _output_decode_score; 19 | TensorPtr _history; 20 | 21 | Variable* _best_tags; 22 | 23 | public: 24 | CRFOP(size_t max_batch_tokens, size_t max_batch_size, size_t num_tags); 25 | 26 | virtual ~CRFOP() {} 27 | 28 | Variable* operator()(Variable* start_transition, Variable* end_transition, 29 | Variable* transition, Variable* emission, Variable* mask, 30 | Variable* bias); 31 | 32 | void before_forward(size_t batch_size, size_t seq_len, bool forward_or_decode, 33 | bool output_decode_score); 34 | 35 | void forward() override; 36 | 37 | void before_backward(); 38 | 39 | void backward() override; 40 | }; 41 | 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/act_elewise_product.cpp: 
-------------------------------------------------------------------------------- 1 | #include "act_elewise_product.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* ActElewiseProductOp::operator()(Variable* inp) { 7 | size_t max_size = _max_batch_tokens * _inner_size; 8 | _result = new Variable("ActElewiseProductOp_out", max_size, g_dtype(), 9 | g_dtype()); 10 | set_parents({inp}); 11 | this->set_children({_result}); 12 | return _result; 13 | } 14 | 15 | template 16 | void ActElewiseProductOp::forward() { 17 | T1* inp_val = (T1*)parent(0)->value(); 18 | T1* out_val = (T1*)child(0)->value(); 19 | 20 | if (!_context_ptr->is_built()) { 21 | return; 22 | } 23 | 24 | #ifdef LIGHTSEQ_cuda 25 | cudaStream_t stream = _context_ptr->get_stream(); 26 | cuda::launch_silu_elewise_product(inp_val, out_val, _batch_size, _seq_len, 27 | _inner_size, stream); 28 | #endif 29 | } 30 | 31 | template class ActElewiseProductOp; 32 | #ifdef LIGHTSEQ_cuda 33 | template class ActElewiseProductOp<__half, __half>; 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --find-unused-parameters 29 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_ner/run_ner.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_ner.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name conll2003 \ 22 | --do_train \ 23 | --do_eval \ 24 | --per_device_train_batch_size 16 \ 25 | --num_train_epochs 10 \ 26 | --output_dir /tmp/test-ner \ 27 | --overwrite_output_dir \ 28 | --fp16 \ 29 | --seed 1234 \ 30 | --logging_steps 10 \ 31 | --module_type 1 \ 32 | --enable_quant false 33 | -------------------------------------------------------------------------------- /examples/inference/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | add_executable(transformer_example transformer_example.cc) 4 | target_link_libraries(transformer_example PUBLIC liblightseq) 5 | 6 | add_executable(quant_transformer_example quant_transformer_example.cc) 7 | target_link_libraries(quant_transformer_example PUBLIC liblightseq) 8 | 9 | add_executable(bert_example bert_example.cc) 10 | target_link_libraries(bert_example PUBLIC liblightseq) 11 | 12 | add_executable(quant_bert_example quant_bert_example.cc) 13 | target_link_libraries(quant_bert_example PUBLIC liblightseq) 14 | 15 | add_executable(gpt_example gpt_example.cc) 16 | target_link_libraries(gpt_example PUBLIC liblightseq) 17 | 18 | add_executable(quant_gpt_example quant_gpt_example.cc) 19 | target_link_libraries(quant_gpt_example PUBLIC liblightseq) 20 | 21 | add_executable(transformer_decoder_example decoder_example.cc.cu) 22 | target_link_libraries(transformer_decoder_example PUBLIC transformer_model) 23 | 24 | add_executable(vit_example vit_example.cc) 25 | target_link_libraries(vit_example PUBLIC liblightseq) 26 | 27 | add_executable(quant_vit_example quant_vit_example.cc) 28 | target_link_libraries(quant_vit_example PUBLIC liblightseq) 29 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/layer_normalize.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class LayerNormalizeOp : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | size_t _hidden_dim; 12 | size_t _batch_tokens; 13 | 14 | bool _use_mean; 15 | 16 | TensorPtr means_; 17 | TensorPtr vars_; 18 | 19 | Variable* _result; 20 | 21 | public: 22 | LayerNormalizeOp(size_t max_batch_tokens, size_t hidden_dim, 23 | bool use_mean = false) 24 | : Operator("LayerNormalizeOp"), 25 | _max_batch_tokens(max_batch_tokens), 26 | _hidden_dim(hidden_dim), 27 | _use_mean(use_mean) { 28 | vars_.reset(new Tensor("vars", g_dtype(), max_batch_tokens)); 29 | if (use_mean) 30 | means_.reset(new Tensor("means", g_dtype(), max_batch_tokens)); 31 | } 32 | 33 | Variable* operator()(Variable* inp, Variable* gamma, Variable* betta); 34 | 35 | virtual ~LayerNormalizeOp(); 36 | 37 | void before_forward(size_t batch_size, size_t seq_len); 38 | 39 | void forward() override; 40 | 41 | void backward() override; 42 | }; 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_torch_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 
5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --use-torch-layer \ 29 | --find-unused-parameters 30 | -------------------------------------------------------------------------------- /examples/training/huggingface/gpt/run_gcq_clm.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | 5 | # You can use multiple NICs in NCCL communication. 6 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 7 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 8 | 9 | # Set your environment variables according to your training environment, 10 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 11 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 12 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 13 | --master_port=$WORKER_0_PORT \ 14 | $THIS_DIR/run_gcq_clm.py \ 15 | --model_name_or_path gpt2 \ 16 | --dataset_name wikitext \ 17 | --dataset_config_name wikitext-103-raw-v1 \ 18 | --per_device_train_batch_size 16 \ 19 | --per_device_eval_batch_size 8 \ 20 | --num_train_epochs 1 \ 21 | --do_train \ 22 | --do_eval \ 23 | --output_dir /tmp/test-clm \ 24 | --overwrite_output_dir \ 25 | --fp16 \ 26 | --logging_steps 10 \ 27 | --block_size 512 \ 28 | --module_type 2 \ 29 | --enable_quant false \ 30 | --enable_GCQ true \ 31 | --GCQ_quantile 0.99 32 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/rms_layer_norm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class RMSLayerNormalizeOp : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | size_t _hidden_dim; 12 | size_t _batch_tokens; 13 | float _epsilon; 14 | 15 | bool _use_mean; 16 | bool _use_residual; 17 | 18 | TensorPtr _rms_vars; 19 | Variable* _result; 20 | Variable* _residual; 21 | 22 | public: 23 | RMSLayerNormalizeOp(size_t max_batch_tokens, size_t hidden_dim, 24 | bool use_residual = true, float epsilon = 1e-6) 25 | : Operator("RMSLayerNormalizeOp"), 26 | _max_batch_tokens(max_batch_tokens), 27 | _hidden_dim(hidden_dim), 28 | _use_residual(use_residual), 29 | _epsilon(epsilon) { 30 | _rms_vars.reset(new Tensor("rms_vars", g_dtype(), max_batch_tokens)); 31 | } 32 | 33 | std::tuple operator()(Variable* inp, Variable* scale); 34 | 35 | virtual ~RMSLayerNormalizeOp(); 
36 | 37 | void before_forward(size_t batch_size, size_t seq_len); 38 | 39 | void forward() override; 40 | 41 | void backward() override {} 42 | }; 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_ner/run_quant_ner.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_ner.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name conll2003 \ 22 | --do_train \ 23 | --do_eval \ 24 | --per_device_train_batch_size 16 \ 25 | --num_train_epochs 20 \ 26 | --output_dir /tmp/quant/test-ner \ 27 | --overwrite_output_dir \ 28 | --resume_from_checkpoint /tmp/test-ner/ \ 29 | --fp16 \ 30 | --seed 1234 \ 31 | --logging_steps 10 \ 32 | --module_type 1 \ 33 | --enable_quant true 34 | -------------------------------------------------------------------------------- /examples/training/neurst/README.md: -------------------------------------------------------------------------------- 1 | # LightSeq for NeurST 2 | This repo contains an example of how to use LightSeq to accelerate the training of a translation task. 3 | 4 | First you should install these requirements. 5 | ```shell 6 | $ pip install subword-nmt pyyaml sacrebleu sacremoses 7 | $ git clone https://github.com/moses-smt/mosesdecoder.git 8 | ``` 9 | Then clone NeurST and switch to the lightseq branch. 10 | ```shell 11 | $ git clone https://github.com/bytedance/neurst.git 12 | $ cd neurst/ 13 | $ git checkout lightseq 14 | $ pip install -e . 15 | ``` 16 | Install LightSeq 17 | ```shell 18 | $ pip install http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/tensorflow/lightseq_tf-2.0.1-cp37-cp37m-linux_x86_64.whl 19 | ``` 20 | Download and preprocess the data 21 | ```shell 22 | $ ./examples/translation/prepare-wmt14en2de-bpe.sh ../mosesdecoder 23 | ``` 24 | Train the model 25 | ```shell 26 | $ python3 -m neurst.cli.run_exp \ 27 | --config_paths wmt14_en_de/training_args.yml,wmt14_en_de/translation_bpe.yml \ 28 | --hparams_set transformer_base \ 29 | --model_dir wmt14_en_de/benchmark_base \ 30 | --enable_xla 31 | ``` 32 | 33 | 34 | LightSeq can achieve about a 1.33x speedup using batch size 4096 on 8 V100 GPUs, 35 | compared with the original TensorFlow implementation. 36 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_qa/run_qa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_qa.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name squad \ 22 | --do_train \ 23 | --do_eval \ 24 | --max_seq_length 256 \ 25 | --per_device_train_batch_size 16 \ 26 | --doc_stride 128 \ 27 | --learning_rate 3e-5 \ 28 | --num_train_epochs 10 \ 29 | --output_dir /tmp/squad \ 30 | --overwrite_output_dir \ 31 | --fp16 \ 32 | --seed 1234 \ 33 | --logging_steps 10 \ 34 | --module_type 1 \ 35 | --enable_quant false 36 | -------------------------------------------------------------------------------- /lightseq/training/ops/pytorch/builder/adam_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright Microsoft DeepSpeed 3 | # This builder is adapted from Microsoft DeepSpeed 4 | 5 | import torch 6 | from .builder import CUDAOpBuilder 7 | 8 | 9 | class AdamBuilder(CUDAOpBuilder): 10 | NAME = "adam" 11 | 12 | def __init__(self, name=None): 13 | name = self.NAME if name is None else name 14 | super().__init__(name=name) 15 | 16 | def absolute_name(self): 17 | return f"op_builder.{self.NAME}_op" 18 | 19 | def sources(self): 20 | return [ 21 | "csrc/kernels/fused_adam_kernel.cu", 22 | "csrc/pybind/pybind_adam.cpp", 23 | ] 24 | 25 | def include_paths(self): 26 | return ["csrc/kernels/includes", "csrc/ops/includes", "csrc/layers/includes"] 27 | 28 | def nvcc_args(self): 29 | args = [ 30 | "-O3", 31 | "--use_fast_math", 32 | "-std=c++14", 33 | "-U__CUDA_NO_HALF_OPERATORS__", 34 | "-U__CUDA_NO_HALF_CONVERSIONS__", 35 | "-U__CUDA_NO_HALF2_OPERATORS__", 36 | ] 37 | 38 | return args + self.compute_capability_args() 39 | 40 | def cxx_args(self): 41 | return ["-O3", "-std=c++14", "-g", "-Wno-reorder"] 42 | -------------------------------------------------------------------------------- /tests/huggingface/test_gpt.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import torch 3 | import numpy as np 4 | from transformers import GPT2Tokenizer, GPT2Model, AutoConfig 5 | from lightseq.training import LSGptEncoderLayer 6 | 7 | 8 | @dataclass 9 | class TrainingArguments: 10 | fp16: bool = True 11 | local_rank: int = -1 12 | 13 | 14 | def test_gpt_layer(): 15 | # text = "Replace me by any text you'd like." 
16 | # tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 17 | # encoded_input = tokenizer(text, return_tensors="pt") 18 | torch.random.manual_seed(1234) 19 | test_input = torch.empty(4, 64, 768).normal_().cuda() 20 | training_args = TrainingArguments() 21 | model = GPT2Model.from_pretrained("gpt2") 22 | config = AutoConfig.from_pretrained("gpt2") 23 | layer = model.h[0].cuda().train(False) 24 | base_output = layer(test_input)[0] 25 | ls_layer = LSGptEncoderLayer.from_huggingface(layer, training_args, config).train( 26 | False 27 | ) 28 | ls_output = ls_layer(test_input)[0] 29 | np.testing.assert_allclose( 30 | base_output.detach().cpu().numpy(), 31 | ls_output.detach().cpu().numpy(), 32 | rtol=1e-2, 33 | atol=2e-1, 34 | ) 35 | 36 | 37 | if __name__ == "__main__": 38 | test_gpt_layer() 39 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/bias_add_transform_20314.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // add bias and transform 20314, execute after qkv_linear 9 | template 10 | class BiasAddTrans20314 : public Operator { 11 | private: 12 | size_t _max_batch_tokens; 13 | size_t _batch; 14 | size_t _seq_len; 15 | size_t _heads; 16 | size_t _hidden_size; 17 | size_t _trans_count; 18 | 19 | Variable* _res; 20 | 21 | public: 22 | BiasAddTrans20314(size_t max_batch_tokens, size_t heads, size_t hidden_size, 23 | size_t trans_count) 24 | : Operator("BiasAddTrans20314"), 25 | _max_batch_tokens(max_batch_tokens), 26 | _heads(heads), 27 | _hidden_size(hidden_size), 28 | _trans_count(trans_count) {} 29 | 30 | virtual ~BiasAddTrans20314() {} 31 | 32 | Variable* operator()(Variable* inp, Variable* bias); 33 | 34 | void before_forward(size_t batch, size_t seq_len) { 35 | _batch = batch, _seq_len = seq_len; 36 | _res->set_shape( 37 | {_trans_count, _batch, _heads, _seq_len, _hidden_size / _heads}); 38 | } 39 | 40 | void forward() override; 41 | 42 | void backward() override; 43 | }; 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/t5Kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace lightseq { 8 | namespace cuda { 9 | 10 | const float t5_epsilon = 1e-6; 11 | template 12 | void t5_ker_norm_layer_launcher(int token_num, int hidden_size, 13 | cudaStream_t stream, T* matrix, T* out, 14 | const T* scale, const T* bias, 15 | int max_thread_per_block); 16 | 17 | template 18 | void t5_ker_correlation_softmax_encself_launcher( 19 | int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, 20 | T* correlation, const int* src_padding_mask, const T* pos_emb); 21 | 22 | template 23 | void t5_ker_correlation_softmax_decself_launcher( 24 | int batch_head_num, int step_num, cudaStream_t stream, T* correlation, 25 | const T* pos_emb, int head_num); 26 | 27 | template 28 | void ker_gelu_first_elementmul_launcher(int batch_token_num, int block_dim, 29 | cudaStream_t stream, T* input, 30 | const T* input2, int feature_dim); 31 | } // namespace cuda 32 | } // namespace lightseq 33 | -------------------------------------------------------------------------------- /examples/training/huggingface/vit/run_vit.sh: 
-------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=1 \ 20 | $THIS_DIR/run_vit.py \ 21 | --dataset_name beans \ 22 | --output_dir /tmp/beans_outputs \ 23 | --overwrite_output_dir \ 24 | --remove_unused_columns False \ 25 | --do_train \ 26 | --do_eval \ 27 | --learning_rate 2e-5 \ 28 | --num_train_epochs 30 \ 29 | --per_device_train_batch_size 8 \ 30 | --per_device_eval_batch_size 8 \ 31 | --logging_steps 10 \ 32 | --seed 1337 \ 33 | --fp16 \ 34 | --module_type 1 \ 35 | --enable_quant false 36 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/bias_dropout_residual.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // transformer layer's postprocessing dropout, after attn or ffn module, 8 | // before residual add. 9 | template 10 | class BiasDropoutResOp : public Operator { 11 | private: 12 | float ratio; 13 | 14 | size_t _max_rows; 15 | size_t _max_cols; 16 | size_t _rows; 17 | size_t _cols; 18 | 19 | TensorPtr _mask; 20 | Variable* _result; 21 | 22 | public: 23 | float RATIO() const { return _context_ptr->is_training() ? 
ratio : 0.0; } 24 | 25 | BiasDropoutResOp(float r, size_t max_rows, size_t max_cols) 26 | : Operator("BiasDropoutResOp"), 27 | ratio(r), 28 | _max_rows(max_rows), 29 | _max_cols(max_cols) { 30 | _mask.reset(new Tensor("mask", g_dtype(), _max_rows * _max_cols)); 31 | } 32 | 33 | virtual ~BiasDropoutResOp() {} 34 | 35 | Variable* operator()(Variable* inp, Variable* bias, Variable* residual); 36 | 37 | void before_forward(size_t rows, size_t cols) { 38 | _rows = rows, _cols = cols; 39 | _result->set_shape({_rows, _cols}); 40 | } 41 | 42 | void forward() override; 43 | 44 | void backward() override; 45 | }; 46 | } // namespace lightseq 47 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/linear_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "linear.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class LinearLayer : public Layer { 10 | private: 11 | // operators 12 | LinearOp* _linear = nullptr; 13 | 14 | // parameters 15 | Variable* _linear_w; 16 | 17 | // shape related 18 | int _max_batch_tokens; 19 | size_t _input_size; 20 | size_t _output_size; 21 | 22 | public: 23 | LinearLayer(int max_batch_tokens, int input_size, int output_size, 24 | MATRIX_OP opA = MATRIX_OP::Transpose, 25 | MATRIX_OP opB = MATRIX_OP::NonTranspose, float alpha = float(1.)); 26 | 27 | virtual ~LinearLayer() {} 28 | 29 | Variable* operator()(Variable* inp); 30 | 31 | void before_forward(int batch_size, int seq_len); 32 | 33 | void before_backward(); 34 | 35 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 36 | 37 | int load_params(const std::vector& para_vec, int offset); 38 | }; 39 | 40 | template class LinearLayer; 41 | #ifdef LIGHTSEQ_cuda 42 | template class LinearLayer<__half, __half>; 43 | #endif 44 | 45 | template 46 | using LinearLayerPtr = std::shared_ptr>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /examples/training/deepspeed/ds_fairseq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! 
-d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://sf3-ttcdn-tos.pstatp.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-deepspeed ${THIS_DIR}/ds_fairseq.py \ 13 | /tmp/wmt14_en_de/ \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 16 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ 17 | --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --log-interval 200 \ 21 | --validate-interval-updates 2000 \ 22 | --eval-bleu \ 23 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 24 | --eval-bleu-detok moses \ 25 | --eval-bleu-remove-bpe \ 26 | --eval-bleu-print-samples \ 27 | --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \ 28 | --fp16 \ 29 | --deepspeed_config ${THIS_DIR}/deepspeed_config.json 30 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_fairseq_quant_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 1e-6 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --enable-quant \ 29 | --finetune-from-model checkpoints/checkpoint_best.pt \ 30 | --save-dir checkpoints/quant 31 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/bias_act_dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template 9 | class BiasActDropoutOp : public Operator { 10 | private: 11 | float ratio; 12 | 13 | size_t _mx_cols; 14 | size_t _mx_rows; 15 | size_t _cols; 16 | size_t _rows; 17 | 18 | Variable* _result; 19 | 20 | std::string _activation_fn; 21 | 22 | TensorPtr _mask; 23 | 24 | public: 25 | float RATIO() const { return _context_ptr->is_training() ? 
ratio : 0.0; } 26 | 27 | BiasActDropoutOp(float r, size_t mx_rows, size_t mx_cols, 28 | std::string activation_fn) 29 | : Operator("BiasActDropoutOp"), 30 | ratio(r), 31 | _activation_fn(activation_fn), 32 | _mx_rows(mx_rows), 33 | _mx_cols(mx_cols) { 34 | _mask.reset(new Tensor("_mask", g_dtype(), _mx_rows * _mx_cols)); 35 | } 36 | 37 | virtual ~BiasActDropoutOp() {} 38 | 39 | Variable* operator()(Variable* inp, Variable* bias); 40 | 41 | void before_forward(size_t rows, size_t cols) { 42 | _rows = rows, _cols = cols; 43 | _result->set_shape({rows, cols}); 44 | } 45 | 46 | void forward() override; 47 | 48 | void backward() override; 49 | }; 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_glue/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | export TASK_NAME=sst2 19 | 20 | python3 -m torch.distributed.launch \ 21 | --nproc_per_node=1 \ 22 | $THIS_DIR/run_glue.py \ 23 | --model_name_or_path bert-base-cased \ 24 | --task_name $TASK_NAME \ 25 | --do_train \ 26 | --do_eval \ 27 | --max_seq_length 128 \ 28 | --per_device_train_batch_size 32 \ 29 | --learning_rate 2e-5 \ 30 | --num_train_epochs 10 \ 31 | --output_dir /tmp/$TASK_NAME/ \ 32 | --overwrite_output_dir \ 33 | --fp16 \ 34 | --seed 1234 \ 35 | --logging_steps 10 \ 36 | --module_type 1 \ 37 | --enable_quant false 38 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_qa/run_quant_qa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | python3 -m torch.distributed.launch \ 18 | --nproc_per_node=1 \ 19 | $THIS_DIR/run_qa.py \ 20 | --model_name_or_path bert-base-uncased \ 21 | --dataset_name squad \ 22 | --do_train \ 23 | --do_eval \ 24 | --max_seq_length 256 \ 25 | --per_device_train_batch_size 16 \ 26 | --doc_stride 128 \ 27 | --learning_rate 1e-5 \ 28 | --num_train_epochs 16 \ 29 | --output_dir /tmp/quant/squad \ 30 | --overwrite_output_dir \ 31 | --resume_from_checkpoint /tmp/squad/ \ 32 | --fp16 \ 33 | --seed 1234 \ 34 | --logging_steps 10 \ 35 | --module_type 1 \ 36 | --enable_quant true 37 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/llama_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | #include "llama_attention_layer.h" 4 | #include "llama_mlp_layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class LlamaLayer : public Layer { 10 | private: 11 | LlamaAttentionLayerPtr _attn_layer; 12 | LlamaMLPLayerPtr _mlp_layer; 13 | 14 | int _layer_id; 15 | 16 | public: 17 | LlamaLayer(int max_batch_size, int max_seq_len, int hidden_size, 18 | int inner_dim, int num_heads, int beam_size); 19 | virtual ~LlamaLayer() {} 20 | 21 | Variable* operator()(Variable* inp, Variable* cache_k, Variable* cache_v, 22 | Variable* pad_mask); 23 | 24 | void before_forward(int batch_size, int seq_len, int prompt_len) { 25 | _attn_layer->before_forward(batch_size, seq_len, prompt_len); 26 | _mlp_layer->before_forward(batch_size, seq_len); 27 | } 28 | 29 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 30 | 31 | int load_params(const std::vector& para_vec, int offset); 32 | }; 33 | 34 | template class LlamaLayer; 35 | #ifdef LIGHTSEQ_cuda 36 | template class LlamaLayer<__half, __half>; 37 | #endif 38 | 39 | template 40 | using LlamaLayerPtr = std::shared_ptr>; 41 | 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /lightseq/training/__init__.py: -------------------------------------------------------------------------------- 1 | from lightseq.training.ops.pytorch.transformer_embedding_layer import ( 2 | LSTransformerEmbeddingLayer, 3 | ) 4 | from lightseq.training.ops.pytorch.transformer_encoder_layer import ( 5 | LSTransformerEncoderLayer, 6 | ) 7 | from lightseq.training.ops.pytorch.transformer_decoder_layer import ( 8 | LSTransformerDecoderLayer, 9 | ) 10 | from lightseq.training.ops.pytorch.gpt_layer import ( 11 | LSGptEncoderLayer, 12 | ls_hf_gpt_enc_convert, 13 | ) 14 | from lightseq.training.ops.pytorch.transformer import ( 15 | LSTransformer, 16 | LSTransformerEncoder, 17 | LSTransformerDecoder, 18 | ) 19 | 20 | from lightseq.training.ops.pytorch.cross_entropy_layer import LSCrossEntropyLayer 21 | from lightseq.training.ops.pytorch.adam import LSAdam 22 | from lightseq.training.ops.pytorch.export import ( 23 | export_ls_config, 24 | export_ls_embedding, 25 | export_ls_encoder, 26 | export_ls_decoder, 27 | export_pb2hdf5, 28 | ) 29 | 30 | from lightseq.training.ops.pytorch.export_quant import ( 31 | export_ls_embedding_ptq, 32 | export_ls_encoder_ptq, 33 | export_ls_decoder_ptq, 34 | export_ls_quant_embedding, 35 | export_ls_quant_encoder, 36 | export_ls_quant_decoder, 37 | export_quant_pb2hdf5, 38 | ) 39 | 40 | from lightseq.training.ops.pytorch.gemm_test import gemm_test 41 | 
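Note: the imports above define the public lightseq.training API. The following is a minimal, hypothetical usage sketch of the fused encoder layer; the exact get_config fields and mask convention are assumptions and should be verified against lightseq/training/ops/pytorch/transformer_encoder_layer.py before use.

import torch
from lightseq.training import LSTransformerEncoderLayer

# All sizes and config fields below are illustrative assumptions.
config = LSTransformerEncoderLayer.get_config(
    max_batch_tokens=4096,
    max_seq_len=256,
    hidden_size=1024,
    intermediate_size=4096,
    nhead=16,
    attn_prob_dropout_ratio=0.1,
    activation_dropout_ratio=0.1,
    hidden_dropout_ratio=0.1,
    pre_layer_norm=True,
    activation_fn="relu",
    fp16=True,
    local_rank=0,
)
layer = LSTransformerEncoderLayer(config).cuda()

# hidden_states: [batch, seq_len, hidden]; padding_mask marks padded positions with 1.
hidden_states = torch.randn(8, 256, 1024, device="cuda", dtype=torch.half)
padding_mask = torch.zeros(8, 256, device="cuda", dtype=torch.half)
out = layer(hidden_states, padding_mask)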
-------------------------------------------------------------------------------- /lightseq/csrc/layers/includes/cross_entropy_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | template 13 | class CrossEntropyLayer { 14 | public: 15 | CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens); 16 | 17 | virtual ~CrossEntropyLayer(); 18 | 19 | void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr, 20 | float *nll_loss_ptr); 21 | 22 | void Backward(const float *grad_outputs_ptr, const T *inputs_ptr, 23 | const int *targets_ptr, T *grad_inputs_ptr); 24 | 25 | void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size); 26 | 27 | private: 28 | void allocate_mem_buffer() { 29 | // allocate local gpu memory 30 | _loss_buffer = cuda_malloc(_max_batch_tokens * 2); 31 | } 32 | 33 | void free_mem_buffer() { 34 | // free local gpu memory 35 | cuda_free(_loss_buffer); 36 | } 37 | 38 | const int _padding_idx; 39 | const float _epsilon; 40 | const int _max_batch_tokens; 41 | 42 | size_t _batch_size; 43 | size_t _seq_len; 44 | size_t _vocab_size; 45 | 46 | float *_loss_buffer; 47 | }; 48 | } // namespace cuda 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/crf_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "crf.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class CRFLayer : public Layer { 10 | private: 11 | // operators 12 | CRFOP* _crf_op = nullptr; 13 | 14 | // parameters 15 | Variable* _linear_b; 16 | Variable* _start_transition; 17 | Variable* _end_transition; 18 | Variable* _transition; 19 | 20 | // shape related 21 | int _num_tags; 22 | int _max_batch_tokens; 23 | int _max_batch_size; 24 | 25 | int _seq_len; 26 | int _batch_size; 27 | bool _forward_or_decode; // true for forward, false for decode 28 | bool _output_decode_score; // true for output decode score 29 | 30 | public: 31 | CRFLayer(int num_tags, int max_batch_tokens, int max_batch_size); 32 | 33 | virtual ~CRFLayer() {} 34 | 35 | Variable* operator()(Variable* emission, Variable* mask); 36 | 37 | void before_forward(int batch_size, int seq_len, bool forward_or_decode, 38 | bool output_decode_score); 39 | 40 | int load_params(const std::vector& para_vec, int offset); 41 | }; 42 | 43 | template class CRFLayer; 44 | #ifdef LIGHTSEQ_cuda 45 | template class CRFLayer<__half>; 46 | #endif 47 | 48 | template 49 | using CRFLayerPtr = std::shared_ptr>; 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /examples/training/huggingface/vit/run_quant_vit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=1 \ 20 | $THIS_DIR/run_vit.py \ 21 | --dataset_name beans \ 22 | --output_dir /tmp/quant/beans_outputs \ 23 | --resume_from_checkpoint /tmp/beans_outputs/ \ 24 | --overwrite_output_dir \ 25 | --remove_unused_columns False \ 26 | --do_train \ 27 | --do_eval \ 28 | --learning_rate 2e-6 \ 29 | --num_train_epochs 45 \ 30 | --per_device_train_batch_size 8 \ 31 | --per_device_eval_batch_size 8 \ 32 | --logging_steps 10 \ 33 | --seed 1337 \ 34 | --fp16 \ 35 | --module_type 1 \ 36 | --enable_quant true 37 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/sample_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "beam_search_topk.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class SampleLayer : public Layer { 10 | private: 11 | // operators 12 | BeamSearchTopOp* _beam_search = nullptr; 13 | 14 | // parameters 15 | Variable* _logit_bias; 16 | size_t _trg_vocab_size; 17 | 18 | public: 19 | SampleLayer(int nshared_layer, int max_batch_size, int max_step, 20 | int trg_vocab_size, int hidden_size, int max_thread_per_block, 21 | int beam_size, int diverse_lambda, int dim_per_head, int end_id, 22 | int head_num, 23 | float length_penalty); // for beam_search 24 | 25 | virtual ~SampleLayer() {} 26 | 27 | std::tuple operator()(Variable* logits, 28 | Variable* alive_seq); 29 | 30 | void before_forward(int batch_size, int cur_step); 31 | 32 | int load_params(const std::vector& para_vec, int offset); 33 | 34 | bool is_stop() { return _beam_search->is_stop(); } 35 | }; 36 | 37 | template class SampleLayer; 38 | #ifdef LIGHTSEQ_cuda 39 | template class SampleLayer<__half>; 40 | #endif 41 | 42 | template 43 | using SampleLayerPtr = std::shared_ptr>; 44 | 45 | } // namespace lightseq 46 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_torch_fairseq_quant_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! 
-d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | lightseq-train /tmp/wmt14_en_de/ \ 13 | --task translation \ 14 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 15 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 16 | --clip-norm 0.0 \ 17 | --lr 1e-6 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 18 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 19 | --max-tokens 8192 \ 20 | --eval-bleu \ 21 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 22 | --eval-bleu-detok moses \ 23 | --eval-bleu-remove-bpe \ 24 | --eval-bleu-print-samples \ 25 | --best-checkpoint-metric bleu \ 26 | --maximize-best-checkpoint-metric \ 27 | --fp16 \ 28 | --use-torch-layer \ 29 | --enable-quant \ 30 | --quant-mode qat \ 31 | --finetune-from-model checkpoints/checkpoint_best.pt \ 32 | --save-dir checkpoints/quant 33 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/encdec_kv_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bias_add_transform_20314.h" 3 | #include "linear.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class EncDecKvLayer : public Layer { 10 | private: 11 | LinearOp* _kv_linear = nullptr; 12 | BiasAddTrans20314* _bias_add_transform_20314 = nullptr; 13 | 14 | // parameters 15 | Variable* _enc_kvw; 16 | Variable* _enc_kvb; 17 | 18 | // shape related 19 | size_t _layer_id; 20 | size_t _nshared_layer; 21 | size_t _batch_tokens; 22 | size_t _max_batch_tokens; 23 | size_t _hidden_size; 24 | size_t _heads; 25 | 26 | public: 27 | EncDecKvLayer(size_t nshared_layer, size_t max_batch_tokens, 28 | size_t hidden_size, size_t num_heads); 29 | 30 | virtual ~EncDecKvLayer() {} 31 | 32 | Variable* operator()(Variable* enc_out); 33 | 34 | void before_forward(size_t batch_size, size_t seq_len); 35 | 36 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 37 | 38 | int load_params(const std::vector& para_vec, int offset); 39 | }; 40 | 41 | template class EncDecKvLayer; 42 | #ifdef LIGHTSEQ_cuda 43 | template class EncDecKvLayer<__half, __half>; 44 | #endif 45 | 46 | template 47 | using EncDecKvLayerPtr = std::shared_ptr>; 48 | 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /examples/training/huggingface/bart/summarization/run_summarization.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | export TASK_NAME=summarization 19 | 20 | python3 -m torch.distributed.launch \ 21 | --nproc_per_node=1 \ 22 | $THIS_DIR/run_summarization.py \ 23 | --model_name_or_path facebook/bart-base \ 24 | --do_train \ 25 | --do_eval \ 26 | --dataset_name cnn_dailymail \ 27 | --dataset_config "3.0.0" \ 28 | --output_dir /tmp/$TASK_NAME \ 29 | --max_source_length 128 \ 30 | --per_device_train_batch_size 32 \ 31 | --per_device_eval_batch_size 32 \ 32 | --overwrite_output_dir \ 33 | --seed 1234 \ 34 | --logging_steps 10 \ 35 | --fp16 \ 36 | --predict_with_generate 37 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_glue/run_quant_glue.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The LightSeq Team 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | THIS_DIR=$(dirname $(readlink -f $0)) 17 | 18 | export TASK_NAME=sst2 19 | 20 | python3 -m torch.distributed.launch \ 21 | --nproc_per_node=1 \ 22 | $THIS_DIR/run_glue.py \ 23 | --model_name_or_path bert-base-cased \ 24 | --task_name $TASK_NAME \ 25 | --do_train \ 26 | --do_eval \ 27 | --max_seq_length 128 \ 28 | --per_device_train_batch_size 32 \ 29 | --learning_rate 2e-6 \ 30 | --num_train_epochs 20 \ 31 | --output_dir /tmp/quant/$TASK_NAME/ \ 32 | --overwrite_output_dir \ 33 | --resume_from_checkpoint /tmp/$TASK_NAME/ \ 34 | --fp16 \ 35 | --seed 1234 \ 36 | --logging_steps 10 \ 37 | --module_type 1 \ 38 | --enable_quant true 39 | -------------------------------------------------------------------------------- /lightseq/csrc/ops/includes/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "kernels.h" 10 | 11 | using namespace std; 12 | namespace lightseq { 13 | namespace cuda { 14 | 15 | template 16 | class Softmax { 17 | public: 18 | struct Config { 19 | size_t nhead; 20 | bool mask_future; 21 | Config(size_t nhead, bool mask_future = false) 22 | : nhead(nhead), mask_future(mask_future) {} 23 | }; 24 | 25 | Softmax(Config config) : config_(config) {} 26 | 27 | ~Softmax() {} 28 | 29 | void Forward(T *vals, const T *attn_mask, int batch_size, int from_len, 30 | int to_len, cudaStream_t &stream, bool mask_future = false) { 31 | launch_attn_softmax(vals, attn_mask, batch_size, config_.nhead, from_len, 32 | to_len, config_.mask_future | mask_future, stream); 33 | } 34 | 35 | void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len, 36 | int to_len, cudaStream_t stream) { 37 | launch_attn_softmax_bw(out_grad, soft_out, 38 | batch_size * config_.nhead * from_len, to_len, 39 | stream); 40 | } 41 | 42 | private: 43 | Config config_; 44 | }; 45 | } // namespace cuda 46 | } // namespace lightseq 47 | 
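For reference, a minimal sketch of driving the legacy lightseq::cuda::Softmax wrapper declared above. The head count, shapes, and device buffers are illustrative assumptions; d_scores is expected to already hold batch_size * nhead * from_len * to_len attention logits on the GPU.

#include <cuda_runtime.h>
#include "softmax.h"

// Hypothetical driver: runs the fused attention softmax in place on d_scores.
void run_attn_softmax(float* d_scores, const float* d_attn_mask, int batch_size,
                      int from_len, int to_len, cudaStream_t stream) {
  using lightseq::cuda::Softmax;
  // nhead = 8 is an assumed model dimension; mask_future = false keeps full attention.
  Softmax<float>::Config cfg(/*nhead=*/8, /*mask_future=*/false);
  Softmax<float> softmax(cfg);
  softmax.Forward(d_scores, d_attn_mask, batch_size, from_len, to_len, stream);
}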
-------------------------------------------------------------------------------- /lightseq/training/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | from lightseq.training.pytorch_quantization.nn.modules.tensor_quantizer import * 20 | from lightseq.training.pytorch_quantization.nn.modules.quant_conv import * 21 | from lightseq.training.pytorch_quantization.nn.modules.quant_linear import * 22 | from lightseq.training.pytorch_quantization.nn.modules.quant_pooling import * 23 | from lightseq.training.pytorch_quantization.nn.modules.clip import * 24 | from lightseq.training.pytorch_quantization.nn.modules.quant_rnn import * 25 | from lightseq.training.pytorch_quantization.nn.modules.quant_bert import * 26 | from lightseq.training.pytorch_quantization.nn.modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /examples/inference/benchmark_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=gpt2 7 | model_name=$model_full_name 8 | all_log=$CUR_DIR/${model_name}_bench.log 9 | res_log=$CUR_DIR/${model_name}_bench.txt 10 | if [ -f $res_log ]; then 11 | rm $res_log 12 | fi 13 | if [ -f $all_log ]; then 14 | rm $all_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len topk latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for topk in 1 4 32; do 20 | for input_seq_len in 118 86 22; do 21 | output_seq_len=$((150 - $input_seq_len)) 22 | cd $CUR_DIR/python 23 | 24 | python3 generate_model.py --model_name $model_full_name --sampling_method topk \ 25 | --topk $topk --input_seq_len $input_seq_len --output_seq_len=$output_seq_len 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/gpt_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >>$all_log 33 | latency=$(tail -n 3 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $topk $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | pip3 install tabulate 40 | tabulate --header $res_log 41 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class SoftmaxOp : public Operator { 9 | private: 10 | size_t _nhead; 11 | size_t _max_batch_tokens; 12 | size_t _max_seq_len; 13 | size_t _batchs; 14 
| size_t _from_len; 15 | size_t _to_len; 16 | int _kv_size; 17 | 18 | bool _config_mask_future; 19 | bool _mask_future; 20 | 21 | Variable* _result; 22 | 23 | public: 24 | SoftmaxOp(size_t max_batch_tokens, size_t max_seq_len, size_t nhead, 25 | bool mask_future = false) 26 | : Operator("SoftmaxOp"), 27 | _max_batch_tokens(max_batch_tokens), 28 | _max_seq_len(max_seq_len), 29 | _nhead(nhead), 30 | _config_mask_future(mask_future) {} 31 | 32 | virtual ~SoftmaxOp() {} 33 | 34 | Variable* operator()(Variable* inp, Variable* mask = nullptr); 35 | 36 | void forward() override; 37 | 38 | void before_forward(size_t batchs, size_t from_len, size_t to_len, 39 | int kv_size = -1, bool mask_future = false) { 40 | _batchs = batchs; 41 | _from_len = from_len; 42 | _to_len = to_len; 43 | _kv_size = (kv_size == -1 ? to_len : kv_size); 44 | _mask_future = mask_future; 45 | _result->set_shape({_batchs, _nhead, _from_len, _to_len}); 46 | } 47 | 48 | void backward() override; 49 | }; 50 | 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /lightseq/csrc/lsflow/allocator.cpp: -------------------------------------------------------------------------------- 1 | #include "allocator.h" 2 | 3 | namespace lightseq { 4 | 5 | Allocator::Allocator() { _ptr_set.clear(); } 6 | 7 | Allocator::~Allocator() { 8 | auto _tmp_ptr_set = _ptr_set; 9 | for (auto iter : _tmp_ptr_set) { 10 | try { 11 | free_mem(iter); 12 | } catch (...) { 13 | // printf("execute ~Allocator() free_mem %p failed!\n", iter); 14 | } 15 | } 16 | _ptr_set.clear(); 17 | } 18 | 19 | char* Allocator::malloc_mem(size_t size) { 20 | char* ptr = nullptr; 21 | 22 | try { 23 | #ifdef LIGHTSEQ_cuda 24 | ptr = cuda::cuda_malloc(size); 25 | #else 26 | ptr = (char*)malloc(size); 27 | #endif 28 | } catch (...) { 29 | std::string error_message = 30 | "allocate memory failed! 
size is: " + std::to_string((size / MB_SIZE)) + 31 | " MB\n"; 32 | printf("%s", error_message.c_str()); 33 | throw std::runtime_error(error_message); 34 | } 35 | if (_ptr_set.find(ptr) != _ptr_set.end()) { 36 | printf("allocate same address with twice.\n"); 37 | throw std::runtime_error("allocate same address with twice.\n"); 38 | } 39 | _ptr_set.insert(ptr); 40 | return ptr; 41 | } 42 | 43 | void Allocator::free_mem(char* ptr) { 44 | if (_ptr_set.find(ptr) == _ptr_set.end() || ptr == nullptr) { 45 | return; 46 | } 47 | _ptr_set.erase(ptr); 48 | #ifdef LIGHTSEQ_cuda 49 | cuda::cuda_free(ptr); 50 | #else 51 | free(ptr); 52 | #endif 53 | } 54 | 55 | } // namespace lightseq 56 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/llama_mlp_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rms_layer_norm.h" 4 | #include "linear.h" 5 | #include "act_elewise_product.h" 6 | #include "fuse_add2_op.h" 7 | #include "layer.h" 8 | 9 | namespace lightseq { 10 | 11 | template 12 | class LlamaMLPLayer : public Layer { 13 | private: 14 | // operators 15 | RMSLayerNormalizeOp* _mlp_ln = nullptr; 16 | LinearOp* _gate_up_linear = nullptr; 17 | LinearOp* _down_linear = nullptr; 18 | ActElewiseProductOp* _act_product = nullptr; 19 | FuseAdd2Op* _add_residual = nullptr; 20 | 21 | // parameters 22 | Variable* _norm_scale; 23 | Variable* _gate_up_linear_weight; 24 | Variable* _down_linear_weight; 25 | 26 | // shape related 27 | int _max_batch_tokens; 28 | size_t _hidden_dim; 29 | size_t _inner_dim; 30 | 31 | public: 32 | LlamaMLPLayer(int max_batch_tokens, int hidden_dim, int inner_dim); 33 | 34 | virtual ~LlamaMLPLayer() {} 35 | 36 | Variable* operator()(Variable* inp); 37 | 38 | void before_forward(int batch_size, int seq_len); 39 | 40 | int load_params(const std::vector& para_vec, int offset); 41 | }; 42 | 43 | template class LlamaMLPLayer; 44 | #ifdef LIGHTSEQ_cuda 45 | template class LlamaMLPLayer<__half, __half>; 46 | #endif 47 | 48 | template 49 | using LlamaMLPLayerPtr = std::shared_ptr>; 50 | 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/fuse_rotary_position_qkv.cpp: -------------------------------------------------------------------------------- 1 | #include "fuse_rotary_position_qkv.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* RotaryPositionQk::operator()(Variable* inp, Variable* cache_k, 7 | Variable* cache_v) { 8 | size_t max_size = _max_batch_size * _max_step * _head_num * _head_dim; 9 | _result = new Variable("RotaryPositionQk_out", max_size, g_dtype(), 10 | g_dtype()); 11 | set_parents({inp, cache_k, cache_v}); 12 | this->set_children({_result}); 13 | return _result; 14 | } 15 | 16 | template 17 | void RotaryPositionQk::forward() { 18 | T1* inp_val = (T1*)parent(0)->value(); 19 | T1* cache_k_val = (T1*)parent(1)->value(); 20 | T1* cache_v_val = (T1*)parent(2)->value(); 21 | 22 | T1* out_val = (T1*)child(0)->value(); 23 | 24 | if (!_context_ptr->is_built()) { 25 | return; 26 | } 27 | 28 | #ifdef LIGHTSEQ_cuda 29 | cudaStream_t stream = _context_ptr->get_stream(); 30 | cuda::launch_split_rotary_position_qkv( 31 | inp_val, _device_sin_ptr, _device_cos_ptr, out_val, cache_k_val, 32 | cache_v_val, _max_step, _batch_size, _head_num, _offset_seq_len, 33 | _query_len, _head_dim, stream); 34 | #endif 35 | } 36 | 37 | template class RotaryPositionQk; 38 | #ifdef 
LIGHTSEQ_cuda 39 | template class RotaryPositionQk<__half, __half>; 40 | #endif 41 | } // namespace lightseq 42 | -------------------------------------------------------------------------------- /examples/inference/benchmark_quant_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=/tmp/quant/test-clm/pytorch_model.bin 7 | model_name=quant_gpt2 8 | all_log=$CUR_DIR/${model_name}_bench.log 9 | res_log=$CUR_DIR/${model_name}_bench.txt 10 | if [ -f $res_log ]; then 11 | rm $res_log 12 | fi 13 | if [ -f $all_log ]; then 14 | rm $all_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len topk latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for topk in 1 4 32; do 20 | for input_seq_len in 118 86 22; do 21 | output_seq_len=$((150 - $input_seq_len)) 22 | cd $CUR_DIR/python 23 | 24 | python3 generate_model.py --model_name $model_full_name --sampling_method topk \ 25 | --topk $topk --input_seq_len $input_seq_len --output_seq_len=$output_seq_len --enable_quant true 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/quant_gpt_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >>$all_log 33 | latency=$(tail -n 3 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $topk $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | pip3 install tabulate 40 | tabulate --header $res_log 41 | -------------------------------------------------------------------------------- /examples/inference/benchmark_bart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=facebook/bart-base 7 | model_name=$(echo $model_full_name | cut -d "/" -f 2) 8 | all_log=$CUR_DIR/${model_name}_bench.log 9 | res_log=$CUR_DIR/${model_name}_bench.txt 10 | if [ -f $res_log ]; then 11 | rm $res_log 12 | fi 13 | if [ -f $all_log ]; then 14 | rm $all_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len beam_size latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for beam_size in 1 4 32; do 20 | for input_seq_len in 8 16 32 64; do 21 | output_seq_len=$input_seq_len 22 | cd $CUR_DIR/python 23 | 24 | python3 generate_model.py --model_name $model_full_name --sampling_method beam_search \ 25 | --beam_size $beam_size --input_seq_len $input_seq_len --output_seq_len=$output_seq_len 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/transformer_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >>$all_log 33 | latency=$(tail -n 5 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $beam_size $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | pip3 install tabulate 40 | tabulate --header $res_log 41 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/sdpa_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "dropout.h" 3 | #include "softmax.h" 4 | #include "strided_batch_gemm.h" 5 | #include "layer.h" 6 | 7 | namespace lightseq { 8 | 9 | /* 10 | Scaled 
Dot Product Attention 11 | See paper "Attention is all you need" for details. 12 | */ 13 | template <typename T1, typename T2> 14 | class SDPALayer : public Layer { 15 | private: 16 | // operators 17 | StridedBatchGemmOp<T1, T2>* _attn_scores = nullptr; 18 | SoftmaxOp<T1, T2>* _softmax = nullptr; 19 | DropoutOp<T1, T2>* _attn_prob_dropout = nullptr; 20 | StridedBatchGemmOp<T1, T2>* _attn_context = nullptr; 21 | 22 | // shape related 23 | int _max_batch_tokens; 24 | int _max_seq_len; 25 | int _nhead; 26 | int _head_dim; 27 | 28 | public: 29 | SDPALayer(size_t max_batch_tokens, size_t max_seq_len, size_t head_dim, 30 | size_t num_heads, float attn_prob_dropout_ratio); 31 | 32 | virtual ~SDPALayer() {} 33 | 34 | // mask is for enc-self attention and enc-dec-cross attention 35 | Variable* operator()(Variable* query, Variable* key, Variable* value, 36 | Variable* mask = nullptr); 37 | 38 | void before_forward(int batch_size, int query_len, int kv_len, int kv_size, 39 | bool mask_future); 40 | }; 41 | 42 | template class SDPALayer<__half, __half>; 43 | template class SDPALayer<float, float>; 44 | 45 | template <typename T1, typename T2> 46 | using SDPALayerPtr = std::shared_ptr<SDPALayer<T1, T2>>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/concat3_dim1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | template <typename T1, typename T2> 9 | class Concat3Dim1 : public Operator { 10 | private: 11 | bool _is_skip = false; 12 | bool _is_continuous_cache; 13 | 14 | size_t _mx_sz0; 15 | size_t _mx_sz1; 16 | size_t _mx_sz2; 17 | 18 | size_t _sz0; 19 | size_t _sz1_0; 20 | size_t _sz1_1; 21 | size_t _layer_id; 22 | 23 | Variable* _new_cache; 24 | 25 | public: 26 | Concat3Dim1(size_t mx_sz0, size_t mx_sz1, size_t mx_sz2, size_t layer_id, 27 | bool is_continuous_cache) 28 | : Operator("Concat3Dim1"), 29 | _mx_sz0(mx_sz0), 30 | _mx_sz1(mx_sz1), 31 | _mx_sz2(mx_sz2), 32 | _layer_id(layer_id), 33 | _is_continuous_cache(is_continuous_cache) {} 34 | 35 | virtual ~Concat3Dim1() {} 36 | 37 | Variable* operator()(Variable* inp, Variable* cache); 38 | 39 | void before_forward(size_t sz0, size_t sz1_0, size_t sz1_1, 40 | bool is_skip = false) { 41 | _sz0 = sz0, _sz1_0 = sz1_0, _sz1_1 = sz1_1, _is_skip = is_skip; 42 | if (_is_continuous_cache) { 43 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 44 | } else { 45 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 46 | } 47 | } 48 | 49 | void forward() override; 50 | 51 | void before_backward() {} 52 | 53 | void backward() override; 54 | }; 55 | } // namespace lightseq 56 | -------------------------------------------------------------------------------- /examples/inference/benchmark_quant_bart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$(realpath "$0") 4 | CUR_DIR=$(dirname "$SCRIPT") 5 | 6 | model_full_name=facebook/bart-base 7 | model_name=$(echo $model_full_name | cut -d "/" -f 2) 8 | all_log=$CUR_DIR/quant_${model_name}_bench.log 9 | res_log=$CUR_DIR/quant_${model_name}_bench.txt 10 | if [ -f $all_log ]; then 11 | rm $all_log 12 | fi 13 | if [ -f $res_log ]; then 14 | rm $res_log 15 | fi 16 | echo "batch_size input_seq_len output_seq_len beam_size latency" >>$res_log 17 | 18 | for batch_size in 1 8 32; do 19 | for beam_size in 1 4 32; do 20 | for input_seq_len in 16 32 64; do 21 | output_seq_len=$input_seq_len 22 | cd $CUR_DIR/python 23 | 24 | python3 
generate_model.py --model_name $model_full_name --sampling_method beam_search \ 25 | --beam_size $beam_size --input_seq_len $input_seq_len --output_seq_len=$output_seq_len 26 | model_path=$(realpath lightseq_${model_name}_bench.hdf5) 27 | 28 | cd $CUR_DIR/../../build 29 | ./examples/inference/cpp/quant_transformer_example \ 30 | $model_path $batch_size $input_seq_len |& tee temp.log 31 | 32 | cat temp.log >> $all_log 33 | latency=$(tail -n 5 temp.log | head -n 1 | awk '{print $4}') 34 | echo "$batch_size $input_seq_len $output_seq_len $beam_size $latency" >>$res_log 35 | rm temp.log 36 | done 37 | done 38 | done 39 | 40 | pip3 install tabulate 41 | tabulate --header $res_log 42 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_enc_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template 9 | class LaunchEncEmbOp : public Operator { 10 | private: 11 | size_t _max_batch_tokens; 12 | int _pad_id; 13 | size_t _hidden_dim; 14 | size_t _multilg_type; 15 | 16 | size_t _batch_size; 17 | size_t _seq_len; 18 | 19 | Variable* _result; 20 | Variable* _pad_mask; 21 | 22 | public: 23 | LaunchEncEmbOp(size_t max_batch_tokens, int pad_id, size_t hidden_dim, 24 | size_t multilg_type) 25 | : Operator("LaunchEncEmbOp"), 26 | _max_batch_tokens(max_batch_tokens), 27 | _pad_id(pad_id), 28 | _hidden_dim(hidden_dim), 29 | _multilg_type(multilg_type) {} 30 | 31 | virtual ~LaunchEncEmbOp() {} 32 | 33 | std::tuple operator()(Variable* inp_tokens, 34 | Variable* token_emb, 35 | Variable* pos_emb, 36 | Variable* lang_emb, 37 | Variable* lang_id); 38 | 39 | void before_forward(size_t batch_size, size_t seq_len) { 40 | _batch_size = batch_size, _seq_len = seq_len; 41 | } 42 | 43 | void forward() override; 44 | 45 | void backward() override { 46 | printf("ERROR! 
LaunchEncEmbOp can't cal backward()\n"); 47 | exit(-1); 48 | } 49 | }; 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/gpt_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | #include "feed_forward_layer.h" 4 | #include "gpt_attention_layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class GptLayer : public Layer { 10 | private: 11 | GptAttentionLayerPtr _attn_layer; 12 | FeedForwardLayerPtr _ffn_layer; 13 | 14 | int _layer_id; 15 | 16 | public: 17 | GptLayer(int layer_id, int max_batch_tokens, int max_seq_len, int hidden_size, 18 | int num_heads, int intermediate_size, float attn_prob_dropout_ratio, 19 | float activation_dropout_ratio, float hidden_output_dropout_ratio, 20 | std::string activation_fn, bool mask_future_tokens, 21 | int beam_size = 1); 22 | virtual ~GptLayer() {} 23 | 24 | Variable* operator()(Variable* inp, Variable* cache_k, Variable* cache_v, 25 | Variable* pad_mask); 26 | 27 | void before_forward(int batch_size, int seq_len, int steps) { 28 | _attn_layer->before_forward(batch_size, seq_len, steps); 29 | _ffn_layer->before_forward(batch_size, seq_len); 30 | } 31 | 32 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 33 | 34 | int load_params(const std::vector& para_vec, int offset); 35 | }; 36 | 37 | template class GptLayer; 38 | #ifdef LIGHTSEQ_cuda 39 | template class GptLayer<__half, __half>; 40 | #endif 41 | 42 | template 43 | using GptLayerPtr = std::shared_ptr>; 44 | 45 | } // namespace lightseq 46 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/transform_0213.cpp: -------------------------------------------------------------------------------- 1 | #include "transform_0213.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* Transform0213OP::operator()(Variable* inp) { 7 | _result = new Variable("Transform0213_res", _max_numel, g_dtype(), 8 | g_dtype()); 9 | set_parents({inp}); 10 | this->set_children({_result}); 11 | return _result; 12 | } 13 | 14 | template 15 | void Transform0213OP::forward() { 16 | T1* inp_ptr = (T1*)parent(0)->value(); 17 | T1* res_ptr = (T1*)child(0)->value(); 18 | 19 | if (!_context_ptr->is_built()) { 20 | return; 21 | } 22 | #ifdef LIGHTSEQ_cuda 23 | cudaStream_t _stream = _context_ptr->get_stream(); 24 | cuda::launch_transform_0213(inp_ptr, res_ptr, _sz0, _sz1, _sz2, _sz3, 25 | _stream); 26 | #endif 27 | } 28 | 29 | template 30 | void Transform0213OP::backward() { 31 | T2* inp_grad = (T1*)parent(0)->grad(); 32 | T2* out_grad = (T1*)child(0)->grad(); 33 | 34 | if (!_context_ptr->is_built()) { 35 | return; 36 | } 37 | 38 | #ifdef LIGHTSEQ_cuda 39 | cudaStream_t _stream = _context_ptr->get_stream(); 40 | cuda::launch_transform_0213(out_grad, inp_grad, _sz0, _sz1, _sz2, _sz3, 41 | _stream); 42 | #endif 43 | } 44 | 45 | template class Transform0213OP; 46 | #ifdef LIGHTSEQ_cuda 47 | template class Transform0213OP<__half, __half>; 48 | #endif 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/vit.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/vit_encoder.h" 4 | #include "../proto/vit_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType 
vit_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType vit_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Vit : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | float *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | void *d_buf_; 29 | VitWeight tw_; 30 | 31 | public: 32 | Vit(const std::string weight_path, const int max_batch_size); 33 | 34 | ~Vit(); 35 | 36 | void Infer() override; 37 | void set_input_ptr(int index, void *input_ptr) override; 38 | void set_output_ptr(int index, void *output_ptr) override; 39 | const void *get_output_ptr(int index) override; 40 | std::vector get_input_max_shape(int index) override; 41 | std::vector get_output_max_shape(int index) override; 42 | DataType get_input_dtype(int index) override; 43 | DataType get_output_dtype(int index) override; 44 | void benchmark_mode(bool is_benchmark) override{}; 45 | }; 46 | 47 | LSMODEL_REGISTER(Vit); 48 | 49 | } // namespace cuda 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/bert.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/bert_encoder.h" 4 | #include "../proto/bert_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType bert_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType bert_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Bert : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | int *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | void *d_buf_; 29 | BertWeight tw_; 30 | 31 | public: 32 | Bert(const std::string weight_path, const int max_batch_size); 33 | 34 | ~Bert(); 35 | 36 | void Infer() override; 37 | void set_input_ptr(int index, void *input_ptr) override; 38 | void set_output_ptr(int index, void *output_ptr) override; 39 | const void *get_output_ptr(int index) override; 40 | std::vector get_input_max_shape(int index) override; 41 | std::vector get_output_max_shape(int index) override; 42 | DataType get_input_dtype(int index) override; 43 | DataType get_output_dtype(int index) override; 44 | void benchmark_mode(bool is_benchmark) override{}; 45 | }; 46 | 47 | LSMODEL_REGISTER(Bert); 48 | 49 | } // namespace cuda 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/embKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | void launch_split_multilg_request(const int *req, int *src_lang_id, 9 | int *trg_lang_id, int *src_token_id, 10 | int batch_size, int req_len, 11 | cudaStream_t &stream); 12 | 13 | template 14 | void launch_enc_emb(const T *token_emb, const T *pos_emb, const int *tokens, 15 | T *output, int *pad_mask, int pad_id, int 
batch_size, 16 | int seq_len, int hidden_dim, cudaStream_t stream, 17 | const T *lang_emb, const int *lang_id, int multilg_type); 18 | 19 | template 20 | void launch_dec_emb(const T *token_emb, const T *pos_emb, int *tokens, 21 | const T *lang_emb, const int *lang_id, T *output, 22 | int batch_size, int beam_size, int hidden_dim, 23 | int vocab_size, int step, int max_step, int multilg_type, 24 | cudaStream_t stream); 25 | 26 | template 27 | void launch_patch_emb(const T *conv_weight, const T *conv_bias, 28 | const T *pos_emb, const T *cls_emb, const float *input, 29 | T *output, int patch_size, int image_size, int batch_size, 30 | int max_step, int hidden_dim, int channel_input, 31 | cudaStream_t stream); 32 | 33 | } // namespace cuda 34 | } // namespace lightseq 35 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/embKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | void launch_split_multilg_request(const int *req, int *src_lang_id, 9 | int *trg_lang_id, int *src_token_id, 10 | int batch_size, int req_len, 11 | cudaStream_t &stream); 12 | 13 | template 14 | void launch_enc_emb(const T *token_emb, const T *pos_emb, const int *tokens, 15 | T *output, T *pad_mask, int pad_id, int batch_size, 16 | int seq_len, int hidden_dim, cudaStream_t stream, 17 | const T *lang_emb, const int *lang_id, int multilg_type); 18 | 19 | template 20 | void launch_dec_emb(const T *token_emb, const T *pos_emb, int *tokens, 21 | const T *lang_emb, const int *lang_id, T *output, 22 | int batch_size, int beam_size, int hidden_dim, 23 | int vocab_size, int step, int max_step, int multilg_type, 24 | cudaStream_t stream); 25 | 26 | template 27 | void launch_patch_emb(const T *conv_weight, const T *conv_bias, 28 | const T *pos_emb, const T *cls_emb, const float *input, 29 | T *output, int patch_size, int image_size, int batch_size, 30 | int max_step, int hidden_dim, int channel_input, 31 | cudaStream_t stream); 32 | 33 | } // namespace cuda 34 | } // namespace lightseq 35 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/quant_vit.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/quant_vit_encoder.h" 4 | #include "../proto/quant_vit_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType vit_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType vit_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class QuantVit : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | float *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | QuantVitWeight tw_; 29 | 30 | public: 31 | QuantVit(const std::string weight_path, const int max_batch_size); 32 | 33 | ~QuantVit(); 34 | 35 | void Infer() override; 36 | void set_input_ptr(int index, void *input_ptr) override; 37 | void set_output_ptr(int index, void *output_ptr) override; 38 | const void *get_output_ptr(int index) override; 39 | std::vector get_input_max_shape(int index) override; 40 | 
std::vector get_output_max_shape(int index) override; 41 | DataType get_input_dtype(int index) override; 42 | DataType get_output_dtype(int index) override; 43 | void benchmark_mode(bool is_benchmark) override{}; 44 | }; 45 | 46 | LSMODEL_REGISTER(QuantVit); 47 | 48 | } // namespace cuda 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/quant_bert.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/quant_bert_encoder.h" 4 | #include "../proto/quant_bert_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType bert_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType bert_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class QuantBert : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | int *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | QuantBertWeight tw_; 29 | 30 | public: 31 | QuantBert(const std::string weight_path, const int max_batch_size); 32 | 33 | ~QuantBert(); 34 | 35 | void Infer() override; 36 | void set_input_ptr(int index, void *input_ptr) override; 37 | void set_output_ptr(int index, void *output_ptr) override; 38 | const void *get_output_ptr(int index) override; 39 | std::vector get_input_max_shape(int index) override; 40 | std::vector get_output_max_shape(int index) override; 41 | DataType get_input_dtype(int index) override; 42 | DataType get_output_dtype(int index) override; 43 | void benchmark_mode(bool is_benchmark) override{}; 44 | }; 45 | 46 | LSMODEL_REGISTER(QuantBert); 47 | 48 | } // namespace cuda 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- /examples/training/fairseq/ls_finetune_bart/ls_fairseq_summarization_cnn_dm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/cnn_dm-bin" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/cnn_dm_data/databin_cnn_dm.tar.gz -P /tmp 9 | tar -xvf /tmp/databin_cnn_dm.tar.gz -C /tmp && rm /tmp/databin_cnn_dm.tar.gz 10 | fi 11 | 12 | if [ ! 
-d "/tmp/bart.large" ]; then 13 | echo "Downloading pretrained model" 14 | wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz -P /tmp 15 | tar -zxvf /tmp/bart.large.tar.gz -C /tmp && rm /tmp/bart.large.tar.gz 16 | fi 17 | 18 | lightseq-train /tmp/cnn_dm-bin \ 19 | --restore-file /tmp/bart.large/model.pt \ 20 | --max-tokens 2048 \ 21 | --task translation \ 22 | --source-lang source --target-lang target \ 23 | --truncate-source \ 24 | --layernorm-embedding \ 25 | --share-all-embeddings \ 26 | --reset-optimizer --reset-dataloader --reset-meters \ 27 | --required-batch-size-multiple 1 \ 28 | --arch ls_bart_large \ 29 | --criterion ls_label_smoothed_cross_entropy \ 30 | --label-smoothing 0.1 \ 31 | --dropout 0.1 --attention-dropout 0.1 \ 32 | --weight-decay 0.01 --optimizer ls_adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \ 33 | --clip-norm 0.1 \ 34 | --lr-scheduler polynomial_decay --lr 3e-05 --total-num-update 20000 --warmup-updates 500 \ 35 | --fp16 --update-freq 1 \ 36 | --skip-invalid-size-inputs-valid-test \ 37 | --find-unused-parameters 38 | -------------------------------------------------------------------------------- /lightseq/csrc/triton_backend/src/triton_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #include "triton/backend/backend_common.h" 3 | #include "triton/core/tritonserver.h" 4 | #include "model_base.h" 5 | 6 | TRITONSERVER_DataType transform_triton_datatype_to_lightseq( 7 | ::lightseq::cuda::DataType data_type_) { 8 | switch (data_type_) { 9 | case ::lightseq::cuda::DataType::kNotSupported: 10 | return TRITONSERVER_TYPE_INVALID; 11 | case ::lightseq::cuda::DataType::kFloat32: 12 | return TRITONSERVER_TYPE_FP32; 13 | case ::lightseq::cuda::DataType::kInt32: 14 | return TRITONSERVER_TYPE_INT32; 15 | case ::lightseq::cuda::DataType::kInt64: 16 | return TRITONSERVER_TYPE_INT64; 17 | case ::lightseq::cuda::DataType::kFloat16: 18 | return TRITONSERVER_TYPE_FP16; 19 | case ::lightseq::cuda::DataType::kInt8: 20 | return TRITONSERVER_TYPE_INT8; 21 | case ::lightseq::cuda::DataType::kInt16: 22 | return TRITONSERVER_TYPE_INT16; 23 | case ::lightseq::cuda::DataType::kByte: 24 | return TRITONSERVER_TYPE_BYTES; 25 | case ::lightseq::cuda::DataType::kUInt8: 26 | return TRITONSERVER_TYPE_UINT8; 27 | case ::lightseq::cuda::DataType::kUInt16: 28 | return TRITONSERVER_TYPE_UINT16; 29 | case ::lightseq::cuda::DataType::kUInt32: 30 | return TRITONSERVER_TYPE_UINT32; 31 | case ::lightseq::cuda::DataType::kUInt64: 32 | return TRITONSERVER_TYPE_UINT64; 33 | case ::lightseq::cuda::DataType::kFloat64: 34 | return TRITONSERVER_TYPE_FP64; 35 | default: 36 | return TRITONSERVER_TYPE_INVALID; 37 | } 38 | return TRITONSERVER_TYPE_INVALID; 39 | } 40 | -------------------------------------------------------------------------------- /lightseq/inference/triton_backend/src/triton_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #include "triton/backend/backend_common.h" 3 | #include "triton/core/tritonserver.h" 4 | #include "model_base.h" 5 | 6 | TRITONSERVER_DataType transform_triton_datatype_to_lightseq( 7 | ::lightseq::cuda::DataType data_type_) { 8 | switch (data_type_) { 9 | case ::lightseq::cuda::DataType::kNotSupported: 10 | return TRITONSERVER_TYPE_INVALID; 11 | case ::lightseq::cuda::DataType::kFloat32: 12 | return TRITONSERVER_TYPE_FP32; 13 | case ::lightseq::cuda::DataType::kInt32: 14 | return TRITONSERVER_TYPE_INT32; 15 | case ::lightseq::cuda::DataType::kInt64: 16 | return 
TRITONSERVER_TYPE_INT64; 17 | case ::lightseq::cuda::DataType::kFloat16: 18 | return TRITONSERVER_TYPE_FP16; 19 | case ::lightseq::cuda::DataType::kInt8: 20 | return TRITONSERVER_TYPE_INT8; 21 | case ::lightseq::cuda::DataType::kInt16: 22 | return TRITONSERVER_TYPE_INT16; 23 | case ::lightseq::cuda::DataType::kByte: 24 | return TRITONSERVER_TYPE_BYTES; 25 | case ::lightseq::cuda::DataType::kUInt8: 26 | return TRITONSERVER_TYPE_UINT8; 27 | case ::lightseq::cuda::DataType::kUInt16: 28 | return TRITONSERVER_TYPE_UINT16; 29 | case ::lightseq::cuda::DataType::kUInt32: 30 | return TRITONSERVER_TYPE_UINT32; 31 | case ::lightseq::cuda::DataType::kUInt64: 32 | return TRITONSERVER_TYPE_UINT64; 33 | case ::lightseq::cuda::DataType::kFloat64: 34 | return TRITONSERVER_TYPE_FP64; 35 | default: 36 | return TRITONSERVER_TYPE_INVALID; 37 | } 38 | return TRITONSERVER_TYPE_INVALID; 39 | } 40 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_dec_emb_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template 9 | class LaunchDecEmbOp : public Operator { 10 | private: 11 | size_t _max_batch_tokens; 12 | size_t _beam_size; 13 | size_t _hidden_size; 14 | size_t _trg_vocab_size; 15 | size_t _max_step; 16 | size_t _multilg_type; 17 | 18 | size_t _batch_size; 19 | int _cur_step; 20 | 21 | Variable* _result; 22 | 23 | public: 24 | LaunchDecEmbOp(int max_batch_tokens, size_t beam_size, size_t hidden_size, 25 | size_t trg_vocab_size, size_t max_step, size_t multilg_type) 26 | : Operator("LaunchDecEmbOp"), 27 | _max_batch_tokens(max_batch_tokens), 28 | _beam_size(beam_size), 29 | _hidden_size(hidden_size), 30 | _trg_vocab_size(trg_vocab_size), 31 | _max_step(max_step), 32 | _multilg_type(multilg_type) {} 33 | 34 | virtual ~LaunchDecEmbOp() {} 35 | 36 | Variable* operator()(Variable* inp_tokens, Variable* token_emb, 37 | Variable* pos_emb, Variable* lang_emb, 38 | Variable* lang_id); 39 | 40 | void before_forward(size_t batch_size, int cur_step) { 41 | _batch_size = batch_size, _cur_step = cur_step; 42 | _result->set_shape( 43 | {batch_size, size_t(cur_step + 1), _beam_size, _hidden_size}); 44 | } 45 | 46 | void forward() override; 47 | 48 | void backward() override { 49 | printf("ERROR! 
LaunchDecEmbOp can't cal backward()\n"); 50 | exit(-1); 51 | } 52 | }; 53 | } // namespace lightseq 54 | -------------------------------------------------------------------------------- /lightseq/csrc/models/includes/bert.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model_base.h" 3 | 4 | #include "bert_weight.h" 5 | 6 | #include "launch_enc_emb_layer.h" 7 | #include "transformer_encoder_layer.h" 8 | #include "lyr_normalize_layer.h" 9 | 10 | namespace lightseq { 11 | namespace cuda { 12 | 13 | class Bert : public LSModel { 14 | private: 15 | BertWeight tw_; 16 | std::shared_ptr _context_ptr; 17 | 18 | LaunchEncEmbLayerPtr launch_enc_emb_layer; 19 | std::vector > enc_layer_vec; 20 | LyrNormalizeLayerPtr lyr_norm_layer; 21 | 22 | ContextPtr context_ptr; 23 | 24 | Variable* inp_tokens; // need to allocate 25 | Variable* token_emb; 26 | Variable* pos_emb; 27 | Variable* lang_emb; 28 | Variable* lang_id; 29 | 30 | Variable* bert_out; 31 | 32 | int _max_batch_size; 33 | 34 | public: 35 | Bert(const std::string weight_path, const int max_batch_size); 36 | ~Bert(); 37 | 38 | void before_forward(int batch_size, int seq_len); 39 | 40 | void Infer() override; 41 | void set_input_ptr(int index, void* input_ptr) override; 42 | void set_output_ptr(int index, void* output_ptr) override; 43 | const void* get_output_ptr(int index) override; 44 | std::vector get_input_max_shape(int index) override; 45 | std::vector get_output_max_shape(int index) override; 46 | DataType get_input_dtype(int index) override; 47 | DataType get_output_dtype(int index) override; 48 | void benchmark_mode(bool is_benchmark) override {} 49 | }; 50 | 51 | LSMODEL_REGISTER(Bert); 52 | 53 | } // namespace cuda 54 | } // namespace lightseq 55 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/llama_layer.cpp: -------------------------------------------------------------------------------- 1 | #include "llama_layer.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | LlamaLayer::LlamaLayer(int max_batch_size, int max_seq_len, 7 | int hidden_size, int inner_dim, int num_heads, 8 | int beam_size) 9 | : Layer("LlamaLayer") { 10 | _attn_layer.reset(new LlamaAttentionLayer( 11 | max_batch_size, max_seq_len, hidden_size, num_heads, beam_size)); 12 | _mlp_layer.reset(new LlamaMLPLayer(max_batch_size * max_seq_len, 13 | hidden_size, inner_dim)); 14 | 15 | this->_context_ptr->exit_layer(); // necessary 16 | } 17 | 18 | template 19 | Variable* LlamaLayer::operator()(Variable* inp, Variable* cache_k, 20 | Variable* cache_v, 21 | Variable* pad_mask) { 22 | set_inputs({inp, cache_k, cache_v, pad_mask}); 23 | 24 | Variable* attn_out = (*_attn_layer)(inp, cache_k, cache_v, pad_mask); 25 | 26 | Variable* ffn_out = (*_mlp_layer)(attn_out); 27 | 28 | set_outputs({ffn_out}); 29 | return ffn_out; 30 | } 31 | 32 | template 33 | int LlamaLayer::load_params(const std::vector& para_vec, 34 | int offset) { // for inference 35 | int size = 0; 36 | 37 | size += _attn_layer->load_params(para_vec, offset + size); 38 | 39 | size += _mlp_layer->load_params(para_vec, offset + size); 40 | 41 | return size; 42 | } 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /.github/workflows/build_check.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: build 4 | 5 | on: 
6 | pull_request: 7 | branches: [master] 8 | push: 9 | paths-ignore: 10 | - 'docs/**' 11 | branches: [master] 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | format: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | # Steps represent a sequence of tasks that will be executed as part of the job 21 | steps: 22 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 23 | - uses: actions/checkout@v2 24 | 25 | # Runs a single command using the runners shell 26 | - name: install pre-commit 27 | run: | 28 | pip install pre-commit 29 | sudo apt-get install -y --no-install-recommends clang-format 30 | clang-format --version 31 | 32 | # Runs a set of commands using the runners shell 33 | - name: check format 34 | run: | 35 | pre-commit run -a --show-diff-on-failure 36 | 37 | build_wheel: 38 | runs-on: ubuntu-latest 39 | container: taka23/lightseq:build-linux 40 | 41 | steps: 42 | - uses: actions/checkout@v1 43 | with: 44 | submodules: 'recursive' 45 | - name: env check 46 | run: | 47 | /opt/python/cp38-cp38/bin/python -V 48 | - name: build wheel 49 | run: | 50 | /opt/python/cp38-cp38/bin/python -m pip install -U build 51 | /opt/python/cp38-cp38/bin/python -m build 52 | -------------------------------------------------------------------------------- /tests/gemm_test/gemm_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from lightseq.training import gemm_test 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser( 8 | description="search for the best int8 gemm algorithm", 9 | usage="python gemm_test.py -hd 1024 -id 4096 -v 32000 -minb 1 -maxb 10 -d configs", 10 | ) 11 | parser.add_argument( 12 | "--hidden_dim", 13 | "-hd", 14 | type=int, 15 | help="hidden dimension of the model", 16 | ) 17 | parser.add_argument( 18 | "--inner_dim", 19 | "-id", 20 | type=int, 21 | help="inner dimension of the ffn layer", 22 | ) 23 | parser.add_argument( 24 | "--vocab_size", 25 | "-v", 26 | type=int, 27 | help="vocabulary size of the model", 28 | ) 29 | parser.add_argument( 30 | "--min_bsz", 31 | "-minb", 32 | type=int, 33 | default=1, 34 | help="minimal batch token size", 35 | ) 36 | parser.add_argument( 37 | "--max_bsz", 38 | "-maxb", 39 | type=int, 40 | default=10000, 41 | help="maximal batch token size", 42 | ) 43 | parser.add_argument( 44 | "--dir", 45 | "-d", 46 | type=str, 47 | default="/tmp/igemm_configs", 48 | help="path of the saved configs", 49 | ) 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | if __name__ == "__main__": 55 | args = parse_args() 56 | 57 | gemm_test( 58 | args.hidden_dim, 59 | args.inner_dim, 60 | args.vocab_size, 61 | args.min_bsz, 62 | args.max_bsz, 63 | args.dir, 64 | ) 65 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/dropout.cpp: -------------------------------------------------------------------------------- 1 | #include "dropout.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* DropoutOp::operator()(Variable* inp) { 7 | _result = 8 | new Variable("DropoutOp_out", _max_ele_num, g_dtype(), g_dtype()); 9 | set_parents({inp}); 10 | this->set_children({_result}); 11 | return _result; 12 | } 13 | 14 | template 15 | void DropoutOp::forward() { 16 | T1* input = parent(0)->value(); 17 | T1* output = child(0)->value(); 18 | uint8_t* mask_ptr = 
_mask->tensor(); 19 | 20 | if (!_context_ptr->is_built()) { 21 | return; 22 | } 23 | 24 | #ifdef LIGHTSEQ_cuda 25 | cudaStream_t stream = _context_ptr->get_stream(); 26 | cuda::launch_ls_dropout(output, input, mask_ptr, _count, RATIO(), stream, 27 | false); 28 | #elif defined LIGHTSEQ_x86 29 | //..... 30 | #endif 31 | } 32 | 33 | template 34 | void DropoutOp::backward() { 35 | T2* input_grad = (T2*)parent(0)->grad(); 36 | T2* output_grad = (T2*)child(0)->grad(); 37 | uint8_t* mask_ptr = (uint8_t*)_mask->tensor(); 38 | 39 | if (!_context_ptr->is_built()) { 40 | return; 41 | } 42 | 43 | if (_is_skip) { 44 | return; 45 | } 46 | 47 | #ifdef LIGHTSEQ_cuda 48 | cudaStream_t stream = _context_ptr->get_stream(); 49 | cuda::launch_ls_dropout(input_grad, output_grad, mask_ptr, _count, 50 | RATIO(), stream, true); 51 | #endif 52 | } 53 | 54 | template class DropoutOp; 55 | #ifdef LIGHTSEQ_cuda 56 | template class DropoutOp<__half, __half>; 57 | #endif 58 | } // namespace lightseq 59 | -------------------------------------------------------------------------------- /lightseq/csrc/models/includes/bert_crf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model_base.h" 3 | 4 | #include "bert_crf_weight.h" 5 | 6 | #include "launch_enc_emb_layer.h" 7 | #include "transformer_encoder_layer.h" 8 | #include "lyr_normalize_layer.h" 9 | #include "linear_layer.h" 10 | #include "crf_layer.h" 11 | 12 | namespace lightseq { 13 | namespace cuda { 14 | 15 | class BertCrf : public LSModel { 16 | private: 17 | BertCrfWeight tw_; 18 | std::shared_ptr _context_ptr; 19 | 20 | LaunchEncEmbLayerPtr launch_enc_emb_layer; 21 | std::vector > enc_layer_vec; 22 | LyrNormalizeLayerPtr lyr_norm_layer; 23 | LinearLayerPtr linear_layer; 24 | CRFLayerPtr crf_layer; 25 | 26 | ContextPtr context_ptr; 27 | 28 | Variable* inp_tokens; // need to allocate 29 | Variable* bert_out; 30 | 31 | int _max_batch_size; 32 | 33 | public: 34 | BertCrf(const std::string weight_path, const int max_batch_size); 35 | ~BertCrf(); 36 | 37 | void before_forward(int batch_size, int seq_len); 38 | 39 | void Infer() override; 40 | void set_input_ptr(int index, void* input_ptr) override; 41 | void set_output_ptr(int index, void* output_ptr) override; 42 | const void* get_output_ptr(int index) override; 43 | std::vector get_input_max_shape(int index) override; 44 | std::vector get_output_max_shape(int index) override; 45 | DataType get_input_dtype(int index) override; 46 | DataType get_output_dtype(int index) override; 47 | void benchmark_mode(bool is_benchmark) override {} 48 | }; 49 | 50 | LSMODEL_REGISTER(BertCrf); 51 | 52 | } // namespace cuda 53 | } // namespace lightseq 54 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/launch_dec_emb_op.cpp: -------------------------------------------------------------------------------- 1 | #include "launch_dec_emb_op.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | Variable* LaunchDecEmbOp::operator()(Variable* inp_tokens, 7 | Variable* token_emb, Variable* pos_emb, 8 | Variable* lang_emb, Variable* lang_id) { 9 | size_t max_size = _max_batch_tokens * _hidden_size * _beam_size; 10 | 11 | _result = 12 | new Variable("LaunchDecEmbOp_out", 13 | _max_batch_tokens * _hidden_size * _beam_size, g_dtype()); 14 | 15 | set_parents({inp_tokens, token_emb, pos_emb, lang_emb, lang_id}); 16 | 17 | this->set_children({_result}); 18 | return _result; 19 | } 20 | 21 | template 22 | void LaunchDecEmbOp::forward() 
{ 23 | int* inp_tokens = (int*)parent(0)->value(); 24 | const T* token_emb = (const T*)parent(1)->value(); 25 | const T* pos_emb = (const T* const)parent(2)->value(); 26 | T* lang_emb = (T*)parent(3)->value(); 27 | int* lang_id = (int*)parent(4)->value(); 28 | 29 | T* output_ptr = (T*)child(0)->value(); 30 | 31 | if (!_context_ptr->is_built()) { 32 | return; 33 | } 34 | 35 | #ifdef LIGHTSEQ_cuda 36 | cudaStream_t _stream = _context_ptr->get_stream(); 37 | cuda::launch_dec_emb(token_emb, pos_emb, inp_tokens, lang_emb, lang_id, 38 | output_ptr, _batch_size, _beam_size, _hidden_size, 39 | _trg_vocab_size, _cur_step, _max_step, _multilg_type, 40 | _stream); 41 | #endif 42 | } 43 | 44 | template class LaunchDecEmbOp; 45 | #ifdef LIGHTSEQ_cuda 46 | template class LaunchDecEmbOp<__half>; 47 | #endif 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/launch_enc_emb.cpp: -------------------------------------------------------------------------------- 1 | #include "launch_enc_emb.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | std::tuple LaunchEncEmbOp::operator()( 7 | Variable* inp_tokens, Variable* token_emb, Variable* pos_emb, 8 | Variable* lang_emb, Variable* lang_id) { 9 | size_t max_size = _max_batch_tokens * _hidden_dim; 10 | 11 | _result = new Variable("LaunchEncEmbOp_out", _max_batch_tokens * _hidden_dim, 12 | g_dtype()); 13 | _pad_mask = new Variable("pad_mask", _max_batch_tokens, g_dtype()); 14 | set_parents({inp_tokens, token_emb, pos_emb, lang_emb, lang_id}); 15 | this->set_children({_result, _pad_mask}); 16 | return std::make_tuple(_result, _pad_mask); 17 | } 18 | 19 | template 20 | void LaunchEncEmbOp::forward() { 21 | int* inp_tokens = (int*)parent(0)->value(); 22 | const T* token_emb = (const T*)parent(1)->value(); 23 | const T* pos_emb = (const T*)parent(2)->value(); 24 | T* lang_emb = (T*)parent(3)->value(); 25 | int* lang_id = (int*)parent(4)->value(); 26 | 27 | T* output_ptr = (T*)child(0)->value(); 28 | T* pad_mask = (T*)child(1)->value(); 29 | 30 | if (!_context_ptr->is_built()) { 31 | return; 32 | } 33 | 34 | #ifdef LIGHTSEQ_cuda 35 | cudaStream_t _stream = _context_ptr->get_stream(); 36 | cuda::launch_enc_emb(token_emb, pos_emb, inp_tokens, output_ptr, pad_mask, 37 | _pad_id, _batch_size, _seq_len, _hidden_dim, _stream, 38 | lang_emb, lang_id, _multilg_type); 39 | #endif 40 | } 41 | 42 | template class LaunchEncEmbOp; 43 | #ifdef LIGHTSEQ_cuda 44 | template class LaunchEncEmbOp<__half>; 45 | #endif 46 | } // namespace lightseq 47 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/transformer_encoder_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | #include "feed_forward_layer.h" 4 | #include "multihead_attention_layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template 9 | class TransformerEncoderLayer : public Layer { 10 | private: 11 | MultiheadAttentionLayerPtr _attn_layer; 12 | FeedForwardLayerPtr _ffn_layer; 13 | 14 | int _layer_id; 15 | 16 | public: 17 | TransformerEncoderLayer(int layer_id, int max_batch_tokens, int max_seq_len, 18 | int hidden_size, int num_heads, int intermediate_size, 19 | float attn_prob_dropout_ratio, 20 | float activation_dropout_ratio, 21 | float hidden_output_dropout_ratio, bool is_pre_ln, 22 | std::string activation_fn, bool mask_future_tokens); 23 | virtual ~TransformerEncoderLayer() {} 24 | 25 | 
Variable* operator()(Variable* inp, Variable* inp_mask); 26 | 27 | void before_forward(int batch_size, int seq_len) { 28 | _attn_layer->before_forward(batch_size, seq_len); 29 | _ffn_layer->before_forward(batch_size, seq_len); 30 | } 31 | 32 | void before_backward() { return; } 33 | 34 | size_t load_para_and_grad(const T1* para_ptr, T2* grad_ptr); 35 | 36 | int load_params(const std::vector& para_vec, int offset); 37 | }; 38 | 39 | template class TransformerEncoderLayer; 40 | #ifdef LIGHTSEQ_cuda 41 | template class TransformerEncoderLayer<__half, __half>; 42 | #endif 43 | 44 | template 45 | using TransformerEncoderLayerPtr = 46 | std::shared_ptr>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /lightseq/csrc/triton_backend/src/libtriton_minimal.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /examples/inference/python/export/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tensorflow as tf 3 | import h5py 4 | 5 | from export.proto.transformer_pb2 import Transformer 6 | from lightseq.training import export_pb2hdf5 7 | from lightseq.training import export_quant_pb2hdf5 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description="export fairseq checkpoint", usage="") 12 | parser.add_argument( 13 | "--model", 14 | "-m", 15 | type=str, 16 | default="checkpoint_best.pt", 17 | help="path of fairseq checkpoint", 18 | ) 19 | parser.add_argument( 20 | "--hdf5", 21 | "-hdf5", 22 | action="store_true", 23 | help="whether to store hdf5", 24 | ) 25 | parser.add_argument( 26 | "--generation_method", 27 | "-g", 28 | type=str, 29 | default="beam_search", 30 | choices=["beam_search", "topk_greedy", "topk", "topp", "ppl"], 31 | help="generation method", 32 | ) 33 | args = parser.parse_args() 34 | return args 35 | 36 | 37 | def save_model(transformer, pb_path, hdf5_path, hdf5): 38 | if not hdf5: 39 | try: 40 | str_model = transformer.SerializeToString() 41 | print("Writing to {0}".format(pb_path)) 42 | with tf.io.gfile.GFile(pb_path, "wb") as fout: 43 | fout.write(str_model) 44 | return pb_path 45 | except: 46 | pass 47 | 48 | print("Writing to {0}".format(hdf5_path)) 49 | f = h5py.File(hdf5_path, "w") 50 | if isinstance(transformer, Transformer): 51 | export_pb2hdf5(transformer, f) 52 | else: 53 | export_quant_pb2hdf5(transformer, f) 54 | f.close() 55 | return hdf5_path 56 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/util.cc: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | namespace lightseq { 4 | namespace x86 {} // namespace x86 5 | 6 | template 7 | void print_vec(const T* outv, std::string outn, int num_output_ele) { 8 | std::cout << outn << " address: " << outv << std::endl; 9 | printf("value: "); 10 | for (int i = 0; i < num_output_ele; i++) { 11 | std::cout << outv[i] << ", "; 12 | } 13 | std::cout << std::endl; 14 | } 15 | 16 | template <> 17 | void print_vec(const int8_t* outv, std::string outn, 18 | int num_output_ele) { 19 | std::cout << outn << " address: " << outv << std::endl; 20 | printf("value: "); 21 | for (int i = 0; i < num_output_ele; i++) { 22 | std::cout << static_cast(outv[i]) << ", "; 23 | } 24 | std::cout << std::endl; 25 | } 26 | 27 | template <> 28 | void print_vec(const uint8_t* outv, std::string outn, 29 | int num_output_ele) { 30 | std::cout << outn << " address: " << outv << std::endl; 31 | printf("value: "); 32 | for (int i = 0; i < num_output_ele; i++) { 33 | std::cout << static_cast(outv[i]) << ", "; 34 | } 35 | std::cout << std::endl; 36 | } 37 | 38 | template void print_vec(const float* outv, std::string outn, 39 | int num_output_ele); 40 | 41 | template void print_vec(const int* outv, std::string outn, 42 | int num_output_ele); 43 | 44 | template void print_vec(const int8_t* outv, std::string outn, 45 | int num_output_ele); 46 | 47 | template void print_vec(const uint8_t* outv, std::string outn, 48 | int num_output_ele); 49 | } // namespace lightseq 50 | -------------------------------------------------------------------------------- 
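The `save_model` helper in `examples/inference/python/export/util.py` above first tries to serialize the `Transformer` protobuf and silently falls back to HDF5 if serialization fails. A minimal usage sketch, assuming the `examples/inference/python` directory is importable and that the message has already been populated from a trained checkpoint (the import path and file names below are illustrative, not verified):

```python
# Hypothetical sketch of driving the save_model helper shown above.
# The import path and output file names are assumptions for illustration.
from export.proto.transformer_pb2 import Transformer
from export.util import save_model  # assumed location of the helper above

transformer = Transformer()
# ... copy weights from a trained checkpoint into the protobuf message here ...

saved_path = save_model(
    transformer,
    pb_path="transformer.pb",      # protobuf output, tried first
    hdf5_path="transformer.hdf5",  # fallback if protobuf serialization fails
    hdf5=False,                    # set True to force HDF5 export
)
print("exported to", saved_path)
```

The helper returns whichever path it actually wrote, so the caller can log it or pass it straight to the inference test scripts.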
/lightseq/inference/triton_backend/src/libtriton_minimal.ldscript: -------------------------------------------------------------------------------- 1 | # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | { 27 | global: 28 | TRITONBACKEND_*; 29 | local: *; 30 | }; 31 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/cuda_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace lightseq { 16 | 17 | /* Print vector stored in GPU memory, for debug */ 18 | template 19 | void print_vec(const T *outv, std::string outn, int num_output_ele); 20 | 21 | template 22 | void print_vec(const T *outv, std::string outn, int start, int end); 23 | 24 | namespace cuda { 25 | template 26 | void check_gpu_error(T result, char const *const func, const char *const file, 27 | int const line); 28 | 29 | #define CHECK_GPU_ERROR(val) \ 30 | ::lightseq::cuda::check_gpu_error((val), #val, __FILE__, __LINE__) 31 | 32 | template 33 | T *cuda_malloc(size_t ele_num); 34 | 35 | void cuda_free(void *pdata); 36 | 37 | template 38 | void cuda_set(T *pdata, int value, size_t ele_num); 39 | 40 | template 41 | void check_nan_inf(const T *data_ptr, int dsize, bool check_nan_inf, 42 | std::string file, int line, cudaStream_t stream); 43 | 44 | #define CHECK_NAN_INF(ptr, size, stream) \ 45 | check_nan_inf((ptr), (size), true, __FILE__, __LINE__, (stream)); \ 46 | check_nan_inf((ptr), (size), false, __FILE__, __LINE__, (stream)) 47 | 48 | template 49 | void check_2norm(const T *data_ptr, std::string tensor_name, int dsize, 50 | cudaStream_t stream); 51 | 52 | int getSMVersion(); 53 | 54 | std::string getGPUName(); 55 | } // namespace cuda 56 | } // namespace lightseq 57 | -------------------------------------------------------------------------------- /examples/triton_backend/transformer_client_example.py: -------------------------------------------------------------------------------- 1 | import tritonclient.http as httpclient 2 | import os, sys 3 | from tritonclient.utils import InferenceServerException 4 | import numpy as np 5 | 6 | 7 | if __name__ == "__main__": 8 | port = os.environ["HTTP_PORT"] 9 | http_url = "localhost:{}".format(port) 10 | triton_client = httpclient.InferenceServerClient(url=http_url, concurrency=2) 11 | 12 | model_name = "transformer_example" 13 | array_list = np.array( 14 | [ 15 | [324, 423, 5413, 1314, 1451, 4134, 946, 1467], 16 | [324, 423, 5413, 1314, 1451, 4134, 946, 1467], 17 | ], 18 | np.int32, 19 | copy=True, 20 | ) 21 | 22 | async_requests = [] 23 | 24 | for _ in range(10): 25 | inputs = [] # type: httpclient.InferInput 26 | outputs = [] 27 | inputs.append(httpclient.InferInput("source_ids", array_list.shape, "INT32")) 28 | inputs[0].set_data_from_numpy(array_list) 29 | 30 | outputs.append(httpclient.InferRequestedOutput("target_ids")) 31 | outputs.append(httpclient.InferRequestedOutput("target_scores")) 32 | 33 | async_requests.append( 34 | triton_client.async_infer( 35 | model_name=model_name, inputs=inputs, outputs=outputs 36 | ) 37 | ) 38 | 39 | for request in async_requests: 40 | result = request.get_result(block=True) 41 | 42 | if type(result) == InferenceServerException: 43 | print("error") 44 | sys.exit(0) 45 | 46 | target_ids = result.as_numpy("target_ids") 47 | target_scores = result.as_numpy("target_scores") 48 | 49 | print("target_ids: ", target_ids) 50 | print("target_scores: ", target_scores) 51 | -------------------------------------------------------------------------------- 
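`transformer_client_example.py` above issues ten asynchronous requests and then blocks on each result. For a quick connectivity check, a single blocking call is often enough; a minimal sketch using the same `tritonclient` HTTP API, where the URL, model name, and token IDs are placeholders:

```python
# Minimal synchronous variant of the async Triton client example above.
# URL, model name, and token IDs are illustrative placeholders.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

source_ids = np.array([[324, 423, 5413, 1314, 1451, 4134, 946, 1467]], dtype=np.int32)
inp = httpclient.InferInput("source_ids", list(source_ids.shape), "INT32")
inp.set_data_from_numpy(source_ids)

outputs = [
    httpclient.InferRequestedOutput("target_ids"),
    httpclient.InferRequestedOutput("target_scores"),
]

# Blocking call; returns an InferResult once the server responds.
result = client.infer(model_name="transformer_example", inputs=[inp], outputs=outputs)
print("target_ids:", result.as_numpy("target_ids"))
print("target_scores:", result.as_numpy("target_scores"))
```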
/lightseq/csrc/ops_new/includes/sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class SamplingOp : public Operator { 9 | private: 10 | GenerateMethod _generate_method; 11 | int _max_batch_size; 12 | int _max_step; 13 | int _max_thread_per_block; 14 | int _trg_vocab_size; 15 | int _topk; 16 | float _topp; 17 | int _eos_id; 18 | bool _has_logits_bias; 19 | int* _p_d_unfinished; 20 | 21 | int _batch_size; 22 | int _seq_len; 23 | int _logits_seq_len; 24 | int _prompt_len; 25 | int _cur_step; 26 | 27 | int _h_unfinished; 28 | 29 | #ifdef LIGHTSEQ_cuda 30 | curandState* _p_d_curandstate; //[batch_size] 31 | #endif 32 | 33 | Variable* _out_token_ids; 34 | Variable* _seq_score; 35 | 36 | public: 37 | SamplingOp(GenerateMethod gm, int max_batch_size, int max_step, 38 | int max_thread_per_block, int trg_vocab_size, int topk, float topp, 39 | int eos_id); 40 | 41 | virtual ~SamplingOp() {} 42 | 43 | // output: new_token_ids 44 | std::tuple operator()(Variable* logits, 45 | Variable* logit_bias, 46 | Variable* token_ids); 47 | 48 | void before_forward(int batch_size, int prompt_len, int cur_step, 49 | int logits_seq_len) { 50 | _batch_size = batch_size; 51 | _prompt_len = prompt_len; 52 | _cur_step = cur_step; 53 | _seq_len = prompt_len + cur_step; 54 | _logits_seq_len = logits_seq_len; 55 | } 56 | 57 | void forward() override; 58 | 59 | void backward() override {} 60 | 61 | bool is_stop() { return _h_unfinished == 0; } 62 | }; 63 | 64 | } // namespace lightseq 65 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/generator_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "beam_search_topk.h" 4 | #include "sampling.h" 5 | #include "layer.h" 6 | 7 | namespace lightseq { 8 | 9 | template 10 | class GeneratorLayer : public Layer { 11 | private: 12 | // operators 13 | BeamSearchTopOp* _beam_search = nullptr; 14 | SamplingOp* _sampling = nullptr; 15 | 16 | // parameters 17 | Variable* _logit_bias; 18 | size_t _trg_vocab_size; 19 | bool _has_logits_bias; 20 | 21 | GenerateMethod _generate_method; 22 | 23 | public: 24 | // this construct method is for beam_search generate method. 
25 | GeneratorLayer(GenerateMethod gm, int nshared_dec_layer, int max_batch_size, 26 | int max_step, int trg_vocab_size, int hidden_size, 27 | int max_thread_per_block, int beam_size = 0, 28 | float diverse_lambda = 0., int dim_per_head = 0, 29 | int end_id = 0, int head_num = 0, float length_penalty = 0., 30 | int topk = 0, float topp = 0, bool has_logits_bias = false); 31 | 32 | virtual ~GeneratorLayer() {} 33 | 34 | std::tuple operator()(Variable* logits, 35 | Variable* alive_seq); 36 | 37 | void before_forward(int batch_size, int prompt_len, int cur_step); 38 | 39 | void refresh_cache(Variable* caches_k, Variable* caches_v); 40 | 41 | int load_params(const std::vector& para_vec, int offset); 42 | 43 | bool is_stop(); 44 | }; 45 | 46 | template class GeneratorLayer; 47 | #ifdef LIGHTSEQ_cuda 48 | template class GeneratorLayer<__half>; 49 | #endif 50 | 51 | template 52 | using GeneratorLayerPtr = std::shared_ptr>; 53 | 54 | } // namespace lightseq 55 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_ner/run_gcq_ner.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | # You can use multiple NICs in NCCL communication. 18 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 19 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 20 | 21 | # Set your environment variables according to your training environment, 22 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 23 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 24 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 25 | --master_port=$WORKER_0_PORT \ 26 | $THIS_DIR/run_gcq_ner.py \ 27 | --model_name_or_path bert-base-uncased \ 28 | --dataset_name conll2003 \ 29 | --do_train \ 30 | --do_eval \ 31 | --per_device_train_batch_size 16 \ 32 | --num_train_epochs 10 \ 33 | --output_dir /tmp/test-ner \ 34 | --overwrite_output_dir \ 35 | --fp16 \ 36 | --seed 1234 \ 37 | --logging_steps 10 \ 38 | --module_type 2 \ 39 | --enable_quant false \ 40 | --enable_GCQ true \ 41 | --GCQ_quantile 0.99 42 | -------------------------------------------------------------------------------- /lightseq/training/gcq/ls_fs_gcq_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.distributed as dist 4 | from torch.nn.parallel import DistributedDataParallel 5 | from fairseq.trainer import Trainer 6 | from packaging import version 7 | from .gcq import GCQState, encode_and_decode 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class LSTrainer(Trainer): 13 | """ 14 | Main class for data parallel. 
15 | 16 | This class supports GCQ (Gradient Communication Quantization) for 17 | distributed multi-machine training based on fairseq.trainer.Trainer. 18 | """ 19 | 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | 23 | @property 24 | def model(self): 25 | if self._wrapped_model is None: 26 | super().model 27 | if isinstance(self._wrapped_model, DistributedDataParallel) and getattr( 28 | self.args, "enable_GCQ", False 29 | ): 30 | assert version.parse(torch.__version__) >= version.parse( 31 | "1.10" 32 | ), "Training with GCQ requires that the version of torch has to be greater than or equal to 1.10!" 33 | state = GCQState( 34 | process_group=dist.group.WORLD if dist.is_initialized() else None, 35 | hidden_size=self.args.encoder_embed_dim, 36 | quantile_value=self.args.GCQ_quantile, 37 | ) 38 | # Register the communication hook. 39 | self._wrapped_model.register_comm_hook( 40 | state=state, hook=encode_and_decode 41 | ) 42 | logger.info("############ register communication hook done ###########") 43 | return self._wrapped_model 44 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_llama_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // dropout inside ffn. 9 | template 10 | class LaunchLlamaEmbOp : public Operator { 11 | private: 12 | size_t _max_batch_tokens; 13 | int _pad_id; 14 | size_t _hidden_dim; 15 | 16 | size_t _batch_size; 17 | size_t _seq_len; 18 | int _max_step; 19 | int _beam_size; 20 | int _offset; 21 | int _max_batch_size; 22 | 23 | Variable* _result; 24 | Variable* _pad_mask; 25 | Variable* _left_pad_len; 26 | 27 | public: 28 | LaunchLlamaEmbOp(size_t max_batch_tokens, int max_step, int max_batch_size, 29 | int beam_size, int pad_id, size_t hidden_dim) 30 | : Operator("LaunchLlamaEmbOp"), 31 | _max_batch_tokens(max_batch_tokens), 32 | _max_batch_size(max_batch_size), 33 | _pad_id(pad_id), 34 | _max_step(max_step), 35 | _beam_size(beam_size), 36 | _hidden_dim(hidden_dim) {} 37 | 38 | virtual ~LaunchLlamaEmbOp() {} 39 | 40 | std::tuple operator()(Variable* inp_tokens, 41 | Variable* token_emb); 42 | 43 | void before_forward(size_t batch_size, size_t seq_len, int offset) { 44 | _batch_size = batch_size, _seq_len = seq_len, _offset = offset; 45 | _result->set_shape({batch_size * seq_len, _hidden_dim}); 46 | _pad_mask->set_shape({batch_size, seq_len + offset}); 47 | _left_pad_len->set_shape({_batch_size, size_t(_beam_size)}); 48 | } 49 | 50 | void forward() override; 51 | 52 | void backward() override { 53 | printf("ERROR! 
LaunchLlamaEmbOp can't cal backward()\n"); 54 | exit(-1); 55 | } 56 | }; 57 | } // namespace lightseq 58 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/cuda/includes/llama_kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "kernels.h" 6 | #include 7 | 8 | namespace lightseq { 9 | namespace cuda { 10 | 11 | template 12 | void launch_llama_embedding(const T *token_emb, const int *tokens, T *output, 13 | T *pad_mask_ptr, int *left_pad_len_ptr, 14 | int batch_size, int beam_size, int hidden_dim, 15 | int step_offset, int seq_len, int max_step, 16 | int padding_id, cudaStream_t stream); 17 | 18 | template 19 | void launch_split_rotary_position_qkv(const T *input_ptr, const T *sin_ptr, 20 | const T *cos_ptr, T *q_out, 21 | T *cache_k_out, T *cache_v_out, 22 | size_t max_step, size_t batch_size, 23 | size_t nhead, size_t offset_seq_len, 24 | size_t query_len, size_t head_dim, 25 | cudaStream_t stream); 26 | 27 | template 28 | void launch_silu_elewise_product(const T *inp_ptr, T *out_ptr, 29 | size_t batch_size, size_t seq_len, 30 | size_t inner_size, cudaStream_t stream); 31 | 32 | template 33 | void launch_rms_layer_norm(const T *inp_ptr, const T *scale_ptr, T *out_ptr, 34 | T *res_ptr, T *rms_ptr, size_t batch_tokens, 35 | size_t hidden_dim, cudaStream_t stream, 36 | const float ln_epsilon = 1e-6f); 37 | 38 | } // namespace cuda 39 | } // namespace lightseq 40 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/gpt.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/gpt_encoder.h" 4 | #include "../proto/gpt_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType gpt_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType gpt_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Gpt : public LSModel { 18 | private: 19 | typedef lightseq::cuda::OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | int* d_input_; 23 | int* d_sample_id; 24 | float* d_ppl; 25 | void* d_buf_; 26 | 27 | int _max_batch_size; 28 | cudaStream_t stream_; 29 | cudaStream_t cache_stream_; 30 | cublasHandle_t hd_; 31 | lightseq::cuda::GptWeight tw_; 32 | std::set available_sampling_methods = {"topk", "topp"}; 33 | 34 | public: 35 | Gpt(const std::string weight_path, const int max_batch_size); 36 | 37 | ~Gpt(); 38 | 39 | const int* get_result_ptr(); 40 | const float* get_score_ptr(); 41 | int get_max_step() { return tw_._max_step; } 42 | 43 | void Infer() override; 44 | void set_input_ptr(int index, void* input_ptr) override; 45 | void set_output_ptr(int index, void* output_ptr) override; 46 | const void* get_output_ptr(int index) override; 47 | std::vector get_input_max_shape(int index) override; 48 | std::vector get_output_max_shape(int index) override; 49 | DataType get_input_dtype(int index) override; 50 | DataType get_output_dtype(int index) override; 51 | void benchmark_mode(bool is_benchmark) override; 52 | }; 53 | 54 | LSMODEL_REGISTER(Gpt); 55 | 56 | } // namespace cuda 57 | } // namespace lightseq 58 | -------------------------------------------------------------------------------- 
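The `Gpt` class in `pywrapper/gpt.h` above is registered via `LSMODEL_REGISTER` and is what the `lightseq.inference` Python extension wraps. A rough usage sketch, assuming the Python wrapper mirrors the C++ constructor (`weight_path`, `max_batch_size`) and exposes a `sample` entry point; the method names and weight file are assumptions, not a verified API surface:

```python
# Hedged sketch of calling the GPT inference wrapper from Python.
# Module/method names are assumptions based on gpt.h above; the weight
# file path and token IDs are placeholders.
import numpy as np
import lightseq.inference as lsi

model = lsi.Gpt("lightseq_gpt2.hdf5", 8)  # weight_path, max_batch_size

input_ids = np.array([[1, 2, 3, 4]], dtype=np.int32)  # placeholder prompt tokens
generated = model.sample(input_ids)  # assumed generation entry point
print(generated)
```

On the C++ side the same flow corresponds to `set_input_ptr`, `Infer`, and `get_output_ptr` on the `LSModel` interface declared in the header above.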
/examples/training/fairseq/ls_fairseq_gcq_wmt14en2de.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | THIS_DIR=$(dirname $(readlink -f $0)) 4 | cd $THIS_DIR/../../.. 5 | 6 | if [ ! -d "/tmp/wmt14_en_de" ]; then 7 | echo "Downloading dataset" 8 | wget http://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/lightseq/wmt_data/databin_wmt14_en_de.tar.gz -P /tmp 9 | tar -zxvf /tmp/databin_wmt14_en_de.tar.gz -C /tmp && rm /tmp/databin_wmt14_en_de.tar.gz 10 | fi 11 | 12 | # You can use multiple NICs in NCCL communication. 13 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 14 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 15 | 16 | # Set your environment variables according to your training environment, 17 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 18 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 19 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 20 | --master_port=$WORKER_0_PORT \ 21 | $(which lightseq-train) /tmp/wmt14_en_de/ \ 22 | --task translation \ 23 | --arch ls_transformer_wmt_en_de_big_t2t --share-decoder-input-output-embed \ 24 | --optimizer ls_adam --adam-betas '(0.9, 0.98)' \ 25 | --clip-norm 0.0 \ 26 | --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --weight-decay 0.0001 \ 27 | --criterion ls_label_smoothed_cross_entropy --label-smoothing 0.1 \ 28 | --max-tokens 8192 \ 29 | --eval-bleu \ 30 | --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ 31 | --eval-bleu-detok moses \ 32 | --eval-bleu-remove-bpe \ 33 | --eval-bleu-print-samples \ 34 | --best-checkpoint-metric bleu \ 35 | --maximize-best-checkpoint-metric \ 36 | --fp16 \ 37 | --enable_GCQ \ 38 | --GCQ_quantile 0.99 39 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # LightSeq Examples 2 | 3 | ## Table of Contents 4 | - [Cpp Examples](#cpp-examples) 5 | - [Python Examples](#python-examples) 6 | - [Train the models](#train-the-models) 7 | - [Export and infer the models](#export-and-infer-the-models) 8 | - [Deploy using Tritonbackend](#deploy-using-tritonbackend) 9 | 10 | ## Cpp Examples 11 | We provide multiple cpp examples of LightSeq inference. 12 | 13 | First you should use the training examples in the following to train a model, and then export it to protobuf or HDF5 format. 14 | 15 | Then use the cpp examples to infer the models: 16 | 1. Uncomment the `add_subdirectory(examples/inference/cpp)` in the [CMakeLists.txt](../CMakeLists.txt). 17 | 2. Build the LightSeq. Refer to [build.md](./build.md) for more details. 18 | 3. Switch to `build/temp.linux-xxx/examples/inference/cpp`, and then run `sudo make` to compile the cpp example. 19 | 4. Run the cpp examples by `./xxx_example MODEL_PATH`. 20 | 21 | ## Python Examples 22 | We provide a series of Python examples to show how to use LightSeq to do model training and inference. 23 | 24 | ### Train the models 25 | Currently, LightSeq supports training from [Fairseq](../examples/training/fairseq/README.md), [Hugging Face](../examples/training/huggingface/README.md), [DeepSpeed](../examples/training/deepspeed/README.md) and [from scratch](../examples/training/custom/README.md). For more training details, please refer to the respective README. 
26 | 27 | ### Export and infer the models 28 | First export the models training by Fairseq, Hugging Face or LightSeq to protobuf or HDF5 format. Then test the results and speeds using the testing scripts. 29 | 30 | Refer to [here](../examples/inference/python/README.md) for more details. 31 | 32 | ## Deploy using Tritonbackend 33 | Refer to [here](../examples/triton_backend/README.md) for more details. 34 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/arm/utils.cc: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | namespace lightseq { 4 | 5 | template 6 | void print_vec(const T *outv, std::string outn, int num_output_ele) { 7 | std::cout << outn << " address: " << outv << std::endl; 8 | printf("value: "); 9 | for (int i = 0; i < num_output_ele; i++) { 10 | std::cout << outv[i] << ", "; 11 | } 12 | std::cout << std::endl; 13 | } 14 | 15 | template <> 16 | void print_vec(const int8_t *outv, std::string outn, 17 | int num_output_ele) { 18 | std::cout << outn << " address: " << outv << std::endl; 19 | printf("value: "); 20 | for (int i = 0; i < num_output_ele; i++) { 21 | std::cout << static_cast(outv[i]) << ", "; 22 | } 23 | std::cout << std::endl; 24 | } 25 | 26 | template <> 27 | void print_vec(const uint8_t *outv, std::string outn, 28 | int num_output_ele) { 29 | std::cout << outn << " address: " << outv << std::endl; 30 | printf("value: "); 31 | for (int i = 0; i < num_output_ele; i++) { 32 | std::cout << static_cast(outv[i]) << ", "; 33 | } 34 | std::cout << std::endl; 35 | } 36 | 37 | template void print_vec(const float *outv, std::string outn, 38 | int num_output_ele); 39 | 40 | template void print_vec(const int *outv, std::string outn, 41 | int num_output_ele); 42 | 43 | template void print_vec(const int8_t *outv, std::string outn, 44 | int num_output_ele); 45 | 46 | template void print_vec(const int8_t *outv, std::string outn, 47 | int num_output_ele); 48 | 49 | template void print_vec(const uint8_t *outv, std::string outn, 50 | int num_output_ele); 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/linear.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class LinearOp : public Operator { 9 | private: 10 | size_t _output_size; 11 | size_t _input_size; 12 | size_t _max_batch_tokens; 13 | size_t _batch_tokens; 14 | std::array _gemm_algos; 15 | 16 | float _alpha; 17 | float _beta; 18 | MATRIX_OP _opA; 19 | MATRIX_OP _opB; 20 | bool _use_residual = false; 21 | 22 | Variable* _result; 23 | 24 | #ifdef PYBIND_INTERFACE 25 | #define weight_op MATRIX_OP::Transpose 26 | #else 27 | #define weight_op MATRIX_OP::NonTranspose 28 | #endif 29 | 30 | public: 31 | LinearOp(size_t max_batch_tokens, size_t output_size, size_t input_size, 32 | MATRIX_OP opA = weight_op, MATRIX_OP opB = MATRIX_OP::NonTranspose, 33 | float alpha = float(1.), float beta = float(0.)) 34 | : Operator("LinearOp"), 35 | _max_batch_tokens(max_batch_tokens), 36 | _output_size(output_size), 37 | _input_size(input_size), 38 | _opA(opA), 39 | _opB(opB), 40 | _gemm_algos(std::array({99, 99, 99})), 41 | _alpha(alpha), 42 | _beta(beta) {} 43 | 44 | ~LinearOp() {} 45 | 46 | Variable* operator()(Variable* inp, Variable* weight); 47 | Variable* operator()(Variable* inp, 
Variable* weight, Variable* residual); 48 | 49 | void forward() override; 50 | 51 | void before_forward(size_t batch_tokens) { 52 | _batch_tokens = batch_tokens; 53 | if (_use_residual) { 54 | _result->set_offset(0, {batch_tokens, _output_size}); 55 | } else { 56 | _result->set_shape({batch_tokens, _output_size}); 57 | } 58 | } 59 | 60 | void backward() override; 61 | 62 | void before_backward() {} 63 | }; 64 | 65 | } // namespace lightseq 66 | -------------------------------------------------------------------------------- /lightseq/inference/kernels/multilgKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void ker_multilg_enc_emb_launcher(int batch_size, int batch_seq_len, 10 | int hidden_size, cudaStream_t stream, 11 | const T* token_emb, const T* pos_emb, 12 | const T* src_lang_emb, const int* token_id, 13 | T* output, int* padding_mask, int padding_id, 14 | int max_thread_per_block); 15 | 16 | template 17 | void ker_multilg_dec_emb_launcher(int step_token_num, int hidden_size, 18 | cudaStream_t stream, const T* token_emb, 19 | const T* pos_emb, const T* src_lang_emb, 20 | const T* trg_lang_emb, 21 | const int* src_token_id, const int* token_id, 22 | T* output, int step, int max_step, 23 | int vocab_size, int beam_size, 24 | int src_seq_len, int max_thread_per_block); 25 | 26 | template 27 | void select_beam_rough_topk_multilg_launcher( 28 | const T* logits, const T* logit_bias, const float* seq_probs, 29 | const float* seq_score, const int* alive_seq, const int* vocab_mask, 30 | const int* src_token_id, int* can_idx, float* can_score, int* num_beam_can, 31 | int vocab_size, int max_step, float length_norm, int cur_step, 32 | int step_token_num, int max_thread_per_block, cudaStream_t stream, 33 | int beam_size, float diverse_lambda, int end_id, int src_seq_len); 34 | 35 | } // namespace cuda 36 | } // namespace lightseq 37 | -------------------------------------------------------------------------------- /examples/training/huggingface/bert/task_qa/run_gcq_qa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | THIS_DIR=$(dirname $(readlink -f $0)) 16 | 17 | # You can use multiple NICs in NCCL communication. 18 | # E.g., if every machine has 4 NICs: eth0, eth1, eth2, eth3, you can use the following command. 
19 | # export NCCL_SOCKET_IFNAME=eth0,eth1,eth2,eth3 20 | 21 | # Set your environment variables according to your training environment, 22 | # for details, please refer to https://pytorch.org/docs/1.10/distributed.html#launch-utility 23 | python3 -m torch.distributed.launch --nproc_per_node=$WORKER_GPU_NUM \ 24 | --nnodes=$WORKER_NUM --node_rank=$WORKER_ID --master_addr=$WORKER_0_HOST \ 25 | --master_port=$WORKER_0_PORT \ 26 | $THIS_DIR/run_gcq_qa.py \ 27 | --model_name_or_path bert-base-uncased \ 28 | --dataset_name squad \ 29 | --do_train \ 30 | --do_eval \ 31 | --max_seq_length 256 \ 32 | --per_device_train_batch_size 16 \ 33 | --doc_stride 128 \ 34 | --learning_rate 3e-5 \ 35 | --num_train_epochs 10 \ 36 | --output_dir /tmp/squad \ 37 | --overwrite_output_dir \ 38 | --fp16 \ 39 | --seed 1234 \ 40 | --logging_steps 10 \ 41 | --module_type 1 \ 42 | --enable_quant false \ 43 | --enable_GCQ true \ 44 | --GCQ_quantile 0.99 45 | -------------------------------------------------------------------------------- /lightseq/inference/pywrapper/quant_gpt.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "../model/quant_gpt_encoder.h" 4 | #include "../proto/quant_gpt_weight.h" 5 | #include "../tools/util.h" 6 | 7 | #ifdef FP16_MODE 8 | const lightseq::cuda::OperationType gpt_optype = 9 | lightseq::cuda::OperationType::FP16; 10 | #else 11 | const lightseq::cuda::OperationType gpt_optype = 12 | lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class QuantGpt : public LSModel { 18 | private: 19 | typedef lightseq::cuda::OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | int* d_input_; 23 | int* d_sample_id; 24 | float* d_ppl; 25 | 26 | int _max_batch_size; 27 | cudaStream_t stream_; 28 | cudaStream_t cache_stream_; 29 | cublasHandle_t hd_; 30 | lightseq::cuda::QuantGptWeight tw_; 31 | std::set available_sampling_methods = {"topk", "topp"}; 32 | 33 | public: 34 | QuantGpt(const std::string weight_path, const int max_batch_size); 35 | 36 | ~QuantGpt(); 37 | 38 | const int* get_result_ptr(); 39 | const float* get_score_ptr(); 40 | const int get_max_step() { return tw_._max_step; } 41 | 42 | void Infer() override; 43 | void set_input_ptr(int index, void* input_ptr) override; 44 | void set_output_ptr(int index, void* output_ptr) override; 45 | const void* get_output_ptr(int index) override; 46 | std::vector get_input_max_shape(int index) override; 47 | std::vector get_output_max_shape(int index) override; 48 | DataType get_input_dtype(int index) override; 49 | DataType get_output_dtype(int index) override; 50 | void benchmark_mode(bool is_benchmark) override{}; 51 | }; 52 | 53 | LSMODEL_REGISTER(QuantGpt); 54 | 55 | } // namespace cuda 56 | } // namespace lightseq 57 | -------------------------------------------------------------------------------- /lightseq/csrc/ops_new/includes/launch_gpt_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // dropout inside ffn. 
9 | template 10 | class LaunchGptEmbOp : public Operator { 11 | private: 12 | size_t _max_batch_tokens; 13 | int _pad_id; 14 | size_t _hidden_dim; 15 | 16 | size_t _batch_size; 17 | size_t _seq_len; 18 | int _max_step; 19 | int _beam_size; 20 | int _offset; 21 | int _max_batch_size; 22 | 23 | Variable* _result; 24 | Variable* _pad_mask; 25 | Variable* _left_pad_len; 26 | 27 | public: 28 | LaunchGptEmbOp(size_t max_batch_tokens, int max_step, int max_batch_size, 29 | int beam_size, int pad_id, size_t hidden_dim) 30 | : Operator("LaunchGptEmbOp"), 31 | _max_batch_tokens(max_batch_tokens), 32 | _max_batch_size(max_batch_size), 33 | _pad_id(pad_id), 34 | _max_step(max_step), 35 | _beam_size(beam_size), 36 | _hidden_dim(hidden_dim) {} 37 | 38 | virtual ~LaunchGptEmbOp() {} 39 | 40 | std::tuple operator()(Variable* inp_tokens, 41 | Variable* token_emb, 42 | Variable* pos_emb); 43 | 44 | void before_forward(size_t batch_size, size_t seq_len, int offset) { 45 | _batch_size = batch_size, _seq_len = seq_len, _offset = offset; 46 | _result->set_shape({batch_size * seq_len, _hidden_dim}); 47 | _pad_mask->set_shape({batch_size, seq_len + offset}); 48 | _left_pad_len->set_shape({_batch_size, size_t(_beam_size)}); 49 | } 50 | 51 | void forward() override; 52 | 53 | void backward() override { 54 | printf("ERROR! LaunchGptEmbOp can't cal backward()\n"); 55 | exit(-1); 56 | } 57 | }; 58 | } // namespace lightseq 59 | -------------------------------------------------------------------------------- /lightseq/csrc/kernels/x86/gemm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "kernels.h" 5 | 6 | namespace lightseq { 7 | namespace x86 { 8 | 9 | // means inpA * inpB 10 | template <> 11 | void matrix_gemm(const float* inpA, const float* inpB, float* outC, int m, 12 | int n, int k) { 13 | const int64_t lda = k; 14 | const int64_t ldb = n; 15 | const int64_t ldc = n; 16 | 17 | CBLAS_TRANSPOSE trans_a = CblasNoTrans; 18 | CBLAS_TRANSPOSE trans_b = CblasNoTrans; 19 | 20 | cblas_sgemm(CblasRowMajor, trans_a, trans_b, m, n, k, 1, inpA, lda, inpB, ldb, 21 | 0, outC, ldc); 22 | return; 23 | } 24 | 25 | template <> 26 | void gemm(bool a_is_packed, bool b_is_packed, bool transpose_a, 27 | bool transpose_b, int64_t m, int64_t n, int64_t k, float alpha, 28 | const uint8_t* a, int64_t lda, const int8_t* b, int64_t ldb, 29 | float beta, int32_t* c, int64_t ldc, 30 | const int32_t* a_shift_compensation) { 31 | const bool use_packed_api = a_is_packed || b_is_packed; 32 | 33 | const CBLAS_TRANSPOSE trans_a = transpose_a ? CblasTrans : CblasNoTrans; 34 | const CBLAS_TRANSPOSE trans_b = transpose_b ? CblasTrans : CblasNoTrans; 35 | 36 | // if (use_packed_api) { 37 | // cblas_gemm_s8u8s32_compute( 38 | // CblasRowMajor, a_is_packed ? (MKL_INT)CblasPacked : (MKL_INT)trans_a, 39 | // b_is_packed ? 
(MKL_INT)CblasPacked : (MKL_INT)trans_b, 40 | // CblasRowOffset, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 41 | // a_shift_compensation); 42 | // } else { 43 | 44 | cblas_gemm_s8u8s32(CblasRowMajor, trans_a, trans_b, CblasRowOffset, m, n, k, 45 | alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 46 | a_shift_compensation); 47 | // } 48 | 49 | return; 50 | } 51 | 52 | } // namespace x86 53 | } // namespace lightseq 54 | -------------------------------------------------------------------------------- /lightseq/csrc/layers_new/includes/rms_norm_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "rms_layer_norm.h" 3 | #include "layer.h" 4 | 5 | namespace lightseq { 6 | 7 | template 8 | class RMSNormLayer : public Layer { 9 | private: 10 | int _hidden_size; 11 | int _max_batch_tokens; 12 | 13 | // operators 14 | RMSLayerNormalizeOp* _rms_norm = nullptr; 15 | 16 | // parameters 17 | Variable* _norm_scale; 18 | 19 | public: 20 | RMSNormLayer(int max_batch_tokens, int hidden_size) 21 | : Layer("RMSNormLayer"), 22 | _hidden_size(hidden_size), 23 | _max_batch_tokens(max_batch_tokens), 24 | _rms_norm(new RMSLayerNormalizeOp(max_batch_tokens, hidden_size, 25 | false)) { 26 | _norm_scale = new Variable("_norm_scale", g_dtype(), g_dtype()); 27 | 28 | this->_context_ptr->exit_layer(); // necessary 29 | } 30 | 31 | virtual ~RMSNormLayer() {} 32 | 33 | Variable* operator()(Variable* inp) { 34 | set_inputs({inp}); 35 | 36 | Variable* out = std::get<0>((*_rms_norm)(inp, _norm_scale)); 37 | 38 | set_outputs({out}); 39 | return out; 40 | } 41 | 42 | void before_forward(int batch_size, int seq_len) { 43 | _rms_norm->before_forward(batch_size, seq_len); 44 | } 45 | 46 | void before_backward() {} 47 | 48 | int load_params(const std::vector& para_vec, int offset) { 49 | int size = 0; 50 | _norm_scale->set_value((char*)para_vec[offset + size]), size++; 51 | _norm_scale->set_shape({size_t(_hidden_size)}); 52 | return size; 53 | } 54 | }; 55 | 56 | template class RMSNormLayer; 57 | #ifdef LIGHTSEQ_cuda 58 | template class RMSNormLayer<__half, __half>; 59 | #endif 60 | 61 | template 62 | using RMSNormLayerPtr = std::shared_ptr>; 63 | 64 | } // namespace lightseq 65 | -------------------------------------------------------------------------------- /examples/training/huggingface/gcq/ls_hf_gcq_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.distributed as dist 4 | from transformers import Trainer 5 | from packaging import version 6 | from lightseq.training.gcq import ( 7 | GCQState, 8 | encode_and_decode, 9 | ) 10 | from examples.training.huggingface.gcq import GCQArguments 11 | 12 | logger = logging.getLogger("lightseq_hf_trainer") 13 | 14 | 15 | class LSTrainer(Trainer): 16 | """ 17 | LSTrainer supports GCQ (Gradient Communication Quantization) for distributed multi-machine training 18 | based on transformers.Trainer. 19 | """ 20 | 21 | def __init__(self, gcq_args: GCQArguments = None, *args, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | logger.setLevel(logging.INFO if self.args.should_log else logging.WARN) 24 | self.gcq_args = gcq_args 25 | 26 | def _wrap_model(self, model, training=True, dataloader=None): 27 | model = super()._wrap_model(model, training, dataloader) 28 | # Enable GCQ. 
29 | if isinstance(model, torch.nn.parallel.DistributedDataParallel) and getattr( 30 | self.gcq_args, "enable_GCQ", False 31 | ): 32 | assert version.parse(torch.__version__) >= version.parse( 33 | "1.10" 34 | ), "Training with GCQ requires that the version of torch has to be greater than or equal to 1.10!" 35 | state = GCQState( 36 | process_group=dist.group.WORLD if dist.is_initialized() else None, 37 | hidden_size=self.gcq_args.hidden_size, 38 | quantile_value=self.gcq_args.GCQ_quantile, 39 | ) 40 | # Register the communication hook. 41 | model.register_comm_hook(state=state, hook=encode_and_decode) 42 | logger.info("############ register communication hook done ###########") 43 | 44 | return model 45 | --------------------------------------------------------------------------------
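`LSTrainer` above differs from `transformers.Trainer` only in `_wrap_model`, where it registers the GCQ communication hook on the DDP-wrapped model. A rough wiring sketch, assuming `GCQArguments` is a dataclass exposing the `enable_GCQ`, `hidden_size`, and `GCQ_quantile` fields read above, and that the model and dataset objects are prepared elsewhere (the import paths are illustrative):

```python
# Hypothetical launch sketch for the GCQ-enabled Hugging Face trainer above.
# GCQArguments construction, import paths, and the model/dataset objects are
# assumptions; only fields actually read by LSTrainer are set here.
from transformers import TrainingArguments

from examples.training.huggingface.gcq import GCQArguments
from ls_hf_gcq_trainer import LSTrainer  # the class defined above

training_args = TrainingArguments(
    output_dir="/tmp/test-ner",
    per_device_train_batch_size=16,
    num_train_epochs=10,
    fp16=True,
)
gcq_args = GCQArguments(
    enable_GCQ=True,   # register the gradient-quantization comm hook
    hidden_size=768,   # passed through to GCQState
    GCQ_quantile=0.99,
)

trainer = LSTrainer(
    gcq_args=gcq_args,
    model=model,                  # assumed: a BERT-style model built elsewhere
    args=training_args,
    train_dataset=train_dataset,  # assumed: preprocessed dataset
)
trainer.train()  # the hook is attached when the Trainer wraps the model for DDP
```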