├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature.md │ └── todo.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── contributors.yaml │ └── pull_request.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE.apache-2.0 ├── MANIFEST.in ├── README.md ├── assets └── logo.png ├── docs ├── .buildinfo ├── .nojekyll ├── CNAME ├── CONCEPTS │ ├── data_parallelism.html │ ├── dp │ │ └── zero_algorithm.html │ ├── parallel_context.html │ ├── tensor_model_parallelism.html │ └── tp │ │ ├── 1d_parallel_algorithm.html │ │ ├── 2d_parallel_algorithm.html │ │ ├── 2p5d_parallel_algorithm.html │ │ └── 3d_parallel_algorithm.html ├── Makefile ├── TUTORIALS │ ├── data_parallelism.html │ ├── tensor_model_parallelism.html │ └── zero_redundancy_optimizer.html ├── _images │ ├── 260461C3-EA3B-405C-9B34-05BA3C781161.png │ ├── 2d.png │ ├── 2p5d.png │ ├── 98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg │ ├── E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg │ ├── figure1.png │ └── figure11.png ├── _sources │ ├── CONCEPTS │ │ ├── data_parallelism.md │ │ ├── dp │ │ │ └── zero_algorithm.md │ │ ├── parallel_context.md │ │ ├── tensor_model_parallelism.md │ │ └── tp │ │ │ ├── 1d_parallel_algorithm.md │ │ │ ├── 2d_parallel_algorithm.md │ │ │ ├── 2p5d_parallel_algorithm.md │ │ │ └── 3d_parallel_algorithm.md │ ├── TUTORIALS │ │ ├── data_parallelism.md │ │ ├── tensor_model_parallelism.md │ │ └── zero_redundancy_optimizer.md │ └── index.rst ├── _static │ ├── _sphinx_javascript_frameworks_compat.js │ ├── basic.css │ ├── doctools.js │ ├── documentation_options.js │ ├── file.png │ ├── images │ │ ├── logo_binder.svg │ │ ├── logo_colab.png │ │ ├── logo_deepnote.svg │ │ └── logo_jupyterhub.svg │ ├── jquery.js │ ├── language_data.js │ ├── locales │ │ ├── ar │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── bg │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── bn │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ca │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── cs │ │ │ └── 
LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── da │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── de │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── el │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── eo │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── es │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── et │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── fi │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── fr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── hr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── id │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── it │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── iw │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ja │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ko │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── lt │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── lv │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ml │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── mr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ms │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── nl │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── no │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── pl │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── pt │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ro │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ru │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sk │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sl │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sv │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ta │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── te │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── tg │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── th │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── tl │ │ │ └── LC_MESSAGES │ │ │ │ └── 
booktheme.po │ │ ├── tr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── uk │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ur │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── vi │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── zh_CN │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ └── zh_TW │ │ │ └── LC_MESSAGES │ │ │ └── booktheme.po │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── sbt-webpack-macros.html │ ├── scripts │ │ ├── bootstrap.js │ │ ├── pydata-sphinx-theme.js │ │ ├── sphinx-book-theme.js │ │ └── sphinx-book-theme.js.map │ ├── searchtools.js │ ├── sphinx_highlight.js │ ├── styles │ │ ├── bootstrap.css │ │ ├── pydata-sphinx-theme.css │ │ ├── sphinx-book-theme.css │ │ └── theme.css │ ├── vendor │ │ └── fontawesome │ │ │ └── 6.1.2 │ │ │ ├── LICENSE.txt │ │ │ ├── css │ │ │ └── all.min.css │ │ │ └── webfonts │ │ │ ├── fa-brands-400.ttf │ │ │ ├── fa-brands-400.woff2 │ │ │ ├── fa-regular-400.ttf │ │ │ ├── fa-regular-400.woff2 │ │ │ ├── fa-solid-900.ttf │ │ │ ├── fa-solid-900.woff2 │ │ │ ├── fa-v4compatibility.ttf │ │ │ └── fa-v4compatibility.woff2 │ └── webpack-macros.html ├── genindex.html ├── index.html ├── make.bat ├── objects.inv ├── search.html ├── searchindex.js └── source │ ├── CONCEPTS │ ├── data_parallelism.md │ ├── dp │ │ ├── zero_algorithm.md │ │ └── zero_image │ │ │ └── figure1.png │ ├── parallel_context.md │ ├── tensor_model_parallelism.md │ └── tp │ │ ├── 1d_image │ │ └── 98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg │ │ ├── 1d_parallel_algorithm.md │ │ ├── 2d_image │ │ └── 2d.png │ │ ├── 2d_parallel_algorithm.md │ │ ├── 2p5d_image │ │ └── 2p5d.png │ │ ├── 2p5d_parallel_algorithm.md │ │ ├── 3d_image │ │ └── E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg │ │ └── 3d_parallel_algorithm.md │ ├── TUTORIALS │ ├── data_parallelism.md │ ├── image │ │ ├── 260461C3-EA3B-405C-9B34-05BA3C781161.png │ │ └── figure1.png │ ├── tensor_model_parallelism.md │ └── zero_redundancy_optimizer.md │ ├── conf.py │ └── index.rst ├── fname.list ├── 
gcc_install.sh ├── oslo ├── __init__.py ├── __version__.py ├── lightseq2 │ ├── __init__.py │ ├── csrc │ │ ├── example │ │ │ ├── CMakeLists.txt │ │ │ ├── bert_example.cc │ │ │ ├── gpt_example.cc │ │ │ └── transformer_example.cc │ │ ├── kernels │ │ │ ├── arm │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── gemm.cc │ │ │ │ ├── includes │ │ │ │ │ ├── kernel_headers.h │ │ │ │ │ └── utils.h │ │ │ │ └── utils.cc │ │ │ ├── cuda │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── crf.cu │ │ │ │ ├── cross_entropy.cu │ │ │ │ ├── cublas_algo_map.cpp │ │ │ │ ├── cublas_wrappers.cpp │ │ │ │ ├── cublas_wrappers.cu │ │ │ │ ├── cuda_util.cu │ │ │ │ ├── dropout_kernels.cu │ │ │ │ ├── embKernels.cc.cu │ │ │ │ ├── embedding_kernels.cu │ │ │ │ ├── fused_adam_kernel.cu │ │ │ │ ├── gemm_test.cpp │ │ │ │ ├── general_kernels.cu │ │ │ │ ├── gptKernels.cc.cu │ │ │ │ ├── includes │ │ │ │ │ ├── block_reduce.h │ │ │ │ │ ├── common.h │ │ │ │ │ ├── cublas_algo_map.h │ │ │ │ │ ├── cublas_wrappers.h │ │ │ │ │ ├── cuda_util.h │ │ │ │ │ ├── embKernels.h │ │ │ │ │ ├── fused_adam_kernel.h │ │ │ │ │ ├── gptKernels.h │ │ │ │ │ ├── kernel_headers.h │ │ │ │ │ ├── kernels.h │ │ │ │ │ ├── ls_cub.cuh │ │ │ │ │ ├── multi_tensor_apply.cuh │ │ │ │ │ ├── transformerKernels.h │ │ │ │ │ └── util.h │ │ │ │ ├── normalize_kernels.cu │ │ │ │ ├── quantize_kernels.cu │ │ │ │ ├── softmax_kernels.cu │ │ │ │ ├── softmax_kernels_new.cu │ │ │ │ ├── transform_kernels.cu │ │ │ │ ├── transform_kernels_new.cu │ │ │ │ ├── transformerKernels.cc.cu │ │ │ │ └── util.cc.cu │ │ │ └── x86 │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── gemm.cpp │ │ │ │ ├── includes │ │ │ │ ├── kernel_headers.h │ │ │ │ ├── kernels.h │ │ │ │ └── util.h │ │ │ │ └── util.cc │ │ ├── layers │ │ │ ├── cross_entropy_layer.cpp │ │ │ ├── includes │ │ │ │ ├── cross_entropy_layer.h │ │ │ │ ├── quant_linear_layer.h │ │ │ │ ├── transformer_decoder_layer.h │ │ │ │ ├── transformer_embedding_layer.h │ │ │ │ └── transformer_encoder_layer.h │ │ │ ├── quant_linear_layer.cpp │ │ │ ├── 
transformer_decoder_layer.cpp │ │ │ ├── transformer_embedding_layer.cpp │ │ │ └── transformer_encoder_layer.cpp │ │ ├── layers_new │ │ │ ├── CMakeLists.txt │ │ │ ├── crf_layer.cpp │ │ │ ├── dec_enc_attention_layer.cpp │ │ │ ├── dec_self_attention_layer.cpp │ │ │ ├── encdec_kv_layer.cpp │ │ │ ├── feed_forward_layer.cpp │ │ │ ├── generator_layer.cpp │ │ │ ├── gpt_attention_layer.cpp │ │ │ ├── gpt_layer.cpp │ │ │ ├── includes │ │ │ │ ├── crf_layer.h │ │ │ │ ├── dec_enc_attention_layer.h │ │ │ │ ├── dec_self_attention_layer.h │ │ │ │ ├── encdec_kv_layer.h │ │ │ │ ├── feed_forward_layer.h │ │ │ │ ├── generator_layer.h │ │ │ │ ├── gpt_attention_layer.h │ │ │ │ ├── gpt_layer.h │ │ │ │ ├── launch_dec_emb_layer.h │ │ │ │ ├── launch_enc_emb_layer.h │ │ │ │ ├── launch_gpt_emb_layer.h │ │ │ │ ├── linear_layer.h │ │ │ │ ├── lyr_normalize_layer.h │ │ │ │ ├── multihead_attention_layer.h │ │ │ │ ├── sample_layer.h │ │ │ │ ├── sdpa_layer.h │ │ │ │ ├── transformer_decoder_layer.h │ │ │ │ └── transformer_encoder_layer.h │ │ │ ├── linear_layer.cpp │ │ │ ├── multihead_attention_layer.cpp │ │ │ ├── sample_layer.cpp │ │ │ ├── sdpa_layer.cpp │ │ │ ├── transformer_decoder_layer.cpp │ │ │ └── transformer_encoder_layer.cpp │ │ ├── lsflow │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── allocator.cpp │ │ │ ├── context.cpp │ │ │ ├── includes │ │ │ │ ├── allocator.h │ │ │ │ ├── context.h │ │ │ │ ├── declaration.h │ │ │ │ ├── layer.h │ │ │ │ ├── lsflow_util.h │ │ │ │ ├── manager.h │ │ │ │ ├── node.h │ │ │ │ ├── shape.h │ │ │ │ └── tensor.h │ │ │ ├── layer.cpp │ │ │ ├── lsflow_util.cpp │ │ │ ├── manager.cpp │ │ │ ├── node.cpp │ │ │ ├── operator.cpp │ │ │ ├── shape.cpp │ │ │ ├── tensor.cpp │ │ │ └── variable.cpp │ │ ├── models │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.cc │ │ │ ├── bert_crf.cc │ │ │ ├── gpt.cc │ │ │ ├── includes │ │ │ │ ├── bert.h │ │ │ │ ├── bert_crf.h │ │ │ │ ├── gpt.h │ │ │ │ ├── model_base.h │ │ │ │ ├── model_util.h │ │ │ │ └── transformer.h │ │ │ ├── model_util.cc │ │ │ 
├── test_layer.cc │ │ │ └── transformer.cu │ │ ├── ops │ │ │ └── includes │ │ │ │ ├── context.h │ │ │ │ ├── dropout.h │ │ │ │ ├── feed_forward.h │ │ │ │ ├── normalize_layer.h │ │ │ │ ├── softmax.h │ │ │ │ └── strided_batch_gemm.h │ │ ├── ops_new │ │ │ ├── CMakeLists.txt │ │ │ ├── beam_search_topk.cu │ │ │ ├── bias_act_dropout.cpp │ │ │ ├── bias_add_transform_20314.cpp │ │ │ ├── bias_dropout_residual.cpp │ │ │ ├── concat3_dim1.cpp │ │ │ ├── crf.cpp │ │ │ ├── dropout.cpp │ │ │ ├── includes │ │ │ │ ├── beam_search_topk.h │ │ │ │ ├── bias_act_dropout.h │ │ │ │ ├── bias_add_transform_20314.h │ │ │ │ ├── bias_dropout_residual.h │ │ │ │ ├── concat3_dim1.h │ │ │ │ ├── crf.h │ │ │ │ ├── dropout.h │ │ │ │ ├── launch_dec_emb_op.h │ │ │ │ ├── launch_enc_emb.h │ │ │ │ ├── launch_gpt_emb.h │ │ │ │ ├── layer_normalize.h │ │ │ │ ├── linear.h │ │ │ │ ├── sampling.h │ │ │ │ ├── softmax.h │ │ │ │ ├── split_head_op.h │ │ │ │ ├── strided_batch_gemm.h │ │ │ │ └── transform_0213.h │ │ │ ├── launch_dec_emb_op.cpp │ │ │ ├── launch_enc_emb.cpp │ │ │ ├── launch_gpt_emb.cpp │ │ │ ├── layer_normalize.cpp │ │ │ ├── linear.cpp │ │ │ ├── sampling.cc.cu │ │ │ ├── softmax.cpp │ │ │ ├── split_head_op.cpp │ │ │ ├── strided_batch_gemm.cpp │ │ │ └── transform_0213.cpp │ │ ├── proto │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.proto │ │ │ ├── bert_crf.proto │ │ │ ├── bert_crf_weight.cc │ │ │ ├── bert_weight.cc │ │ │ ├── gpt.proto │ │ │ ├── gpt_weight.cc │ │ │ ├── includes │ │ │ │ ├── bert_crf_weight.h │ │ │ │ ├── bert_weight.h │ │ │ │ ├── gpt_weight.h │ │ │ │ ├── proto_headers.h │ │ │ │ ├── proto_util.h │ │ │ │ ├── test_model_weight.h │ │ │ │ └── transformer_weight.h │ │ │ ├── proto_util.cc │ │ │ ├── transformer.proto │ │ │ └── transformer_weight.cc │ │ ├── pybind │ │ │ ├── CMakeLists.txt │ │ │ ├── pybind_adam.cpp │ │ │ ├── pybind_kernel_cuda.cpp │ │ │ ├── pybind_kernel_x86.cpp │ │ │ ├── pybind_layer.cpp │ │ │ ├── pybind_layer_new.cpp │ │ │ ├── pybind_model.cpp │ │ │ └── pybind_op.cpp │ │ ├── pytorch │ │ │ 
├── __init__.py │ │ │ ├── builder │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ ├── cuda_kernel_builder.py │ │ │ │ ├── cuda_layer_builder.py │ │ │ │ └── x86_kernel_builder.py │ │ │ ├── layer_base.py │ │ │ ├── pytorch_quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── calib │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── calibrator.py │ │ │ │ │ ├── histogram.py │ │ │ │ │ └── max.py │ │ │ │ ├── nn │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── _functions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── quant_rnn.py │ │ │ │ │ ├── functional.py │ │ │ │ │ └── modules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── _utils.py │ │ │ │ │ │ ├── clip.py │ │ │ │ │ │ ├── quant_bert.py │ │ │ │ │ │ ├── quant_conv.py │ │ │ │ │ │ ├── quant_instancenorm.py │ │ │ │ │ │ ├── quant_linear.py │ │ │ │ │ │ ├── quant_pooling.py │ │ │ │ │ │ ├── quant_rnn.py │ │ │ │ │ │ └── tensor_quantizer.py │ │ │ │ ├── optim │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── helper.py │ │ │ │ ├── quant_modules.py │ │ │ │ ├── tensor_quant.py │ │ │ │ ├── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── quant_logging.py │ │ │ │ │ └── reduce_amax.py │ │ │ │ └── version.py │ │ │ ├── quantization.py │ │ │ ├── sdpa_layers.py │ │ │ ├── torch_transformer_layers.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── transformer_encoder_layer.py │ │ │ └── util.py │ │ ├── tensorflow │ │ │ └── README.md │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── cuda │ │ │ │ ├── __init__.py │ │ │ │ ├── fairseq_layers.py │ │ │ │ ├── test_kernel.py │ │ │ │ ├── test_layer.py │ │ │ │ ├── test_ls_small_layer.py │ │ │ │ └── torch_crf.py │ │ │ ├── util.py │ │ │ └── x86 │ │ │ │ └── test_kernel.py │ │ └── triton_backend │ │ │ ├── CMakeLists.txt │ │ │ ├── cmake │ │ │ └── TutorialMinimalBackendConfig.cmake.in │ │ │ └── src │ │ │ ├── libtriton_minimal.ldscript │ │ │ ├── lightseq_backend.cc │ │ │ ├── triton_model.h │ │ │ └── triton_utils.h │ ├── inference │ │ ├── kernels │ │ │ ├── CMakeLists.txt │ │ │ ├── common.h │ │ │ ├── embKernels.cc.cu │ │ │ ├── embKernels.h │ │ │ ├── 
embKernels_int8.cc.cu │ │ │ ├── embKernels_int8.h │ │ │ ├── gptKernels.cc.cu │ │ │ ├── gptKernels.h │ │ │ ├── gptKernels_int8.cc.cu │ │ │ ├── gptKernels_int8.h │ │ │ ├── moeKernels.cc.cu │ │ │ ├── moeKernels.h │ │ │ ├── multilgKernels.cc.cu │ │ │ ├── multilgKernels.h │ │ │ ├── t5EmbKernels.cc.cu │ │ │ ├── t5EmbKernels.h │ │ │ ├── t5Kernels.cc.cu │ │ │ ├── t5Kernels.h │ │ │ ├── transformerKernels.cc.cu │ │ │ ├── transformerKernels.h │ │ │ ├── transformerKernels_int8.cc.cu │ │ │ └── transformerKernels_int8.h │ │ ├── model │ │ │ ├── CMakeLists.txt │ │ │ ├── bert_encoder.cc.cu │ │ │ ├── bert_encoder.h │ │ │ ├── cublas_algo_map.cc │ │ │ ├── cublas_algo_map.h │ │ │ ├── cublas_helper.cc │ │ │ ├── cublas_helper.h │ │ │ ├── decoder.cc.cu │ │ │ ├── decoder.h │ │ │ ├── encoder.cc.cu │ │ │ ├── encoder.h │ │ │ ├── gpt_encoder.cc.cu │ │ │ ├── gpt_encoder.h │ │ │ ├── moe_decoder.cc.cu │ │ │ ├── moe_decoder.h │ │ │ ├── moe_encoder.cc.cu │ │ │ ├── moe_encoder.h │ │ │ ├── mt5_decoder.cc.cu │ │ │ ├── mt5_decoder.h │ │ │ ├── mt5_encoder.cc.cu │ │ │ ├── mt5_encoder.h │ │ │ ├── quant_bert_encoder.cc.cu │ │ │ ├── quant_bert_encoder.h │ │ │ ├── quant_decoder.cc.cu │ │ │ ├── quant_decoder.h │ │ │ ├── quant_encoder.cc.cu │ │ │ ├── quant_encoder.h │ │ │ ├── quant_gpt_encoder.cc.cu │ │ │ ├── quant_gpt_encoder.h │ │ │ ├── quant_vit_encoder.cc.cu │ │ │ ├── quant_vit_encoder.h │ │ │ ├── t5_decoder.cc.cu │ │ │ ├── t5_decoder.h │ │ │ ├── t5_encoder.cc.cu │ │ │ ├── t5_encoder.h │ │ │ ├── vit_encoder.cc.cu │ │ │ └── vit_encoder.h │ │ ├── proto │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.proto │ │ │ ├── bert_weight.cc │ │ │ ├── bert_weight.h │ │ │ ├── gpt.proto │ │ │ ├── gpt_weight.cc │ │ │ ├── gpt_weight.h │ │ │ ├── moe.proto │ │ │ ├── moe_weight.cc │ │ │ ├── moe_weight.h │ │ │ ├── mt5.proto │ │ │ ├── mt5_weight.cc │ │ │ ├── mt5_weight.h │ │ │ ├── quant_bert.proto │ │ │ ├── quant_bert_weight.cc │ │ │ ├── quant_bert_weight.h │ │ │ ├── quant_gpt.proto │ │ │ ├── quant_gpt_weight.cc │ │ │ ├── 
quant_gpt_weight.h │ │ │ ├── quant_transformer.proto │ │ │ ├── quant_transformer_weight.cc │ │ │ ├── quant_transformer_weight.h │ │ │ ├── quant_vit.proto │ │ │ ├── quant_vit_weight.cc │ │ │ ├── quant_vit_weight.h │ │ │ ├── t5.proto │ │ │ ├── t5_weight.cc │ │ │ ├── t5_weight.h │ │ │ ├── transformer.proto │ │ │ ├── transformer_weight.cc │ │ │ ├── transformer_weight.h │ │ │ ├── vit.proto │ │ │ ├── vit_weight.cc │ │ │ └── vit_weight.h │ │ ├── pywrapper │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.cc │ │ │ ├── bert.h │ │ │ ├── gpt.cc │ │ │ ├── gpt.h │ │ │ ├── model_base.h │ │ │ ├── moe.cc │ │ │ ├── moe.h │ │ │ ├── mt5.cc │ │ │ ├── mt5.h │ │ │ ├── quant_bert.cc │ │ │ ├── quant_bert.h │ │ │ ├── quant_gpt.cc │ │ │ ├── quant_gpt.h │ │ │ ├── quant_transformer.cc │ │ │ ├── quant_transformer.h │ │ │ ├── quant_vit.cc │ │ │ ├── quant_vit.h │ │ │ ├── t5.cc │ │ │ ├── t5.h │ │ │ ├── transformer.cc │ │ │ ├── transformer.h │ │ │ ├── transformer_decoder.cc.cu │ │ │ ├── vit.cc │ │ │ ├── vit.h │ │ │ └── wrapper.cc │ │ ├── server │ │ │ ├── CMakeLists.txt │ │ │ ├── custom.h │ │ │ ├── decoder_generate_server.cc.cu │ │ │ ├── generate_server.cc.cu │ │ │ ├── gpt_generate_server.cc.cu │ │ │ ├── gptlm_server.cc.cu │ │ │ ├── libserver.ldscript │ │ │ ├── model_config.h │ │ │ ├── model_config.proto │ │ │ ├── model_config_cuda.h │ │ │ ├── moe_server.cc.cu │ │ │ └── transformer_server.cc.cu │ │ ├── tools │ │ │ ├── CMakeLists.txt │ │ │ ├── util.cc.cu │ │ │ └── util.h │ │ └── triton_backend │ │ │ ├── CMakeLists.txt │ │ │ ├── cmake │ │ │ └── TutorialMinimalBackendConfig.cmake.in │ │ │ └── src │ │ │ ├── libtriton_minimal.ldscript │ │ │ ├── lightseq_backend.cc │ │ │ ├── triton_model.h │ │ │ └── triton_utils.h │ └── training │ │ ├── __init__.py │ │ ├── cli │ │ ├── __init__.py │ │ ├── fs_modules │ │ │ ├── __init__.py │ │ │ ├── ls_adam.py │ │ │ ├── ls_bart.py │ │ │ ├── ls_fs_transformer_decoder_layer.py │ │ │ ├── ls_label_smoothed_cross_entropy.py │ │ │ ├── ls_transformer.py │ │ │ └── ls_translation.py │ │ ├── 
lightseq_deepspeed_cli.py │ │ ├── lightseq_fairseq_generate_cli.py │ │ ├── lightseq_fairseq_train_cli.py │ │ ├── lightseq_fairseq_validate_cli.py │ │ └── lightseq_infer_cli.py │ │ ├── csrc │ │ └── ops │ │ │ └── includes │ │ │ └── strided_batch_gemm.h │ │ ├── gcq │ │ ├── __init__.py │ │ ├── gcq.py │ │ ├── ls_fs_gcq_train.py │ │ └── ls_fs_gcq_trainer.py │ │ ├── ops │ │ ├── __init__.py │ │ ├── pytorch │ │ │ ├── __init__.py │ │ │ ├── adam.py │ │ │ ├── builder │ │ │ │ ├── __init__.py │ │ │ │ ├── adam_builder.py │ │ │ │ ├── builder.py │ │ │ │ ├── kernel_builder.py │ │ │ │ ├── layer_builder.py │ │ │ │ ├── operator_builder.py │ │ │ │ └── transformer_builder.py │ │ │ ├── cross_entropy_layer.py │ │ │ ├── export.py │ │ │ ├── export_quant.py │ │ │ ├── gemm_test.py │ │ │ ├── gpt_layer.py │ │ │ ├── layer_base.py │ │ │ ├── multihead_attention_layer.py │ │ │ ├── quant_linear_layer.py │ │ │ ├── quantization.py │ │ │ ├── torch_transformer_layers.py │ │ │ ├── transformer.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── transformer_decoder_layer_new.py │ │ │ ├── transformer_embedding_layer.py │ │ │ ├── transformer_encoder_layer.py │ │ │ ├── transformer_encoder_layer_new.py │ │ │ └── util.py │ │ └── tensorflow │ │ │ ├── README.md │ │ │ └── __init__.py │ │ └── pytorch_quantization │ │ ├── __init__.py │ │ ├── calib │ │ ├── __init__.py │ │ ├── calibrator.py │ │ ├── histogram.py │ │ └── max.py │ │ ├── nn │ │ ├── __init__.py │ │ ├── _functions │ │ │ ├── __init__.py │ │ │ └── quant_rnn.py │ │ ├── functional.py │ │ └── modules │ │ │ ├── __init__.py │ │ │ ├── _utils.py │ │ │ ├── clip.py │ │ │ ├── quant_bert.py │ │ │ ├── quant_conv.py │ │ │ ├── quant_instancenorm.py │ │ │ ├── quant_linear.py │ │ │ ├── quant_pooling.py │ │ │ ├── quant_rnn.py │ │ │ └── tensor_quantizer.py │ │ ├── optim │ │ ├── __init__.py │ │ └── helper.py │ │ ├── quant_modules.py │ │ ├── tensor_quant.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── quant_logging.py │ │ └── reduce_amax.py │ │ └── version.py ├── torch │ ├── _C │ │ 
├── __init__.py │ │ └── csrc │ │ │ ├── CPUAdagradBinder.cpp │ │ │ ├── CPUAdamBinder.cpp │ │ │ ├── ExpertParallelBinder.cpp │ │ │ ├── FusedAdagradBinder.cpp │ │ │ ├── FusedAdamBinder.cpp │ │ │ ├── FusedL2NormBinder.cpp │ │ │ ├── FusedLambBinder.cpp │ │ │ ├── FusedLayerNormBinder.cpp │ │ │ ├── FusedMixedPrecisionL2NormBinder.cpp │ │ │ ├── FusedMixedPrecisionLambBinder.cpp │ │ │ ├── FusedNovogradBinder.cpp │ │ │ ├── FusedSGDBinder.cpp │ │ │ ├── FusedScaleMaskSoftmaxBinder.cpp │ │ │ ├── __init__.py │ │ │ ├── custom_cuda_kernel.cu │ │ │ ├── expert_parallel_cuda_kernel.cu │ │ │ ├── fused_layer_norm.cu │ │ │ ├── includes │ │ │ ├── block_reduce.h │ │ │ ├── compat.h │ │ │ ├── context.h │ │ │ ├── cpu_adagrad.h │ │ │ ├── cpu_adam.h │ │ │ ├── custom_cuda_layers.h │ │ │ ├── multi_tensor_apply.cuh │ │ │ ├── simd.h │ │ │ ├── tqdm.h │ │ │ └── type_shim.h │ │ │ ├── multi_tensor_adagrad.cu │ │ │ ├── multi_tensor_adam.cu │ │ │ ├── multi_tensor_l2norm.cu │ │ │ ├── multi_tensor_l2norm_mp.cu │ │ │ ├── multi_tensor_lamb.cu │ │ │ ├── multi_tensor_lamb_mp.cu │ │ │ ├── multi_tensor_novograd.cu │ │ │ ├── multi_tensor_sgd.cu │ │ │ ├── ngram_repeat_block_cuda.cpp │ │ │ └── ngram_repeat_block_cuda_kernel.cu │ ├── __init__.py │ ├── distributed │ │ ├── __init__.py │ │ ├── _initializers │ │ │ ├── __init__.py │ │ │ ├── initializer.py │ │ │ ├── initializer_data.py │ │ │ ├── initializer_expert.py │ │ │ ├── initializer_model.py │ │ │ ├── initializer_pipeline.py │ │ │ ├── initializer_sequence.py │ │ │ ├── initializer_tensor.py │ │ │ ├── initializer_tensor_1d.py │ │ │ ├── initializer_tensor_2d.py │ │ │ ├── initializer_tensor_2p5d.py │ │ │ └── initializer_tensor_3d.py │ │ ├── _seed │ │ │ ├── __init__.py │ │ │ ├── helper.py │ │ │ └── seed_manager.py │ │ ├── nn │ │ │ ├── __init__.py │ │ │ ├── _p2p.py │ │ │ ├── _ring_self_attention.py │ │ │ └── functional.py │ │ ├── parallel_context.py │ │ └── parallel_mode.py │ ├── jit │ │ ├── __init__.py │ │ └── _utils.py │ ├── nn │ │ ├── __init__.py │ │ ├── modules │ │ │ 
├── __init__.py │ │ │ ├── conv.py │ │ │ ├── dropout.py │ │ │ ├── embedding.py │ │ │ ├── functional.py │ │ │ ├── layer_norm.py │ │ │ ├── linear.py │ │ │ ├── loss.py │ │ │ └── ngram_repeat_block.py │ │ └── parallel │ │ │ ├── __init__.py │ │ │ ├── data_parallel │ │ │ ├── __init__.py │ │ │ ├── _reducer.py │ │ │ ├── _utils.py │ │ │ ├── data_parallel.py │ │ │ └── zero │ │ │ │ ├── __init__.py │ │ │ │ ├── _optim_interface.py │ │ │ │ ├── hetero │ │ │ │ ├── __init__.py │ │ │ │ ├── chunk │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── chunk.py │ │ │ │ │ ├── manager.py │ │ │ │ │ └── utils.py │ │ │ │ ├── data_parallel.py │ │ │ │ ├── hook.py │ │ │ │ ├── memory_manager.py │ │ │ │ ├── memory_tracer │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── chunk_memstats_collector.py │ │ │ │ │ ├── memory_monitor.py │ │ │ │ │ ├── memory_stats.py │ │ │ │ │ ├── param_runtime_order.py │ │ │ │ │ └── utils.py │ │ │ │ ├── optim.py │ │ │ │ ├── placement_policy.py │ │ │ │ └── utils.py │ │ │ │ └── optim │ │ │ │ ├── __init__.py │ │ │ │ ├── _utils.py │ │ │ │ ├── bookkeeping │ │ │ │ ├── __init__.py │ │ │ │ ├── _base_store.py │ │ │ │ ├── bucket_store.py │ │ │ │ ├── gradient_store.py │ │ │ │ ├── parameter_store.py │ │ │ │ └── tensor_store.py │ │ │ │ └── optim.py │ │ │ ├── expert_parallel │ │ │ ├── __init__.py │ │ │ ├── _ops.py │ │ │ ├── expert_parallel.py │ │ │ ├── experts.py │ │ │ ├── layers.py │ │ │ ├── mapping.py │ │ │ └── utils.py │ │ │ ├── pipeline_parallel │ │ │ ├── __init__.py │ │ │ ├── _buffers.py │ │ │ ├── _comm.py │ │ │ ├── _cost_estimator.py │ │ │ ├── _functional.py │ │ │ ├── _job.py │ │ │ ├── _messages.py │ │ │ ├── _model_partitioner.py │ │ │ ├── _sync.py │ │ │ ├── _types.py │ │ │ ├── _utils.py │ │ │ ├── _workers.py │ │ │ └── pipeline_parallel.py │ │ │ ├── tensor_parallel │ │ │ ├── _1d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── _wrapper.py │ │ │ ├── _2d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── _wrapper.py │ │ │ ├── _2p5d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── 
_wrapper.py │ │ │ ├── _3d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── _wrapper.py │ │ │ ├── __init__.py │ │ │ ├── mapping.py │ │ │ ├── tensor_parallel.py │ │ │ └── utils.py │ │ │ └── utils.py │ ├── optim │ │ ├── __init__.py │ │ ├── cpu_adagrad.py │ │ ├── cpu_adam.py │ │ ├── fused_adagrad.py │ │ ├── fused_adam.py │ │ ├── fused_lamb.py │ │ ├── fused_mixed_precision_lamb.py │ │ ├── fused_novograd.py │ │ └── fused_sgd.py │ └── utils │ │ ├── __init__.py │ │ ├── checkpoint │ │ ├── __init__.py │ │ ├── _checkpoint_function.py │ │ ├── _checkpoint_partitioner.py │ │ ├── _checkpoint_utils.py │ │ ├── _rng_state_tracker.py │ │ └── activation_checkpointing.py │ │ ├── common.py │ │ ├── data │ │ ├── __init__.py │ │ └── data_collators.py │ │ ├── extensions.py │ │ ├── logging │ │ ├── __init__.py │ │ └── logger.py │ │ ├── multi_tensor_apply │ │ ├── __init__.py │ │ └── multi_tensor_apply.py │ │ ├── random.py │ │ └── version.py └── transformers │ ├── __init__.py │ ├── constants.py │ ├── data │ ├── __init__.py │ └── data_collator.py │ ├── kernel_fusion_utils.py │ ├── mapping_utils.py │ ├── modeling_utils.py │ ├── models │ ├── __init__.py │ ├── albert │ │ ├── __init__.py │ │ └── modeling_albert.py │ ├── bart │ │ ├── __init__.py │ │ └── modeling_bart.py │ ├── bert │ │ ├── __init__.py │ │ └── modeling_bert.py │ ├── distilbert │ │ ├── __init__.py │ │ └── modeling_distilbert.py │ ├── electra │ │ ├── __init__.py │ │ └── modeling_electra.py │ ├── gpt2 │ │ ├── __init__.py │ │ └── modeling_gpt2.py │ ├── mbart │ │ ├── __init__.py │ │ └── modeling_mbart.py │ ├── mt5 │ │ ├── __init__.py │ │ └── modeling_mt5.py │ ├── roberta │ │ ├── __init__.py │ │ └── modeling_roberta.py │ └── t5 │ │ ├── __init__.py │ │ └── modeling_t5.py │ ├── oslo_init.py │ ├── profiler.py │ ├── tasks │ ├── __init__.py │ ├── data_albert_pretraining.py │ ├── data_bart_pretraining.py │ ├── data_base.py │ ├── data_bert_pretraining.py │ ├── data_causal_lm.py │ ├── data_masked_lm.py │ ├── data_sequence_classification.py │ 
├── data_summarization.py │ ├── data_t5_pretraining.py │ ├── data_token_classification.py │ ├── data_utils.py │ └── loading │ │ ├── __init__.py │ │ └── sent_text.py │ ├── trainer.py │ ├── trainer_utils.py │ └── training_args.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── inference.py ├── merge.py ├── misc │ └── test_rpc.py ├── tasks │ └── model_task.py ├── test_script │ ├── run_inference.sh │ ├── run_merge.sh │ └── run_train.sh ├── training.py └── util │ ├── arg_parser.py │ └── oslo.py └── tests_deprecated ├── __init__.py ├── torch ├── _C │ └── csrc │ │ ├── test_fused_layer_norm.py │ │ ├── test_fused_layer_norm_autocast.py │ │ ├── test_fused_layer_norm_fusedlayernorm.py │ │ ├── test_fused_layer_norm_rms.py │ │ └── test_sequence_generator.py ├── __init__.py ├── distributed │ └── __init__.py ├── nn │ ├── __init__.py │ └── parallel │ │ ├── __init__.py │ │ ├── data_parallel │ │ ├── __init__.py │ │ ├── data_parallel.py │ │ └── zero │ │ │ ├── __init__.py │ │ │ ├── heterogeneous_manager │ │ │ ├── __init__.py │ │ │ ├── test_chunk.py │ │ │ ├── test_chunk_manager.py │ │ │ ├── test_mem_collector.py │ │ │ ├── test_mem_monitor.py │ │ │ └── test_memstats.py │ │ │ ├── sharded_optim │ │ │ ├── __init__.py │ │ │ ├── test_grad_acc.py │ │ │ ├── test_hetero_step.py │ │ │ ├── test_hybrid.py │ │ │ ├── test_integrity.py │ │ │ └── test_mixed_prec.py │ │ │ ├── test_fsdp_wrapper.py │ │ │ ├── test_grad.py │ │ │ └── test_state_dict.py │ │ ├── expert_parallel │ │ └── gpt2 │ │ │ ├── gpt2.py │ │ │ ├── gpt2_deparallelize.py │ │ │ ├── gpt2_load.py │ │ │ ├── gpt2_pr_ep.py │ │ │ ├── gpt2_pr_moe.py │ │ │ ├── gpt2_save.py │ │ │ └── utils.py │ │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── compare_grad_pptp_no.py │ │ ├── compare_grad_tp_no.py │ │ ├── compare_output_pptp_no.py │ │ ├── compare_output_tp_no.py │ │ ├── compare_pp_nopp.py │ │ ├── compare_pptp_trial.py │ │ ├── compare_send_recv.py │ │ ├── test_batch_order.py │ │ ├── test_p2p.py │ │ ├── test_partioning.py │ │ ├── 
test_pp.py │ │ ├── test_pp4.py │ │ ├── test_rpc.py │ │ └── test_tp.py │ │ └── tensor_parallel │ │ ├── 1d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ ├── test_deparallelize.py │ │ │ ├── test_load_parallel.py │ │ │ ├── test_qkv.py │ │ │ └── test_vocab.py │ │ ├── test_col_linear_1d.py │ │ ├── test_embedding_1d.py │ │ ├── test_layer_norm_1d.py │ │ ├── test_row_linear_1d.py │ │ ├── test_vocab_embedding_1d.py │ │ ├── test_vocab_parallel_cross_entropy_1d.py │ │ ├── test_wrapper_1d.py │ │ ├── test_wrapper_1d_vocab.py │ │ └── test_wrapper_1d_with_t5.py │ │ ├── 2d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ ├── test_deparallelize.py │ │ │ ├── test_load_parallel.py │ │ │ └── test_qkv.py │ │ ├── test_embedding_2d.py │ │ ├── test_layer_norm_2d.py │ │ ├── test_linear_2d.py │ │ ├── test_vocab_embedding_2d.py │ │ ├── test_vocab_parallel_cross_entropy_2d.py │ │ ├── test_wrapper_2d.py │ │ └── test_wrapper_2d_vocab.py │ │ ├── 2p5d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ ├── test_deparallelize.py │ │ │ ├── test_load_parallel.py │ │ │ ├── test_qkv.py │ │ │ └── test_vocab.py │ │ ├── test_embedding_2p5d.py │ │ ├── test_layer_norm_2p5d.py │ │ ├── test_linear_2p5d.py │ │ ├── test_vocab_embedding_2p5d.py │ │ ├── test_vocab_parallel_cross_entropy_2p5d.py │ │ ├── test_wrapper_2p5d.py │ │ └── test_wrapper_2p5d_vocab.py │ │ ├── 3d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ └── test_deparallelize.py │ │ ├── test_embedding_3d.py │ │ ├── test_layer_norm_3d.py │ │ ├── test_linear_3d.py │ │ ├── test_vocab_embedding_3d.py │ │ ├── test_vocab_parallel_cross_entropy_3d.py │ │ ├── test_wrapper_3d.py │ │ └── test_wrapper_3d_vocab.py │ │ └── __init__.py ├── optim │ ├── cpu_adagrad.py │ ├── cpu_adam.py │ ├── fused_adam.py │ ├── fused_lamb.py │ ├── fused_novograd.py │ └── fused_optimizers.py └── utils │ ├── __init__.py │ ├── data │ ├── 
__init__.py │ └── test_data_collators.py │ └── logging.py └── transformers ├── __init__.py ├── models ├── bert │ └── test_modeling_bert.py ├── electra │ ├── test_mlm.py │ └── test_token_cls.py ├── gpt2 │ └── test_modeling_gpt2.py ├── mbart │ └── test_training.py └── mt5 │ └── test_training.py ├── tasks ├── test_data_albert_pretraining.py ├── test_data_bart_pretraining.py ├── test_data_base.py ├── test_data_bert_pretraining.py ├── test_data_causal_lm.py ├── test_data_masked_lm.py ├── test_data_sequence_classification.py ├── test_data_summarization.py ├── test_data_t5_pretraining.py ├── test_data_token_classification.py └── test_model_bart_pretraining.py ├── test_kernel_fusion_utils.py └── trainer ├── oslo_user_config.json ├── test_oslo_config.py ├── test_trainer_basic.py ├── test_trainer_ddp.py ├── test_trainer_dp_zero1.py ├── test_trainer_pp.py ├── test_trainer_reload.py ├── test_trainer_tp_1d.py ├── test_trainer_tp_2d.py ├── test_trainer_tp_2p5d.py └── test_trainer_tp_3d.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @hyunwoongko 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Report a bug 3 | about: Bug report 4 | labels: 'bug' 5 | --- 6 | 7 | ## How to reproduce 8 | 9 | ```python 10 | ``` 11 | 12 | ## Environment 13 | 14 | - OS : 15 | - Python version : 16 | - Transformers version : 17 | - Whether to use Docker: 18 | - Misc.: 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request a feature 3 | about: Feature request 4 | labels: 'enhancement' 5 | --- 6 | 7 | ## Describe a requested feature 8 | 9 | - 10 | 11 | ## Expected behavior 12 | 13 | ```python 14 | >>> a = 
Foo() 15 | >>> a.predict() 16 | ``` 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/todo.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: TODO feature 3 | about: TODO feature 4 | --- 5 | 6 | ## Describe a TODO feature 7 | 8 | - 9 | 10 | ## Assignees 11 | 12 | - 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Title 2 | 3 | - 4 | 5 | ## Description 6 | 7 | - 8 | 9 | ## Linked Issues 10 | 11 | - resolved #00 12 | -------------------------------------------------------------------------------- /.github/workflows/contributors.yaml: -------------------------------------------------------------------------------- 1 | name: Add contributors 2 | on: 3 | schedule: 4 | - cron: '20 20 * * *' 5 | push: 6 | branches: 7 | - master 8 | 9 | jobs: 10 | add-contributors: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - uses: BobAnkh/add-contributors@master 15 | with: 16 | CONTRIBUTOR: 'Thanks so much to all of our amazing contributors!' 
17 | COLUMN_PER_ROW: '6' 18 | ACCESS_TOKEN: ${{secrets.GITHUB_TOKEN}} 19 | IMG_WIDTH: '120' 20 | FONT_SIZE: '14' 21 | PATH: '/README.md' 22 | COMMIT_MESSAGE: 'docs(README): update contributors' 23 | AVATAR_SHAPE: 'round' 24 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: 'docs|assets' 2 | 3 | default_language_version: 4 | python: python3 5 | 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.4.0 9 | hooks: 10 | - id: check-case-conflict 11 | - id: check-json 12 | - id: check-symlinks 13 | - id: check-yaml 14 | - id: destroyed-symlinks 15 | - id: check-ast 16 | - id: check-merge-conflict 17 | - id: check-added-large-files 18 | args: ['--maxkb=500'] 19 | - id: end-of-file-fixer 20 | - id: fix-byte-order-marker 21 | - id: fix-encoding-pragma 22 | args: [--remove] 23 | - id: mixed-line-ending 24 | args: [--fix=lf] 25 | - id: requirements-txt-fixer 26 | - id: trailing-whitespace 27 | 28 | - repo: https://github.com/pocc/pre-commit-hooks 29 | rev: v1.3.5 30 | hooks: 31 | - id: clang-format 32 | 33 | - repo: https://github.com/psf/black 34 | rev: 22.12.0 35 | hooks: 36 | - id: black 37 | language_version: python3.8 38 | 39 | - repo: https://github.com/codespell-project/codespell 40 | rev: v2.2.2 41 | hooks: 42 | - id: codespell 43 | args: [ 44 | '--ignore-words-list=reord,dout,nd,te,ser,mata', # Word used in error messages that need 
rewording 45 | --check-filenames, 46 | --check-hidden, 47 | ] 48 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.txt 2 | global-include *.cu *.cpp *.cc *.cuh *.h *.ldscript *.proto *.cmake 3 | prune dist 4 | prune build 5 | prune tests 6 | include LICENSE.apache-2.0 7 | include LICENSE.3rd_party_library 8 | include README.md 9 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/assets/logo.png -------------------------------------------------------------------------------- /docs/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
3 | config: 6a7a9cb54d9ab2728b51824ec90997d3 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/.nojekyll -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | oslo.eleuther.ai -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_images/260461C3-EA3B-405C-9B34-05BA3C781161.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/260461C3-EA3B-405C-9B34-05BA3C781161.png -------------------------------------------------------------------------------- /docs/_images/2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/2d.png -------------------------------------------------------------------------------- /docs/_images/2p5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/2p5d.png -------------------------------------------------------------------------------- /docs/_images/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg -------------------------------------------------------------------------------- /docs/_images/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg -------------------------------------------------------------------------------- /docs/_images/figure1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/figure1.png -------------------------------------------------------------------------------- /docs/_images/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/figure11.png -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/data_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Data Parallelism 2 | - Authors: Jinwon Kim 3 | 4 | **Data Parallelism** is a widely-used technique for training deep learning models in parallel. It involves distributing the training data across multiple processing units, such as GPUs, each of which has a copy of the model parameters. The data is divided into subsets, and each unit independently computes the gradients for its subset. The gradients are then aggregated to update the model parameters. This approach enables efficient parallelization of the training process and can accelerate the training of deep learning models on large datasets. 5 | 6 | Oslo supports Zero Redundancy Optimizer (ZeRO) to easily scale deep learning model. 
7 | 8 | ## Optimizer-Level Parallel 9 | - [Zero Redundancy Optimizer DP](dp/zero_algorithm.md) 10 | 11 | 12 | ### References 13 | - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054) 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/dp/zero_algorithm.md: -------------------------------------------------------------------------------- 1 | # Zero Redundancy Optimizer DP 2 | - Authors: Jinwon Kim 3 | - Paper: https://arxiv.org/abs/1910.02054 4 | 5 | ![figure1.png](zero_image/figure1.png) 6 | 7 | The Zero Redundancy Optimizer for Data Parallelism (ZeRO-DP) is a technique used to remove memory state redundancies and optimize computational efficiency in data parallel distributed deep learning. ZeRO-DP partitions the model states across data-parallel processes, eliminating the need for replication of model parameters, which in turn reduces memory usage and communication overhead during training. 8 | 9 | ## Optimizer State Partitioning (Level 1) 10 | - The optimizer states are partitioned across data parallel processes 11 | ## Gradient Partitioning (Level 2) 12 | - The reduced gradients are partitioned based on the corresponding parameter and are reduced only by the data parallel process responsible for updating those parameters. After the reduction, the memory can be released. 13 | ## Parameter Partitioning (Level 3) 14 | - Similar to the optimizer states and gradients, each process only stores the parameters associated with its partition. 
15 | 16 | -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/tensor_model_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Tensor Model Parallelism 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | 4 | **Tensor Model Parallelism** makes it possible to train larger models by partitioning the parameter tensors into multiple dimensions. 5 | We support 1D, 2D, 2.5D, and 3D tensor partitioning algorithms which make tensor parallel training more efficient. 6 | 7 | ## Tensor Parallel Algorithms 8 | - [1D parallel algorithm (same as Megatron-LM)](tp/1d_parallel_algorithm.md) 9 | - [2D parallel (SUMMA) algorithm](tp/2d_parallel_algorithm.md) 10 | - [2.5D parallel (SUMMA-2.5) algorithm](tp/2p5d_parallel_algorithm.md) 11 | - [3D parallel Algorithm](tp/3d_parallel_algorithm.md) 12 | 13 | ### References 14 | - [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 15 | - [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343) 16 | - [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500) 17 | - [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450) 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/tp/2d_parallel_algorithm.md: -------------------------------------------------------------------------------- 1 | # 2D parallel (SUMMA) algorithm 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | - Paper : [https://arxiv.org/pdf/2104.05343.pdf](https://arxiv.org/pdf/2104.05343.pdf) 4 | 5 | ![image.png](2d_image/2d.png) 6 | 7 | The use of 1D tensor parallelism can lead to high memory consumption in large-scale models because it does not partition activations. 
8 | To address this issue, a more efficient 2D tensor parallelism algorithm based on SUMMA was introduced. This algorithm evenly distributes computation and memory load. 9 | For instance, when computing a linear layer $Y = XA$, the input $X$ and weight $A$ are split into four sub-matrices and the calculation is done in two steps, broadcasting rows and columns of $X$ and $A$ in turn. 10 | The result is a matrix $Y$ that is the product of $X$ and $A$. 11 | 12 | ## Usage 13 | 14 | Use `ParallelMode.TENSOR_2D` as a parameter of `tensor_parallel_mode`. Since the algorithm splits model along both rows and columns, `tp_size` should be a **square of positive integer**. 15 | 16 | ```python 17 | from oslo import ParallelContext, ParallelMode 18 | from oslo.torch.nn.parallel import TensorParallel 19 | 20 | tp_size = 4 21 | tp_depth = 1 22 | 23 | parallel_context = ParallelContext.from_torch( 24 | data_parallel_size=1, 25 | pipeline_parallel_size=1, 26 | tensor_parallel_size=tp_size, 27 | tensor_parallel_mode=ParallelMode.TENSOR_2D, 28 | ) 29 | model = TensorParallel(model, parallel_context) 30 | oslo.ready(model, parallel_context) 31 | ``` -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'en', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '', 11 | NAVIGATION_WITH_KEYS: true, 12 | SHOW_SEARCH_SUMMARY: true, 13 | ENABLE_SEARCH_SHORTCUTS: true, 14 | }; -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/images/logo_binder.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 10 | logo 11 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/_static/images/logo_colab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/images/logo_colab.png -------------------------------------------------------------------------------- /docs/_static/images/logo_deepnote.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/_static/locales/ar/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ar\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "بواسطة" 13 | 14 | msgid "repository" 15 | msgstr "مخزن" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "وضع ملء الشاشة" 19 | 20 | msgid "Contents" 21 | msgstr "محتويات" 22 | 23 | msgid "Download source file" 24 | msgstr "تنزيل ملف المصدر" 25 | 26 | msgid "Edit this page" 27 | msgstr "قم بتحرير هذه الصفحة" 28 | 29 | msgid "Last updated on" 30 | msgstr "آخر تحديث في" 31 | 32 | msgid "Print to PDF" 33 | msgstr "طباعة إلى PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "أقترح تحرير" 37 | 38 | msgid "Download this page" 39 | msgstr "قم بتنزيل هذه الصفحة" 40 | 41 | msgid 
"Toggle navigation" 42 | msgstr "تبديل التنقل" 43 | 44 | msgid "By the" 45 | msgstr "بواسطة" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "موضوع كتاب أبو الهول" 49 | 50 | msgid "previous page" 51 | msgstr "الصفحة السابقة" 52 | 53 | msgid "Source repository" 54 | msgstr "مستودع المصدر" 55 | 56 | msgid "open issue" 57 | msgstr "قضية مفتوحة" 58 | 59 | msgid "Download notebook file" 60 | msgstr "تنزيل ملف دفتر الملاحظات" 61 | 62 | msgid "Copyright" 63 | msgstr "حقوق النشر" 64 | 65 | msgid "Theme by the" 66 | msgstr "موضوع بواسطة" 67 | 68 | msgid "Open an issue" 69 | msgstr "افتح قضية" 70 | 71 | msgid "next page" 72 | msgstr "الصفحة التالية" 73 | 74 | msgid "Launch" 75 | msgstr "إطلاق" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/bn/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: bn\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "দ্বারা" 13 | 14 | msgid "Download source file" 15 | msgstr "উত্স ফাইল ডাউনলোড করুন" 16 | 17 | msgid "Edit this page" 18 | msgstr "এই পৃষ্ঠাটি সম্পাদনা করুন" 19 | 20 | msgid "Last updated on" 21 | msgstr "সর্বশেষ আপডেট" 22 | 23 | msgid "Print to PDF" 24 | msgstr "পিডিএফ প্রিন্ট করুন" 25 | 26 | msgid "Download this page" 27 | msgstr "এই পৃষ্ঠাটি ডাউনলোড করুন" 28 | 29 | msgid "Toggle navigation" 30 | msgstr "নেভিগেশন টগল করুন" 31 | 32 | msgid "By the" 33 | msgstr "দ্বারা" 34 | 35 | msgid "Sphinx Book Theme" 36 | msgstr "স্পিনিক্স বুক থিম" 37 | 38 | msgid "previous page" 39 | msgstr "আগের পৃষ্ঠা" 40 | 41 | msgid "Source repository" 42 | msgstr "উত্স সংগ্রহস্থল" 43 | 44 | msgid "open issue" 45 | msgstr "খোলা সমস্যা" 46 | 47 | msgid "Download notebook file" 48 | msgstr "নোটবুক ফাইল 
ডাউনলোড করুন" 49 | 50 | msgid "Copyright" 51 | msgstr "কপিরাইট" 52 | 53 | msgid "Theme by the" 54 | msgstr "থিম দ্বারা" 55 | 56 | msgid "Open an issue" 57 | msgstr "একটি সমস্যা খুলুন" 58 | 59 | msgid "next page" 60 | msgstr "পরবর্তী পৃষ্ঠা" 61 | 62 | msgid "Launch" 63 | msgstr "শুরু করা" 64 | -------------------------------------------------------------------------------- /docs/_static/locales/ca/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ca\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Per" 13 | 14 | msgid "Download source file" 15 | msgstr "Baixeu el fitxer font" 16 | 17 | msgid "Edit this page" 18 | msgstr "Editeu aquesta pàgina" 19 | 20 | msgid "Last updated on" 21 | msgstr "Darrera actualització el" 22 | 23 | msgid "Print to PDF" 24 | msgstr "Imprimeix a PDF" 25 | 26 | msgid "suggest edit" 27 | msgstr "suggerir edició" 28 | 29 | msgid "Download this page" 30 | msgstr "Descarregueu aquesta pàgina" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "Commuta la navegació" 34 | 35 | msgid "By the" 36 | msgstr "Per la" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "Tema del llibre Esfinx" 40 | 41 | msgid "previous page" 42 | msgstr "Pàgina anterior" 43 | 44 | msgid "Source repository" 45 | msgstr "Dipòsit de fonts" 46 | 47 | msgid "open issue" 48 | msgstr "número obert" 49 | 50 | msgid "Download notebook file" 51 | msgstr "Descarregar fitxer de quadern" 52 | 53 | msgid "Copyright" 54 | msgstr "Copyright" 55 | 56 | msgid "Theme by the" 57 | msgstr "Tema del" 58 | 59 | msgid "Open an issue" 60 | msgstr "Obriu un número" 61 | 62 | msgid "next page" 63 | msgstr "pàgina següent" 64 | 65 | msgid "Launch" 66 | msgstr "Llançament" 67 | 
-------------------------------------------------------------------------------- /docs/_static/locales/da/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: da\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Ved" 13 | 14 | msgid "repository" 15 | msgstr "lager" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "Fuldskærmstilstand" 19 | 20 | msgid "Contents" 21 | msgstr "Indhold" 22 | 23 | msgid "Download source file" 24 | msgstr "Download kildefil" 25 | 26 | msgid "Edit this page" 27 | msgstr "Rediger denne side" 28 | 29 | msgid "Last updated on" 30 | msgstr "Sidst opdateret den" 31 | 32 | msgid "Print to PDF" 33 | msgstr "Udskriv til PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "foreslå redigering" 37 | 38 | msgid "Download this page" 39 | msgstr "Download denne side" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "Skift navigation" 43 | 44 | msgid "By the" 45 | msgstr "Ved" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "Sphinx bogtema" 49 | 50 | msgid "previous page" 51 | msgstr "forrige side" 52 | 53 | msgid "Source repository" 54 | msgstr "Kildelager" 55 | 56 | msgid "open issue" 57 | msgstr "åbent nummer" 58 | 59 | msgid "Download notebook file" 60 | msgstr "Download notesbog-fil" 61 | 62 | msgid "Copyright" 63 | msgstr "ophavsret" 64 | 65 | msgid "Theme by the" 66 | msgstr "Tema af" 67 | 68 | msgid "Open an issue" 69 | msgstr "Åbn et problem" 70 | 71 | msgid "next page" 72 | msgstr "Næste side" 73 | 74 | msgid "Launch" 75 | msgstr "Start" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/iw/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | 
msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: iw\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "על ידי" 13 | 14 | msgid "repository" 15 | msgstr "מאגר" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "מצב מסך מלא" 19 | 20 | msgid "Contents" 21 | msgstr "תוכן" 22 | 23 | msgid "Download source file" 24 | msgstr "הורד את קובץ המקור" 25 | 26 | msgid "Edit this page" 27 | msgstr "ערוך דף זה" 28 | 29 | msgid "Last updated on" 30 | msgstr "עודכן לאחרונה ב" 31 | 32 | msgid "Print to PDF" 33 | msgstr "הדפס לקובץ PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "מציע לערוך" 37 | 38 | msgid "Download this page" 39 | msgstr "הורד דף זה" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "החלף ניווט" 43 | 44 | msgid "By the" 45 | msgstr "דרך" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "נושא ספר ספינקס" 49 | 50 | msgid "previous page" 51 | msgstr "עמוד קודם" 52 | 53 | msgid "Source repository" 54 | msgstr "מאגר המקורות" 55 | 56 | msgid "open issue" 57 | msgstr "בעיה פתוחה" 58 | 59 | msgid "Download notebook file" 60 | msgstr "הורד קובץ מחברת" 61 | 62 | msgid "Copyright" 63 | msgstr "זכויות יוצרים" 64 | 65 | msgid "Theme by the" 66 | msgstr "נושא מאת" 67 | 68 | msgid "Open an issue" 69 | msgstr "פתח גיליון" 70 | 71 | msgid "next page" 72 | msgstr "עמוד הבא" 73 | 74 | msgid "Launch" 75 | msgstr "לְהַשִׁיק" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/ja/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ja\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | 
msgid "By" 12 | msgstr "著者" 13 | 14 | msgid "repository" 15 | msgstr "リポジトリ" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "全画面モード" 19 | 20 | msgid "Contents" 21 | msgstr "目次" 22 | 23 | msgid "Download source file" 24 | msgstr "ソースファイルをダウンロード" 25 | 26 | msgid "Edit this page" 27 | msgstr "このページを編集" 28 | 29 | msgid "Last updated on" 30 | msgstr "最終更新日" 31 | 32 | msgid "Print to PDF" 33 | msgstr "PDFに印刷" 34 | 35 | msgid "suggest edit" 36 | msgstr "編集を提案する" 37 | 38 | msgid "Download this page" 39 | msgstr "このページをダウンロード" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "ナビゲーションを切り替え" 43 | 44 | msgid "By the" 45 | msgstr "によって" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "スフィンクスの本のテーマ" 49 | 50 | msgid "previous page" 51 | msgstr "前のページ" 52 | 53 | msgid "Source repository" 54 | msgstr "ソースリポジトリ" 55 | 56 | msgid "open issue" 57 | msgstr "未解決の問題" 58 | 59 | msgid "Download notebook file" 60 | msgstr "ノートブックファイルをダウンロード" 61 | 62 | msgid "Copyright" 63 | msgstr "Copyright" 64 | 65 | msgid "Theme by the" 66 | msgstr "のテーマ" 67 | 68 | msgid "Open an issue" 69 | msgstr "問題を報告" 70 | 71 | msgid "next page" 72 | msgstr "次のページ" 73 | 74 | msgid "Launch" 75 | msgstr "起動" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/ko/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ko\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "으로" 13 | 14 | msgid "repository" 15 | msgstr "저장소" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "전체 화면으로보기" 19 | 20 | msgid "Contents" 21 | msgstr "내용" 22 | 23 | msgid "Download source file" 24 | msgstr "소스 파일 다운로드" 25 | 26 | msgid "Edit this page" 27 | msgstr "이 페이지 편집" 28 | 29 | msgid "Last updated on" 30 | 
msgstr "마지막 업데이트" 31 | 32 | msgid "Print to PDF" 33 | msgstr "PDF로 인쇄" 34 | 35 | msgid "suggest edit" 36 | msgstr "편집 제안" 37 | 38 | msgid "Download this page" 39 | msgstr "이 페이지 다운로드" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "탐색 전환" 43 | 44 | msgid "By the" 45 | msgstr "에 의해" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "스핑크스 도서 테마" 49 | 50 | msgid "previous page" 51 | msgstr "이전 페이지" 52 | 53 | msgid "Source repository" 54 | msgstr "소스 저장소" 55 | 56 | msgid "open issue" 57 | msgstr "열린 문제" 58 | 59 | msgid "Download notebook file" 60 | msgstr "노트북 파일 다운로드" 61 | 62 | msgid "Copyright" 63 | msgstr "저작권" 64 | 65 | msgid "Theme by the" 66 | msgstr "테마별" 67 | 68 | msgid "Open an issue" 69 | msgstr "이슈 열기" 70 | 71 | msgid "next page" 72 | msgstr "다음 페이지" 73 | 74 | msgid "Launch" 75 | msgstr "시작하다" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/ml/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ml\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "എഴുതിയത്" 13 | 14 | msgid "Download source file" 15 | msgstr "ഉറവിട ഫയൽ ഡൗൺലോഡുചെയ്യുക" 16 | 17 | msgid "Edit this page" 18 | msgstr "ഈ പേജ് എഡിറ്റുചെയ്യുക" 19 | 20 | msgid "Last updated on" 21 | msgstr "അവസാനം അപ്‌ഡേറ്റുചെയ്‌തത്" 22 | 23 | msgid "Print to PDF" 24 | msgstr "PDF- ലേക്ക് പ്രിന്റുചെയ്യുക" 25 | 26 | msgid "suggest edit" 27 | msgstr "എഡിറ്റുചെയ്യാൻ നിർദ്ദേശിക്കുക" 28 | 29 | msgid "Download this page" 30 | msgstr "ഈ പേജ് ഡൗൺലോഡുചെയ്യുക" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "നാവിഗേഷൻ ടോഗിൾ ചെയ്യുക" 34 | 35 | msgid "By the" 36 | msgstr "എഴുതിയത്" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "സ്ഫിങ്ക്സ് പുസ്തക തീം" 40 | 41 | msgid "previous 
page" 42 | msgstr "മുൻപത്തെ താൾ" 43 | 44 | msgid "Source repository" 45 | msgstr "ഉറവിട ശേഖരം" 46 | 47 | msgid "open issue" 48 | msgstr "തുറന്ന പ്രശ്നം" 49 | 50 | msgid "Download notebook file" 51 | msgstr "നോട്ട്ബുക്ക് ഫയൽ ഡൺലോഡ് ചെയ്യുക" 52 | 53 | msgid "Copyright" 54 | msgstr "പകർപ്പവകാശം" 55 | 56 | msgid "Theme by the" 57 | msgstr "പ്രമേയം" 58 | 59 | msgid "Open an issue" 60 | msgstr "ഒരു പ്രശ്നം തുറക്കുക" 61 | 62 | msgid "next page" 63 | msgstr "അടുത്ത പേജ്" 64 | 65 | msgid "Launch" 66 | msgstr "സമാരംഭിക്കുക" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/mr/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: mr\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "द्वारा" 13 | 14 | msgid "Download source file" 15 | msgstr "स्त्रोत फाइल डाउनलोड करा" 16 | 17 | msgid "Edit this page" 18 | msgstr "हे पृष्ठ संपादित करा" 19 | 20 | msgid "Last updated on" 21 | msgstr "अखेरचे अद्यतनित" 22 | 23 | msgid "Print to PDF" 24 | msgstr "पीडीएफवर मुद्रित करा" 25 | 26 | msgid "suggest edit" 27 | msgstr "संपादन सुचवा" 28 | 29 | msgid "Download this page" 30 | msgstr "हे पृष्ठ डाउनलोड करा" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "नेव्हिगेशन टॉगल करा" 34 | 35 | msgid "By the" 36 | msgstr "द्वारा" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "स्फिंक्स बुक थीम" 40 | 41 | msgid "previous page" 42 | msgstr "मागील पान" 43 | 44 | msgid "Source repository" 45 | msgstr "स्त्रोत भांडार" 46 | 47 | msgid "open issue" 48 | msgstr "खुला मुद्दा" 49 | 50 | msgid "Download notebook file" 51 | msgstr "नोटबुक फाईल डाउनलोड करा" 52 | 53 | msgid "Copyright" 54 | msgstr "कॉपीराइट" 55 | 56 | msgid "Theme by the" 57 | msgstr "द्वारा थीम" 58 | 
59 | msgid "Open an issue" 60 | msgstr "एक मुद्दा उघडा" 61 | 62 | msgid "next page" 63 | msgstr "पुढील पृष्ठ" 64 | 65 | msgid "Launch" 66 | msgstr "लाँच करा" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/ms/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ms\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Oleh" 13 | 14 | msgid "Download source file" 15 | msgstr "Muat turun fail sumber" 16 | 17 | msgid "Edit this page" 18 | msgstr "Edit halaman ini" 19 | 20 | msgid "Last updated on" 21 | msgstr "Terakhir dikemas kini pada" 22 | 23 | msgid "Print to PDF" 24 | msgstr "Cetak ke PDF" 25 | 26 | msgid "suggest edit" 27 | msgstr "cadangkan edit" 28 | 29 | msgid "Download this page" 30 | msgstr "Muat turun halaman ini" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "Togol navigasi" 34 | 35 | msgid "By the" 36 | msgstr "Oleh" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "Tema Buku Sphinx" 40 | 41 | msgid "previous page" 42 | msgstr "halaman sebelumnya" 43 | 44 | msgid "Source repository" 45 | msgstr "Repositori sumber" 46 | 47 | msgid "open issue" 48 | msgstr "isu terbuka" 49 | 50 | msgid "Download notebook file" 51 | msgstr "Muat turun fail buku nota" 52 | 53 | msgid "Copyright" 54 | msgstr "hak cipta" 55 | 56 | msgid "Theme by the" 57 | msgstr "Tema oleh" 58 | 59 | msgid "Open an issue" 60 | msgstr "Buka masalah" 61 | 62 | msgid "next page" 63 | msgstr "muka surat seterusnya" 64 | 65 | msgid "Launch" 66 | msgstr "Lancarkan" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/ta/LC_MESSAGES/booktheme.po: 
-------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ta\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "வழங்கியவர்" 13 | 14 | msgid "Download source file" 15 | msgstr "மூல கோப்பைப் பதிவிறக்குக" 16 | 17 | msgid "Edit this page" 18 | msgstr "இந்தப் பக்கத்தைத் திருத்தவும்" 19 | 20 | msgid "Last updated on" 21 | msgstr "கடைசியாக புதுப்பிக்கப்பட்டது" 22 | 23 | msgid "Print to PDF" 24 | msgstr "PDF இல் அச்சிடுக" 25 | 26 | msgid "suggest edit" 27 | msgstr "திருத்த பரிந்துரைக்கவும்" 28 | 29 | msgid "Download this page" 30 | msgstr "இந்தப் பக்கத்தைப் பதிவிறக்கவும்" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "வழிசெலுத்தலை நிலைமாற்று" 34 | 35 | msgid "By the" 36 | msgstr "மூலம்" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "ஸ்பிங்க்ஸ் புத்தக தீம்" 40 | 41 | msgid "previous page" 42 | msgstr "முந்தைய பக்கம்" 43 | 44 | msgid "Source repository" 45 | msgstr "மூல களஞ்சியம்" 46 | 47 | msgid "open issue" 48 | msgstr "திறந்த பிரச்சினை" 49 | 50 | msgid "Download notebook file" 51 | msgstr "நோட்புக் கோப்பைப் பதிவிறக்கவும்" 52 | 53 | msgid "Copyright" 54 | msgstr "பதிப்புரிமை" 55 | 56 | msgid "Theme by the" 57 | msgstr "வழங்கிய தீம்" 58 | 59 | msgid "Open an issue" 60 | msgstr "சிக்கலைத் திறக்கவும்" 61 | 62 | msgid "next page" 63 | msgstr "அடுத்த பக்கம்" 64 | 65 | msgid "Launch" 66 | msgstr "தொடங்க" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/te/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | 
"Language: te\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "ద్వారా" 13 | 14 | msgid "Download source file" 15 | msgstr "మూల ఫైల్‌ను డౌన్‌లోడ్ చేయండి" 16 | 17 | msgid "Edit this page" 18 | msgstr "ఈ పేజీని సవరించండి" 19 | 20 | msgid "Last updated on" 21 | msgstr "చివరిగా నవీకరించబడింది" 22 | 23 | msgid "Print to PDF" 24 | msgstr "PDF కి ముద్రించండి" 25 | 26 | msgid "suggest edit" 27 | msgstr "సవరించమని సూచించండి" 28 | 29 | msgid "Download this page" 30 | msgstr "ఈ పేజీని డౌన్‌లోడ్ చేయండి" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "నావిగేషన్‌ను టోగుల్ చేయండి" 34 | 35 | msgid "By the" 36 | msgstr "ద్వారా" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "సింహిక పుస్తక థీమ్" 40 | 41 | msgid "previous page" 42 | msgstr "ముందు పేజి" 43 | 44 | msgid "Source repository" 45 | msgstr "మూల రిపోజిటరీ" 46 | 47 | msgid "open issue" 48 | msgstr "ఓపెన్ ఇష్యూ" 49 | 50 | msgid "Download notebook file" 51 | msgstr "నోట్బుక్ ఫైల్ను డౌన్లోడ్ చేయండి" 52 | 53 | msgid "Copyright" 54 | msgstr "కాపీరైట్" 55 | 56 | msgid "Theme by the" 57 | msgstr "ద్వారా థీమ్" 58 | 59 | msgid "Open an issue" 60 | msgstr "సమస్యను తెరవండి" 61 | 62 | msgid "next page" 63 | msgstr "తరువాతి పేజీ" 64 | 65 | msgid "Launch" 66 | msgstr "ప్రారంభించండి" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/th/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: th\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "โดย" 13 | 14 | msgid "repository" 15 | msgstr "ที่เก็บ" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "โหมดเต็มหน้าจอ" 19 | 20 | msgid "Contents" 21 | msgstr "สารบัญ" 22 | 23 | msgid "Download source file" 24 | 
msgstr "ดาวน์โหลดไฟล์ต้นฉบับ" 25 | 26 | msgid "Edit this page" 27 | msgstr "แก้ไขหน้านี้" 28 | 29 | msgid "Last updated on" 30 | msgstr "ปรับปรุงล่าสุดเมื่อ" 31 | 32 | msgid "Print to PDF" 33 | msgstr "พิมพ์เป็น PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "แนะนำแก้ไข" 37 | 38 | msgid "Download this page" 39 | msgstr "ดาวน์โหลดหน้านี้" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "ไม่ต้องสลับช่องทาง" 43 | 44 | msgid "By the" 45 | msgstr "โดย" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "ธีมหนังสือสฟิงซ์" 49 | 50 | msgid "previous page" 51 | msgstr "หน้าที่แล้ว" 52 | 53 | msgid "Source repository" 54 | msgstr "ที่เก็บซอร์ส" 55 | 56 | msgid "open issue" 57 | msgstr "เปิดปัญหา" 58 | 59 | msgid "Download notebook file" 60 | msgstr "ดาวน์โหลดไฟล์สมุดบันทึก" 61 | 62 | msgid "Copyright" 63 | msgstr "ลิขสิทธิ์" 64 | 65 | msgid "Theme by the" 66 | msgstr "ธีมโดย" 67 | 68 | msgid "Open an issue" 69 | msgstr "เปิดปัญหา" 70 | 71 | msgid "next page" 72 | msgstr "หน้าต่อไป" 73 | 74 | msgid "Launch" 75 | msgstr "เปิด" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/tl/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: tl\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Ni" 13 | 14 | msgid "Download source file" 15 | msgstr "Mag-download ng file ng pinagmulan" 16 | 17 | msgid "Edit this page" 18 | msgstr "I-edit ang pahinang ito" 19 | 20 | msgid "Last updated on" 21 | msgstr "Huling na-update noong" 22 | 23 | msgid "Print to PDF" 24 | msgstr "I-print sa PDF" 25 | 26 | msgid "suggest edit" 27 | msgstr "iminumungkahi i-edit" 28 | 29 | msgid "Download this page" 30 | msgstr "I-download ang pahinang ito" 31 | 32 | msgid 
"Toggle navigation" 33 | msgstr "I-toggle ang pag-navigate" 34 | 35 | msgid "By the" 36 | msgstr "Sa pamamagitan ng" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "Tema ng Sphinx Book" 40 | 41 | msgid "previous page" 42 | msgstr "Nakaraang pahina" 43 | 44 | msgid "Source repository" 45 | msgstr "Pinagmulan ng imbakan" 46 | 47 | msgid "open issue" 48 | msgstr "bukas na isyu" 49 | 50 | msgid "Download notebook file" 51 | msgstr "Mag-download ng file ng notebook" 52 | 53 | msgid "Copyright" 54 | msgstr "Copyright" 55 | 56 | msgid "Theme by the" 57 | msgstr "Tema ng" 58 | 59 | msgid "Open an issue" 60 | msgstr "Magbukas ng isyu" 61 | 62 | msgid "next page" 63 | msgstr "Susunod na pahina" 64 | 65 | msgid "Launch" 66 | msgstr "Ilunsad" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/ur/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ur\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "بذریعہ" 13 | 14 | msgid "Download source file" 15 | msgstr "سورس فائل ڈاؤن لوڈ کریں" 16 | 17 | msgid "Edit this page" 18 | msgstr "اس صفحے میں ترمیم کریں" 19 | 20 | msgid "Last updated on" 21 | msgstr "آخری بار تازہ کاری ہوئی" 22 | 23 | msgid "Print to PDF" 24 | msgstr "پی ڈی ایف پرنٹ کریں" 25 | 26 | msgid "suggest edit" 27 | msgstr "ترمیم کی تجویز کریں" 28 | 29 | msgid "Download this page" 30 | msgstr "اس صفحے کو ڈاؤن لوڈ کریں" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "نیویگیشن ٹوگل کریں" 34 | 35 | msgid "By the" 36 | msgstr "کی طرف" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "سپنکس بک تھیم" 40 | 41 | msgid "previous page" 42 | msgstr "سابقہ ​​صفحہ" 43 | 44 | msgid "Source repository" 45 | msgstr "ماخذ ذخیرہ" 46 | 47 | 
msgid "open issue" 48 | msgstr "کھلا مسئلہ" 49 | 50 | msgid "Download notebook file" 51 | msgstr "نوٹ بک فائل ڈاؤن لوڈ کریں" 52 | 53 | msgid "Copyright" 54 | msgstr "کاپی رائٹ" 55 | 56 | msgid "Theme by the" 57 | msgstr "کے ذریعہ تھیم" 58 | 59 | msgid "Open an issue" 60 | msgstr "ایک مسئلہ کھولیں" 61 | 62 | msgid "next page" 63 | msgstr "اگلا صفحہ" 64 | 65 | msgid "Launch" 66 | msgstr "لانچ کریں" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/zh_CN/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: zh_CN\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "作者:" 13 | 14 | msgid "repository" 15 | msgstr "仓库" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "全屏模式" 19 | 20 | msgid "Contents" 21 | msgstr "目录" 22 | 23 | msgid "Download source file" 24 | msgstr "下载源文件" 25 | 26 | msgid "Edit this page" 27 | msgstr "编辑此页面" 28 | 29 | msgid "Last updated on" 30 | msgstr "上次更新时间:" 31 | 32 | msgid "Print to PDF" 33 | msgstr "列印成 PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "提出修改建议" 37 | 38 | msgid "Download this page" 39 | msgstr "下载此页面" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "显示或隐藏导航栏" 43 | 44 | msgid "By the" 45 | msgstr "作者:" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "Sphinx Book 主题" 49 | 50 | msgid "previous page" 51 | msgstr "上一页" 52 | 53 | msgid "Source repository" 54 | msgstr "源码库" 55 | 56 | msgid "open issue" 57 | msgstr "创建议题" 58 | 59 | msgid "Download notebook file" 60 | msgstr "下载笔记本文件" 61 | 62 | msgid "Copyright" 63 | msgstr "版权" 64 | 65 | msgid "Theme by the" 66 | msgstr "主题作者:" 67 | 68 | msgid "Open an issue" 69 | msgstr "创建议题" 70 | 71 | msgid "next page" 72 | msgstr "下一页" 73 | 74 | msgid "Launch" 
75 | msgstr "启动" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/zh_TW/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: zh_TW\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "作者:" 13 | 14 | msgid "repository" 15 | msgstr "儲存庫" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "全螢幕模式" 19 | 20 | msgid "Contents" 21 | msgstr "目錄" 22 | 23 | msgid "Download source file" 24 | msgstr "下載原始檔" 25 | 26 | msgid "Edit this page" 27 | msgstr "編輯此頁面" 28 | 29 | msgid "Last updated on" 30 | msgstr "最後更新時間:" 31 | 32 | msgid "Print to PDF" 33 | msgstr "列印成 PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "提出修改建議" 37 | 38 | msgid "Download this page" 39 | msgstr "下載此頁面" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "顯示或隱藏導覽列" 43 | 44 | msgid "By the" 45 | msgstr "作者:" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "Sphinx Book 佈景主題" 49 | 50 | msgid "previous page" 51 | msgstr "上一頁" 52 | 53 | msgid "Source repository" 54 | msgstr "來源儲存庫" 55 | 56 | msgid "open issue" 57 | msgstr "公開的問題" 58 | 59 | msgid "Download notebook file" 60 | msgstr "下載 Notebook 檔案" 61 | 62 | msgid "Copyright" 63 | msgstr "Copyright" 64 | 65 | msgid "Theme by the" 66 | msgstr "佈景主題作者:" 67 | 68 | msgid "Open an issue" 69 | msgstr "開啟議題" 70 | 71 | msgid "next page" 72 | msgstr "下一頁" 73 | 74 | msgid "Launch" 75 | msgstr "啟動" 76 | -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/minus.png 
-------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/_static/sbt-webpack-macros.html: -------------------------------------------------------------------------------- 1 | 5 | {% macro head_pre_bootstrap() %} 6 | 7 | {% endmacro %} 8 | 9 | {% macro body_post() %} 10 | 11 | {% endmacro %} 12 | -------------------------------------------------------------------------------- /docs/_static/styles/theme.css: -------------------------------------------------------------------------------- 1 | /* Provided by Sphinx's 'basic' theme, and included in the final set of assets */ 2 | @import "../basic.css"; 3 | -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2 -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2 -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2 -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.woff2 -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/objects.inv -------------------------------------------------------------------------------- /docs/source/CONCEPTS/data_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Data Parallelism 2 | - Authors: Jinwon Kim 3 | 4 | **Data Parallelism** is a widely-used technique for training deep learning models in parallel. 
It involves distributing the training data across multiple processing units, such as GPUs, each of which has a copy of the model parameters. The data is divided into subsets, and each unit independently computes the gradients for its subset. The gradients are then aggregated to update the model parameters. This approach enables efficient parallelization of the training process and can accelerate the training of deep learning models on large datasets. 5 | 6 | Oslo supports Zero Redundancy Optimizer (ZeRO) to easily scale deep learning model. 7 | 8 | ## Optimizer-Level Parallel 9 | - [Zero Redundancy Optimizer DP](dp/zero_algorithm.md) 10 | 11 | 12 | ### References 13 | - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054) 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/source/CONCEPTS/dp/zero_algorithm.md: -------------------------------------------------------------------------------- 1 | # Zero Redundancy Optimizer DP 2 | - Authors: Jinwon Kim 3 | - Paper: https://arxiv.org/abs/1910.02054 4 | 5 | ![figure1.png](zero_image/figure1.png) 6 | 7 | The Zero Redundancy Optimizer for Data Parallelism (ZeRO-DP) is a technique used to remove memory state redundancies and optimize computational efficiency in data parallel distributed deep learning. ZeRO-DP partitions the model states across data-parallel processes, eliminating the need for replication of model parameters, which in turn reduces memory usage and communication overhead during training. 8 | 9 | ## Optimizer State Partitioning (Level 1) 10 | - The optimizer states are partitioned across data parallel processes 11 | ## Gradient Partitioning (Level 2) 12 | - The reduced gradients are partitioned based on the corresponding parameter and are reduced only by the data parallel process responsible for updating those parameters. After the reduction, the memory can be released. 
13 | ## Parameter Partitioning (Level 3) 14 | - Similar to the optimizer states and gradients, each process only stores the parameters associated with its partition. 15 | 16 | -------------------------------------------------------------------------------- /docs/source/CONCEPTS/dp/zero_image/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/dp/zero_image/figure1.png -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tensor_model_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Tensor Model Parallelism 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | 4 | **Tensor Model Parallelism** makes it possible to train larger models by partitioning the parameter tensors into multiple dimensions. 5 | We support 1D, 2D, 2.5D, and 3D tensor partitioning algorithms which make tensor parallel training more efficient. 
6 | 7 | ## Tensor Parallel Algorithms 8 | - [1D parallel algorithm (same as Megatron-LM)](tp/1d_parallel_algorithm.md) 9 | - [2D parallel (SUMMA) algorithm](tp/2d_parallel_algorithm.md) 10 | - [2.5D parallel (SUMMA-2.5) algorithm](tp/2p5d_parallel_algorithm.md) 11 | - [3D parallel Algorithm](tp/3d_parallel_algorithm.md) 12 | 13 | ### References 14 | - [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 15 | - [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343) 16 | - [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500) 17 | - [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450) 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/1d_image/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/1d_image/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/2d_image/2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/2d_image/2d.png -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/2d_parallel_algorithm.md: -------------------------------------------------------------------------------- 1 | # 2D parallel (SUMMA) algorithm 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | - Paper : [https://arxiv.org/pdf/2104.05343.pdf](https://arxiv.org/pdf/2104.05343.pdf) 4 | 5 | ![image.png](2d_image/2d.png) 6 | 7 | The use of 1D 
tensor parallelism can lead to high memory consumption in large-scale models because it does not partition activations. 8 | To address this issue, a more efficient 2D tensor parallelism algorithm based on SUMMA was introduced. This algorithm evenly distributes computation and memory load. 9 | For instance, when computing a linear layer $Y = XA$, the input $X$ and weight $A$ are split into four sub-matrices and the calculation is done in two steps, broadcasting rows and columns of $X$ and $A$ in turn. 10 | The result is a matrix $Y$ that is the product of $X$ and $A$. 11 | 12 | ## Usage 13 | 14 | Use `ParallelMode.TENSOR_2D` as a parameter of `tensor_parallel_mode`. Since the algorithm splits model along both rows and columns, `tp_size` should be a **square of positive integer**. 15 | 16 | ```python 17 | from oslo import ParallelContext, ParallelMode 18 | from oslo.torch.nn.parallel import TensorParallel 19 | 20 | tp_size = 4 21 | tp_depth = 1 22 | 23 | parallel_context = ParallelContext.from_torch( 24 | data_parallel_size=1, 25 | pipeline_parallel_size=1, 26 | tensor_parallel_size=tp_size, 27 | tensor_parallel_mode=ParallelMode.TENSOR_2D, 28 | ) 29 | model = TensorParallel(model, parallel_context) 30 | oslo.ready(model, parallel_context) 31 | ``` -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/2p5d_image/2p5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/2p5d_image/2p5d.png -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/3d_image/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/3d_image/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg -------------------------------------------------------------------------------- /docs/source/TUTORIALS/image/260461C3-EA3B-405C-9B34-05BA3C781161.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/TUTORIALS/image/260461C3-EA3B-405C-9B34-05BA3C781161.png -------------------------------------------------------------------------------- /docs/source/TUTORIALS/image/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/TUTORIALS/image/figure1.png -------------------------------------------------------------------------------- /gcc_install.sh: -------------------------------------------------------------------------------- 1 | #~/bin/bash 2 | # install for gcc 3 | yum install libaio-devel -y 4 | yum install centos-release-scl -y 5 | yum-config-manager --enable rhel-server-rhscl-7-rpms -y 6 | yum install devtoolset-8 -y 7 | yum install llvm-toolset-7 -y 8 | sudo yum -y install llvm-toolset-7-clang-analyzer llvm-toolset-7-clang-tools-extra 9 | sudo yum -y install pdsh 10 | scl enable devtoolset-8 llvm-toolset-7 bash 11 | -------------------------------------------------------------------------------- /oslo/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.distributed import ParallelContext, ParallelMode 2 | from oslo.torch.utils.extensions import ready_torch 3 | 4 | 5 | def ready(model, parallel_context: ParallelContext): 6 | ready_torch(model, parallel_context) 7 | -------------------------------------------------------------------------------- /oslo/__version__.py: 
-------------------------------------------------------------------------------- 1 | version = "3.0.0" 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.0.1" 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | add_executable(bert_example bert_example.cc) 4 | target_link_libraries(bert_example PUBLIC liblightseq) 5 | 6 | add_executable(transformer_example transformer_example.cc) 7 | target_link_libraries(transformer_example PUBLIC liblightseq) 8 | 9 | add_executable(gpt_example gpt_example.cc) 10 | target_link_libraries(gpt_example PUBLIC liblightseq) 11 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | set(lightseq_kernel_files gemm.cc utils.cc) 5 | 6 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 7 | target_include_directories(lightseq_kernels INTERFACE includes) 8 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/gemm.cc: -------------------------------------------------------------------------------- 1 | #include "kernel_headers.h" 2 | 3 | namespace lightseq { 4 | namespace arm {} // namespace arm 5 | } // namespace lightseq 6 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "utils.h" 14 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/includes/utils.h: -------------------------------------------------------------------------------- 1 | #include "cstdio" 2 | #include "iostream" 3 | 4 | namespace lightseq { 5 | 6 | template 7 | void print_vec(const T *outv, std::string outn, int num_output_ele); 8 | } 9 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(cuda_kernel_files 4 | util.cc.cu 5 | cross_entropy.cu 6 | cublas_wrappers.cu 7 | cuda_util.cu 8 | dropout_kernels.cu 9 | embedding_kernels.cu 10 | embKernels.cc.cu 11 | # fused_adam_kernel.cu 12 | general_kernels.cu 13 | gptKernels.cc.cu 14 | normalize_kernels.cu 15 | softmax_kernels.cu 16 | softmax_kernels_new.cu 17 | transform_kernels.cu 18 | transform_kernels_new.cu 19 | crf.cu 20 | transformerKernels.cc.cu) 21 | 22 | add_library(lightseq_kernels STATIC ${cuda_kernel_files}) 23 | target_link_libraries(lightseq_kernels PUBLIC -lcublas) 24 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/cuda/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "cublas_wrappers.h" 18 | #include "cuda_util.h" 19 | #include "embKernels.h" 20 | #include "gptKernels.h" 21 | #include "kernels.h" 22 | #include "transformerKernels.h" 23 | 
-------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/cuda/includes/ls_cub.cuh: -------------------------------------------------------------------------------- 1 | // copied from https://github.com/dmlc/dgl/pull/2758 2 | #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_ 3 | #define DGL_ARRAY_CUDA_DGL_CUB_CUH_ 4 | 5 | #define CUB_NS_PREFIX namespace ls { 6 | #define CUB_NS_POSTFIX } 7 | #define CUB_NS_QUALIFIER ::ls::cub 8 | #include "cub/cub.cuh" 9 | #include "cub/util_allocator.cuh" 10 | #undef CUB_NS_POSTFIX 11 | #undef CUB_NS_PREFIX 12 | #undef CUB_NS_QUALIFIER 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/x86/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | 5 | set(lightseq_kernel_files util.cc gemm.cpp) 6 | 7 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 8 | target_include_directories(lightseq_kernels PUBLIC ${HDF5_INCLUDE_DIRS}) 9 | target_include_directories(lightseq_kernels INTERFACE includes) 10 | target_link_libraries(lightseq_kernels PRIVATE ${HDF5_LIBRARIES}) 11 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/x86/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #include "kernels.h" 2 | #include "util.h" 3 | #include 4 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/x86/includes/kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cstdio" 3 | #include "util.h" 4 | 5 | namespace lightseq { 6 | namespace x86 { 7 | 8 | template 9 | void matrix_gemm(const InType *inpA, const InType *inpB, OutType *outC, int m, 
10 | int n, int k); 11 | 12 | template 13 | void gemm(bool a_is_packed, bool b_is_packed, bool transpose_a, 14 | bool transpose_b, int64_t m, int64_t n, int64_t k, float alpha, 15 | const AType *a, int64_t lda, const BType *b, int64_t ldb, float beta, 16 | CType *c, int64_t ldc, const CType *a_shift_compensation = nullptr); 17 | 18 | } // namespace x86 19 | } // namespace lightseq 20 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers/includes/cross_entropy_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | template class CrossEntropyLayer { 13 | public: 14 | CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens); 15 | 16 | virtual ~CrossEntropyLayer(); 17 | 18 | void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr, 19 | float *nll_loss_ptr); 20 | 21 | void Backward(const float *grad_outputs_ptr, const T *inputs_ptr, 22 | const int *targets_ptr, T *grad_inputs_ptr); 23 | 24 | void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size); 25 | 26 | private: 27 | void allocate_mem_buffer() { 28 | // allocate local gpu memory 29 | _loss_buffer = cuda_malloc(_max_batch_tokens * 2); 30 | } 31 | 32 | void free_mem_buffer() { 33 | // free local gpu memory 34 | cuda_free(_loss_buffer); 35 | } 36 | 37 | const int _padding_idx; 38 | const float _epsilon; 39 | const int _max_batch_tokens; 40 | 41 | size_t _batch_size; 42 | size_t _seq_len; 43 | size_t _vocab_size; 44 | 45 | float *_loss_buffer; 46 | }; 47 | } // namespace cuda 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
set(layers_files 2 | feed_forward_layer.cpp 3 | linear_layer.cpp 4 | generator_layer.cpp 5 | gpt_attention_layer.cpp 6 | gpt_layer.cpp 7 | multihead_attention_layer.cpp 8 | transformer_encoder_layer.cpp 9 | dec_enc_attention_layer.cpp 10 | dec_self_attention_layer.cpp 11 | transformer_decoder_layer.cpp 12 | crf_layer.cpp 13 | encdec_kv_layer.cpp 14 | sample_layer.cpp 15 | sdpa_layer.cpp) 16 | 17 | add_library(lightseq_layers STATIC ${layers_files}) 18 | target_link_libraries(lightseq_layers PUBLIC lightseq_operators lsflow) 19 | target_include_directories(lightseq_layers PUBLIC includes) 20 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/crf_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "crf.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template class CRFLayer : public Layer { 9 | private: 10 | // operators 11 | CRFOP *_crf_op = nullptr; 12 | 13 | // parameters 14 | Variable *_linear_b; 15 | Variable *_start_transition; 16 | Variable *_end_transition; 17 | Variable *_transition; 18 | 19 | // shape related 20 | int _num_tags; 21 | int _max_batch_tokens; 22 | int _max_batch_size; 23 | 24 | int _seq_len; 25 | int _batch_size; 26 | bool _forward_or_decode; // true for forward, false for decode 27 | bool _output_decode_score; // true for output decode score 28 | 29 | public: 30 | CRFLayer(int num_tags, int max_batch_tokens, int max_batch_size); 31 | 32 | virtual ~CRFLayer() {} 33 | 34 | Variable *operator()(Variable *emission, Variable *mask); 35 | 36 | void before_forward(int batch_size, int seq_len, bool forward_or_decode, 37 | bool output_decode_score); 38 | 39 | int load_params(const std::vector ¶_vec, int offset); 40 | }; 41 | 42 | template class CRFLayer; 43 | #ifdef LIGHTSEQ_cuda 44 | template class CRFLayer<__half>; 45 | #endif 46 | 47 | template using CRFLayerPtr = std::shared_ptr>; 48 | } // 
namespace lightseq 49 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/encdec_kv_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bias_add_transform_20314.h" 3 | #include "layer.h" 4 | #include "linear.h" 5 | 6 | namespace lightseq { 7 | 8 | template class EncDecKvLayer : public Layer { 9 | private: 10 | LinearOp *_kv_linear = nullptr; 11 | BiasAddTrans20314 *_bias_add_transform_20314 = nullptr; 12 | 13 | // parameters 14 | Variable *_enc_kvw; 15 | Variable *_enc_kvb; 16 | 17 | // shape related 18 | size_t _layer_id; 19 | size_t _nshared_layer; 20 | size_t _batch_tokens; 21 | size_t _max_batch_tokens; 22 | size_t _hidden_size; 23 | size_t _heads; 24 | 25 | public: 26 | EncDecKvLayer(size_t nshared_layer, size_t max_batch_tokens, 27 | size_t hidden_size, size_t num_heads); 28 | 29 | virtual ~EncDecKvLayer() {} 30 | 31 | Variable *operator()(Variable *enc_out); 32 | 33 | void before_forward(size_t batch_size, size_t seq_len); 34 | 35 | size_t load_para_and_grad(const T1 *para_ptr, T2 *grad_ptr); 36 | 37 | int load_params(const std::vector ¶_vec, int offset); 38 | }; 39 | 40 | template class EncDecKvLayer; 41 | #ifdef LIGHTSEQ_cuda 42 | template class EncDecKvLayer<__half, __half>; 43 | #endif 44 | 45 | template 46 | using EncDecKvLayerPtr = std::shared_ptr>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/gpt_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "feed_forward_layer.h" 3 | #include "gpt_attention_layer.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template class GptLayer : public Layer { 9 | private: 10 | GptAttentionLayerPtr _attn_layer; 11 | FeedForwardLayerPtr _ffn_layer; 12 | 13 | int _layer_id; 14 | 15 | 
public: 16 | GptLayer(int layer_id, int max_batch_tokens, int max_seq_len, int hidden_size, 17 | int num_heads, int intermediate_size, float attn_prob_dropout_ratio, 18 | float activation_dropout_ratio, float hidden_output_dropout_ratio, 19 | std::string activation_fn, bool mask_future_tokens, 20 | int beam_size = 1); 21 | virtual ~GptLayer() {} 22 | 23 | Variable *operator()(Variable *inp, Variable *cache_k, Variable *cache_v, 24 | Variable *pad_mask); 25 | 26 | void before_forward(int batch_size, int seq_len, int steps) { 27 | _attn_layer->before_forward(batch_size, seq_len, steps); 28 | _ffn_layer->before_forward(batch_size, seq_len); 29 | } 30 | 31 | size_t load_para_and_grad(const T1 *para_ptr, T2 *grad_ptr); 32 | 33 | int load_params(const std::vector ¶_vec, int offset); 34 | }; 35 | 36 | template class GptLayer; 37 | #ifdef LIGHTSEQ_cuda 38 | template class GptLayer<__half, __half>; 39 | #endif 40 | 41 | template 42 | using GptLayerPtr = std::shared_ptr>; 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/linear_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "layer.h" 4 | #include "linear.h" 5 | 6 | namespace lightseq { 7 | 8 | template class LinearLayer : public Layer { 9 | private: 10 | // operators 11 | LinearOp *_linear = nullptr; 12 | 13 | // parameters 14 | Variable *_linear_w; 15 | 16 | // shape related 17 | int _max_batch_tokens; 18 | size_t _input_size; 19 | size_t _output_size; 20 | 21 | public: 22 | LinearLayer(int max_batch_tokens, int input_size, int output_size, 23 | MATRIX_OP opA = MATRIX_OP::Transpose, 24 | MATRIX_OP opB = MATRIX_OP::NonTranspose, float alpha = float(1.)); 25 | 26 | virtual ~LinearLayer() {} 27 | 28 | Variable *operator()(Variable *inp); 29 | 30 | void before_forward(int batch_size, int seq_len); 31 | 32 | void before_backward(); 33 | 34 | 
size_t load_para_and_grad(const T1 *para_ptr, T2 *grad_ptr); 35 | 36 | int load_params(const std::vector ¶_vec, int offset); 37 | }; 38 | 39 | template class LinearLayer; 40 | #ifdef LIGHTSEQ_cuda 41 | template class LinearLayer<__half, __half>; 42 | #endif 43 | 44 | template 45 | using LinearLayerPtr = std::shared_ptr>; 46 | 47 | } // namespace lightseq 48 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/sample_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "beam_search_topk.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template class SampleLayer : public Layer { 9 | private: 10 | // operators 11 | BeamSearchTopOp *_beam_search = nullptr; 12 | 13 | // parameters 14 | Variable *_logit_bias; 15 | size_t _trg_vocab_size; 16 | 17 | public: 18 | SampleLayer(int nshared_layer, int max_batch_size, int max_step, 19 | int trg_vocab_size, int hidden_size, int max_thread_per_block, 20 | int beam_size, int diverse_lambda, int dim_per_head, int end_id, 21 | int head_num, 22 | float length_penalty); // for beam_search 23 | 24 | virtual ~SampleLayer() {} 25 | 26 | std::tuple operator()(Variable *logits, 27 | Variable *alive_seq); 28 | 29 | void before_forward(int batch_size, int cur_step); 30 | 31 | int load_params(const std::vector ¶_vec, int offset); 32 | 33 | bool is_stop() { return _beam_search->is_stop(); } 34 | }; 35 | 36 | template class SampleLayer; 37 | #ifdef LIGHTSEQ_cuda 38 | template class SampleLayer<__half>; 39 | #endif 40 | 41 | template using SampleLayerPtr = std::shared_ptr>; 42 | 43 | } // namespace lightseq 44 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/sdpa_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "dropout.h" 3 | #include "layer.h" 4 | 
#include "softmax.h" 5 | #include "strided_batch_gemm.h" 6 | 7 | namespace lightseq { 8 | 9 | /* 10 | Scaled Dot Product Attention 11 | See paper "Attention is all you need" for details. 12 | */ 13 | template class SDPALayer : public Layer { 14 | private: 15 | // operators 16 | StridedBatchGemmOp *_attn_scores = nullptr; 17 | SoftmaxOp *_softmax = nullptr; 18 | DropoutOp *_attn_prob_dropout = nullptr; 19 | StridedBatchGemmOp *_attn_context = nullptr; 20 | 21 | // shape related 22 | int _max_batch_tokens; 23 | int _max_seq_len; 24 | int _nhead; 25 | int _head_dim; 26 | 27 | public: 28 | SDPALayer(size_t max_batch_tokens, size_t max_seq_len, size_t head_dim, 29 | size_t num_heads, float attn_prob_dropout_ratio); 30 | 31 | virtual ~SDPALayer() {} 32 | 33 | // mask is for enc-self attention and enc-dec-cross attention 34 | Variable *operator()(Variable *query, Variable *key, Variable *value, 35 | Variable *mask = nullptr); 36 | 37 | void before_forward(int batch_size, int query_len, int kv_len, int kv_size, 38 | bool mask_future); 39 | }; 40 | 41 | template class SDPALayer<__half, __half>; 42 | template class SDPALayer; 43 | 44 | template 45 | using SDPALayerPtr = std::shared_ptr>; 46 | 47 | } // namespace lightseq 48 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(LightseqProtoType LANGUAGES CXX CUDA) 4 | 5 | find_package(Threads REQUIRED) 6 | 7 | set(CMAKE_CXX_STANDARD 14) 8 | 9 | add_library( 10 | lsflow STATIC 11 | context.cpp 12 | node.cpp 13 | manager.cpp 14 | layer.cpp 15 | tensor.cpp 16 | allocator.cpp 17 | lsflow_util.cpp 18 | operator.cpp 19 | shape.cpp 20 | variable.cpp) 21 | 22 | target_link_libraries(lsflow PUBLIC lightseq_kernels) 23 | -------------------------------------------------------------------------------- 
/oslo/lightseq2/csrc/lsflow/README.md: -------------------------------------------------------------------------------- 1 | LsFlow is a extremely clean implement of computation graph. 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/allocator.cpp: -------------------------------------------------------------------------------- 1 | #include "allocator.h" 2 | 3 | namespace lightseq { 4 | 5 | Allocator::Allocator() { _ptr_set.clear(); } 6 | 7 | Allocator::~Allocator() { 8 | auto _tmp_ptr_set = _ptr_set; 9 | for (auto iter : _tmp_ptr_set) { 10 | try { 11 | free_mem(iter); 12 | } catch (...) { 13 | // printf("execute ~Allocator() free_mem %p failed!\n", iter); 14 | } 15 | } 16 | _ptr_set.clear(); 17 | } 18 | 19 | char *Allocator::malloc_mem(size_t size) { 20 | char *ptr = nullptr; 21 | 22 | try { 23 | #ifdef LIGHTSEQ_cuda 24 | ptr = cuda::cuda_malloc(size); 25 | #else 26 | ptr = (char *)malloc(size); 27 | #endif 28 | } catch (...) { 29 | std::string error_message = 30 | "allocate memory failed! 
size is: " + std::to_string((size / MB_SIZE)) + 31 | " MB\n"; 32 | printf("%s", error_message.c_str()); 33 | throw std::runtime_error(error_message); 34 | } 35 | if (_ptr_set.find(ptr) != _ptr_set.end()) { 36 | printf("allocate same address with twice.\n"); 37 | throw std::runtime_error("allocate same address with twice.\n"); 38 | } 39 | _ptr_set.insert(ptr); 40 | return ptr; 41 | } 42 | 43 | void Allocator::free_mem(char *ptr) { 44 | if (_ptr_set.find(ptr) == _ptr_set.end() || ptr == nullptr) { 45 | return; 46 | } 47 | _ptr_set.erase(ptr); 48 | #ifdef LIGHTSEQ_cuda 49 | cuda::cuda_free(ptr); 50 | #else 51 | free(ptr); 52 | #endif 53 | } 54 | 55 | } // namespace lightseq 56 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/includes/allocator.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | #pragma once 5 | #include "declaration.h" 6 | 7 | namespace lightseq { 8 | 9 | class Allocator { 10 | private: 11 | std::unordered_set _ptr_set; 12 | 13 | public: 14 | Allocator(); 15 | virtual ~Allocator(); 16 | char *malloc_mem(size_t size); 17 | void free_mem(char *ptr); 18 | }; 19 | 20 | } // namespace lightseq 21 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/includes/lsflow_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | 5 | #pragma once 6 | #include "declaration.h" 7 | 8 | namespace lightseq { 9 | 10 | /* Print run time, for debug */ 11 | void print_time_duration( 12 | const std::chrono::high_resolution_clock::time_point &start, 13 | std::string duration_name); 14 | 15 | #ifdef LIGHTSEQ_cuda 16 | cublasOperation_t op_from_custom(MATRIX_OP op_type); 17 | #endif 18 | 19 | } // namespace lightseq 20 | 
-------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/includes/shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "initializer_list" 4 | 5 | namespace lightseq { 6 | 7 | // This class records the shape information of the tensor and encapsulates some 8 | // methods that may be commonly used. 9 | class Shape { 10 | private: 11 | std::vector _shape_vec; 12 | size_t _element_size; 13 | bool _is_calculated; 14 | 15 | public: 16 | // Default constructor, not part of expected usage. 17 | Shape() : _shape_vec({0}), _element_size(0), _is_calculated(false) {} 18 | Shape(std::vector shape) 19 | : _shape_vec(shape), _element_size(0), _is_calculated(false) {} 20 | Shape(std::initializer_list list) 21 | : Shape(std::vector(list)) {} 22 | Shape(const Shape &lx) = default; 23 | virtual ~Shape() = default; 24 | const std::vector &view() const { return _shape_vec; } 25 | 26 | // Returns the product of each dimension of shape. 27 | size_t element_size(); 28 | 29 | // Print shape information. 
30 | void print_shape(); 31 | }; 32 | 33 | } // namespace lightseq 34 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/lsflow_util.cpp: -------------------------------------------------------------------------------- 1 | #include "lsflow_util.h" 2 | 3 | namespace lightseq { 4 | 5 | void print_time_duration( 6 | const std::chrono::high_resolution_clock::time_point &start, 7 | std::string duration_name) { 8 | #ifdef LIGHTSEQ_cuda 9 | CHECK_GPU_ERROR(cudaStreamSynchronize(0)); 10 | #endif 11 | auto finish = std::chrono::high_resolution_clock::now(); 12 | std::chrono::duration elapsed = finish - start; 13 | std::cout << duration_name 14 | << " duration time is: " << (elapsed).count() * 1000 << " ms" 15 | << std::endl; 16 | return; 17 | } 18 | 19 | #ifdef LIGHTSEQ_cuda 20 | cublasOperation_t op_from_custom(MATRIX_OP op_type) { 21 | switch (op_type) { 22 | case MATRIX_OP::Transpose: 23 | return CUBLAS_OP_T; 24 | case MATRIX_OP::NonTranspose: 25 | return CUBLAS_OP_N; 26 | default: { 27 | std::string error_message = "undefined custom MATRIX_OP\n"; 28 | printf("%s", error_message.c_str()); 29 | throw std::runtime_error("undefined custom MATRIX_OP"); 30 | } 31 | } 32 | exit(-1); 33 | } 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/operator.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | namespace lightseq { 3 | 4 | Operator::Operator(std::string name) : Node(name, NodeType::Operator) { 5 | _context_ptr->add_op(this); 6 | } 7 | 8 | void Operator::check_override_grad() { 9 | for (Node *p : this->_parents) { 10 | Variable *rp = static_cast(p); 11 | if (!rp->enable_override_grad()) { 12 | printf("can not override"); 13 | exit(-1); 14 | } 15 | } 16 | return; 17 | } 18 | 19 | void Operator::set_children(std::vector children) { 20 | if 
(!this->_children.empty()) { 21 | printf("children not empty!"); 22 | exit(-1); 23 | } 24 | for (Node *iter : children) { 25 | iter->set_parents({this}); 26 | } 27 | } 28 | } // namespace lightseq 29 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/shape.cpp: -------------------------------------------------------------------------------- 1 | #include "shape.h" 2 | 3 | namespace lightseq { 4 | 5 | size_t Shape::element_size() { 6 | if (_shape_vec.size() == 1 && _shape_vec[0] == 0) { 7 | printf("this tensor without shape\n"); 8 | return 0; 9 | } 10 | if (_is_calculated) { 11 | return _element_size; 12 | } 13 | size_t product = 1; 14 | for (int iter : _shape_vec) { 15 | // if (iter <= 0) { 16 | // throw std::runtime_error("this tensor with invalid shape"); 17 | // return 0; 18 | // } 19 | product *= iter; 20 | } 21 | _is_calculated = true; 22 | _element_size = product; 23 | return _element_size; 24 | } 25 | 26 | void Shape::print_shape() { 27 | printf("shape dim: %zu, element size: %d, each dimension: ", 28 | _shape_vec.size(), element_size()); 29 | for (int i = 0; i < _shape_vec.size(); i++) { 30 | printf("%zu", _shape_vec[i]); 31 | if (i == _shape_vec.size() - 1) { 32 | printf("\n"); 33 | } else { 34 | printf(", "); 35 | } 36 | } 37 | } 38 | 39 | } // namespace lightseq 40 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(liblightseq SHARED bert.cc bert_crf.cc transformer.cu gpt.cc 2 | model_util.cc) 3 | 4 | target_link_libraries(liblightseq PUBLIC lightseq_layers) 5 | 6 | target_link_libraries(liblightseq PUBLIC weight_lib) 7 | 8 | target_link_options(liblightseq PUBLIC $) 10 | 11 | target_include_directories(liblightseq PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 12 | 13 | set_target_properties(liblightseq PROPERTIES OUTPUT_NAME lightseq) 
14 | 15 | # add_executable(test_example test_layer.cc) target_link_libraries(test_example 16 | # PUBLIC liblightseq) 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/models/includes/model_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | 4 | namespace lightseq { 5 | 6 | GenerateMethod get_generate_method(std::string method_); 7 | 8 | } // namespace lightseq 9 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/models/model_util.cc: -------------------------------------------------------------------------------- 1 | #include "model_util.h" 2 | 3 | namespace lightseq { 4 | 5 | GenerateMethod get_generate_method(std::string method_) { 6 | if (method_ == "topk") 7 | return GenerateMethod::Topk; 8 | if (method_ == "topp") 9 | return GenerateMethod::Topp; 10 | if (method_ == "beam_search") 11 | return GenerateMethod::BeamSearch; 12 | 13 | printf("Error!\n"); 14 | return GenerateMethod::UnDefined; 15 | } 16 | 17 | } // namespace lightseq 18 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops/includes/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | 13 | class Context { 14 | public: 15 | Context() : _stream(nullptr) { 16 | CHECK_GPU_ERROR(cublasCreate(&_cublasHandle)); 17 | CHECK_GPU_ERROR(cublasLtCreate(&_cublasLtHandle)); 18 | } 19 | 20 | virtual ~Context() {} 21 | 22 | static Context &Instance() { 23 | static Context _ctx; 24 | return _ctx; 25 | } 26 | 27 | void set_stream(cudaStream_t stream) { 28 | _stream = stream; 29 | CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream)); 30 | } 31 | 32 | cudaStream_t get_stream() { 
return _stream; } 33 | 34 | cublasHandle_t get_cublashandle() { return _cublasHandle; } 35 | cublasLtHandle_t get_cublaslthandle() { return _cublasLtHandle; } 36 | 37 | private: 38 | cudaStream_t _stream; 39 | cublasHandle_t _cublasHandle; 40 | cublasLtHandle_t _cublasLtHandle; 41 | }; 42 | 43 | } // namespace cuda 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops/includes/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "kernels.h" 10 | 11 | using namespace std; 12 | namespace lightseq { 13 | namespace cuda { 14 | 15 | template class Softmax { 16 | public: 17 | struct Config { 18 | size_t nhead; 19 | bool mask_future; 20 | Config(size_t nhead, bool mask_future = false) 21 | : nhead(nhead), mask_future(mask_future) {} 22 | }; 23 | 24 | Softmax(Config config) : config_(config) {} 25 | 26 | ~Softmax() {} 27 | 28 | /* Launch attention softmax; future masking is on if either the configured
   flag or the per-call flag requests it (logical ||, not bitwise |). */ void Forward(T *vals, const T *attn_mask, int batch_size, int from_len, 29 | int to_len, cudaStream_t &stream, bool mask_future = false) { 30 | launch_attn_softmax(vals, attn_mask, batch_size, config_.nhead, from_len, 31 | to_len, config_.mask_future || mask_future, stream); 32 | } 33 | 34 | void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len, 35 | int to_len, cudaStream_t stream) { 36 | launch_attn_softmax_bw(out_grad, soft_out, 37 | batch_size * config_.nhead * from_len, to_len, 38 | stream); 39 | } 40 | 41 | private: 42 | Config config_; 43 | }; 44 | } // namespace cuda 45 | } // namespace lightseq 46 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(operator_files 2 | beam_search_topk.cu 3 | sampling.cc.cu 4 | bias_act_dropout.cpp 5 |
bias_add_transform_20314.cpp 6 | bias_dropout_residual.cpp 7 | concat3_dim1.cpp 8 | crf.cpp 9 | dropout.cpp 10 | launch_dec_emb_op.cpp 11 | launch_enc_emb.cpp 12 | launch_gpt_emb.cpp 13 | layer_normalize.cpp 14 | split_head_op.cpp 15 | linear.cpp 16 | softmax.cpp 17 | strided_batch_gemm.cpp 18 | transform_0213.cpp) 19 | 20 | add_library(lightseq_operators STATIC ${operator_files}) 21 | target_link_libraries(lightseq_operators PUBLIC lsflow) 22 | target_include_directories(lightseq_operators PUBLIC includes) 23 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/bias_act_dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template class BiasActDropoutOp : public Operator { 9 | private: 10 | float ratio; 11 | 12 | size_t _mx_cols; 13 | size_t _mx_rows; 14 | size_t _cols; 15 | size_t _rows; 16 | 17 | Variable *_result; 18 | 19 | std::string _activation_fn; 20 | 21 | TensorPtr _mask; 22 | 23 | public: 24 | float RATIO() const { return _context_ptr->is_training() ? 
ratio : 0.0; } 25 | 26 | BiasActDropoutOp(float r, size_t mx_rows, size_t mx_cols, 27 | std::string activation_fn) 28 | : Operator("BiasActDropoutOp"), ratio(r), _activation_fn(activation_fn), 29 | _mx_rows(mx_rows), _mx_cols(mx_cols) { 30 | _mask.reset(new Tensor("_mask", g_dtype(), _mx_rows * _mx_cols)); 31 | } 32 | 33 | virtual ~BiasActDropoutOp() {} 34 | 35 | Variable *operator()(Variable *inp, Variable *bias); 36 | 37 | void before_forward(size_t rows, size_t cols) { 38 | _rows = rows, _cols = cols; 39 | _result->set_shape({rows, cols}); 40 | } 41 | 42 | void forward() override; 43 | 44 | void backward() override; 45 | }; 46 | } // namespace lightseq 47 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/bias_add_transform_20314.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // add bias and transform 20314, execute after qkv_linear 9 | template class BiasAddTrans20314 : public Operator { 10 | private: 11 | size_t _max_batch_tokens; 12 | size_t _batch; 13 | size_t _seq_len; 14 | size_t _heads; 15 | size_t _hidden_size; 16 | size_t _trans_count; 17 | 18 | Variable *_res; 19 | 20 | public: 21 | BiasAddTrans20314(size_t max_batch_tokens, size_t heads, size_t hidden_size, 22 | size_t trans_count) 23 | : Operator("BiasAddTrans20314"), _max_batch_tokens(max_batch_tokens), 24 | _heads(heads), _hidden_size(hidden_size), _trans_count(trans_count) {} 25 | 26 | virtual ~BiasAddTrans20314() {} 27 | 28 | Variable *operator()(Variable *inp, Variable *bias); 29 | 30 | void before_forward(size_t batch, size_t seq_len) { 31 | _batch = batch, _seq_len = seq_len; 32 | _res->set_shape( 33 | {_trans_count, _batch, _heads, _seq_len, _hidden_size / _heads}); 34 | } 35 | 36 | void forward() override; 37 | 38 | void backward() override; 39 | }; 40 | } // 
namespace lightseq 41 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/bias_dropout_residual.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // transformer layer's postprocessing dropout, after attn or ffn module, 8 | // before residual add. 9 | template class BiasDropoutResOp : public Operator { 10 | private: 11 | float ratio; 12 | 13 | size_t _max_rows; 14 | size_t _max_cols; 15 | size_t _rows; 16 | size_t _cols; 17 | 18 | TensorPtr _mask; 19 | Variable *_result; 20 | 21 | public: 22 | float RATIO() const { return _context_ptr->is_training() ? ratio : 0.0; } 23 | 24 | BiasDropoutResOp(float r, size_t max_rows, size_t max_cols) 25 | : Operator("BiasDropoutResOp"), ratio(r), _max_rows(max_rows), 26 | _max_cols(max_cols) { 27 | _mask.reset(new Tensor("mask", g_dtype(), _max_rows * _max_cols)); 28 | } 29 | 30 | virtual ~BiasDropoutResOp() {} 31 | 32 | Variable *operator()(Variable *inp, Variable *bias, Variable *residual); 33 | 34 | void before_forward(size_t rows, size_t cols) { 35 | _rows = rows, _cols = cols; 36 | _result->set_shape({_rows, _cols}); 37 | } 38 | 39 | void forward() override; 40 | 41 | void backward() override; 42 | }; 43 | } // namespace lightseq 44 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/concat3_dim1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | template class Concat3Dim1 : public Operator { 9 | private: 10 | bool _is_skip = false; 11 | bool _is_continuous_cache; 12 | 13 | size_t _mx_sz0; 14 | size_t _mx_sz1; 15 | size_t _mx_sz2; 16 | 17 | size_t _sz0; 18 | size_t _sz1_0; 19 | size_t _sz1_1; 20 | 
size_t _layer_id; 21 | 22 | Variable *_new_cache; 23 | 24 | public: 25 | Concat3Dim1(size_t mx_sz0, size_t mx_sz1, size_t mx_sz2, size_t layer_id, 26 | bool is_continuous_cache) 27 | : Operator("Concat3Dim1"), _mx_sz0(mx_sz0), _mx_sz1(mx_sz1), 28 | _mx_sz2(mx_sz2), _layer_id(layer_id), 29 | _is_continuous_cache(is_continuous_cache) {} 30 | 31 | virtual ~Concat3Dim1() {} 32 | 33 | Variable *operator()(Variable *inp, Variable *cache); 34 | 35 | void before_forward(size_t sz0, size_t sz1_0, size_t sz1_1, 36 | bool is_skip = false) { 37 | _sz0 = sz0, _sz1_0 = sz1_0, _sz1_1 = sz1_1, _is_skip = is_skip; /* NOTE(review): both branches below set the same shape — the non-continuous-cache path looks unfinished or redundant; confirm the intended shape for _is_continuous_cache == false. */ 38 | if (_is_continuous_cache) { 39 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 40 | } else { 41 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 42 | } 43 | } 44 | 45 | void forward() override; 46 | 47 | void before_backward() {} 48 | 49 | void backward() override; 50 | }; 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/crf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // linear crf 8 | template class CRFOP : public Operator { 9 | private: 10 | size_t _num_tags; 11 | size_t _seq_len; 12 | size_t _batch_size; 13 | size_t _max_batch_tokens; 14 | size_t _max_batch_size; 15 | 16 | bool _forward_or_decode; // true for forward, false for decode 17 | bool _output_decode_score; 18 | TensorPtr _history; 19 | 20 | Variable *_best_tags; 21 | 22 | public: 23 | CRFOP(size_t max_batch_tokens, size_t max_batch_size, size_t num_tags); 24 | 25 | virtual ~CRFOP() {} 26 | 27 | Variable *operator()(Variable *start_transition, Variable *end_transition, 28 | Variable *transition, Variable *emission, Variable *mask, 29 | Variable *bias); 30 | 31 | void before_forward(size_t batch_size, size_t seq_len, bool forward_or_decode,
32 | bool output_decode_score); 33 | 34 | void forward() override; 35 | 36 | void before_backward(); 37 | 38 | void backward() override; 39 | }; 40 | 41 | } // namespace lightseq 42 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // after attention softmax 8 | template class DropoutOp : public Operator { 9 | private: 10 | float ratio; 11 | size_t _max_ele_num; 12 | size_t _count; 13 | bool _is_skip; 14 | 15 | TensorPtr _mask; 16 | Variable *_result = nullptr; 17 | 18 | public: 19 | float RATIO() const { return _context_ptr->is_training() ? ratio : 0.0; } 20 | 21 | DropoutOp(float r, size_t max_ele_num) 22 | : Operator("Dropout"), ratio(r), _max_ele_num(max_ele_num) { 23 | _mask.reset(new Tensor("mask", g_dtype(), max_ele_num)); 24 | } 25 | 26 | virtual ~DropoutOp() {} 27 | 28 | Variable *operator()(Variable *inp); 29 | 30 | void before_forward(size_t count) { 31 | _count = count; 32 | if (_result) 33 | _result->set_shape({count}); 34 | } 35 | 36 | void forward() override; 37 | 38 | void before_backward(int count) { _count = count; } 39 | 40 | void backward() override; 41 | }; 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/launch_enc_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 
8 | template class LaunchEncEmbOp : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | int _pad_id; 12 | size_t _hidden_dim; 13 | size_t _multilg_type; 14 | 15 | size_t _batch_size; 16 | size_t _seq_len; 17 | 18 | Variable *_result; 19 | Variable *_pad_mask; 20 | 21 | public: 22 | LaunchEncEmbOp(size_t max_batch_tokens, int pad_id, size_t hidden_dim, 23 | size_t multilg_type) 24 | : Operator("LaunchEncEmbOp"), _max_batch_tokens(max_batch_tokens), 25 | _pad_id(pad_id), _hidden_dim(hidden_dim), _multilg_type(multilg_type) {} 26 | 27 | virtual ~LaunchEncEmbOp() {} 28 | 29 | std::tuple 30 | operator()(Variable *inp_tokens, Variable *token_emb, Variable *pos_emb, 31 | Variable *lang_emb, Variable *lang_id); 32 | 33 | void before_forward(size_t batch_size, size_t seq_len) { 34 | _batch_size = batch_size, _seq_len = seq_len; 35 | } 36 | 37 | void forward() override; 38 | 39 | void backward() override { 40 | printf("ERROR! LaunchEncEmbOp can't cal backward()\n"); 41 | exit(-1); 42 | } 43 | }; 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/layer_normalize.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template class LayerNormalizeOp : public Operator { 8 | private: 9 | size_t _max_batch_tokens; 10 | size_t _hidden_dim; 11 | size_t _batch_tokens; 12 | 13 | bool _use_mean; 14 | 15 | TensorPtr means_; 16 | TensorPtr vars_; 17 | 18 | Variable *_result; 19 | 20 | public: 21 | LayerNormalizeOp(size_t max_batch_tokens, size_t hidden_dim, 22 | bool use_mean = false) 23 | : Operator("LayerNormalizeOp"), _max_batch_tokens(max_batch_tokens), 24 | _hidden_dim(hidden_dim), _use_mean(use_mean) { 25 | vars_.reset(new Tensor("vars", g_dtype(), max_batch_tokens)); 26 | if (use_mean) 27 | means_.reset(new Tensor("means", g_dtype(), 
max_batch_tokens)); 28 | } 29 | 30 | Variable *operator()(Variable *inp, Variable *gamma, Variable *betta); 31 | 32 | virtual ~LayerNormalizeOp(); 33 | 34 | void before_forward(size_t batch_size, size_t seq_len); 35 | 36 | void forward() override; 37 | 38 | void backward() override; 39 | }; 40 | 41 | } // namespace lightseq 42 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/linear.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template class LinearOp : public Operator { 8 | private: 9 | size_t _output_size; 10 | size_t _input_size; 11 | size_t _max_batch_tokens; 12 | size_t _batch_tokens; 13 | std::array _gemm_algos; 14 | 15 | float _alpha; 16 | MATRIX_OP _opA; 17 | MATRIX_OP _opB; 18 | 19 | Variable *_result; 20 | 21 | #ifdef PYBIND_INTERFACE 22 | #define weight_op MATRIX_OP::Transpose 23 | #else 24 | #define weight_op MATRIX_OP::NonTranspose 25 | #endif 26 | 27 | public: 28 | LinearOp(size_t max_batch_tokens, size_t output_size, size_t input_size, 29 | MATRIX_OP opA = weight_op, MATRIX_OP opB = MATRIX_OP::NonTranspose, 30 | float alpha = float(1.)) 31 | : Operator("LinearOp"), _max_batch_tokens(max_batch_tokens), 32 | _output_size(output_size), _input_size(input_size), _opA(opA), 33 | _opB(opB), _gemm_algos(std::array({99, 99, 99})), 34 | _alpha(alpha) {} 35 | 36 | ~LinearOp() {} 37 | 38 | Variable *operator()(Variable *inp, Variable *weight); 39 | 40 | void forward() override; 41 | 42 | void before_forward(size_t batch_tokens) { 43 | _batch_tokens = batch_tokens; 44 | _result->set_shape({batch_tokens, _output_size}); 45 | } 46 | 47 | void backward() override; 48 | 49 | void before_backward() {} 50 | }; 51 | 52 | } // namespace lightseq 53 | -------------------------------------------------------------------------------- 
// Attention softmax over [batch, nhead, from_len, to_len] logits, with
// optional causal (future-token) masking.
template <typename T1, typename T2>
class SoftmaxOp : public Operator {
private:
  size_t _nhead;
  size_t _max_batch_tokens;  // capacity bounds for buffer sizing
  size_t _max_seq_len;
  // Per-step geometry, set in before_forward().
  size_t _batchs;
  size_t _from_len;
  size_t _to_len;
  int _kv_size;  // key/value length; defaults to _to_len when caller passes -1

  // NOTE(review): _config_mask_future is stored at construction but is not
  // consulted anywhere in the code visible here, and before_forward()
  // overwrites _mask_future unconditionally from its own parameter —
  // confirm the intended precedence in the .cpp implementation.
  bool _config_mask_future;
  bool _mask_future;

  Variable *_result;

public:
  SoftmaxOp(size_t max_batch_tokens, size_t max_seq_len, size_t nhead,
            bool mask_future = false)
      : Operator("SoftmaxOp"), _max_batch_tokens(max_batch_tokens),
        _max_seq_len(max_seq_len), _nhead(nhead),
        _config_mask_future(mask_future) {}

  virtual ~SoftmaxOp() {}

  // mask may be nullptr when no padding mask is applied.
  Variable *operator()(Variable *inp, Variable *mask = nullptr);

  void forward() override;

  // Records this step's shapes; kv_size == -1 means "same as to_len".
  void before_forward(size_t batchs, size_t from_len, size_t to_len,
                      int kv_size = -1, bool mask_future = false) {
    _batchs = batchs;
    _from_len = from_len;
    _to_len = to_len;
    _kv_size = (kv_size == -1 ? to_len : kv_size);
    _mask_future = mask_future;
    _result->set_shape({_batchs, _nhead, _from_len, _to_len});
  }

  void backward() override;
};
= (T1 *)parent(0)->value(); 16 | T1 *res_ptr = (T1 *)child(0)->value(); 17 | 18 | if (!_context_ptr->is_built()) { 19 | return; 20 | } 21 | #ifdef LIGHTSEQ_cuda 22 | cudaStream_t _stream = _context_ptr->get_stream(); 23 | cuda::launch_transform_0213(inp_ptr, res_ptr, _sz0, _sz1, _sz2, _sz3, 24 | _stream); 25 | #endif 26 | } 27 | 28 | template void Transform0213OP::backward() { 29 | T2 *inp_grad = (T1 *)parent(0)->grad(); 30 | T2 *out_grad = (T1 *)child(0)->grad(); 31 | 32 | if (!_context_ptr->is_built()) { 33 | return; 34 | } 35 | 36 | #ifdef LIGHTSEQ_cuda 37 | cudaStream_t _stream = _context_ptr->get_stream(); 38 | cuda::launch_transform_0213(out_grad, inp_grad, _sz0, _sz1, _sz2, _sz3, 39 | _stream); 40 | #endif 41 | } 42 | 43 | template class Transform0213OP; 44 | #ifdef LIGHTSEQ_cuda 45 | template class Transform0213OP<__half, __half>; 46 | #endif 47 | } // namespace lightseq 48 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | include_directories(${HDF5_INCLUDE_DIRS}) 6 | 7 | find_package(Protobuf REQUIRED) 8 | include_directories(${Protobuf_INCLUDE_DIRS}) 9 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 10 | 11 | set(PROTO_FILES bert.proto bert_crf.proto transformer.proto gpt.proto) 12 | 13 | set(WEIGHT_FILES bert_weight.cc bert_crf_weight.cc transformer_weight.cc 14 | gpt_weight.cc) 15 | 16 | protobuf_generate_cpp(PROTO_SRC PROTO_HEADER ${PROTO_FILES}) 17 | add_library(weight_lib STATIC ${WEIGHT_FILES} ${PROTO_SRC} ${PROTO_HEADER} 18 | proto_util.cc) 19 | target_link_libraries(weight_lib PRIVATE ${HDF5_LIBRARIES}) 20 | target_link_libraries(weight_lib PUBLIC ${Protobuf_LIBRARIES}) 21 | target_link_libraries(weight_lib PUBLIC lightseq_kernels) 22 | 23 | 
namespace lightseq {
// Test fixture holding a randomly filled embedding table, exposed through
// the same pointer-accessor shape as the real weight classes.
template <typename T>
class TestModelWeight {
private:
  const T *_p_d_weight_emb;       // view over _d_weight_emb's storage
  std::vector<T> _d_weight_emb;   // owning buffer

public:
  // Fills the embedding with `weight_size` pseudo-random values in [0, 100).
  TestModelWeight(int weight_size) {
    _d_weight_emb.clear();
    for (int i = 0; i < weight_size; i++) {
      _d_weight_emb.push_back(rand() % 100);
    }
    // Fix: _p_d_weight_emb was never initialized, so the accessor returned
    // an indeterminate pointer. Point it at the vector's storage (the vector
    // is not resized after construction, so the pointer stays valid).
    _p_d_weight_emb = _d_weight_emb.data();
  }

  // Fix: returning `const T *&` from a const member function is ill-formed
  // (the member pointer is itself const in a const method); the reference
  // must be to a const pointer.
  const T *const &weight_emb() const { return _p_d_weight_emb; }
};
}  // namespace lightseq
-------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | csrc_dir = os.path.dirname(cur_dir) 5 | lightseq_dir = os.path.dirname(csrc_dir) 6 | sys.path.insert(0, lightseq_dir) 7 | 8 | from .builder.cuda_kernel_builder import CudaKernelBuilder 9 | from .builder.x86_kernel_builder import X86KernelBuilder 10 | from .builder.cuda_layer_builder import CudaLayerBuilder 11 | 12 | from .torch_transformer_layers import TransformerEncoderLayer, TransformerDecoderLayer 13 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import CUDAOpBuilder 2 | from .cuda_kernel_builder import CudaKernelBuilder 3 | from .x86_kernel_builder import X86KernelBuilder 4 | from .cuda_layer_builder import CudaLayerBuilder 5 | 6 | # TODO: infer this list instead of hard coded 7 | # List of all available ops 8 | __op_builders__ = [ 9 | CudaKernelBuilder(), 10 | CudaLayerBuilder(), 11 | X86KernelBuilder(), 12 | ] 13 | 14 | ALL_OPS = {op.name: op for op in __op_builders__} 15 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``csrc.pytorch.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 
21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | from .modules.tensor_quantizer import * 20 | from .modules.quant_conv import * 21 | from .modules.quant_linear import * 22 | from .modules.quant_pooling import * 23 | from .modules.clip import * 24 | from .modules.quant_rnn import * 25 | from .modules.quant_bert import * 26 | from .modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
def reset_logger_handler():
    """Remove all handlers from the root logger.

    Fix: the original called ``logger.getLogger()`` — no ``logger`` name
    exists in this module, so every call raised ``NameError``. The
    ``logging`` module was intended.
    """
    root_logger = logging.getLogger()
    # Pop from the front until none remain; the handlers list shrinks as we
    # remove, so indexing [0] each iteration is correct.
    while root_logger.handlers:
        root_logger.removeHandler(root_logger.handlers[0])
class SdpaLayerFunc(torch.autograd.Function):
    """Autograd entry point for the fused CUDA transformer-encoder forward.

    Forward-only: no ``backward`` is defined, so this function is usable
    for inference (or under ``torch.no_grad``) only.
    """

    @staticmethod
    def forward(
        ctx,
        input,
        input_mask,
        config,
    ):
        # Pick the fp16 or fp32 fused kernel from the compiled CUDA module.
        cuda_module = cuda_layer_module
        forward_func = (
            cuda_module.transformer_encoder_layer_fw_fp16
            if config.fp16
            else cuda_module.transformer_encoder_layer_fw_fp32
        )
        if config.fp16:
            # The fp16 kernel expects half-precision tensors; cast before
            # dispatch.
            input = input.to(torch.half)
            input_mask = input_mask.to(torch.half)

        # The kernel returns a one-element tuple; unpack the output tensor.
        (output,) = forward_func(config.layer_id, input, input_mask)

        return output
2 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/lightseq2/csrc/tests/__init__.py -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/tests/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | par_dir = os.path.dirname(cur_dir) 5 | csrc_dir = os.path.dirname(par_dir) 6 | lightseq_dir = os.path.dirname(csrc_dir) 7 | 8 | sys.path.insert(0, lightseq_dir) 9 | sys.path.insert(0, os.path.dirname(lightseq_dir)) 10 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/triton_backend/src/triton_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "triton/backend/backend_common.h" 4 | #include "triton/core/tritonserver.h" 5 | 6 | TRITONSERVER_DataType 7 | transform_triton_datatype_to_lightseq(::lightseq::cuda::DataType data_type_) { 8 | switch (data_type_) { 9 | case ::lightseq::cuda::DataType::kNotSupported: 10 | return TRITONSERVER_TYPE_INVALID; 11 | case ::lightseq::cuda::DataType::kFloat32: 12 | return TRITONSERVER_TYPE_FP32; 13 | case ::lightseq::cuda::DataType::kInt32: 14 | return TRITONSERVER_TYPE_INT32; 15 | case ::lightseq::cuda::DataType::kInt64: 16 | return TRITONSERVER_TYPE_INT64; 17 | case ::lightseq::cuda::DataType::kFloat16: 18 | return TRITONSERVER_TYPE_FP16; 19 | case ::lightseq::cuda::DataType::kInt8: 20 | return TRITONSERVER_TYPE_INT8; 21 | case ::lightseq::cuda::DataType::kInt16: 22 | return TRITONSERVER_TYPE_INT16; 23 | case ::lightseq::cuda::DataType::kByte: 24 | return TRITONSERVER_TYPE_BYTES; 25 | 
case ::lightseq::cuda::DataType::kUInt8: 26 | return TRITONSERVER_TYPE_UINT8; 27 | case ::lightseq::cuda::DataType::kUInt16: 28 | return TRITONSERVER_TYPE_UINT16; 29 | case ::lightseq::cuda::DataType::kUInt32: 30 | return TRITONSERVER_TYPE_UINT32; 31 | case ::lightseq::cuda::DataType::kUInt64: 32 | return TRITONSERVER_TYPE_UINT64; 33 | case ::lightseq::cuda::DataType::kFloat64: 34 | return TRITONSERVER_TYPE_FP64; 35 | default: 36 | return TRITONSERVER_TYPE_INVALID; 37 | } 38 | return TRITONSERVER_TYPE_INVALID; 39 | } 40 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(cuda_kernel_files 4 | gptKernels.cc.cu 5 | gptKernels_int8.cc.cu 6 | transformerKernels.cc.cu 7 | multilgKernels.cc.cu 8 | embKernels.cc.cu 9 | embKernels_int8.cc.cu 10 | transformerKernels_int8.cc.cu 11 | moeKernels.cc.cu 12 | t5Kernels.cc.cu 13 | t5EmbKernels.cc.cu) 14 | 15 | add_library(cuda_kernels STATIC ${cuda_kernel_files}) 16 | target_include_directories(cuda_kernels INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/embKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | void launch_split_multilg_request(const int *req, int *src_lang_id, 9 | int *trg_lang_id, int *src_token_id, 10 | int batch_size, int req_len, 11 | cudaStream_t &stream); 12 | 13 | template 14 | void launch_enc_emb(const T *token_emb, const T *pos_emb, const int *tokens, 15 | T *output, int *pad_mask, int pad_id, int batch_size, 16 | int seq_len, int hidden_dim, cudaStream_t stream, 17 | const T *lang_emb, const int *lang_id, int multilg_type); 18 | 19 | template 20 | 
void launch_dec_emb(const T *token_emb, const T *pos_emb, int *tokens, 21 | const T *lang_emb, const int *lang_id, T *output, 22 | int batch_size, int beam_size, int hidden_dim, 23 | int vocab_size, int step, int max_step, int multilg_type, 24 | cudaStream_t stream); 25 | 26 | template 27 | void launch_patch_emb(const T *conv_weight, const T *conv_bias, 28 | const T *pos_emb, const T *cls_emb, const float *input, 29 | T *output, int patch_size, int image_size, int batch_size, 30 | int max_step, int hidden_dim, int channel_input, 31 | cudaStream_t stream); 32 | 33 | } // namespace cuda 34 | } // namespace lightseq 35 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/embKernels_int8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb, 10 | const int *tokens, T *output, int *pad_mask, int pad_id, 11 | int batch_size, int seq_len, int hidden_dim, 12 | cudaStream_t stream, const T *lang_emb, 13 | const int *lang_id, int multilg_type, 14 | float dequant_scale, bool scaled = true); 15 | 16 | template 17 | void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens, 18 | const T *lang_emb, const int *lang_id, T *output, 19 | int batch_size, int beam_size, int hidden_dim, 20 | int vocab_size, int step, int max_step, 21 | int multilg_type, cudaStream_t stream, 22 | float dequant_scale, bool scaled = true); 23 | 24 | } // namespace cuda 25 | } // namespace lightseq 26 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/t5EmbKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | 
template 9 | void t5_launch_enc_emb(const T *token_emb, const int *tokens, T *output, 10 | int *pad_mask, int pad_id, int batch_size, int seq_len, 11 | int hidden_dim, cudaStream_t stream, const T *lang_emb, 12 | const int *lang_id); 13 | 14 | template 15 | void t5_launch_dec_emb(const T *token_emb, int *tokens, const T *lang_emb, 16 | const int *lang_id, T *output, int batch_size, 17 | int beam_size, int hidden_dim, int vocab_size, int step, 18 | int max_step, int multilg_type, cudaStream_t stream); 19 | 20 | } // namespace cuda 21 | } // namespace lightseq 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/t5Kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace lightseq { 8 | namespace cuda { 9 | 10 | const float t5_epsilon = 1e-6; 11 | template 12 | void t5_ker_norm_layer_launcher(int token_num, int hidden_size, 13 | cudaStream_t stream, T *matrix, T *out, 14 | const T *scale, const T *bias, 15 | int max_thread_per_block); 16 | 17 | template 18 | void t5_ker_correlation_softmax_encself_launcher( 19 | int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, 20 | T *correlation, const int *src_padding_mask, const T *pos_emb); 21 | 22 | template 23 | void t5_ker_correlation_softmax_decself_launcher( 24 | int batch_head_num, int step_num, cudaStream_t stream, T *correlation, 25 | const T *pos_emb, int head_num); 26 | 27 | template 28 | void ker_gelu_first_elementmul_launcher(int batch_token_num, int block_dim, 29 | cudaStream_t stream, T *input, 30 | const T *input2, int feature_dim); 31 | } // namespace cuda 32 | } // namespace lightseq 33 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/pywrapper/bert.h: -------------------------------------------------------------------------------- 1 | 2 | #include 
// Inference wrapper exposing the BERT encoder through the LSModel
// interface; owns the device buffers, CUDA stream, and cuBLAS handle the
// encoder runs on. bert_optype (FP16/FP32) is fixed at compile time via
// FP16_MODE.
class Bert : public LSModel {
private:
  typedef OperationTypeTraits<bert_optype> optraits;
  std::shared_ptr<BertEncoder<bert_optype>> encoder_;

  optraits::DataType *d_encoder_output_;  // device-side encoder output
  int *d_input_;                          // device-side token ids
  int *d_padding_mask_;                   // device-side padding mask
  int _max_batch_size;
  cudaStream_t stream_;
  cublasHandle_t hd_;
  void *d_buf_;                           // scratch workspace buffer
  BertWeight<bert_optype> tw_;            // weights loaded from weight_path

public:
  // Loads weights from `weight_path` and sizes buffers for
  // `max_batch_size`.
  Bert(const std::string weight_path, const int max_batch_size);

  ~Bert();

  // LSModel interface: run inference and exchange I/O buffers by index.
  void Infer() override;
  void set_input_ptr(int index, void *input_ptr) override;
  void set_output_ptr(int index, void *output_ptr) override;
  const void *get_output_ptr(int index) override;
  std::vector<int> get_input_max_shape(int index) override;
  std::vector<int> get_output_max_shape(int index) override;
  DataType get_input_dtype(int index) override;
  DataType get_output_dtype(int index) override;
  // Benchmark toggle is a no-op for this model.
  void benchmark_mode(bool is_benchmark) override{};
};
lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Vit : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | float *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | void *d_buf_; 29 | VitWeight tw_; 30 | 31 | public: 32 | Vit(const std::string weight_path, const int max_batch_size); 33 | 34 | ~Vit(); 35 | 36 | void Infer() override; 37 | void set_input_ptr(int index, void *input_ptr) override; 38 | void set_output_ptr(int index, void *output_ptr) override; 39 | const void *get_output_ptr(int index) override; 40 | std::vector get_input_max_shape(int index) override; 41 | std::vector get_output_max_shape(int index) override; 42 | DataType get_input_dtype(int index) override; 43 | DataType get_output_dtype(int index) override; 44 | void benchmark_mode(bool is_benchmark) override{}; 45 | }; 46 | 47 | LSMODEL_REGISTER(Vit); 48 | 49 | } // namespace cuda 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/server/libserver.ldscript: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | CustomErrorString; 4 | CustomExecute; 5 | CustomFinalize; 6 | CustomInitialize; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | 6 | add_library(utils STATIC util.cc.cu) 7 | target_include_directories(utils PUBLIC ${HDF5_INCLUDE_DIRS}) 8 | target_include_directories(utils INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 9 | 
// Maps a lightseq DataType enum value to the corresponding Triton server
// dtype, returning TRITONSERVER_TYPE_INVALID for unsupported values.
// NOTE(review): the name reads "triton ... to lightseq" but the conversion
// direction is lightseq -> Triton; a rename would break callers, so it is
// kept — flagging for a coordinated fix.
TRITONSERVER_DataType
transform_triton_datatype_to_lightseq(::lightseq::cuda::DataType data_type_) {
  switch (data_type_) {
  case ::lightseq::cuda::DataType::kNotSupported:
    return TRITONSERVER_TYPE_INVALID;
  case ::lightseq::cuda::DataType::kFloat32:
    return TRITONSERVER_TYPE_FP32;
  case ::lightseq::cuda::DataType::kInt32:
    return TRITONSERVER_TYPE_INT32;
  case ::lightseq::cuda::DataType::kInt64:
    return TRITONSERVER_TYPE_INT64;
  case ::lightseq::cuda::DataType::kFloat16:
    return TRITONSERVER_TYPE_FP16;
  case ::lightseq::cuda::DataType::kInt8:
    return TRITONSERVER_TYPE_INT8;
  case ::lightseq::cuda::DataType::kInt16:
    return TRITONSERVER_TYPE_INT16;
  case ::lightseq::cuda::DataType::kByte:
    return TRITONSERVER_TYPE_BYTES;
  case ::lightseq::cuda::DataType::kUInt8:
    return TRITONSERVER_TYPE_UINT8;
  case ::lightseq::cuda::DataType::kUInt16:
    return TRITONSERVER_TYPE_UINT16;
  case ::lightseq::cuda::DataType::kUInt32:
    return TRITONSERVER_TYPE_UINT32;
  case ::lightseq::cuda::DataType::kUInt64:
    return TRITONSERVER_TYPE_UINT64;
  case ::lightseq::cuda::DataType::kFloat64:
    return TRITONSERVER_TYPE_FP64;
  default:
    return TRITONSERVER_TYPE_INVALID;
  }
  // Unreachable: every switch path returns. Kept to satisfy compilers that
  // warn about a missing return at the end of a non-void function.
  return TRITONSERVER_TYPE_INVALID;
}
oslo.lightseq2.training.ops.pytorch.transformer_embedding_layer import ( 2 | LSTransformerEmbeddingLayer, 3 | ) 4 | from oslo.lightseq2.training.ops.pytorch.transformer_encoder_layer import ( 5 | LSTransformerEncoderLayer, 6 | ) 7 | from oslo.lightseq2.training.ops.pytorch.transformer_decoder_layer import ( 8 | LSTransformerDecoderLayer, 9 | ) 10 | from oslo.lightseq2.training.ops.pytorch.gpt_layer import ( 11 | LSGptEncoderLayer, 12 | ls_hf_gpt_enc_convert, 13 | ) 14 | from oslo.lightseq2.training.ops.pytorch.transformer import ( 15 | LSTransformer, 16 | LSTransformerEncoder, 17 | LSTransformerDecoder, 18 | ) 19 | 20 | from oslo.lightseq2.training.ops.pytorch.cross_entropy_layer import LSCrossEntropyLayer 21 | from oslo.lightseq2.training.ops.pytorch.adam import LSAdam 22 | from oslo.lightseq2.training.ops.pytorch.export import ( 23 | export_ls_config, 24 | export_ls_embedding, 25 | export_ls_encoder, 26 | export_ls_decoder, 27 | export_pb2hdf5, 28 | ) 29 | 30 | from oslo.lightseq2.training.ops.pytorch.export_quant import ( 31 | export_ls_embedding_ptq, 32 | export_ls_encoder_ptq, 33 | export_ls_decoder_ptq, 34 | export_ls_quant_embedding, 35 | export_ls_quant_encoder, 36 | export_ls_quant_decoder, 37 | export_quant_pb2hdf5, 38 | ) 39 | 40 | from oslo.lightseq2.training.ops.pytorch.gemm_test import gemm_test 41 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/lightseq2/training/cli/__init__.py -------------------------------------------------------------------------------- /oslo/lightseq2/training/cli/fs_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .ls_adam import LSFSAdam 2 | from .ls_label_smoothed_cross_entropy import 
def ls_cli_main(*args, **kwargs):
    """Run the DeepSpeed launcher with lightseq's fairseq modules registered.

    Appends ``--user-dir <this package>/fs_modules`` to ``sys.argv`` so that
    fairseq picks up the lightseq criterions/models, then delegates every
    argument to ``deepspeed.launcher.runner.main``.
    """
    fs_modules_dir = pathlib.Path(__file__).parent / "fs_modules"
    sys.argv += ["--user-dir", str(fs_modules_dir)]
    main(*args, **kwargs)
def ls_cli_main(*args, **kwargs):
    """Run fairseq's ``validate`` CLI with lightseq's extension modules.

    Injects ``--user-dir <this package>/fs_modules`` into ``sys.argv`` before
    handing control to ``fairseq_cli.validate.cli_main``.
    """
    user_dir = str(pathlib.Path(__file__).parent.joinpath("fs_modules"))
    sys.argv.extend(["--user-dir", user_dir])
    cli_main(*args, **kwargs)
class AdamBuilder(CUDAOpBuilder):
    """Op builder for the fused Adam optimizer CUDA extension."""

    NAME = "adam"

    def __init__(self, name=None):
        # Fall back to the class-level op name when no override is given.
        super().__init__(name=self.NAME if name is None else name)

    def absolute_name(self):
        """Fully qualified module name of the compiled extension."""
        return f"op_builder.{self.NAME}_op"

    def sources(self):
        """CUDA kernel and pybind sources compiled into the op."""
        return [
            "csrc/kernels/fused_adam_kernel.cu",
            "csrc/pybind/pybind_adam.cpp",
        ]

    def include_paths(self):
        """Header search paths for the sources above."""
        return ["csrc/kernels/includes", "csrc/ops/includes", "csrc/layers/includes"]

    def nvcc_args(self):
        """Device-compiler flags plus compute-capability flags from the base class."""
        return [
            "-O3",
            "--use_fast_math",
            "-std=c++14",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
        ] + self.compute_capability_args()

    def cxx_args(self):
        """Host-compiler flags."""
        return ["-O3", "-std=c++14", "-g", "-Wno-reorder"]
-------------------------------------------------------------------------------- 1 | ## Please refer to [NeurST](https://github.com/bytedance/neurst/tree/lightseq) for more information. 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/ops/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/lightseq2/training/ops/tensorflow/__init__.py -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 
3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``oslo.lightseq2.training.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.tensor_quantizer import * 20 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_conv import * 21 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_linear import * 22 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_pooling import * 23 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.clip import * 24 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_rnn import * 25 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_bert import * 26 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 
3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
def reset_logger_handler():
    """Remove all handlers currently attached to the root logger."""
    # Fix: the original called `logger.getLogger()`, but this module only
    # imports `logging`, so every call raised NameError.
    root_logger = logging.getLogger()
    while root_logger.handlers:
        root_logger.removeHandler(root_logger.handlers[0])
16 | # 17 | __version__ = "2.1.2" 18 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedAdagradBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_adagrad_cuda( 4 | int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, const float lr, 6 | const float epsilon, const int mode, const float weight_decay); 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("multi_tensor_adagrad", &multi_tensor_adagrad_cuda, 10 | "Compute and apply gradient update to parameters for Adam optimizer"); 11 | } 12 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedAdamBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_adam_cuda(int chunk_size, at::Tensor noop_flag, 4 | std::vector> tensor_lists, 5 | const float lr, const float beta1, 6 | const float beta2, const float epsilon, 7 | const int step, const int mode, 8 | const int bias_correction, 9 | const float weight_decay); 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 12 | m.def("multi_tensor_adam", &multi_tensor_adam_cuda, 13 | "Compute and apply gradient update to parameters for Adam optimizer"); 14 | } 15 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedL2NormBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::tuple 4 | multi_tensor_l2norm_cuda(int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, 6 | at::optional per_tensor_python); 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda, 10 | "Computes L2 norm for a list of contiguous tensors"); 11 | } 12 | -------------------------------------------------------------------------------- 
/oslo/torch/_C/csrc/FusedLambBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag, 4 | std::vector> tensor_lists, 5 | const float lr, const float beta1, 6 | const float beta2, const float epsilon, 7 | const int step, const int bias_correction, 8 | const float weight_decay, const int grad_averaging, 9 | const int mode, at::Tensor global_grad_norm, 10 | const float max_grad_norm, 11 | at::optional use_nvlamb_python); 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("multi_tensor_lamb", &multi_tensor_lamb_cuda, 15 | "Computes and apply update for LAMB optimizer"); 16 | } 17 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedMixedPrecisionL2NormBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::tuple 4 | multi_tensor_l2norm_mp_cuda(int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, 6 | at::optional per_tensor_python); 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("multi_tensor_l2norm_mp", &multi_tensor_l2norm_mp_cuda, 10 | "Computes L2 norm for a list of contiguous tensors"); 11 | } 12 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedMixedPrecisionLambBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_mp_cuda( 4 | int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, at::Tensor lr, 6 | const float beta1, const float beta2, const float epsilon, at::Tensor step, 7 | const int bias_correction, const float weight_decay, 8 | const int grad_averaging, const int mode, at::Tensor global_grad_norm, 9 | at::Tensor max_grad_norm, at::optional use_nvlamb_python, 10 | at::Tensor found_inf, at::Tensor inv_scale); 11 | 12 | 
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 13 | m.def("multi_tensor_lamb_mp", &multi_tensor_lamb_mp_cuda, 14 | "Computes and apply update for LAMB optimizer"); 15 | } 16 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedNovogradBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_novograd_cuda( 4 | int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, at::Tensor grad_norms, 6 | const float lr, const float beta1, const float beta2, const float epsilon, 7 | const int step, const int bias_correction, const float weight_decay, 8 | const int grad_averaging, const int mode, const int norm_type); 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("multi_tensor_novograd", &multi_tensor_novograd_cuda, 12 | "Compute and apply gradient update to parameters for Adam optimizer"); 13 | } 14 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedSGDBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_sgd_cuda(int chunk_size, at::Tensor noop_flag, 4 | std::vector> tensor_lists, 5 | float wd, float momentum, float dampening, float lr, 6 | bool nesterov, bool first_run, 7 | bool wd_after_momentum, float scale); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("multi_tensor_sgd", &multi_tensor_sgd_cuda, 11 | "Fused SGD optimizer for list of contiguous tensors"); 12 | } 13 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/_C/csrc/__init__.py -------------------------------------------------------------------------------- 
#include "custom_cuda_layers.h"

// Cast one float parameter element per thread to __half.
__global__ void param_update_kernel(const float *input, __half *output,
                                    int size) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  if (id < size) {
    output[id] = (__half)input[id];
  }
}

void launch_param_update(const float *input, __half *output, int size,
                         cudaStream_t stream) {
  int threads = 1024;

  dim3 grid_dim((size - 1) / threads + 1);
  dim3 block_dim(threads);

  // Fix: the kernel launch configuration was lost in extraction
  // (`<<< >>>` collapsed to `<<>>`); relaunch with the grid/block computed
  // above on the caller-provided stream.
  param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}

__global__ void param_update_kernel_half(const float *input, __half *output,
                                         int size) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  __half2 *output_cast = reinterpret_cast<__half2 *>(output);
  if (id < size) {
    // NOTE(review): this reinterprets the float's raw bits as a packed
    // __half2 rather than converting the value — it assumes the input buffer
    // already stores half-precision pairs; confirm against the caller.
    float input_f = input[id];
    __half2 *input_h = reinterpret_cast<__half2 *>(&input_f);
    output_cast[id] = *input_h;
  }
}

void launch_param_update_half(const float *input, __half *output, int size,
                              cudaStream_t stream) {
  int threads = 1024;
  size /= 2;  // each thread handles one __half2 (two half values)
  dim3 grid_dim((size - 1) / threads + 1);
  dim3 block_dim(threads);

  param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output,
                                                               size);
}
from oslo.torch.distributed import ParallelContext, ParallelMode

# Fix: the original wrote `__ALL__ = [ParallelContext, ParallelMode]`.
# `__ALL__` has no meaning to Python's import machinery, and the entries must
# be name strings; `__all__` below makes `from oslo.torch import *` export
# exactly the two names this package intends to re-export.
__all__ = ["ParallelContext", "ParallelMode"]
from abc import ABC, abstractmethod


class ProcessGroupInitializer(ABC):
    """
    The abstract class for process group initialization.

    Args:
        rank (int): The rank of current process
        world_size (int): Size of whole communication world
        data_parallel_size (int): Size of data parallelization
        sequence_parallel_size (int): Size of sequence parallelization
        pipeline_parallel_size (int): Size of pipeline parallelization
        tensor_parallel_size (int): Size of tensor parallelization
        expert_parallel_size (int): Size of expert parallelization
    """

    def __init__(
        self,
        rank: int,
        world_size: int,
        data_parallel_size: int,
        sequence_parallel_size: int,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        expert_parallel_size: int,
    ):
        self.rank = rank
        self.world_size = world_size
        self.data_parallel_size = data_parallel_size
        self.sequence_parallel_size = sequence_parallel_size
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.expert_parallel_size = expert_parallel_size

    @abstractmethod
    def init_dist_group(self):
        # Subclasses build and return the process group(s) for their
        # particular parallel mode.
        raise NotImplementedError
from enum import Enum


class ParallelMode(Enum):
    """Enum class about parallelization mode.

    Each member's value is the string key used to identify the corresponding
    process group.
    """

    # global parallel groups
    GLOBAL = "global"

    # data parallel groups
    DATA = "data"

    # model parallel groups - containing tensor and pipeline parallel groups
    # this is added to facilitate amp and grad clipping in hybrid parallel
    MODEL = "model"

    # pipeline parallel groups
    PIPELINE = "pipe"

    # tensor parallel groups - containing all ranks in tensor parallel
    TENSOR = "tensor"

    # sequence parallel groups
    SEQUENCE = "sequence"
    SEQUENCE_DP = "sequence_dp"

    # 1D tensor parallel groups
    TENSOR_1D = "tensor_1d"

    # 2D tensor parallel groups (row/col submeshes of the 2D mesh)
    TENSOR_2D = "tensor_2d"
    TENSOR_2D_ROW = "tensor_2d_row"
    TENSOR_2D_COL = "tensor_2d_col"

    # 2.5D tensor parallel groups (row/col/depth and the combined xz group)
    TENSOR_2P5D = "tensor_2p5d"
    TENSOR_2P5D_ROW = "2p5d_row"
    TENSOR_2P5D_COL = "2p5d_col"
    TENSOR_2P5D_DEP = "2p5d_dep"
    TENSOR_2P5D_XZ = "2p5d_xz"

    # 3D tensor parallel groups (input/weight/output submeshes of the 3D mesh)
    TENSOR_3D = "tensor_3d"
    TENSOR_3D_INPUT = "tensor_3d_input"
    TENSOR_3D_WEIGHT = "tensor_3d_weight"
    TENSOR_3D_OUTPUT = "tensor_3d_output"

    # Expert parallel groups
    EXPERT = "expert"
def _set_jit_fusion_options():
    """Set PyTorch JIT layer fusion options.

    For torch >= 1.10 this routes fusion to nvFuser (fuser2): the profiling
    executor/mode are enabled while the CPU/GPU pointwise fusers, the
    TensorExpr fuser (fuser1), and autodiff subgraph inlining are disabled.
    For older torch versions it falls back to the legacy fuser by turning
    profiling off and re-enabling CPU/GPU fusion.

    NOTE(review): these are private ``torch._C`` hooks whose availability
    changes between torch releases (nvFuser support was removed in newer
    versions) -- confirm against the torch version the project pins.
    """
    # Parse numeric major/minor; works for versions like "1.13.1+cu117".
    TORCH_MAJOR = int(torch.__version__.split(".")[0])
    TORCH_MINOR = int(torch.__version__.split(".")[1])

    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
        # nv fuser
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(True)
        torch._C._jit_override_can_fuse_on_cpu(False)
        torch._C._jit_override_can_fuse_on_gpu(False)
        torch._C._jit_set_texpr_fuser_enabled(False)  # fuser1
        torch._C._jit_set_nvfuser_enabled(True)  # fuser2
        torch._C._debug_set_autodiff_subgraph_inlining(False)
    else:
        # legacy pytorch fuser
        torch._C._jit_set_profiling_mode(False)
        torch._C._jit_set_profiling_executor(False)
        torch._C._jit_override_can_fuse_on_cpu(True)
        torch._C._jit_override_can_fuse_on_gpu(True)
class Conv1D(nn.Module):
    """
    1D-convolutional layer as used by OpenAI GPT / GPT-2 (Radford et al.).

    Functionally a linear layer whose weight is stored transposed:
    ``weight`` has shape ``(nx, nf)`` and ``forward`` computes
    ``x @ weight + bias``.

    Args:
        nf (int): The number of output features.
        nx (int): The number of input features.
        skip_bias_add (bool): When True, the bias is not added inside
            ``forward``; ``(output, bias)`` is returned instead so callers
            can fuse the bias with other elementwise operations.

    References:
        https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
    """

    def __init__(self, nf, nx, skip_bias_add=False):
        super().__init__()
        self.nf = nf
        self.skip_bias_add = skip_bias_add

        weight = torch.empty(nx, nf)
        nn.init.normal_(weight, std=0.02)
        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(torch.zeros(nf))

    def forward(self, x):
        out_shape = x.size()[:-1] + (self.nf,)
        flat = x.view(-1, x.size(-1))
        if self.skip_bias_add:
            # Hand the bias back separately for downstream fusion.
            return torch.mm(flat, self.weight).view(out_shape), self.bias
        # addmm computes bias + flat @ weight in a single fused call.
        return torch.addmm(self.bias, flat, self.weight).view(out_shape)
def is_ddp_ignored(p):
    """Return True when parameter *p* has been flagged for DDP to skip."""
    return getattr(p, "_ddp_to_ignore", False)


def set_params_to_ignore(params_to_ignore: Iterable[torch.Tensor]) -> None:
    """Flag each given parameter so DDP skips it.

    Must be called before constructing DistributedDataParallel.

    Example:
        >>> ignored = [p for p in module.parameters() if should_ignore(p)]
        >>> set_params_to_ignore(ignored)
        >>> module = DistributedDataParallel(module)

    Args:
        params_to_ignore (Iterable[torch.Tensor]): Parameters to flag.
    """
    for param in params_to_ignore:
        # is_ddp_ignored reads this attribute back.
        param._ddp_to_ignore = True
39 | Variable._execution_engine.queue_callback(ctx.module._post_backward) 40 | return (None,) + grad_outputs 41 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/data_parallel/zero/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.data_parallel.zero.optim.optim import ( 2 | ZeroRedundancyOptimizer, 3 | ) 4 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.data_parallel import ( 5 | _HeteroDataParallel, 6 | ) 7 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.optim import ( 8 | _HeteroOptimizer, 9 | ) 10 | 11 | __ALL__ = [ 12 | "ZeroRedundancyOptimizer", 13 | "_HeteroDataParallel", 14 | "_HeteroOptimizer", 15 | ] 16 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/data_parallel/zero/hetero/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.data_parallel import ( 2 | _HeteroDataParallel, 3 | ) 4 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.optim import _HeteroOptimizer 5 | 6 | __ALL__ = ["_HeteroDataParallel", "_HeteroOptimizer"] 7 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/data_parallel/zero/hetero/chunk/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.chunk.chunk import ( 2 | Chunk, 3 | TensorState, 4 | ChunkFullError, 5 | ) 6 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.chunk.manager import ChunkManager 7 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.chunk.utils import ( 8 | init_chunk_manager, 9 | ) 10 | 11 | __ALL__ = [ 12 | "Chunk", 13 | "TensorState", 14 | "ChunkFullError", 15 | "ChunkManager", 16 | "init_chunk_manager", 17 | ] 18 | 
def get_current_device() -> torch.device:
    """Return the device new tensors should live on.

    The currently selected CUDA device when CUDA is available, otherwise
    the CPU.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu")
    return torch.device(f"cuda:{torch.cuda.current_device()}")


def get_temp_total_chunk_on_cuda(chunk: "Chunk") -> torch.Tensor:
    """All-gather *chunk*'s shards into one temporary on-device tensor.

    If the chunk is already gathered, its global tensor is returned as-is;
    otherwise a zero-initialized buffer of ``chunk.chunk_size`` elements is
    filled via ``all_gather`` over the chunk's process group.
    """
    if chunk.is_gathered:
        # Already materialized as one contiguous tensor; reuse it.
        return chunk.cuda_global_chunk

    device = get_current_device()
    if chunk.cuda_shard is not None:
        shard = chunk.cuda_shard
    else:
        # Shard was offloaded to CPU; move it on-device for the collective.
        shard = chunk.cpu_shard.to(device)

    gathered = torch.zeros(chunk.chunk_size, dtype=chunk.dtype, device=device)
    pieces = list(torch.chunk(input=gathered, chunks=chunk.pg_size, dim=0))
    dist.all_gather(tensor_list=pieces, tensor=shard, group=chunk.torch_pg)

    return gathered
from oslo.torch.nn.parallel.pipeline_parallel._sync import (
    register_location_for_forward_counter,
)


# original forward dictionary
# Maps module "location" string -> the module's pre-wrap forward callable.
_ORIGINAL_FORWARDS = dict()

# module device(local) locations
# Maps module "location" string -> the local device the module lives on.
_MODULE_DEVICE_LOCATIONS = dict()


def register_original_forward_function(location, func, device):
    """Record *func* as the original forward of the module at *location*,
    remember its *device*, and start the per-location forward counter."""
    _ORIGINAL_FORWARDS[location] = func
    _MODULE_DEVICE_LOCATIONS[location] = device
    register_location_for_forward_counter(location)


def get_original_forward_function(location):
    # Raises KeyError for locations that were never registered.
    return _ORIGINAL_FORWARDS[location]


def get_module_device_location(location):
    # Raises KeyError for locations that were never registered.
    return _MODULE_DEVICE_LOCATIONS[location]


# Activations
# Saved forward activations keyed by a per-invocation unique tag.
_ACTIVATIONS = dict()


def save_activation(key, activation):
    # Overwrites silently if *key* is already present.
    _ACTIVATIONS[key] = activation


def pop_activation(key):
    # NOTE(review): a missing key silently yields [] here; the commented-out
    # strict variant below would raise KeyError instead -- confirm which
    # behavior callers rely on.
    return _ACTIVATIONS.pop(key, [])  # TODO; okay?
    # return _ACTIVATIONS.pop(key)
import time

from oslo.torch.nn.parallel.pipeline_parallel._job import Input
from oslo.torch.nn.parallel.pipeline_parallel._types import (
    SyncNotification,
    SyncQueues,
)

import torch


# Process-wide queues of pending pipeline jobs.
QUEUES = SyncQueues()

# Process-wide notification channels for pipeline synchronization.
NOTIFICATIONS = SyncNotification()


def sleep():
    # Polling interval for the busy-wait in select_job.
    time.sleep(0.05)


def initialize_job(fn, is_grad_enabled, unique_key, out_queue, **kwargs):
    """Wrap *fn* and its call context into an ``Input`` job and enqueue it.

    Args:
        fn: Callable to execute for this pipeline stage.
        is_grad_enabled: Whether grad mode was enabled at enqueue time.
        unique_key: Tag from :func:`make_unique_key` identifying this call.
        out_queue: Queue that receives the job's result.
        **kwargs: Forwarded to the ``Input`` job.
    """
    job = Input(
        fn=fn,
        is_grad_enabled=is_grad_enabled,
        unique_key=unique_key,
        out_queue=out_queue,
        **kwargs,
    )

    register_job(job)


def register_job(job):
    # Jobs are kept in a set; duplicates collapse.
    QUEUES.JOBS.add(job)


# TODO; support TP
def select_job():
    """Busy-wait until a job is available, then pop the smallest one.

    Jobs are chosen via sorting; presumably ``Input`` defines ordering so
    this behaves like a priority queue -- TODO confirm against `_job.Input`.
    """
    while len(QUEUES.JOBS) <= 0:
        sleep()

    job = list(sorted(QUEUES.JOBS))[0]
    QUEUES.JOBS.remove(job)
    return job


# for unique tag generation
# Maps module "location" string -> number of forward calls seen so far.
_NUM_FORWARD_USED_COUNTER = dict()


def register_location_for_forward_counter(location):
    # Start counting forward invocations for this location at zero.
    _NUM_FORWARD_USED_COUNTER[location] = 0


def make_unique_key(location, from_, to_):
    """Return a ``(location, count, from_, to_)`` tag unique per forward call."""
    cnt = _NUM_FORWARD_USED_COUNTER[location]
    unique_key = (location, cnt, from_, to_)
    _NUM_FORWARD_USED_COUNTER[location] += 1
    return unique_key
in dfs(child, bfs_dict): 11 | yield c 12 | 13 | 14 | def bfs(node, bfs_dict=None): 15 | if bfs_dict is None: 16 | bfs_dict = {} 17 | if len(bfs_dict) == 0: 18 | list(dfs(node, bfs_dict)) 19 | for nodes in bfs_dict.values(): 20 | for node in nodes: 21 | yield node 22 | 23 | 24 | def post_order_traverse(node): 25 | for child in node.children: 26 | yield from post_order_traverse(child) 27 | yield node 28 | 29 | 30 | # from https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/scatter_gather.py#L12 31 | def _is_namedtuple(obj): 32 | # Check if type was created from collections.namedtuple or a typing.NamedTuple. 33 | return ( 34 | isinstance(obj, tuple) and hasattr(obj, "_asdict") and hasattr(obj, "_fields") 35 | ) 36 | 37 | 38 | def _is_primitive(obj): 39 | return not hasattr(obj, "__dict__") 40 | 41 | 42 | def _is_private(attr): 43 | return attr.startswith("__") 44 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_1d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_1d/__init__.py -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_2d/__init__.py -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_2p5d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_2p5d/__init__.py 
-------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_3d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_3d/__init__.py -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.tensor_parallel.mapping import ( 2 | Column, 3 | Row, 4 | Update, 5 | Head, 6 | Other, 7 | ) 8 | from oslo.torch.nn.parallel.tensor_parallel.tensor_parallel import TensorParallel 9 | 10 | __ALL__ = [TensorParallel, Column, Row, Update, Head, Other] 11 | -------------------------------------------------------------------------------- /oslo/torch/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.optim.fused_adagrad import FusedAdagrad 2 | from oslo.torch.optim.fused_adam import FusedAdam 3 | from oslo.torch.optim.fused_lamb import FusedLamb 4 | from oslo.torch.optim.fused_mixed_precision_lamb import FusedMixedPrecisionLamb 5 | from oslo.torch.optim.fused_novograd import FusedNovograd 6 | from oslo.torch.optim.fused_sgd import FusedSGD 7 | -------------------------------------------------------------------------------- /oslo/torch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.utils.version import torch_version 2 | from oslo.torch.utils.common import get_free_port 3 | from oslo.torch.utils.random import set_seed 4 | from oslo.torch.utils.logging import get_dist_logger 5 | 6 | __all__ = ["get_free_port", "set_seed", "get_dist_logger"] 7 | -------------------------------------------------------------------------------- 
def get_free_port() -> int:
    """Get a free port on localhost.

    Randomly probes ports in ``[20000, 65000)`` until one binds
    successfully.

    Returns:
        int: A free port on localhost.
    """
    while True:
        candidate = random.randrange(20000, 65000)
        sock = socket.socket()
        try:
            # Allow immediate reuse so the probe doesn't linger in TIME_WAIT.
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(("localhost", candidate))
            return candidate
        except OSError:
            # Port already taken; probe another one.
            continue
        finally:
            sock.close()
class MultiTensorApply(object):
    """Callable wrapper that forwards a fixed chunk size to fused multi-tensor ops.

    Instances simply prepend their configured ``chunk_size`` to every call,
    so fused kernels receive a consistent per-launch element count.
    """

    def __init__(self, chunk_size):
        # Number of elements each fused kernel launch should process.
        self.chunk_size = chunk_size

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        """Invoke *op* as ``op(chunk_size, noop_flag_buffer, tensor_lists, *args)``
        and return whatever it returns."""
        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
def torch_version(version: str = torch.__version__) -> Tuple[int, ...]:
    """Parse a PyTorch version string into a ``(major, minor, patch)`` tuple.

    Args:
        version (str): Version string to parse; defaults to the installed
            ``torch.__version__``.

    Returns:
        Tuple[int, ...]: ``(major, minor, patch)`` on success, or an empty
        tuple when *version* does not look like a release number.
    """
    # NOTE: the dots are escaped -- the previous pattern used bare ".",
    # which matches any character, so e.g. "1x8y0" parsed as (1, 8, 0).
    numbering = re.search(r"^(\d+)\.(\d+)\.(\d+)([^\+]*)(\+\S*)?$", version)
    if not numbering:
        return tuple()
    # Catch torch version if run against internal pre-releases, like `1.8.0a0fb`,
    if numbering.group(4):
        # Two options here:
        # - either skip this version (minor number check is not relevant)
        # - or check that our codebase is not broken by this ongoing development.

        # Assuming that we're interested in the second use-case more than the first,
        # return the pre-release or dev numbering
        logging.warning(
            f"Pytorch pre-release version {version} - assuming intent to test it"
        )

    return tuple(int(numbering.group(n)) for n in range(1, 4))
oslo.transformers.models.gpt2.modeling_gpt2 import ( 2 | GPT2Model, 3 | GPT2LMHeadModel, 4 | GPT2ForSequenceClassification, 5 | GPT2ForTokenClassification, 6 | ) 7 | 8 | # from oslo.transformers.trainer import Trainer 9 | -------------------------------------------------------------------------------- /oslo/transformers/models/albert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/albert/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/bart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/bart/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/bert/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/distilbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/distilbert/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/electra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/electra/__init__.py 
-------------------------------------------------------------------------------- /oslo/transformers/models/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/gpt2/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/mbart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/mbart/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/mt5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/mt5/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/roberta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/roberta/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/t5/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/tasks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/tasks/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/tasks/data_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Optional 3 | 4 | # from datasets.arrow_dataset import Batch 5 | 6 | try: 7 | from transformers import PreTrainedTokenizerBase 8 | except ImportError: 9 | print("You have to install `transformers` to use `oslo.transformers` modules") 10 | 11 | 12 | class BaseProcessor(ABC): 13 | def __init__(self, tokenizer: PreTrainedTokenizerBase, max_seq_length: int) -> None: 14 | self._tokenizer = tokenizer 15 | self._max_seq_length = max_seq_length 16 | self._chunk_size = max_seq_length - 1 17 | self._buffer = [] 18 | 19 | def save_tokenizer(self, path: str) -> None: 20 | self._tokenizer.save_pretrained(path) 21 | 22 | @abstractmethod 23 | # def __call__(self, examples: Batch) -> Dict[str, List[int]]: 24 | def __call__(self, examples) -> Dict[str, List[int]]: 25 | pass 26 | 27 | 28 | def pad_labels( 29 | labels, 30 | tokenizer, 31 | label_pad_token_id: int, 32 | pad_to_multiple_of: Optional[int] = None, 33 | ): 34 | labels = tokenizer.pad( 35 | {"input_ids": labels}, 36 | padding=True, 37 | return_attention_mask=False, 38 | return_tensors="pt", 39 | pad_to_multiple_of=pad_to_multiple_of, 40 | )["input_ids"] 41 | 42 | labels.masked_fill_(labels == tokenizer.pad_token_id, label_pad_token_id) 43 | return labels 44 | -------------------------------------------------------------------------------- /oslo/transformers/tasks/loading/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/tasks/loading/__init__.py 
import logging
import os
import numpy as np
import torch
import time

try:
    from transformers.utils import ExplicitEnum
except ImportError:
    # Keep this module importable without `transformers`, mirroring the
    # try/except import pattern used elsewhere in oslo.transformers
    # (e.g. tasks/data_base.py). Minimal re-implementation of the same class.
    from enum import Enum

    class ExplicitEnum(str, Enum):
        """Enum with an explicit error message for missing values."""

        @classmethod
        def _missing_(cls, value):
            raise ValueError(
                f"{value} is not a valid {cls.__name__}, "
                f"please select one of {list(cls._value2member_map_.keys())}"
            )


class SchedulerType(ExplicitEnum):
    """Acceptable string identifiers for learning-rate schedulers."""

    LINEAR = "linear"
    COSINE = "cosine"
    COSINE_WITH_RESTARTS = "cosine_with_restarts"
    POLYNOMIAL = "polynomial"
    CONSTANT = "constant"
    CONSTANT_WITH_WARMUP = "constant_with_warmup"


class OptimizerNames(ExplicitEnum):
    """
    Stores the acceptable string identifiers for optimizers.
    """

    ADAM = "adam"
    ADAMW = "adamw"
    ADAGRAD = "adagrad"
    ADADELTA = "adadelta"
    ADAFACTOR = "adafactor"
    ADAMW_BNB = "adamw_bnb_8bit"
    SGD = "sgd"
    NOVOGRAD = "novograd"
    LAMB = "lamb"


def log_dist(message: str, rank: int = 0, level: int = logging.INFO) -> None:
    """Log ``message`` only on the selected distributed rank(s).

    Args:
        message: text to log; prefixed with ``[Rank N]``.
        rank: rank that should emit the message, or -1 for all ranks
            (requires the ``WORLD_SIZE`` environment variable).
        level: standard ``logging`` level constant.
    """
    if rank == -1:
        ranks = list(range(int(os.environ["WORLD_SIZE"])))
    else:
        ranks = [rank]
    # Processes launched outside torch.distributed default to rank 0.
    my_rank = int(os.environ.get("RANK", "0"))
    if my_rank in ranks:
        # logging.log accepts any level; the previous if-chain silently
        # dropped messages at levels other than INFO/WARNING/ERROR/DEBUG
        # (e.g. logging.CRITICAL).
        logging.log(level, f"[Rank {my_rank}] {message}")
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:isort] 2 | multi_line_output = 3 3 | line_length = 79 4 | include_trailing_comma = True 5 | 6 | [flake8] 7 | ignore = E203, E501, E731, E741, W503, W504, W605, PAI100, PAI101, PAI201, PAI202, PAI203, B009, B010, F401, F403 8 | max-line-length = 119 9 | -------------------------------------------------------------------------------- /tests/test_script/run_inference.sh: -------------------------------------------------------------------------------- 1 | ## inference shell code 2 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 bert-base-cased masked-lm `` 3 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 ishan/bert-base-uncased-mnli sequence-classification `` 4 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 gpt2 causal-lm `` 5 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 EleutherAI/gpt-neo-1.3B causal-lm `` 6 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 t5-base seq2seq-lm `` 7 | 8 | NUM_GPUS=$1 9 | MODEL=$2 10 | TASK=$3 11 | 12 | python -m torch.distributed.launch \ 13 | --nproc_per_node="$NUM_GPUS" \ 14 | ./tests/inference.py \ 15 | --task=$TASK \ 16 | --model=$MODEL \ 17 | --tensor_parallel_size="$NUM_GPUS" 18 | -------------------------------------------------------------------------------- /tests/test_script/run_merge.sh: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # If you use only two gpu example 3 | # Checkpoint directory : tests/ckpt/checkpoint_0 4 | # saved merge directory: tests/ckpt/checkpoint_0_merge 5 | ########################################################### 6 | 7 | # EXAMPLE merge TP case BERT:`sh ./tests/test_script/run_merge.sh ishan/bert-base-uncased-mnli sequence-classification 2 1 1 2 1` 8 | 9 | # EXAMPLE merge 
TP case GPT:`sh ./tests/test_script/run_merge.sh gpt2 causal-lm 2 1 1 2 1` 10 | 11 | # EXAMPLE merge TP case T5:`sh ./tests/test_script/run_merge.sh t5-base seq2seq 2 1 1 2 1` 12 | 13 | 14 | MODEL=$1 15 | TASK=$2 16 | 17 | NUM_GPUS=$3 18 | DATA_PARALLEL_SIZE=$4 19 | PIPELINE_PARALLEL_SIZE=$5 20 | TENSOR_PARALLEL_SIZE=$6 21 | TENSOR_PARALLEL_DEPTH=$7 22 | 23 | # tensor parallel mode 24 | # "1D", "2D", "2D_ROW", "2D_COL", "2P5D", "2P5D_ROW", "2P5D_COL" 25 | # "2P5D_DEP", "2P5D_XZ", "3D", "3D_INPUT", "3D_WEIGHT", "3D_OUTPUT" 26 | TENSOR_PARALLEL_MODE=1D 27 | MERGE_DIR=tests/ckpt/checkpoint_0 28 | 29 | run_cmd="torchrun --standalone --nproc_per_node=${NUM_GPUS} \ 30 | ./tests/merge.py \ 31 | --task=$TASK \ 32 | --model=$MODEL \ 33 | --tensor_parallel_size=$TENSOR_PARALLEL_SIZE \ 34 | --data_parallel_size=$DATA_PARALLEL_SIZE \ 35 | --pipeline_parallel_size=$PIPELINE_PARALLEL_SIZE \ 36 | --tensor_parallel_mode=$TENSOR_PARALLEL_MODE \ 37 | --tensor_parallel_depth=$TENSOR_PARALLEL_DEPTH \ 38 | --merge_dir=$MERGE_DIR 39 | " 40 | 41 | echo ${run_cmd} 42 | eval ${run_cmd} 43 | -------------------------------------------------------------------------------- /tests/util/arg_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--local-rank", default=0, type=int) 7 | # parser.add_argument("--config", required=True, type=str) 8 | parser.add_argument("--task", required=True, type=str) 9 | parser.add_argument("--model", required=True, type=str) 10 | parser.add_argument("--tokenizer", default=None, type=str) 11 | parser.add_argument("--batch_size", required=False, type=int) 12 | parser.add_argument("--sequence_length", required=False, type=int) 13 | parser.add_argument("--train_step", required=False, type=int) 14 | parser.add_argument("--save_interval", required=False, type=int) 15 | parser.add_argument("--tensor_parallel_size", 
default=1, type=int) 16 | parser.add_argument("--data_parallel_size", default=1, type=int) 17 | parser.add_argument("--pipeline_parallel_size", default=1, type=int) 18 | parser.add_argument("--tensor_parallel_depth", default=1, type=int) 19 | parser.add_argument("--epoch", default=1, type=int) 20 | parser.add_argument("--tensor_parallel_mode", default="1D", type=str) 21 | parser.add_argument("--merge_dir", required=False, type=str) 22 | args = parser.parse_args() 23 | return args 24 | -------------------------------------------------------------------------------- /tests/util/oslo.py: -------------------------------------------------------------------------------- 1 | import oslo 2 | from oslo.torch.distributed.parallel_context import ParallelContext, ParallelMode 3 | from oslo.torch.nn.parallel import TensorParallel, PipelineParallel 4 | 5 | 6 | def initialize_oslo(args, model): 7 | try: 8 | pc = ParallelContext.from_torch( 9 | data_parallel_size=args.data_parallel_size, 10 | pipeline_parallel_size=args.pipeline_parallel_size, 11 | tensor_parallel_size=args.tensor_parallel_size, 12 | tensor_parallel_depth=args.tensor_parallel_depth, 13 | tensor_parallel_mode={ 14 | "1D": ParallelMode.TENSOR_1D, 15 | "2D": ParallelMode.TENSOR_2D, 16 | "2P5D": ParallelMode.TENSOR_2P5D, 17 | "3D": ParallelMode.TENSOR_3D, 18 | }[args.tensor_parallel_mode], 19 | ) 20 | 21 | if pc.get_world_size(ParallelMode.TENSOR) > 1: 22 | model = TensorParallel(model, pc) 23 | if pc.get_world_size(ParallelMode.PIPELINE) > 1: 24 | model = PipelineParallel(model, pc) 25 | oslo.ready(model, pc) 26 | 27 | except Exception as e: 28 | print(e) 29 | pc = None 30 | model = model.cuda() 31 | 32 | return model, pc 33 | 34 | 35 | def print_rank_0(message, pc): 36 | if pc is None: 37 | print(message) 38 | elif pc.get_global_rank() == 0: 39 | print(f"Rank :{pc.get_global_rank()}") 40 | print(message) 41 | -------------------------------------------------------------------------------- 
# tests_deprecated/__init__.py
# NOTE(review): deprecated smoke check kept for reference — importing this
# package constructs an empty Seq2SeqModelOutput and prints it as an
# import-time side effect, which is unusual for an __init__.py; anything
# importing `tests_deprecated` pays this cost.
from transformers.modeling_outputs import Seq2SeqModelOutput

# Instantiating with no arguments yields an output object whose fields are
# all None; printing it shows the default field layout.
a = Seq2SeqModelOutput()
print(a)
-------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/data_parallel/zero/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/heterogeneous_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/data_parallel/zero/heterogeneous_manager/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/heterogeneous_manager/test_mem_monitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.memory_tracer.memory_monitor import ( 5 | SyncCudaMemoryMonitor, 6 | ) 7 | 8 | 9 | class TestSyncCudaMemoryMonitor(unittest.TestCase): 10 | @patch("torch.cuda.synchronize") 11 | @patch("torch.cuda.reset_peak_memory_stats") 12 | @patch("torch.cuda.max_memory_allocated", return_value=1024) 13 | def test_sync_cuda_memory_monitor_methods( 14 | self, mock_max_memory_allocated, mock_reset_peak_memory_stats, mock_synchronize 15 | ): 16 | # Create a SyncCudaMemoryMonitor instance 17 | sync_cuda_mem_monitor = SyncCudaMemoryMonitor() 18 | 19 | # Test the start method 20 | sync_cuda_mem_monitor.start() 21 | mock_synchronize.assert_called_once() 22 | mock_reset_peak_memory_stats.assert_called_once() 23 | 24 | # Test the finish method 25 | max_usage = sync_cuda_mem_monitor.finish() 26 | 
self.assertIsInstance(max_usage, int) 27 | self.assertEqual(max_usage, 1024) # The mock max_memory_allocated returns 1024 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/sharded_optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/data_parallel/zero/sharded_optim/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/expert_parallel/gpt2/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | import torch 6 | import torch.backends.cudnn as cudnn 7 | 8 | import deepspeed.comm as dist 9 | 10 | 11 | def create_config_from_dict(tmpdir, config_dict): 12 | config_path = os.path.join(tmpdir, "temp_config.json") 13 | with open(config_path, "w") as fd: 14 | json.dump(config_dict, fd) 15 | return config_path 16 | 17 | 18 | def create_deepspeed_args(): 19 | parser = argparse.ArgumentParser() 20 | args = parser.parse_args(args="") 21 | args.deepspeed = True 22 | if dist.is_initialized(): 23 | # We assume up to one full node executing unit tests 24 | assert dist.get_world_size() <= torch.cuda.device_count() 25 | args.local_rank = dist.get_rank() 26 | return args 27 | 28 | 29 | def args_from_dict(tmpdir, config_dict): 30 | args = create_deepspeed_args() 31 | config_path = create_config_from_dict(tmpdir, config_dict) 32 | args.deepspeed_config = config_path 33 | return args 34 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/compare_pp_nopp.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def main(): 7 | file_names = os.listdir("tmp2/") 8 | no_pp_names = sorted([fn for fn in file_names if "no_pp" in fn]) 9 | 10 | print(len(no_pp_names)) 11 | 12 | diff_cnt = 0 13 | for no_pp_name in no_pp_names: 14 | pp_name = no_pp_name.replace("no_pp", "pp") 15 | 16 | pp_path = os.path.join("tmp2", pp_name) 17 | no_pp_path = os.path.join("tmp2", no_pp_name) 18 | 19 | pp_data = torch.load(pp_path, map_location="cpu") 20 | no_pp_data = torch.load(no_pp_path, map_location="cpu") 21 | 22 | if not torch.allclose(pp_data, no_pp_data): 23 | # print(torch.abs(pp_data - no_pp_data)) 24 | # print(pp_name) 25 | 26 | diff_cnt += 1 27 | 28 | # break 29 | 30 | print(diff_cnt) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/compare_pptp_trial.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | import torch 5 | 6 | 7 | main_dir = "tmp2" 8 | 9 | compair_dirs = ["tmp3"] 10 | 11 | 12 | def main(): 13 | file_names = os.listdir(f"{main_dir}/") 14 | 15 | diff_names = set() 16 | same_names = set() 17 | for name in file_names: 18 | left_path = os.path.join(main_dir, name) 19 | left = torch.load(left_path, map_location="cpu") 20 | 21 | for rd in compair_dirs: 22 | right_path = left_path.replace(main_dir, rd) 23 | right = torch.load(right_path, map_location="cpu") 24 
| 25 | if not torch.allclose(left, right): 26 | diff_names.add(name) 27 | else: 28 | same_names.add(name) 29 | 30 | print("Names with difference gradient: ") 31 | for dn in diff_names: 32 | print(dn) 33 | 34 | print(f"{len(diff_names)} / {len(file_names)}") 35 | 36 | print("Names with same gradient: ") 37 | for sn in same_names: 38 | print(sn) 39 | 40 | print(f"{len(same_names)} / {len(file_names)}") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/compare_send_recv.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def main(): 7 | file_names = os.listdir("tmp/") 8 | send_names = [fn for fn in file_names if "send" in fn] 9 | 10 | for send_name in send_names: 11 | recv_name = send_name.replace("send", "recv") 12 | 13 | send_path = os.path.join("tmp", send_name) 14 | recv_path = os.path.join("tmp", recv_name) 15 | 16 | send_data = torch.load(send_path, map_location="cpu") 17 | recv_data = torch.load(recv_path, map_location="cpu") 18 | 19 | assert send_data["__KEY__"] == recv_data["__KEY__"] 20 | assert send_data["__META__"] == recv_data["__META__"] 21 | 22 | assert send_data["__VALUE__"]["stub"] == recv_data["__VALUE__"]["stub"] 23 | 24 | send_data = send_data["__VALUE__"]["tensors"] 25 | recv_data = recv_data["__VALUE__"]["tensors"] 26 | 27 | for x, y in zip(send_data, recv_data): 28 | assert torch.allclose(x, y, atol=1e-16), send_name 29 | assert x.dtype == y.dtype, send_name 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/test_p2p.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | from oslo.torch.distributed import 
ParallelContext 4 | from oslo.torch.distributed.nn.functional import send, recv 5 | 6 | parallel_context = ParallelContext.from_torch(pipeline_parallel_size=2) 7 | 8 | example_data = [ 9 | True, 10 | None, 11 | 1, 12 | 2.3, 13 | "안녕", 14 | {"xx": "yy"}, 15 | {"1", "2", "3"}, 16 | (1, 2, 3), 17 | complex(1, 2), 18 | [1, 2, [1, 2, {"1": "x", "2": (1, 2, {3})}]], 19 | ] 20 | 21 | send(example_data, src_rank=0, dst_rank=1, parallel_context=parallel_context) 22 | data = recv(src_rank=0, dst_rank=1, parallel_context=parallel_context) 23 | 24 | if dist.get_rank() == 1: 25 | print(data) 26 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/test_partioning.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from transformers import T5ForConditionalGeneration 4 | 5 | from oslo.torch.distributed import ParallelContext 6 | from oslo.torch.nn.parallel.pipeline_parallel.pipeline_parallel import _PipelineParallel 7 | from oslo.torch.nn.parallel.utils import parallelize 8 | 9 | parallel_context = ParallelContext.from_torch(pipeline_parallel_size=8) 10 | model = T5ForConditionalGeneration.from_pretrained("t5-large") 11 | 12 | wrapper_pp = _PipelineParallel(model, parallel_context=parallel_context) 13 | parallelize(wrapper_pp, parallel_context) 14 | 15 | for rank in range(dist.get_world_size()): 16 | if dist.get_rank() == rank: 17 | print(f"RANK: {rank}:") 18 | num_params = 0 19 | for name, param in wrapper_pp.named_parameters(): 20 | if param.device != torch.device("cpu"): 21 | # print(f"> {name}: {param.device}") 22 | num_params += param.numel() 23 | print(f"RANK {rank} params: {num_params}") 24 | dist.barrier() 25 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/1d/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/1d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/1d/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from oslo.torch.distributed import ParallelMode 5 | 6 | 7 | def split_1d(tensor, world_size, dim, parallel_context): 8 | tensor = tensor.chunk(world_size, dim=dim)[ 9 | parallel_context.get_local_rank(ParallelMode.TENSOR_1D) 10 | ] 11 | return tensor 12 | 13 | 14 | def gather_1d(tensor, world_size, dim, parallel_context): 15 | tensor_list = [torch.zeros_like(tensor) for _ in range(world_size)] 16 | dist.all_gather( 17 | tensor_list, 18 | tensor.contiguous(), 19 | parallel_context.get_group(ParallelMode.TENSOR_1D), 20 | ) 21 | tensor = torch.cat(tensor_list, dim=dim) 22 | return tensor 23 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/1d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. 
import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/2d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2p5d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/2p5d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2p5d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. 
import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/3d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/3d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/3d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/utils/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/utils/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/utils/data/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/utils/data/test_data_collators.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from oslo.torch.distributed import ParallelContext 4 | from oslo.torch.utils.data import SequenceParallelCollator 5 | 6 | parallel_context = ParallelContext.from_torch(sequence_parallel_size=4) 7 | 8 | data = { 9 | "input_ids": torch.randn(16, 129).cuda(), 10 | "attention_mask": torch.ones(16, 129).cuda(), 11 | } 12 | 13 | collator = SequenceParallelCollator( 14 | parallel_context=parallel_context, 15 | parallel_keys=["input_ids", "attention_mask"], 16 | pad_token_id=99, 17 | ) 18 | 19 | sharded = collator(**data) 20 | print(sharded["input_ids"].size()) 21 | -------------------------------------------------------------------------------- /tests_deprecated/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/transformers/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/transformers/test_kernel_fusion_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | AutoTokenizer, 3 | AutoModelForSeq2SeqLM, 4 | ) 5 | 6 | from oslo.transformers.kernel_fusion_utils import fused_no_repeat_ngram_logits_processor 7 | 8 | tokenizer = AutoTokenizer.from_pretrained("t5-base") 9 | model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to("cuda") 10 | 11 | output = model.generate( 12 | **tokenizer("hello", return_tensors="pt").to("cuda"), no_repeat_ngram_size=2 13 | ) 14 | print(tokenizer.decode(output[0])) 15 | 16 | fused_no_repeat_ngram_logits_processor(model) 17 | 18 | output = model.generate( 19 | **tokenizer("hello", return_tensors="pt").to("cuda"), no_repeat_ngram_size=2 20 | ) 21 | print(tokenizer.decode(output[0])) 22 | 
-------------------------------------------------------------------------------- /tests_deprecated/transformers/trainer/oslo_user_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_parallelism": { 3 | "enable": true, 4 | "parallel_size": 1, 5 | "zero_stage": 0 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /tests_deprecated/transformers/trainer/test_oslo_config.py: -------------------------------------------------------------------------------- 1 | from oslo.transformers.oslo_init import OsloTrainerConfig, init_oslo_features 2 | 3 | 4 | oslo_init_dict_form = { 5 | "data_parallelism": { 6 | "enable": True, 7 | "parallel_size": 1, 8 | "zero_stage": 0, 9 | }, 10 | "tensor_parallelism": { 11 | "enable": True, 12 | "parallel_size": 1, 13 | "parallel_mode": "1d", 14 | }, 15 | "sequence_parallelism": {"enable": True, "parallel_size": 1}, 16 | } 17 | 18 | user_config_from_dict = OsloTrainerConfig(oslo_init_dict_form) 19 | 20 | user_config_from_json = OsloTrainerConfig("oslo_user_config.json") 21 | 22 | print(user_config_from_dict) 23 | 24 | res = init_oslo_features(user_config_from_dict) 25 | 26 | print(res) 27 | --------------------------------------------------------------------------------