├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature.md │ └── todo.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── contributors.yaml │ └── pull_request.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE.apache-2.0 ├── MANIFEST.in ├── README.md ├── assets └── logo.png ├── docs ├── .buildinfo ├── .nojekyll ├── CNAME ├── CONCEPTS │ ├── data_parallelism.html │ ├── dp │ │ └── zero_algorithm.html │ ├── parallel_context.html │ ├── tensor_model_parallelism.html │ └── tp │ │ ├── 1d_parallel_algorithm.html │ │ ├── 2d_parallel_algorithm.html │ │ ├── 2p5d_parallel_algorithm.html │ │ └── 3d_parallel_algorithm.html ├── Makefile ├── TUTORIALS │ ├── data_parallelism.html │ ├── tensor_model_parallelism.html │ └── zero_redundancy_optimizer.html ├── _images │ ├── 260461C3-EA3B-405C-9B34-05BA3C781161.png │ ├── 2d.png │ ├── 2p5d.png │ ├── 98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg │ ├── E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg │ ├── figure1.png │ └── figure11.png ├── _sources │ ├── CONCEPTS │ │ ├── data_parallelism.md │ │ ├── dp │ │ │ └── zero_algorithm.md │ │ ├── parallel_context.md │ │ ├── tensor_model_parallelism.md │ │ └── tp │ │ │ ├── 1d_parallel_algorithm.md │ │ │ ├── 2d_parallel_algorithm.md │ │ │ ├── 2p5d_parallel_algorithm.md │ │ │ └── 3d_parallel_algorithm.md │ ├── TUTORIALS │ │ ├── data_parallelism.md │ │ ├── tensor_model_parallelism.md │ │ └── zero_redundancy_optimizer.md │ └── index.rst ├── _static │ ├── _sphinx_javascript_frameworks_compat.js │ ├── basic.css │ ├── doctools.js │ ├── documentation_options.js │ ├── file.png │ ├── images │ │ ├── logo_binder.svg │ │ ├── logo_colab.png │ │ ├── logo_deepnote.svg │ │ └── logo_jupyterhub.svg │ ├── jquery.js │ ├── language_data.js │ ├── locales │ │ ├── ar │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── bg │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── bn │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ca │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── cs │ │ │ └── 
LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── da │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── de │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── el │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── eo │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── es │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── et │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── fi │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── fr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── hr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── id │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── it │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── iw │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ja │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ko │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── lt │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── lv │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ml │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── mr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ms │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── nl │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── no │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── pl │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── pt │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ro │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ru │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sk │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sl │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── sv │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ta │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── te │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── tg │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── th │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── tl │ │ │ └── LC_MESSAGES │ │ │ │ └── 
booktheme.po │ │ ├── tr │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── uk │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── ur │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── vi │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ ├── zh_CN │ │ │ └── LC_MESSAGES │ │ │ │ └── booktheme.po │ │ └── zh_TW │ │ │ └── LC_MESSAGES │ │ │ └── booktheme.po │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── sbt-webpack-macros.html │ ├── scripts │ │ ├── bootstrap.js │ │ ├── pydata-sphinx-theme.js │ │ ├── sphinx-book-theme.js │ │ └── sphinx-book-theme.js.map │ ├── searchtools.js │ ├── sphinx_highlight.js │ ├── styles │ │ ├── bootstrap.css │ │ ├── pydata-sphinx-theme.css │ │ ├── sphinx-book-theme.css │ │ └── theme.css │ ├── vendor │ │ └── fontawesome │ │ │ └── 6.1.2 │ │ │ ├── LICENSE.txt │ │ │ ├── css │ │ │ └── all.min.css │ │ │ └── webfonts │ │ │ ├── fa-brands-400.ttf │ │ │ ├── fa-brands-400.woff2 │ │ │ ├── fa-regular-400.ttf │ │ │ ├── fa-regular-400.woff2 │ │ │ ├── fa-solid-900.ttf │ │ │ ├── fa-solid-900.woff2 │ │ │ ├── fa-v4compatibility.ttf │ │ │ └── fa-v4compatibility.woff2 │ └── webpack-macros.html ├── genindex.html ├── index.html ├── make.bat ├── objects.inv ├── search.html ├── searchindex.js └── source │ ├── CONCEPTS │ ├── data_parallelism.md │ ├── dp │ │ ├── zero_algorithm.md │ │ └── zero_image │ │ │ └── figure1.png │ ├── parallel_context.md │ ├── tensor_model_parallelism.md │ └── tp │ │ ├── 1d_image │ │ └── 98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg │ │ ├── 1d_parallel_algorithm.md │ │ ├── 2d_image │ │ └── 2d.png │ │ ├── 2d_parallel_algorithm.md │ │ ├── 2p5d_image │ │ └── 2p5d.png │ │ ├── 2p5d_parallel_algorithm.md │ │ ├── 3d_image │ │ └── E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg │ │ └── 3d_parallel_algorithm.md │ ├── TUTORIALS │ ├── data_parallelism.md │ ├── image │ │ ├── 260461C3-EA3B-405C-9B34-05BA3C781161.png │ │ └── figure1.png │ ├── tensor_model_parallelism.md │ └── zero_redundancy_optimizer.md │ ├── conf.py │ └── index.rst ├── fname.list ├── 
gcc_install.sh ├── oslo ├── __init__.py ├── __version__.py ├── lightseq2 │ ├── __init__.py │ ├── csrc │ │ ├── example │ │ │ ├── CMakeLists.txt │ │ │ ├── bert_example.cc │ │ │ ├── gpt_example.cc │ │ │ └── transformer_example.cc │ │ ├── kernels │ │ │ ├── arm │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── gemm.cc │ │ │ │ ├── includes │ │ │ │ │ ├── kernel_headers.h │ │ │ │ │ └── utils.h │ │ │ │ └── utils.cc │ │ │ ├── cuda │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── crf.cu │ │ │ │ ├── cross_entropy.cu │ │ │ │ ├── cublas_algo_map.cpp │ │ │ │ ├── cublas_wrappers.cpp │ │ │ │ ├── cublas_wrappers.cu │ │ │ │ ├── cuda_util.cu │ │ │ │ ├── dropout_kernels.cu │ │ │ │ ├── embKernels.cc.cu │ │ │ │ ├── embedding_kernels.cu │ │ │ │ ├── fused_adam_kernel.cu │ │ │ │ ├── gemm_test.cpp │ │ │ │ ├── general_kernels.cu │ │ │ │ ├── gptKernels.cc.cu │ │ │ │ ├── includes │ │ │ │ │ ├── block_reduce.h │ │ │ │ │ ├── common.h │ │ │ │ │ ├── cublas_algo_map.h │ │ │ │ │ ├── cublas_wrappers.h │ │ │ │ │ ├── cuda_util.h │ │ │ │ │ ├── embKernels.h │ │ │ │ │ ├── fused_adam_kernel.h │ │ │ │ │ ├── gptKernels.h │ │ │ │ │ ├── kernel_headers.h │ │ │ │ │ ├── kernels.h │ │ │ │ │ ├── ls_cub.cuh │ │ │ │ │ ├── multi_tensor_apply.cuh │ │ │ │ │ ├── transformerKernels.h │ │ │ │ │ └── util.h │ │ │ │ ├── normalize_kernels.cu │ │ │ │ ├── quantize_kernels.cu │ │ │ │ ├── softmax_kernels.cu │ │ │ │ ├── softmax_kernels_new.cu │ │ │ │ ├── transform_kernels.cu │ │ │ │ ├── transform_kernels_new.cu │ │ │ │ ├── transformerKernels.cc.cu │ │ │ │ └── util.cc.cu │ │ │ └── x86 │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── gemm.cpp │ │ │ │ ├── includes │ │ │ │ ├── kernel_headers.h │ │ │ │ ├── kernels.h │ │ │ │ └── util.h │ │ │ │ └── util.cc │ │ ├── layers │ │ │ ├── cross_entropy_layer.cpp │ │ │ ├── includes │ │ │ │ ├── cross_entropy_layer.h │ │ │ │ ├── quant_linear_layer.h │ │ │ │ ├── transformer_decoder_layer.h │ │ │ │ ├── transformer_embedding_layer.h │ │ │ │ └── transformer_encoder_layer.h │ │ │ ├── quant_linear_layer.cpp │ │ │ ├── 
transformer_decoder_layer.cpp │ │ │ ├── transformer_embedding_layer.cpp │ │ │ └── transformer_encoder_layer.cpp │ │ ├── layers_new │ │ │ ├── CMakeLists.txt │ │ │ ├── crf_layer.cpp │ │ │ ├── dec_enc_attention_layer.cpp │ │ │ ├── dec_self_attention_layer.cpp │ │ │ ├── encdec_kv_layer.cpp │ │ │ ├── feed_forward_layer.cpp │ │ │ ├── generator_layer.cpp │ │ │ ├── gpt_attention_layer.cpp │ │ │ ├── gpt_layer.cpp │ │ │ ├── includes │ │ │ │ ├── crf_layer.h │ │ │ │ ├── dec_enc_attention_layer.h │ │ │ │ ├── dec_self_attention_layer.h │ │ │ │ ├── encdec_kv_layer.h │ │ │ │ ├── feed_forward_layer.h │ │ │ │ ├── generator_layer.h │ │ │ │ ├── gpt_attention_layer.h │ │ │ │ ├── gpt_layer.h │ │ │ │ ├── launch_dec_emb_layer.h │ │ │ │ ├── launch_enc_emb_layer.h │ │ │ │ ├── launch_gpt_emb_layer.h │ │ │ │ ├── linear_layer.h │ │ │ │ ├── lyr_normalize_layer.h │ │ │ │ ├── multihead_attention_layer.h │ │ │ │ ├── sample_layer.h │ │ │ │ ├── sdpa_layer.h │ │ │ │ ├── transformer_decoder_layer.h │ │ │ │ └── transformer_encoder_layer.h │ │ │ ├── linear_layer.cpp │ │ │ ├── multihead_attention_layer.cpp │ │ │ ├── sample_layer.cpp │ │ │ ├── sdpa_layer.cpp │ │ │ ├── transformer_decoder_layer.cpp │ │ │ └── transformer_encoder_layer.cpp │ │ ├── lsflow │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── allocator.cpp │ │ │ ├── context.cpp │ │ │ ├── includes │ │ │ │ ├── allocator.h │ │ │ │ ├── context.h │ │ │ │ ├── declaration.h │ │ │ │ ├── layer.h │ │ │ │ ├── lsflow_util.h │ │ │ │ ├── manager.h │ │ │ │ ├── node.h │ │ │ │ ├── shape.h │ │ │ │ └── tensor.h │ │ │ ├── layer.cpp │ │ │ ├── lsflow_util.cpp │ │ │ ├── manager.cpp │ │ │ ├── node.cpp │ │ │ ├── operator.cpp │ │ │ ├── shape.cpp │ │ │ ├── tensor.cpp │ │ │ └── variable.cpp │ │ ├── models │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.cc │ │ │ ├── bert_crf.cc │ │ │ ├── gpt.cc │ │ │ ├── includes │ │ │ │ ├── bert.h │ │ │ │ ├── bert_crf.h │ │ │ │ ├── gpt.h │ │ │ │ ├── model_base.h │ │ │ │ ├── model_util.h │ │ │ │ └── transformer.h │ │ │ ├── model_util.cc │ │ │ 
├── test_layer.cc │ │ │ └── transformer.cu │ │ ├── ops │ │ │ └── includes │ │ │ │ ├── context.h │ │ │ │ ├── dropout.h │ │ │ │ ├── feed_forward.h │ │ │ │ ├── normalize_layer.h │ │ │ │ ├── softmax.h │ │ │ │ └── strided_batch_gemm.h │ │ ├── ops_new │ │ │ ├── CMakeLists.txt │ │ │ ├── beam_search_topk.cu │ │ │ ├── bias_act_dropout.cpp │ │ │ ├── bias_add_transform_20314.cpp │ │ │ ├── bias_dropout_residual.cpp │ │ │ ├── concat3_dim1.cpp │ │ │ ├── crf.cpp │ │ │ ├── dropout.cpp │ │ │ ├── includes │ │ │ │ ├── beam_search_topk.h │ │ │ │ ├── bias_act_dropout.h │ │ │ │ ├── bias_add_transform_20314.h │ │ │ │ ├── bias_dropout_residual.h │ │ │ │ ├── concat3_dim1.h │ │ │ │ ├── crf.h │ │ │ │ ├── dropout.h │ │ │ │ ├── launch_dec_emb_op.h │ │ │ │ ├── launch_enc_emb.h │ │ │ │ ├── launch_gpt_emb.h │ │ │ │ ├── layer_normalize.h │ │ │ │ ├── linear.h │ │ │ │ ├── sampling.h │ │ │ │ ├── softmax.h │ │ │ │ ├── split_head_op.h │ │ │ │ ├── strided_batch_gemm.h │ │ │ │ └── transform_0213.h │ │ │ ├── launch_dec_emb_op.cpp │ │ │ ├── launch_enc_emb.cpp │ │ │ ├── launch_gpt_emb.cpp │ │ │ ├── layer_normalize.cpp │ │ │ ├── linear.cpp │ │ │ ├── sampling.cc.cu │ │ │ ├── softmax.cpp │ │ │ ├── split_head_op.cpp │ │ │ ├── strided_batch_gemm.cpp │ │ │ └── transform_0213.cpp │ │ ├── proto │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.proto │ │ │ ├── bert_crf.proto │ │ │ ├── bert_crf_weight.cc │ │ │ ├── bert_weight.cc │ │ │ ├── gpt.proto │ │ │ ├── gpt_weight.cc │ │ │ ├── includes │ │ │ │ ├── bert_crf_weight.h │ │ │ │ ├── bert_weight.h │ │ │ │ ├── gpt_weight.h │ │ │ │ ├── proto_headers.h │ │ │ │ ├── proto_util.h │ │ │ │ ├── test_model_weight.h │ │ │ │ └── transformer_weight.h │ │ │ ├── proto_util.cc │ │ │ ├── transformer.proto │ │ │ └── transformer_weight.cc │ │ ├── pybind │ │ │ ├── CMakeLists.txt │ │ │ ├── pybind_adam.cpp │ │ │ ├── pybind_kernel_cuda.cpp │ │ │ ├── pybind_kernel_x86.cpp │ │ │ ├── pybind_layer.cpp │ │ │ ├── pybind_layer_new.cpp │ │ │ ├── pybind_model.cpp │ │ │ └── pybind_op.cpp │ │ ├── pytorch │ │ │ 
├── __init__.py │ │ │ ├── builder │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ ├── cuda_kernel_builder.py │ │ │ │ ├── cuda_layer_builder.py │ │ │ │ └── x86_kernel_builder.py │ │ │ ├── layer_base.py │ │ │ ├── pytorch_quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── calib │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── calibrator.py │ │ │ │ │ ├── histogram.py │ │ │ │ │ └── max.py │ │ │ │ ├── nn │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── _functions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── quant_rnn.py │ │ │ │ │ ├── functional.py │ │ │ │ │ └── modules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── _utils.py │ │ │ │ │ │ ├── clip.py │ │ │ │ │ │ ├── quant_bert.py │ │ │ │ │ │ ├── quant_conv.py │ │ │ │ │ │ ├── quant_instancenorm.py │ │ │ │ │ │ ├── quant_linear.py │ │ │ │ │ │ ├── quant_pooling.py │ │ │ │ │ │ ├── quant_rnn.py │ │ │ │ │ │ └── tensor_quantizer.py │ │ │ │ ├── optim │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── helper.py │ │ │ │ ├── quant_modules.py │ │ │ │ ├── tensor_quant.py │ │ │ │ ├── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── quant_logging.py │ │ │ │ │ └── reduce_amax.py │ │ │ │ └── version.py │ │ │ ├── quantization.py │ │ │ ├── sdpa_layers.py │ │ │ ├── torch_transformer_layers.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── transformer_encoder_layer.py │ │ │ └── util.py │ │ ├── tensorflow │ │ │ └── README.md │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── cuda │ │ │ │ ├── __init__.py │ │ │ │ ├── fairseq_layers.py │ │ │ │ ├── test_kernel.py │ │ │ │ ├── test_layer.py │ │ │ │ ├── test_ls_small_layer.py │ │ │ │ └── torch_crf.py │ │ │ ├── util.py │ │ │ └── x86 │ │ │ │ └── test_kernel.py │ │ └── triton_backend │ │ │ ├── CMakeLists.txt │ │ │ ├── cmake │ │ │ └── TutorialMinimalBackendConfig.cmake.in │ │ │ └── src │ │ │ ├── libtriton_minimal.ldscript │ │ │ ├── lightseq_backend.cc │ │ │ ├── triton_model.h │ │ │ └── triton_utils.h │ ├── inference │ │ ├── kernels │ │ │ ├── CMakeLists.txt │ │ │ ├── common.h │ │ │ ├── embKernels.cc.cu │ │ │ ├── embKernels.h │ │ │ ├── 
embKernels_int8.cc.cu │ │ │ ├── embKernels_int8.h │ │ │ ├── gptKernels.cc.cu │ │ │ ├── gptKernels.h │ │ │ ├── gptKernels_int8.cc.cu │ │ │ ├── gptKernels_int8.h │ │ │ ├── moeKernels.cc.cu │ │ │ ├── moeKernels.h │ │ │ ├── multilgKernels.cc.cu │ │ │ ├── multilgKernels.h │ │ │ ├── t5EmbKernels.cc.cu │ │ │ ├── t5EmbKernels.h │ │ │ ├── t5Kernels.cc.cu │ │ │ ├── t5Kernels.h │ │ │ ├── transformerKernels.cc.cu │ │ │ ├── transformerKernels.h │ │ │ ├── transformerKernels_int8.cc.cu │ │ │ └── transformerKernels_int8.h │ │ ├── model │ │ │ ├── CMakeLists.txt │ │ │ ├── bert_encoder.cc.cu │ │ │ ├── bert_encoder.h │ │ │ ├── cublas_algo_map.cc │ │ │ ├── cublas_algo_map.h │ │ │ ├── cublas_helper.cc │ │ │ ├── cublas_helper.h │ │ │ ├── decoder.cc.cu │ │ │ ├── decoder.h │ │ │ ├── encoder.cc.cu │ │ │ ├── encoder.h │ │ │ ├── gpt_encoder.cc.cu │ │ │ ├── gpt_encoder.h │ │ │ ├── moe_decoder.cc.cu │ │ │ ├── moe_decoder.h │ │ │ ├── moe_encoder.cc.cu │ │ │ ├── moe_encoder.h │ │ │ ├── mt5_decoder.cc.cu │ │ │ ├── mt5_decoder.h │ │ │ ├── mt5_encoder.cc.cu │ │ │ ├── mt5_encoder.h │ │ │ ├── quant_bert_encoder.cc.cu │ │ │ ├── quant_bert_encoder.h │ │ │ ├── quant_decoder.cc.cu │ │ │ ├── quant_decoder.h │ │ │ ├── quant_encoder.cc.cu │ │ │ ├── quant_encoder.h │ │ │ ├── quant_gpt_encoder.cc.cu │ │ │ ├── quant_gpt_encoder.h │ │ │ ├── quant_vit_encoder.cc.cu │ │ │ ├── quant_vit_encoder.h │ │ │ ├── t5_decoder.cc.cu │ │ │ ├── t5_decoder.h │ │ │ ├── t5_encoder.cc.cu │ │ │ ├── t5_encoder.h │ │ │ ├── vit_encoder.cc.cu │ │ │ └── vit_encoder.h │ │ ├── proto │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.proto │ │ │ ├── bert_weight.cc │ │ │ ├── bert_weight.h │ │ │ ├── gpt.proto │ │ │ ├── gpt_weight.cc │ │ │ ├── gpt_weight.h │ │ │ ├── moe.proto │ │ │ ├── moe_weight.cc │ │ │ ├── moe_weight.h │ │ │ ├── mt5.proto │ │ │ ├── mt5_weight.cc │ │ │ ├── mt5_weight.h │ │ │ ├── quant_bert.proto │ │ │ ├── quant_bert_weight.cc │ │ │ ├── quant_bert_weight.h │ │ │ ├── quant_gpt.proto │ │ │ ├── quant_gpt_weight.cc │ │ │ ├── 
quant_gpt_weight.h │ │ │ ├── quant_transformer.proto │ │ │ ├── quant_transformer_weight.cc │ │ │ ├── quant_transformer_weight.h │ │ │ ├── quant_vit.proto │ │ │ ├── quant_vit_weight.cc │ │ │ ├── quant_vit_weight.h │ │ │ ├── t5.proto │ │ │ ├── t5_weight.cc │ │ │ ├── t5_weight.h │ │ │ ├── transformer.proto │ │ │ ├── transformer_weight.cc │ │ │ ├── transformer_weight.h │ │ │ ├── vit.proto │ │ │ ├── vit_weight.cc │ │ │ └── vit_weight.h │ │ ├── pywrapper │ │ │ ├── CMakeLists.txt │ │ │ ├── bert.cc │ │ │ ├── bert.h │ │ │ ├── gpt.cc │ │ │ ├── gpt.h │ │ │ ├── model_base.h │ │ │ ├── moe.cc │ │ │ ├── moe.h │ │ │ ├── mt5.cc │ │ │ ├── mt5.h │ │ │ ├── quant_bert.cc │ │ │ ├── quant_bert.h │ │ │ ├── quant_gpt.cc │ │ │ ├── quant_gpt.h │ │ │ ├── quant_transformer.cc │ │ │ ├── quant_transformer.h │ │ │ ├── quant_vit.cc │ │ │ ├── quant_vit.h │ │ │ ├── t5.cc │ │ │ ├── t5.h │ │ │ ├── transformer.cc │ │ │ ├── transformer.h │ │ │ ├── transformer_decoder.cc.cu │ │ │ ├── vit.cc │ │ │ ├── vit.h │ │ │ └── wrapper.cc │ │ ├── server │ │ │ ├── CMakeLists.txt │ │ │ ├── custom.h │ │ │ ├── decoder_generate_server.cc.cu │ │ │ ├── generate_server.cc.cu │ │ │ ├── gpt_generate_server.cc.cu │ │ │ ├── gptlm_server.cc.cu │ │ │ ├── libserver.ldscript │ │ │ ├── model_config.h │ │ │ ├── model_config.proto │ │ │ ├── model_config_cuda.h │ │ │ ├── moe_server.cc.cu │ │ │ └── transformer_server.cc.cu │ │ ├── tools │ │ │ ├── CMakeLists.txt │ │ │ ├── util.cc.cu │ │ │ └── util.h │ │ └── triton_backend │ │ │ ├── CMakeLists.txt │ │ │ ├── cmake │ │ │ └── TutorialMinimalBackendConfig.cmake.in │ │ │ └── src │ │ │ ├── libtriton_minimal.ldscript │ │ │ ├── lightseq_backend.cc │ │ │ ├── triton_model.h │ │ │ └── triton_utils.h │ └── training │ │ ├── __init__.py │ │ ├── cli │ │ ├── __init__.py │ │ ├── fs_modules │ │ │ ├── __init__.py │ │ │ ├── ls_adam.py │ │ │ ├── ls_bart.py │ │ │ ├── ls_fs_transformer_decoder_layer.py │ │ │ ├── ls_label_smoothed_cross_entropy.py │ │ │ ├── ls_transformer.py │ │ │ └── ls_translation.py │ │ ├── 
lightseq_deepspeed_cli.py │ │ ├── lightseq_fairseq_generate_cli.py │ │ ├── lightseq_fairseq_train_cli.py │ │ ├── lightseq_fairseq_validate_cli.py │ │ └── lightseq_infer_cli.py │ │ ├── csrc │ │ └── ops │ │ │ └── includes │ │ │ └── strided_batch_gemm.h │ │ ├── gcq │ │ ├── __init__.py │ │ ├── gcq.py │ │ ├── ls_fs_gcq_train.py │ │ └── ls_fs_gcq_trainer.py │ │ ├── ops │ │ ├── __init__.py │ │ ├── pytorch │ │ │ ├── __init__.py │ │ │ ├── adam.py │ │ │ ├── builder │ │ │ │ ├── __init__.py │ │ │ │ ├── adam_builder.py │ │ │ │ ├── builder.py │ │ │ │ ├── kernel_builder.py │ │ │ │ ├── layer_builder.py │ │ │ │ ├── operator_builder.py │ │ │ │ └── transformer_builder.py │ │ │ ├── cross_entropy_layer.py │ │ │ ├── export.py │ │ │ ├── export_quant.py │ │ │ ├── gemm_test.py │ │ │ ├── gpt_layer.py │ │ │ ├── layer_base.py │ │ │ ├── multihead_attention_layer.py │ │ │ ├── quant_linear_layer.py │ │ │ ├── quantization.py │ │ │ ├── torch_transformer_layers.py │ │ │ ├── transformer.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── transformer_decoder_layer_new.py │ │ │ ├── transformer_embedding_layer.py │ │ │ ├── transformer_encoder_layer.py │ │ │ ├── transformer_encoder_layer_new.py │ │ │ └── util.py │ │ └── tensorflow │ │ │ ├── README.md │ │ │ └── __init__.py │ │ └── pytorch_quantization │ │ ├── __init__.py │ │ ├── calib │ │ ├── __init__.py │ │ ├── calibrator.py │ │ ├── histogram.py │ │ └── max.py │ │ ├── nn │ │ ├── __init__.py │ │ ├── _functions │ │ │ ├── __init__.py │ │ │ └── quant_rnn.py │ │ ├── functional.py │ │ └── modules │ │ │ ├── __init__.py │ │ │ ├── _utils.py │ │ │ ├── clip.py │ │ │ ├── quant_bert.py │ │ │ ├── quant_conv.py │ │ │ ├── quant_instancenorm.py │ │ │ ├── quant_linear.py │ │ │ ├── quant_pooling.py │ │ │ ├── quant_rnn.py │ │ │ └── tensor_quantizer.py │ │ ├── optim │ │ ├── __init__.py │ │ └── helper.py │ │ ├── quant_modules.py │ │ ├── tensor_quant.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── quant_logging.py │ │ └── reduce_amax.py │ │ └── version.py ├── torch │ ├── _C │ │ 
├── __init__.py │ │ └── csrc │ │ │ ├── CPUAdagradBinder.cpp │ │ │ ├── CPUAdamBinder.cpp │ │ │ ├── ExpertParallelBinder.cpp │ │ │ ├── FusedAdagradBinder.cpp │ │ │ ├── FusedAdamBinder.cpp │ │ │ ├── FusedL2NormBinder.cpp │ │ │ ├── FusedLambBinder.cpp │ │ │ ├── FusedLayerNormBinder.cpp │ │ │ ├── FusedMixedPrecisionL2NormBinder.cpp │ │ │ ├── FusedMixedPrecisionLambBinder.cpp │ │ │ ├── FusedNovogradBinder.cpp │ │ │ ├── FusedSGDBinder.cpp │ │ │ ├── FusedScaleMaskSoftmaxBinder.cpp │ │ │ ├── __init__.py │ │ │ ├── custom_cuda_kernel.cu │ │ │ ├── expert_parallel_cuda_kernel.cu │ │ │ ├── fused_layer_norm.cu │ │ │ ├── includes │ │ │ ├── block_reduce.h │ │ │ ├── compat.h │ │ │ ├── context.h │ │ │ ├── cpu_adagrad.h │ │ │ ├── cpu_adam.h │ │ │ ├── custom_cuda_layers.h │ │ │ ├── multi_tensor_apply.cuh │ │ │ ├── simd.h │ │ │ ├── tqdm.h │ │ │ └── type_shim.h │ │ │ ├── multi_tensor_adagrad.cu │ │ │ ├── multi_tensor_adam.cu │ │ │ ├── multi_tensor_l2norm.cu │ │ │ ├── multi_tensor_l2norm_mp.cu │ │ │ ├── multi_tensor_lamb.cu │ │ │ ├── multi_tensor_lamb_mp.cu │ │ │ ├── multi_tensor_novograd.cu │ │ │ ├── multi_tensor_sgd.cu │ │ │ ├── ngram_repeat_block_cuda.cpp │ │ │ └── ngram_repeat_block_cuda_kernel.cu │ ├── __init__.py │ ├── distributed │ │ ├── __init__.py │ │ ├── _initializers │ │ │ ├── __init__.py │ │ │ ├── initializer.py │ │ │ ├── initializer_data.py │ │ │ ├── initializer_expert.py │ │ │ ├── initializer_model.py │ │ │ ├── initializer_pipeline.py │ │ │ ├── initializer_sequence.py │ │ │ ├── initializer_tensor.py │ │ │ ├── initializer_tensor_1d.py │ │ │ ├── initializer_tensor_2d.py │ │ │ ├── initializer_tensor_2p5d.py │ │ │ └── initializer_tensor_3d.py │ │ ├── _seed │ │ │ ├── __init__.py │ │ │ ├── helper.py │ │ │ └── seed_manager.py │ │ ├── nn │ │ │ ├── __init__.py │ │ │ ├── _p2p.py │ │ │ ├── _ring_self_attention.py │ │ │ └── functional.py │ │ ├── parallel_context.py │ │ └── parallel_mode.py │ ├── jit │ │ ├── __init__.py │ │ └── _utils.py │ ├── nn │ │ ├── __init__.py │ │ ├── modules │ │ │ 
├── __init__.py │ │ │ ├── conv.py │ │ │ ├── dropout.py │ │ │ ├── embedding.py │ │ │ ├── functional.py │ │ │ ├── layer_norm.py │ │ │ ├── linear.py │ │ │ ├── loss.py │ │ │ └── ngram_repeat_block.py │ │ └── parallel │ │ │ ├── __init__.py │ │ │ ├── data_parallel │ │ │ ├── __init__.py │ │ │ ├── _reducer.py │ │ │ ├── _utils.py │ │ │ ├── data_parallel.py │ │ │ └── zero │ │ │ │ ├── __init__.py │ │ │ │ ├── _optim_interface.py │ │ │ │ ├── hetero │ │ │ │ ├── __init__.py │ │ │ │ ├── chunk │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── chunk.py │ │ │ │ │ ├── manager.py │ │ │ │ │ └── utils.py │ │ │ │ ├── data_parallel.py │ │ │ │ ├── hook.py │ │ │ │ ├── memory_manager.py │ │ │ │ ├── memory_tracer │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── chunk_memstats_collector.py │ │ │ │ │ ├── memory_monitor.py │ │ │ │ │ ├── memory_stats.py │ │ │ │ │ ├── param_runtime_order.py │ │ │ │ │ └── utils.py │ │ │ │ ├── optim.py │ │ │ │ ├── placement_policy.py │ │ │ │ └── utils.py │ │ │ │ └── optim │ │ │ │ ├── __init__.py │ │ │ │ ├── _utils.py │ │ │ │ ├── bookkeeping │ │ │ │ ├── __init__.py │ │ │ │ ├── _base_store.py │ │ │ │ ├── bucket_store.py │ │ │ │ ├── gradient_store.py │ │ │ │ ├── parameter_store.py │ │ │ │ └── tensor_store.py │ │ │ │ └── optim.py │ │ │ ├── expert_parallel │ │ │ ├── __init__.py │ │ │ ├── _ops.py │ │ │ ├── expert_parallel.py │ │ │ ├── experts.py │ │ │ ├── layers.py │ │ │ ├── mapping.py │ │ │ └── utils.py │ │ │ ├── pipeline_parallel │ │ │ ├── __init__.py │ │ │ ├── _buffers.py │ │ │ ├── _comm.py │ │ │ ├── _cost_estimator.py │ │ │ ├── _functional.py │ │ │ ├── _job.py │ │ │ ├── _messages.py │ │ │ ├── _model_partitioner.py │ │ │ ├── _sync.py │ │ │ ├── _types.py │ │ │ ├── _utils.py │ │ │ ├── _workers.py │ │ │ └── pipeline_parallel.py │ │ │ ├── tensor_parallel │ │ │ ├── _1d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── _wrapper.py │ │ │ ├── _2d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── _wrapper.py │ │ │ ├── _2p5d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── 
_wrapper.py │ │ │ ├── _3d │ │ │ │ ├── __init__.py │ │ │ │ ├── _ops.py │ │ │ │ └── _wrapper.py │ │ │ ├── __init__.py │ │ │ ├── mapping.py │ │ │ ├── tensor_parallel.py │ │ │ └── utils.py │ │ │ └── utils.py │ ├── optim │ │ ├── __init__.py │ │ ├── cpu_adagrad.py │ │ ├── cpu_adam.py │ │ ├── fused_adagrad.py │ │ ├── fused_adam.py │ │ ├── fused_lamb.py │ │ ├── fused_mixed_precision_lamb.py │ │ ├── fused_novograd.py │ │ └── fused_sgd.py │ └── utils │ │ ├── __init__.py │ │ ├── checkpoint │ │ ├── __init__.py │ │ ├── _checkpoint_function.py │ │ ├── _checkpoint_partitioner.py │ │ ├── _checkpoint_utils.py │ │ ├── _rng_state_tracker.py │ │ └── activation_checkpointing.py │ │ ├── common.py │ │ ├── data │ │ ├── __init__.py │ │ └── data_collators.py │ │ ├── extensions.py │ │ ├── logging │ │ ├── __init__.py │ │ └── logger.py │ │ ├── multi_tensor_apply │ │ ├── __init__.py │ │ └── multi_tensor_apply.py │ │ ├── random.py │ │ └── version.py └── transformers │ ├── __init__.py │ ├── constants.py │ ├── data │ ├── __init__.py │ └── data_collator.py │ ├── kernel_fusion_utils.py │ ├── mapping_utils.py │ ├── modeling_utils.py │ ├── models │ ├── __init__.py │ ├── albert │ │ ├── __init__.py │ │ └── modeling_albert.py │ ├── bart │ │ ├── __init__.py │ │ └── modeling_bart.py │ ├── bert │ │ ├── __init__.py │ │ └── modeling_bert.py │ ├── distilbert │ │ ├── __init__.py │ │ └── modeling_distilbert.py │ ├── electra │ │ ├── __init__.py │ │ └── modeling_electra.py │ ├── gpt2 │ │ ├── __init__.py │ │ └── modeling_gpt2.py │ ├── mbart │ │ ├── __init__.py │ │ └── modeling_mbart.py │ ├── mt5 │ │ ├── __init__.py │ │ └── modeling_mt5.py │ ├── roberta │ │ ├── __init__.py │ │ └── modeling_roberta.py │ └── t5 │ │ ├── __init__.py │ │ └── modeling_t5.py │ ├── oslo_init.py │ ├── profiler.py │ ├── tasks │ ├── __init__.py │ ├── data_albert_pretraining.py │ ├── data_bart_pretraining.py │ ├── data_base.py │ ├── data_bert_pretraining.py │ ├── data_causal_lm.py │ ├── data_masked_lm.py │ ├── data_sequence_classification.py │ 
├── data_summarization.py │ ├── data_t5_pretraining.py │ ├── data_token_classification.py │ ├── data_utils.py │ └── loading │ │ ├── __init__.py │ │ └── sent_text.py │ ├── trainer.py │ ├── trainer_utils.py │ └── training_args.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── inference.py ├── merge.py ├── misc │ └── test_rpc.py ├── tasks │ └── model_task.py ├── test_script │ ├── run_inference.sh │ ├── run_merge.sh │ └── run_train.sh ├── training.py └── util │ ├── arg_parser.py │ └── oslo.py └── tests_deprecated ├── __init__.py ├── torch ├── _C │ └── csrc │ │ ├── test_fused_layer_norm.py │ │ ├── test_fused_layer_norm_autocast.py │ │ ├── test_fused_layer_norm_fusedlayernorm.py │ │ ├── test_fused_layer_norm_rms.py │ │ └── test_sequence_generator.py ├── __init__.py ├── distributed │ └── __init__.py ├── nn │ ├── __init__.py │ └── parallel │ │ ├── __init__.py │ │ ├── data_parallel │ │ ├── __init__.py │ │ ├── data_parallel.py │ │ └── zero │ │ │ ├── __init__.py │ │ │ ├── heterogeneous_manager │ │ │ ├── __init__.py │ │ │ ├── test_chunk.py │ │ │ ├── test_chunk_manager.py │ │ │ ├── test_mem_collector.py │ │ │ ├── test_mem_monitor.py │ │ │ └── test_memstats.py │ │ │ ├── sharded_optim │ │ │ ├── __init__.py │ │ │ ├── test_grad_acc.py │ │ │ ├── test_hetero_step.py │ │ │ ├── test_hybrid.py │ │ │ ├── test_integrity.py │ │ │ └── test_mixed_prec.py │ │ │ ├── test_fsdp_wrapper.py │ │ │ ├── test_grad.py │ │ │ └── test_state_dict.py │ │ ├── expert_parallel │ │ └── gpt2 │ │ │ ├── gpt2.py │ │ │ ├── gpt2_deparallelize.py │ │ │ ├── gpt2_load.py │ │ │ ├── gpt2_pr_ep.py │ │ │ ├── gpt2_pr_moe.py │ │ │ ├── gpt2_save.py │ │ │ └── utils.py │ │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── compare_grad_pptp_no.py │ │ ├── compare_grad_tp_no.py │ │ ├── compare_output_pptp_no.py │ │ ├── compare_output_tp_no.py │ │ ├── compare_pp_nopp.py │ │ ├── compare_pptp_trial.py │ │ ├── compare_send_recv.py │ │ ├── test_batch_order.py │ │ ├── test_p2p.py │ │ ├── test_partioning.py │ │ ├── 
test_pp.py │ │ ├── test_pp4.py │ │ ├── test_rpc.py │ │ └── test_tp.py │ │ └── tensor_parallel │ │ ├── 1d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ ├── test_deparallelize.py │ │ │ ├── test_load_parallel.py │ │ │ ├── test_qkv.py │ │ │ └── test_vocab.py │ │ ├── test_col_linear_1d.py │ │ ├── test_embedding_1d.py │ │ ├── test_layer_norm_1d.py │ │ ├── test_row_linear_1d.py │ │ ├── test_vocab_embedding_1d.py │ │ ├── test_vocab_parallel_cross_entropy_1d.py │ │ ├── test_wrapper_1d.py │ │ ├── test_wrapper_1d_vocab.py │ │ └── test_wrapper_1d_with_t5.py │ │ ├── 2d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ ├── test_deparallelize.py │ │ │ ├── test_load_parallel.py │ │ │ └── test_qkv.py │ │ ├── test_embedding_2d.py │ │ ├── test_layer_norm_2d.py │ │ ├── test_linear_2d.py │ │ ├── test_vocab_embedding_2d.py │ │ ├── test_vocab_parallel_cross_entropy_2d.py │ │ ├── test_wrapper_2d.py │ │ └── test_wrapper_2d_vocab.py │ │ ├── 2p5d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ ├── test_deparallelize.py │ │ │ ├── test_load_parallel.py │ │ │ ├── test_qkv.py │ │ │ └── test_vocab.py │ │ ├── test_embedding_2p5d.py │ │ ├── test_layer_norm_2p5d.py │ │ ├── test_linear_2p5d.py │ │ ├── test_vocab_embedding_2p5d.py │ │ ├── test_vocab_parallel_cross_entropy_2p5d.py │ │ ├── test_wrapper_2p5d.py │ │ └── test_wrapper_2p5d_vocab.py │ │ ├── 3d │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── deparallel │ │ │ ├── __init__.py │ │ │ └── test_deparallelize.py │ │ ├── test_embedding_3d.py │ │ ├── test_layer_norm_3d.py │ │ ├── test_linear_3d.py │ │ ├── test_vocab_embedding_3d.py │ │ ├── test_vocab_parallel_cross_entropy_3d.py │ │ ├── test_wrapper_3d.py │ │ └── test_wrapper_3d_vocab.py │ │ └── __init__.py ├── optim │ ├── cpu_adagrad.py │ ├── cpu_adam.py │ ├── fused_adam.py │ ├── fused_lamb.py │ ├── fused_novograd.py │ └── fused_optimizers.py └── utils │ ├── __init__.py │ ├── data │ ├── 
__init__.py │ └── test_data_collators.py │ └── logging.py └── transformers ├── __init__.py ├── models ├── bert │ └── test_modeling_bert.py ├── electra │ ├── test_mlm.py │ └── test_token_cls.py ├── gpt2 │ └── test_modeling_gpt2.py ├── mbart │ └── test_training.py └── mt5 │ └── test_training.py ├── tasks ├── test_data_albert_pretraining.py ├── test_data_bart_pretraining.py ├── test_data_base.py ├── test_data_bert_pretraining.py ├── test_data_causal_lm.py ├── test_data_masked_lm.py ├── test_data_sequence_classification.py ├── test_data_summarization.py ├── test_data_t5_pretraining.py ├── test_data_token_classification.py └── test_model_bart_pretraining.py ├── test_kernel_fusion_utils.py └── trainer ├── oslo_user_config.json ├── test_oslo_config.py ├── test_trainer_basic.py ├── test_trainer_ddp.py ├── test_trainer_dp_zero1.py ├── test_trainer_pp.py ├── test_trainer_reload.py ├── test_trainer_tp_1d.py ├── test_trainer_tp_2d.py ├── test_trainer_tp_2p5d.py └── test_trainer_tp_3d.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @hyunwoongko 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Report a bug 3 | about: Bug report 4 | labels: 'bug' 5 | --- 6 | 7 | ## How to reproduce 8 | 9 | ```python 10 | ``` 11 | 12 | ## Environment 13 | 14 | - OS : 15 | - Python version : 16 | - Transformers version : 17 | - Whether to use Docker: 18 | - Misc.: 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request a feature 3 | about: Feature request 4 | labels: 'enhancement' 5 | --- 6 | 7 | ## Describe a requested feature 8 | 9 | - 10 | 11 | ## Expected behavior 12 | 13 | ```python 14 | >>> a = 
Foo() 15 | >>> a.predict() 16 | ``` 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/todo.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: TODO feature 3 | about: TODO feature 4 | --- 5 | 6 | ## Describe a TODO feature 7 | 8 | - 9 | 10 | ## Assignees 11 | 12 | - 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Title 2 | 3 | - 4 | 5 | ## Description 6 | 7 | - 8 | 9 | ## Linked Issues 10 | 11 | - resolved #00 12 | -------------------------------------------------------------------------------- /.github/workflows/contributors.yaml: -------------------------------------------------------------------------------- 1 | name: Add contributors 2 | on: 3 | schedule: 4 | - cron: '20 20 * * *' 5 | push: 6 | branches: 7 | - master 8 | 9 | jobs: 10 | add-contributors: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - uses: BobAnkh/add-contributors@master 15 | with: 16 | CONTRIBUTOR: 'Thanks so much to all of our amazing contributors!' 
17 | COLUMN_PER_ROW: '6' 18 | ACCESS_TOKEN: ${{secrets.GITHUB_TOKEN}} 19 | IMG_WIDTH: '120' 20 | FONT_SIZE: '14' 21 | PATH: '/README.md' 22 | COMMIT_MESSAGE: 'docs(README): update contributors' 23 | AVATAR_SHAPE: 'round' 24 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-python@v2 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: 'docs|assets' 2 | 3 | default_language_version: 4 | python: python3 5 | 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.4.0 9 | hooks: 10 | - id: check-case-conflict 11 | - id: check-json 12 | - id: check-symlinks 13 | - id: check-yaml 14 | - id: destroyed-symlinks 15 | - id: check-ast 16 | - id: check-merge-conflict 17 | - id: check-added-large-files 18 | args: ['--maxkb=500'] 19 | - id: end-of-file-fixer 20 | - id: fix-byte-order-marker 21 | - id: fix-encoding-pragma 22 | args: [--remove] 23 | - id: mixed-line-ending 24 | args: [--fix=lf] 25 | - id: requirements-txt-fixer 26 | - id: trailing-whitespace 27 | 28 | - repo: https://github.com/pocc/pre-commit-hooks 29 | rev: v1.3.5 30 | hooks: 31 | - id: clang-format 32 | 33 | - repo: https://github.com/psf/black 34 | rev: 22.12.0 35 | hooks: 36 | - id: black 37 | language_version: python3.8 38 | 39 | - repo: https://github.com/codespell-project/codespell 40 | rev: v2.2.2 41 | hooks: 42 | - id: codespell 43 | args: [ 44 | '--ignore-words-list=reord,dout,nd,te,ser,mata', # Word used in error messages that need 
rewording 45 | --check-filenames, 46 | --check-hidden, 47 | ] 48 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.txt 2 | global-include *.cu *.cpp *.cc *.cuh *.h *.ldscript *.proto *.cmake 3 | prune dist 4 | prune build 5 | prune tests 6 | include LICENSE.apache-2.0 7 | include LICENSE.3rd_party_library 8 | include README.md 9 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/assets/logo.png -------------------------------------------------------------------------------- /docs/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
3 | config: 6a7a9cb54d9ab2728b51824ec90997d3 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/.nojekyll -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | oslo.eleuther.ai -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_images/260461C3-EA3B-405C-9B34-05BA3C781161.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/260461C3-EA3B-405C-9B34-05BA3C781161.png -------------------------------------------------------------------------------- /docs/_images/2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/2d.png -------------------------------------------------------------------------------- /docs/_images/2p5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/2p5d.png -------------------------------------------------------------------------------- /docs/_images/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg -------------------------------------------------------------------------------- /docs/_images/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg -------------------------------------------------------------------------------- /docs/_images/figure1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/figure1.png -------------------------------------------------------------------------------- /docs/_images/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_images/figure11.png -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/data_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Data Parallelism 2 | - Authors: Jinwon Kim 3 | 4 | **Data Parallelism** is a widely-used technique for training deep learning models in parallel. It involves distributing the training data across multiple processing units, such as GPUs, each of which has a copy of the model parameters. The data is divided into subsets, and each unit independently computes the gradients for its subset. The gradients are then aggregated to update the model parameters. This approach enables efficient parallelization of the training process and can accelerate the training of deep learning models on large datasets. 5 | 6 | Oslo supports Zero Redundancy Optimizer (ZeRO) to easily scale deep learning model. 
7 | 8 | ## Optimizer-Level Parallel 9 | - [Zero Redundancy Optimizer DP](dp/zero_algorithm.md) 10 | 11 | 12 | ### References 13 | - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054) 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/dp/zero_algorithm.md: -------------------------------------------------------------------------------- 1 | # Zero Redundancy Optimizer DP 2 | - Authors: Jinwon Kim 3 | - Paper: https://arxiv.org/abs/1910.02054 4 | 5 | ![figure1.png](zero_image/figure1.png) 6 | 7 | The Zero Redundancy Optimizer for Data Parallelism (ZeRO-DP) is a technique used to remove memory state redundancies and optimize computational efficiency in data parallel distributed deep learning. ZeRO-DP partitions the model states across data-parallel processes, eliminating the need for replication of model parameters, which in turn reduces memory usage and communication overhead during training. 8 | 9 | ## Optimizer State Partitioning (Level 1) 10 | - The optimizer states are partitioned across data parallel processes 11 | ## Gradient Partitioning (Level 2) 12 | - The reduced gradients are partitioned based on the corresponding parameter and are reduced only by the data parallel process responsible for updating those parameters. After the reduction, the memory can be released. 13 | ## Parameter Partitioning (Level 3) 14 | - Similar to the optimizer states and gradients, each process only stores the parameters associated with its partition. 
15 | 16 | -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/tensor_model_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Tensor Model Parallelism 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | 4 | **Tensor Model Parallelism** makes it possible to train larger models by partitioning the parameter tensors into multiple dimensions. 5 | We support 1D, 2D, 2.5D, and 3D tensor partitioning algorithms which make tensor parallel training more efficient. 6 | 7 | ## Tensor Parallel Algorithms 8 | - [1D parallel algorithm (same as Megatron-LM)](tp/1d_parallel_algorithm.md) 9 | - [2D parallel (SUMMA) algorithm](tp/2d_parallel_algorithm.md) 10 | - [2.5D parallel (SUMMA-2.5) algorithm](tp/2p5d_parallel_algorithm.md) 11 | - [3D parallel Algorithm](tp/3d_parallel_algorithm.md) 12 | 13 | ### References 14 | - [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 15 | - [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343) 16 | - [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500) 17 | - [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450) 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/_sources/CONCEPTS/tp/2d_parallel_algorithm.md: -------------------------------------------------------------------------------- 1 | # 2D parallel (SUMMA) algorithm 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | - Paper : [https://arxiv.org/pdf/2104.05343.pdf](https://arxiv.org/pdf/2104.05343.pdf) 4 | 5 | ![image.png](2d_image/2d.png) 6 | 7 | The use of 1D tensor parallelism can lead to high memory consumption in large-scale models because it does not partition activations. 
8 | To address this issue, a more efficient 2D tensor parallelism algorithm based on SUMMA was introduced. This algorithm evenly distributes computation and memory load. 9 | For instance, when computing a linear layer $Y = XA$, the input $X$ and weight $A$ are split into four sub-matrices and the calculation is done in two steps, broadcasting rows and columns of $X$ and $A$ in turn. 10 | The result is a matrix $Y$ that is the product of $X$ and $A$. 11 | 12 | ## Usage 13 | 14 | Use `ParallelMode.TENSOR_2D` as a parameter of `tensor_parallel_mode`. Since the algorithm splits model along both rows and columns, `tp_size` should be a **square of positive integer**. 15 | 16 | ```python 17 | from oslo import ParallelContext, ParallelMode 18 | from oslo.torch.nn.parallel import TensorParallel 19 | 20 | tp_size = 4 21 | tp_depth = 1 22 | 23 | parallel_context = ParallelContext.from_torch( 24 | data_parallel_size=1, 25 | pipeline_parallel_size=1, 26 | tensor_parallel_size=tp_size, 27 | tensor_parallel_mode=ParallelMode.TENSOR_2D, 28 | ) 29 | model = TensorParallel(model, parallel_context) 30 | oslo.ready(model, parallel_context) 31 | ``` -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'en', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '', 11 | NAVIGATION_WITH_KEYS: true, 12 | SHOW_SEARCH_SUMMARY: true, 13 | ENABLE_SEARCH_SHORTCUTS: true, 14 | }; -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/images/logo_binder.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 10 | logo 11 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/_static/images/logo_colab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/images/logo_colab.png -------------------------------------------------------------------------------- /docs/_static/images/logo_deepnote.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/_static/locales/ar/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ar\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "بواسطة" 13 | 14 | msgid "repository" 15 | msgstr "مخزن" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "وضع ملء الشاشة" 19 | 20 | msgid "Contents" 21 | msgstr "محتويات" 22 | 23 | msgid "Download source file" 24 | msgstr "تنزيل ملف المصدر" 25 | 26 | msgid "Edit this page" 27 | msgstr "قم بتحرير هذه الصفحة" 28 | 29 | msgid "Last updated on" 30 | msgstr "آخر تحديث في" 31 | 32 | msgid "Print to PDF" 33 | msgstr "طباعة إلى PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "أقترح تحرير" 37 | 38 | msgid "Download this page" 39 | msgstr "قم بتنزيل هذه الصفحة" 40 | 41 | msgid 
"Toggle navigation" 42 | msgstr "تبديل التنقل" 43 | 44 | msgid "By the" 45 | msgstr "بواسطة" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "موضوع كتاب أبو الهول" 49 | 50 | msgid "previous page" 51 | msgstr "الصفحة السابقة" 52 | 53 | msgid "Source repository" 54 | msgstr "مستودع المصدر" 55 | 56 | msgid "open issue" 57 | msgstr "قضية مفتوحة" 58 | 59 | msgid "Download notebook file" 60 | msgstr "تنزيل ملف دفتر الملاحظات" 61 | 62 | msgid "Copyright" 63 | msgstr "حقوق النشر" 64 | 65 | msgid "Theme by the" 66 | msgstr "موضوع بواسطة" 67 | 68 | msgid "Open an issue" 69 | msgstr "افتح قضية" 70 | 71 | msgid "next page" 72 | msgstr "الصفحة التالية" 73 | 74 | msgid "Launch" 75 | msgstr "إطلاق" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/bn/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: bn\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "দ্বারা" 13 | 14 | msgid "Download source file" 15 | msgstr "উত্স ফাইল ডাউনলোড করুন" 16 | 17 | msgid "Edit this page" 18 | msgstr "এই পৃষ্ঠাটি সম্পাদনা করুন" 19 | 20 | msgid "Last updated on" 21 | msgstr "সর্বশেষ আপডেট" 22 | 23 | msgid "Print to PDF" 24 | msgstr "পিডিএফ প্রিন্ট করুন" 25 | 26 | msgid "Download this page" 27 | msgstr "এই পৃষ্ঠাটি ডাউনলোড করুন" 28 | 29 | msgid "Toggle navigation" 30 | msgstr "নেভিগেশন টগল করুন" 31 | 32 | msgid "By the" 33 | msgstr "দ্বারা" 34 | 35 | msgid "Sphinx Book Theme" 36 | msgstr "স্পিনিক্স বুক থিম" 37 | 38 | msgid "previous page" 39 | msgstr "আগের পৃষ্ঠা" 40 | 41 | msgid "Source repository" 42 | msgstr "উত্স সংগ্রহস্থল" 43 | 44 | msgid "open issue" 45 | msgstr "খোলা সমস্যা" 46 | 47 | msgid "Download notebook file" 48 | msgstr "নোটবুক ফাইল 
ডাউনলোড করুন" 49 | 50 | msgid "Copyright" 51 | msgstr "কপিরাইট" 52 | 53 | msgid "Theme by the" 54 | msgstr "থিম দ্বারা" 55 | 56 | msgid "Open an issue" 57 | msgstr "একটি সমস্যা খুলুন" 58 | 59 | msgid "next page" 60 | msgstr "পরবর্তী পৃষ্ঠা" 61 | 62 | msgid "Launch" 63 | msgstr "শুরু করা" 64 | -------------------------------------------------------------------------------- /docs/_static/locales/ca/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ca\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Per" 13 | 14 | msgid "Download source file" 15 | msgstr "Baixeu el fitxer font" 16 | 17 | msgid "Edit this page" 18 | msgstr "Editeu aquesta pàgina" 19 | 20 | msgid "Last updated on" 21 | msgstr "Darrera actualització el" 22 | 23 | msgid "Print to PDF" 24 | msgstr "Imprimeix a PDF" 25 | 26 | msgid "suggest edit" 27 | msgstr "suggerir edició" 28 | 29 | msgid "Download this page" 30 | msgstr "Descarregueu aquesta pàgina" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "Commuta la navegació" 34 | 35 | msgid "By the" 36 | msgstr "Per la" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "Tema del llibre Esfinx" 40 | 41 | msgid "previous page" 42 | msgstr "Pàgina anterior" 43 | 44 | msgid "Source repository" 45 | msgstr "Dipòsit de fonts" 46 | 47 | msgid "open issue" 48 | msgstr "número obert" 49 | 50 | msgid "Download notebook file" 51 | msgstr "Descarregar fitxer de quadern" 52 | 53 | msgid "Copyright" 54 | msgstr "Copyright" 55 | 56 | msgid "Theme by the" 57 | msgstr "Tema del" 58 | 59 | msgid "Open an issue" 60 | msgstr "Obriu un número" 61 | 62 | msgid "next page" 63 | msgstr "pàgina següent" 64 | 65 | msgid "Launch" 66 | msgstr "Llançament" 67 | 
-------------------------------------------------------------------------------- /docs/_static/locales/da/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: da\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Ved" 13 | 14 | msgid "repository" 15 | msgstr "lager" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "Fuldskærmstilstand" 19 | 20 | msgid "Contents" 21 | msgstr "Indhold" 22 | 23 | msgid "Download source file" 24 | msgstr "Download kildefil" 25 | 26 | msgid "Edit this page" 27 | msgstr "Rediger denne side" 28 | 29 | msgid "Last updated on" 30 | msgstr "Sidst opdateret den" 31 | 32 | msgid "Print to PDF" 33 | msgstr "Udskriv til PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "foreslå redigering" 37 | 38 | msgid "Download this page" 39 | msgstr "Download denne side" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "Skift navigation" 43 | 44 | msgid "By the" 45 | msgstr "Ved" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "Sphinx bogtema" 49 | 50 | msgid "previous page" 51 | msgstr "forrige side" 52 | 53 | msgid "Source repository" 54 | msgstr "Kildelager" 55 | 56 | msgid "open issue" 57 | msgstr "åbent nummer" 58 | 59 | msgid "Download notebook file" 60 | msgstr "Download notesbog-fil" 61 | 62 | msgid "Copyright" 63 | msgstr "ophavsret" 64 | 65 | msgid "Theme by the" 66 | msgstr "Tema af" 67 | 68 | msgid "Open an issue" 69 | msgstr "Åbn et problem" 70 | 71 | msgid "next page" 72 | msgstr "Næste side" 73 | 74 | msgid "Launch" 75 | msgstr "Start" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/iw/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | 
msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: iw\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "על ידי" 13 | 14 | msgid "repository" 15 | msgstr "מאגר" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "מצב מסך מלא" 19 | 20 | msgid "Contents" 21 | msgstr "תוכן" 22 | 23 | msgid "Download source file" 24 | msgstr "הורד את קובץ המקור" 25 | 26 | msgid "Edit this page" 27 | msgstr "ערוך דף זה" 28 | 29 | msgid "Last updated on" 30 | msgstr "עודכן לאחרונה ב" 31 | 32 | msgid "Print to PDF" 33 | msgstr "הדפס לקובץ PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "מציע לערוך" 37 | 38 | msgid "Download this page" 39 | msgstr "הורד דף זה" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "החלף ניווט" 43 | 44 | msgid "By the" 45 | msgstr "דרך" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "נושא ספר ספינקס" 49 | 50 | msgid "previous page" 51 | msgstr "עמוד קודם" 52 | 53 | msgid "Source repository" 54 | msgstr "מאגר המקורות" 55 | 56 | msgid "open issue" 57 | msgstr "בעיה פתוחה" 58 | 59 | msgid "Download notebook file" 60 | msgstr "הורד קובץ מחברת" 61 | 62 | msgid "Copyright" 63 | msgstr "זכויות יוצרים" 64 | 65 | msgid "Theme by the" 66 | msgstr "נושא מאת" 67 | 68 | msgid "Open an issue" 69 | msgstr "פתח גיליון" 70 | 71 | msgid "next page" 72 | msgstr "עמוד הבא" 73 | 74 | msgid "Launch" 75 | msgstr "לְהַשִׁיק" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/ja/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ja\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | 
msgid "By" 12 | msgstr "著者" 13 | 14 | msgid "repository" 15 | msgstr "リポジトリ" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "全画面モード" 19 | 20 | msgid "Contents" 21 | msgstr "目次" 22 | 23 | msgid "Download source file" 24 | msgstr "ソースファイルをダウンロード" 25 | 26 | msgid "Edit this page" 27 | msgstr "このページを編集" 28 | 29 | msgid "Last updated on" 30 | msgstr "最終更新日" 31 | 32 | msgid "Print to PDF" 33 | msgstr "PDFに印刷" 34 | 35 | msgid "suggest edit" 36 | msgstr "編集を提案する" 37 | 38 | msgid "Download this page" 39 | msgstr "このページをダウンロード" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "ナビゲーションを切り替え" 43 | 44 | msgid "By the" 45 | msgstr "によって" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "スフィンクスの本のテーマ" 49 | 50 | msgid "previous page" 51 | msgstr "前のページ" 52 | 53 | msgid "Source repository" 54 | msgstr "ソースリポジトリ" 55 | 56 | msgid "open issue" 57 | msgstr "未解決の問題" 58 | 59 | msgid "Download notebook file" 60 | msgstr "ノートブックファイルをダウンロード" 61 | 62 | msgid "Copyright" 63 | msgstr "Copyright" 64 | 65 | msgid "Theme by the" 66 | msgstr "のテーマ" 67 | 68 | msgid "Open an issue" 69 | msgstr "問題を報告" 70 | 71 | msgid "next page" 72 | msgstr "次のページ" 73 | 74 | msgid "Launch" 75 | msgstr "起動" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/ko/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ko\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "으로" 13 | 14 | msgid "repository" 15 | msgstr "저장소" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "전체 화면으로보기" 19 | 20 | msgid "Contents" 21 | msgstr "내용" 22 | 23 | msgid "Download source file" 24 | msgstr "소스 파일 다운로드" 25 | 26 | msgid "Edit this page" 27 | msgstr "이 페이지 편집" 28 | 29 | msgid "Last updated on" 30 | 
msgstr "마지막 업데이트" 31 | 32 | msgid "Print to PDF" 33 | msgstr "PDF로 인쇄" 34 | 35 | msgid "suggest edit" 36 | msgstr "편집 제안" 37 | 38 | msgid "Download this page" 39 | msgstr "이 페이지 다운로드" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "탐색 전환" 43 | 44 | msgid "By the" 45 | msgstr "에 의해" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "스핑크스 도서 테마" 49 | 50 | msgid "previous page" 51 | msgstr "이전 페이지" 52 | 53 | msgid "Source repository" 54 | msgstr "소스 저장소" 55 | 56 | msgid "open issue" 57 | msgstr "열린 문제" 58 | 59 | msgid "Download notebook file" 60 | msgstr "노트북 파일 다운로드" 61 | 62 | msgid "Copyright" 63 | msgstr "저작권" 64 | 65 | msgid "Theme by the" 66 | msgstr "테마별" 67 | 68 | msgid "Open an issue" 69 | msgstr "이슈 열기" 70 | 71 | msgid "next page" 72 | msgstr "다음 페이지" 73 | 74 | msgid "Launch" 75 | msgstr "시작하다" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/ml/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ml\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "എഴുതിയത്" 13 | 14 | msgid "Download source file" 15 | msgstr "ഉറവിട ഫയൽ ഡൗൺലോഡുചെയ്യുക" 16 | 17 | msgid "Edit this page" 18 | msgstr "ഈ പേജ് എഡിറ്റുചെയ്യുക" 19 | 20 | msgid "Last updated on" 21 | msgstr "അവസാനം അപ്‌ഡേറ്റുചെയ്‌തത്" 22 | 23 | msgid "Print to PDF" 24 | msgstr "PDF- ലേക്ക് പ്രിന്റുചെയ്യുക" 25 | 26 | msgid "suggest edit" 27 | msgstr "എഡിറ്റുചെയ്യാൻ നിർദ്ദേശിക്കുക" 28 | 29 | msgid "Download this page" 30 | msgstr "ഈ പേജ് ഡൗൺലോഡുചെയ്യുക" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "നാവിഗേഷൻ ടോഗിൾ ചെയ്യുക" 34 | 35 | msgid "By the" 36 | msgstr "എഴുതിയത്" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "സ്ഫിങ്ക്സ് പുസ്തക തീം" 40 | 41 | msgid "previous 
page" 42 | msgstr "മുൻപത്തെ താൾ" 43 | 44 | msgid "Source repository" 45 | msgstr "ഉറവിട ശേഖരം" 46 | 47 | msgid "open issue" 48 | msgstr "തുറന്ന പ്രശ്നം" 49 | 50 | msgid "Download notebook file" 51 | msgstr "നോട്ട്ബുക്ക് ഫയൽ ഡൺലോഡ് ചെയ്യുക" 52 | 53 | msgid "Copyright" 54 | msgstr "പകർപ്പവകാശം" 55 | 56 | msgid "Theme by the" 57 | msgstr "പ്രമേയം" 58 | 59 | msgid "Open an issue" 60 | msgstr "ഒരു പ്രശ്നം തുറക്കുക" 61 | 62 | msgid "next page" 63 | msgstr "അടുത്ത പേജ്" 64 | 65 | msgid "Launch" 66 | msgstr "സമാരംഭിക്കുക" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/mr/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: mr\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "द्वारा" 13 | 14 | msgid "Download source file" 15 | msgstr "स्त्रोत फाइल डाउनलोड करा" 16 | 17 | msgid "Edit this page" 18 | msgstr "हे पृष्ठ संपादित करा" 19 | 20 | msgid "Last updated on" 21 | msgstr "अखेरचे अद्यतनित" 22 | 23 | msgid "Print to PDF" 24 | msgstr "पीडीएफवर मुद्रित करा" 25 | 26 | msgid "suggest edit" 27 | msgstr "संपादन सुचवा" 28 | 29 | msgid "Download this page" 30 | msgstr "हे पृष्ठ डाउनलोड करा" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "नेव्हिगेशन टॉगल करा" 34 | 35 | msgid "By the" 36 | msgstr "द्वारा" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "स्फिंक्स बुक थीम" 40 | 41 | msgid "previous page" 42 | msgstr "मागील पान" 43 | 44 | msgid "Source repository" 45 | msgstr "स्त्रोत भांडार" 46 | 47 | msgid "open issue" 48 | msgstr "खुला मुद्दा" 49 | 50 | msgid "Download notebook file" 51 | msgstr "नोटबुक फाईल डाउनलोड करा" 52 | 53 | msgid "Copyright" 54 | msgstr "कॉपीराइट" 55 | 56 | msgid "Theme by the" 57 | msgstr "द्वारा थीम" 58 | 
59 | msgid "Open an issue" 60 | msgstr "एक मुद्दा उघडा" 61 | 62 | msgid "next page" 63 | msgstr "पुढील पृष्ठ" 64 | 65 | msgid "Launch" 66 | msgstr "लाँच करा" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/ms/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ms\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Oleh" 13 | 14 | msgid "Download source file" 15 | msgstr "Muat turun fail sumber" 16 | 17 | msgid "Edit this page" 18 | msgstr "Edit halaman ini" 19 | 20 | msgid "Last updated on" 21 | msgstr "Terakhir dikemas kini pada" 22 | 23 | msgid "Print to PDF" 24 | msgstr "Cetak ke PDF" 25 | 26 | msgid "suggest edit" 27 | msgstr "cadangkan edit" 28 | 29 | msgid "Download this page" 30 | msgstr "Muat turun halaman ini" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "Togol navigasi" 34 | 35 | msgid "By the" 36 | msgstr "Oleh" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "Tema Buku Sphinx" 40 | 41 | msgid "previous page" 42 | msgstr "halaman sebelumnya" 43 | 44 | msgid "Source repository" 45 | msgstr "Repositori sumber" 46 | 47 | msgid "open issue" 48 | msgstr "isu terbuka" 49 | 50 | msgid "Download notebook file" 51 | msgstr "Muat turun fail buku nota" 52 | 53 | msgid "Copyright" 54 | msgstr "hak cipta" 55 | 56 | msgid "Theme by the" 57 | msgstr "Tema oleh" 58 | 59 | msgid "Open an issue" 60 | msgstr "Buka masalah" 61 | 62 | msgid "next page" 63 | msgstr "muka surat seterusnya" 64 | 65 | msgid "Launch" 66 | msgstr "Lancarkan" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/ta/LC_MESSAGES/booktheme.po: 
-------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ta\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "வழங்கியவர்" 13 | 14 | msgid "Download source file" 15 | msgstr "மூல கோப்பைப் பதிவிறக்குக" 16 | 17 | msgid "Edit this page" 18 | msgstr "இந்தப் பக்கத்தைத் திருத்தவும்" 19 | 20 | msgid "Last updated on" 21 | msgstr "கடைசியாக புதுப்பிக்கப்பட்டது" 22 | 23 | msgid "Print to PDF" 24 | msgstr "PDF இல் அச்சிடுக" 25 | 26 | msgid "suggest edit" 27 | msgstr "திருத்த பரிந்துரைக்கவும்" 28 | 29 | msgid "Download this page" 30 | msgstr "இந்தப் பக்கத்தைப் பதிவிறக்கவும்" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "வழிசெலுத்தலை நிலைமாற்று" 34 | 35 | msgid "By the" 36 | msgstr "மூலம்" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "ஸ்பிங்க்ஸ் புத்தக தீம்" 40 | 41 | msgid "previous page" 42 | msgstr "முந்தைய பக்கம்" 43 | 44 | msgid "Source repository" 45 | msgstr "மூல களஞ்சியம்" 46 | 47 | msgid "open issue" 48 | msgstr "திறந்த பிரச்சினை" 49 | 50 | msgid "Download notebook file" 51 | msgstr "நோட்புக் கோப்பைப் பதிவிறக்கவும்" 52 | 53 | msgid "Copyright" 54 | msgstr "பதிப்புரிமை" 55 | 56 | msgid "Theme by the" 57 | msgstr "வழங்கிய தீம்" 58 | 59 | msgid "Open an issue" 60 | msgstr "சிக்கலைத் திறக்கவும்" 61 | 62 | msgid "next page" 63 | msgstr "அடுத்த பக்கம்" 64 | 65 | msgid "Launch" 66 | msgstr "தொடங்க" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/te/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | 
"Language: te\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "ద్వారా" 13 | 14 | msgid "Download source file" 15 | msgstr "మూల ఫైల్‌ను డౌన్‌లోడ్ చేయండి" 16 | 17 | msgid "Edit this page" 18 | msgstr "ఈ పేజీని సవరించండి" 19 | 20 | msgid "Last updated on" 21 | msgstr "చివరిగా నవీకరించబడింది" 22 | 23 | msgid "Print to PDF" 24 | msgstr "PDF కి ముద్రించండి" 25 | 26 | msgid "suggest edit" 27 | msgstr "సవరించమని సూచించండి" 28 | 29 | msgid "Download this page" 30 | msgstr "ఈ పేజీని డౌన్‌లోడ్ చేయండి" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "నావిగేషన్‌ను టోగుల్ చేయండి" 34 | 35 | msgid "By the" 36 | msgstr "ద్వారా" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "సింహిక పుస్తక థీమ్" 40 | 41 | msgid "previous page" 42 | msgstr "ముందు పేజి" 43 | 44 | msgid "Source repository" 45 | msgstr "మూల రిపోజిటరీ" 46 | 47 | msgid "open issue" 48 | msgstr "ఓపెన్ ఇష్యూ" 49 | 50 | msgid "Download notebook file" 51 | msgstr "నోట్బుక్ ఫైల్ను డౌన్లోడ్ చేయండి" 52 | 53 | msgid "Copyright" 54 | msgstr "కాపీరైట్" 55 | 56 | msgid "Theme by the" 57 | msgstr "ద్వారా థీమ్" 58 | 59 | msgid "Open an issue" 60 | msgstr "సమస్యను తెరవండి" 61 | 62 | msgid "next page" 63 | msgstr "తరువాతి పేజీ" 64 | 65 | msgid "Launch" 66 | msgstr "ప్రారంభించండి" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/th/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: th\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "โดย" 13 | 14 | msgid "repository" 15 | msgstr "ที่เก็บ" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "โหมดเต็มหน้าจอ" 19 | 20 | msgid "Contents" 21 | msgstr "สารบัญ" 22 | 23 | msgid "Download source file" 24 | 
msgstr "ดาวน์โหลดไฟล์ต้นฉบับ" 25 | 26 | msgid "Edit this page" 27 | msgstr "แก้ไขหน้านี้" 28 | 29 | msgid "Last updated on" 30 | msgstr "ปรับปรุงล่าสุดเมื่อ" 31 | 32 | msgid "Print to PDF" 33 | msgstr "พิมพ์เป็น PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "แนะนำแก้ไข" 37 | 38 | msgid "Download this page" 39 | msgstr "ดาวน์โหลดหน้านี้" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "ไม่ต้องสลับช่องทาง" 43 | 44 | msgid "By the" 45 | msgstr "โดย" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "ธีมหนังสือสฟิงซ์" 49 | 50 | msgid "previous page" 51 | msgstr "หน้าที่แล้ว" 52 | 53 | msgid "Source repository" 54 | msgstr "ที่เก็บซอร์ส" 55 | 56 | msgid "open issue" 57 | msgstr "เปิดปัญหา" 58 | 59 | msgid "Download notebook file" 60 | msgstr "ดาวน์โหลดไฟล์สมุดบันทึก" 61 | 62 | msgid "Copyright" 63 | msgstr "ลิขสิทธิ์" 64 | 65 | msgid "Theme by the" 66 | msgstr "ธีมโดย" 67 | 68 | msgid "Open an issue" 69 | msgstr "เปิดปัญหา" 70 | 71 | msgid "next page" 72 | msgstr "หน้าต่อไป" 73 | 74 | msgid "Launch" 75 | msgstr "เปิด" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/tl/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: tl\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "Ni" 13 | 14 | msgid "Download source file" 15 | msgstr "Mag-download ng file ng pinagmulan" 16 | 17 | msgid "Edit this page" 18 | msgstr "I-edit ang pahinang ito" 19 | 20 | msgid "Last updated on" 21 | msgstr "Huling na-update noong" 22 | 23 | msgid "Print to PDF" 24 | msgstr "I-print sa PDF" 25 | 26 | msgid "suggest edit" 27 | msgstr "iminumungkahi i-edit" 28 | 29 | msgid "Download this page" 30 | msgstr "I-download ang pahinang ito" 31 | 32 | msgid 
"Toggle navigation" 33 | msgstr "I-toggle ang pag-navigate" 34 | 35 | msgid "By the" 36 | msgstr "Sa pamamagitan ng" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "Tema ng Sphinx Book" 40 | 41 | msgid "previous page" 42 | msgstr "Nakaraang pahina" 43 | 44 | msgid "Source repository" 45 | msgstr "Pinagmulan ng imbakan" 46 | 47 | msgid "open issue" 48 | msgstr "bukas na isyu" 49 | 50 | msgid "Download notebook file" 51 | msgstr "Mag-download ng file ng notebook" 52 | 53 | msgid "Copyright" 54 | msgstr "Copyright" 55 | 56 | msgid "Theme by the" 57 | msgstr "Tema ng" 58 | 59 | msgid "Open an issue" 60 | msgstr "Magbukas ng isyu" 61 | 62 | msgid "next page" 63 | msgstr "Susunod na pahina" 64 | 65 | msgid "Launch" 66 | msgstr "Ilunsad" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/ur/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: ur\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "بذریعہ" 13 | 14 | msgid "Download source file" 15 | msgstr "سورس فائل ڈاؤن لوڈ کریں" 16 | 17 | msgid "Edit this page" 18 | msgstr "اس صفحے میں ترمیم کریں" 19 | 20 | msgid "Last updated on" 21 | msgstr "آخری بار تازہ کاری ہوئی" 22 | 23 | msgid "Print to PDF" 24 | msgstr "پی ڈی ایف پرنٹ کریں" 25 | 26 | msgid "suggest edit" 27 | msgstr "ترمیم کی تجویز کریں" 28 | 29 | msgid "Download this page" 30 | msgstr "اس صفحے کو ڈاؤن لوڈ کریں" 31 | 32 | msgid "Toggle navigation" 33 | msgstr "نیویگیشن ٹوگل کریں" 34 | 35 | msgid "By the" 36 | msgstr "کی طرف" 37 | 38 | msgid "Sphinx Book Theme" 39 | msgstr "سپنکس بک تھیم" 40 | 41 | msgid "previous page" 42 | msgstr "سابقہ ​​صفحہ" 43 | 44 | msgid "Source repository" 45 | msgstr "ماخذ ذخیرہ" 46 | 47 | 
msgid "open issue" 48 | msgstr "کھلا مسئلہ" 49 | 50 | msgid "Download notebook file" 51 | msgstr "نوٹ بک فائل ڈاؤن لوڈ کریں" 52 | 53 | msgid "Copyright" 54 | msgstr "کاپی رائٹ" 55 | 56 | msgid "Theme by the" 57 | msgstr "کے ذریعہ تھیم" 58 | 59 | msgid "Open an issue" 60 | msgstr "ایک مسئلہ کھولیں" 61 | 62 | msgid "next page" 63 | msgstr "اگلا صفحہ" 64 | 65 | msgid "Launch" 66 | msgstr "لانچ کریں" 67 | -------------------------------------------------------------------------------- /docs/_static/locales/zh_CN/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: zh_CN\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "作者:" 13 | 14 | msgid "repository" 15 | msgstr "仓库" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "全屏模式" 19 | 20 | msgid "Contents" 21 | msgstr "目录" 22 | 23 | msgid "Download source file" 24 | msgstr "下载源文件" 25 | 26 | msgid "Edit this page" 27 | msgstr "编辑此页面" 28 | 29 | msgid "Last updated on" 30 | msgstr "上次更新时间:" 31 | 32 | msgid "Print to PDF" 33 | msgstr "列印成 PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "提出修改建议" 37 | 38 | msgid "Download this page" 39 | msgstr "下载此页面" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "显示或隐藏导航栏" 43 | 44 | msgid "By the" 45 | msgstr "作者:" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "Sphinx Book 主题" 49 | 50 | msgid "previous page" 51 | msgstr "上一页" 52 | 53 | msgid "Source repository" 54 | msgstr "源码库" 55 | 56 | msgid "open issue" 57 | msgstr "创建议题" 58 | 59 | msgid "Download notebook file" 60 | msgstr "下载笔记本文件" 61 | 62 | msgid "Copyright" 63 | msgstr "版权" 64 | 65 | msgid "Theme by the" 66 | msgstr "主题作者:" 67 | 68 | msgid "Open an issue" 69 | msgstr "创建议题" 70 | 71 | msgid "next page" 72 | msgstr "下一页" 73 | 74 | msgid "Launch" 
75 | msgstr "启动" 76 | -------------------------------------------------------------------------------- /docs/_static/locales/zh_TW/LC_MESSAGES/booktheme.po: -------------------------------------------------------------------------------- 1 | 2 | msgid "" 3 | msgstr "" 4 | "Project-Id-Version: Sphinx-Book-Theme\n" 5 | "MIME-Version: 1.0\n" 6 | "Content-Type: text/plain; charset=UTF-8\n" 7 | "Content-Transfer-Encoding: 8bit\n" 8 | "Language: zh_TW\n" 9 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 10 | 11 | msgid "By" 12 | msgstr "作者:" 13 | 14 | msgid "repository" 15 | msgstr "儲存庫" 16 | 17 | msgid "Fullscreen mode" 18 | msgstr "全螢幕模式" 19 | 20 | msgid "Contents" 21 | msgstr "目錄" 22 | 23 | msgid "Download source file" 24 | msgstr "下載原始檔" 25 | 26 | msgid "Edit this page" 27 | msgstr "編輯此頁面" 28 | 29 | msgid "Last updated on" 30 | msgstr "最後更新時間:" 31 | 32 | msgid "Print to PDF" 33 | msgstr "列印成 PDF" 34 | 35 | msgid "suggest edit" 36 | msgstr "提出修改建議" 37 | 38 | msgid "Download this page" 39 | msgstr "下載此頁面" 40 | 41 | msgid "Toggle navigation" 42 | msgstr "顯示或隱藏導覽列" 43 | 44 | msgid "By the" 45 | msgstr "作者:" 46 | 47 | msgid "Sphinx Book Theme" 48 | msgstr "Sphinx Book 佈景主題" 49 | 50 | msgid "previous page" 51 | msgstr "上一頁" 52 | 53 | msgid "Source repository" 54 | msgstr "來源儲存庫" 55 | 56 | msgid "open issue" 57 | msgstr "公開的問題" 58 | 59 | msgid "Download notebook file" 60 | msgstr "下載 Notebook 檔案" 61 | 62 | msgid "Copyright" 63 | msgstr "Copyright" 64 | 65 | msgid "Theme by the" 66 | msgstr "佈景主題作者:" 67 | 68 | msgid "Open an issue" 69 | msgstr "開啟議題" 70 | 71 | msgid "next page" 72 | msgstr "下一頁" 73 | 74 | msgid "Launch" 75 | msgstr "啟動" 76 | -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/minus.png 
-------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/_static/sbt-webpack-macros.html: -------------------------------------------------------------------------------- 1 | 5 | {% macro head_pre_bootstrap() %} 6 | 7 | {% endmacro %} 8 | 9 | {% macro body_post() %} 10 | 11 | {% endmacro %} 12 | -------------------------------------------------------------------------------- /docs/_static/styles/theme.css: -------------------------------------------------------------------------------- 1 | /* Provided by Sphinx's 'basic' theme, and included in the final set of assets */ 2 | @import "../basic.css"; 3 | -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2 -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2 -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2 -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.ttf -------------------------------------------------------------------------------- /docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/_static/vendor/fontawesome/6.1.2/webfonts/fa-v4compatibility.woff2 -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/objects.inv -------------------------------------------------------------------------------- /docs/source/CONCEPTS/data_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Data Parallelism 2 | - Authors: Jinwon Kim 3 | 4 | **Data Parallelism** is a widely-used technique for training deep learning models in parallel. 
It involves distributing the training data across multiple processing units, such as GPUs, each of which has a copy of the model parameters. The data is divided into subsets, and each unit independently computes the gradients for its subset. The gradients are then aggregated to update the model parameters. This approach enables efficient parallelization of the training process and can accelerate the training of deep learning models on large datasets. 5 | 6 | Oslo supports Zero Redundancy Optimizer (ZeRO) to easily scale deep learning model. 7 | 8 | ## Optimizer-Level Parallel 9 | - [Zero Redundancy Optimizer DP](dp/zero_algorithm.md) 10 | 11 | 12 | ### References 13 | - [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054) 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/source/CONCEPTS/dp/zero_algorithm.md: -------------------------------------------------------------------------------- 1 | # Zero Redundancy Optimizer DP 2 | - Authors: Jinwon Kim 3 | - Paper: https://arxiv.org/abs/1910.02054 4 | 5 | ![figure1.png](zero_image/figure1.png) 6 | 7 | The Zero Redundancy Optimizer for Data Parallelism (ZeRO-DP) is a technique used to remove memory state redundancies and optimize computational efficiency in data parallel distributed deep learning. ZeRO-DP partitions the model states across data-parallel processes, eliminating the need for replication of model parameters, which in turn reduces memory usage and communication overhead during training. 8 | 9 | ## Optimizer State Partitioning (Level 1) 10 | - The optimizer states are partitioned across data parallel processes 11 | ## Gradient Partitioning (Level 2) 12 | - The reduced gradients are partitioned based on the corresponding parameter and are reduced only by the data parallel process responsible for updating those parameters. After the reduction, the memory can be released. 
13 | ## Parameter Partitioning (Level 3) 14 | - Similar to the optimizer states and gradients, each process only stores the parameters associated with its partition. 15 | 16 | -------------------------------------------------------------------------------- /docs/source/CONCEPTS/dp/zero_image/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/dp/zero_image/figure1.png -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tensor_model_parallelism.md: -------------------------------------------------------------------------------- 1 | # Concept of Tensor Model Parallelism 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | 4 | **Tensor Model Parallelism** makes it possible to train larger models by partitioning the parameter tensors into multiple dimensions. 5 | We support 1D, 2D, 2.5D, and 3D tensor partitioning algorithms which make tensor parallel training more efficient. 
6 | 7 | ## Tensor Parallel Algorithms 8 | - [1D parallel algorithm (same as Megatron-LM)](tp/1d_parallel_algorithm.md) 9 | - [2D parallel (SUMMA) algorithm](tp/2d_parallel_algorithm.md) 10 | - [2.5D parallel (SUMMA-2.5) algorithm](tp/2p5d_parallel_algorithm.md) 11 | - [3D parallel Algorithm](tp/3d_parallel_algorithm.md) 12 | 13 | ### References 14 | - [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 15 | - [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343) 16 | - [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500) 17 | - [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450) 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/1d_image/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/1d_image/98C5FDF3-0DB1-4A2F-8E99-F0EFFB453B0B.jpeg -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/2d_image/2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/2d_image/2d.png -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/2d_parallel_algorithm.md: -------------------------------------------------------------------------------- 1 | # 2D parallel (SUMMA) algorithm 2 | - Authors: Kichang Yang, Kevin Ko, Minho Ryu 3 | - Paper : [https://arxiv.org/pdf/2104.05343.pdf](https://arxiv.org/pdf/2104.05343.pdf) 4 | 5 | ![image.png](2d_image/2d.png) 6 | 7 | The use of 1D 
tensor parallelism can lead to high memory consumption in large-scale models because it does not partition activations. 8 | To address this issue, a more efficient 2D tensor parallelism algorithm based on SUMMA was introduced. This algorithm evenly distributes computation and memory load. 9 | For instance, when computing a linear layer $Y = XA$, the input $X$ and weight $A$ are split into four sub-matrices and the calculation is done in two steps, broadcasting rows and columns of $X$ and $A$ in turn. 10 | The result is a matrix $Y$ that is the product of $X$ and $A$. 11 | 12 | ## Usage 13 | 14 | Use `ParallelMode.TENSOR_2D` as a parameter of `tensor_parallel_mode`. Since the algorithm splits model along both rows and columns, `tp_size` should be a **square of positive integer**. 15 | 16 | ```python 17 | from oslo import ParallelContext, ParallelMode 18 | from oslo.torch.nn.parallel import TensorParallel 19 | 20 | tp_size = 4 21 | tp_depth = 1 22 | 23 | parallel_context = ParallelContext.from_torch( 24 | data_parallel_size=1, 25 | pipeline_parallel_size=1, 26 | tensor_parallel_size=tp_size, 27 | tensor_parallel_mode=ParallelMode.TENSOR_2D, 28 | ) 29 | model = TensorParallel(model, parallel_context) 30 | oslo.ready(model, parallel_context) 31 | ``` -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/2p5d_image/2p5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/2p5d_image/2p5d.png -------------------------------------------------------------------------------- /docs/source/CONCEPTS/tp/3d_image/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/CONCEPTS/tp/3d_image/E4D02BEB-A5BB-461D-9B62-213A61DB5B74.jpeg -------------------------------------------------------------------------------- /docs/source/TUTORIALS/image/260461C3-EA3B-405C-9B34-05BA3C781161.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/TUTORIALS/image/260461C3-EA3B-405C-9B34-05BA3C781161.png -------------------------------------------------------------------------------- /docs/source/TUTORIALS/image/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/docs/source/TUTORIALS/image/figure1.png -------------------------------------------------------------------------------- /gcc_install.sh: -------------------------------------------------------------------------------- 1 | #~/bin/bash 2 | # install for gcc 3 | yum install libaio-devel -y 4 | yum install centos-release-scl -y 5 | yum-config-manager --enable rhel-server-rhscl-7-rpms -y 6 | yum install devtoolset-8 -y 7 | yum install llvm-toolset-7 -y 8 | sudo yum -y install llvm-toolset-7-clang-analyzer llvm-toolset-7-clang-tools-extra 9 | sudo yum -y install pdsh 10 | scl enable devtoolset-8 llvm-toolset-7 bash 11 | -------------------------------------------------------------------------------- /oslo/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.distributed import ParallelContext, ParallelMode 2 | from oslo.torch.utils.extensions import ready_torch 3 | 4 | 5 | def ready(model, parallel_context: ParallelContext): 6 | ready_torch(model, parallel_context) 7 | -------------------------------------------------------------------------------- /oslo/__version__.py: 
-------------------------------------------------------------------------------- 1 | version = "3.0.0" 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.0.1" 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | add_executable(bert_example bert_example.cc) 4 | target_link_libraries(bert_example PUBLIC liblightseq) 5 | 6 | add_executable(transformer_example transformer_example.cc) 7 | target_link_libraries(transformer_example PUBLIC liblightseq) 8 | 9 | add_executable(gpt_example gpt_example.cc) 10 | target_link_libraries(gpt_example PUBLIC liblightseq) 11 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | set(lightseq_kernel_files gemm.cc utils.cc) 5 | 6 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 7 | target_include_directories(lightseq_kernels INTERFACE includes) 8 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/gemm.cc: -------------------------------------------------------------------------------- 1 | #include "kernel_headers.h" 2 | 3 | namespace lightseq { 4 | namespace arm {} // namespace arm 5 | } // namespace lightseq 6 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "utils.h" 14 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/arm/includes/utils.h: -------------------------------------------------------------------------------- 1 | #include "cstdio" 2 | #include "iostream" 3 | 4 | namespace lightseq { 5 | 6 | template 7 | void print_vec(const T *outv, std::string outn, int num_output_ele); 8 | } 9 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(cuda_kernel_files 4 | util.cc.cu 5 | cross_entropy.cu 6 | cublas_wrappers.cu 7 | cuda_util.cu 8 | dropout_kernels.cu 9 | embedding_kernels.cu 10 | embKernels.cc.cu 11 | # fused_adam_kernel.cu 12 | general_kernels.cu 13 | gptKernels.cc.cu 14 | normalize_kernels.cu 15 | softmax_kernels.cu 16 | softmax_kernels_new.cu 17 | transform_kernels.cu 18 | transform_kernels_new.cu 19 | crf.cu 20 | transformerKernels.cc.cu) 21 | 22 | add_library(lightseq_kernels STATIC ${cuda_kernel_files}) 23 | target_link_libraries(lightseq_kernels PUBLIC -lcublas) 24 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/cuda/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "cublas_wrappers.h" 18 | #include "cuda_util.h" 19 | #include "embKernels.h" 20 | #include "gptKernels.h" 21 | #include "kernels.h" 22 | #include "transformerKernels.h" 23 | 
-------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/cuda/includes/ls_cub.cuh: -------------------------------------------------------------------------------- 1 | // copied from https://github.com/dmlc/dgl/pull/2758 2 | #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_ 3 | #define DGL_ARRAY_CUDA_DGL_CUB_CUH_ 4 | 5 | #define CUB_NS_PREFIX namespace ls { 6 | #define CUB_NS_POSTFIX } 7 | #define CUB_NS_QUALIFIER ::ls::cub 8 | #include "cub/cub.cuh" 9 | #include "cub/util_allocator.cuh" 10 | #undef CUB_NS_POSTFIX 11 | #undef CUB_NS_PREFIX 12 | #undef CUB_NS_QUALIFIER 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/x86/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | 3 | cmake_minimum_required(VERSION 3.18) 4 | 5 | set(lightseq_kernel_files util.cc gemm.cpp) 6 | 7 | add_library(lightseq_kernels STATIC ${lightseq_kernel_files}) 8 | target_include_directories(lightseq_kernels PUBLIC ${HDF5_INCLUDE_DIRS}) 9 | target_include_directories(lightseq_kernels INTERFACE includes) 10 | target_link_libraries(lightseq_kernels PRIVATE ${HDF5_LIBRARIES}) 11 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/x86/includes/kernel_headers.h: -------------------------------------------------------------------------------- 1 | #include "kernels.h" 2 | #include "util.h" 3 | #include 4 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/kernels/x86/includes/kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cstdio" 3 | #include "util.h" 4 | 5 | namespace lightseq { 6 | namespace x86 { 7 | 8 | template 9 | void matrix_gemm(const InType *inpA, const InType *inpB, OutType *outC, int m, 
10 | int n, int k); 11 | 12 | template 13 | void gemm(bool a_is_packed, bool b_is_packed, bool transpose_a, 14 | bool transpose_b, int64_t m, int64_t n, int64_t k, float alpha, 15 | const AType *a, int64_t lda, const BType *b, int64_t ldb, float beta, 16 | CType *c, int64_t ldc, const CType *a_shift_compensation = nullptr); 17 | 18 | } // namespace x86 19 | } // namespace lightseq 20 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers/includes/cross_entropy_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | template class CrossEntropyLayer { 13 | public: 14 | CrossEntropyLayer(float epsilon, int padding_idx, int max_batch_tokens); 15 | 16 | virtual ~CrossEntropyLayer(); 17 | 18 | void Forward(const T *inputs_ptr, const int *targets_ptr, float *outputs_ptr, 19 | float *nll_loss_ptr); 20 | 21 | void Backward(const float *grad_outputs_ptr, const T *inputs_ptr, 22 | const int *targets_ptr, T *grad_inputs_ptr); 23 | 24 | void set_cur_batch_shape(int batch_size, int seq_len, int vocab_size); 25 | 26 | private: 27 | void allocate_mem_buffer() { 28 | // allocate local gpu memory 29 | _loss_buffer = cuda_malloc(_max_batch_tokens * 2); 30 | } 31 | 32 | void free_mem_buffer() { 33 | // free local gpu memory 34 | cuda_free(_loss_buffer); 35 | } 36 | 37 | const int _padding_idx; 38 | const float _epsilon; 39 | const int _max_batch_tokens; 40 | 41 | size_t _batch_size; 42 | size_t _seq_len; 43 | size_t _vocab_size; 44 | 45 | float *_loss_buffer; 46 | }; 47 | } // namespace cuda 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
set(layers_files 2 | feed_forward_layer.cpp 3 | linear_layer.cpp 4 | generator_layer.cpp 5 | gpt_attention_layer.cpp 6 | gpt_layer.cpp 7 | multihead_attention_layer.cpp 8 | transformer_encoder_layer.cpp 9 | dec_enc_attention_layer.cpp 10 | dec_self_attention_layer.cpp 11 | transformer_decoder_layer.cpp 12 | crf_layer.cpp 13 | encdec_kv_layer.cpp 14 | sample_layer.cpp 15 | sdpa_layer.cpp) 16 | 17 | add_library(lightseq_layers STATIC ${layers_files}) 18 | target_link_libraries(lightseq_layers PUBLIC lightseq_operators lsflow) 19 | target_include_directories(lightseq_layers PUBLIC includes) 20 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/crf_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "crf.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template class CRFLayer : public Layer { 9 | private: 10 | // operators 11 | CRFOP *_crf_op = nullptr; 12 | 13 | // parameters 14 | Variable *_linear_b; 15 | Variable *_start_transition; 16 | Variable *_end_transition; 17 | Variable *_transition; 18 | 19 | // shape related 20 | int _num_tags; 21 | int _max_batch_tokens; 22 | int _max_batch_size; 23 | 24 | int _seq_len; 25 | int _batch_size; 26 | bool _forward_or_decode; // true for forward, false for decode 27 | bool _output_decode_score; // true for output decode score 28 | 29 | public: 30 | CRFLayer(int num_tags, int max_batch_tokens, int max_batch_size); 31 | 32 | virtual ~CRFLayer() {} 33 | 34 | Variable *operator()(Variable *emission, Variable *mask); 35 | 36 | void before_forward(int batch_size, int seq_len, bool forward_or_decode, 37 | bool output_decode_score); 38 | 39 | int load_params(const std::vector ¶_vec, int offset); 40 | }; 41 | 42 | template class CRFLayer; 43 | #ifdef LIGHTSEQ_cuda 44 | template class CRFLayer<__half>; 45 | #endif 46 | 47 | template using CRFLayerPtr = std::shared_ptr>; 48 | } // 
namespace lightseq 49 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/encdec_kv_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bias_add_transform_20314.h" 3 | #include "layer.h" 4 | #include "linear.h" 5 | 6 | namespace lightseq { 7 | 8 | template class EncDecKvLayer : public Layer { 9 | private: 10 | LinearOp *_kv_linear = nullptr; 11 | BiasAddTrans20314 *_bias_add_transform_20314 = nullptr; 12 | 13 | // parameters 14 | Variable *_enc_kvw; 15 | Variable *_enc_kvb; 16 | 17 | // shape related 18 | size_t _layer_id; 19 | size_t _nshared_layer; 20 | size_t _batch_tokens; 21 | size_t _max_batch_tokens; 22 | size_t _hidden_size; 23 | size_t _heads; 24 | 25 | public: 26 | EncDecKvLayer(size_t nshared_layer, size_t max_batch_tokens, 27 | size_t hidden_size, size_t num_heads); 28 | 29 | virtual ~EncDecKvLayer() {} 30 | 31 | Variable *operator()(Variable *enc_out); 32 | 33 | void before_forward(size_t batch_size, size_t seq_len); 34 | 35 | size_t load_para_and_grad(const T1 *para_ptr, T2 *grad_ptr); 36 | 37 | int load_params(const std::vector ¶_vec, int offset); 38 | }; 39 | 40 | template class EncDecKvLayer; 41 | #ifdef LIGHTSEQ_cuda 42 | template class EncDecKvLayer<__half, __half>; 43 | #endif 44 | 45 | template 46 | using EncDecKvLayerPtr = std::shared_ptr>; 47 | 48 | } // namespace lightseq 49 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/gpt_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "feed_forward_layer.h" 3 | #include "gpt_attention_layer.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template class GptLayer : public Layer { 9 | private: 10 | GptAttentionLayerPtr _attn_layer; 11 | FeedForwardLayerPtr _ffn_layer; 12 | 13 | int _layer_id; 14 | 15 | 
public: 16 | GptLayer(int layer_id, int max_batch_tokens, int max_seq_len, int hidden_size, 17 | int num_heads, int intermediate_size, float attn_prob_dropout_ratio, 18 | float activation_dropout_ratio, float hidden_output_dropout_ratio, 19 | std::string activation_fn, bool mask_future_tokens, 20 | int beam_size = 1); 21 | virtual ~GptLayer() {} 22 | 23 | Variable *operator()(Variable *inp, Variable *cache_k, Variable *cache_v, 24 | Variable *pad_mask); 25 | 26 | void before_forward(int batch_size, int seq_len, int steps) { 27 | _attn_layer->before_forward(batch_size, seq_len, steps); 28 | _ffn_layer->before_forward(batch_size, seq_len); 29 | } 30 | 31 | size_t load_para_and_grad(const T1 *para_ptr, T2 *grad_ptr); 32 | 33 | int load_params(const std::vector ¶_vec, int offset); 34 | }; 35 | 36 | template class GptLayer; 37 | #ifdef LIGHTSEQ_cuda 38 | template class GptLayer<__half, __half>; 39 | #endif 40 | 41 | template 42 | using GptLayerPtr = std::shared_ptr>; 43 | 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/linear_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "layer.h" 4 | #include "linear.h" 5 | 6 | namespace lightseq { 7 | 8 | template class LinearLayer : public Layer { 9 | private: 10 | // operators 11 | LinearOp *_linear = nullptr; 12 | 13 | // parameters 14 | Variable *_linear_w; 15 | 16 | // shape related 17 | int _max_batch_tokens; 18 | size_t _input_size; 19 | size_t _output_size; 20 | 21 | public: 22 | LinearLayer(int max_batch_tokens, int input_size, int output_size, 23 | MATRIX_OP opA = MATRIX_OP::Transpose, 24 | MATRIX_OP opB = MATRIX_OP::NonTranspose, float alpha = float(1.)); 25 | 26 | virtual ~LinearLayer() {} 27 | 28 | Variable *operator()(Variable *inp); 29 | 30 | void before_forward(int batch_size, int seq_len); 31 | 32 | void before_backward(); 33 | 34 | 
size_t load_para_and_grad(const T1 *para_ptr, T2 *grad_ptr); 35 | 36 | int load_params(const std::vector ¶_vec, int offset); 37 | }; 38 | 39 | template class LinearLayer; 40 | #ifdef LIGHTSEQ_cuda 41 | template class LinearLayer<__half, __half>; 42 | #endif 43 | 44 | template 45 | using LinearLayerPtr = std::shared_ptr>; 46 | 47 | } // namespace lightseq 48 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/sample_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "beam_search_topk.h" 4 | #include "layer.h" 5 | 6 | namespace lightseq { 7 | 8 | template class SampleLayer : public Layer { 9 | private: 10 | // operators 11 | BeamSearchTopOp *_beam_search = nullptr; 12 | 13 | // parameters 14 | Variable *_logit_bias; 15 | size_t _trg_vocab_size; 16 | 17 | public: 18 | SampleLayer(int nshared_layer, int max_batch_size, int max_step, 19 | int trg_vocab_size, int hidden_size, int max_thread_per_block, 20 | int beam_size, int diverse_lambda, int dim_per_head, int end_id, 21 | int head_num, 22 | float length_penalty); // for beam_search 23 | 24 | virtual ~SampleLayer() {} 25 | 26 | std::tuple operator()(Variable *logits, 27 | Variable *alive_seq); 28 | 29 | void before_forward(int batch_size, int cur_step); 30 | 31 | int load_params(const std::vector ¶_vec, int offset); 32 | 33 | bool is_stop() { return _beam_search->is_stop(); } 34 | }; 35 | 36 | template class SampleLayer; 37 | #ifdef LIGHTSEQ_cuda 38 | template class SampleLayer<__half>; 39 | #endif 40 | 41 | template using SampleLayerPtr = std::shared_ptr>; 42 | 43 | } // namespace lightseq 44 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/layers_new/includes/sdpa_layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "dropout.h" 3 | #include "layer.h" 4 | 
#include "softmax.h" 5 | #include "strided_batch_gemm.h" 6 | 7 | namespace lightseq { 8 | 9 | /* 10 | Scaled Dot Product Attention 11 | See paper "Attention is all you need" for details. 12 | */ 13 | template class SDPALayer : public Layer { 14 | private: 15 | // operators 16 | StridedBatchGemmOp *_attn_scores = nullptr; 17 | SoftmaxOp *_softmax = nullptr; 18 | DropoutOp *_attn_prob_dropout = nullptr; 19 | StridedBatchGemmOp *_attn_context = nullptr; 20 | 21 | // shape related 22 | int _max_batch_tokens; 23 | int _max_seq_len; 24 | int _nhead; 25 | int _head_dim; 26 | 27 | public: 28 | SDPALayer(size_t max_batch_tokens, size_t max_seq_len, size_t head_dim, 29 | size_t num_heads, float attn_prob_dropout_ratio); 30 | 31 | virtual ~SDPALayer() {} 32 | 33 | // mask is for enc-self attention and enc-dec-cross attention 34 | Variable *operator()(Variable *query, Variable *key, Variable *value, 35 | Variable *mask = nullptr); 36 | 37 | void before_forward(int batch_size, int query_len, int kv_len, int kv_size, 38 | bool mask_future); 39 | }; 40 | 41 | template class SDPALayer<__half, __half>; 42 | template class SDPALayer; 43 | 44 | template 45 | using SDPALayerPtr = std::shared_ptr>; 46 | 47 | } // namespace lightseq 48 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(LightseqProtoType LANGUAGES CXX CUDA) 4 | 5 | find_package(Threads REQUIRED) 6 | 7 | set(CMAKE_CXX_STANDARD 14) 8 | 9 | add_library( 10 | lsflow STATIC 11 | context.cpp 12 | node.cpp 13 | manager.cpp 14 | layer.cpp 15 | tensor.cpp 16 | allocator.cpp 17 | lsflow_util.cpp 18 | operator.cpp 19 | shape.cpp 20 | variable.cpp) 21 | 22 | target_link_libraries(lsflow PUBLIC lightseq_kernels) 23 | -------------------------------------------------------------------------------- 
/oslo/lightseq2/csrc/lsflow/README.md: -------------------------------------------------------------------------------- 1 | LsFlow is a extremely clean implement of computation graph. 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/allocator.cpp: -------------------------------------------------------------------------------- 1 | #include "allocator.h" 2 | 3 | namespace lightseq { 4 | 5 | Allocator::Allocator() { _ptr_set.clear(); } 6 | 7 | Allocator::~Allocator() { 8 | auto _tmp_ptr_set = _ptr_set; 9 | for (auto iter : _tmp_ptr_set) { 10 | try { 11 | free_mem(iter); 12 | } catch (...) { 13 | // printf("execute ~Allocator() free_mem %p failed!\n", iter); 14 | } 15 | } 16 | _ptr_set.clear(); 17 | } 18 | 19 | char *Allocator::malloc_mem(size_t size) { 20 | char *ptr = nullptr; 21 | 22 | try { 23 | #ifdef LIGHTSEQ_cuda 24 | ptr = cuda::cuda_malloc(size); 25 | #else 26 | ptr = (char *)malloc(size); 27 | #endif 28 | } catch (...) { 29 | std::string error_message = 30 | "allocate memory failed! 
size is: " + std::to_string((size / MB_SIZE)) + 31 | " MB\n"; 32 | printf("%s", error_message.c_str()); 33 | throw std::runtime_error(error_message); 34 | } 35 | if (_ptr_set.find(ptr) != _ptr_set.end()) { 36 | printf("allocate same address with twice.\n"); 37 | throw std::runtime_error("allocate same address with twice.\n"); 38 | } 39 | _ptr_set.insert(ptr); 40 | return ptr; 41 | } 42 | 43 | void Allocator::free_mem(char *ptr) { 44 | if (_ptr_set.find(ptr) == _ptr_set.end() || ptr == nullptr) { 45 | return; 46 | } 47 | _ptr_set.erase(ptr); 48 | #ifdef LIGHTSEQ_cuda 49 | cuda::cuda_free(ptr); 50 | #else 51 | free(ptr); 52 | #endif 53 | } 54 | 55 | } // namespace lightseq 56 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/includes/allocator.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | #pragma once 5 | #include "declaration.h" 6 | 7 | namespace lightseq { 8 | 9 | class Allocator { 10 | private: 11 | std::unordered_set _ptr_set; 12 | 13 | public: 14 | Allocator(); 15 | virtual ~Allocator(); 16 | char *malloc_mem(size_t size); 17 | void free_mem(char *ptr); 18 | }; 19 | 20 | } // namespace lightseq 21 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/includes/lsflow_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2022 - 2023, Bytedance, The LightSeq Team 3 | */ 4 | 5 | #pragma once 6 | #include "declaration.h" 7 | 8 | namespace lightseq { 9 | 10 | /* Print run time, for debug */ 11 | void print_time_duration( 12 | const std::chrono::high_resolution_clock::time_point &start, 13 | std::string duration_name); 14 | 15 | #ifdef LIGHTSEQ_cuda 16 | cublasOperation_t op_from_custom(MATRIX_OP op_type); 17 | #endif 18 | 19 | } // namespace lightseq 20 | 
-------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/includes/shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "initializer_list" 4 | 5 | namespace lightseq { 6 | 7 | // This class records the shape information of the tensor and encapsulates some 8 | // methods that may be commonly used. 9 | class Shape { 10 | private: 11 | std::vector _shape_vec; 12 | size_t _element_size; 13 | bool _is_calculated; 14 | 15 | public: 16 | // Default constructor, not part of expected usage. 17 | Shape() : _shape_vec({0}), _element_size(0), _is_calculated(false) {} 18 | Shape(std::vector shape) 19 | : _shape_vec(shape), _element_size(0), _is_calculated(false) {} 20 | Shape(std::initializer_list list) 21 | : Shape(std::vector(list)) {} 22 | Shape(const Shape &lx) = default; 23 | virtual ~Shape() = default; 24 | const std::vector &view() const { return _shape_vec; } 25 | 26 | // Returns the product of each dimension of shape. 27 | size_t element_size(); 28 | 29 | // Print shape information. 
30 | void print_shape(); 31 | }; 32 | 33 | } // namespace lightseq 34 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/lsflow_util.cpp: -------------------------------------------------------------------------------- 1 | #include "lsflow_util.h" 2 | 3 | namespace lightseq { 4 | 5 | void print_time_duration( 6 | const std::chrono::high_resolution_clock::time_point &start, 7 | std::string duration_name) { 8 | #ifdef LIGHTSEQ_cuda 9 | CHECK_GPU_ERROR(cudaStreamSynchronize(0)); 10 | #endif 11 | auto finish = std::chrono::high_resolution_clock::now(); 12 | std::chrono::duration elapsed = finish - start; 13 | std::cout << duration_name 14 | << " duration time is: " << (elapsed).count() * 1000 << " ms" 15 | << std::endl; 16 | return; 17 | } 18 | 19 | #ifdef LIGHTSEQ_cuda 20 | cublasOperation_t op_from_custom(MATRIX_OP op_type) { 21 | switch (op_type) { 22 | case MATRIX_OP::Transpose: 23 | return CUBLAS_OP_T; 24 | case MATRIX_OP::NonTranspose: 25 | return CUBLAS_OP_N; 26 | default: { 27 | std::string error_message = "undefined custom MATRIX_OP\n"; 28 | printf("%s", error_message.c_str()); 29 | throw std::runtime_error("undefined custom MATRIX_OP"); 30 | } 31 | } 32 | exit(-1); 33 | } 34 | #endif 35 | } // namespace lightseq 36 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/operator.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | namespace lightseq { 3 | 4 | Operator::Operator(std::string name) : Node(name, NodeType::Operator) { 5 | _context_ptr->add_op(this); 6 | } 7 | 8 | void Operator::check_override_grad() { 9 | for (Node *p : this->_parents) { 10 | Variable *rp = static_cast(p); 11 | if (!rp->enable_override_grad()) { 12 | printf("can not override"); 13 | exit(-1); 14 | } 15 | } 16 | return; 17 | } 18 | 19 | void Operator::set_children(std::vector children) { 20 | if 
(!this->_children.empty()) { 21 | printf("children not empty!"); 22 | exit(-1); 23 | } 24 | for (Node *iter : children) { 25 | iter->set_parents({this}); 26 | } 27 | } 28 | } // namespace lightseq 29 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/lsflow/shape.cpp: -------------------------------------------------------------------------------- 1 | #include "shape.h" 2 | 3 | namespace lightseq { 4 | 5 | size_t Shape::element_size() { 6 | if (_shape_vec.size() == 1 && _shape_vec[0] == 0) { 7 | printf("this tensor without shape\n"); 8 | return 0; 9 | } 10 | if (_is_calculated) { 11 | return _element_size; 12 | } 13 | size_t product = 1; 14 | for (int iter : _shape_vec) { 15 | // if (iter <= 0) { 16 | // throw std::runtime_error("this tensor with invalid shape"); 17 | // return 0; 18 | // } 19 | product *= iter; 20 | } 21 | _is_calculated = true; 22 | _element_size = product; 23 | return _element_size; 24 | } 25 | 26 | void Shape::print_shape() { 27 | printf("shape dim: %zu, element size: %d, each dimension: ", 28 | _shape_vec.size(), element_size()); 29 | for (int i = 0; i < _shape_vec.size(); i++) { 30 | printf("%zu", _shape_vec[i]); 31 | if (i == _shape_vec.size() - 1) { 32 | printf("\n"); 33 | } else { 34 | printf(", "); 35 | } 36 | } 37 | } 38 | 39 | } // namespace lightseq 40 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(liblightseq SHARED bert.cc bert_crf.cc transformer.cu gpt.cc 2 | model_util.cc) 3 | 4 | target_link_libraries(liblightseq PUBLIC lightseq_layers) 5 | 6 | target_link_libraries(liblightseq PUBLIC weight_lib) 7 | 8 | target_link_options(liblightseq PUBLIC $) 10 | 11 | target_include_directories(liblightseq PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 12 | 13 | set_target_properties(liblightseq PROPERTIES OUTPUT_NAME lightseq) 
14 | 15 | # add_executable(test_example test_layer.cc) target_link_libraries(test_example 16 | # PUBLIC liblightseq) 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/models/includes/model_util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "layer.h" 3 | 4 | namespace lightseq { 5 | 6 | GenerateMethod get_generate_method(std::string method_); 7 | 8 | } // namespace lightseq 9 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/models/model_util.cc: -------------------------------------------------------------------------------- 1 | #include "model_util.h" 2 | 3 | namespace lightseq { 4 | 5 | GenerateMethod get_generate_method(std::string method_) { 6 | if (method_ == "topk") 7 | return GenerateMethod::Topk; 8 | if (method_ == "topp") 9 | return GenerateMethod::Topp; 10 | if (method_ == "beam_search") 11 | return GenerateMethod::BeamSearch; 12 | 13 | printf("Error!\n"); 14 | return GenerateMethod::UnDefined; 15 | } 16 | 17 | } // namespace lightseq 18 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops/includes/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "cuda_util.h" 10 | namespace lightseq { 11 | namespace cuda { 12 | 13 | class Context { 14 | public: 15 | Context() : _stream(nullptr) { 16 | CHECK_GPU_ERROR(cublasCreate(&_cublasHandle)); 17 | CHECK_GPU_ERROR(cublasLtCreate(&_cublasLtHandle)); 18 | } 19 | 20 | virtual ~Context() {} 21 | 22 | static Context &Instance() { 23 | static Context _ctx; 24 | return _ctx; 25 | } 26 | 27 | void set_stream(cudaStream_t stream) { 28 | _stream = stream; 29 | CHECK_GPU_ERROR(cublasSetStream(_cublasHandle, _stream)); 30 | } 31 | 32 | cudaStream_t get_stream() { 
return _stream; } 33 | 34 | cublasHandle_t get_cublashandle() { return _cublasHandle; } 35 | cublasLtHandle_t get_cublaslthandle() { return _cublasLtHandle; } 36 | 37 | private: 38 | cudaStream_t _stream; 39 | cublasHandle_t _cublasHandle; 40 | cublasLtHandle_t _cublasLtHandle; 41 | }; 42 | 43 | } // namespace cuda 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops/includes/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "kernels.h" 10 | 11 | using namespace std; 12 | namespace lightseq { 13 | namespace cuda { 14 | 15 | template class Softmax { 16 | public: 17 | struct Config { 18 | size_t nhead; 19 | bool mask_future; 20 | Config(size_t nhead, bool mask_future = false) 21 | : nhead(nhead), mask_future(mask_future) {} 22 | }; 23 | 24 | Softmax(Config config) : config_(config) {} 25 | 26 | ~Softmax() {} 27 | 28 | /* Launch attention softmax; future masking is on if either the configured
   flag or the per-call flag requests it (logical ||, not bitwise |). */ void Forward(T *vals, const T *attn_mask, int batch_size, int from_len, 29 | int to_len, cudaStream_t &stream, bool mask_future = false) { 30 | launch_attn_softmax(vals, attn_mask, batch_size, config_.nhead, from_len, 31 | to_len, config_.mask_future || mask_future, stream); 32 | } 33 | 34 | void Backward(T *out_grad, const T *soft_out, int batch_size, int from_len, 35 | int to_len, cudaStream_t stream) { 36 | launch_attn_softmax_bw(out_grad, soft_out, 37 | batch_size * config_.nhead * from_len, to_len, 38 | stream); 39 | } 40 | 41 | private: 42 | Config config_; 43 | }; 44 | } // namespace cuda 45 | } // namespace lightseq 46 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(operator_files 2 | beam_search_topk.cu 3 | sampling.cc.cu 4 | bias_act_dropout.cpp 5 |
bias_add_transform_20314.cpp 6 | bias_dropout_residual.cpp 7 | concat3_dim1.cpp 8 | crf.cpp 9 | dropout.cpp 10 | launch_dec_emb_op.cpp 11 | launch_enc_emb.cpp 12 | launch_gpt_emb.cpp 13 | layer_normalize.cpp 14 | split_head_op.cpp 15 | linear.cpp 16 | softmax.cpp 17 | strided_batch_gemm.cpp 18 | transform_0213.cpp) 19 | 20 | add_library(lightseq_operators STATIC ${operator_files}) 21 | target_link_libraries(lightseq_operators PUBLIC lsflow) 22 | target_include_directories(lightseq_operators PUBLIC includes) 23 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/bias_act_dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 8 | template class BiasActDropoutOp : public Operator { 9 | private: 10 | float ratio; 11 | 12 | size_t _mx_cols; 13 | size_t _mx_rows; 14 | size_t _cols; 15 | size_t _rows; 16 | 17 | Variable *_result; 18 | 19 | std::string _activation_fn; 20 | 21 | TensorPtr _mask; 22 | 23 | public: 24 | float RATIO() const { return _context_ptr->is_training() ? 
ratio : 0.0; } 25 | 26 | BiasActDropoutOp(float r, size_t mx_rows, size_t mx_cols, 27 | std::string activation_fn) 28 | : Operator("BiasActDropoutOp"), ratio(r), _activation_fn(activation_fn), 29 | _mx_rows(mx_rows), _mx_cols(mx_cols) { 30 | _mask.reset(new Tensor("_mask", g_dtype(), _mx_rows * _mx_cols)); 31 | } 32 | 33 | virtual ~BiasActDropoutOp() {} 34 | 35 | Variable *operator()(Variable *inp, Variable *bias); 36 | 37 | void before_forward(size_t rows, size_t cols) { 38 | _rows = rows, _cols = cols; 39 | _result->set_shape({rows, cols}); 40 | } 41 | 42 | void forward() override; 43 | 44 | void backward() override; 45 | }; 46 | } // namespace lightseq 47 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/bias_add_transform_20314.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | // add bias and transform 20314, execute after qkv_linear 9 | template class BiasAddTrans20314 : public Operator { 10 | private: 11 | size_t _max_batch_tokens; 12 | size_t _batch; 13 | size_t _seq_len; 14 | size_t _heads; 15 | size_t _hidden_size; 16 | size_t _trans_count; 17 | 18 | Variable *_res; 19 | 20 | public: 21 | BiasAddTrans20314(size_t max_batch_tokens, size_t heads, size_t hidden_size, 22 | size_t trans_count) 23 | : Operator("BiasAddTrans20314"), _max_batch_tokens(max_batch_tokens), 24 | _heads(heads), _hidden_size(hidden_size), _trans_count(trans_count) {} 25 | 26 | virtual ~BiasAddTrans20314() {} 27 | 28 | Variable *operator()(Variable *inp, Variable *bias); 29 | 30 | void before_forward(size_t batch, size_t seq_len) { 31 | _batch = batch, _seq_len = seq_len; 32 | _res->set_shape( 33 | {_trans_count, _batch, _heads, _seq_len, _hidden_size / _heads}); 34 | } 35 | 36 | void forward() override; 37 | 38 | void backward() override; 39 | }; 40 | } // 
namespace lightseq 41 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/bias_dropout_residual.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // transformer layer's postprocessing dropout, after attn or ffn module, 8 | // before residual add. 9 | template class BiasDropoutResOp : public Operator { 10 | private: 11 | float ratio; 12 | 13 | size_t _max_rows; 14 | size_t _max_cols; 15 | size_t _rows; 16 | size_t _cols; 17 | 18 | TensorPtr _mask; 19 | Variable *_result; 20 | 21 | public: 22 | float RATIO() const { return _context_ptr->is_training() ? ratio : 0.0; } 23 | 24 | BiasDropoutResOp(float r, size_t max_rows, size_t max_cols) 25 | : Operator("BiasDropoutResOp"), ratio(r), _max_rows(max_rows), 26 | _max_cols(max_cols) { 27 | _mask.reset(new Tensor("mask", g_dtype(), _max_rows * _max_cols)); 28 | } 29 | 30 | virtual ~BiasDropoutResOp() {} 31 | 32 | Variable *operator()(Variable *inp, Variable *bias, Variable *residual); 33 | 34 | void before_forward(size_t rows, size_t cols) { 35 | _rows = rows, _cols = cols; 36 | _result->set_shape({_rows, _cols}); 37 | } 38 | 39 | void forward() override; 40 | 41 | void backward() override; 42 | }; 43 | } // namespace lightseq 44 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/concat3_dim1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | #include "tuple" 5 | 6 | namespace lightseq { 7 | 8 | template class Concat3Dim1 : public Operator { 9 | private: 10 | bool _is_skip = false; 11 | bool _is_continuous_cache; 12 | 13 | size_t _mx_sz0; 14 | size_t _mx_sz1; 15 | size_t _mx_sz2; 16 | 17 | size_t _sz0; 18 | size_t _sz1_0; 19 | size_t _sz1_1; 20 | 
size_t _layer_id; 21 | 22 | Variable *_new_cache; 23 | 24 | public: 25 | Concat3Dim1(size_t mx_sz0, size_t mx_sz1, size_t mx_sz2, size_t layer_id, 26 | bool is_continuous_cache) 27 | : Operator("Concat3Dim1"), _mx_sz0(mx_sz0), _mx_sz1(mx_sz1), 28 | _mx_sz2(mx_sz2), _layer_id(layer_id), 29 | _is_continuous_cache(is_continuous_cache) {} 30 | 31 | virtual ~Concat3Dim1() {} 32 | 33 | Variable *operator()(Variable *inp, Variable *cache); 34 | 35 | void before_forward(size_t sz0, size_t sz1_0, size_t sz1_1, 36 | bool is_skip = false) { 37 | _sz0 = sz0, _sz1_0 = sz1_0, _sz1_1 = sz1_1, _is_skip = is_skip; /* NOTE(review): both branches below set the same shape — the non-continuous-cache path looks unfinished or redundant; confirm the intended shape for _is_continuous_cache == false. */ 38 | if (_is_continuous_cache) { 39 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 40 | } else { 41 | _new_cache->set_shape({_sz0, _sz1_0 + _sz1_1, _mx_sz2}); 42 | } 43 | } 44 | 45 | void forward() override; 46 | 47 | void before_backward() {} 48 | 49 | void backward() override; 50 | }; 51 | } // namespace lightseq 52 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/crf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // linear crf 8 | template class CRFOP : public Operator { 9 | private: 10 | size_t _num_tags; 11 | size_t _seq_len; 12 | size_t _batch_size; 13 | size_t _max_batch_tokens; 14 | size_t _max_batch_size; 15 | 16 | bool _forward_or_decode; // true for forward, false for decode 17 | bool _output_decode_score; 18 | TensorPtr _history; 19 | 20 | Variable *_best_tags; 21 | 22 | public: 23 | CRFOP(size_t max_batch_tokens, size_t max_batch_size, size_t num_tags); 24 | 25 | virtual ~CRFOP() {} 26 | 27 | Variable *operator()(Variable *start_transition, Variable *end_transition, 28 | Variable *transition, Variable *emission, Variable *mask, 29 | Variable *bias); 30 | 31 | void before_forward(size_t batch_size, size_t seq_len, bool forward_or_decode,
32 | bool output_decode_score); 33 | 34 | void forward() override; 35 | 36 | void before_backward(); 37 | 38 | void backward() override; 39 | }; 40 | 41 | } // namespace lightseq 42 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/dropout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // after attention softmax 8 | template class DropoutOp : public Operator { 9 | private: 10 | float ratio; 11 | size_t _max_ele_num; 12 | size_t _count; 13 | bool _is_skip; 14 | 15 | TensorPtr _mask; 16 | Variable *_result = nullptr; 17 | 18 | public: 19 | float RATIO() const { return _context_ptr->is_training() ? ratio : 0.0; } 20 | 21 | DropoutOp(float r, size_t max_ele_num) 22 | : Operator("Dropout"), ratio(r), _max_ele_num(max_ele_num) { 23 | _mask.reset(new Tensor("mask", g_dtype(), max_ele_num)); 24 | } 25 | 26 | virtual ~DropoutOp() {} 27 | 28 | Variable *operator()(Variable *inp); 29 | 30 | void before_forward(size_t count) { 31 | _count = count; 32 | if (_result) 33 | _result->set_shape({count}); 34 | } 35 | 36 | void forward() override; 37 | 38 | void before_backward(int count) { _count = count; } 39 | 40 | void backward() override; 41 | }; 42 | } // namespace lightseq 43 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/launch_enc_emb.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | // dropout inside ffn. 
8 | template class LaunchEncEmbOp : public Operator { 9 | private: 10 | size_t _max_batch_tokens; 11 | int _pad_id; 12 | size_t _hidden_dim; 13 | size_t _multilg_type; 14 | 15 | size_t _batch_size; 16 | size_t _seq_len; 17 | 18 | Variable *_result; 19 | Variable *_pad_mask; 20 | 21 | public: 22 | LaunchEncEmbOp(size_t max_batch_tokens, int pad_id, size_t hidden_dim, 23 | size_t multilg_type) 24 | : Operator("LaunchEncEmbOp"), _max_batch_tokens(max_batch_tokens), 25 | _pad_id(pad_id), _hidden_dim(hidden_dim), _multilg_type(multilg_type) {} 26 | 27 | virtual ~LaunchEncEmbOp() {} 28 | 29 | std::tuple 30 | operator()(Variable *inp_tokens, Variable *token_emb, Variable *pos_emb, 31 | Variable *lang_emb, Variable *lang_id); 32 | 33 | void before_forward(size_t batch_size, size_t seq_len) { 34 | _batch_size = batch_size, _seq_len = seq_len; 35 | } 36 | 37 | void forward() override; 38 | 39 | void backward() override { 40 | printf("ERROR! LaunchEncEmbOp can't cal backward()\n"); 41 | exit(-1); 42 | } 43 | }; 44 | } // namespace lightseq 45 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/layer_normalize.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template class LayerNormalizeOp : public Operator { 8 | private: 9 | size_t _max_batch_tokens; 10 | size_t _hidden_dim; 11 | size_t _batch_tokens; 12 | 13 | bool _use_mean; 14 | 15 | TensorPtr means_; 16 | TensorPtr vars_; 17 | 18 | Variable *_result; 19 | 20 | public: 21 | LayerNormalizeOp(size_t max_batch_tokens, size_t hidden_dim, 22 | bool use_mean = false) 23 | : Operator("LayerNormalizeOp"), _max_batch_tokens(max_batch_tokens), 24 | _hidden_dim(hidden_dim), _use_mean(use_mean) { 25 | vars_.reset(new Tensor("vars", g_dtype(), max_batch_tokens)); 26 | if (use_mean) 27 | means_.reset(new Tensor("means", g_dtype(), 
max_batch_tokens)); 28 | } 29 | 30 | Variable *operator()(Variable *inp, Variable *gamma, Variable *betta); 31 | 32 | virtual ~LayerNormalizeOp(); 33 | 34 | void before_forward(size_t batch_size, size_t seq_len); 35 | 36 | void forward() override; 37 | 38 | void backward() override; 39 | }; 40 | 41 | } // namespace lightseq 42 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/ops_new/includes/linear.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "declaration.h" 3 | #include "node.h" 4 | 5 | namespace lightseq { 6 | 7 | template class LinearOp : public Operator { 8 | private: 9 | size_t _output_size; 10 | size_t _input_size; 11 | size_t _max_batch_tokens; 12 | size_t _batch_tokens; 13 | std::array _gemm_algos; 14 | 15 | float _alpha; 16 | MATRIX_OP _opA; 17 | MATRIX_OP _opB; 18 | 19 | Variable *_result; 20 | 21 | #ifdef PYBIND_INTERFACE 22 | #define weight_op MATRIX_OP::Transpose 23 | #else 24 | #define weight_op MATRIX_OP::NonTranspose 25 | #endif 26 | 27 | public: 28 | LinearOp(size_t max_batch_tokens, size_t output_size, size_t input_size, 29 | MATRIX_OP opA = weight_op, MATRIX_OP opB = MATRIX_OP::NonTranspose, 30 | float alpha = float(1.)) 31 | : Operator("LinearOp"), _max_batch_tokens(max_batch_tokens), 32 | _output_size(output_size), _input_size(input_size), _opA(opA), 33 | _opB(opB), _gemm_algos(std::array({99, 99, 99})), 34 | _alpha(alpha) {} 35 | 36 | ~LinearOp() {} 37 | 38 | Variable *operator()(Variable *inp, Variable *weight); 39 | 40 | void forward() override; 41 | 42 | void before_forward(size_t batch_tokens) { 43 | _batch_tokens = batch_tokens; 44 | _result->set_shape({batch_tokens, _output_size}); 45 | } 46 | 47 | void backward() override; 48 | 49 | void before_backward() {} 50 | }; 51 | 52 | } // namespace lightseq 53 | -------------------------------------------------------------------------------- 
// Attention softmax over [batch, nhead, from_len, to_len] logits, with
// optional causal (future-token) masking.
template <typename T1, typename T2>
class SoftmaxOp : public Operator {
private:
  size_t _nhead;
  size_t _max_batch_tokens;  // capacity bounds for buffer sizing
  size_t _max_seq_len;
  // Per-step geometry, set in before_forward().
  size_t _batchs;
  size_t _from_len;
  size_t _to_len;
  int _kv_size;  // key/value length; defaults to _to_len when caller passes -1

  // NOTE(review): _config_mask_future is stored at construction but is not
  // consulted anywhere in the code visible here, and before_forward()
  // overwrites _mask_future unconditionally from its own parameter —
  // confirm the intended precedence in the .cpp implementation.
  bool _config_mask_future;
  bool _mask_future;

  Variable *_result;

public:
  SoftmaxOp(size_t max_batch_tokens, size_t max_seq_len, size_t nhead,
            bool mask_future = false)
      : Operator("SoftmaxOp"), _max_batch_tokens(max_batch_tokens),
        _max_seq_len(max_seq_len), _nhead(nhead),
        _config_mask_future(mask_future) {}

  virtual ~SoftmaxOp() {}

  // mask may be nullptr when no padding mask is applied.
  Variable *operator()(Variable *inp, Variable *mask = nullptr);

  void forward() override;

  // Records this step's shapes; kv_size == -1 means "same as to_len".
  void before_forward(size_t batchs, size_t from_len, size_t to_len,
                      int kv_size = -1, bool mask_future = false) {
    _batchs = batchs;
    _from_len = from_len;
    _to_len = to_len;
    _kv_size = (kv_size == -1 ? to_len : kv_size);
    _mask_future = mask_future;
    _result->set_shape({_batchs, _nhead, _from_len, _to_len});
  }

  void backward() override;
};
= (T1 *)parent(0)->value(); 16 | T1 *res_ptr = (T1 *)child(0)->value(); 17 | 18 | if (!_context_ptr->is_built()) { 19 | return; 20 | } 21 | #ifdef LIGHTSEQ_cuda 22 | cudaStream_t _stream = _context_ptr->get_stream(); 23 | cuda::launch_transform_0213(inp_ptr, res_ptr, _sz0, _sz1, _sz2, _sz3, 24 | _stream); 25 | #endif 26 | } 27 | 28 | template void Transform0213OP::backward() { 29 | T2 *inp_grad = (T1 *)parent(0)->grad(); 30 | T2 *out_grad = (T1 *)child(0)->grad(); 31 | 32 | if (!_context_ptr->is_built()) { 33 | return; 34 | } 35 | 36 | #ifdef LIGHTSEQ_cuda 37 | cudaStream_t _stream = _context_ptr->get_stream(); 38 | cuda::launch_transform_0213(out_grad, inp_grad, _sz0, _sz1, _sz2, _sz3, 39 | _stream); 40 | #endif 41 | } 42 | 43 | template class Transform0213OP; 44 | #ifdef LIGHTSEQ_cuda 45 | template class Transform0213OP<__half, __half>; 46 | #endif 47 | } // namespace lightseq 48 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/proto/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | include_directories(${HDF5_INCLUDE_DIRS}) 6 | 7 | find_package(Protobuf REQUIRED) 8 | include_directories(${Protobuf_INCLUDE_DIRS}) 9 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 10 | 11 | set(PROTO_FILES bert.proto bert_crf.proto transformer.proto gpt.proto) 12 | 13 | set(WEIGHT_FILES bert_weight.cc bert_crf_weight.cc transformer_weight.cc 14 | gpt_weight.cc) 15 | 16 | protobuf_generate_cpp(PROTO_SRC PROTO_HEADER ${PROTO_FILES}) 17 | add_library(weight_lib STATIC ${WEIGHT_FILES} ${PROTO_SRC} ${PROTO_HEADER} 18 | proto_util.cc) 19 | target_link_libraries(weight_lib PRIVATE ${HDF5_LIBRARIES}) 20 | target_link_libraries(weight_lib PUBLIC ${Protobuf_LIBRARIES}) 21 | target_link_libraries(weight_lib PUBLIC lightseq_kernels) 22 | 23 | 
namespace lightseq {
// Test fixture holding a randomly filled embedding table, exposed through
// the same pointer-accessor shape as the real weight classes.
template <typename T>
class TestModelWeight {
private:
  const T *_p_d_weight_emb;       // view over _d_weight_emb's storage
  std::vector<T> _d_weight_emb;   // owning buffer

public:
  // Fills the embedding with `weight_size` pseudo-random values in [0, 100).
  TestModelWeight(int weight_size) {
    _d_weight_emb.clear();
    for (int i = 0; i < weight_size; i++) {
      _d_weight_emb.push_back(rand() % 100);
    }
    // Fix: _p_d_weight_emb was never initialized, so the accessor returned
    // an indeterminate pointer. Point it at the vector's storage (the vector
    // is not resized after construction, so the pointer stays valid).
    _p_d_weight_emb = _d_weight_emb.data();
  }

  // Fix: returning `const T *&` from a const member function is ill-formed
  // (the member pointer is itself const in a const method); the reference
  // must be to a const pointer.
  const T *const &weight_emb() const { return _p_d_weight_emb; }
};
}  // namespace lightseq
-------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | csrc_dir = os.path.dirname(cur_dir) 5 | lightseq_dir = os.path.dirname(csrc_dir) 6 | sys.path.insert(0, lightseq_dir) 7 | 8 | from .builder.cuda_kernel_builder import CudaKernelBuilder 9 | from .builder.x86_kernel_builder import X86KernelBuilder 10 | from .builder.cuda_layer_builder import CudaLayerBuilder 11 | 12 | from .torch_transformer_layers import TransformerEncoderLayer, TransformerDecoderLayer 13 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import CUDAOpBuilder 2 | from .cuda_kernel_builder import CudaKernelBuilder 3 | from .x86_kernel_builder import X86KernelBuilder 4 | from .cuda_layer_builder import CudaLayerBuilder 5 | 6 | # TODO: infer this list instead of hard coded 7 | # List of all available ops 8 | __op_builders__ = [ 9 | CudaKernelBuilder(), 10 | CudaLayerBuilder(), 11 | X86KernelBuilder(), 12 | ] 13 | 14 | ALL_OPS = {op.name: op for op in __op_builders__} 15 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``csrc.pytorch.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 
21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | from .modules.tensor_quantizer import * 20 | from .modules.quant_conv import * 21 | from .modules.quant_linear import * 22 | from .modules.quant_pooling import * 23 | from .modules.clip import * 24 | from .modules.quant_rnn import * 25 | from .modules.quant_bert import * 26 | from .modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/pytorch/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
def reset_logger_handler():
    """Remove all handlers from the root logger.

    Fix: the original called ``logger.getLogger()`` — no ``logger`` name
    exists in this module, so every call raised ``NameError``. The
    ``logging`` module was intended.
    """
    root_logger = logging.getLogger()
    # Pop from the front until none remain; the handlers list shrinks as we
    # remove, so indexing [0] each iteration is correct.
    while root_logger.handlers:
        root_logger.removeHandler(root_logger.handlers[0])
class SdpaLayerFunc(torch.autograd.Function):
    """Autograd entry point for the fused CUDA transformer-encoder forward.

    Forward-only: no ``backward`` is defined, so this function is usable
    for inference (or under ``torch.no_grad``) only.
    """

    @staticmethod
    def forward(
        ctx,
        input,
        input_mask,
        config,
    ):
        # Pick the fp16 or fp32 fused kernel from the compiled CUDA module.
        cuda_module = cuda_layer_module
        forward_func = (
            cuda_module.transformer_encoder_layer_fw_fp16
            if config.fp16
            else cuda_module.transformer_encoder_layer_fw_fp32
        )
        if config.fp16:
            # The fp16 kernel expects half-precision tensors; cast before
            # dispatch.
            input = input.to(torch.half)
            input_mask = input_mask.to(torch.half)

        # The kernel returns a one-element tuple; unpack the output tensor.
        (output,) = forward_func(config.layer_id, input, input_mask)

        return output
2 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/lightseq2/csrc/tests/__init__.py -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/tests/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 4 | par_dir = os.path.dirname(cur_dir) 5 | csrc_dir = os.path.dirname(par_dir) 6 | lightseq_dir = os.path.dirname(csrc_dir) 7 | 8 | sys.path.insert(0, lightseq_dir) 9 | sys.path.insert(0, os.path.dirname(lightseq_dir)) 10 | -------------------------------------------------------------------------------- /oslo/lightseq2/csrc/triton_backend/src/triton_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #include "model_base.h" 3 | #include "triton/backend/backend_common.h" 4 | #include "triton/core/tritonserver.h" 5 | 6 | TRITONSERVER_DataType 7 | transform_triton_datatype_to_lightseq(::lightseq::cuda::DataType data_type_) { 8 | switch (data_type_) { 9 | case ::lightseq::cuda::DataType::kNotSupported: 10 | return TRITONSERVER_TYPE_INVALID; 11 | case ::lightseq::cuda::DataType::kFloat32: 12 | return TRITONSERVER_TYPE_FP32; 13 | case ::lightseq::cuda::DataType::kInt32: 14 | return TRITONSERVER_TYPE_INT32; 15 | case ::lightseq::cuda::DataType::kInt64: 16 | return TRITONSERVER_TYPE_INT64; 17 | case ::lightseq::cuda::DataType::kFloat16: 18 | return TRITONSERVER_TYPE_FP16; 19 | case ::lightseq::cuda::DataType::kInt8: 20 | return TRITONSERVER_TYPE_INT8; 21 | case ::lightseq::cuda::DataType::kInt16: 22 | return TRITONSERVER_TYPE_INT16; 23 | case ::lightseq::cuda::DataType::kByte: 24 | return TRITONSERVER_TYPE_BYTES; 25 | 
case ::lightseq::cuda::DataType::kUInt8: 26 | return TRITONSERVER_TYPE_UINT8; 27 | case ::lightseq::cuda::DataType::kUInt16: 28 | return TRITONSERVER_TYPE_UINT16; 29 | case ::lightseq::cuda::DataType::kUInt32: 30 | return TRITONSERVER_TYPE_UINT32; 31 | case ::lightseq::cuda::DataType::kUInt64: 32 | return TRITONSERVER_TYPE_UINT64; 33 | case ::lightseq::cuda::DataType::kFloat64: 34 | return TRITONSERVER_TYPE_FP64; 35 | default: 36 | return TRITONSERVER_TYPE_INVALID; 37 | } 38 | return TRITONSERVER_TYPE_INVALID; 39 | } 40 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | set(cuda_kernel_files 4 | gptKernels.cc.cu 5 | gptKernels_int8.cc.cu 6 | transformerKernels.cc.cu 7 | multilgKernels.cc.cu 8 | embKernels.cc.cu 9 | embKernels_int8.cc.cu 10 | transformerKernels_int8.cc.cu 11 | moeKernels.cc.cu 12 | t5Kernels.cc.cu 13 | t5EmbKernels.cc.cu) 14 | 15 | add_library(cuda_kernels STATIC ${cuda_kernel_files}) 16 | target_include_directories(cuda_kernels INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/embKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | void launch_split_multilg_request(const int *req, int *src_lang_id, 9 | int *trg_lang_id, int *src_token_id, 10 | int batch_size, int req_len, 11 | cudaStream_t &stream); 12 | 13 | template 14 | void launch_enc_emb(const T *token_emb, const T *pos_emb, const int *tokens, 15 | T *output, int *pad_mask, int pad_id, int batch_size, 16 | int seq_len, int hidden_dim, cudaStream_t stream, 17 | const T *lang_emb, const int *lang_id, int multilg_type); 18 | 19 | template 20 | 
void launch_dec_emb(const T *token_emb, const T *pos_emb, int *tokens, 21 | const T *lang_emb, const int *lang_id, T *output, 22 | int batch_size, int beam_size, int hidden_dim, 23 | int vocab_size, int step, int max_step, int multilg_type, 24 | cudaStream_t stream); 25 | 26 | template 27 | void launch_patch_emb(const T *conv_weight, const T *conv_bias, 28 | const T *pos_emb, const T *cls_emb, const float *input, 29 | T *output, int patch_size, int image_size, int batch_size, 30 | int max_step, int hidden_dim, int channel_input, 31 | cudaStream_t stream); 32 | 33 | } // namespace cuda 34 | } // namespace lightseq 35 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/embKernels_int8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | template 9 | void launch_enc_emb_i8I(const int8_t *token_emb, const T *pos_emb, 10 | const int *tokens, T *output, int *pad_mask, int pad_id, 11 | int batch_size, int seq_len, int hidden_dim, 12 | cudaStream_t stream, const T *lang_emb, 13 | const int *lang_id, int multilg_type, 14 | float dequant_scale, bool scaled = true); 15 | 16 | template 17 | void launch_dec_emb_i8I(const int8_t *token_emb, const T *pos_emb, int *tokens, 18 | const T *lang_emb, const int *lang_id, T *output, 19 | int batch_size, int beam_size, int hidden_dim, 20 | int vocab_size, int step, int max_step, 21 | int multilg_type, cudaStream_t stream, 22 | float dequant_scale, bool scaled = true); 23 | 24 | } // namespace cuda 25 | } // namespace lightseq 26 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/t5EmbKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace lightseq { 6 | namespace cuda { 7 | 8 | 
template 9 | void t5_launch_enc_emb(const T *token_emb, const int *tokens, T *output, 10 | int *pad_mask, int pad_id, int batch_size, int seq_len, 11 | int hidden_dim, cudaStream_t stream, const T *lang_emb, 12 | const int *lang_id); 13 | 14 | template 15 | void t5_launch_dec_emb(const T *token_emb, int *tokens, const T *lang_emb, 16 | const int *lang_id, T *output, int batch_size, 17 | int beam_size, int hidden_dim, int vocab_size, int step, 18 | int max_step, int multilg_type, cudaStream_t stream); 19 | 20 | } // namespace cuda 21 | } // namespace lightseq 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/kernels/t5Kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace lightseq { 8 | namespace cuda { 9 | 10 | const float t5_epsilon = 1e-6; 11 | template 12 | void t5_ker_norm_layer_launcher(int token_num, int hidden_size, 13 | cudaStream_t stream, T *matrix, T *out, 14 | const T *scale, const T *bias, 15 | int max_thread_per_block); 16 | 17 | template 18 | void t5_ker_correlation_softmax_encself_launcher( 19 | int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, 20 | T *correlation, const int *src_padding_mask, const T *pos_emb); 21 | 22 | template 23 | void t5_ker_correlation_softmax_decself_launcher( 24 | int batch_head_num, int step_num, cudaStream_t stream, T *correlation, 25 | const T *pos_emb, int head_num); 26 | 27 | template 28 | void ker_gelu_first_elementmul_launcher(int batch_token_num, int block_dim, 29 | cudaStream_t stream, T *input, 30 | const T *input2, int feature_dim); 31 | } // namespace cuda 32 | } // namespace lightseq 33 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/pywrapper/bert.h: -------------------------------------------------------------------------------- 1 | 2 | #include 
// Inference wrapper exposing the BERT encoder through the LSModel
// interface; owns the device buffers, CUDA stream, and cuBLAS handle the
// encoder runs on. bert_optype (FP16/FP32) is fixed at compile time via
// FP16_MODE.
class Bert : public LSModel {
private:
  typedef OperationTypeTraits<bert_optype> optraits;
  std::shared_ptr<BertEncoder<bert_optype>> encoder_;

  optraits::DataType *d_encoder_output_;  // device-side encoder output
  int *d_input_;                          // device-side token ids
  int *d_padding_mask_;                   // device-side padding mask
  int _max_batch_size;
  cudaStream_t stream_;
  cublasHandle_t hd_;
  void *d_buf_;                           // scratch workspace buffer
  BertWeight<bert_optype> tw_;            // weights loaded from weight_path

public:
  // Loads weights from `weight_path` and sizes buffers for
  // `max_batch_size`.
  Bert(const std::string weight_path, const int max_batch_size);

  ~Bert();

  // LSModel interface: run inference and exchange I/O buffers by index.
  void Infer() override;
  void set_input_ptr(int index, void *input_ptr) override;
  void set_output_ptr(int index, void *output_ptr) override;
  const void *get_output_ptr(int index) override;
  std::vector<int> get_input_max_shape(int index) override;
  std::vector<int> get_output_max_shape(int index) override;
  DataType get_input_dtype(int index) override;
  DataType get_output_dtype(int index) override;
  // Benchmark toggle is a no-op for this model.
  void benchmark_mode(bool is_benchmark) override{};
};
lightseq::cuda::OperationType::FP32; 13 | #endif 14 | 15 | namespace lightseq { 16 | namespace cuda { 17 | class Vit : public LSModel { 18 | private: 19 | typedef OperationTypeTraits optraits; 20 | std::shared_ptr> encoder_; 21 | 22 | optraits::DataType *d_encoder_output_; 23 | float *d_input_; 24 | int *d_padding_mask_; 25 | int _max_batch_size; 26 | cudaStream_t stream_; 27 | cublasHandle_t hd_; 28 | void *d_buf_; 29 | VitWeight tw_; 30 | 31 | public: 32 | Vit(const std::string weight_path, const int max_batch_size); 33 | 34 | ~Vit(); 35 | 36 | void Infer() override; 37 | void set_input_ptr(int index, void *input_ptr) override; 38 | void set_output_ptr(int index, void *output_ptr) override; 39 | const void *get_output_ptr(int index) override; 40 | std::vector get_input_max_shape(int index) override; 41 | std::vector get_output_max_shape(int index) override; 42 | DataType get_input_dtype(int index) override; 43 | DataType get_output_dtype(int index) override; 44 | void benchmark_mode(bool is_benchmark) override{}; 45 | }; 46 | 47 | LSMODEL_REGISTER(Vit); 48 | 49 | } // namespace cuda 50 | } // namespace lightseq 51 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/server/libserver.ldscript: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | CustomErrorString; 4 | CustomExecute; 5 | CustomFinalize; 6 | CustomInitialize; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /oslo/lightseq2/inference/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | # (default) use C API for HDF5 library 4 | find_package(HDF5 REQUIRED) 5 | 6 | add_library(utils STATIC util.cc.cu) 7 | target_include_directories(utils PUBLIC ${HDF5_INCLUDE_DIRS}) 8 | target_include_directories(utils INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 9 | 
// Maps a lightseq DataType enum value to the corresponding Triton server
// dtype, returning TRITONSERVER_TYPE_INVALID for unsupported values.
// NOTE(review): the name reads "triton ... to lightseq" but the conversion
// direction is lightseq -> Triton; a rename would break callers, so it is
// kept — flagging for a coordinated fix.
TRITONSERVER_DataType
transform_triton_datatype_to_lightseq(::lightseq::cuda::DataType data_type_) {
  switch (data_type_) {
  case ::lightseq::cuda::DataType::kNotSupported:
    return TRITONSERVER_TYPE_INVALID;
  case ::lightseq::cuda::DataType::kFloat32:
    return TRITONSERVER_TYPE_FP32;
  case ::lightseq::cuda::DataType::kInt32:
    return TRITONSERVER_TYPE_INT32;
  case ::lightseq::cuda::DataType::kInt64:
    return TRITONSERVER_TYPE_INT64;
  case ::lightseq::cuda::DataType::kFloat16:
    return TRITONSERVER_TYPE_FP16;
  case ::lightseq::cuda::DataType::kInt8:
    return TRITONSERVER_TYPE_INT8;
  case ::lightseq::cuda::DataType::kInt16:
    return TRITONSERVER_TYPE_INT16;
  case ::lightseq::cuda::DataType::kByte:
    return TRITONSERVER_TYPE_BYTES;
  case ::lightseq::cuda::DataType::kUInt8:
    return TRITONSERVER_TYPE_UINT8;
  case ::lightseq::cuda::DataType::kUInt16:
    return TRITONSERVER_TYPE_UINT16;
  case ::lightseq::cuda::DataType::kUInt32:
    return TRITONSERVER_TYPE_UINT32;
  case ::lightseq::cuda::DataType::kUInt64:
    return TRITONSERVER_TYPE_UINT64;
  case ::lightseq::cuda::DataType::kFloat64:
    return TRITONSERVER_TYPE_FP64;
  default:
    return TRITONSERVER_TYPE_INVALID;
  }
  // Unreachable: every switch path returns. Kept to satisfy compilers that
  // warn about a missing return at the end of a non-void function.
  return TRITONSERVER_TYPE_INVALID;
}
oslo.lightseq2.training.ops.pytorch.transformer_embedding_layer import ( 2 | LSTransformerEmbeddingLayer, 3 | ) 4 | from oslo.lightseq2.training.ops.pytorch.transformer_encoder_layer import ( 5 | LSTransformerEncoderLayer, 6 | ) 7 | from oslo.lightseq2.training.ops.pytorch.transformer_decoder_layer import ( 8 | LSTransformerDecoderLayer, 9 | ) 10 | from oslo.lightseq2.training.ops.pytorch.gpt_layer import ( 11 | LSGptEncoderLayer, 12 | ls_hf_gpt_enc_convert, 13 | ) 14 | from oslo.lightseq2.training.ops.pytorch.transformer import ( 15 | LSTransformer, 16 | LSTransformerEncoder, 17 | LSTransformerDecoder, 18 | ) 19 | 20 | from oslo.lightseq2.training.ops.pytorch.cross_entropy_layer import LSCrossEntropyLayer 21 | from oslo.lightseq2.training.ops.pytorch.adam import LSAdam 22 | from oslo.lightseq2.training.ops.pytorch.export import ( 23 | export_ls_config, 24 | export_ls_embedding, 25 | export_ls_encoder, 26 | export_ls_decoder, 27 | export_pb2hdf5, 28 | ) 29 | 30 | from oslo.lightseq2.training.ops.pytorch.export_quant import ( 31 | export_ls_embedding_ptq, 32 | export_ls_encoder_ptq, 33 | export_ls_decoder_ptq, 34 | export_ls_quant_embedding, 35 | export_ls_quant_encoder, 36 | export_ls_quant_decoder, 37 | export_quant_pb2hdf5, 38 | ) 39 | 40 | from oslo.lightseq2.training.ops.pytorch.gemm_test import gemm_test 41 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/lightseq2/training/cli/__init__.py -------------------------------------------------------------------------------- /oslo/lightseq2/training/cli/fs_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .ls_adam import LSFSAdam 2 | from .ls_label_smoothed_cross_entropy import 
def ls_cli_main(*args, **kwargs):
    """Run the DeepSpeed launcher with lightseq's fairseq modules registered.

    Appends ``--user-dir <this package>/fs_modules`` to ``sys.argv`` so that
    fairseq picks up the lightseq criterions/models, then delegates every
    argument to ``deepspeed.launcher.runner.main``.
    """
    fs_modules_dir = pathlib.Path(__file__).parent / "fs_modules"
    sys.argv += ["--user-dir", str(fs_modules_dir)]
    main(*args, **kwargs)
def ls_cli_main(*args, **kwargs):
    """Run fairseq's ``validate`` CLI with lightseq's extension modules.

    Injects ``--user-dir <this package>/fs_modules`` into ``sys.argv`` before
    handing control to ``fairseq_cli.validate.cli_main``.
    """
    user_dir = str(pathlib.Path(__file__).parent.joinpath("fs_modules"))
    sys.argv.extend(["--user-dir", user_dir])
    cli_main(*args, **kwargs)
class AdamBuilder(CUDAOpBuilder):
    """Op builder for the fused Adam optimizer CUDA extension."""

    NAME = "adam"

    def __init__(self, name=None):
        # Fall back to the class-level op name when no override is given.
        super().__init__(name=self.NAME if name is None else name)

    def absolute_name(self):
        """Fully qualified module name of the compiled extension."""
        return f"op_builder.{self.NAME}_op"

    def sources(self):
        """CUDA kernel and pybind sources compiled into the op."""
        return [
            "csrc/kernels/fused_adam_kernel.cu",
            "csrc/pybind/pybind_adam.cpp",
        ]

    def include_paths(self):
        """Header search paths for the sources above."""
        return ["csrc/kernels/includes", "csrc/ops/includes", "csrc/layers/includes"]

    def nvcc_args(self):
        """Device-compiler flags plus compute-capability flags from the base class."""
        return [
            "-O3",
            "--use_fast_math",
            "-std=c++14",
            "-U__CUDA_NO_HALF_OPERATORS__",
            "-U__CUDA_NO_HALF_CONVERSIONS__",
            "-U__CUDA_NO_HALF2_OPERATORS__",
        ] + self.compute_capability_args()

    def cxx_args(self):
        """Host-compiler flags."""
        return ["-O3", "-std=c++14", "-g", "-Wno-reorder"]
-------------------------------------------------------------------------------- 1 | ## Please refer to [NeurST](https://github.com/bytedance/neurst/tree/lightseq) for more information. 2 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/ops/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/lightseq2/training/ops/tensorflow/__init__.py -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # we modified pytorch_quantization in TensorRT(https://github.com/NVIDIA/TensorRT) 19 | # of commit 42805f0 20 | 21 | from .version import __version__ 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/calib/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 
3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """``oslo.lightseq2.training.pytorch_quantization.calib`` provides Calibrator classes that 20 | collect data statistics and determine pytorch_quantization parameters. 21 | """ 22 | 23 | from .max import MaxCalibrator 24 | from .histogram import * 25 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.tensor_quantizer import * 20 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_conv import * 21 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_linear import * 22 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_pooling import * 23 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.clip import * 24 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_rnn import * 25 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_bert import * 26 | from oslo.lightseq2.training.pytorch_quantization.nn.modules.quant_instancenorm import * 27 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 
3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | """Main entry of all utils""" 20 | 21 | from .reduce_amax import reduce_amax 22 | -------------------------------------------------------------------------------- /oslo/lightseq2/training/pytorch_quantization/utils/quant_logging.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Bytedance Inc. All rights reserved. 3 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
def reset_logger_handler():
    """Remove all handlers currently attached to the root logger."""
    # Fix: the original called `logger.getLogger()`, but this module only
    # imports `logging`, so every call raised NameError.
    root_logger = logging.getLogger()
    while root_logger.handlers:
        root_logger.removeHandler(root_logger.handlers[0])
16 | # 17 | __version__ = "2.1.2" 18 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedAdagradBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_adagrad_cuda( 4 | int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, const float lr, 6 | const float epsilon, const int mode, const float weight_decay); 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("multi_tensor_adagrad", &multi_tensor_adagrad_cuda, 10 | "Compute and apply gradient update to parameters for Adam optimizer"); 11 | } 12 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedAdamBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_adam_cuda(int chunk_size, at::Tensor noop_flag, 4 | std::vector> tensor_lists, 5 | const float lr, const float beta1, 6 | const float beta2, const float epsilon, 7 | const int step, const int mode, 8 | const int bias_correction, 9 | const float weight_decay); 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 12 | m.def("multi_tensor_adam", &multi_tensor_adam_cuda, 13 | "Compute and apply gradient update to parameters for Adam optimizer"); 14 | } 15 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedL2NormBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::tuple 4 | multi_tensor_l2norm_cuda(int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, 6 | at::optional per_tensor_python); 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda, 10 | "Computes L2 norm for a list of contiguous tensors"); 11 | } 12 | -------------------------------------------------------------------------------- 
/oslo/torch/_C/csrc/FusedLambBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_cuda(int chunk_size, at::Tensor noop_flag, 4 | std::vector> tensor_lists, 5 | const float lr, const float beta1, 6 | const float beta2, const float epsilon, 7 | const int step, const int bias_correction, 8 | const float weight_decay, const int grad_averaging, 9 | const int mode, at::Tensor global_grad_norm, 10 | const float max_grad_norm, 11 | at::optional use_nvlamb_python); 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("multi_tensor_lamb", &multi_tensor_lamb_cuda, 15 | "Computes and apply update for LAMB optimizer"); 16 | } 17 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedMixedPrecisionL2NormBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::tuple 4 | multi_tensor_l2norm_mp_cuda(int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, 6 | at::optional per_tensor_python); 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("multi_tensor_l2norm_mp", &multi_tensor_l2norm_mp_cuda, 10 | "Computes L2 norm for a list of contiguous tensors"); 11 | } 12 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedMixedPrecisionLambBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_lamb_mp_cuda( 4 | int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, at::Tensor lr, 6 | const float beta1, const float beta2, const float epsilon, at::Tensor step, 7 | const int bias_correction, const float weight_decay, 8 | const int grad_averaging, const int mode, at::Tensor global_grad_norm, 9 | at::Tensor max_grad_norm, at::optional use_nvlamb_python, 10 | at::Tensor found_inf, at::Tensor inv_scale); 11 | 12 | 
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 13 | m.def("multi_tensor_lamb_mp", &multi_tensor_lamb_mp_cuda, 14 | "Computes and apply update for LAMB optimizer"); 15 | } 16 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedNovogradBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_novograd_cuda( 4 | int chunk_size, at::Tensor noop_flag, 5 | std::vector> tensor_lists, at::Tensor grad_norms, 6 | const float lr, const float beta1, const float beta2, const float epsilon, 7 | const int step, const int bias_correction, const float weight_decay, 8 | const int grad_averaging, const int mode, const int norm_type); 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def("multi_tensor_novograd", &multi_tensor_novograd_cuda, 12 | "Compute and apply gradient update to parameters for Adam optimizer"); 13 | } 14 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/FusedSGDBinder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void multi_tensor_sgd_cuda(int chunk_size, at::Tensor noop_flag, 4 | std::vector> tensor_lists, 5 | float wd, float momentum, float dampening, float lr, 6 | bool nesterov, bool first_run, 7 | bool wd_after_momentum, float scale); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def("multi_tensor_sgd", &multi_tensor_sgd_cuda, 11 | "Fused SGD optimizer for list of contiguous tensors"); 12 | } 13 | -------------------------------------------------------------------------------- /oslo/torch/_C/csrc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/_C/csrc/__init__.py -------------------------------------------------------------------------------- 
#include "custom_cuda_layers.h"

// Cast one float parameter element per thread to __half.
__global__ void param_update_kernel(const float *input, __half *output,
                                    int size) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  if (id < size) {
    output[id] = (__half)input[id];
  }
}

void launch_param_update(const float *input, __half *output, int size,
                         cudaStream_t stream) {
  int threads = 1024;

  dim3 grid_dim((size - 1) / threads + 1);
  dim3 block_dim(threads);

  // Fix: the kernel launch configuration was lost in extraction
  // (`<<< >>>` collapsed to `<<>>`); relaunch with the grid/block computed
  // above on the caller-provided stream.
  param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}

__global__ void param_update_kernel_half(const float *input, __half *output,
                                         int size) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  __half2 *output_cast = reinterpret_cast<__half2 *>(output);
  if (id < size) {
    // NOTE(review): this reinterprets the float's raw bits as a packed
    // __half2 rather than converting the value — it assumes the input buffer
    // already stores half-precision pairs; confirm against the caller.
    float input_f = input[id];
    __half2 *input_h = reinterpret_cast<__half2 *>(&input_f);
    output_cast[id] = *input_h;
  }
}

void launch_param_update_half(const float *input, __half *output, int size,
                              cudaStream_t stream) {
  int threads = 1024;
  size /= 2;  // each thread handles one __half2 (two half values)
  dim3 grid_dim((size - 1) / threads + 1);
  dim3 block_dim(threads);

  param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output,
                                                               size);
}
from oslo.torch.distributed import ParallelContext, ParallelMode

# Fix: the original wrote `__ALL__ = [ParallelContext, ParallelMode]`.
# `__ALL__` has no meaning to Python's import machinery, and the entries must
# be name strings; `__all__` below makes `from oslo.torch import *` export
# exactly the two names this package intends to re-export.
__all__ = ["ParallelContext", "ParallelMode"]
from abc import ABC, abstractmethod


class ProcessGroupInitializer(ABC):
    """
    The abstract class for process group initialization.

    Args:
        rank (int): The rank of current process
        world_size (int): Size of whole communication world
        data_parallel_size (int): Size of data parallelization
        sequence_parallel_size (int): Size of sequence parallelization
        pipeline_parallel_size (int): Size of pipeline parallelization
        tensor_parallel_size (int): Size of tensor parallelization
        expert_parallel_size (int): Size of expert parallelization
    """

    def __init__(
        self,
        rank: int,
        world_size: int,
        data_parallel_size: int,
        sequence_parallel_size: int,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        expert_parallel_size: int,
    ):
        self.rank = rank
        self.world_size = world_size
        self.data_parallel_size = data_parallel_size
        self.sequence_parallel_size = sequence_parallel_size
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.expert_parallel_size = expert_parallel_size

    @abstractmethod
    def init_dist_group(self):
        # Subclasses build and return the process group(s) for their
        # particular parallel mode.
        raise NotImplementedError
from enum import Enum


class ParallelMode(Enum):
    """Enum class about parallelization mode.

    Each member's value is the string key used to identify the corresponding
    process group.
    """

    # global parallel groups
    GLOBAL = "global"

    # data parallel groups
    DATA = "data"

    # model parallel groups - containing tensor and pipeline parallel groups
    # this is added to facilitate amp and grad clipping in hybrid parallel
    MODEL = "model"

    # pipeline parallel groups
    PIPELINE = "pipe"

    # tensor parallel groups - containing all ranks in tensor parallel
    TENSOR = "tensor"

    # sequence parallel groups
    SEQUENCE = "sequence"
    SEQUENCE_DP = "sequence_dp"

    # 1D tensor parallel groups
    TENSOR_1D = "tensor_1d"

    # 2D tensor parallel groups (row/col submeshes of the 2D mesh)
    TENSOR_2D = "tensor_2d"
    TENSOR_2D_ROW = "tensor_2d_row"
    TENSOR_2D_COL = "tensor_2d_col"

    # 2.5D tensor parallel groups (row/col/depth and the combined xz group)
    TENSOR_2P5D = "tensor_2p5d"
    TENSOR_2P5D_ROW = "2p5d_row"
    TENSOR_2P5D_COL = "2p5d_col"
    TENSOR_2P5D_DEP = "2p5d_dep"
    TENSOR_2P5D_XZ = "2p5d_xz"

    # 3D tensor parallel groups (input/weight/output submeshes of the 3D mesh)
    TENSOR_3D = "tensor_3d"
    TENSOR_3D_INPUT = "tensor_3d_input"
    TENSOR_3D_WEIGHT = "tensor_3d_weight"
    TENSOR_3D_OUTPUT = "tensor_3d_output"

    # Expert parallel groups
    EXPERT = "expert"
def _set_jit_fusion_options():
    """Set PyTorch JIT layer fusion options.

    For torch >= 1.10 this routes fusion to nvFuser (fuser2): the profiling
    executor/mode are enabled while the CPU/GPU pointwise fusers, the
    TensorExpr fuser (fuser1), and autodiff subgraph inlining are disabled.
    For older torch versions it falls back to the legacy fuser by turning
    profiling off and re-enabling CPU/GPU fusion.

    NOTE(review): these are private ``torch._C`` hooks whose availability
    changes between torch releases (nvFuser support was removed in newer
    versions) -- confirm against the torch version the project pins.
    """
    # Parse numeric major/minor; works for versions like "1.13.1+cu117".
    TORCH_MAJOR = int(torch.__version__.split(".")[0])
    TORCH_MINOR = int(torch.__version__.split(".")[1])

    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
        # nv fuser
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(True)
        torch._C._jit_override_can_fuse_on_cpu(False)
        torch._C._jit_override_can_fuse_on_gpu(False)
        torch._C._jit_set_texpr_fuser_enabled(False)  # fuser1
        torch._C._jit_set_nvfuser_enabled(True)  # fuser2
        torch._C._debug_set_autodiff_subgraph_inlining(False)
    else:
        # legacy pytorch fuser
        torch._C._jit_set_profiling_mode(False)
        torch._C._jit_set_profiling_executor(False)
        torch._C._jit_override_can_fuse_on_cpu(True)
        torch._C._jit_override_can_fuse_on_gpu(True)
class Conv1D(nn.Module):
    """
    1D-convolutional layer as used by OpenAI GPT / GPT-2 (Radford et al.).

    Functionally a linear layer whose weight is stored transposed:
    ``weight`` has shape ``(nx, nf)`` and ``forward`` computes
    ``x @ weight + bias``.

    Args:
        nf (int): The number of output features.
        nx (int): The number of input features.
        skip_bias_add (bool): When True, the bias is not added inside
            ``forward``; ``(output, bias)`` is returned instead so callers
            can fuse the bias with other elementwise operations.

    References:
        https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
    """

    def __init__(self, nf, nx, skip_bias_add=False):
        super().__init__()
        self.nf = nf
        self.skip_bias_add = skip_bias_add

        weight = torch.empty(nx, nf)
        nn.init.normal_(weight, std=0.02)
        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(torch.zeros(nf))

    def forward(self, x):
        out_shape = x.size()[:-1] + (self.nf,)
        flat = x.view(-1, x.size(-1))
        if self.skip_bias_add:
            # Hand the bias back separately for downstream fusion.
            return torch.mm(flat, self.weight).view(out_shape), self.bias
        # addmm computes bias + flat @ weight in a single fused call.
        return torch.addmm(self.bias, flat, self.weight).view(out_shape)
def is_ddp_ignored(p):
    """Return True when parameter *p* has been flagged for DDP to skip."""
    return getattr(p, "_ddp_to_ignore", False)


def set_params_to_ignore(params_to_ignore: Iterable[torch.Tensor]) -> None:
    """Flag each given parameter so DDP skips it.

    Must be called before constructing DistributedDataParallel.

    Example:
        >>> ignored = [p for p in module.parameters() if should_ignore(p)]
        >>> set_params_to_ignore(ignored)
        >>> module = DistributedDataParallel(module)

    Args:
        params_to_ignore (Iterable[torch.Tensor]): Parameters to flag.
    """
    for param in params_to_ignore:
        # is_ddp_ignored reads this attribute back.
        param._ddp_to_ignore = True
39 | Variable._execution_engine.queue_callback(ctx.module._post_backward) 40 | return (None,) + grad_outputs 41 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/data_parallel/zero/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.data_parallel.zero.optim.optim import ( 2 | ZeroRedundancyOptimizer, 3 | ) 4 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.data_parallel import ( 5 | _HeteroDataParallel, 6 | ) 7 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.optim import ( 8 | _HeteroOptimizer, 9 | ) 10 | 11 | __ALL__ = [ 12 | "ZeroRedundancyOptimizer", 13 | "_HeteroDataParallel", 14 | "_HeteroOptimizer", 15 | ] 16 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/data_parallel/zero/hetero/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.data_parallel import ( 2 | _HeteroDataParallel, 3 | ) 4 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.optim import _HeteroOptimizer 5 | 6 | __ALL__ = ["_HeteroDataParallel", "_HeteroOptimizer"] 7 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/data_parallel/zero/hetero/chunk/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.chunk.chunk import ( 2 | Chunk, 3 | TensorState, 4 | ChunkFullError, 5 | ) 6 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.chunk.manager import ChunkManager 7 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.chunk.utils import ( 8 | init_chunk_manager, 9 | ) 10 | 11 | __ALL__ = [ 12 | "Chunk", 13 | "TensorState", 14 | "ChunkFullError", 15 | "ChunkManager", 16 | "init_chunk_manager", 17 | ] 18 | 
def get_current_device() -> torch.device:
    """Return the device new tensors should live on.

    The currently selected CUDA device when CUDA is available, otherwise
    the CPU.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu")
    return torch.device(f"cuda:{torch.cuda.current_device()}")


def get_temp_total_chunk_on_cuda(chunk: "Chunk") -> torch.Tensor:
    """All-gather *chunk*'s shards into one temporary on-device tensor.

    If the chunk is already gathered, its global tensor is returned as-is;
    otherwise a zero-initialized buffer of ``chunk.chunk_size`` elements is
    filled via ``all_gather`` over the chunk's process group.
    """
    if chunk.is_gathered:
        # Already materialized as one contiguous tensor; reuse it.
        return chunk.cuda_global_chunk

    device = get_current_device()
    if chunk.cuda_shard is not None:
        shard = chunk.cuda_shard
    else:
        # Shard was offloaded to CPU; move it on-device for the collective.
        shard = chunk.cpu_shard.to(device)

    gathered = torch.zeros(chunk.chunk_size, dtype=chunk.dtype, device=device)
    pieces = list(torch.chunk(input=gathered, chunks=chunk.pg_size, dim=0))
    dist.all_gather(tensor_list=pieces, tensor=shard, group=chunk.torch_pg)

    return gathered
from oslo.torch.nn.parallel.pipeline_parallel._sync import (
    register_location_for_forward_counter,
)


# original forward dictionary
# Maps module "location" string -> the module's pre-wrap forward callable.
_ORIGINAL_FORWARDS = dict()

# module device(local) locations
# Maps module "location" string -> the local device the module lives on.
_MODULE_DEVICE_LOCATIONS = dict()


def register_original_forward_function(location, func, device):
    """Record *func* as the original forward of the module at *location*,
    remember its *device*, and start the per-location forward counter."""
    _ORIGINAL_FORWARDS[location] = func
    _MODULE_DEVICE_LOCATIONS[location] = device
    register_location_for_forward_counter(location)


def get_original_forward_function(location):
    # Raises KeyError for locations that were never registered.
    return _ORIGINAL_FORWARDS[location]


def get_module_device_location(location):
    # Raises KeyError for locations that were never registered.
    return _MODULE_DEVICE_LOCATIONS[location]


# Activations
# Saved forward activations keyed by a per-invocation unique tag.
_ACTIVATIONS = dict()


def save_activation(key, activation):
    # Overwrites silently if *key* is already present.
    _ACTIVATIONS[key] = activation


def pop_activation(key):
    # NOTE(review): a missing key silently yields [] here; the commented-out
    # strict variant below would raise KeyError instead -- confirm which
    # behavior callers rely on.
    return _ACTIVATIONS.pop(key, [])  # TODO; okay?
    # return _ACTIVATIONS.pop(key)
import time

from oslo.torch.nn.parallel.pipeline_parallel._job import Input
from oslo.torch.nn.parallel.pipeline_parallel._types import (
    SyncNotification,
    SyncQueues,
)

import torch


# Process-wide queues of pending pipeline jobs.
QUEUES = SyncQueues()

# Process-wide notification channels for pipeline synchronization.
NOTIFICATIONS = SyncNotification()


def sleep():
    # Polling interval for the busy-wait in select_job.
    time.sleep(0.05)


def initialize_job(fn, is_grad_enabled, unique_key, out_queue, **kwargs):
    """Wrap *fn* and its call context into an ``Input`` job and enqueue it.

    Args:
        fn: Callable to execute for this pipeline stage.
        is_grad_enabled: Whether grad mode was enabled at enqueue time.
        unique_key: Tag from :func:`make_unique_key` identifying this call.
        out_queue: Queue that receives the job's result.
        **kwargs: Forwarded to the ``Input`` job.
    """
    job = Input(
        fn=fn,
        is_grad_enabled=is_grad_enabled,
        unique_key=unique_key,
        out_queue=out_queue,
        **kwargs,
    )

    register_job(job)


def register_job(job):
    # Jobs are kept in a set; duplicates collapse.
    QUEUES.JOBS.add(job)


# TODO; support TP
def select_job():
    """Busy-wait until a job is available, then pop the smallest one.

    Jobs are chosen via sorting; presumably ``Input`` defines ordering so
    this behaves like a priority queue -- TODO confirm against `_job.Input`.
    """
    while len(QUEUES.JOBS) <= 0:
        sleep()

    job = list(sorted(QUEUES.JOBS))[0]
    QUEUES.JOBS.remove(job)
    return job


# for unique tag generation
# Maps module "location" string -> number of forward calls seen so far.
_NUM_FORWARD_USED_COUNTER = dict()


def register_location_for_forward_counter(location):
    # Start counting forward invocations for this location at zero.
    _NUM_FORWARD_USED_COUNTER[location] = 0


def make_unique_key(location, from_, to_):
    """Return a ``(location, count, from_, to_)`` tag unique per forward call."""
    cnt = _NUM_FORWARD_USED_COUNTER[location]
    unique_key = (location, cnt, from_, to_)
    _NUM_FORWARD_USED_COUNTER[location] += 1
    return unique_key
in dfs(child, bfs_dict): 11 | yield c 12 | 13 | 14 | def bfs(node, bfs_dict=None): 15 | if bfs_dict is None: 16 | bfs_dict = {} 17 | if len(bfs_dict) == 0: 18 | list(dfs(node, bfs_dict)) 19 | for nodes in bfs_dict.values(): 20 | for node in nodes: 21 | yield node 22 | 23 | 24 | def post_order_traverse(node): 25 | for child in node.children: 26 | yield from post_order_traverse(child) 27 | yield node 28 | 29 | 30 | # from https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/scatter_gather.py#L12 31 | def _is_namedtuple(obj): 32 | # Check if type was created from collections.namedtuple or a typing.NamedTuple. 33 | return ( 34 | isinstance(obj, tuple) and hasattr(obj, "_asdict") and hasattr(obj, "_fields") 35 | ) 36 | 37 | 38 | def _is_primitive(obj): 39 | return not hasattr(obj, "__dict__") 40 | 41 | 42 | def _is_private(attr): 43 | return attr.startswith("__") 44 | -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_1d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_1d/__init__.py -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_2d/__init__.py -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_2p5d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_2p5d/__init__.py 
-------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/_3d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/torch/nn/parallel/tensor_parallel/_3d/__init__.py -------------------------------------------------------------------------------- /oslo/torch/nn/parallel/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.nn.parallel.tensor_parallel.mapping import ( 2 | Column, 3 | Row, 4 | Update, 5 | Head, 6 | Other, 7 | ) 8 | from oslo.torch.nn.parallel.tensor_parallel.tensor_parallel import TensorParallel 9 | 10 | __ALL__ = [TensorParallel, Column, Row, Update, Head, Other] 11 | -------------------------------------------------------------------------------- /oslo/torch/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.optim.fused_adagrad import FusedAdagrad 2 | from oslo.torch.optim.fused_adam import FusedAdam 3 | from oslo.torch.optim.fused_lamb import FusedLamb 4 | from oslo.torch.optim.fused_mixed_precision_lamb import FusedMixedPrecisionLamb 5 | from oslo.torch.optim.fused_novograd import FusedNovograd 6 | from oslo.torch.optim.fused_sgd import FusedSGD 7 | -------------------------------------------------------------------------------- /oslo/torch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from oslo.torch.utils.version import torch_version 2 | from oslo.torch.utils.common import get_free_port 3 | from oslo.torch.utils.random import set_seed 4 | from oslo.torch.utils.logging import get_dist_logger 5 | 6 | __all__ = ["get_free_port", "set_seed", "get_dist_logger"] 7 | -------------------------------------------------------------------------------- 
def get_free_port() -> int:
    """Get a free port on localhost.

    Randomly probes ports in ``[20000, 65000)`` until one binds
    successfully.

    Returns:
        int: A free port on localhost.
    """
    while True:
        candidate = random.randrange(20000, 65000)
        sock = socket.socket()
        try:
            # Allow immediate reuse so the probe doesn't linger in TIME_WAIT.
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(("localhost", candidate))
            return candidate
        except OSError:
            # Port already taken; probe another one.
            continue
        finally:
            sock.close()
class MultiTensorApply(object):
    """Callable wrapper that forwards a fixed chunk size to fused multi-tensor ops.

    Instances simply prepend their configured ``chunk_size`` to every call,
    so fused kernels receive a consistent per-launch element count.
    """

    def __init__(self, chunk_size):
        # Number of elements each fused kernel launch should process.
        self.chunk_size = chunk_size

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        """Invoke *op* as ``op(chunk_size, noop_flag_buffer, tensor_lists, *args)``
        and return whatever it returns."""
        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
def torch_version(version: str = torch.__version__) -> Tuple[int, ...]:
    """Parse a PyTorch version string into a ``(major, minor, patch)`` tuple.

    Args:
        version (str): Version string to parse; defaults to the installed
            ``torch.__version__``.

    Returns:
        Tuple[int, ...]: ``(major, minor, patch)`` on success, or an empty
        tuple when *version* does not look like a release number.
    """
    # NOTE: the dots are escaped -- the previous pattern used bare ".",
    # which matches any character, so e.g. "1x8y0" parsed as (1, 8, 0).
    numbering = re.search(r"^(\d+)\.(\d+)\.(\d+)([^\+]*)(\+\S*)?$", version)
    if not numbering:
        return tuple()
    # Catch torch version if run against internal pre-releases, like `1.8.0a0fb`,
    if numbering.group(4):
        # Two options here:
        # - either skip this version (minor number check is not relevant)
        # - or check that our codebase is not broken by this ongoing development.

        # Assuming that we're interested in the second use-case more than the first,
        # return the pre-release or dev numbering
        logging.warning(
            f"Pytorch pre-release version {version} - assuming intent to test it"
        )

    return tuple(int(numbering.group(n)) for n in range(1, 4))
oslo.transformers.models.gpt2.modeling_gpt2 import ( 2 | GPT2Model, 3 | GPT2LMHeadModel, 4 | GPT2ForSequenceClassification, 5 | GPT2ForTokenClassification, 6 | ) 7 | 8 | # from oslo.transformers.trainer import Trainer 9 | -------------------------------------------------------------------------------- /oslo/transformers/models/albert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/albert/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/bart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/bart/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/bert/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/distilbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/distilbert/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/electra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/electra/__init__.py 
-------------------------------------------------------------------------------- /oslo/transformers/models/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/gpt2/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/mbart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/mbart/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/mt5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/mt5/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/roberta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/roberta/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/models/t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/models/t5/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/tasks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/tasks/__init__.py -------------------------------------------------------------------------------- /oslo/transformers/tasks/data_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Optional 3 | 4 | # from datasets.arrow_dataset import Batch 5 | 6 | try: 7 | from transformers import PreTrainedTokenizerBase 8 | except ImportError: 9 | print("You have to install `transformers` to use `oslo.transformers` modules") 10 | 11 | 12 | class BaseProcessor(ABC): 13 | def __init__(self, tokenizer: PreTrainedTokenizerBase, max_seq_length: int) -> None: 14 | self._tokenizer = tokenizer 15 | self._max_seq_length = max_seq_length 16 | self._chunk_size = max_seq_length - 1 17 | self._buffer = [] 18 | 19 | def save_tokenizer(self, path: str) -> None: 20 | self._tokenizer.save_pretrained(path) 21 | 22 | @abstractmethod 23 | # def __call__(self, examples: Batch) -> Dict[str, List[int]]: 24 | def __call__(self, examples) -> Dict[str, List[int]]: 25 | pass 26 | 27 | 28 | def pad_labels( 29 | labels, 30 | tokenizer, 31 | label_pad_token_id: int, 32 | pad_to_multiple_of: Optional[int] = None, 33 | ): 34 | labels = tokenizer.pad( 35 | {"input_ids": labels}, 36 | padding=True, 37 | return_attention_mask=False, 38 | return_tensors="pt", 39 | pad_to_multiple_of=pad_to_multiple_of, 40 | )["input_ids"] 41 | 42 | labels.masked_fill_(labels == tokenizer.pad_token_id, label_pad_token_id) 43 | return labels 44 | -------------------------------------------------------------------------------- /oslo/transformers/tasks/loading/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/oslo/transformers/tasks/loading/__init__.py 
import logging
import os
import numpy as np
import torch
import time

try:
    from transformers.utils import ExplicitEnum
except ImportError:
    # Keep this module importable without `transformers`, mirroring the
    # try/except import pattern used elsewhere in oslo.transformers
    # (e.g. tasks/data_base.py). Minimal re-implementation of the same class.
    from enum import Enum

    class ExplicitEnum(str, Enum):
        """Enum with an explicit error message for missing values."""

        @classmethod
        def _missing_(cls, value):
            raise ValueError(
                f"{value} is not a valid {cls.__name__}, "
                f"please select one of {list(cls._value2member_map_.keys())}"
            )


class SchedulerType(ExplicitEnum):
    """Acceptable string identifiers for learning-rate schedulers."""

    LINEAR = "linear"
    COSINE = "cosine"
    COSINE_WITH_RESTARTS = "cosine_with_restarts"
    POLYNOMIAL = "polynomial"
    CONSTANT = "constant"
    CONSTANT_WITH_WARMUP = "constant_with_warmup"


class OptimizerNames(ExplicitEnum):
    """
    Stores the acceptable string identifiers for optimizers.
    """

    ADAM = "adam"
    ADAMW = "adamw"
    ADAGRAD = "adagrad"
    ADADELTA = "adadelta"
    ADAFACTOR = "adafactor"
    ADAMW_BNB = "adamw_bnb_8bit"
    SGD = "sgd"
    NOVOGRAD = "novograd"
    LAMB = "lamb"


def log_dist(message: str, rank: int = 0, level: int = logging.INFO) -> None:
    """Log ``message`` only on the selected distributed rank(s).

    Args:
        message: text to log; prefixed with ``[Rank N]``.
        rank: rank that should emit the message, or -1 for all ranks
            (requires the ``WORLD_SIZE`` environment variable).
        level: standard ``logging`` level constant.
    """
    if rank == -1:
        ranks = list(range(int(os.environ["WORLD_SIZE"])))
    else:
        ranks = [rank]
    # Processes launched outside torch.distributed default to rank 0.
    my_rank = int(os.environ.get("RANK", "0"))
    if my_rank in ranks:
        # logging.log accepts any level; the previous if-chain silently
        # dropped messages at levels other than INFO/WARNING/ERROR/DEBUG
        # (e.g. logging.CRITICAL).
        logging.log(level, f"[Rank {my_rank}] {message}")
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:isort] 2 | multi_line_output = 3 3 | line_length = 79 4 | include_trailing_comma = True 5 | 6 | [flake8] 7 | ignore = E203, E501, E731, E741, W503, W504, W605, PAI100, PAI101, PAI201, PAI202, PAI203, B009, B010, F401, F403 8 | max-line-length = 119 9 | -------------------------------------------------------------------------------- /tests/test_script/run_inference.sh: -------------------------------------------------------------------------------- 1 | ## inference shell code 2 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 bert-base-cased masked-lm `` 3 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 ishan/bert-base-uncased-mnli sequence-classification `` 4 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 gpt2 causal-lm `` 5 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 EleutherAI/gpt-neo-1.3B causal-lm `` 6 | # EXAMPLE: ``sh ./tests/test_script/run_inference.sh 4 t5-base seq2seq-lm `` 7 | 8 | NUM_GPUS=$1 9 | MODEL=$2 10 | TASK=$3 11 | 12 | python -m torch.distributed.launch \ 13 | --nproc_per_node="$NUM_GPUS" \ 14 | ./tests/inference.py \ 15 | --task=$TASK \ 16 | --model=$MODEL \ 17 | --tensor_parallel_size="$NUM_GPUS" 18 | -------------------------------------------------------------------------------- /tests/test_script/run_merge.sh: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # If you use only two gpu example 3 | # Checkpoint directory : tests/ckpt/checkpoint_0 4 | # saved merge directory: tests/ckpt/checkpoint_0_merge 5 | ########################################################### 6 | 7 | # EXAMPLE merge TP case BERT:`sh ./tests/test_script/run_merge.sh ishan/bert-base-uncased-mnli sequence-classification 2 1 1 2 1` 8 | 9 | # EXAMPLE merge 
TP case GPT:`sh ./tests/test_script/run_merge.sh gpt2 causal-lm 2 1 1 2 1` 10 | 11 | # EXAMPLE merge TP case T5:`sh ./tests/test_script/run_merge.sh t5-base seq2seq 2 1 1 2 1` 12 | 13 | 14 | MODEL=$1 15 | TASK=$2 16 | 17 | NUM_GPUS=$3 18 | DATA_PARALLEL_SIZE=$4 19 | PIPELINE_PARALLEL_SIZE=$5 20 | TENSOR_PARALLEL_SIZE=$6 21 | TENSOR_PARALLEL_DEPTH=$7 22 | 23 | # tensor parallel mode 24 | # "1D", "2D", "2D_ROW", "2D_COL", "2P5D", "2P5D_ROW", "2P5D_COL" 25 | # "2P5D_DEP", "2P5D_XZ", "3D", "3D_INPUT", "3D_WEIGHT", "3D_OUTPUT" 26 | TENSOR_PARALLEL_MODE=1D 27 | MERGE_DIR=tests/ckpt/checkpoint_0 28 | 29 | run_cmd="torchrun --standalone --nproc_per_node=${NUM_GPUS} \ 30 | ./tests/merge.py \ 31 | --task=$TASK \ 32 | --model=$MODEL \ 33 | --tensor_parallel_size=$TENSOR_PARALLEL_SIZE \ 34 | --data_parallel_size=$DATA_PARALLEL_SIZE \ 35 | --pipeline_parallel_size=$PIPELINE_PARALLEL_SIZE \ 36 | --tensor_parallel_mode=$TENSOR_PARALLEL_MODE \ 37 | --tensor_parallel_depth=$TENSOR_PARALLEL_DEPTH \ 38 | --merge_dir=$MERGE_DIR 39 | " 40 | 41 | echo ${run_cmd} 42 | eval ${run_cmd} 43 | -------------------------------------------------------------------------------- /tests/util/arg_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--local-rank", default=0, type=int) 7 | # parser.add_argument("--config", required=True, type=str) 8 | parser.add_argument("--task", required=True, type=str) 9 | parser.add_argument("--model", required=True, type=str) 10 | parser.add_argument("--tokenizer", default=None, type=str) 11 | parser.add_argument("--batch_size", required=False, type=int) 12 | parser.add_argument("--sequence_length", required=False, type=int) 13 | parser.add_argument("--train_step", required=False, type=int) 14 | parser.add_argument("--save_interval", required=False, type=int) 15 | parser.add_argument("--tensor_parallel_size", 
default=1, type=int) 16 | parser.add_argument("--data_parallel_size", default=1, type=int) 17 | parser.add_argument("--pipeline_parallel_size", default=1, type=int) 18 | parser.add_argument("--tensor_parallel_depth", default=1, type=int) 19 | parser.add_argument("--epoch", default=1, type=int) 20 | parser.add_argument("--tensor_parallel_mode", default="1D", type=str) 21 | parser.add_argument("--merge_dir", required=False, type=str) 22 | args = parser.parse_args() 23 | return args 24 | -------------------------------------------------------------------------------- /tests/util/oslo.py: -------------------------------------------------------------------------------- 1 | import oslo 2 | from oslo.torch.distributed.parallel_context import ParallelContext, ParallelMode 3 | from oslo.torch.nn.parallel import TensorParallel, PipelineParallel 4 | 5 | 6 | def initialize_oslo(args, model): 7 | try: 8 | pc = ParallelContext.from_torch( 9 | data_parallel_size=args.data_parallel_size, 10 | pipeline_parallel_size=args.pipeline_parallel_size, 11 | tensor_parallel_size=args.tensor_parallel_size, 12 | tensor_parallel_depth=args.tensor_parallel_depth, 13 | tensor_parallel_mode={ 14 | "1D": ParallelMode.TENSOR_1D, 15 | "2D": ParallelMode.TENSOR_2D, 16 | "2P5D": ParallelMode.TENSOR_2P5D, 17 | "3D": ParallelMode.TENSOR_3D, 18 | }[args.tensor_parallel_mode], 19 | ) 20 | 21 | if pc.get_world_size(ParallelMode.TENSOR) > 1: 22 | model = TensorParallel(model, pc) 23 | if pc.get_world_size(ParallelMode.PIPELINE) > 1: 24 | model = PipelineParallel(model, pc) 25 | oslo.ready(model, pc) 26 | 27 | except Exception as e: 28 | print(e) 29 | pc = None 30 | model = model.cuda() 31 | 32 | return model, pc 33 | 34 | 35 | def print_rank_0(message, pc): 36 | if pc is None: 37 | print(message) 38 | elif pc.get_global_rank() == 0: 39 | print(f"Rank :{pc.get_global_rank()}") 40 | print(message) 41 | -------------------------------------------------------------------------------- 
# tests_deprecated/__init__.py
# NOTE(review): deprecated smoke check kept for reference — importing this
# package constructs an empty Seq2SeqModelOutput and prints it as an
# import-time side effect, which is unusual for an __init__.py; anything
# importing `tests_deprecated` pays this cost.
from transformers.modeling_outputs import Seq2SeqModelOutput

# Instantiating with no arguments yields an output object whose fields are
# all None; printing it shows the default field layout.
a = Seq2SeqModelOutput()
print(a)
-------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/data_parallel/zero/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/heterogeneous_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/data_parallel/zero/heterogeneous_manager/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/heterogeneous_manager/test_mem_monitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from oslo.torch.nn.parallel.data_parallel.zero.hetero.memory_tracer.memory_monitor import ( 5 | SyncCudaMemoryMonitor, 6 | ) 7 | 8 | 9 | class TestSyncCudaMemoryMonitor(unittest.TestCase): 10 | @patch("torch.cuda.synchronize") 11 | @patch("torch.cuda.reset_peak_memory_stats") 12 | @patch("torch.cuda.max_memory_allocated", return_value=1024) 13 | def test_sync_cuda_memory_monitor_methods( 14 | self, mock_max_memory_allocated, mock_reset_peak_memory_stats, mock_synchronize 15 | ): 16 | # Create a SyncCudaMemoryMonitor instance 17 | sync_cuda_mem_monitor = SyncCudaMemoryMonitor() 18 | 19 | # Test the start method 20 | sync_cuda_mem_monitor.start() 21 | mock_synchronize.assert_called_once() 22 | mock_reset_peak_memory_stats.assert_called_once() 23 | 24 | # Test the finish method 25 | max_usage = sync_cuda_mem_monitor.finish() 26 | 
self.assertIsInstance(max_usage, int) 27 | self.assertEqual(max_usage, 1024) # The mock max_memory_allocated returns 1024 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/data_parallel/zero/sharded_optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/data_parallel/zero/sharded_optim/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/expert_parallel/gpt2/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | import torch 6 | import torch.backends.cudnn as cudnn 7 | 8 | import deepspeed.comm as dist 9 | 10 | 11 | def create_config_from_dict(tmpdir, config_dict): 12 | config_path = os.path.join(tmpdir, "temp_config.json") 13 | with open(config_path, "w") as fd: 14 | json.dump(config_dict, fd) 15 | return config_path 16 | 17 | 18 | def create_deepspeed_args(): 19 | parser = argparse.ArgumentParser() 20 | args = parser.parse_args(args="") 21 | args.deepspeed = True 22 | if dist.is_initialized(): 23 | # We assume up to one full node executing unit tests 24 | assert dist.get_world_size() <= torch.cuda.device_count() 25 | args.local_rank = dist.get_rank() 26 | return args 27 | 28 | 29 | def args_from_dict(tmpdir, config_dict): 30 | args = create_deepspeed_args() 31 | config_path = create_config_from_dict(tmpdir, config_dict) 32 | args.deepspeed_config = config_path 33 | return args 34 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/compare_pp_nopp.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def main(): 7 | file_names = os.listdir("tmp2/") 8 | no_pp_names = sorted([fn for fn in file_names if "no_pp" in fn]) 9 | 10 | print(len(no_pp_names)) 11 | 12 | diff_cnt = 0 13 | for no_pp_name in no_pp_names: 14 | pp_name = no_pp_name.replace("no_pp", "pp") 15 | 16 | pp_path = os.path.join("tmp2", pp_name) 17 | no_pp_path = os.path.join("tmp2", no_pp_name) 18 | 19 | pp_data = torch.load(pp_path, map_location="cpu") 20 | no_pp_data = torch.load(no_pp_path, map_location="cpu") 21 | 22 | if not torch.allclose(pp_data, no_pp_data): 23 | # print(torch.abs(pp_data - no_pp_data)) 24 | # print(pp_name) 25 | 26 | diff_cnt += 1 27 | 28 | # break 29 | 30 | print(diff_cnt) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/compare_pptp_trial.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | import torch 5 | 6 | 7 | main_dir = "tmp2" 8 | 9 | compair_dirs = ["tmp3"] 10 | 11 | 12 | def main(): 13 | file_names = os.listdir(f"{main_dir}/") 14 | 15 | diff_names = set() 16 | same_names = set() 17 | for name in file_names: 18 | left_path = os.path.join(main_dir, name) 19 | left = torch.load(left_path, map_location="cpu") 20 | 21 | for rd in compair_dirs: 22 | right_path = left_path.replace(main_dir, rd) 23 | right = torch.load(right_path, map_location="cpu") 24 
| 25 | if not torch.allclose(left, right): 26 | diff_names.add(name) 27 | else: 28 | same_names.add(name) 29 | 30 | print("Names with difference gradient: ") 31 | for dn in diff_names: 32 | print(dn) 33 | 34 | print(f"{len(diff_names)} / {len(file_names)}") 35 | 36 | print("Names with same gradient: ") 37 | for sn in same_names: 38 | print(sn) 39 | 40 | print(f"{len(same_names)} / {len(file_names)}") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/compare_send_recv.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | 6 | def main(): 7 | file_names = os.listdir("tmp/") 8 | send_names = [fn for fn in file_names if "send" in fn] 9 | 10 | for send_name in send_names: 11 | recv_name = send_name.replace("send", "recv") 12 | 13 | send_path = os.path.join("tmp", send_name) 14 | recv_path = os.path.join("tmp", recv_name) 15 | 16 | send_data = torch.load(send_path, map_location="cpu") 17 | recv_data = torch.load(recv_path, map_location="cpu") 18 | 19 | assert send_data["__KEY__"] == recv_data["__KEY__"] 20 | assert send_data["__META__"] == recv_data["__META__"] 21 | 22 | assert send_data["__VALUE__"]["stub"] == recv_data["__VALUE__"]["stub"] 23 | 24 | send_data = send_data["__VALUE__"]["tensors"] 25 | recv_data = recv_data["__VALUE__"]["tensors"] 26 | 27 | for x, y in zip(send_data, recv_data): 28 | assert torch.allclose(x, y, atol=1e-16), send_name 29 | assert x.dtype == y.dtype, send_name 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/test_p2p.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | from oslo.torch.distributed import 
ParallelContext 4 | from oslo.torch.distributed.nn.functional import send, recv 5 | 6 | parallel_context = ParallelContext.from_torch(pipeline_parallel_size=2) 7 | 8 | example_data = [ 9 | True, 10 | None, 11 | 1, 12 | 2.3, 13 | "안녕", 14 | {"xx": "yy"}, 15 | {"1", "2", "3"}, 16 | (1, 2, 3), 17 | complex(1, 2), 18 | [1, 2, [1, 2, {"1": "x", "2": (1, 2, {3})}]], 19 | ] 20 | 21 | send(example_data, src_rank=0, dst_rank=1, parallel_context=parallel_context) 22 | data = recv(src_rank=0, dst_rank=1, parallel_context=parallel_context) 23 | 24 | if dist.get_rank() == 1: 25 | print(data) 26 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/pipeline_parallel/test_partioning.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from transformers import T5ForConditionalGeneration 4 | 5 | from oslo.torch.distributed import ParallelContext 6 | from oslo.torch.nn.parallel.pipeline_parallel.pipeline_parallel import _PipelineParallel 7 | from oslo.torch.nn.parallel.utils import parallelize 8 | 9 | parallel_context = ParallelContext.from_torch(pipeline_parallel_size=8) 10 | model = T5ForConditionalGeneration.from_pretrained("t5-large") 11 | 12 | wrapper_pp = _PipelineParallel(model, parallel_context=parallel_context) 13 | parallelize(wrapper_pp, parallel_context) 14 | 15 | for rank in range(dist.get_world_size()): 16 | if dist.get_rank() == rank: 17 | print(f"RANK: {rank}:") 18 | num_params = 0 19 | for name, param in wrapper_pp.named_parameters(): 20 | if param.device != torch.device("cpu"): 21 | # print(f"> {name}: {param.device}") 22 | num_params += param.numel() 23 | print(f"RANK {rank} params: {num_params}") 24 | dist.barrier() 25 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/1d/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/1d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/1d/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from oslo.torch.distributed import ParallelMode 5 | 6 | 7 | def split_1d(tensor, world_size, dim, parallel_context): 8 | tensor = tensor.chunk(world_size, dim=dim)[ 9 | parallel_context.get_local_rank(ParallelMode.TENSOR_1D) 10 | ] 11 | return tensor 12 | 13 | 14 | def gather_1d(tensor, world_size, dim, parallel_context): 15 | tensor_list = [torch.zeros_like(tensor) for _ in range(world_size)] 16 | dist.all_gather( 17 | tensor_list, 18 | tensor.contiguous(), 19 | parallel_context.get_group(ParallelMode.TENSOR_1D), 20 | ) 21 | tensor = torch.cat(tensor_list, dim=dim) 22 | return tensor 23 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/1d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. 
import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/2d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2p5d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/2p5d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/2p5d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. 
import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/3d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/3d/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/3d/deparallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .. import _utils 2 | 3 | _ALL__ = [_utils] 4 | -------------------------------------------------------------------------------- /tests_deprecated/torch/nn/parallel/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/nn/parallel/tensor_parallel/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/utils/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/utils/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/torch/utils/data/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/torch/utils/data/test_data_collators.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from oslo.torch.distributed import ParallelContext 4 | from oslo.torch.utils.data import SequenceParallelCollator 5 | 6 | parallel_context = ParallelContext.from_torch(sequence_parallel_size=4) 7 | 8 | data = { 9 | "input_ids": torch.randn(16, 129).cuda(), 10 | "attention_mask": torch.ones(16, 129).cuda(), 11 | } 12 | 13 | collator = SequenceParallelCollator( 14 | parallel_context=parallel_context, 15 | parallel_keys=["input_ids", "attention_mask"], 16 | pad_token_id=99, 17 | ) 18 | 19 | sharded = collator(**data) 20 | print(sharded["input_ids"].size()) 21 | -------------------------------------------------------------------------------- /tests_deprecated/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/oslo/d7c4e32e766a99cc9d56533bc090570360dc8b2a/tests_deprecated/transformers/__init__.py -------------------------------------------------------------------------------- /tests_deprecated/transformers/test_kernel_fusion_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | AutoTokenizer, 3 | AutoModelForSeq2SeqLM, 4 | ) 5 | 6 | from oslo.transformers.kernel_fusion_utils import fused_no_repeat_ngram_logits_processor 7 | 8 | tokenizer = AutoTokenizer.from_pretrained("t5-base") 9 | model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to("cuda") 10 | 11 | output = model.generate( 12 | **tokenizer("hello", return_tensors="pt").to("cuda"), no_repeat_ngram_size=2 13 | ) 14 | print(tokenizer.decode(output[0])) 15 | 16 | fused_no_repeat_ngram_logits_processor(model) 17 | 18 | output = model.generate( 19 | **tokenizer("hello", return_tensors="pt").to("cuda"), no_repeat_ngram_size=2 20 | ) 21 | print(tokenizer.decode(output[0])) 22 | 
-------------------------------------------------------------------------------- /tests_deprecated/transformers/trainer/oslo_user_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_parallelism": { 3 | "enable": true, 4 | "parallel_size": 1, 5 | "zero_stage": 0 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /tests_deprecated/transformers/trainer/test_oslo_config.py: -------------------------------------------------------------------------------- 1 | from oslo.transformers.oslo_init import OsloTrainerConfig, init_oslo_features 2 | 3 | 4 | oslo_init_dict_form = { 5 | "data_parallelism": { 6 | "enable": True, 7 | "parallel_size": 1, 8 | "zero_stage": 0, 9 | }, 10 | "tensor_parallelism": { 11 | "enable": True, 12 | "parallel_size": 1, 13 | "parallel_mode": "1d", 14 | }, 15 | "sequence_parallelism": {"enable": True, "parallel_size": 1}, 16 | } 17 | 18 | user_config_from_dict = OsloTrainerConfig(oslo_init_dict_form) 19 | 20 | user_config_from_json = OsloTrainerConfig("oslo_user_config.json") 21 | 22 | print(user_config_from_dict) 23 | 24 | res = init_oslo_features(user_config_from_dict) 25 | 26 | print(res) 27 | --------------------------------------------------------------------------------