├── .clang-format ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── Security.md ├── examples ├── LICENSE ├── README.md ├── generate_config.sh ├── generate_hostfile.sh ├── gpt.sh ├── hostfile_deepspeed ├── hostfile_mpich ├── llm_inference.sh ├── run10p175b.sh ├── run175b.sh ├── run20b.sh ├── run3.6b.sh └── run_llama.sh ├── intel_extension_for_deepspeed ├── __init__.py ├── op_builder │ ├── __init__.py │ ├── async_io.py │ ├── builder.py │ ├── cpu_adagrad.py │ ├── cpu_adam.py │ ├── csrc │ │ ├── adagrad │ │ │ └── cpu_adagrad.cpp │ │ ├── adam │ │ │ ├── cpu_adam.cpp │ │ │ ├── cpu_adam_impl.cpp │ │ │ ├── fused_adam_frontend.cpp │ │ │ ├── multi_tensor_adam.dp.cpp │ │ │ └── multi_tensor_apply.dp.hpp │ │ ├── common │ │ │ └── custom_cuda_kernel.dp.cpp │ │ ├── flash_attn │ │ │ ├── flash_attn.dp.cpp │ │ │ ├── flash_attn.h │ │ │ ├── fmha_bwd.cpp │ │ │ ├── fmha_bwd_utils.h │ │ │ ├── fmha_fwd.cpp │ │ │ ├── fmha_policy.h │ │ │ ├── fmha_utils.h │ │ │ └── mha.h │ │ ├── includes │ │ │ ├── StopWatch.h │ │ │ ├── compat.h │ │ │ ├── context.h │ │ │ ├── conversion_utils.h │ │ │ ├── cpu_adagrad.h │ │ │ ├── cpu_adam.h │ │ │ ├── cublas_wrappers.h │ │ │ ├── custom_cuda_layers.h │ │ │ ├── dequantization_utils.h │ │ │ ├── dpct │ │ │ │ ├── atomic.hpp │ │ │ │ ├── blas_utils.hpp │ │ │ │ ├── ccl_utils.hpp │ │ │ │ ├── device.hpp │ │ │ │ ├── dnnl_utils.hpp │ │ │ │ ├── dpct.hpp │ │ │ │ ├── dpl_utils.hpp │ │ │ │ ├── fft_utils.hpp │ │ │ │ ├── image.hpp │ │ │ │ ├── kernel.hpp │ │ │ │ ├── lapack_utils.hpp │ │ │ │ ├── lib_common_utils.hpp │ │ │ │ ├── math.hpp │ │ │ │ ├── memory.hpp │ │ │ │ ├── rng_utils.hpp │ │ │ │ ├── sparse_utils.hpp │ │ │ │ └── util.hpp │ │ │ ├── ds_kernel_utils.h │ │ │ ├── gemm_test.h │ │ │ ├── memory_access_utils.h │ │ │ ├── quantization.h │ │ │ ├── quantization_utils.h │ │ │ ├── reduction_utils.h │ │ │ ├── simd.h │ │ │ └── type_shim.h │ │ ├── quantization │ │ │ ├── dequantize.dp.cpp │ │ │ ├── fake_quantizer.dp.cpp │ │ │ ├── pt_binding.cpp │ │ │ ├── quant_reduce.dp.cpp │ │ │ ├── quantize.dp.cpp │ │ │ ├── quantize_intX.dp.cpp │ │ │ └── swizzled_quantize.dp.cpp │ │ └── transformer │ │ │ └── inference │ │ │ ├── csrc │ │ │ ├── apply_rotary_pos_emb.dp.cpp │ │ │ ├── dequantize.dp.cpp │ │ │ ├── gelu.dp.cpp │ │ │ ├── layer_norm.dp.cpp │ │ │ ├── pointwise_ops.dp.cpp │ │ │ ├── pt_binding.cpp │ │ │ ├── relu.dp.cpp │ │ │ ├── rms_norm.dp.cpp │ │ │ ├── softmax.dp.cpp │ │ │ └── transform.dp.cpp │ │ │ └── includes │ │ │ ├── inference_context.h │ │ │ ├── inference_cublas_wrappers.h │ │ │ └── inference_cuda_layers.h │ ├── flash_attn.py │ ├── fused_adam.py │ ├── quantizer.py │ ├── transformer_inference.py │ └── utils.py └── xpu_accelerator.py ├── requirements.txt └── setup.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # Refer to the following link for the explanation of each params: 3 | # http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html 4 | Language: Cpp 5 | # BasedOnStyle: Google 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AlignConsecutiveAssignments: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: true 12 | AlignTrailingComments: true 13 | AllowAllParametersOfDeclarationOnNextLine: false 14 | AllowShortBlocksOnASingleLine: true 15 | AllowShortCaseLabelsOnASingleLine: true 16 | AllowShortFunctionsOnASingleLine: All 17 | AllowShortIfStatementsOnASingleLine: true 18 | AllowShortLoopsOnASingleLine: 
true 19 | # This is deprecated 20 | AlwaysBreakAfterDefinitionReturnType: None 21 | AlwaysBreakAfterReturnType: None 22 | AlwaysBreakBeforeMultilineStrings: true 23 | AlwaysBreakTemplateDeclarations: true 24 | BinPackArguments: false 25 | BinPackParameters: false 26 | BraceWrapping: 27 | AfterClass: false 28 | AfterControlStatement: false 29 | AfterEnum: false 30 | AfterFunction: false 31 | AfterNamespace: false 32 | AfterObjCDeclaration: false 33 | AfterStruct: false 34 | AfterUnion: false 35 | AfterExternBlock: false 36 | BeforeCatch: false 37 | BeforeElse: false 38 | IndentBraces: false 39 | # disabling the below splits, else, they'll just add to the vertical length of source files! 40 | SplitEmptyFunction: false 41 | SplitEmptyRecord: false 42 | SplitEmptyNamespace: false 43 | BreakBeforeBinaryOperators: None 44 | BreakBeforeBraces: WebKit 45 | BreakBeforeInheritanceComma: false 46 | BreakInheritanceList: BeforeColon 47 | BreakBeforeTernaryOperators: true 48 | BreakConstructorInitializersBeforeComma: false 49 | BreakConstructorInitializers: BeforeColon 50 | BreakAfterJavaFieldAnnotations: false 51 | BreakStringLiterals: true 52 | ColumnLimit: 100 53 | CommentPragmas: '^ IWYU pragma:' 54 | CompactNamespaces: false 55 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 56 | # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform 57 | ConstructorInitializerIndentWidth: 4 58 | ContinuationIndentWidth: 4 59 | Cpp11BracedListStyle: true 60 | DerivePointerAlignment: false 61 | DisableFormat: false 62 | ExperimentalAutoDetectBinPacking: false 63 | FixNamespaceComments: true 64 | ForEachMacros: 65 | - foreach 66 | - Q_FOREACH 67 | - BOOST_FOREACH 68 | IncludeBlocks: Preserve 69 | IncludeCategories: 70 | - Regex: '^' 71 | Priority: 2 72 | - Regex: '^<.*\.h>' 73 | Priority: 1 74 | - Regex: '^<.*' 75 | Priority: 2 76 | - Regex: '.*' 77 | Priority: 3 78 | IncludeIsMainRegex: '([-_](test|unittest))?$' 79 | IndentCaseLabels: true 80 | IndentPPDirectives: None 81 | IndentWidth: 4 82 | IndentWrappedFunctionNames: false 83 | JavaScriptQuotes: Leave 84 | JavaScriptWrapImports: true 85 | KeepEmptyLinesAtTheStartOfBlocks: false 86 | MacroBlockBegin: '' 87 | MacroBlockEnd: '' 88 | MaxEmptyLinesToKeep: 1 89 | NamespaceIndentation: None 90 | ObjCBinPackProtocolList: Never 91 | ObjCBlockIndentWidth: 4 92 | ObjCSpaceAfterProperty: false 93 | ObjCSpaceBeforeProtocolList: true 94 | PenaltyBreakAssignment: 4 95 | PenaltyBreakBeforeFirstCallParameter: 1 96 | PenaltyBreakComment: 300 97 | PenaltyBreakFirstLessLess: 120 98 | PenaltyBreakString: 1000 99 | PenaltyBreakTemplateDeclaration: 10 100 | PenaltyExcessCharacter: 1000000 101 | PenaltyReturnTypeOnItsOwnLine: 200 102 | PointerAlignment: Left 103 | RawStringFormats: 104 | - Language: Cpp 105 | Delimiters: 106 | - cc 107 | - CC 108 | - cpp 109 | - Cpp 110 | - CPP 111 | - 'c++' 112 | - 'C++' 113 | CanonicalDelimiter: '' 114 | - Language: TextProto 115 | Delimiters: 116 | - pb 117 | - PB 118 | - proto 119 | - PROTO 120 | EnclosingFunctions: 121 | - EqualsProto 122 | - EquivToProto 123 | - PARSE_PARTIAL_TEXT_PROTO 124 | - PARSE_TEST_PROTO 125 | - PARSE_TEXT_PROTO 126 | - ParseTextOrDie 127 | - ParseTextProtoOrDie 128 | CanonicalDelimiter: '' 129 | BasedOnStyle: google 130 | # Enabling comment reflow causes doxygen comments to be messed up in their formats! 
131 | ReflowComments: true 132 | SortIncludes: true 133 | SortUsingDeclarations: true 134 | SpaceAfterCStyleCast: false 135 | SpaceAfterTemplateKeyword: true 136 | SpaceBeforeAssignmentOperators: true 137 | SpaceBeforeCpp11BracedList: false 138 | SpaceBeforeCtorInitializerColon: true 139 | SpaceBeforeInheritanceColon: true 140 | SpaceBeforeParens: ControlStatements 141 | SpaceBeforeRangeBasedForLoopColon: true 142 | SpaceInEmptyParentheses: false 143 | SpacesBeforeTrailingComments: 2 144 | SpacesInAngles: false 145 | SpacesInContainerLiterals: true 146 | SpacesInCStyleCastParentheses: false 147 | SpacesInParentheses: false 148 | SpacesInSquareBrackets: false 149 | Standard: Cpp11 150 | StatementMacros: 151 | - Q_UNUSED 152 | - QT_REQUIRE_VERSION 153 | # Be consistent with indent-width, even for people who use tab for indentation! 154 | TabWidth: 4 155 | UseTab: Never 156 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | build.log 3 | dist 4 | intel_extension_for_deepspeed.egg-info 5 | *.pyc 6 | cscope* 7 | tags 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/xetla"] 2 | path = third_party/xetla 3 | url = https://github.com/intel/xetla.git 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CommunityCodeOfConduct AT intel DOT com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 
22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intel Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include intel_extension_for_deepspeed/op_builder/csrc *.cpp *.hpp *.h 2 | recursive-include intel_extension_for_deepspeed *.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel® Extension for DeepSpeed* 2 | Intel® Extension for DeepSpeed* is an extension that brings Intel GPU (XPU) support to DeepSpeed(https://github.com/Microsoft/DeepSpeed). 
It comes with the following components: 3 | 1. DeepSpeed Accelerator Interface implementation 4 | 2. DeepSpeed op builder implementation for XPU 5 | 3. DeepSpeed op builder kernel code 6 | 7 | DeepSpeed automatically uses Intel® Extension for DeepSpeed* when the extension is installed as a Python package. After installation, models that run on DeepSpeed and are ported to the DeepSpeed Accelerator Interface can run on Intel GPU devices. 8 | 9 | ## Installation 10 | 11 | 1. Install Intel® Extension for PyTorch* following [Installation Guide](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/installation.html) 12 | 13 | 2. Install Intel® Extension for DeepSpeed* 14 | ```bash 15 | pip install intel-extension-for-deepspeed 16 | ``` 17 | Or, you can build from source: 18 | ```bash 19 | source ${DPCPPROOT}/env/vars.sh 20 | python setup.py install 21 | ``` 22 | Generally, DPCPPROOT is /opt/intel/oneapi/compiler/latest for the root account and ${HOME}/intel/oneapi/compiler/latest for other accounts. 23 | 24 | 3. Install DeepSpeed 25 | 26 | ```bash 27 | pip install deepspeed 28 | ``` 29 | ## Get Started 30 | Refer to [examples](https://github.com/intel/intel-extension-for-deepspeed/tree/main/examples#readme) 31 | 32 | 33 | ## Security Policy 34 | Please report security issues or vulnerabilities to the [Intel Security Center]. 35 | 36 | For more information on how Intel works to resolve security issues, see 37 | [Vulnerability Handling Guidelines]. 38 | 39 | [Intel Security Center]:https://www.intel.com/security 40 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html 41 | 42 | [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9136/badge)](https://www.bestpractices.dev/projects/9136) 43 | -------------------------------------------------------------------------------- /Security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | 7 | -------------------------------------------------------------------------------- /examples/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes for Megatron-DeepSpeed 2 | This folder contains recipes for running models from [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) 3 | 4 | To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run them as follows: 5 | 6 | ```bash /examples/run3.6b.sh``` 7 | 8 | ### Prepare dataset 9 | 10 | To run recipes under Megatron-DeepSpeed, please set up your own dataset or use the download scripts provided by Megatron-DeepSpeed. 11 | 12 | ### Basic usage 13 | 14 | For basic usage, three recipes are provided for training models with 3.6 billion, 20 billion, and 175 billion parameters: 15 | 16 | * 3.6b: ```bash /examples/run3.6b.sh``` 17 | * 20b: ```bash /examples/run20b.sh``` 18 | * 175b: ```bash /examples/run175b.sh``` 19 | 20 | ## Run with Huggingface 21 | Intel® Extension for DeepSpeed* also works with [Huggingface Transformers](https://github.com/huggingface/transformers) and can be used for fine-tuning and inference tasks. 22 | 23 | Install Huggingface Transformers: 24 | ```bash 25 | cd /examples 26 | git clone https://github.com/huggingface/transformers.git 27 | cd transformers 28 | pip install . 29 | ``` 30 | 31 | To run a translation task with the t5-small model on a single GPU: 32 | ```bash 33 | cd transformers 34 | deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ 35 | --deepspeed tests/deepspeed/ds_config_zero2.json \ 36 | --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ 37 | --output_dir output_dir --overwrite_output_dir --bf16 \ 38 | --do_train --max_train_samples 500 --num_train_epochs 1 \ 39 | --dataset_name wmt16 --dataset_config "ro-en" \ 40 | --source_lang en --target_lang ro 41 | ``` 42 | 43 | To fine-tune the Llama-2-7b model on 8 GPUs: 44 | ```bash 45 | cd transformers 46 | deepspeed --num_gpus=8 examples/pytorch/language-modeling/run_clm.py \ 47 | --deepspeed tests/deepspeed/ds_config_zero3.json \ 48 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 49 | --dataset_name wikitext \ 50 | --dataset_config_name wikitext-2-raw-v1 \ 51 | --dataloader_num_workers 0 \ 52 | --per_device_train_batch_size 1 \ 53 | --warmup_steps 10 \ 54 | --max_steps 50 \ 55 | --bf16 \ 56 | --do_train \ 57 | --output_dir /tmp/test-clm \ 58 | --overwrite_output_dir 59 | ``` 60 | 61 | For detailed usage with huggingface/transformers, please check the [transformers documentation](https://huggingface.co/docs/transformers/en/deepspeed).
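
Before launching any of the recipes or Transformers examples above, it can be useful to confirm that DeepSpeed has picked up the XPU accelerator provided by this extension. The snippet below is a minimal sanity-check sketch, assuming DeepSpeed, Intel® Extension for PyTorch* and Intel® Extension for DeepSpeed* are installed in the same environment; it only uses DeepSpeed's public accelerator API.

```python
# Minimal sanity check (assumes deepspeed, intel_extension_for_pytorch and
# intel_extension_for_deepspeed are installed in the current environment).
from deepspeed.accelerator import get_accelerator

accelerator = get_accelerator()

# On a machine with Intel GPUs and this extension installed, the accelerator
# name is expected to be "xpu"; on other systems DeepSpeed may fall back to
# "cuda" or "cpu".
print("accelerator:", accelerator.device_name())
print("device count:", accelerator.device_count())
print("bf16 supported:", accelerator.is_bf16_supported())
```

If the printed accelerator name is not `xpu`, check that `intel_extension_for_deepspeed` is importable from the Python environment used by the `deepspeed` launcher.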
62 | -------------------------------------------------------------------------------- /examples/generate_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ 4 | "$PP" "$DTYPE" 5 | do 6 | if [ -z $v ]; then 7 | echo "Please export required envs before execute $0" 8 | exit 1 9 | fi 10 | done 11 | 12 | if [ $# -ne 1 ]; then 13 | echo "Usage: $0 config_file" 14 | exit 1 15 | fi 16 | 17 | extra="" 18 | common="\ 19 | \"train_batch_size\": $GLOBAL_BATCH, 20 | \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, 21 | \"steps_per_print\": 1, 22 | \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, 23 | \"optimizer\": { 24 | \"type\": \"Adam\", 25 | \"params\": { 26 | \"lr\": 0.00015, 27 | \"weight_decay\": 1e-2 28 | } 29 | }, 30 | \"zero_allow_untested_optimizer\": true, 31 | \"gradient_clipping\": 1.0, 32 | \"activation_checkpointing\": { 33 | \"partition_activations\": true, 34 | \"contiguous_memory_optimization\": false 35 | }, 36 | \"wall_clock_breakdown\": false," 37 | 38 | flops_profiler="\ 39 | \"flops_profiler\": { 40 | \"enabled\": false, 41 | \"profile_step\": 45, 42 | \"module_depth\": -1, 43 | \"top_modules\": 1, 44 | \"detailed\": true, 45 | \"output_file\": null 46 | }" 47 | 48 | if [[ $DTYPE == "bf16" ]]; then 49 | dtype="\ 50 | \"communication_data_type\": \"bf16\", 51 | \"fp16\": { 52 | \"enabled\": false, 53 | \"loss_scale\": 0, 54 | \"loss_scale_window\": 1000, 55 | \"hysteresis\": 2, 56 | \"min_loss_scale\": 1 57 | }, 58 | \"bfloat16\": { 59 | \"enabled\": true, 60 | \"loss_scale\": 1.0 61 | }," 62 | else 63 | dtype="\ 64 | \"communication_data_type\": \"fp16\", 65 | \"fp16\": { 66 | \"enabled\": true, 67 | \"loss_scale\": 0, 68 | \"loss_scale_window\": 1000, 69 | \"hysteresis\": 2, 70 | \"min_loss_scale\": 1 71 | }, 72 | \"bfloat16\": { 73 | \"enabled\": false, 74 | \"loss_scale\": 1.0 75 | }," 76 | fi 77 | 78 | if [ $ZERO_STAGE == 3 ]; then 79 | zero="\ 80 | \"zero_optimization\": { 81 | \"stage\": 3, 82 | \"reduce_scatter\": false, 83 | \"stage3_max_live_parameters\": 3e9, 84 | \"stage3_max_reuse_distance\": 3e9, 85 | \"stage3_param_persistence_threshold\": 1e5, 86 | \"stage3_prefetch_bucket_size\": 5e7, 87 | \"contiguous_gradients\": true, 88 | \"overlap_comm\": true, 89 | \"reduce_bucket_size\": 90000000, 90 | \"sub_group_size\": 1e9, 91 | \"offload_optimizer\": { 92 | \"device\": \"none\", 93 | \"buffer_count\": 4, 94 | \"pipeline_read\": false, 95 | \"pipeline_write\": false, 96 | \"pin_memory\": true 97 | } 98 | }," 99 | elif [ $ZERO_STAGE == 2 ]; then 100 | zero="\ 101 | \"zero_optimization\": { 102 | \"stage\": $ZERO_STAGE, 103 | \"allgather_partitions\": true, 104 | \"allgather_bucket_size\": \"auto\", 105 | \"overlap_comm\": true, 106 | \"reduce_scatter\": false, 107 | \"reduce_bucket_size\": 90000000, 108 | \"contiguous_gradients\": true, 109 | \"offload_optimizer\": { 110 | \"device\": \"none\", 111 | \"buffer_count\": 4, 112 | \"pipeline_read\": false, 113 | \"pipeline_write\": false, 114 | \"pin_memory\": true 115 | } 116 | }," 117 | elif [ $ZERO_STAGE == 1 ]; then 118 | zero="\ 119 | \"zero_optimization\": { 120 | \"stage\": $ZERO_STAGE 121 | }," 122 | else 123 | echo 'Please add the correct config set!!!' 
124 | fi 125 | 126 | # flops_profiler must at the end because no ',' is allowed at the end 127 | cat < $1 128 | { 129 | $common 130 | $zero 131 | $dtype 132 | $extra 133 | $flops_profiler 134 | } 135 | EOT 136 | -------------------------------------------------------------------------------- /examples/generate_hostfile.sh: -------------------------------------------------------------------------------- 1 | # set hostfile_deepspeed & hostfile_mpich 2 | echo "!!!please use generate_hostfile.sh before training" 3 | 4 | # use official mpich 5 | 6 | # setting hostfile_mpich and hostfile_deepspeed 7 | # this now supports setting up as many nodes as possible 8 | # update for borealis 9 | # for examples: 10 | # 1.$ bash generate_hostfile.sh #don't set hostfile 11 | # 2.$ bash generate_hostfile.sh x10001 #set one node 12 | # 3.$ bash generate_hostfile.sh x10001 x10002 x10003 x10004 #set 4 nodes 13 | # 4.$ bash generate_hostfile.sh x10001 x10002 x10003 x10004 x10005 x10006 x10007 x10008 #set 8 nodes 14 | # update for OAM system 15 | # for examples: 16 | # 1.$ bash generate_hostfile.sh #don't set hostfile 17 | # 2.$ bash generate_hostfile.sh oam compute1 #set one compute node 18 | # 3.$ bash generate_hostfile.sh oam compute1 compute2 #set 2 compute nodes 19 | usage() 20 | { 21 | echo "Example Usage: 22 | for 1 node: bash $0 x10001 23 | for 4 nodes: bash generate_hostfile.sh x10001 x10002 x10003 x10004" 24 | exit 2 25 | } 26 | 27 | if [ $# -gt 0 ]; then 28 | cat /dev/null > $LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 29 | cat /dev/null > $LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 30 | mid=" slots=" 31 | slots=12 32 | for i in "$@"; do 33 | if [ "$i" == oam ]; then 34 | slots=8 35 | else 36 | host=$i 37 | host_slot="$i$mid$slots" 38 | echo $host>>$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 39 | echo $host_slot>>$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 40 | fi 41 | done 42 | else 43 | usage 44 | fi 45 | -------------------------------------------------------------------------------- /examples/gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VOCAB_FILE=dataset/gpt2-vocab.json 4 | MERGE_FILE=dataset/gpt2-merges.txt 5 | DATA_PATH=dataset/BookCorpusDataset_text_document 6 | DTYPE=${DTYPE:-bf16} 7 | 8 | # Hostfile path 9 | hostfile_deepspeed=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 10 | hostfile_mpich=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 11 | 12 | # Disabling tensor/pipeline parallelism 13 | TP=${TP:-1} 14 | PP=${PP:-1} 15 | 16 | # Model: default 3.6b 17 | NLAYERS=${NLAYERS:-30} 18 | HIDDEN=${HIDDEN:-3072} 19 | HEADS=${HEADS:-32} 20 | SEQ=${SEQ:-2048} 21 | TRAIN_ITER=${TRAIN_ITER:-50} 22 | 23 | WORLD_SIZE=${WORLD_SIZE:-12} 24 | MICRO_BATCH=${MICRO_BATCH:-8} 25 | GLOBAL_BATCH=${GLOBAL_BATCH:-96} 26 | 27 | ZERO_STAGE=${ZERO_STAGE:-2} 28 | 29 | DS_CONFIG=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/"ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" 30 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/generate_config.sh ${DS_CONFIG} || exit 1 31 | 32 | OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} 33 | mkdir -p $OUTPUT_DIR 34 | echo "!!!Please see logs at ${OUTPUT_DIR}" 35 | 36 | ds_args=" " 37 | ds_args=" --deepspeed ${ds_args}" 38 | 
if [ $PP == 1 ]; then 39 | ds_args=" --no-pipeline-parallel ${ds_args}" 40 | fi 41 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 42 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 43 | # we are now using activation checkpoint provided by megatron, see below. 44 | # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 45 | 46 | # take custom args 47 | custom_args=" $@" 48 | 49 | # launcher setting 50 | LAUNCHER=${LAUNCHER:-MPICH} 51 | if [[ $LAUNCHER == "deepspeed" ]]; then 52 | launcher="" 53 | else 54 | launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" 55 | fi 56 | 57 | CCL=${CCL:-ccl} 58 | 59 | run_cmd=" 60 | deepspeed $launcher pretrain_gpt.py \ 61 | --tensor-model-parallel-size $TP \ 62 | --pipeline-model-parallel-size $PP \ 63 | --num-layers $NLAYERS \ 64 | --hidden-size $HIDDEN \ 65 | --num-attention-heads $HEADS \ 66 | --seq-length $SEQ \ 67 | --max-position-embeddings $SEQ \ 68 | --micro-batch-size $MICRO_BATCH \ 69 | --global-batch-size $GLOBAL_BATCH \ 70 | --train-iters $TRAIN_ITER \ 71 | --lr 0.00015 \ 72 | --lr-warmup-fraction .01 \ 73 | --lr-decay-iters 320000 \ 74 | --lr-decay-style cosine \ 75 | --log-interval 1 \ 76 | --eval-iters 100 \ 77 | --eval-interval 100 \ 78 | --data-path $DATA_PATH \ 79 | --vocab-file $VOCAB_FILE \ 80 | --merge-file $MERGE_FILE \ 81 | --save-interval 500 \ 82 | --split 100,0,0 \ 83 | --$DTYPE \ 84 | --checkpoint-activations \ 85 | --deepspeed-activation-checkpointing 86 | $ds_args \ 87 | --no-masked-softmax-fusion \ 88 | --no-bias-gelu-fusion \ 89 | --no-bias-dropout-fusion \ 90 | --no-gradient-accumulation-fusion \ 91 | --distributed-backend $CCL \ 92 | --num-workers 0 \ 93 | $custom_args \ 94 | |& tee $OUTPUT_DIR/output.log 95 | " 96 | 97 | echo ${run_cmd} 98 | eval ${run_cmd} 99 | set +x 100 | -------------------------------------------------------------------------------- /examples/hostfile_deepspeed: -------------------------------------------------------------------------------- 1 | localhost slots=12 2 | -------------------------------------------------------------------------------- /examples/hostfile_mpich: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /examples/llm_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DTYPE=${DTYPE:-float16} 4 | MODEL_PATH=${MODEL_PATH:-/home/username/model_path} 5 | MODEL_NAME=${MODEL_NAME:-llama2-70b} 6 | OUTPUT_DIR=logs/${MODEL_NAME}_`date +%m%d%H%M%S`_${HOSTNAME} 7 | mkdir -p $OUTPUT_DIR 8 | 9 | # Hostfile path 10 | hostfile_deepspeed=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 11 | hostfile_mpich=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 12 | 13 | # launcher setting 14 | LAUNCHER=${LAUNCHER:-MPICH} 15 | if [[ $LAUNCHER == "deepspeed" ]]; then 16 | launcher="" 17 | else 18 | launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" 19 | fi 20 | 21 | CCL=${CCL:-ccl} 22 | 23 | run_cmd=" 24 | deepspeed $launcher run_generation_with_deepspeed.py \ 25 | --device xpu \ 26 | --ipex \ 27 | --dtype $DTYPE \ 28 | --input-tokens 1024 \ 29 | --max-new-tokens 128 \ 30 | --num-beam 1 \ 31 | --batch-size 1 \ 32 | --token-latency \ 33 | --benchmark \ 34 | -m $MODEL_PATH \ 35 | --sub-model-name $MODEL_NAME\ 36 | |& tee 
$OUTPUT_DIR/output.log 37 | " 38 | 39 | echo ${run_cmd} 40 | eval ${run_cmd} 41 | set +x 42 | -------------------------------------------------------------------------------- /examples/run10p175b.sh: -------------------------------------------------------------------------------- 1 | export WORLD_SIZE=${WORLD_SIZE:-48} 2 | export MICRO_BATCH=${MICRO_BATCH:-1} 3 | export NLAYERS=${NLAYERS:-10} 4 | export HIDDEN=${HIDDEN:-12288} 5 | export HEADS=${HEADS:-96} 6 | export SEQ=${SEQ:-2048} 7 | export TRAIN_ITER=${TRAIN_ITER:-20} 8 | export ZERO_STAGE=${ZERO_STAGE:-3} 9 | export DTYPE=${DTYPE:-bf16} 10 | export TP=${TP:-1} 11 | export PP=${PP:-1} 12 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 13 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 14 | 15 | echo "!!!please use generate_hostfile.sh to set hostfile for $((${WORLD_SIZE}/12)) nodes before training" 16 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ 17 | -------------------------------------------------------------------------------- /examples/run175b.sh: -------------------------------------------------------------------------------- 1 | echo "!!!please use generate_hostfile.sh to set hostfile for 18 nodes before training" 2 | export WORLD_SIZE=${WORLD_SIZE:-216} 3 | export MICRO_BATCH=${MICRO_BATCH:-1} 4 | export NLAYERS=${NLAYERS:-96} 5 | export HIDDEN=${HIDDEN:-12288} 6 | export HEADS=${HEADS:-96} 7 | export SEQ=${SEQ:-2048} 8 | export TRAIN_ITER=${TRAIN_ITER:-20} 9 | export ZERO_STAGE=${ZERO_STAGE:-3} 10 | export DTYPE=${DTYPE:-bf16} 11 | export TP=${TP:-1} 12 | export PP=${PP:-1} 13 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 14 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 15 | 16 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ 17 | -------------------------------------------------------------------------------- /examples/run20b.sh: -------------------------------------------------------------------------------- 1 | export WORLD_SIZE=${WORLD_SIZE:-48} 2 | export MICRO_BATCH=${MICRO_BATCH:-1} 3 | export NLAYERS=${NLAYERS:-44} 4 | export HIDDEN=${HIDDEN:-6144} 5 | export HEADS=${HEADS:-64} 6 | export SEQ=${SEQ:-2048} 7 | export TRAIN_ITER=${TRAIN_ITER:-20} 8 | export ZERO_STAGE=${ZERO_STAGE:-3} 9 | export DTYPE=${DTYPE:-bf16} 10 | export TP=${TP:-1} 11 | export PP=${PP:-1} 12 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 13 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 14 | 15 | echo "!!!please use generate_hostfile.sh to set hostfile for $((${WORLD_SIZE}/12)) nodes before training" 16 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ 17 | -------------------------------------------------------------------------------- /examples/run3.6b.sh: -------------------------------------------------------------------------------- 1 | echo "!!!please makes sure the content of hostfile for single node is localhost" 2 | export WORLD_SIZE=${WORLD_SIZE:-12} 3 | export MICRO_BATCH=${MICRO_BATCH:-8} 4 | export NLAYERS=${NLAYERS:-30} 5 | export HIDDEN=${HIDDEN:-3072} 6 | export HEADS=${HEADS:-32} 7 | export SEQ=${SEQ:-2048} 8 | export TRAIN_ITER=${TRAIN_ITER:-50} 9 | export ZERO_STAGE=${ZERO_STAGE:-2} 10 | export DTYPE=${DTYPE:-bf16} 11 | export TP=${TP:-1} 12 | export PP=${PP:-1} 13 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 14 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 15 | 16 | bash 
$LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh --no-query-key-layer-scaling $@ 17 | -------------------------------------------------------------------------------- /examples/run_llama.sh: -------------------------------------------------------------------------------- 1 | # please make sure the content of hostfile for single node is localhost 2 | export WORLD_SIZE=${WORLD_SIZE:-48} 3 | export MICRO_BATCH=${MICRO_BATCH:-1} 4 | export NLAYERS=${NLAYERS:-32} 5 | export HIDDEN=${HIDDEN:-4096} 6 | export HEADS=${HEADS:-32} 7 | export SEQ=${SEQ:-2048} 8 | export NUM_KV_HEADS=${NUM_KV_HEADS:-32} 9 | export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} 10 | export TRAIN_ITER=${TRAIN_ITER:-50} 11 | export ZERO_STAGE=${ZERO_STAGE:-3} 12 | export DTYPE=${DTYPE:-bf16} 13 | export TP=${TP:-1} 14 | export PP=${PP:-1} 15 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 16 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 17 | 18 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh --no-query-key-layer-scaling \ 19 | --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --disable-bias-linear \ 20 | --normalization rmsnorm --attention-dropout 0 --hidden-dropout 0 --use-flash-attn-builder \ 21 | --ffn-hidden-size $FFN_HIDDEN_SIZE --num-key-value-heads $NUM_KV_HEADS $@ 22 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/__init__.py: -------------------------------------------------------------------------------- 1 | from .xpu_accelerator import XPU_Accelerator 2 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import OpBuilder 2 | from .cpu_adam import CPUAdamBuilder 3 | from .cpu_adagrad import CPUAdagradBuilder 4 | from .fused_adam import FusedAdamBuilder 5 | from .transformer_inference import InferenceBuilder 6 | from .quantizer import QuantizerBuilder 7 | from .utils import UtilsBuilder 8 | from .async_io import AsyncIOBuilder 9 | from .flash_attn import FlashAttentionBuilder 10 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/async_io.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import distutils.spawn 7 | import subprocess 8 | import torch 9 | 10 | from deepspeed.ops.op_builder.builder import OpBuilder, TORCH_MAJOR, TORCH_MINOR 11 | 12 | class AsyncIOBuilder(OpBuilder): 13 | BUILD_VAR = "DS_BUILD_AIO" 14 | NAME = "async_io" 15 | 16 | def __init__(self): 17 | super().__init__(name=self.NAME) 18 | 19 | def absolute_name(self): 20 | return f'deepspeed.ops.aio.{self.NAME}_op' 21 | 22 | def sources(self): 23 | return [ 24 | 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', 25 | 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 26 | 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', 27 | 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', 28 | 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' 29 | ] 30 | 31 | def include_paths(self): 32 | return ['csrc/aio/py_lib', 'csrc/aio/common'] 33 | 34 | def cxx_args(self): 35 | # -O0 for improved debugging, since performance is bound by I/O 36 | CPU_ARCH = self.cpu_arch() 37 | SIMD_WIDTH = self.simd_width() 38 | TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) 39 | if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: 40 | CPP_STD = '-std=c++17' 41 | else: 42 | CPP_STD = '-std=c++14' 43 | return [ 44 | '-g', 45 | '-Wall', 46 | '-O0', 47 | CPP_STD, 48 | '-shared', 49 | '-fPIC', 50 | '-Wno-reorder', 51 | CPU_ARCH, 52 | '-fopenmp', 53 | SIMD_WIDTH, 54 | '-laio', 55 | ] 56 | 57 | def extra_ldflags(self): 58 | return ['-laio'] 59 | 60 | def check_for_libaio_pkg(self): 61 | libs = dict( 62 | dpkg=["-l", "libaio-dev", "apt"], 63 | pacman=["-Q", "libaio", "pacman"], 64 | rpm=["-q", "libaio-devel", "yum"], 65 | ) 66 | 67 | found = False 68 | for pkgmgr, data in libs.items(): 69 | flag, lib, tool = data 70 | path = distutils.spawn.find_executable(pkgmgr) 71 | if path is not None: 72 | cmd = f"{pkgmgr} {flag} {lib}" 73 | result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 74 | if result.wait() == 0: 75 | found = True 76 | else: 77 | self.warning(f"{self.NAME}: please install the {lib} package with {tool}") 78 | break 79 | return found 80 | 81 | def is_compatible(self, verbose=True): 82 | # Check for the existence of libaio by using distutils 83 | # to compile and link a test program that calls io_submit, 84 | # which is a function provided by libaio that is used in the async_io op. 85 | # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS 86 | # respectively to specify the directories for libaio.h and libaio.so. 87 | aio_compatible = self.has_function('io_pgetevents', ('aio', )) 88 | if verbose and not aio_compatible: 89 | self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") 90 | 91 | # Check for the libaio package via known package managers 92 | # to print suggestions on which package to install. 93 | self.check_for_libaio_pkg() 94 | 95 | self.warning( 96 | "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." 
97 | ) 98 | return super().is_compatible(verbose) and aio_compatible 99 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | import os 5 | import time 6 | import importlib 7 | import shutil 8 | from pathlib import Path 9 | from deepspeed.ops.op_builder.builder import OpBuilder, TORCH_MAJOR, TORCH_MINOR 10 | 11 | class SYCLOpBuilder(OpBuilder): 12 | def builder(self): 13 | try: 14 | from intel_extension_for_pytorch.xpu.cpp_extension import DPCPPExtension 15 | except ImportError: 16 | from intel_extension_for_pytorch.xpu.utils import DPCPPExtension 17 | 18 | print("dpcpp sources = {}".format(self.sources())) 19 | dpcpp_ext = DPCPPExtension( 20 | name=self.absolute_name(), 21 | sources=self.strip_empty_entries(self.sources()), 22 | include_dirs=self.strip_empty_entries(self.include_paths()), 23 | extra_compile_args={ 24 | 'cxx': self.strip_empty_entries(self.cxx_args()), 25 | }, 26 | extra_link_args=self.strip_empty_entries(self.fixed_aotflags())) 27 | return dpcpp_ext 28 | 29 | def version_dependent_macros(self): 30 | # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 31 | version_ge_1_1 = [] 32 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): 33 | version_ge_1_1 = ['-DVERSION_GE_1_1'] 34 | version_ge_1_3 = [] 35 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): 36 | version_ge_1_3 = ['-DVERSION_GE_1_3'] 37 | version_ge_1_5 = [] 38 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): 39 | version_ge_1_5 = ['-DVERSION_GE_1_5'] 40 | return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 41 | 42 | def cxx_args(self): 43 | cxx_flags = ['-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64', '-fno-strict-aliasing'] 44 | if os.environ.get('USE_MKL_GEMM'): 45 | cxx_flags.append('-DUSE_MKL_GEMM') 46 | return cxx_flags 47 | 48 | def extra_ldflags(self): 49 | return ['-fPIC', '-fsycl', '-fsycl-targets=spir64_gen', '-fsycl-max-parallel-link-jobs=8', '-Xs "-options -cl-poison-unsupported-fp64-kernels,cl-intel-enable-auto-large-GRF-mode"', '-Xs "-device pvc"', '-Wl,-export-dynamic'] 50 | 51 | def fixed_aotflags(self): 52 | return ['-fsycl', '-fsycl-targets=spir64_gen', '-fsycl-max-parallel-link-jobs=8', '-Xs', "-options -cl-poison-unsupported-fp64-kernels,cl-intel-enable-auto-large-GRF-mode", '-Xs', "-device pvc"] 53 | 54 | def load(self, verbose=True): 55 | from deepspeed.git_version_info import installed_ops, torch_info # noqa: F401 56 | if installed_ops.get(self.name, False): 57 | return importlib.import_module(self.absolute_name()) 58 | else: 59 | return self.jit_load(verbose) 60 | 61 | def jit_load(self, verbose=True): 62 | if not self.is_compatible(verbose): 63 | raise RuntimeError( 64 | f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}" 65 | ) 66 | try: 67 | import ninja # noqa: F401 68 | except ImportError: 69 | raise RuntimeError( 70 | f"Unable to JIT load the {self.name} op due to ninja not being installed." 
71 | ) 72 | 73 | self.jit_mode = True 74 | from intel_extension_for_pytorch.xpu.cpp_extension import load 75 | 76 | start_build = time.time() 77 | # Recognize relative paths as absolute paths for jit load 78 | 79 | sources = [self.deepspeed_src_path(path) for path in self.sources()] 80 | extra_include_paths = [ 81 | self.deepspeed_src_path(path) for path in self.include_paths() 82 | ] 83 | 84 | # Torch will try and apply whatever CCs are in the arch list at compile time, 85 | # we have already set the intended targets ourselves we know that will be 86 | # needed at runtime. This prevents CC collisions such as multiple __half 87 | # implementations. Stash arch list to reset after build. 88 | ''' 89 | torch_arch_list = None 90 | if "TORCH_CUDA_ARCH_LIST" in os.environ: 91 | torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") 92 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 93 | ''' 94 | 95 | op_module = load( 96 | name=self.name, 97 | sources=self.strip_empty_entries(sources), 98 | extra_include_paths=self.strip_empty_entries(extra_include_paths), 99 | extra_cflags=self.strip_empty_entries(self.cxx_args()), 100 | # extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), 101 | extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), 102 | verbose=verbose) 103 | 104 | build_duration = time.time() - start_build 105 | if verbose: 106 | print(f"Time to load {self.name} op: {build_duration} seconds") 107 | ''' 108 | # Reset arch list so we are not silently removing it for other possible use cases 109 | if torch_arch_list: 110 | os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list 111 | ''' 112 | return op_module 113 | 114 | 115 | def sycl_kernel_path(code_path): 116 | # Always return a path like "SYCL_KERNEL_PATH/..." 117 | SYCL_KERNEL_PATH = "third-party" 118 | abs_source_path = os.path.join(Path(__file__).parent.absolute(), code_path) 119 | rel_target_path = os.path.join(SYCL_KERNEL_PATH, code_path) 120 | 121 | # Jit_load mode require absolute path. Use abs path for copy 122 | # To get the absolute path of deepspeed 123 | # We use a non-abstract builder class instance to call deepspeed_src_path() 124 | # FusedAdamBuilder is one of such class instance 125 | from .fused_adam import FusedAdamBuilder 126 | abs_target_path = FusedAdamBuilder().deepspeed_src_path(rel_target_path) 127 | 128 | sycl_link_path = os.path.join( 129 | os.path.dirname(FusedAdamBuilder().deepspeed_src_path("")), 130 | SYCL_KERNEL_PATH) 131 | if not os.path.exists(sycl_link_path): 132 | # Create directory and link for sycl kernel: 133 | # deepspeed/ops/SYCL_KERNEL_PATH-->../../SYCL_KERNEL_PATH 134 | sycl_dir_path = os.path.join(os.path.dirname(sycl_link_path), 135 | "../../" + SYCL_KERNEL_PATH) 136 | 137 | os.makedirs(sycl_dir_path, exist_ok=True) 138 | os.symlink("../../" + SYCL_KERNEL_PATH, sycl_link_path, True) 139 | print("Create directory and link for sycl kernel:{}-->{}".format( 140 | sycl_link_path, 141 | sycl_dir_path)) 142 | 143 | import filecmp 144 | if (os.path.exists(abs_target_path) and filecmp.cmp(abs_target_path, 145 | abs_source_path)): 146 | print("skip copy, {} and {} have the same content".format( 147 | abs_source_path, 148 | abs_target_path)) 149 | return rel_target_path 150 | 151 | print("Copying SYCL kernel file from {} to {}".format(abs_source_path, 152 | abs_target_path)) 153 | os.makedirs(os.path.dirname(abs_target_path), exist_ok=True) 154 | shutil.copyfile(abs_source_path, abs_target_path) 155 | 156 | # Prebuild install mode require paths relative to the setup.py directory. 
Use the relative path. 157 | return rel_target_path 158 | 159 | 160 | def sycl_kernel_include(code_path): 161 | import intel_extension_for_pytorch # noqa: F401 162 | abs_path = os.path.join(Path(__file__).parent.absolute(), code_path) 163 | return abs_path 164 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/cpu_adagrad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 5 | 6 | 7 | class CPUAdagradBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" 9 | NAME = "cpu_adagrad" 10 | 11 | def __init__(self): 12 | super().__init__(name=self.NAME) 13 | 14 | def absolute_name(self): 15 | return f'deepspeed.ops.adagrad.{self.NAME}_op' 16 | 17 | def sources(self): 18 | return [ 19 | sycl_kernel_path('csrc/adagrad/cpu_adagrad.cpp'), 20 | sycl_kernel_path('csrc/common/custom_cuda_kernel.dp.cpp'), 21 | ] 22 | 23 | def include_paths(self): 24 | return [ 25 | sycl_kernel_include('csrc/includes'), 26 | ] 27 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/cpu_adam.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 5 | 6 | 7 | class CPUAdamBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_CPU_ADAM" 9 | NAME = "cpu_adam" 10 | 11 | def __init__(self): 12 | super().__init__(name=self.NAME) 13 | 14 | def absolute_name(self): 15 | return f'deepspeed.ops.adam.{self.NAME}_op' 16 | 17 | def sources(self): 18 | return [ 19 | sycl_kernel_path('csrc/adam/cpu_adam.cpp'), 20 | sycl_kernel_path('csrc/adam/cpu_adam_impl.cpp'), 21 | sycl_kernel_path('csrc/common/custom_cuda_kernel.dp.cpp'), 22 | ] 23 | 24 | def libraries_args(self): 25 | args = super().libraries_args() 26 | return args 27 | 28 | def include_paths(self): 29 | return [ 30 | sycl_kernel_include('csrc/includes'), 31 | ] 32 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include "cpu_adagrad.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #if defined(__ENABLE_CUDA__) 15 | #include 16 | #include "cublas_v2.h" 17 | #include "cuda.h" 18 | #include "curand.h" 19 | #include "custom_cuda_layers.h" 20 | #endif 21 | 22 | static std::unordered_map> s_optimizers; 23 | 24 | // C++ interface 25 | 26 | void Adagrad_Optimizer::Step_1(float* _params, 27 | float* grads, 28 | float* _exp_avg_sq, 29 | size_t _param_size, 30 | ds_half_precision_t* dev_params, 31 | bool half_precision) 32 | { 33 | size_t rounded_size = 0; 34 | #if defined(__AVX512__) or defined(__AVX256__) 35 | Step_AVX<1>( 36 | &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); 37 | #endif 38 | if (_param_size > rounded_size) { 39 | float step_size = -1 * _alpha; 40 | ds_half_precision_t* grads_cast_h; 41 | ds_half_precision_t* params_cast_h; 42 | if (half_precision) { 43 | grads_cast_h = reinterpret_cast(grads); 44 | params_cast_h = reinterpret_cast(_params); 45 | } 46 | for (size_t t = rounded_size; t < _param_size; t += TILE) { 47 | size_t copy_size = TILE; 48 | if ((t + TILE) > _param_size) copy_size = _param_size - t; 49 | size_t offset = copy_size + t; 50 | #if defined(__ENABLE_CUDA__) 51 | if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } 52 | #elif defined(__ENABLE_CANN__) 53 | if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } 54 | #endif 55 | #pragma omp parallel for 56 | for (size_t k = t; k < offset; k++) { 57 | float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; 58 | float param = half_precision ? (float)params_cast_h[k] : _params[k]; 59 | float momentum = grads[k]; 60 | float variance = _exp_avg_sq[k]; 61 | if (_weight_decay > 0) { grad = param * _weight_decay + grad; } 62 | 63 | variance += grad * grad; 64 | 65 | grad = sqrt(variance); 66 | grad += _eps; 67 | grad = momentum / grad; 68 | param = grad * step_size + param; 69 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 70 | if (dev_params) _doubled_buffer[_buf_index][k - t] = param; 71 | #endif 72 | if (half_precision) 73 | params_cast_h[k] = (ds_half_precision_t)param; 74 | else 75 | _params[k] = param; 76 | // STORE UPDATE TERM TO GRAD'S MEMORY 77 | grads[k] = grad * step_size; 78 | _exp_avg_sq[k] = variance; 79 | } 80 | #if defined(__ENABLE_CUDA__) 81 | if (dev_params) { 82 | launch_param_update( 83 | _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); 84 | _buf_index = !_buf_index; 85 | } 86 | #elif defined(__ENABLE_CANN__) 87 | if (dev_params) { 88 | size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); 89 | aclrtMemcpy(dev_params + t, 90 | memcpy_size, 91 | _doubled_buffer[_buf_index], 92 | memcpy_size, 93 | aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); 94 | 95 | _buf_index = !_buf_index; 96 | } 97 | #endif 98 | } 99 | } 100 | } 101 | 102 | void Adagrad_Optimizer::Step_4(float* _params, 103 | float* grads, 104 | float* _exp_avg_sq, 105 | size_t _param_size, 106 | ds_half_precision_t* dev_params, 107 | bool half_precision) 108 | { 109 | size_t rounded_size = 0; 110 | #if defined(__AVX512__) or defined(__AVX256__) 111 | Step_AVX<4>( 112 | &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); 113 | #endif 114 | if (_param_size > rounded_size) 115 | Step_1((_params + rounded_size), 116 | (grads + rounded_size), 117 | (_exp_avg_sq + 
rounded_size), 118 | (_param_size - rounded_size), 119 | (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), 120 | half_precision); 121 | } 122 | 123 | int create_adagrad_optimizer(int optimizer_id, 124 | float alpha = 1e-2, 125 | float eps = 1e-8, 126 | float weight_decay = 0, 127 | bool should_log = false) 128 | { 129 | auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay); 130 | 131 | s_optimizers[optimizer_id] = opt; 132 | 133 | if (should_log) { 134 | std::string avx_type = ""; 135 | #if defined(__AVX512__) 136 | avx_type = "AVX512"; 137 | #else 138 | #if defined(__AVX256__) 139 | avx_type = "AVX2"; 140 | #else 141 | avx_type = "scalar"; 142 | #endif 143 | #endif 144 | 145 | printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n", 146 | optimizer_id, 147 | avx_type.c_str()); 148 | printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay); 149 | } 150 | 151 | return 0; 152 | } 153 | 154 | void Adagrad_Optimizer::Step_8(float* _params, 155 | float* grads, 156 | float* _exp_avg_sq, 157 | size_t _param_size, 158 | ds_half_precision_t* dev_params, 159 | bool half_precision) 160 | { 161 | size_t rounded_size = 0; 162 | #if defined(__AVX512__) or defined(__AVX256__) 163 | Step_AVX<8>( 164 | &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); 165 | #endif 166 | if (_param_size > rounded_size) 167 | Step_4((_params + rounded_size), 168 | (grads + rounded_size), 169 | (_exp_avg_sq + rounded_size), 170 | (_param_size - rounded_size), 171 | (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), 172 | half_precision); 173 | } 174 | 175 | int ds_adagrad_step(int optimizer_id, 176 | size_t step, 177 | float lr, 178 | float epsilon, 179 | float weight_decay, 180 | torch::Tensor& params, 181 | torch::Tensor& grads, 182 | torch::Tensor& exp_avg_sq) 183 | { 184 | auto params_c = params.contiguous(); 185 | auto grads_c = grads.contiguous(); 186 | auto exp_avg_sq_c = exp_avg_sq.contiguous(); 187 | 188 | float* params_ptr = (float*)params_c.data_ptr(); 189 | float* grads_ptr = (float*)grads_c.data_ptr(); 190 | float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); 191 | 192 | std::shared_ptr<Adagrad_Optimizer> opt = 193 | std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]); 194 | opt->IncrementStep(step); 195 | opt->update_state(lr, epsilon, weight_decay); 196 | opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel()); 197 | 198 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 199 | opt->SynchronizeStreams(); 200 | #endif 201 | return 0; 202 | } 203 | 204 | int ds_adagrad_step_plus_copy(int optimizer_id, 205 | size_t step, 206 | float lr, 207 | float epsilon, 208 | float weight_decay, 209 | torch::Tensor& params, 210 | torch::Tensor& grads, 211 | torch::Tensor& exp_avg_sq, 212 | torch::Tensor& gpu_params) 213 | { 214 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 215 | auto params_c = params.contiguous(); 216 | auto gpu_params_c = gpu_params.contiguous(); 217 | auto exp_avg_sq_c = exp_avg_sq.contiguous(); 218 | auto grads_c = grads.contiguous(); 219 | 220 | float* params_ptr = (float*)params_c.data_ptr(); 221 | float* grads_ptr = (float*)grads_c.data_ptr(); 222 | ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); 223 | float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); 224 | 225 | std::shared_ptr<Adagrad_Optimizer> opt = 226 | std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]); 227 | opt->IncrementStep(step); 228 | opt->update_state(lr, epsilon, weight_decay); 229 |
opt->Step_8(params_ptr, 230 | grads_ptr, 231 | exp_avg_sq_ptr, 232 | params_c.numel(), 233 | gpu_params_ptr, 234 | (params.options().dtype() == at::kHalf)); 235 | 236 | opt->SynchronizeStreams(); 237 | #else 238 | assert(false); 239 | #endif 240 | return 0; 241 | } 242 | 243 | int destroy_adagrad_optimizer(int optimizer_id) 244 | { 245 | s_optimizers.erase(optimizer_id); 246 | 247 | return 0; 248 | } 249 | 250 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 251 | { 252 | m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)"); 253 | m.def("adagrad_update_copy", 254 | &ds_adagrad_step_plus_copy, 255 | "DeepSpeed CPU Adagrad update and param copy (C++)"); 256 | m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); 257 | m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); 258 | } 259 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include "cpu_adam.h" 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 9 | { 10 | m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); 11 | m.def("adam_update_copy", 12 | &ds_adam_step_plus_copy, 13 | "DeepSpeed CPU Adam update and param copy (C++)"); 14 | m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); 15 | m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); 16 | } 17 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/fused_adam_frontend.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | 8 | void multi_tensor_adam_cuda(int chunk_size, 9 | at::Tensor noop_flag, 10 | std::vector> tensor_lists, 11 | const float lr, 12 | const float beta1, 13 | const float beta2, 14 | const float epsilon, 15 | const int step, 16 | const int mode, 17 | const int bias_correction, 18 | const float weight_decay); 19 | 20 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 21 | { 22 | m.def("multi_tensor_adam", 23 | &multi_tensor_adam_cuda, 24 | "Compute and apply gradient update to parameters for Adam optimizer"); 25 | } 26 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Copyright NVIDIA/apex 8 | This file is adapted from fused adam in NVIDIA/apex, commit a109f85 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | // #include 16 | // #include 17 | // Another possibility: 18 | // #include 19 | 20 | #include 21 | 22 | #include "multi_tensor_apply.dp.hpp" 23 | #include "type_shim.h" 24 | #include 25 | 26 | #define BLOCK_SIZE 512 27 | #define ILP 4 28 | 29 | typedef enum { 30 | ADAM_MODE_0 = 0, // L2 regularization mode 31 | ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) 32 | } adamMode_t; 33 | 34 | using MATH_T = float; 35 | 36 | template 37 | struct AdamFunctor { 38 | /* 39 | DPCT1110:4: The total declared local variable size in device function operator() exceeds 128 40 | bytes and may cause high register pressure. Consult with your hardware vendor to find the total 41 | register size available and adjust the code, or use smaller sub-group size to avoid high 42 | register pressure. 43 | */ 44 | __dpct_inline__ void operator()(int chunk_size, 45 | volatile int* noop_gmem, 46 | TensorListMetadata<4>& tl, 47 | const float beta1, 48 | const float beta2, 49 | const float beta1_correction, 50 | const float beta2_correction, 51 | const float epsilon, 52 | const float lr, 53 | adamMode_t mode, 54 | const float decay) 55 | { 56 | // I'd like this kernel to propagate infs/nans. 57 | // if(*noop_gmem == 1) 58 | // return; 59 | 60 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 61 | int tensor_loc = tl.block_to_tensor[item_ct1.get_group(2)]; 62 | 63 | // potentially use to pass in list of scalar 64 | // int tensor_num = tl.start_tensor_this_launch + tensor_loc; 65 | 66 | int chunk_idx = tl.block_to_chunk[item_ct1.get_group(2)]; 67 | int n = tl.sizes[tensor_loc]; 68 | 69 | T* g = (T*)tl.addresses[0][tensor_loc]; 70 | g += chunk_idx * chunk_size; 71 | 72 | T* p = (T*)tl.addresses[1][tensor_loc]; 73 | p += chunk_idx * chunk_size; 74 | 75 | T* m = (T*)tl.addresses[2][tensor_loc]; 76 | m += chunk_idx * chunk_size; 77 | 78 | T* v = (T*)tl.addresses[3][tensor_loc]; 79 | v += chunk_idx * chunk_size; 80 | 81 | n -= chunk_idx * chunk_size; 82 | 83 | // see note in multi_tensor_scale_kernel.cu 84 | for (int i_start = 0; i_start < n && i_start < chunk_size; 85 | i_start += item_ct1.get_local_range(2) * ILP) { 86 | MATH_T r_g[ILP]; 87 | MATH_T r_p[ILP]; 88 | MATH_T r_m[ILP]; 89 | MATH_T r_v[ILP]; 90 | #pragma unroll 91 | for (int ii = 0; ii < ILP; ii++) { 92 | int i = i_start + item_ct1.get_local_id(2) + ii * item_ct1.get_local_range(2); 93 | if (i < n && i < chunk_size) { 94 | r_g[ii] = g[i]; 95 | r_p[ii] = p[i]; 96 | r_m[ii] = m[i]; 97 | r_v[ii] = v[i]; 98 | } else { 99 | r_g[ii] = MATH_T(0); 100 | r_p[ii] = MATH_T(0); 101 | r_m[ii] = MATH_T(0); 102 | r_v[ii] = MATH_T(0); 103 | } 104 | } 105 | #pragma unroll 106 | for (int ii = 0; ii < ILP; ii++) { 107 | if (mode == ADAM_MODE_0) { // L2 108 | r_g[ii] = r_g[ii] + (decay * r_p[ii]); 109 | r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; 110 | r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; 111 | MATH_T next_m_unbiased = r_m[ii] / beta1_correction; 112 | MATH_T next_v_unbiased = r_v[ii] / beta2_correction; 113 | MATH_T denom = sycl::sqrt(next_v_unbiased) + epsilon; 114 | MATH_T update = next_m_unbiased / denom; 115 | r_p[ii] = r_p[ii] - (lr * update); 116 | } else { // weight decay 117 | r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; 118 | r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * 
r_g[ii] * r_g[ii]; 119 | MATH_T next_m_unbiased = r_m[ii] / beta1_correction; 120 | MATH_T next_v_unbiased = r_v[ii] / beta2_correction; 121 | MATH_T denom = sycl::sqrt(next_v_unbiased) + epsilon; 122 | MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]); 123 | r_p[ii] = r_p[ii] - (lr * update); 124 | } 125 | } 126 | #pragma unroll 127 | for (int ii = 0; ii < ILP; ii++) { 128 | int i = i_start + item_ct1.get_local_id(2) + ii * item_ct1.get_local_range(2); 129 | if (i < n && i < chunk_size) { 130 | p[i] = r_p[ii]; 131 | m[i] = r_m[ii]; 132 | v[i] = r_v[ii]; 133 | } 134 | } 135 | } 136 | } 137 | }; 138 | 139 | void multi_tensor_adam_cuda(int chunk_size, 140 | at::Tensor noop_flag, 141 | std::vector> tensor_lists, 142 | const float lr, 143 | const float beta1, 144 | const float beta2, 145 | const float epsilon, 146 | const int step, 147 | const int mode, 148 | const int bias_correction, 149 | const float weight_decay) 150 | { 151 | using namespace at; 152 | 153 | // Handle bias correction mode 154 | float bias_correction1 = 1.0f, bias_correction2 = 1.0f; 155 | if (bias_correction == 1) { 156 | bias_correction1 = 1 - std::pow(beta1, step); 157 | bias_correction2 = 1 - std::pow(beta2, step); 158 | } 159 | 160 | // Assume single type across p,g,m1,m2 now 161 | DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 162 | 0, 163 | "adam", 164 | multi_tensor_apply<4>(BLOCK_SIZE, 165 | chunk_size, 166 | noop_flag, 167 | tensor_lists, 168 | AdamFunctor(), 169 | beta1, 170 | beta2, 171 | bias_correction1, 172 | bias_correction2, 173 | epsilon, 174 | lr, 175 | (adamMode_t)mode, 176 | weight_decay);) 177 | 178 | /* 179 | DPCT1010:9: SYCL uses exceptions to report errors and does not use the error codes. The call was 180 | replaced with 0. You need to rewrite this code. 181 | */ 182 | // AT_CUDA_CHECK(0); 183 | } 184 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_apply.dp.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Copyright NVIDIA/apex 8 | This file is adapted from fused adam in NVIDIA/apex, commit a109f85 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | // #include 16 | // #include 17 | #include 18 | #include "compat.h" 19 | #include "context.h" 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | // This header is the one-stop shop for all your multi-tensor apply needs. 26 | 27 | // TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) 28 | constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; 29 | constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; 30 | 31 | template 32 | struct TensorListMetadata { 33 | void* addresses[n][depth_to_max_tensors[n - 1]]; 34 | int sizes[depth_to_max_tensors[n - 1]]; 35 | unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; 36 | int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. 37 | int start_tensor_this_launch; 38 | }; 39 | 40 | template 41 | class multi_tensor_apply_kernel { 42 | public: 43 | multi_tensor_apply_kernel(int chunk_size, 44 | volatile int* noop_flag, 45 | T tl, 46 | U callable, 47 | ArgTypes... args) 48 | : chunk_size(chunk_size), noop_flag(noop_flag), tl(tl), callable(callable), args(args...) 
49 | { 50 | } 51 | 52 | // This should be identical to original __global__ function 53 | static void inline __global__function(int chunk_size, 54 | volatile int* noop_flag, 55 | T tl, 56 | U callable, 57 | ArgTypes... args) 58 | { 59 | callable(chunk_size, noop_flag, tl, args...); 60 | } 61 | 62 | // If global function template contains parameter pack, 63 | // we only deal with parameter pack at the end of template parameter list 64 | template 65 | static void inline __tuple_expand_driver(int chunk_size, 66 | volatile int* noop_flag, 67 | T tl, 68 | U callable, 69 | Tuple args, 70 | std::index_sequence) 71 | { 72 | __global__function(chunk_size, noop_flag, tl, callable, std::get(args)...); 73 | } 74 | 75 | // 76 | // Because __global__ function can't really use any reference types, we can sure that args 77 | // are all good behaviors 78 | // 79 | void operator()(sycl::nd_item<3>) const 80 | { 81 | __tuple_expand_driver(chunk_size, 82 | noop_flag, 83 | tl, 84 | callable, 85 | args, 86 | std::make_index_sequence()); 87 | } 88 | 89 | private: 90 | int chunk_size; 91 | volatile int* noop_flag; 92 | T tl; 93 | U callable; 94 | std::tuple args; 95 | }; 96 | 97 | template 98 | void multi_tensor_apply(int block_size, 99 | int chunk_size, 100 | const at::Tensor& noop_flag, 101 | const std::vector>& tensor_lists, 102 | T callable, 103 | ArgTypes... args) 104 | { 105 | TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); 106 | int len0 = tensor_lists[0].size(); 107 | TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); 108 | auto ref_device = tensor_lists[0][0].device(); 109 | TORCH_CHECK(ref_device.type() == at::kXPU, "expected input to be on cuda"); 110 | for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices 111 | { 112 | TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); 113 | for (int t = 0; t < tensor_lists[l].size(); t++) { 114 | // TODO: Print which tensor fails. 
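// The checks below enforce the multi_tensor_apply preconditions: every tensor list has the same length as list 0, and each tensor is contiguous (or channels-last on PyTorch >= 1.5), lives on the same device as the first tensor, and matches the numel of its counterpart in list 0.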
115 | bool contiguous_memory = tensor_lists[l][t].is_contiguous(); 116 | #ifdef VERSION_GE_1_5 117 | contiguous_memory = (contiguous_memory || 118 | tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); 119 | #endif 120 | TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); 121 | TORCH_CHECK(tensor_lists[l][t].device() == ref_device, 122 | "A tensor was not on the same device as the first tensor"); 123 | TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); 124 | } 125 | } 126 | 127 | int ntensors = tensor_lists[0].size(); 128 | 129 | TensorListMetadata tl; 130 | 131 | /* const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); */ 132 | auto stream = at::cuda::getCurrentCUDAStream(); 133 | 134 | tl.start_tensor_this_launch = 0; 135 | int loc_block_info = 0; 136 | int loc_tensor_info = 0; 137 | for (int t = 0; t < ntensors; t++) { 138 | tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); 139 | for (int d = 0; d < depth; d++) 140 | tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); 141 | loc_tensor_info++; 142 | 143 | int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; 144 | 145 | for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { 146 | // std::cout << chunks_this_tensor << std::endl; 147 | tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; 148 | tl.block_to_chunk[loc_block_info] = chunk; 149 | loc_block_info++; 150 | 151 | bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && 152 | chunk == chunks_this_tensor - 1); 153 | bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); 154 | bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); 155 | if (tensors_full || blocks_full || last_chunk) { 156 | // using accscalar_t = acc_type; 157 | /* 158 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To 159 | get the device limit, query info::device::max_work_group_size. Adjust the work-group 160 | size if needed. 161 | */ 162 | /* multi_tensor_apply_kernel, T, ArgTypes...> 163 | * fn(chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); */ 164 | if constexpr (sizeof(multi_tensor_apply_kernel( 165 | chunk_size, noop_flag.DATA_PTR(), tl, callable, args...)) < 166 | 2048) { 167 | ((sycl::queue*)(stream)) 168 | ->parallel_for( 169 | sycl::nd_range<3>(sycl::range<3>(1, 1, loc_block_info) * 170 | sycl::range<3>(1, 1, block_size), 171 | sycl::range<3>(1, 1, block_size)), 172 | multi_tensor_apply_kernel( 173 | chunk_size, noop_flag.DATA_PTR(), tl, callable, args...)); 174 | } else { 175 | auto capture = multi_tensor_apply_kernel( 176 | chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); 177 | sycl::buffer params(const_cast(&capture), 178 | sycl::range<1>(1)); 179 | stream->submit([&](sycl::handler& cgh) { 180 | auto device_params = 181 | params.template get_access(cgh); 183 | cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, loc_block_info) * 184 | sycl::range<3>(1, 1, block_size), 185 | sycl::range<3>(1, 1, block_size)), 186 | [=](sycl::nd_item<3> item) { device_params[0](item); }); 187 | }); 188 | } 189 | /* 190 | DPCT1010:5: SYCL uses exceptions to report errors and does not use the error codes. 191 | The call was replaced with 0. You need to rewrite this code. 192 | */ 193 | 0; 194 | 195 | // Reset. The control flow possibilities here make my brain hurt. 
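// After a launch is issued, the bookkeeping below either starts completely fresh from the next tensor (when the launch consumed this tensor's final chunk) or copies the current tensor's size and addresses into slot 0 so its remaining chunks are carried into the next launch.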
196 | loc_block_info = 0; 197 | if (chunk == chunks_this_tensor - 1) { 198 | // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << 199 | // std::endl; 200 | loc_tensor_info = 0; 201 | tl.start_tensor_this_launch = t + 1; 202 | } else { 203 | // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << 204 | // std::endl; 205 | tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; 206 | for (int d = 0; d < depth; d++) 207 | tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; 208 | loc_tensor_info = 1; 209 | tl.start_tensor_this_launch = t; 210 | } 211 | } 212 | } 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/common/custom_cuda_kernel.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "custom_cuda_layers.h" 9 | 10 | void param_update_kernel(const float* input, sycl::half* output, int size) 11 | { 12 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 13 | int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); 14 | 15 | if (id < size) { output[id] = (sycl::half)input[id]; } 16 | } 17 | 18 | void launch_param_update(const float* input, sycl::half* output, int size, dpct::queue_ptr stream) 19 | { 20 | int threads = 1024; 21 | 22 | sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); 23 | sycl::range<3> block_dim(1, 1, threads); 24 | 25 | /* 26 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the 27 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 28 | */ 29 | { 30 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 31 | stream->parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), 32 | [=](sycl::nd_item<3> item_ct1) { 33 | param_update_kernel(input, output, size); 34 | }); 35 | } 36 | } 37 | 38 | void param_update_kernel_half(const float* input, sycl::half* output, int size) 39 | { 40 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 41 | int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); 42 | sycl::half2* output_cast = reinterpret_cast(output); 43 | if (id < size) { 44 | float input_f = input[id]; 45 | sycl::half2* input_h = reinterpret_cast(&input_f); 46 | output_cast[id] = *input_h; 47 | } 48 | } 49 | 50 | void launch_param_update_half(const float* input, 51 | sycl::half* output, 52 | int size, 53 | dpct::queue_ptr stream) 54 | { 55 | int threads = 1024; 56 | size /= 2; 57 | sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); 58 | sycl::range<3> block_dim(1, 1, threads); 59 | 60 | /* 61 | DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the limit. To get the 62 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
63 | */ 64 | { 65 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 66 | stream->parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), 67 | [=](sycl::nd_item<3> item_ct1) { 68 | param_update_kernel_half(input, output, size); 69 | }); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/flash_attn.dp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "context.h" 3 | #include "flash_attn.h" 4 | 5 | // [Bs, Hn, Sl, Hs] 6 | std::vector flash_attn_fwd(torch::Tensor &q, 7 | torch::Tensor &k, 8 | torch::Tensor &v, 9 | uint32_t bs, 10 | uint32_t head_number, 11 | uint32_t seqlens, 12 | uint32_t head_size, 13 | float softmax_scale, 14 | float dropout_prob, 15 | uint64_t dropout_rand_seed, 16 | bool is_causal, 17 | bool is_training, 18 | bool is_dropout) { 19 | torch::Tensor q_ = q.transpose(1, 2).contiguous().transpose(1, 2); 20 | torch::Tensor k_ = k.transpose(1, 2).contiguous().transpose(1, 2); 21 | torch::Tensor v_ = v.transpose(1, 2).contiguous().transpose(1, 2); 22 | torch::Tensor output = torch::empty_like(q_); 23 | torch::Tensor softmax_L, dropout_mask; 24 | softmax_L = torch::empty({bs * head_number, 1, seqlens}, q.options()).to(at::kFloat); 25 | 26 | void *q_ptr = (void *)q_.data_ptr(); 27 | void *k_ptr = (void *)k_.data_ptr(); 28 | void *v_ptr = (void *)v_.data_ptr(); 29 | void *output_ptr = (void *)output.data_ptr(); 30 | void *softmax_L_ptr = (void *)softmax_L.data_ptr(); 31 | void *drop_mask_ptr = nullptr; 32 | uint64_t dropout_rand_offset = 123; 33 | 34 | sycl::queue* stream = ::TrainingContext::Instance().GetCurrentStream(); 35 | FlashAttention _flash_attn = FlashAttention(); 36 | _flash_attn.Forward( 37 | *stream, 38 | output_ptr, 39 | softmax_L_ptr, 40 | bs, 41 | head_number, 42 | head_size, 43 | seqlens, 44 | seqlens, 45 | softmax_scale, 46 | q_ptr, 47 | k_ptr, 48 | v_ptr, 49 | drop_mask_ptr, 50 | dropout_prob, 51 | dropout_rand_seed, 52 | dropout_rand_offset, 53 | is_causal, 54 | is_training, 55 | is_dropout 56 | ); 57 | return {output, softmax_L}; 58 | } 59 | 60 | std::vector flash_attn_bwd(torch::Tensor &gradout, 61 | torch::Tensor &q, 62 | torch::Tensor &k, 63 | torch::Tensor &v, 64 | torch::Tensor &out, 65 | uint32_t bs, 66 | uint32_t head_number, 67 | uint32_t seqlens, 68 | uint32_t head_size, 69 | float softmax_scale, 70 | float dropout_prob, 71 | uint64_t dropout_rand_seed, 72 | bool is_causal, 73 | bool is_dropout, 74 | torch::Tensor &softmax_L) { 75 | torch::Tensor q_ = q.transpose(1, 2).contiguous().transpose(1, 2); 76 | torch::Tensor k_ = k.transpose(1, 2).contiguous().transpose(1, 2); 77 | torch::Tensor v_ = v.transpose(1, 2).contiguous().transpose(1, 2); 78 | torch::Tensor out_ = out.transpose(1, 2).contiguous().transpose(1, 2); 79 | torch::Tensor grad_out_ = gradout.transpose(1, 2).contiguous().transpose(1, 2); 80 | 81 | torch::Tensor dq = torch::zeros_like(q_); 82 | torch::Tensor dk = torch::empty_like(k_); 83 | torch::Tensor dv = torch::empty_like(v_); 84 | torch::Tensor d_buffer = torch::empty_like(softmax_L); 85 | void *gradout_ptr = (void *)grad_out_.data_ptr(); 86 | void *q_ptr = (void *)q_.data_ptr(); 87 | void *k_ptr = (void *)k_.data_ptr(); 88 | void *v_ptr = (void *)v_.data_ptr(); 89 | void *out_ptr = (void *)out_.data_ptr(); 90 | void *dq_ptr = (void *)dq.data_ptr(); 91 | void *dk_ptr = (void *)dk.data_ptr(); 92 | void *dv_ptr = (void *)dv.data_ptr(); 93 | 
void *softmax_L_ptr = (void *)softmax_L.data_ptr(); 94 | void *d_buffer_ptr = (void *)d_buffer.data_ptr(); 95 | void *drop_mask_ptr = nullptr; 96 | uint64_t dropout_rand_offset = 123; 97 | 98 | sycl::queue* stream = ::TrainingContext::Instance().GetCurrentStream(); 99 | FlashAttention _flash_attn = FlashAttention(); 100 | _flash_attn.Backward( 101 | *stream, 102 | dq_ptr, 103 | dk_ptr, 104 | dv_ptr, 105 | out_ptr, 106 | gradout_ptr, 107 | softmax_L_ptr, 108 | d_buffer_ptr, 109 | bs, 110 | head_number, 111 | head_size, 112 | seqlens, 113 | seqlens, 114 | softmax_scale, 115 | q_ptr, 116 | k_ptr, 117 | v_ptr, 118 | drop_mask_ptr, 119 | dropout_prob, 120 | dropout_rand_seed, 121 | dropout_rand_offset, 122 | is_causal, 123 | is_dropout 124 | ); 125 | return {dq, dk, dv}; 126 | } 127 | 128 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 129 | { 130 | m.def("flash_attn_fwd", 131 | &flash_attn_fwd, 132 | "Flash attention forward"); 133 | m.def("flash_attn_bwd", 134 | &flash_attn_bwd, 135 | "Flash attention backward"); 136 | } 137 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/flash_attn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | class FlashAttention { 7 | public: 8 | virtual ~FlashAttention() {} 9 | 10 | bool Forward(sycl::queue &stream, 11 | void* output, 12 | void* softmax_L, 13 | uint32_t num_batches, 14 | uint32_t num_heads, 15 | uint32_t head_size, 16 | uint32_t num_queries, 17 | uint32_t num_keys, 18 | float hs_rsqrt_scale, 19 | void* q_ptr, 20 | void* k_ptr, 21 | void* v_ptr, 22 | void* dropout_mask = nullptr, 23 | float dropout_prob = 0.0, 24 | uint64_t rand_seed = 0, 25 | uint64_t rank_offset = 0, 26 | bool is_causal = true, 27 | bool is_training = true, 28 | bool is_dropout = true) { 29 | RECORD_FUNCTION("flash_scaled_attn_bf16_fwd", c10::ArrayRef({})); 30 | gpu::xetla::fmha_forward_kernel( 31 | gpu::xetla::XetlaType::bf16, 32 | stream, 33 | q_ptr, 34 | k_ptr, 35 | v_ptr, 36 | dropout_mask, 37 | output, 38 | softmax_L, 39 | hs_rsqrt_scale, 40 | dropout_prob, 41 | num_batches, 42 | num_heads, 43 | head_size, 44 | num_queries, 45 | num_keys, 46 | num_keys, 47 | is_causal, 48 | is_training, 49 | is_dropout, 50 | rand_seed, 51 | rank_offset 52 | ); 53 | 54 | return true; 55 | } 56 | 57 | bool Backward(sycl::queue &stream, 58 | void* dq, 59 | void* dk, 60 | void* dv, 61 | void* out, // [Bs, Hn, Sl, Hs] 62 | void* gradout, 63 | void* softmax_workspace, // [Bs*Hn, 1, Sl]: row_max + log(row_sum) 64 | void* d_buffer, // temp buffer for D = O pointmul dO [Bs*Hn, 1, Sl] 65 | uint32_t num_batches, 66 | uint32_t num_heads, 67 | uint32_t head_size, 68 | uint32_t num_queries, 69 | uint32_t num_keys, 70 | float hs_rsqrt_scale, 71 | void* q_ptr, 72 | void* k_ptr, 73 | void* v_ptr, 74 | void* dropout_mask = nullptr, 75 | float dropout_prob = 0.0, 76 | uint64_t rand_seed = 0, 77 | uint64_t rank_offset = 0, 78 | bool is_causal = true, 79 | bool is_dropout = true) { 80 | RECORD_FUNCTION("flash_scaled_attn_bf16_bwd", c10::ArrayRef({})); 81 | gpu::xetla::fmha_backward_kernel( 82 | gpu::xetla::XetlaType::bf16, 83 | stream, 84 | gradout, 85 | q_ptr, 86 | k_ptr, 87 | v_ptr, 88 | out, 89 | softmax_workspace, 90 | d_buffer, 91 | hs_rsqrt_scale, 92 | dropout_prob, 93 | dq, 94 | dk, 95 | dv, 96 | num_batches, 97 | num_heads, 98 | head_size, 99 | num_queries, 100 | num_keys, 101 | num_keys, 102 | is_causal, 103 | is_dropout, 104 | rand_seed, 105 
| rank_offset 106 | ); 107 | return true; 108 | } 109 | }; 110 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/fmha_policy.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "xetla.hpp" 4 | 5 | namespace gpu::xetla { 6 | 7 | struct fmha_policy_base { 8 | static constexpr uint32_t accum_step = 16; 9 | static constexpr uint32_t stages = 3; 10 | static constexpr uint32_t sync_freq = 0; 11 | }; 12 | 13 | /* 14 | Note: 15 | kHm / kSgHm == kBc / kSgBc 16 | kSgHm and kSgBc should be a multiple of 16 17 | kSgBr should be a multiple of 8 18 | */ 19 | 20 | struct fmha_policy_128x128x64 : fmha_policy_base { 21 | static constexpr uint32_t kBr = 128; 22 | static constexpr uint32_t kSgBr = 16; 23 | static constexpr uint32_t kBc = 128; 24 | static constexpr uint32_t kSgBc = 32; 25 | static constexpr uint32_t kBcHm_SgBc = 16; 26 | static constexpr uint32_t kHm = 64; 27 | static constexpr uint32_t kSgHm = 16; 28 | static constexpr uint32_t thread_num = (kBr / kSgBr) * (kBc / kSgBc); 29 | }; 30 | 31 | struct fmha_policy_128x128x128 : fmha_policy_base { 32 | static constexpr uint32_t kBr = 128; 33 | static constexpr uint32_t kSgBr = 16; 34 | static constexpr uint32_t kBc = 128; 35 | static constexpr uint32_t kSgBc = 32; 36 | static constexpr uint32_t kBcHm_SgBc = 16; 37 | static constexpr uint32_t kHm = 128; 38 | static constexpr uint32_t kSgHm = 32; 39 | static constexpr uint32_t thread_num = (kBr / kSgBr) * (kBc / kSgBc); 40 | }; 41 | 42 | struct fmha_policy_128x128x256 : fmha_policy_base { 43 | static constexpr uint32_t kBr = 128; 44 | static constexpr uint32_t kSgBr = 16; 45 | static constexpr uint32_t kBc = 128; 46 | static constexpr uint32_t kSgBc = 32; 47 | static constexpr uint32_t kBcHm_SgBc = 16; 48 | static constexpr uint32_t kHm = 256; 49 | static constexpr uint32_t kSgHm = 64; 50 | static constexpr uint32_t thread_num = (kBr / kSgBr) * (kBc / kSgBc); 51 | }; 52 | 53 | } // namespace gpu::xetla -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/mha.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if __has_include() 4 | #include 5 | #elif __has_include() 6 | #include 7 | #else 8 | #error "Unsupported compiler" 9 | #endif 10 | 11 | #include 12 | #include 13 | #include 14 | #include "xetla.hpp" 15 | 16 | #define DPCPP_Q_CGF(h) [&](sycl::handler & h) 17 | 18 | #define DPCPP_Q_SUBMIT(q, cgf, ...) 
\ 19 | { \ 20 | auto e = (q).submit((cgf), ##__VA_ARGS__); \ 21 | (q).throw_asynchronous(); \ 22 | xpu::profiler_record("dpcpp_kernel", e); \ 23 | } 24 | 25 | namespace gpu::xetla { 26 | 27 | enum class XetlaType { 28 | fp16, 29 | bf16, 30 | }; 31 | 32 | void fmha_forward_kernel( 33 | XetlaType xeType, 34 | sycl::queue& q, 35 | void* query, 36 | void* key, 37 | void* value, 38 | void* dropout, 39 | void* out, 40 | void* log_sumexp, 41 | float alpha, 42 | float dropout_prob, 43 | uint32_t num_batches, 44 | uint32_t num_heads, 45 | uint32_t head_size, 46 | uint32_t num_queries, 47 | uint32_t num_keys, 48 | uint32_t attn_mask_padded_block_size, 49 | bool is_causal, 50 | bool is_training, 51 | bool is_dropout, 52 | uint64_t seed_t, 53 | uint64_t offset_t); 54 | 55 | void fmha_backward_kernel( 56 | XetlaType xeType, 57 | sycl::queue& q, 58 | void* grad_out, 59 | void* query, 60 | void* key, 61 | void* value, 62 | void* out, 63 | void* log_sumexp, 64 | void* workspace, 65 | float alpha, 66 | float dropout_prob, 67 | void* grad_query, 68 | void* grad_key, 69 | void* grad_value, 70 | uint32_t num_batches, 71 | uint32_t num_heads, 72 | uint32_t head_size, 73 | uint32_t num_queries, 74 | uint32_t num_keys, 75 | uint32_t attn_mask_padding, 76 | bool is_causal, 77 | bool is_dropout, 78 | uint64_t seed_t, 79 | uint64_t offset_t); 80 | 81 | } // namespace gpu::xetla 82 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/StopWatch.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | #ifdef _WIN32 8 | #include 9 | #else 10 | #include 11 | #endif 12 | 13 | #ifdef _WIN32 14 | 15 | class Stopwatch { 16 | private: 17 | double m_total_time; 18 | LARGE_INTEGER m_start_time; 19 | 20 | public: 21 | Stopwatch() { m_total_time = 0.0; } 22 | 23 | ~Stopwatch() {} 24 | 25 | void Reset() { m_total_time = 0.0; } 26 | 27 | void Start() { QueryPerformanceCounter(&m_start_time); } 28 | 29 | void Restart() 30 | { 31 | m_total_time = 0.0; 32 | QueryPerformanceCounter(&m_start_time); 33 | } 34 | 35 | void Stop() 36 | { 37 | LARGE_INTEGER frequency; 38 | LARGE_INTEGER stop_time; 39 | QueryPerformanceFrequency(&frequency); 40 | QueryPerformanceCounter(&stop_time); 41 | m_total_time += 42 | ((double)(stop_time.QuadPart - m_start_time.QuadPart) / (double)frequency.QuadPart); 43 | } 44 | 45 | double GetTimeInSeconds() { return m_total_time; } 46 | }; 47 | 48 | #else 49 | 50 | class Stopwatch { 51 | private: 52 | double m_total_time; 53 | struct timespec m_start_time; 54 | bool m_is_started; 55 | 56 | public: 57 | Stopwatch() 58 | { 59 | m_total_time = 0.0; 60 | m_is_started = false; 61 | } 62 | 63 | ~Stopwatch() {} 64 | 65 | void Reset() { m_total_time = 0.0; } 66 | 67 | void Start() 68 | { 69 | clock_gettime(CLOCK_MONOTONIC, &m_start_time); 70 | m_is_started = true; 71 | } 72 | 73 | void Restart() 74 | { 75 | m_total_time = 0.0; 76 | clock_gettime(CLOCK_MONOTONIC, &m_start_time); 77 | m_is_started = true; 78 | } 79 | 80 | void Stop() 81 | { 82 | if (m_is_started) { 83 | m_is_started = false; 84 | 85 | struct timespec end_time; 86 | clock_gettime(CLOCK_MONOTONIC, &end_time); 87 | 88 | m_total_time += (double)(end_time.tv_sec - m_start_time.tv_sec) + 89 | (double)(end_time.tv_nsec - m_start_time.tv_nsec) / 1e9; 90 | } 91 | } 92 | 93 | double GetTimeInSeconds() 94 | { 95 | if 
(m_is_started) { 96 | Stop(); 97 | Start(); 98 | } 99 | return m_total_time; 100 | } 101 | }; 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/compat.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Copyright NVIDIA/apex 8 | This file is adapted from fused adam in NVIDIA/apex, commit a109f85 9 | */ 10 | 11 | #ifndef TORCH_CHECK 12 | #define TORCH_CHECK AT_CHECK 13 | #endif 14 | 15 | #ifdef VERSION_GE_1_3 16 | #define DATA_PTR data_ptr 17 | #else 18 | #define DATA_PTR data 19 | #endif 20 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/context.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | /* #include */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include "gemm_test.h" 19 | 20 | #include 21 | 22 | #ifndef SYCL_CUDA_STREAM 23 | #define SYCL_CUDA_STREAM 24 | namespace at { 25 | namespace cuda { 26 | inline dpct::queue_ptr getCurrentCUDAStream() { 27 | auto device_type = c10::DeviceType::XPU; 28 | c10::impl::VirtualGuardImpl impl(device_type); 29 | c10::Stream c10_stream = impl.getStream(c10::Device(device_type)); 30 | auto& queue = xpu::get_queue_from_stream(c10_stream); 31 | return &queue; 32 | } 33 | 34 | inline dpct::queue_ptr getStreamFromPool(bool) { 35 | // not implemented 36 | return nullptr; 37 | } 38 | 39 | inline dpct::queue_ptr getStreamFromPool() { 40 | // not implemented 41 | return nullptr; 42 | } 43 | } 44 | } 45 | #endif 46 | 47 | #define WARP_SIZE 32 48 | 49 | #define CUDA_CHECK(callstr) \ 50 | { \ 51 | cudaError_t error_code = callstr; \ 52 | if (error_code != cudaSuccess) { \ 53 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 54 | assert(0); \ 55 | } \ 56 | } 57 | 58 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 59 | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) 60 | 61 | #define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ 62 | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ 63 | for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) 64 | 65 | #define DS_CUDA_NUM_THREADS 512 66 | #define DS_MAXIMUM_NUM_BLOCKS 262144 67 | 68 | inline int DS_GET_BLOCKS(const int N) 69 | { 70 | return (std::max)( 71 | (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), 72 | // Use at least 1 block, since CUDA does not allow empty block 73 | 1); 74 | } 75 | 76 | class TrainingContext { 77 | public: 78 | TrainingContext() try : _workspace(nullptr), _seed(42), _curr_offset(0) { 79 | _gen = dpct::rng::create_host_rng(dpct::rng::random_engine_type::mcg59); 80 | _gen->set_seed(123); 81 | int stat = DPCT_CHECK_ERROR(_cublasHandle = &dpct::get_in_order_queue()); 82 | if (stat != 0) { 83 | // It would be nice to use cublasGetStatusName and 84 | // cublasGetStatusString, but they were only added in CUDA 11.4.2. 
85 | auto message = std::string("Failed to create cublas handle: cublasStatus_t was ") + 86 | std::to_string(stat); 87 | std::cerr << message << std::endl; 88 | throw std::runtime_error(message); 89 | } 90 | } 91 | catch (sycl::exception const& exc) { 92 | std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ 93 | << std::endl; 94 | std::exit(1); 95 | } 96 | 97 | virtual ~TrainingContext() 98 | { 99 | _cublasHandle = nullptr; 100 | sycl::free(_workspace, dpct::get_in_order_queue()); 101 | } 102 | 103 | static TrainingContext& Instance() 104 | { 105 | static TrainingContext _ctx; 106 | return _ctx; 107 | } 108 | 109 | void SetWorkSpace(void* workspace) 110 | { 111 | if (!workspace) { throw std::runtime_error("Workspace is null."); } 112 | _workspace = workspace; 113 | } 114 | 115 | void* GetWorkSpace() { return _workspace; } 116 | 117 | dpct::rng::host_rng_ptr& GetRandGenerator() { return _gen; } 118 | 119 | dpct::queue_ptr GetCurrentStream() 120 | { 121 | // get current pytorch stream. 122 | dpct::queue_ptr stream = at::cuda::getCurrentCUDAStream(); 123 | return stream; 124 | } 125 | 126 | dpct::queue_ptr GetNewStream() { return at::cuda::getStreamFromPool(); } 127 | 128 | dpct::queue_ptr GetCublasHandle() { return _cublasHandle; } 129 | 130 | std::pair IncrementOffset(uint64_t offset_inc) 131 | { 132 | uint64_t offset = _curr_offset; 133 | _curr_offset += offset_inc; 134 | return std::pair(_seed, offset); 135 | } 136 | 137 | void SetSeed(uint64_t new_seed) { _seed = new_seed; } 138 | 139 | void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head) 140 | { 141 | // avoid rerun. 142 | if (_gemm_algos.size() > 0) return; 143 | 144 | if (test_gemm) { 145 | dpct::queue_ptr handle = GetCublasHandle(); 146 | 147 | std::unique_ptr> test_qkv_fw( 148 | new GemmTest(batch_size * seq_len, // M 149 | head_num * size_per_head, // N 150 | head_num * size_per_head, // K 151 | oneapi::mkl::transpose::trans, 152 | oneapi::mkl::transpose::nontrans, 153 | handle)); 154 | 155 | std::unique_ptr> test_inter( 156 | new GemmTest(batch_size * seq_len, // M 157 | 4 * head_num * size_per_head, // N 158 | head_num * size_per_head, // K 159 | oneapi::mkl::transpose::trans, 160 | oneapi::mkl::transpose::nontrans, 161 | handle)); 162 | 163 | std::unique_ptr> test_output( 164 | new GemmTest(batch_size * seq_len, // M 165 | head_num * size_per_head, // N 166 | 4 * head_num * size_per_head, // K 167 | oneapi::mkl::transpose::trans, 168 | oneapi::mkl::transpose::nontrans, 169 | handle)); 170 | 171 | std::unique_ptr> test_attn_scores( 172 | new StridedGemmTest(batch_size * head_num, // batch 173 | seq_len, // M 174 | seq_len, // N 175 | size_per_head, // K 176 | oneapi::mkl::transpose::trans, 177 | oneapi::mkl::transpose::nontrans, 178 | handle)); 179 | 180 | std::unique_ptr> test_attn_context( 181 | new StridedGemmTest(batch_size * head_num, // batch 182 | size_per_head, // M 183 | seq_len, // N 184 | seq_len, // K 185 | oneapi::mkl::transpose::nontrans, 186 | oneapi::mkl::transpose::nontrans, 187 | handle)); 188 | 189 | _gemm_algos.push_back(test_qkv_fw->TestAlgo(100)); 190 | _gemm_algos.push_back(test_inter->TestAlgo(100)); 191 | _gemm_algos.push_back(test_output->TestAlgo(100)); 192 | _gemm_algos.push_back(test_attn_scores->TestAlgo(100)); 193 | _gemm_algos.push_back(test_attn_context->TestAlgo(100)); 194 | } else { 195 | // Use default algo. 
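// Fallback path: push one default {99, 99, 99} algorithm triple for each of the five GEMMs probed above (QKV, intermediate, output, attention scores, attention context).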
196 | _gemm_algos.push_back(std::array({99, 99, 99})); 197 | _gemm_algos.push_back(std::array({99, 99, 99})); 198 | _gemm_algos.push_back(std::array({99, 99, 99})); 199 | _gemm_algos.push_back(std::array({99, 99, 99})); 200 | _gemm_algos.push_back(std::array({99, 99, 99})); 201 | } 202 | } 203 | 204 | const std::vector>& GetGemmAlgos() const { return _gemm_algos; } 205 | 206 | private: 207 | dpct::rng::host_rng_ptr _gen; 208 | dpct::queue_ptr _cublasHandle; 209 | void* _workspace; 210 | uint64_t _seed; 211 | uint64_t _curr_offset; 212 | std::vector> _gemm_algos; 213 | }; 214 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adagrad.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #define NOMINMAX // Windows idiosyncrasy 9 | // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c 10 | 11 | #include 12 | #include 13 | #include "simd.h" 14 | 15 | #if defined(__ENABLE_CUDA__) 16 | #include 17 | #include 18 | #include "cuda.h" 19 | #include "custom_cuda_layers.h" 20 | typedef sycl::half ds_half_precision_t; 21 | #elif defined(__ENABLE_CANN__) 22 | #include "acl/acl.h" 23 | #include "torch_npu/csrc/core/npu/NPUStream.h" 24 | typedef c10::Half ds_half_precision_t; 25 | #else 26 | typedef unsigned short ds_half_precision_t; 27 | #endif 28 | 29 | #define STEP(SPAN) \ 30 | void Step_##SPAN(float* _params, \ 31 | float* grads, \ 32 | float* _exp_avg_sq, \ 33 | size_t _param_size, \ 34 | ds_half_precision_t* dev_param = nullptr, \ 35 | bool half_precision = false); 36 | 37 | class Adagrad_Optimizer { 38 | public: 39 | Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) 40 | : _alpha(alpha), _eps(eps), _weight_decay(weight_decay) 41 | { 42 | #if defined(__ENABLE_CUDA__) 43 | cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); 44 | cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); 45 | 46 | _streams[0] = TrainingContext::Instance().GetCurrentStream(); 47 | _streams[1] = TrainingContext::Instance().GetNewStream(); 48 | _buf_index = false; 49 | #elif defined(__ENABLE_CANN__) 50 | aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); 51 | aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); 52 | 53 | _buf_index = false; 54 | #endif 55 | } 56 | ~Adagrad_Optimizer() 57 | { 58 | #if defined(__ENABLE_CUDA__) 59 | cudaFreeHost(_doubled_buffer[0]); 60 | cudaFreeHost(_doubled_buffer[1]); 61 | #elif defined(__ENABLE_CANN__) 62 | aclrtFreeHost(_doubled_buffer[0]); 63 | aclrtFreeHost(_doubled_buffer[1]); 64 | #endif 65 | } 66 | #if defined(__AVX512__) or defined(__AVX256__) 67 | template 68 | void Step_AVX(size_t* rounded_size, 69 | float* _params, 70 | float* grads, 71 | float* _exp_avg_sq, 72 | size_t param_size, 73 | ds_half_precision_t* dev_param = nullptr, 74 | bool half_precision = false); 75 | #endif 76 | STEP(1) 77 | STEP(4) 78 | STEP(8) 79 | #if defined(__ENABLE_CUDA__) 80 | inline void SynchronizeStreams() 81 | { 82 | for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); 83 | } 84 | #elif defined(__ENABLE_CANN__) 85 | inline void SynchronizeStreams() 86 | { 87 | for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); 88 | } 89 | #endif 90 | inline void IncrementStep(size_t step) 91 | { 92 | 
_step++; 93 | if (_step != step) { _step = step; } 94 | } 95 | inline void update_state(float lr, float epsilon, float weight_decay) 96 | { 97 | _alpha = lr; 98 | _eps = epsilon; 99 | _weight_decay = weight_decay; 100 | } 101 | 102 | private: 103 | float _alpha; 104 | float _eps; 105 | float _weight_decay; 106 | 107 | float _betta1_t; 108 | float _betta2_t; 109 | size_t _step; 110 | 111 | #if defined(__ENABLE_CUDA__) 112 | bool _buf_index; 113 | float* _doubled_buffer[2]; 114 | cudaStream_t _streams[2]; 115 | #elif defined(__ENABLE_CANN__) 116 | float* _doubled_buffer[2]; 117 | c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), 118 | c10_npu::getNPUStreamFromPool()}; 119 | bool _buf_index; 120 | #endif 121 | }; 122 | 123 | #if defined(__AVX512__) or defined(__AVX256__) 124 | template <int span> 125 | void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, 126 | float* _params, 127 | float* grads, 128 | float* _exp_avg_sq, 129 | size_t _param_size, 130 | ds_half_precision_t* dev_params, 131 | bool half_precision) 132 | { 133 | size_t new_rounded_size = 0; 134 | AVX_Data eps_4; 135 | eps_4.data = SIMD_SET(_eps); 136 | 137 | float step_size = -1 * _alpha; 138 | AVX_Data step_size_4; 139 | step_size_4.data = SIMD_SET(step_size); 140 | 141 | AVX_Data weight_decay4; 142 | if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay); 143 | new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); 144 | for (size_t t = 0; t < new_rounded_size; t += TILE) { 145 | size_t copy_size = TILE; 146 | if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; 147 | size_t offset = copy_size + t; 148 | #if defined(__ENABLE_CUDA__) 149 | if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } 150 | #elif defined(__ENABLE_CANN__) 151 | if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } 152 | #endif 153 | #pragma omp parallel for 154 | for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { 155 | AVX_Data grad_4[span]; 156 | simd_load<span>(grad_4, grads + i, half_precision); 157 | 158 | AVX_Data momentum_4[span]; 159 | simd_load<span>(momentum_4, grads + i, false); 160 | 161 | AVX_Data variance_4[span]; 162 | simd_load<span>(variance_4, _exp_avg_sq + i, false); 163 | 164 | AVX_Data param_4[span]; 165 | simd_load<span>(param_4, _params + i, half_precision); 166 | 167 | if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); } 168 | 169 | simd_fma<span>(variance_4, grad_4, grad_4, variance_4); 170 | simd_sqrt<span>(grad_4, variance_4); 171 | simd_add<span>(grad_4, grad_4, eps_4); 172 | simd_div<span>(grad_4, momentum_4, grad_4); 173 | simd_fma<span>(param_4, grad_4, step_size_4, param_4); 174 | 175 | simd_store<span>(_params + i, param_4, half_precision); 176 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 177 | if (dev_params) { 178 | simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); 179 | } 180 | #endif 181 | simd_store<span>(_exp_avg_sq + i, variance_4, false); 182 | } 183 | #if defined(__ENABLE_CUDA__) 184 | if (dev_params) { 185 | if (half_precision) 186 | launch_param_update_half( 187 | _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); 188 | else 189 | launch_param_update( 190 | _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); 191 | 192 | _buf_index = !_buf_index; 193 | } 194 | #elif defined(__ENABLE_CANN__) 195 | if (dev_params) { 196 | size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); 197 | if (half_precision) memcpy_size /= 2; 198 | aclrtMemcpy(dev_params + t,
199 | memcpy_size, 200 | _doubled_buffer[_buf_index], 201 | memcpy_size, 202 | aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); 203 | 204 | _buf_index = !_buf_index; 205 | #endif 206 | } 207 | *rounded_size = new_rounded_size; 208 | } 209 | #endif 210 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/cublas_wrappers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #ifndef __HIP_PLATFORM_AMD__ 14 | #endif 15 | #ifdef __HIP_PLATFORM_AMD__ 16 | #include 17 | #endif 18 | #include 19 | 20 | int cublas_gemm_ex(dpct::queue_ptr handle, 21 | oneapi::mkl::transpose transa, 22 | oneapi::mkl::transpose transb, 23 | int m, 24 | int n, 25 | int k, 26 | const float* alpha, 27 | const float* beta, 28 | const float* A, 29 | const float* B, 30 | float* C, 31 | #ifdef __HIP_PLATFORM_AMD__ 32 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 33 | #else 34 | int algo = -1); 35 | #endif 36 | 37 | int cublas_gemm_ex(dpct::queue_ptr handle, 38 | oneapi::mkl::transpose transa, 39 | oneapi::mkl::transpose transb, 40 | int m, 41 | int n, 42 | int k, 43 | const float* alpha, 44 | const float* beta, 45 | const sycl::half* A, 46 | const sycl::half* B, 47 | sycl::half* C, 48 | #ifdef __HIP_PLATFORM_AMD__ 49 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 50 | #else 51 | int algo = 99); 52 | #endif 53 | 54 | int cublas_strided_batched_gemm(dpct::queue_ptr handle, 55 | int m, 56 | int n, 57 | int k, 58 | const float* alpha, 59 | const float* beta, 60 | const float* A, 61 | const float* B, 62 | float* C, 63 | oneapi::mkl::transpose op_A, 64 | oneapi::mkl::transpose op_B, 65 | int stride_A, 66 | int stride_B, 67 | int stride_C, 68 | int batch, 69 | #ifdef __HIP_PLATFORM_AMD__ 70 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 71 | #else 72 | int algo = -1); 73 | #endif 74 | 75 | int cublas_strided_batched_gemm(dpct::queue_ptr handle, 76 | int m, 77 | int n, 78 | int k, 79 | const float* alpha, 80 | const float* beta, 81 | const sycl::half* A, 82 | const sycl::half* B, 83 | sycl::half* C, 84 | oneapi::mkl::transpose op_A, 85 | oneapi::mkl::transpose op_B, 86 | int stride_A, 87 | int stride_B, 88 | int stride_C, 89 | int batch, 90 | #ifdef __HIP_PLATFORM_AMD__ 91 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 92 | #else 93 | int algo = 99); 94 | #endif 95 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dequantization_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "ds_kernel_utils.h" 10 | #include "quantization.h" 11 | #include "quantization_utils.h" 12 | 13 | #pragma once 14 | 15 | namespace dequantize { 16 | using Type = quantize::Type; 17 | 18 | template 19 | using Params = quantize::Params; 20 | 21 | constexpr int granularity = quantize::granularity; 22 | using PackedInt4 = quantize::PackedInt4; 23 | 24 | constexpr int h_per_chunk = granularity / sizeof(sycl::half); 25 | constexpr int h2_per_chunk = granularity / sizeof(sycl::half2); 26 | 27 | /* 28 | Device function that reads quantized data from global memory, dequantizes 29 | it, and stores it to global memory. 30 | Template Arguments : 31 | numBits - Number of bits in quantized element. int: 4, 8 32 | qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric 33 | unroll - Number of load steps to internally unroll int 34 | threads - Number of threads to perform dequant int 35 | Function arguments: 36 | global_output - sycl::half pointer in global memory 37 | data - Quantized data in global memory 38 | global_params - Quantization parameters in global memory 39 | elems_per_group - Number of elements in each quantization group 40 | total_elems - Tensor size (note, does not need to be multiple of elems_per_group) 41 | */ 42 | template 43 | DS_D_INLINE void to_global(sycl::half* global_output, 44 | const int8_t* data, 45 | const float* global_params, 46 | const int elems_per_group, 47 | const int total_elems); 48 | 49 | /* 50 | Device function that quantizes 16 bytes of sycl::half type input data. 51 | Template Arguments : 52 | numBits - Number of bits in quantized element. int : 8 or 4 53 | qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric 54 | Function Arguments : 55 | local_output - Local array to store dequantized data sycl::half* or sycl::half2* 56 | data - Pointer to quantized input data. int8_t* 57 | Params - Parameters for quantization. Params 58 | */ 59 | template 60 | DS_D_INLINE void chunk(sycl::half2* local_output, 61 | const int8_t* data, 62 | Params q_params); 63 | 64 | template 65 | DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params q_params); 66 | 67 | /**************** Implementations ******************/ 68 | 69 | template 70 | DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params q_params) 71 | { 72 | constexpr int32_t num_elems_packed = 8 / numBits; 73 | constexpr int32_t iters = h_per_chunk / num_elems_packed; 74 | 75 | #pragma unroll 76 | for (int i = 0; i < iters; i++) { 77 | if constexpr (num_elems_packed == 1) { 78 | local_output[i] = q_params.template dequantize(data[i]); 79 | } else { 80 | auto accessible_data = *(PackedInt4*)(&data[i]); 81 | local_output[2 * i] = q_params.template dequantize(accessible_data.low); 82 | local_output[2 * i + 1] = q_params.template dequantize(accessible_data.high); 83 | } 84 | } 85 | } 86 | 87 | template 88 | DS_D_INLINE void chunk(sycl::half2* local_output, 89 | const int8_t* data, 90 | Params q_params) 91 | { 92 | sycl::half* local_output_cast = reinterpret_cast(local_output); 93 | chunk(local_output_cast, data, q_params); 94 | } 95 | 96 | template 97 | /* 98 | DPCT1110:46: The total declared local variable size in device function _to_global exceeds 128 bytes 99 | and may cause high register pressure. 
Consult with your hardware vendor to find the total register 100 | size available and adjust the code, or use smaller sub-group size to avoid high register pressure. 101 | */ 102 | DS_D_INLINE void _to_global(T* global_output, 103 | const int8_t* data, 104 | const float* global_params, 105 | const int elems_per_group, 106 | const int total_elems) 107 | { 108 | sycl::group<3> tb = sycl::ext::oneapi::experimental::this_group<3>(); 109 | sycl::sub_group warp = sycl::ext::oneapi::experimental::this_sub_group(); 110 | 111 | // Load constants 112 | // TODO(cmikeh2): Refactor into functions? 113 | constexpr int load_granularity = (granularity / (sizeof(T))) / (numBits == 8 ? 1 : 2); 114 | constexpr int load_step_stride = load_granularity * threads; 115 | constexpr int load_block_stride = load_step_stride * unroll; 116 | 117 | // Store constants 118 | constexpr int T_per_chunk = granularity / sizeof(T); 119 | constexpr int store_step_stride = T_per_chunk * threads; 120 | constexpr int store_block_stride = store_step_stride * unroll; 121 | 122 | // Load offsets 123 | const int load_block_offset = tb.get_group_id()[2] * load_block_stride; 124 | // Note: we can use `load_granularity` since the dtype is `int8_t`. 125 | const int load_thread_offset = tb.get_local_id()[2] * load_granularity; 126 | const int8_t* load_base = data + load_block_offset + load_thread_offset; 127 | 128 | // Store offsets 129 | const int store_block_offset = tb.get_group_id()[2] * store_block_stride; 130 | const int store_thread_offset = tb.get_local_id()[2] * T_per_chunk; 131 | const int elem_id_base = store_block_offset + store_thread_offset; 132 | 133 | int8_t local_load_buffer[load_granularity * unroll]; 134 | T local_dequant_buffer[T_per_chunk * unroll]; 135 | 136 | /* 137 | Note: Splitting this loop in half gave about 3-5% performance increase for reasons that aren't 138 | totally clear to me, so this is a deliberately weird code structure. 139 | */ 140 | #pragma unroll 141 | for (int i = 0; i < unroll; i++) { 142 | const int elem_id_iter = elem_id_base + i * store_step_stride; 143 | 144 | if (elem_id_iter < total_elems) { 145 | mem_access::load_global(local_load_buffer + i * load_granularity, 146 | load_base + i * load_step_stride); 147 | } 148 | } 149 | 150 | #pragma unroll 151 | for (int i = 0; i < unroll; i++) { 152 | const int elem_id_iter = elem_id_base + i * store_step_stride; 153 | if (elem_id_iter < total_elems) { 154 | // TODO(cmikeh2): Can we amortize this division? Perform once on the first iteration and 155 | // use indexing math to do division free interpolation of the successive groups? 
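// The integer division below maps each element back to its quantization group so the matching Params (scales/offsets) can be constructed for the dequantize call.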
156 | const int group_index = elem_id_iter / elems_per_group; 157 | Params q_params(global_params, group_index); 158 | 159 | chunk(local_dequant_buffer + i * T_per_chunk, 160 | local_load_buffer + i * load_granularity, 161 | q_params); 162 | mem_access::store_global(global_output + elem_id_iter, 163 | local_dequant_buffer + i * T_per_chunk); 164 | } 165 | } 166 | } 167 | 168 | template 169 | DS_D_INLINE void to_global(T* global_output, 170 | const int8_t* data, 171 | const float* global_params, 172 | const int elems_per_group, 173 | const int total_elems) 174 | { 175 | if constexpr (numBits == 4 || numBits == 8) { 176 | _to_global( 177 | global_output, data, global_params, elems_per_group, total_elems); 178 | } else if constexpr (numBits == 3) { 179 | // TODO(cmikeh2): Need this implementation 180 | assert(false); 181 | } else { 182 | assert(false); 183 | } 184 | } 185 | 186 | } // namespace dequantize 187 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dpct/dpct.hpp: -------------------------------------------------------------------------------- 1 | //==---- dpct.hpp ---------------------------------*- C++ -*----------------==// 2 | // 3 | // Copyright (C) Intel Corporation 4 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 | // See https://llvm.org/LICENSE.txt for license information. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #ifndef __DPCT_HPP__ 10 | #define __DPCT_HPP__ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | template class dpct_kernel_name; 18 | template class dpct_kernel_scalar; 19 | 20 | #include "atomic.hpp" 21 | #include "device.hpp" 22 | #include "image.hpp" 23 | #include "kernel.hpp" 24 | #include "math.hpp" 25 | #include "memory.hpp" 26 | #include "util.hpp" 27 | 28 | #if defined(_MSC_VER) 29 | #define __dpct_align__(n) __declspec(align(n)) 30 | #define __dpct_inline__ __forceinline 31 | #else 32 | #define __dpct_align__(n) __attribute__((aligned(n))) 33 | #define __dpct_inline__ __inline__ __attribute__((always_inline)) 34 | #endif 35 | 36 | #if defined(_MSC_VER) 37 | #define __dpct_noinline__ __declspec(noinline) 38 | #else 39 | #define __dpct_noinline__ __attribute__((noinline)) 40 | #endif 41 | 42 | #define DPCT_COMPATIBILITY_TEMP (600) 43 | 44 | namespace dpct{ 45 | enum error_code { success = 0, default_error = 999 }; 46 | } 47 | 48 | #define DPCT_CHECK_ERROR(expr) \ 49 | [&]() { \ 50 | try { \ 51 | expr; \ 52 | return dpct::success; \ 53 | } catch (std::exception const &e) { \ 54 | std::cerr << e.what() << std::endl; \ 55 | return dpct::default_error; \ 56 | } \ 57 | }() 58 | 59 | #define DPCT_PI_F (3.14159274101257f) 60 | #define DPCT_PI (3.141592653589793115998) 61 | 62 | #endif // __DPCT_HPP__ 63 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dpct/dpl_utils.hpp: -------------------------------------------------------------------------------- 1 | //==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==// 2 | // 3 | // Copyright (C) Intel Corporation 4 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 | // See https://llvm.org/LICENSE.txt for license information. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #ifndef __DPCT_DPL_UTILS_HPP__ 10 | #define __DPCT_DPL_UTILS_HPP__ 11 | 12 | #define ONEDPL_USE_DPCPP_BACKEND 1 13 | #define __USE_DPCT 1 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "dpl_extras/memory.h" 20 | #include "dpl_extras/algorithm.h" 21 | #include "dpl_extras/numeric.h" 22 | #include "dpl_extras/iterators.h" 23 | #include "dpl_extras/vector.h" 24 | #include "dpl_extras/dpcpp_extensions.h" 25 | 26 | #endif // __DPCT_DPL_UTILS_HPP__ 27 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dpct/lib_common_utils.hpp: -------------------------------------------------------------------------------- 1 | //==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==// 2 | // 3 | // Copyright (C) Intel Corporation 4 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 | // See https://llvm.org/LICENSE.txt for license information. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #ifndef __DPCT_LIB_COMMON_UTILS_HPP__ 10 | #define __DPCT_LIB_COMMON_UTILS_HPP__ 11 | 12 | #include 13 | #include 14 | #include "memory.hpp" 15 | #include "util.hpp" 16 | 17 | namespace dpct { 18 | namespace detail { 19 | template inline auto get_memory(const void *x) { 20 | T *new_x = reinterpret_cast(const_cast(x)); 21 | #ifdef DPCT_USM_LEVEL_NONE 22 | return dpct::get_buffer>(new_x); 23 | #else 24 | return new_x; 25 | #endif 26 | } 27 | 28 | template 29 | inline typename DataType::T2 get_value(const T *s, sycl::queue &q) { 30 | using Ty = typename DataType::T2; 31 | Ty s_h; 32 | if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) 33 | detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host) 34 | .wait(); 35 | else 36 | s_h = *reinterpret_cast(s); 37 | return s_h; 38 | } 39 | } // namespace detail 40 | 41 | enum class version_field : int { major, minor, update, patch }; 42 | 43 | /// Returns the requested field of Intel(R) oneAPI Math Kernel Library version. 44 | /// \param field The version information field (major, minor, update or patch). 45 | /// \param result The result value. 
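/// \par Example (illustrative; requires building against oneMKL so that __INTEL_MKL__ is defined)
/// \code
/// int mkl_major = 0;
/// dpct::mkl_get_version(dpct::version_field::major, &mkl_major);
/// \endcode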
46 | inline void mkl_get_version(version_field field, int *result) { 47 | #ifndef __INTEL_MKL__ 48 | throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " 49 | "Project does not support this API."); 50 | #else 51 | MKLVersion version; 52 | mkl_get_version(&version); 53 | if (version_field::major == field) { 54 | *result = version.MajorVersion; 55 | } else if (version_field::minor == field) { 56 | *result = version.MinorVersion; 57 | } else if (version_field::update == field) { 58 | *result = version.UpdateVersion; 59 | } else if (version_field::patch == field) { 60 | *result = 0; 61 | } else { 62 | throw std::runtime_error("unknown field"); 63 | } 64 | #endif 65 | } 66 | 67 | enum class library_data_t : unsigned char { 68 | real_float = 0, 69 | complex_float, 70 | real_double, 71 | complex_double, 72 | real_half, 73 | complex_half, 74 | real_bfloat16, 75 | complex_bfloat16, 76 | real_int4, 77 | complex_int4, 78 | real_uint4, 79 | complex_uint4, 80 | real_int8, 81 | complex_int8, 82 | real_uint8, 83 | complex_uint8, 84 | real_int16, 85 | complex_int16, 86 | real_uint16, 87 | complex_uint16, 88 | real_int32, 89 | complex_int32, 90 | real_uint32, 91 | complex_uint32, 92 | real_int64, 93 | complex_int64, 94 | real_uint64, 95 | complex_uint64, 96 | real_int8_4, 97 | real_int8_32, 98 | real_uint8_4, 99 | library_data_t_size 100 | }; 101 | 102 | namespace detail { 103 | template 104 | inline constexpr std::uint64_t get_type_combination_id(ArgT Val) { 105 | static_assert((unsigned char)library_data_t::library_data_t_size <= 106 | std::numeric_limits::max() && 107 | "library_data_t size exceeds limit."); 108 | static_assert(std::is_same_v, "Unsupported ArgT"); 109 | return (std::uint64_t)Val; 110 | } 111 | 112 | template 113 | inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal, 114 | RestT... RestVal) { 115 | static_assert((std::uint8_t)library_data_t::library_data_t_size <= 116 | std::numeric_limits::max() && 117 | "library_data_t size exceeds limit."); 118 | static_assert(sizeof...(RestT) <= 8 && "Too many parameters"); 119 | static_assert(std::is_same_v, "Unsupported FirstT"); 120 | return get_type_combination_id(RestVal...) 
<< 8 | ((std::uint64_t)FirstVal); 121 | } 122 | 123 | inline constexpr std::size_t library_data_size[] = { 124 | 8 * sizeof(float), // real_float 125 | 8 * sizeof(std::complex), // complex_float 126 | 8 * sizeof(double), // real_double 127 | 8 * sizeof(std::complex), // complex_double 128 | 8 * sizeof(sycl::half), // real_half 129 | 8 * sizeof(std::complex), // complex_half 130 | 16, // real_bfloat16 131 | 16 * 2, // complex_bfloat16 132 | 4, // real_int4 133 | 4 * 2, // complex_int4 134 | 4, // real_uint4 135 | 4 * 2, // complex_uint4 136 | 8, // real_int8 137 | 8 * 2, // complex_int8 138 | 8, // real_uint8 139 | 8 * 2, // complex_uint8 140 | 16, // real_int16 141 | 16 * 2, // complex_int16 142 | 16, // real_uint16 143 | 16 * 2, // complex_uint16 144 | 32, // real_int32 145 | 32 * 2, // complex_int32 146 | 32, // real_uint32 147 | 32 * 2, // complex_uint32 148 | 64, // real_int64 149 | 64 * 2, // complex_int64 150 | 64, // real_uint64 151 | 64 * 2, // complex_uint64 152 | 8, // real_int8_4 153 | 8, // real_int8_32 154 | 8 // real_uint8_4 155 | }; 156 | } // namespace detail 157 | } // namespace dpct 158 | 159 | #endif // __DPCT_LIB_COMMON_UTILS_HPP__ 160 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/ds_kernel_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Centralized header file for preprocessor macros and constants 8 | used throughout the codebase. 9 | */ 10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | #ifdef BF16_AVAILABLE 17 | #endif 18 | 19 | #define DS_HD_INLINE __dpct_inline__ 20 | #define DS_D_INLINE __dpct_inline__ 21 | 22 | #ifdef __HIP_PLATFORM_AMD__ 23 | 24 | // constexpr variant of warpSize for templating 25 | constexpr int hw_warp_size = 64; 26 | #define HALF_PRECISION_AVAILABLE = 1 27 | #include 28 | #include 29 | 30 | #else // !__HIP_PLATFORM_AMD__ 31 | 32 | // constexpr variant of warpSize for templating 33 | constexpr int hw_warp_size = 32; 34 | 35 | #if DPCT_COMPATIBILITY_TEMP >= 530 36 | #define HALF_PRECISION_AVAILABLE = 1 37 | // #define PTX_AVAILABLE 38 | #endif // __CUDA_ARCH__ >= 530 39 | 40 | #if DPCT_COMPATIBILITY_TEMP >= 800 41 | #define ASYNC_COPY_AVAILABLE 42 | #endif // __CUDA_ARCH__ >= 800 43 | 44 | #endif //__HIP_PLATFORM_AMD__ 45 | 46 | inline int next_pow2(const int val) 47 | { 48 | int rounded_val = val - 1; 49 | rounded_val |= rounded_val >> 1; 50 | rounded_val |= rounded_val >> 2; 51 | rounded_val |= rounded_val >> 4; 52 | rounded_val |= rounded_val >> 8; 53 | return rounded_val + 1; 54 | } 55 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/quantization.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include "ds_kernel_utils.h" 11 | 12 | namespace quantize { 13 | 14 | enum class Type { Symmetric, Asymmetric }; 15 | 16 | struct PackedInt4 { 17 | int8_t high : 4; 18 | int8_t low : 4; 19 | }; 20 | 21 | DS_HD_INLINE bool requires_offset(Type qType) { return qType == Type::Asymmetric; } 22 | 23 | } // namespace quantize 24 | 25 | void launch_quant(int8_t* output_data, 26 | float* params, 27 | const sycl::half* input_data, 28 | const int groups, 29 | const int elems_per_group, 30 | const int num_bits, 31 | const quantize::Type quant_type, 32 | dpct::queue_ptr stream); 33 | 34 | template 35 | void launch_dequantize_kernel(T* dequant_data, 36 | const int8_t* q_data, 37 | const float* q_params, 38 | quantize::Type q_type, 39 | int num_bits, 40 | int elems_per_group, 41 | int total_elems, 42 | dpct::queue_ptr stream); 43 | 44 | void launch_swizzled_quant(int8_t* q_data, 45 | float* q_scales, 46 | const sycl::half* input_data, 47 | int num_bits, 48 | quantize::Type q_type, 49 | int groups, 50 | int elems_per_group, 51 | int pipelining, 52 | int nodes, 53 | int devices_per_node, 54 | dpct::queue_ptr stream); 55 | 56 | void launch_dequant_reduce(int8_t* reduced_data, 57 | float* reduced_scales, 58 | const int8_t* input_data, 59 | const float* input_scales, 60 | int num_gpus, 61 | int num_bits, 62 | quantize::Type quant_type, 63 | int out_groups, 64 | int elems_per_out_group, 65 | int elems_per_in_tensor, 66 | int groups_per_in_tensor, 67 | int elems_per_in_group, 68 | dpct::queue_ptr stream); 69 | 70 | template 71 | void launch_fake_quantize_kernel(T* vals, 72 | int total_count, 73 | int group_num, 74 | int num_bits, 75 | dpct::queue_ptr stream); 76 | template 77 | void launch_sr_fake_quantize_kernel(T* vals, 78 | int total_count, 79 | int group_num, 80 | int num_bits, 81 | dpct::queue_ptr stream); 82 | template 83 | void launch_fake_quantize_kernel_asym(T* vals, 84 | int total_count, 85 | int group_num, 86 | int num_bits, 87 | dpct::queue_ptr stream); 88 | template 89 | void launch_sr_fake_quantize_kernel_asym(T* vals, 90 | int total_count, 91 | int group_num, 92 | int num_bits, 93 | dpct::queue_ptr stream); 94 | 95 | void launch_dequantize_int4_to_half_experimental(uint8_t* data_in, 96 | sycl::half* data_out, 97 | sycl::half* scale_buffer, 98 | sycl::half* min_val_buffer, 99 | int num_group, 100 | int group_size, 101 | dpct::queue_ptr stream); 102 | 103 | void launch_dequantize_int8_to_half_experimental(uint8_t* data_in, 104 | sycl::half* data_out, 105 | sycl::half* scale_buffer, 106 | sycl::half* min_val_buffer, 107 | int num_group, 108 | int group_size, 109 | dpct::queue_ptr stream); 110 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/simd.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #if (__x86_64__ || __i386__) 9 | #include 10 | #include 11 | #endif 12 | 13 | #define TILE (128 * 1024 * 1024) 14 | #if defined(__AVX512__) or defined(__AVX256__) 15 | 16 | #define ROUND_DOWN(size, step) ((size) & ~((step)-1)) 17 | 18 | #if defined(__AVX512__) 19 | #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) 20 | #define SIMD_LOAD(x) _mm512_loadu_ps(x) 21 | #define SIMD_SET(x) _mm512_set1_ps(x) 22 | #define SIMD_ADD(x, y) _mm512_add_ps(x, y) 23 | #define SIMD_MUL(x, y) _mm512_mul_ps(x, y) 24 | #define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) 25 | #define SIMD_SQRT(x) _mm512_sqrt_ps(x) 26 | #define SIMD_DIV(x, y) _mm512_div_ps(x, y) 27 | #define SIMD_AND(x, y) _mm512_and_ps(x, y) 28 | #define SIMD_ANDNOT(x, y) _mm512_andnot_ps(x, y) 29 | #define SIMD_OR(x, y) _mm512_or_ps(x, y) 30 | #define SIMD_XOR(x, y) _mm512_xor_ps(x, y) 31 | #define SIMD_WIDTH 16 32 | 33 | #define SIMD_LOAD2(x, h) \ 34 | ((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) 35 | #define SIMD_STORE2(x, d, h) \ 36 | ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ 37 | : _mm512_storeu_ps(x, d)) 38 | 39 | #define INTV __m256i 40 | #elif defined(__AVX256__) 41 | #define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) 42 | #define SIMD_LOAD(x) _mm256_loadu_ps(x) 43 | #define SIMD_SET(x) _mm256_set1_ps(x) 44 | #define SIMD_ADD(x, y) _mm256_add_ps(x, y) 45 | #define SIMD_MUL(x, y) _mm256_mul_ps(x, y) 46 | #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) 47 | #define SIMD_SQRT(x) _mm256_sqrt_ps(x) 48 | #define SIMD_DIV(x, y) _mm256_div_ps(x, y) 49 | #define SIMD_AND(x, y) _mm256_and_ps(x, y) 50 | #define SIMD_ANDNOT(x, y) _mm256_andnot_ps(x, y) 51 | #define SIMD_OR(x, y) _mm256_or_ps(x, y) 52 | #define SIMD_XOR(x, y) _mm256_xor_ps(x, y) 53 | #define SIMD_WIDTH 8 54 | 55 | #define SIMD_LOAD2(x, h) \ 56 | ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) 57 | #define SIMD_STORE2(x, d, h) \ 58 | ((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ 59 | : _mm256_storeu_ps(x, d)) 60 | 61 | #define INTV __m128i 62 | #endif 63 | 64 | union AVX_Data { 65 | #if defined(__AVX512__) 66 | __m512 data; 67 | #elif defined(__AVX256__) 68 | __m256 data; 69 | #endif 70 | // float data_f[16]; 71 | }; 72 | 73 | template 74 | inline void simd_store(float* dst, AVX_Data* src, bool half_precision) 75 | { 76 | size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); 77 | #pragma unroll 78 | for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); } 79 | } 80 | template 81 | inline void simd_load(AVX_Data* dst, float* src, bool half_precision) 82 | { 83 | size_t width = (half_precision ? 
1 : SIMD_WIDTH); 84 | #pragma unroll 85 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } 86 | } 87 | template 88 | inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) 89 | { 90 | #pragma unroll 91 | for (size_t i = 0; i < span; ++i) { 92 | dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a[i].data); 93 | } 94 | } 95 | template 96 | inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a) 97 | { 98 | #pragma unroll 99 | for (size_t i = 0; i < span; ++i) { 100 | dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a.data); 101 | } 102 | } 103 | template 104 | inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a) 105 | { 106 | #pragma unroll 107 | for (size_t i = 0; i < span; ++i) { 108 | dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r[i].data, src_a[i].data); 109 | } 110 | } 111 | template 112 | inline void simd_sqrt(AVX_Data* dst, AVX_Data* src) 113 | { 114 | #pragma unroll 115 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_SQRT(src[i].data); } 116 | } 117 | template 118 | inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 119 | { 120 | #pragma unroll 121 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r.data); } 122 | } 123 | template 124 | inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 125 | { 126 | #pragma unroll 127 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r[i].data); } 128 | } 129 | template 130 | inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 131 | { 132 | #pragma unroll 133 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r.data); } 134 | } 135 | template 136 | inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 137 | { 138 | #pragma unroll 139 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r[i].data); } 140 | } 141 | template 142 | inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 143 | { 144 | #pragma unroll 145 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); } 146 | } 147 | template 148 | inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 149 | { 150 | #pragma unroll 151 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r.data); } 152 | } 153 | template 154 | inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 155 | { 156 | #pragma unroll 157 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r[i].data); } 158 | } 159 | template 160 | inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 161 | { 162 | #pragma unroll 163 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r.data); } 164 | } 165 | template 166 | inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 167 | { 168 | #pragma unroll 169 | for (size_t i = 0; i < span; ++i) { 170 | dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r[i].data); 171 | } 172 | } 173 | template 174 | inline void simd_or(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 175 | { 176 | #pragma unroll 177 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r.data); } 178 | } 179 | template 180 | inline void simd_or(AVX_Data* dst, AVX_Data* 
src_a_l, AVX_Data* src_a_r) 181 | { 182 | #pragma unroll 183 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r[i].data); } 184 | } 185 | template 186 | inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 187 | { 188 | #pragma unroll 189 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r.data); } 190 | } 191 | template 192 | inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 193 | { 194 | #pragma unroll 195 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r[i].data); } 196 | } 197 | 198 | #endif 199 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/type_shim.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ 7 | #include 8 | #include 9 | #include 10 | 11 | // Forward/backward compatibility hack around 12 | // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288 13 | // pending more future-proof guidance from upstream. 14 | // struct TypeShim 15 | // { 16 | // const at::Type& payload; 17 | // TypeShim(const at::Type& type) : payload(type) {} 18 | // // Enable trivial conversion to a const at::Type& for pre-3aeb78 19 | // operator const at::Type&(){ return payload; }; 20 | // // Enable dispatch switch statements to take *this directly for post-3aeb78 21 | // //operator at::ScalarType(){ return payload.; }; 22 | // }; 23 | 24 | #define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ 25 | switch (TYPE) { \ 26 | case at::ScalarType::Float: { \ 27 | using scalar_t_##LEVEL = float; \ 28 | __VA_ARGS__; \ 29 | break; \ 30 | } \ 31 | case at::ScalarType::Half: { \ 32 | using scalar_t_##LEVEL = at::Half; \ 33 | __VA_ARGS__; \ 34 | break; \ 35 | } \ 36 | case at::ScalarType::BFloat16: { \ 37 | using scalar_t_##LEVEL = at::BFloat16; \ 38 | __VA_ARGS__; \ 39 | break; \ 40 | } \ 41 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 42 | } 43 | 44 | #define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ 45 | switch (TYPE) { \ 46 | case at::ScalarType::Double: { \ 47 | using scalar_t_##LEVEL = double; \ 48 | __VA_ARGS__; \ 49 | break; \ 50 | } \ 51 | case at::ScalarType::Float: { \ 52 | using scalar_t_##LEVEL = float; \ 53 | __VA_ARGS__; \ 54 | break; \ 55 | } \ 56 | case at::ScalarType::Half: { \ 57 | using scalar_t_##LEVEL = at::Half; \ 58 | __VA_ARGS__; \ 59 | break; \ 60 | } \ 61 | case at::ScalarType::BFloat16: { \ 62 | using scalar_t_##LEVEL = at::BFloat16; \ 63 | __VA_ARGS__; \ 64 | break; \ 65 | } \ 66 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 67 | } 68 | 69 | #define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \ 70 | switch (TYPE) { \ 71 | case at::ScalarType::Double: { \ 72 | using scalar_t_##LEVEL = double; \ 73 | __VA_ARGS__; \ 74 | break; \ 75 | } \ 76 | case at::ScalarType::Float: { \ 77 | using scalar_t_##LEVEL = float; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 82 | } 83 | 84 | template 85 | __dpct_inline__ T 86 | reduce_block_into_lanes(T* x, 87 | T val, 88 | int lanes = 1, 89 | bool share_result = false) // lanes is intended to be <= 32. 
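// Block-wide tree reduction: `val` is reduced through the shared buffer `x` and finished with a
// sub-group shuffle; the result lands in the first `lanes` lanes and, when `share_result` is
// true, it is written back to `x` so every thread in the block can read it.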
90 | { 91 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 92 | int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); 93 | int blockSize = item_ct1.get_local_range(2) * 94 | item_ct1.get_local_range(1); // blockSize is intended to be a multiple of 32. 95 | 96 | if (blockSize >= 64) { 97 | x[tid] = val; 98 | /* 99 | DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control 100 | flow. You may need to adjust the code. 101 | */ 102 | /* 103 | DPCT1065:6: Consider replacing sycl::nd_item::barrier() with 104 | sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if 105 | there is no access to global memory. 106 | */ 107 | item_ct1.barrier(); 108 | } 109 | 110 | #pragma unroll 111 | for (int i = (blockSize >> 1); i >= 64; i >>= 1) { 112 | if (tid < i) x[tid] = x[tid] + x[tid + i]; 113 | /* 114 | DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control 115 | flow. You may need to adjust the code. 116 | */ 117 | /* 118 | DPCT1065:7: Consider replacing sycl::nd_item::barrier() with 119 | sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if 120 | there is no access to global memory. 121 | */ 122 | item_ct1.barrier(); 123 | } 124 | 125 | T final; 126 | 127 | if (tid < 32) { 128 | if (blockSize >= 64) 129 | final = x[tid] + x[tid + 32]; 130 | else 131 | final = val; 132 | // __SYNCWARP(); 133 | 134 | #pragma unroll 135 | for (int i = 16; i >= lanes; i >>= 1) 136 | final = final + __shfl_down_sync(0xffffffff, final, i); 137 | } 138 | 139 | if (share_result) { 140 | if (tid < lanes) x[tid] = final; // EpilogueOp 141 | // Make sure the smem result is visible to all warps. 142 | /* 143 | DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control 144 | flow. You may need to adjust the code. 145 | */ 146 | /* 147 | DPCT1065:8: Consider replacing sycl::nd_item::barrier() with 148 | sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if 149 | there is no access to global memory. 150 | */ 151 | item_ct1.barrier(); 152 | } 153 | 154 | return final; 155 | } 156 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/quantization/dequantize.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "dequantization_utils.h" 9 | #include "memory_access_utils.h" 10 | 11 | template 12 | void dequantize_kernel(T* __restrict__ dequant_data, 13 | const int8_t* __restrict__ q_data, 14 | const float* __restrict__ q_params, 15 | int elems_per_group, 16 | int total_elems) 17 | { 18 | dequantize::to_global( 19 | dequant_data, q_data, q_params, elems_per_group, total_elems); 20 | } 21 | 22 | /* 23 | DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device 24 | limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
25 | */ 26 | #define LAUNCH_DEQUANT_KERNEL(num_bits, q_type) \ 27 | { \ 28 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp64, sycl::aspect::fp16}); \ 29 | stream->submit([&](sycl::handler& cgh) { \ 30 | T* dequant_data_ct0 = dequant_data; \ 31 | const int8_t* q_data_ct1 = q_data; \ 32 | const float* q_params_ct2 = q_params; \ 33 | auto elems_per_group_ct3 = elems_per_group; \ 34 | auto total_elems_ct4 = total_elems; \ 35 | \ 36 | cgh.parallel_for( \ 37 | sycl::nd_range<3>(grid * block, block), \ 38 | [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ 39 | dequantize_kernel( \ 40 | dequant_data_ct0, q_data_ct1, q_params_ct2, elems_per_group_ct3, total_elems_ct4); \ 41 | }); \ 42 | }); \ 43 | } 44 | 45 | template 46 | void launch_dequantize_kernel(T* dequant_data, 47 | const int8_t* q_data, 48 | const float* q_params, 49 | quantize::Type q_type, 50 | int num_bits, 51 | int elems_per_group, 52 | int total_elems, 53 | dpct::queue_ptr stream) 54 | { 55 | constexpr int unroll = 8; 56 | constexpr int threads = 512; 57 | constexpr int elems_per_block = unroll * threads * dequantize::granularity / (sizeof(T)); 58 | 59 | const sycl::range<3> block(1, 1, threads); 60 | const sycl::range<3> grid(1, 1, (total_elems + elems_per_block - 1) / elems_per_block); 61 | 62 | // TODO(cmikeh2): It may make sense to tune unroll, there is perf benefit for large 63 | // problem sizes with this large unroll value. 64 | if (num_bits == 8 && q_type == quantize::Type::Symmetric) { 65 | LAUNCH_DEQUANT_KERNEL(8, quantize::Type::Symmetric); 66 | } else if (num_bits == 8 && q_type == quantize::Type::Asymmetric) { 67 | LAUNCH_DEQUANT_KERNEL(8, quantize::Type::Asymmetric); 68 | } else if (num_bits == 4 && q_type == quantize::Type::Symmetric) { 69 | LAUNCH_DEQUANT_KERNEL(4, quantize::Type::Symmetric); 70 | } else if (num_bits == 4 && q_type == quantize::Type::Asymmetric) { 71 | LAUNCH_DEQUANT_KERNEL(4, quantize::Type::Asymmetric); 72 | } 73 | } 74 | 75 | template void launch_dequantize_kernel(sycl::half* dequant_data, 76 | const int8_t* q_data, 77 | const float* q_params, 78 | quantize::Type q_type, 79 | int num_bits, 80 | int elems_per_group, 81 | int total_elems, 82 | dpct::queue_ptr stream); 83 | 84 | template void launch_dequantize_kernel(float* dequant_data, 85 | const int8_t* q_data, 86 | const float* q_params, 87 | quantize::Type q_type, 88 | int num_bits, 89 | int elems_per_group, 90 | int total_elems, 91 | dpct::queue_ptr stream); 92 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/quantization/quantize.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "ds_kernel_utils.h" 9 | #include "memory_access_utils.h" 10 | #include "quantization.h" 11 | #include "quantization_utils.h" 12 | #include "reduction_utils.h" 13 | 14 | /* 15 | Pure quantization kernel with no fusion. 16 | */ 17 | template 23 | /* 24 | DPCT1110:46: The total declared local variable size in device function cached_quantization exceeds 25 | 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total 26 | register size available and adjust the code, or use smaller sub-group size to avoid high register 27 | pressure. 
28 | */ 29 | void cached_quantization(int8_t* __restrict__ output_data, 30 | float* __restrict__ params, 31 | const sycl::half* __restrict__ input_data, 32 | int groups, 33 | int elems_per_group) 34 | { 35 | sycl::group<3> tb = sycl::ext::oneapi::experimental::this_group<3>(); 36 | sycl::sub_group warp = sycl::ext::oneapi::experimental::this_sub_group(); 37 | 38 | // Indexing offsets 39 | const int block_offset = 40 | (tb.get_group_id()[2] * (max_threads / threads_per_group) * elems_per_group) + 41 | (tb.get_local_id()[1] * elems_per_group); 42 | const int elem_offset = tb.get_local_id()[2] * quantize::h_per_load; 43 | const int base_offset = block_offset + elem_offset; 44 | const int stride = sycl::ext::oneapi::experimental::this_group<3>().get_local_linear_range() * 45 | quantize::h_per_load; 46 | 47 | const sycl::half* input_base = input_data + base_offset; //.. 48 | 49 | sycl::half2 local_buffer[UNROLL * internal_unroll * quantize::h2_per_load]; 50 | 51 | #pragma unroll 52 | for (int i = 0; i < UNROLL; i++) { 53 | // Convenience helper, should resolve to register indices and not realize. 54 | sycl::half2* iteration_buffer = local_buffer + i * internal_unroll * quantize::h2_per_load; 55 | #pragma unroll 56 | for (int j = 0; j < internal_unroll; j++) { 57 | const int iteration = i * internal_unroll + j; 58 | mem_access::load_global( 59 | iteration_buffer + j * quantize::h2_per_load, 60 | input_base + iteration * stride, 61 | elem_offset + iteration * stride < elems_per_group); 62 | } 63 | } 64 | 65 | quantize:: 66 | local_array( 67 | local_buffer, params, output_data, elems_per_group, groups); 68 | } 69 | 70 | /********* Launcher methods ***********/ 71 | /* 72 | DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device 73 | limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
74 | */ 75 | #define LAUNCH_CACHED_QUANT_CALL(q_bits, quant_type) \ 76 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp64, sycl::aspect::fp16}); \ 77 | stream->submit([&](sycl::handler& cgh) { \ 78 | int8_t* output_data_ct0 = output_data; \ 79 | float* params_ct1 = params; \ 80 | const sycl::half* input_data_ct2 = input_data; \ 81 | int groups_ct3 = groups; \ 82 | int elems_per_group_ct4 = elems_per_group; \ 83 | \ 84 | cgh.parallel_for( \ 85 | sycl::nd_range<3>(grid * block, block), \ 86 | [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ 87 | cached_quantization( \ 93 | output_data_ct0, params_ct1, input_data_ct2, groups_ct3, elems_per_group_ct4); \ 94 | }); \ 95 | }); 96 | 97 | #define LAUNCH_CACHED_QUANT( \ 98 | q_bits, quant_type, unroll_factor_in, internal_unroll_in, threads_per_group_in) \ 99 | const int unroll_factor = unroll_factor_in; \ 100 | const int internal_unroll_l = internal_unroll_in; \ 101 | const int threads_per_group = threads_per_group_in; \ 102 | if (q_bits == 4) { \ 103 | if (quant_type == quantize::Type::Asymmetric) { \ 104 | LAUNCH_CACHED_QUANT_CALL(4, quantize::Type::Asymmetric) \ 105 | } else { \ 106 | LAUNCH_CACHED_QUANT_CALL(4, quantize::Type::Symmetric) \ 107 | } \ 108 | } else { \ 109 | if (quant_type == quantize::Type::Asymmetric) { \ 110 | LAUNCH_CACHED_QUANT_CALL(8, quantize::Type::Asymmetric) \ 111 | } else { \ 112 | LAUNCH_CACHED_QUANT_CALL(8, quantize::Type::Symmetric) \ 113 | } \ 114 | } 115 | 116 | void launch_quant(int8_t* output_data, 117 | float* params, 118 | const sycl::half* input_data, 119 | const int groups, 120 | const int elems_per_group, 121 | const int num_bits, 122 | const quantize::Type quant_type, 123 | dpct::queue_ptr stream) 124 | { 125 | constexpr int max_threads = 256; 126 | 127 | constexpr int internal_unroll = 2; 128 | 129 | const bool is_subblock_schedule = (elems_per_group <= 128) ? true : false; 130 | const int h_per_step = is_subblock_schedule ? quantize::h_per_load 131 | : quantize::h_per_load * internal_unroll; 132 | 133 | // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of 134 | // warp-sized blocks rather than stepping up to 64/96 threads 135 | const int one_step_threads = next_pow2((elems_per_group + h_per_step - 1) / h_per_step); 136 | const int threads_per_group = (one_step_threads < max_threads) ? one_step_threads : max_threads; 137 | 138 | const int groups_per_block = 139 | is_subblock_schedule ? 
(max_threads + threads_per_group - 1) / threads_per_group : 1; 140 | const int groups_launch = (groups_per_block + groups - 1) / groups_per_block; 141 | 142 | sycl::range<3> block(1, groups_per_block, threads_per_group); 143 | sycl::range<3> grid(1, 1, groups_launch); 144 | 145 | const int elems_per_step = threads_per_group * h_per_step; 146 | const int external_unroll = (elems_per_group + elems_per_step - 1) / elems_per_step; 147 | 148 | if (is_subblock_schedule) { 149 | // <=128 150 | if (threads_per_group == 1) { 151 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 1); 152 | } else if (threads_per_group == 2) { 153 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 2); 154 | } else if (threads_per_group == 4) { 155 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 4); 156 | } else if (threads_per_group == 8) { 157 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 8); 158 | } else if (threads_per_group == 16) { 159 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 16); 160 | } 161 | } else if (external_unroll == 1) { 162 | // 129 - 4096 elems 163 | // (this can launch with 1-7 warps as well) 164 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, internal_unroll, max_threads); 165 | } else if (external_unroll == 2) { 166 | // 4097 - 8192 elems 167 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 2, internal_unroll, max_threads); 168 | } else if (external_unroll == 3) { 169 | // 8193 - 12288 elems 170 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 3, internal_unroll, max_threads); 171 | } else if (external_unroll == 4) { 172 | // 12289 - 16384 elems 173 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 4, internal_unroll, max_threads); 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/quantization/quantize_intX.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include 9 | #include "memory_access_utils.h" 10 | #include 11 | 12 | template 13 | struct alignas(sizeof(T) * N) AlignedArray { 14 | using Element = T; 15 | static const int kElements = N; 16 | 17 | AlignedArray() {} 18 | 19 | AlignedArray(const T& rhs) 20 | { 21 | #pragma unroll 22 | for (int idx = 0; idx < kElements; ++idx) { this->at(idx) = rhs; } 23 | } 24 | 25 | T& operator[](int offset) 26 | { 27 | return reinterpret_cast(this->buffer[offset]); 28 | } 29 | 30 | const T& operator[](int offset) const 31 | { 32 | return reinterpret_cast(this->buffer[offset]); 33 | } 34 | 35 | T& at(int offset) { return reinterpret_cast(this->buffer[offset]); } 36 | 37 | const T& at(int offset) const 38 | { 39 | return reinterpret_cast(this->buffer[offset]); 40 | } 41 | 42 | AlignedArray operator+(const AlignedArray& rhs) const 43 | { 44 | AlignedArray ret; 45 | 46 | #pragma unroll 47 | for (int idx = 0; idx < kElements; ++idx) { ret[idx] = this->at(idx) + rhs.at(idx); } 48 | 49 | return ret; 50 | } 51 | 52 | __dpct_inline__ void clear() 53 | { 54 | #pragma unroll 55 | for (int idx = 0; idx < kElements; ++idx) { this->at(idx) = Element(0); } 56 | } 57 | 58 | Element buffer[N]; 59 | }; 60 | 61 | template 62 | struct reduce_max { 63 | __dpct_inline__ T operator()(const T& lhs, const T& rhs) 64 | { 65 | return lhs > rhs ? lhs : rhs; 66 | } 67 | }; 68 | 69 | template 70 | struct reduce_min { 71 | __dpct_inline__ T operator()(const T& lhs, const T& rhs) 72 | { 73 | return lhs < rhs ? 
lhs : rhs; 74 | } 75 | }; 76 | 77 | template 78 | struct subtract { 79 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 80 | { 81 | AlignedArray ret; 82 | 83 | #pragma unroll 84 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] - rhs; } 85 | 86 | return ret; 87 | } 88 | }; 89 | 90 | template 91 | struct plus { 92 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 93 | { 94 | AlignedArray ret; 95 | 96 | #pragma unroll 97 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] + rhs; } 98 | 99 | return ret; 100 | } 101 | }; 102 | 103 | template 104 | struct multiply { 105 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 106 | { 107 | AlignedArray ret; 108 | 109 | #pragma unroll 110 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] * rhs; } 111 | 112 | return ret; 113 | } 114 | }; 115 | 116 | template 117 | struct clamp { 118 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, 119 | const T& min_val, 120 | const T& max_val) 121 | { 122 | AlignedArray ret; 123 | 124 | #pragma unroll 125 | for (int idx = 0; idx < N; ++idx) { 126 | ret[idx] = reduce_max()(reduce_min()(lhs[idx], max_val), min_val); 127 | } 128 | 129 | return ret; 130 | } 131 | }; 132 | 133 | template 134 | struct round_int; 135 | 136 | template 137 | struct round_int { 138 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs) 139 | { 140 | AlignedArray ret; 141 | 142 | #pragma unroll 143 | for (int idx = 0; idx < N; ++idx) { ret[idx] = hrint(lhs[idx]); } 144 | 145 | return ret; 146 | } 147 | }; 148 | 149 | template 150 | struct divide { 151 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 152 | { 153 | AlignedArray ret; 154 | 155 | #pragma unroll 156 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] / rhs; } 157 | 158 | return ret; 159 | } 160 | }; 161 | 162 | template 163 | __dpct_inline__ T to_scalar(const AlignedArray& data) 164 | { 165 | Reducer re; 166 | T res = data[0]; 167 | 168 | #pragma unroll 169 | for (int idx = 1; idx < N; ++idx) { res = re(res, data[idx]); } 170 | 171 | return res; 172 | } 173 | 174 | template 175 | __dpct_inline__ AlignedArray int4_to_half(const AlignedArray& data) 176 | { 177 | AlignedArray ret; 178 | 179 | #pragma unroll 180 | for (int idx = 0; idx < N * 2; idx += 2) { 181 | ret[idx] = sycl::half(int(data[idx / 2] >> 4)); 182 | ret[idx + 1] = sycl::half(int(data[idx / 2] & 0xf)); 183 | } 184 | 185 | return ret; 186 | } 187 | 188 | void dequantize_int4_to_half(uint8_t* data_in, 189 | sycl::half* data_out, 190 | sycl::half* scale_buffer, 191 | sycl::half* min_val_buffer, 192 | int num_group, 193 | int group_size) 194 | { 195 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 196 | using AccessType = AlignedArray; 197 | using AccessTypeOut = AlignedArray; 198 | 199 | for (int idx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); 200 | idx < num_group * group_size / 8; 201 | idx += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { 202 | int id_group = idx / (group_size / 8); 203 | AccessType value = reinterpret_cast(data_in)[idx]; 204 | sycl::half scale = scale_buffer[id_group]; 205 | sycl::half min_value = min_val_buffer[id_group]; 206 | 207 | AccessTypeOut output = int4_to_half(value); 208 | output = divide()(output, scale); 209 | output = plus()(output, min_value); 210 | 211 | reinterpret_cast(data_out)[idx] = output; 212 | } 213 | } 214 | 215 | void 
launch_dequantize_int4_to_half_experimental(uint8_t* data_in, 216 | sycl::half* data_out, 217 | sycl::half* scale_buffer, 218 | sycl::half* min_val_buffer, 219 | int num_group, 220 | int group_size, 221 | dpct::queue_ptr stream) 222 | { 223 | int num_warp = num_group / 4; 224 | int num_block = num_warp / 8; // 256 trd / block 225 | 226 | { 227 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 228 | stream->parallel_for( 229 | sycl::nd_range<3>(sycl::range<3>(1, 1, num_block) * sycl::range<3>(1, 1, 256), 230 | sycl::range<3>(1, 1, 256)), 231 | [=](sycl::nd_item<3> item_ct1) { 232 | dequantize_int4_to_half( 233 | data_in, data_out, scale_buffer, min_val_buffer, num_group, group_size); 234 | }); 235 | } 236 | } 237 | 238 | template 239 | __dpct_inline__ AlignedArray int8_to_half(const AlignedArray& data) 240 | { 241 | AlignedArray ret; 242 | 243 | #pragma unroll 244 | for (int idx = 0; idx < N; idx += 1) { ret[idx] = sycl::half(int(data[idx])); } 245 | 246 | return ret; 247 | } 248 | 249 | void dequantize_int8_to_half(uint8_t* data_in, 250 | sycl::half* data_out, 251 | sycl::half* scale_buffer, 252 | sycl::half* min_val_buffer, 253 | int num_group, 254 | int group_size) 255 | { 256 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 257 | using AccessType = AlignedArray; 258 | using AccessTypeOut = AlignedArray; 259 | 260 | for (int idx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); 261 | idx < num_group * group_size / 8; 262 | idx += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { 263 | int id_group = idx / (group_size / 8); 264 | AccessType value = reinterpret_cast(data_in)[idx]; 265 | sycl::half scale = scale_buffer[id_group]; 266 | sycl::half min_value = min_val_buffer[id_group]; 267 | 268 | AccessTypeOut output = int8_to_half(value); 269 | output = divide()(output, scale); 270 | output = plus()(output, min_value); 271 | 272 | reinterpret_cast(data_out)[idx] = output; 273 | } 274 | } 275 | 276 | void launch_dequantize_int8_to_half_experimental(uint8_t* data_in, 277 | sycl::half* data_out, 278 | sycl::half* scale_buffer, 279 | sycl::half* min_val_buffer, 280 | int num_group, 281 | int group_size, 282 | dpct::queue_ptr stream) 283 | { 284 | int num_warp = num_group / 4; 285 | int num_block = num_warp / 8; // 256 trd / block 286 | 287 | { 288 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 289 | stream->parallel_for( 290 | sycl::nd_range<3>(sycl::range<3>(1, 1, num_block) * sycl::range<3>(1, 1, 256), 291 | sycl::range<3>(1, 1, 256)), 292 | [=](sycl::nd_item<3> item_ct1) { 293 | dequantize_int8_to_half( 294 | data_in, data_out, scale_buffer, min_val_buffer, num_group, group_size); 295 | }); 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/csrc/dequantize.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "inference_cuda_layers.h" 10 | 11 | #define MAX_QUANTIZE_GROUPING 1024 12 | 13 | #define loop_unroll 1 14 | #define loop_unroll_bits 1 15 | 16 | template 17 | void dequantize_kernel(T* output, 18 | const int8_t* input, 19 | const float* qscale, 20 | int output_size, 21 | int hidden_dim, 22 | int groups, 23 | int merge_count) 24 | { 25 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 26 | unsigned merge_hidden = hidden_dim >> merge_count; 27 | unsigned quantization_stride = (merge_hidden * output_size) / groups; 28 | 29 | unsigned bid = item_ct1.get_group(2); 30 | unsigned tid = item_ct1.get_local_id(2); 31 | 32 | while (tid < output_size) { 33 | unsigned w_index = bid / merge_hidden; 34 | unsigned q_index = tid + bid * output_size; 35 | 36 | auto q = input[q_index]; 37 | 38 | unsigned merge_hidden_total = w_index * merge_hidden; 39 | unsigned scale_index = 40 | ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) 41 | << merge_count) + 42 | w_index; 43 | 44 | float scale_data = qscale[scale_index]; 45 | 46 | output[q_index] = conversion::to(scale_data * (float)q); 47 | tid += item_ct1.get_local_range(2); 48 | } 49 | } 50 | 51 | template 52 | void launch_dequantize(T* output, 53 | const int8_t* input, 54 | const float* qscale, 55 | unsigned output_size, 56 | unsigned hidden_dim, 57 | unsigned groups, 58 | unsigned merge_count, 59 | dpct::queue_ptr stream) 60 | { 61 | unsigned threads = 1024; 62 | sycl::range<3> block_dims(1, 1, threads); 63 | sycl::range<3> grid_dims(1, 1, hidden_dim); 64 | 65 | /* 66 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the 67 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
68 | */ 69 | { 70 | dpct::has_capability_or_fail(stream->get_device(), 71 | {sycl::aspect::fp64, sycl::aspect::fp16}); 72 | stream->parallel_for( 73 | sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { 74 | dequantize_kernel( 75 | output, input, qscale, output_size, hidden_dim, groups, merge_count); 76 | }); 77 | } 78 | } 79 | 80 | #define INSTANTIATE_DEQUANTIZE_MERGE(T) \ 81 | template void launch_dequantize( \ 82 | T*, const int8_t*, const float*, unsigned, unsigned, unsigned, unsigned, dpct::queue_ptr); 83 | 84 | INSTANTIATE_DEQUANTIZE_MERGE(float); 85 | #ifdef BF16_AVAILABLE 86 | INSTANTIATE_DEQUANTIZE_MERGE(sycl::ext::oneapi::bfloat16); 87 | #endif 88 | INSTANTIATE_DEQUANTIZE_MERGE(sycl::half); 89 | 90 | void dequantize_kernel(float* output, 91 | const int8_t* input, 92 | const float* qscale, 93 | int hidden_dim, 94 | unsigned merge_hidden, 95 | int cnt) 96 | { 97 | } 98 | 99 | template 100 | void dequantize_kernel(T* output, 101 | const int8_t* input, 102 | const float* qscale, 103 | unsigned hidden_dim, 104 | unsigned merge_hidden, 105 | int cnt) 106 | { 107 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 108 | unsigned bid = item_ct1.get_group(2) * item_ct1.get_group_range(1) + item_ct1.get_group(1); 109 | unsigned tid = item_ct1.get_local_id(2); 110 | 111 | float local_scale = qscale[item_ct1.get_group(2)]; 112 | 113 | const float* input_cast = reinterpret_cast(input); 114 | sycl::float2* output_cast = reinterpret_cast(output); 115 | 116 | input_cast += bid * merge_hidden; 117 | output_cast += bid * merge_hidden; 118 | 119 | for (int c = 0; c < cnt; c++) { 120 | if (tid < merge_hidden) { 121 | float q = input_cast[tid]; 122 | int8_t* q_int8 = (int8_t*)&q; 123 | 124 | sycl::float2 q_f; 125 | T* q_h = (T*)&q_f; 126 | 127 | q_h[0] = conversion::to(local_scale * (float)q_int8[0]); 128 | q_h[1] = conversion::to(local_scale * (float)q_int8[1]); 129 | q_h[2] = conversion::to(local_scale * (float)q_int8[2]); 130 | q_h[3] = conversion::to(local_scale * (float)q_int8[3]); 131 | output_cast[tid] = q_f; 132 | tid += item_ct1.get_local_range(2); 133 | } 134 | } 135 | } 136 | 137 | template 138 | void launch_dequantize(T* output, 139 | const int8_t* input, 140 | const float* qscale, 141 | unsigned output_size, 142 | unsigned hidden_dim, 143 | unsigned groups, 144 | dpct::queue_ptr stream) 145 | { 146 | unsigned threads = 1024; 147 | hidden_dim /= 4; 148 | unsigned thd_cnt = (hidden_dim - 1) / threads + 1; 149 | 150 | assert(output_size % groups == 0); 151 | unsigned blocks = output_size / groups; 152 | 153 | sycl::range<3> block_dims(1, 1, threads); 154 | sycl::range<3> grid_dims(1, blocks, groups); 155 | 156 | /* 157 | DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the limit. To get the 158 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
159 | */ 160 | { 161 | dpct::has_capability_or_fail(stream->get_device(), 162 | {sycl::aspect::fp64, sycl::aspect::fp16}); 163 | stream->parallel_for( 164 | sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { 165 | dequantize_kernel(output, input, qscale, hidden_dim, hidden_dim, thd_cnt); 166 | }); 167 | } 168 | } 169 | 170 | #define INSTANTIATE_DEQUANTIZE_NO_MERGE(T) \ 171 | template void launch_dequantize( \ 172 | T*, const int8_t*, const float*, unsigned, unsigned, unsigned, dpct::queue_ptr); 173 | 174 | INSTANTIATE_DEQUANTIZE_NO_MERGE(float); 175 | #ifdef BF16_AVAILABLE 176 | INSTANTIATE_DEQUANTIZE_NO_MERGE(sycl::ext::oneapi::bfloat16); 177 | #endif 178 | INSTANTIATE_DEQUANTIZE_NO_MERGE(sycl::half); 179 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/csrc/pointwise_ops.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "ds_kernel_utils.h" 10 | #include "memory_access_utils.h" 11 | 12 | namespace pwise { 13 | constexpr int granularity = 16; 14 | constexpr int unroll = 4; 15 | constexpr int threads = 256; 16 | } // namespace pwise 17 | 18 | template 19 | void vector_add_kernel(T* out, const T* a, const T* b, float gamma, int num_elems) 20 | { 21 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 22 | constexpr int T_per_access = pwise::granularity / sizeof(T); 23 | 24 | const int block_offset = item_ct1.get_group(2) * pwise::threads * pwise::unroll * T_per_access; 25 | const int thread_offset = item_ct1.get_local_id(2) * T_per_access; 26 | const int total_offset = block_offset + thread_offset; 27 | constexpr int stride = pwise::threads * T_per_access; 28 | 29 | #pragma unroll 30 | for (int i = 0; i < pwise::unroll; i++) { 31 | T temp_buf_a[T_per_access], temp_buf_b[T_per_access]; 32 | 33 | const int iter_idx = total_offset + i * stride; 34 | 35 | mem_access::load_global(temp_buf_a, a + iter_idx, iter_idx < num_elems); 36 | mem_access::load_global(temp_buf_b, b + iter_idx, iter_idx < num_elems); 37 | 38 | #pragma unroll 39 | for (int j = 0; j < T_per_access; j++) { 40 | float up_cast_a = conversion::to(temp_buf_a[j]); 41 | float up_cast_b = conversion::to(temp_buf_b[j]); 42 | temp_buf_a[j] = conversion::to((gamma * up_cast_a) + up_cast_b); 43 | } 44 | 45 | if (iter_idx < num_elems) { 46 | mem_access::store_global(out + iter_idx, temp_buf_a); 47 | } 48 | } 49 | } 50 | 51 | template 52 | void launch_vector_add(T* out, 53 | const T* a, 54 | const T* b, 55 | float gamma, 56 | int num_elems, 57 | dpct::queue_ptr stream) 58 | { 59 | constexpr int T_per_access = pwise::granularity / sizeof(T); 60 | constexpr int T_per_block = pwise::threads * T_per_access * pwise::unroll; 61 | 62 | sycl::range<3> block(1, 1, pwise::threads); 63 | sycl::range<3> grid(1, 1, (num_elems + T_per_block - 1) / T_per_block); 64 | 65 | { 66 | dpct::has_capability_or_fail(stream->get_device(), 67 | {sycl::aspect::fp64, sycl::aspect::fp16}); 68 | stream->parallel_for(sycl::nd_range<3>(grid * block, block), 69 | [=](sycl::nd_item<3> item_ct1) { 70 | vector_add_kernel(out, a, b, gamma, num_elems); 71 | }); 72 | } 73 | } 74 | 75 | #define INSTANTIATE_VECTOR_ADD(T) \ 76 | template void launch_vector_add( \ 77 | T * out, const T* a, const T* b, 
float gamma, int num_elems, dpct::queue_ptr stream); 78 | 79 | INSTANTIATE_VECTOR_ADD(float) 80 | INSTANTIATE_VECTOR_ADD(sycl::half) 81 | #ifdef BF16_AVAILABLE 82 | INSTANTIATE_VECTOR_ADD(sycl::ext::oneapi::bfloat16) 83 | #endif 84 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/csrc/relu.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "inference_cuda_layers.h" 10 | #include "memory_access_utils.h" 11 | 12 | #define MAX_CAP 4 13 | #define MAX_SEQ 2048 14 | 15 | inline float relu(const float x) { return x < 0 ? 0 : x; } 16 | 17 | /* 18 | In-place relu(biasAdd(x)) for channels last 19 | */ 20 | template 21 | void fused_bias_relu(T* input, const T* bias, int total_count, int intermediate_size) 22 | { 23 | // Input restriction: intermediate_size % vals_per_access == 0 24 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 25 | constexpr int granularity = 16; 26 | constexpr int values_per_access = granularity / sizeof(T); 27 | const int offset = 28 | (item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2)) * 29 | values_per_access; 30 | 31 | if (offset < total_count) { 32 | T data[values_per_access]; 33 | T data_bias[values_per_access]; 34 | mem_access::load_global(data, input + offset); 35 | mem_access::load_global( 36 | data_bias, bias + (offset % intermediate_size), bias != nullptr); 37 | 38 | #pragma unroll 39 | for (int i = 0; i < values_per_access; i++) { 40 | float data_f = conversion::to(data[i]); 41 | float bias_f = conversion::to(data_bias[i]); 42 | data[i] = conversion::to(relu(data_f + bias_f)); 43 | } 44 | 45 | mem_access::store_global(input + offset, data); 46 | } 47 | } 48 | 49 | template 50 | void launch_bias_relu(T* input, 51 | const T* bias, 52 | int intermediate_size, 53 | int batch_size, 54 | dpct::queue_ptr stream) 55 | { 56 | constexpr int threads = 1024; 57 | constexpr int granularity = 16; 58 | 59 | const int total_count = batch_size * intermediate_size; 60 | const int elems_per_block = threads * (granularity / sizeof(T)); 61 | sycl::range<3> block_dims(1, 1, threads); 62 | sycl::range<3> grid_dims(1, 1, (total_count + elems_per_block - 1) / elems_per_block); 63 | 64 | /* 65 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the 66 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
67 | */ 68 | { 69 | dpct::has_capability_or_fail(stream->get_device(), 70 | {sycl::aspect::fp64, sycl::aspect::fp16}); 71 | stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), 72 | [=](sycl::nd_item<3> item_ct1) { 73 | fused_bias_relu(input, bias, total_count, intermediate_size); 74 | }); 75 | } 76 | } 77 | 78 | #define INSTANTIATE_LAUNCH_BIAS_RELU(T) \ 79 | template void launch_bias_relu(T*, const T*, int, int, dpct::queue_ptr); 80 | 81 | INSTANTIATE_LAUNCH_BIAS_RELU(float) 82 | #ifdef BF16_AVAILABLE 83 | INSTANTIATE_LAUNCH_BIAS_RELU(sycl::ext::oneapi::bfloat16) 84 | #endif 85 | INSTANTIATE_LAUNCH_BIAS_RELU(sycl::half) 86 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/includes/inference_cuda_layers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include "ds_kernel_utils.h" 11 | 12 | #ifdef BF16_AVAILABLE 13 | #endif 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define MAX_WARP_NUM 32 20 | #define WARP_SIZE 32 21 | 22 | #define MAX_THREADS 1024 23 | #define SMs 80 24 | 25 | #define MAX_REGISTERS 256 26 | 27 | template 28 | void launch_attn_softmax_v2(T* vals, 29 | T* mask, 30 | T* alibi, 31 | float layer_scale, 32 | bool triangular, 33 | bool recompute, 34 | bool local_attention, 35 | int window_size, 36 | int batch_size, 37 | int heads, 38 | int num_seq, 39 | int sequence_length, 40 | int offset, 41 | int mask_stride, 42 | int mp_size, 43 | dpct::queue_ptr stream); 44 | 45 | // Fused bias add with gelu activation 46 | template 47 | void launch_bias_gelu(T* input, 48 | const T* bias, 49 | int intermediate_size, 50 | int batch_size, 51 | dpct::queue_ptr stream); 52 | 53 | template 54 | void launch_gated_activation(T* output, 55 | const T* activation, 56 | const T* bias, 57 | int rows, 58 | int output_stride, 59 | int elems_per_row, 60 | bool use_gelu, 61 | dpct::queue_ptr stream); 62 | 63 | // Fused bias add with relu activation 64 | template 65 | void launch_bias_relu(T* input, 66 | const T* bias, 67 | int intermediate_size, 68 | int batch_size, 69 | dpct::queue_ptr stream); 70 | 71 | template 72 | void launch_bias_add(T* input, 73 | const T* bias, 74 | int hidden_size, 75 | int batch_size, 76 | dpct::queue_ptr stream); 77 | 78 | template 79 | void launch_bias_residual(T* input, 80 | T* output, 81 | T* attn, 82 | T* bias, 83 | T* attn_bias, 84 | int batch, 85 | int hidden_dim, 86 | int mp_size, 87 | bool preln, 88 | dpct::queue_ptr stream); 89 | 90 | template 91 | void launch_fused_ln(T* output, 92 | const T* vals, 93 | const T* gamma, 94 | const T* beta, 95 | float epsilon, 96 | int rows, 97 | int elems_per_row, 98 | dpct::queue_ptr stream); 99 | 100 | template 101 | void launch_fused_residual_ln(T* output, 102 | const T* vals, 103 | const T* residual, 104 | const T* bias, 105 | const T* gamma, 106 | const T* beta, 107 | float epsilon, 108 | int rows, 109 | int elems_per_row, 110 | dpct::queue_ptr stream); 111 | 112 | template 113 | void launch_fused_residual_ln_store_pre_ln_res(T* norm_output, 114 | T* res_output, 115 | const T* vals, 116 | const T* residual, 117 | const T* bias, 118 | const T* gamma, 119 | const T* beta, 120 | float epsilon, 121 | int rows, 122 | int elems_per_row, 123 | dpct::queue_ptr stream); 124 | 125 | template 126 
| void launch_rms_norm(T* norm_output, 127 | T* res_output, 128 | const T* vals, 129 | const T* residual, 130 | const T* gamma, 131 | float epsilon, 132 | int rows, 133 | int elems_per_row, 134 | dpct::queue_ptr stream); 135 | 136 | template <typename T> 137 | void launch_dequantize(T* output, 138 | const int8_t* input, 139 | const float* qscale, 140 | unsigned output_size, 141 | unsigned hidden_dim, 142 | unsigned groups, 143 | unsigned merge_count, 144 | dpct::queue_ptr stream); 145 | 146 | template <typename T> 147 | void launch_dequantize(T* output, 148 | const int8_t* input, 149 | const float* qscale, 150 | unsigned output_size, 151 | unsigned hidden_dim, 152 | unsigned groups, 153 | dpct::queue_ptr stream); 154 | template <typename T> 155 | void launch_gptj_residual_add(T* input, 156 | T* output, 157 | T* attn, 158 | T* bias, 159 | T* attn_bias, 160 | int batch, 161 | int head_size, 162 | int mp_size, 163 | dpct::queue_ptr stream); 164 | 165 | template <typename T> 166 | void launch_apply_rotary_pos_emb(T* mixed_query, 167 | T* key_layer, 168 | unsigned head_size, 169 | unsigned seq_len, 170 | unsigned rotary_dim, 171 | unsigned offset, 172 | unsigned num_heads, 173 | unsigned batch, 174 | float rope_theta, 175 | dpct::queue_ptr stream, 176 | int max_out_tokens); 177 | 178 | template <typename T> 179 | void launch_moe_res_matmul(T* residual, 180 | T* coef, 181 | T* mlp_out, 182 | int seq_len, 183 | int hidden_dim, 184 | dpct::queue_ptr stream); 185 | 186 | // 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] 187 | template <typename T> 188 | void launch_transform4d_0213(T* out, 189 | const T* in, 190 | int batch_size, 191 | int heads, 192 | int seq_length, 193 | int hidden_dim, 194 | dpct::queue_ptr stream, 195 | int trans_count); 196 | template <typename T> 197 | void launch_bias_add_transform_0213(T* outputs, 198 | T* vals, 199 | T* vals1, 200 | const T* vals2, 201 | const T* bias, 202 | int batch_size, 203 | int seq_length, 204 | unsigned seq_offset, 205 | int seq_length1, 206 | int hidden_dim, 207 | int heads, 208 | int num_kv, 209 | int rotary_dim, 210 | bool rotate_half, 211 | bool rotate_every_two, 212 | dpct::queue_ptr stream, 213 | int trans_count, 214 | int max_out_tokens, 215 | float rope_theta); 216 | template <typename T> 217 | void pad_data(T* padded_output, 218 | T* output, 219 | int bsz, 220 | int head_size, 221 | int padded_head_size, 222 | dpct::queue_ptr stream); 223 | 224 | template <typename T> 225 | void pad_head_seq(T* padded_output, 226 | T* output, 227 | int bsz, 228 | int seq_len, 229 | int padded_seq_len, 230 | int head_size, 231 | int padded_head_size, 232 | dpct::queue_ptr stream); 233 | 234 | template <typename T> 235 | void launch_pad_add_transform_0213(T* output, 236 | const T* vals, 237 | int batch_size, 238 | int hidden_dim, 239 | int seq_length, 240 | int padded_seq_len, 241 | int heads, 242 | int padded_head_size, 243 | dpct::queue_ptr stream); 244 | 245 | template <typename T> 246 | void launch_vector_add(T* out, 247 | const T* a, 248 | const T* b, 249 | float gamma, 250 | int num_elems, 251 | dpct::queue_ptr stream); 252 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/flash_attn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Function 7 | from deepspeed.ops.op_builder.builder import OpBuilder, TORCH_MAJOR, TORCH_MINOR 8 | import fmha_module 9 | 10 | 11 | class FlashAttnFunc(Function): 12 | 13 | @staticmethod 14 | def forward(ctx, query, key,
value, dropout_p, softmax_scale, is_causal): 15 | """ 16 | Shape of qkv and out: [Bs, Hn, Sl, Hs] 17 | Bs: batch size 18 | Hn: head number 19 | Sl: sequence length 20 | Hs: head size 21 | """ 22 | bs, hn, sl, hs = query.shape 23 | if softmax_scale is None: 24 | softmax_scale = hs ** (-0.5) 25 | dropout_seed = torch.seed() 26 | is_training = True 27 | is_dropout = (dropout_p != 0) 28 | 29 | out, softmax_L = fmha_module.flash_attn_fwd( 30 | query, key, value, bs, hn, sl, hs, softmax_scale, 31 | dropout_p, dropout_seed, 32 | is_causal, is_training, is_dropout 33 | ) 34 | 35 | ctx.save_for_backward(query, key, value, out, softmax_L) 36 | ctx.dropout_p = dropout_p 37 | ctx.dropout_seed = dropout_seed 38 | ctx.softmax_scale = softmax_scale 39 | ctx.is_causal = is_causal 40 | ctx.is_dropout = is_dropout 41 | 42 | return out 43 | 44 | @staticmethod 45 | def backward(ctx, dout, *args): 46 | q, k, v, out, softmax_L = ctx.saved_tensors 47 | bs, hn, sl, hs = q.shape 48 | 49 | dq, dk, dv = fmha_module.flash_attn_bwd( 50 | dout, q, k, v, out, bs, hn, sl, hs, ctx.softmax_scale, 51 | ctx.dropout_p, ctx.dropout_seed, 52 | ctx.is_causal, ctx.is_dropout, softmax_L 53 | ) 54 | return dq, dk, dv, None, None, None 55 | 56 | 57 | class FlashAttentionBuilderObject(): 58 | def __init__(self): 59 | pass 60 | 61 | # general functions 62 | def flash_attn_func_v2(self, q, k, v, 63 | dropout_p, softmax_scale, is_causal): 64 | if q.shape[-1] <= 256: 65 | return FlashAttnFunc.apply(q, k, v, dropout_p, softmax_scale, is_causal) 66 | else: 67 | return self.flash_attn_fwd_func(q, k, v, dropout_p) 68 | 69 | # forward functions 70 | def flash_attn_fwd_func(self, q, k, v, dropout_p): 71 | hs_rsqrt_scale = q.shape[-1] ** (-0.5) 72 | attention_scores = torch.matmul(q, k.transpose(-1, -2)) 73 | attention_scores = attention_scores * hs_rsqrt_scale 74 | 75 | triu_mask = (torch.triu(torch.ones_like(attention_scores), diagonal=1) == 1) 76 | attention_scores.masked_fill_(triu_mask, -torch.inf) 77 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 78 | 79 | attention_probs = nn.Dropout(dropout_p)(attention_probs) 80 | 81 | context_layer = torch.matmul(attention_probs, v) 82 | return context_layer 83 | 84 | 85 | 86 | class FlashAttentionBuilder(OpBuilder): 87 | BUILD_VAR = "DS_BUILD_FlashAttention" 88 | NAME = "flash_attn" 89 | 90 | def __init__(self): 91 | super().__init__(name=self.NAME) 92 | 93 | def absolute_name(self): 94 | return f'deepspeed.ops.{self.NAME}_op' 95 | 96 | def sources(self): 97 | return [ 98 | sycl_kernel_path('csrc/flash_attn/flash_attn.dp.cpp'), 99 | sycl_kernel_path('csrc/flash_attn/fmha_fwd.cpp'), 100 | sycl_kernel_path('csrc/flash_attn/fmha_bwd.cpp'), 101 | ] 102 | 103 | def include_paths(self): 104 | return [] 105 | 106 | def extra_ldflags(self): 107 | return [] 108 | 109 | def cxx_args(self): 110 | return [] 111 | 112 | def load(self): 113 | return FlashAttentionBuilderObject() 114 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/fused_adam.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 5 | 6 | 7 | class FusedAdamBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_FUSED_ADAM" 9 | NAME = "fused_adam" 10 | 11 | def __init__(self): 12 | super().__init__(name=self.NAME) 13 | 14 | def absolute_name(self): 15 | return f'deepspeed.ops.adam.{self.NAME}_op' 16 | 
17 | def sources(self): 18 | return [ 19 | sycl_kernel_path('csrc/adam/fused_adam_frontend.cpp'), 20 | sycl_kernel_path('csrc/adam/multi_tensor_adam.dp.cpp'), 21 | ] 22 | 23 | def include_paths(self): 24 | return [ 25 | sycl_kernel_include('csrc/includes'), 26 | sycl_kernel_include('csrc/adam'), 27 | ] 28 | 29 | def cxx_args(self): 30 | args = super().cxx_args() 31 | return args + self.version_dependent_macros() 32 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/quantizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 7 | 8 | 9 | class QuantizerBuilder(SYCLOpBuilder): 10 | BUILD_VAR = "DS_BUILD_QUANTIZER" 11 | NAME = "quantizer" 12 | 13 | def __init__(self, name=None): 14 | name = self.NAME if name is None else name 15 | super().__init__(name=name) 16 | 17 | def absolute_name(self): 18 | return f'deepspeed.ops.quantizer.{self.NAME}_op' 19 | 20 | def sources(self): 21 | return [ 22 | sycl_kernel_path('csrc/quantization/pt_binding.cpp'), 23 | sycl_kernel_path('csrc/quantization/fake_quantizer.dp.cpp'), 24 | sycl_kernel_path('csrc/quantization/quantize.dp.cpp'), 25 | sycl_kernel_path('csrc/quantization/quantize_intX.dp.cpp'), 26 | sycl_kernel_path('csrc/quantization/dequantize.dp.cpp'), 27 | sycl_kernel_path('csrc/quantization/swizzled_quantize.dp.cpp'), 28 | sycl_kernel_path('csrc/quantization/quant_reduce.dp.cpp'), 29 | ] 30 | 31 | def include_paths(self): 32 | return [sycl_kernel_include('csrc/includes')] 33 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/transformer_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 7 | 8 | 9 | class InferenceBuilder(SYCLOpBuilder): 10 | BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" 11 | NAME = "transformer_inference" 12 | 13 | def __init__(self, name=None): 14 | name = self.NAME if name is None else name 15 | super().__init__(name=name) 16 | 17 | def absolute_name(self): 18 | return f'deepspeed.ops.transformer.inference.{self.NAME}_op' 19 | 20 | def is_compatible(self, verbose=True): 21 | return super().is_compatible(verbose) 22 | 23 | def cxx_args(self): 24 | args = super().cxx_args() 25 | args.append('-DBF16_AVAILABLE') 26 | return args 27 | 28 | def sources(self): 29 | return [ 30 | sycl_kernel_path('csrc/transformer/inference/csrc/pt_binding.cpp'), 31 | sycl_kernel_path('csrc/transformer/inference/csrc/gelu.dp.cpp'), 32 | sycl_kernel_path('csrc/transformer/inference/csrc/relu.dp.cpp'), 33 | sycl_kernel_path('csrc/transformer/inference/csrc/layer_norm.dp.cpp'), 34 | sycl_kernel_path('csrc/transformer/inference/csrc/rms_norm.dp.cpp'), 35 | sycl_kernel_path('csrc/transformer/inference/csrc/softmax.dp.cpp'), 36 | sycl_kernel_path('csrc/transformer/inference/csrc/dequantize.dp.cpp'), 37 | sycl_kernel_path('csrc/transformer/inference/csrc/apply_rotary_pos_emb.dp.cpp'), 38 | sycl_kernel_path('csrc/transformer/inference/csrc/transform.dp.cpp'), 39 | sycl_kernel_path('csrc/transformer/inference/csrc/pointwise_ops.dp.cpp'), 40 | ] 41 | 42 | def include_paths(self): 43 | includes = [ 44 | sycl_kernel_include('csrc/transformer/inference/includes'), 45 | sycl_kernel_include('csrc/includes'), 46 | ] 47 | return includes 48 | 49 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include # noqa: F401 5 | 6 | 7 | class UtilsBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_UTILS" 9 | NAME = "utils" 10 | 11 | def __init__(self, name=None): 12 | name = self.NAME if name is None else name 13 | super().__init__(name=name) 14 | 15 | def absolute_name(self): 16 | return f'deepspeed.ops.{self.NAME}_op' 17 | 18 | def sources(self): 19 | return ['csrc/utils/flatten_unflatten.cpp'] 20 | 21 | def include_paths(self): 22 | return [] 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import subprocess 3 | import os 4 | import intel_extension_for_pytorch 5 | from torch.xpu.cpp_extension import DPCPPExtension, DpcppBuildExtension 6 | 7 | PACKAGE_NAME="intel_extension_for_deepspeed" 8 | 9 | version_str = "0.9.4" 10 | git_branch_cmd = "git rev-parse --abbrev-ref HEAD" 11 | git_hash_cmd = "git rev-parse --short HEAD" 12 | 13 | def get_project_dir(): 14 | project_root_dir = os.path.dirname(__file__) 15 | return os.path.abspath(project_root_dir) 16 | 17 | def get_csrc_dir(op_name=""): 18 | project_root_dir = os.path.join(get_project_dir(), PACKAGE_NAME + "/op_builder/csrc/" + str(op_name)) 19 | return 
os.path.abspath(project_root_dir) 20 | 21 | def get_xetla_dir(): 22 | project_root_dir = os.path.join(get_project_dir(), "third_party/xetla/include") 23 | return os.path.abspath(project_root_dir) 24 | 25 | def create_ext_modules(op_name=""): 26 | cpp_files = [] 27 | include_dirs = [] 28 | 29 | for path, dir_list, file_list in os.walk(get_csrc_dir(op_name)): 30 | for file_name in file_list: 31 | if file_name.endswith('.cpp'): 32 | cpp_files += [os.path.join(path, file_name)] 33 | for path, dir_list, file_list in os.walk(get_csrc_dir()): 34 | for file_name in file_list: 35 | if file_name.endswith('.hpp') or file_name.endswith('.h'): 36 | include_dirs += [path] 37 | break 38 | include_dirs += [get_xetla_dir()] 39 | cxx_flags = [ 40 | '-fsycl', '-O3', '-std=c++20', '-w', '-fPIC', '-DMKL_ILP64', 41 | '-fsycl-targets=spir64_gen', 42 | "-Xs \"-device pvc -options '-vc-disable-indvars-opt -vc-codegen -doubleGRF -Xfinalizer -printregusage -Xfinalizer -enableBCR -DPASTokenReduction '\" " 43 | ] 44 | extra_ldflags = [ 45 | '-fsycl', '-fPIC', '-Wl,-export-dynamic', '-fsycl-targets=spir64_gen', 46 | "-Xs \"-device pvc -options '-vc-disable-indvars-opt -vc-codegen -doubleGRF -Xfinalizer -printregusage -Xfinalizer -enableBCR -DPASTokenReduction '\" " 47 | ] 48 | ext_modules = [ 49 | DPCPPExtension(name="fmha_module", 50 | sources=cpp_files, 51 | include_dirs=include_dirs, 52 | extra_compile_args={'cxx': cxx_flags}, 53 | extra_link_args=extra_ldflags) 54 | ] 55 | return ext_modules 56 | 57 | def command_exists(cmd): 58 | result = subprocess.Popen(f'type {cmd}', 59 | stdout=subprocess.PIPE, 60 | shell=True) 61 | return result.wait() == 0 62 | 63 | 64 | if command_exists('git'): 65 | try: 66 | result = subprocess.check_output(git_hash_cmd, shell=True) 67 | git_hash = result.decode('utf-8').strip() 68 | result = subprocess.check_output(git_branch_cmd, shell=True) 69 | git_branch = result.decode('utf-8').strip() 70 | except subprocess.CalledProcessError: 71 | git_hash = "unknown" 72 | git_branch = "unknown" 73 | else: 74 | git_hash = "unknown" 75 | git_branch = "unknown" 76 | 77 | 78 | def _build_installation_dependency(): 79 | install_requires = [] 80 | install_requires.append("setuptools") 81 | return install_requires 82 | 83 | def _check_env_flag(name, default=""): 84 | return os.getenv(name, default).upper() in ["Y", "1"]; 85 | 86 | print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") 87 | 88 | if _check_env_flag("GIT_VERSIONED_BUILD", default="1"): 89 | version_str += f'+{git_hash}' 90 | 91 | ext_modules = create_ext_modules("flash_attn") 92 | cmdclass = {'build_ext': DpcppBuildExtension} 93 | 94 | long_description = "" 95 | currentdir = os.path.abspath(os.path.dirname(__file__)) 96 | with open(os.path.join(currentdir, "README.md"), encoding="utf-8") as f: 97 | long_description = f.read() 98 | 99 | setup(name=PACKAGE_NAME, 100 | version=version_str, 101 | description="Intel® Extension for DeepSpeed*", 102 | long_description=long_description, 103 | long_description_content_type="text/markdown", 104 | url="https://github.com/intel/intel-extension-for-deepspeed", 105 | author="Intel Corporation", 106 | install_requires=_build_installation_dependency(), 107 | include_package_data=True, 108 | packages=[PACKAGE_NAME], 109 | ext_modules=ext_modules, 110 | cmdclass=cmdclass, 111 | license="https://opensource.org/license/mit") 112 | --------------------------------------------------------------------------------
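
A minimal usage sketch (not part of the repository) for the flash-attention path in op_builder/flash_attn.py. It assumes the fmha_module extension produced by setup.py has already been built and that an XPU device is available through intel_extension_for_pytorch; the sizes, dtype, and device below are illustrative assumptions. Tensors follow the [Bs, Hn, Sl, Hs] layout documented in FlashAttnFunc.forward, and flash_attn_func_v2 dispatches to the fused fmha_module kernels for head sizes up to 256, falling back to the eager flash_attn_fwd_func otherwise.

import torch
import intel_extension_for_pytorch  # noqa: F401  (assumed available; registers the XPU backend)
# Importing flash_attn also imports fmha_module, so the extension must be built first.
from intel_extension_for_deepspeed.op_builder.flash_attn import FlashAttentionBuilder

bs, hn, sl, hs = 2, 16, 1024, 64  # batch size, head number, sequence length, head size (illustrative)
q = torch.randn(bs, hn, sl, hs, device="xpu", dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

# load() returns a FlashAttentionBuilderObject rather than a compiled op.
flash_attn = FlashAttentionBuilder().load()
out = flash_attn.flash_attn_func_v2(q, k, v, 0.0, None, True)  # dropout_p, softmax_scale, is_causal
out.sum().backward()  # gradients flow through fmha_module.flash_attn_bwd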
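
A similarly hedged sketch for the other op builders above (FusedAdamBuilder, QuantizerBuilder, InferenceBuilder, UtilsBuilder): each one lists sources() and include_paths(), and DeepSpeed's OpBuilder.load() is expected to either import a pre-installed op or JIT-compile those sources on first use. The exact build behavior depends on the installed DeepSpeed version, so treat this as an assumption rather than a guarantee.

from intel_extension_for_deepspeed.op_builder.fused_adam import FusedAdamBuilder
from intel_extension_for_deepspeed.op_builder.transformer_inference import InferenceBuilder

fused_adam_op = FusedAdamBuilder().load()  # compiles the csrc/adam sources against csrc/includes
inference_op = InferenceBuilder().load()   # compiles the csrc/transformer/inference kernels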