├── .clang-format ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── Security.md ├── examples ├── LICENSE ├── README.md ├── generate_config.sh ├── generate_hostfile.sh ├── gpt.sh ├── hostfile_deepspeed ├── hostfile_mpich ├── llm_inference.sh ├── run10p175b.sh ├── run175b.sh ├── run20b.sh ├── run3.6b.sh └── run_llama.sh ├── intel_extension_for_deepspeed ├── __init__.py ├── op_builder │ ├── __init__.py │ ├── async_io.py │ ├── builder.py │ ├── cpu_adagrad.py │ ├── cpu_adam.py │ ├── csrc │ │ ├── adagrad │ │ │ └── cpu_adagrad.cpp │ │ ├── adam │ │ │ ├── cpu_adam.cpp │ │ │ ├── cpu_adam_impl.cpp │ │ │ ├── fused_adam_frontend.cpp │ │ │ ├── multi_tensor_adam.dp.cpp │ │ │ └── multi_tensor_apply.dp.hpp │ │ ├── common │ │ │ └── custom_cuda_kernel.dp.cpp │ │ ├── flash_attn │ │ │ ├── flash_attn.dp.cpp │ │ │ ├── flash_attn.h │ │ │ ├── fmha_bwd.cpp │ │ │ ├── fmha_bwd_utils.h │ │ │ ├── fmha_fwd.cpp │ │ │ ├── fmha_policy.h │ │ │ ├── fmha_utils.h │ │ │ └── mha.h │ │ ├── includes │ │ │ ├── StopWatch.h │ │ │ ├── compat.h │ │ │ ├── context.h │ │ │ ├── conversion_utils.h │ │ │ ├── cpu_adagrad.h │ │ │ ├── cpu_adam.h │ │ │ ├── cublas_wrappers.h │ │ │ ├── custom_cuda_layers.h │ │ │ ├── dequantization_utils.h │ │ │ ├── dpct │ │ │ │ ├── atomic.hpp │ │ │ │ ├── blas_utils.hpp │ │ │ │ ├── ccl_utils.hpp │ │ │ │ ├── device.hpp │ │ │ │ ├── dnnl_utils.hpp │ │ │ │ ├── dpct.hpp │ │ │ │ ├── dpl_utils.hpp │ │ │ │ ├── fft_utils.hpp │ │ │ │ ├── image.hpp │ │ │ │ ├── kernel.hpp │ │ │ │ ├── lapack_utils.hpp │ │ │ │ ├── lib_common_utils.hpp │ │ │ │ ├── math.hpp │ │ │ │ ├── memory.hpp │ │ │ │ ├── rng_utils.hpp │ │ │ │ ├── sparse_utils.hpp │ │ │ │ └── util.hpp │ │ │ ├── ds_kernel_utils.h │ │ │ ├── gemm_test.h │ │ │ ├── memory_access_utils.h │ │ │ ├── quantization.h │ │ │ ├── quantization_utils.h │ │ │ ├── reduction_utils.h │ │ │ ├── simd.h │ │ │ └── type_shim.h │ │ ├── quantization │ │ │ ├── dequantize.dp.cpp │ │ │ ├── fake_quantizer.dp.cpp │ │ │ ├── pt_binding.cpp │ │ │ ├── quant_reduce.dp.cpp │ │ │ ├── quantize.dp.cpp │ │ │ ├── quantize_intX.dp.cpp │ │ │ └── swizzled_quantize.dp.cpp │ │ └── transformer │ │ │ └── inference │ │ │ ├── csrc │ │ │ ├── apply_rotary_pos_emb.dp.cpp │ │ │ ├── dequantize.dp.cpp │ │ │ ├── gelu.dp.cpp │ │ │ ├── layer_norm.dp.cpp │ │ │ ├── pointwise_ops.dp.cpp │ │ │ ├── pt_binding.cpp │ │ │ ├── relu.dp.cpp │ │ │ ├── rms_norm.dp.cpp │ │ │ ├── softmax.dp.cpp │ │ │ └── transform.dp.cpp │ │ │ └── includes │ │ │ ├── inference_context.h │ │ │ ├── inference_cublas_wrappers.h │ │ │ └── inference_cuda_layers.h │ ├── flash_attn.py │ ├── fused_adam.py │ ├── quantizer.py │ ├── transformer_inference.py │ └── utils.py └── xpu_accelerator.py ├── requirements.txt └── setup.py /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # Refer to the following link for the explanation of each params: 3 | # http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html 4 | Language: Cpp 5 | # BasedOnStyle: Google 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AlignConsecutiveAssignments: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: true 12 | AlignTrailingComments: true 13 | AllowAllParametersOfDeclarationOnNextLine: false 14 | AllowShortBlocksOnASingleLine: true 15 | AllowShortCaseLabelsOnASingleLine: true 16 | AllowShortFunctionsOnASingleLine: All 17 | AllowShortIfStatementsOnASingleLine: true 18 | AllowShortLoopsOnASingleLine: 
true 19 | # This is deprecated 20 | AlwaysBreakAfterDefinitionReturnType: None 21 | AlwaysBreakAfterReturnType: None 22 | AlwaysBreakBeforeMultilineStrings: true 23 | AlwaysBreakTemplateDeclarations: true 24 | BinPackArguments: false 25 | BinPackParameters: false 26 | BraceWrapping: 27 | AfterClass: false 28 | AfterControlStatement: false 29 | AfterEnum: false 30 | AfterFunction: false 31 | AfterNamespace: false 32 | AfterObjCDeclaration: false 33 | AfterStruct: false 34 | AfterUnion: false 35 | AfterExternBlock: false 36 | BeforeCatch: false 37 | BeforeElse: false 38 | IndentBraces: false 39 | # disabling the below splits, else, they'll just add to the vertical length of source files! 40 | SplitEmptyFunction: false 41 | SplitEmptyRecord: false 42 | SplitEmptyNamespace: false 43 | BreakBeforeBinaryOperators: None 44 | BreakBeforeBraces: WebKit 45 | BreakBeforeInheritanceComma: false 46 | BreakInheritanceList: BeforeColon 47 | BreakBeforeTernaryOperators: true 48 | BreakConstructorInitializersBeforeComma: false 49 | BreakConstructorInitializers: BeforeColon 50 | BreakAfterJavaFieldAnnotations: false 51 | BreakStringLiterals: true 52 | ColumnLimit: 100 53 | CommentPragmas: '^ IWYU pragma:' 54 | CompactNamespaces: false 55 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 56 | # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform 57 | ConstructorInitializerIndentWidth: 4 58 | ContinuationIndentWidth: 4 59 | Cpp11BracedListStyle: true 60 | DerivePointerAlignment: false 61 | DisableFormat: false 62 | ExperimentalAutoDetectBinPacking: false 63 | FixNamespaceComments: true 64 | ForEachMacros: 65 | - foreach 66 | - Q_FOREACH 67 | - BOOST_FOREACH 68 | IncludeBlocks: Preserve 69 | IncludeCategories: 70 | - Regex: '^' 71 | Priority: 2 72 | - Regex: '^<.*\.h>' 73 | Priority: 1 74 | - Regex: '^<.*' 75 | Priority: 2 76 | - Regex: '.*' 77 | Priority: 3 78 | IncludeIsMainRegex: '([-_](test|unittest))?$' 79 | IndentCaseLabels: true 80 | IndentPPDirectives: None 81 | IndentWidth: 4 82 | IndentWrappedFunctionNames: false 83 | JavaScriptQuotes: Leave 84 | JavaScriptWrapImports: true 85 | KeepEmptyLinesAtTheStartOfBlocks: false 86 | MacroBlockBegin: '' 87 | MacroBlockEnd: '' 88 | MaxEmptyLinesToKeep: 1 89 | NamespaceIndentation: None 90 | ObjCBinPackProtocolList: Never 91 | ObjCBlockIndentWidth: 4 92 | ObjCSpaceAfterProperty: false 93 | ObjCSpaceBeforeProtocolList: true 94 | PenaltyBreakAssignment: 4 95 | PenaltyBreakBeforeFirstCallParameter: 1 96 | PenaltyBreakComment: 300 97 | PenaltyBreakFirstLessLess: 120 98 | PenaltyBreakString: 1000 99 | PenaltyBreakTemplateDeclaration: 10 100 | PenaltyExcessCharacter: 1000000 101 | PenaltyReturnTypeOnItsOwnLine: 200 102 | PointerAlignment: Left 103 | RawStringFormats: 104 | - Language: Cpp 105 | Delimiters: 106 | - cc 107 | - CC 108 | - cpp 109 | - Cpp 110 | - CPP 111 | - 'c++' 112 | - 'C++' 113 | CanonicalDelimiter: '' 114 | - Language: TextProto 115 | Delimiters: 116 | - pb 117 | - PB 118 | - proto 119 | - PROTO 120 | EnclosingFunctions: 121 | - EqualsProto 122 | - EquivToProto 123 | - PARSE_PARTIAL_TEXT_PROTO 124 | - PARSE_TEST_PROTO 125 | - PARSE_TEXT_PROTO 126 | - ParseTextOrDie 127 | - ParseTextProtoOrDie 128 | CanonicalDelimiter: '' 129 | BasedOnStyle: google 130 | # Enabling comment reflow causes doxygen comments to be messed up in their formats! 
131 | ReflowComments: true 132 | SortIncludes: true 133 | SortUsingDeclarations: true 134 | SpaceAfterCStyleCast: false 135 | SpaceAfterTemplateKeyword: true 136 | SpaceBeforeAssignmentOperators: true 137 | SpaceBeforeCpp11BracedList: false 138 | SpaceBeforeCtorInitializerColon: true 139 | SpaceBeforeInheritanceColon: true 140 | SpaceBeforeParens: ControlStatements 141 | SpaceBeforeRangeBasedForLoopColon: true 142 | SpaceInEmptyParentheses: false 143 | SpacesBeforeTrailingComments: 2 144 | SpacesInAngles: false 145 | SpacesInContainerLiterals: true 146 | SpacesInCStyleCastParentheses: false 147 | SpacesInParentheses: false 148 | SpacesInSquareBrackets: false 149 | Standard: Cpp11 150 | StatementMacros: 151 | - Q_UNUSED 152 | - QT_REQUIRE_VERSION 153 | # Be consistent with indent-width, even for people who use tab for indentation! 154 | TabWidth: 4 155 | UseTab: Never 156 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | build.log 3 | dist 4 | intel_extension_for_deepspeed.egg-info 5 | *.pyc 6 | cscope* 7 | tags 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/xetla"] 2 | path = third_party/xetla 3 | url = https://github.com/intel/xetla.git 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CommunityCodeOfConduct AT intel DOT com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 
22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intel Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include intel_extension_for_deepspeed/op_builder/csrc *.cpp *.hpp *.h 2 | recursive-include intel_extension_for_deepspeed *.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intel® Extension for DeepSpeed* 2 | Intel® Extension for DeepSpeed* is an extension that brings Intel GPU (XPU) support to DeepSpeed(https://github.com/Microsoft/DeepSpeed). 
It comes with the following components: 3 | 1. DeepSpeed Accelerator Interface implementation 4 | 2. DeepSpeed op builder implementation for XPU 5 | 3. DeepSpeed op builder kernel code 6 | 7 | DeepSpeed automatically uses Intel® Extension for DeepSpeed* when the extension is installed as a Python package. After installation, models that run on DeepSpeed and are ported to the DeepSpeed Accelerator Interface can run on Intel GPU devices. 8 | 9 | ## Installation 10 | 11 | 1. Install Intel® Extension for PyTorch* following [Installation Guide](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/installation.html) 12 | 13 | 2. Install Intel® Extension for DeepSpeed* 14 | ```bash 15 | pip install intel-extension-for-deepspeed 16 | ``` 17 | Or, you can build from source: 18 | ```bash 19 | source ${DPCPPROOT}/env/vars.sh 20 | python setup.py install 21 | ``` 22 | Generally, DPCPPROOT is /opt/intel/oneapi/compiler/latest for the root account and ${HOME}/intel/oneapi/compiler/latest for other accounts. 23 | 24 | 3. Install DeepSpeed 25 | 26 | ```bash 27 | pip install deepspeed 28 | ``` 29 | ## Get Started 30 | Refer to [examples](https://github.com/intel/intel-extension-for-deepspeed/tree/main/examples#readme) 31 | 32 | 33 | ## Security Policy 34 | Please report security issues or vulnerabilities to the [Intel Security Center]. 35 | 36 | For more information on how Intel works to resolve security issues, see 37 | [Vulnerability Handling Guidelines]. 38 | 39 | [Intel Security Center]:https://www.intel.com/security 40 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html 41 | 42 | [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9136/badge)](https://www.bestpractices.dev/projects/9136) 43 | -------------------------------------------------------------------------------- /Security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | 7 | -------------------------------------------------------------------------------- /examples/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes for Megatron-DeepSpeed 2 | This folder contains recipes for running models from [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) 3 | 4 | To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run them as follows: 5 | 6 | ```bash /examples/run3.6b.sh``` 7 | 8 | ### Prepare dataset 9 | 10 | To run recipes under Megatron-DeepSpeed, please set up your own dataset or use the download scripts provided by Megatron-DeepSpeed. 11 | 12 | ### Basic usage 13 | 14 | For basic usage, three recipes are provided for training models with 3.6 billion, 20 billion, and 175 billion parameters: 15 | 16 | * 3.6b: ```bash /examples/run3.6b.sh``` 17 | * 20b: ```bash /examples/run20b.sh``` 18 | * 175b: ```bash /examples/run175b.sh``` 19 | 20 | ## Run with Huggingface 21 | Intel® Extension for DeepSpeed* also works with [Huggingface Transformers](https://github.com/huggingface/transformers) and can be used for fine-tuning and inference tasks. 22 | 23 | Install Huggingface Transformers: 24 | ```bash 25 | cd /examples 26 | git clone https://github.com/huggingface/transformers.git 27 | cd transformers 28 | pip install . 29 | ``` 30 | 31 | To run a translation task with the t5-small model on a single GPU: 32 | ```bash 33 | cd transformers 34 | deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ 35 | --deepspeed tests/deepspeed/ds_config_zero2.json \ 36 | --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ 37 | --output_dir output_dir --overwrite_output_dir --bf16 \ 38 | --do_train --max_train_samples 500 --num_train_epochs 1 \ 39 | --dataset_name wmt16 --dataset_config "ro-en" \ 40 | --source_lang en --target_lang ro 41 | ``` 42 | 43 | To fine-tune the Llama-2-7b model on 8 GPUs: 44 | ```bash 45 | cd transformers 46 | deepspeed --num_gpus=8 examples/pytorch/language-modeling/run_clm.py \ 47 | --deepspeed tests/deepspeed/ds_config_zero3.json \ 48 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 49 | --dataset_name wikitext \ 50 | --dataset_config_name wikitext-2-raw-v1 \ 51 | --dataloader_num_workers 0 \ 52 | --per_device_train_batch_size 1 \ 53 | --warmup_steps 10 \ 54 | --max_steps 50 \ 55 | --bf16 \ 56 | --do_train \ 57 | --output_dir /tmp/test-clm \ 58 | --overwrite_output_dir 59 | ``` 60 | 61 | For detailed usage with huggingface/transformers, please check the [transformers documentation](https://huggingface.co/docs/transformers/en/deepspeed).
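
Before launching any of the recipes or Transformers examples above, it can be useful to confirm that DeepSpeed has picked up the XPU accelerator provided by this extension. The snippet below is a minimal sanity-check sketch, assuming DeepSpeed, Intel® Extension for PyTorch* and Intel® Extension for DeepSpeed* are installed in the same environment; it only uses DeepSpeed's public accelerator API.

```python
# Minimal sanity check (assumes deepspeed, intel_extension_for_pytorch and
# intel_extension_for_deepspeed are installed in the current environment).
from deepspeed.accelerator import get_accelerator

accelerator = get_accelerator()

# On a machine with Intel GPUs and this extension installed, the accelerator
# name is expected to be "xpu"; on other systems DeepSpeed may fall back to
# "cuda" or "cpu".
print("accelerator:", accelerator.device_name())
print("device count:", accelerator.device_count())
print("bf16 supported:", accelerator.is_bf16_supported())
```

If the printed accelerator name is not `xpu`, check that `intel_extension_for_deepspeed` is importable from the Python environment used by the `deepspeed` launcher.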
62 | -------------------------------------------------------------------------------- /examples/generate_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for v in "$GLOBAL_BATCH" "$MICRO_BATCH" "$GRAD_ACC_STEPS" "$ZERO_STAGE" \ 4 | "$PP" "$DTYPE" 5 | do 6 | if [ -z $v ]; then 7 | echo "Please export required envs before execute $0" 8 | exit 1 9 | fi 10 | done 11 | 12 | if [ $# -ne 1 ]; then 13 | echo "Usage: $0 config_file" 14 | exit 1 15 | fi 16 | 17 | extra="" 18 | common="\ 19 | \"train_batch_size\": $GLOBAL_BATCH, 20 | \"train_micro_batch_size_per_gpu\": $MICRO_BATCH, 21 | \"steps_per_print\": 1, 22 | \"gradient_accumulation_steps\": $GRAD_ACC_STEPS, 23 | \"optimizer\": { 24 | \"type\": \"Adam\", 25 | \"params\": { 26 | \"lr\": 0.00015, 27 | \"weight_decay\": 1e-2 28 | } 29 | }, 30 | \"zero_allow_untested_optimizer\": true, 31 | \"gradient_clipping\": 1.0, 32 | \"activation_checkpointing\": { 33 | \"partition_activations\": true, 34 | \"contiguous_memory_optimization\": false 35 | }, 36 | \"wall_clock_breakdown\": false," 37 | 38 | flops_profiler="\ 39 | \"flops_profiler\": { 40 | \"enabled\": false, 41 | \"profile_step\": 45, 42 | \"module_depth\": -1, 43 | \"top_modules\": 1, 44 | \"detailed\": true, 45 | \"output_file\": null 46 | }" 47 | 48 | if [[ $DTYPE == "bf16" ]]; then 49 | dtype="\ 50 | \"communication_data_type\": \"bf16\", 51 | \"fp16\": { 52 | \"enabled\": false, 53 | \"loss_scale\": 0, 54 | \"loss_scale_window\": 1000, 55 | \"hysteresis\": 2, 56 | \"min_loss_scale\": 1 57 | }, 58 | \"bfloat16\": { 59 | \"enabled\": true, 60 | \"loss_scale\": 1.0 61 | }," 62 | else 63 | dtype="\ 64 | \"communication_data_type\": \"fp16\", 65 | \"fp16\": { 66 | \"enabled\": true, 67 | \"loss_scale\": 0, 68 | \"loss_scale_window\": 1000, 69 | \"hysteresis\": 2, 70 | \"min_loss_scale\": 1 71 | }, 72 | \"bfloat16\": { 73 | \"enabled\": false, 74 | \"loss_scale\": 1.0 75 | }," 76 | fi 77 | 78 | if [ $ZERO_STAGE == 3 ]; then 79 | zero="\ 80 | \"zero_optimization\": { 81 | \"stage\": 3, 82 | \"reduce_scatter\": false, 83 | \"stage3_max_live_parameters\": 3e9, 84 | \"stage3_max_reuse_distance\": 3e9, 85 | \"stage3_param_persistence_threshold\": 1e5, 86 | \"stage3_prefetch_bucket_size\": 5e7, 87 | \"contiguous_gradients\": true, 88 | \"overlap_comm\": true, 89 | \"reduce_bucket_size\": 90000000, 90 | \"sub_group_size\": 1e9, 91 | \"offload_optimizer\": { 92 | \"device\": \"none\", 93 | \"buffer_count\": 4, 94 | \"pipeline_read\": false, 95 | \"pipeline_write\": false, 96 | \"pin_memory\": true 97 | } 98 | }," 99 | elif [ $ZERO_STAGE == 2 ]; then 100 | zero="\ 101 | \"zero_optimization\": { 102 | \"stage\": $ZERO_STAGE, 103 | \"allgather_partitions\": true, 104 | \"allgather_bucket_size\": \"auto\", 105 | \"overlap_comm\": true, 106 | \"reduce_scatter\": false, 107 | \"reduce_bucket_size\": 90000000, 108 | \"contiguous_gradients\": true, 109 | \"offload_optimizer\": { 110 | \"device\": \"none\", 111 | \"buffer_count\": 4, 112 | \"pipeline_read\": false, 113 | \"pipeline_write\": false, 114 | \"pin_memory\": true 115 | } 116 | }," 117 | elif [ $ZERO_STAGE == 1 ]; then 118 | zero="\ 119 | \"zero_optimization\": { 120 | \"stage\": $ZERO_STAGE 121 | }," 122 | else 123 | echo 'Please add the correct config set!!!' 
124 | fi 125 | 126 | # flops_profiler must at the end because no ',' is allowed at the end 127 | cat < $1 128 | { 129 | $common 130 | $zero 131 | $dtype 132 | $extra 133 | $flops_profiler 134 | } 135 | EOT 136 | -------------------------------------------------------------------------------- /examples/generate_hostfile.sh: -------------------------------------------------------------------------------- 1 | # set hostfile_deepspeed & hostfile_mpich 2 | echo "!!!please use generate_hostfile.sh before training" 3 | 4 | # use official mpich 5 | 6 | # setting hostfile_mpich and hostfile_deepspeed 7 | # this now supports setting up as many nodes as possible 8 | # update for borealis 9 | # for examples: 10 | # 1.$ bash generate_hostfile.sh #don't set hostfile 11 | # 2.$ bash generate_hostfile.sh x10001 #set one node 12 | # 3.$ bash generate_hostfile.sh x10001 x10002 x10003 x10004 #set 4 nodes 13 | # 4.$ bash generate_hostfile.sh x10001 x10002 x10003 x10004 x10005 x10006 x10007 x10008 #set 8 nodes 14 | # update for OAM system 15 | # for examples: 16 | # 1.$ bash generate_hostfile.sh #don't set hostfile 17 | # 2.$ bash generate_hostfile.sh oam compute1 #set one compute node 18 | # 3.$ bash generate_hostfile.sh oam compute1 compute2 #set 2 compute nodes 19 | usage() 20 | { 21 | echo "Example Usage: 22 | for 1 node: bash $0 x10001 23 | for 4 nodes: bash generate_hostfile.sh x10001 x10002 x10003 x10004" 24 | exit 2 25 | } 26 | 27 | if [ $# -gt 0 ]; then 28 | cat /dev/null > $LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 29 | cat /dev/null > $LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 30 | mid=" slots=" 31 | slots=12 32 | for i in "$@"; do 33 | if [ "$i" == oam ]; then 34 | slots=8 35 | else 36 | host=$i 37 | host_slot="$i$mid$slots" 38 | echo $host>>$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 39 | echo $host_slot>>$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 40 | fi 41 | done 42 | else 43 | usage 44 | fi 45 | -------------------------------------------------------------------------------- /examples/gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VOCAB_FILE=dataset/gpt2-vocab.json 4 | MERGE_FILE=dataset/gpt2-merges.txt 5 | DATA_PATH=dataset/BookCorpusDataset_text_document 6 | DTYPE=${DTYPE:-bf16} 7 | 8 | # Hostfile path 9 | hostfile_deepspeed=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 10 | hostfile_mpich=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 11 | 12 | # Disabling tensor/pipeline parallelism 13 | TP=${TP:-1} 14 | PP=${PP:-1} 15 | 16 | # Model: default 3.6b 17 | NLAYERS=${NLAYERS:-30} 18 | HIDDEN=${HIDDEN:-3072} 19 | HEADS=${HEADS:-32} 20 | SEQ=${SEQ:-2048} 21 | TRAIN_ITER=${TRAIN_ITER:-50} 22 | 23 | WORLD_SIZE=${WORLD_SIZE:-12} 24 | MICRO_BATCH=${MICRO_BATCH:-8} 25 | GLOBAL_BATCH=${GLOBAL_BATCH:-96} 26 | 27 | ZERO_STAGE=${ZERO_STAGE:-2} 28 | 29 | DS_CONFIG=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/"ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json" 30 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/generate_config.sh ${DS_CONFIG} || exit 1 31 | 32 | OUTPUT_DIR=logs/ds_stage${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_pp${PP}_tp${TP}_${DTYPE}_`date +%m%d%H%M%S`_${HOSTNAME} 33 | mkdir -p $OUTPUT_DIR 34 | echo "!!!Please see logs at ${OUTPUT_DIR}" 35 | 36 | ds_args=" " 37 | ds_args=" --deepspeed ${ds_args}" 38 | 
if [ $PP == 1 ]; then 39 | ds_args=" --no-pipeline-parallel ${ds_args}" 40 | fi 41 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 42 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 43 | # we are now using activation checkpoint provided by megatron, see below. 44 | # ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 45 | 46 | # take custom args 47 | custom_args=" $@" 48 | 49 | # launcher setting 50 | LAUNCHER=${LAUNCHER:-MPICH} 51 | if [[ $LAUNCHER == "deepspeed" ]]; then 52 | launcher="" 53 | else 54 | launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" 55 | fi 56 | 57 | CCL=${CCL:-ccl} 58 | 59 | run_cmd=" 60 | deepspeed $launcher pretrain_gpt.py \ 61 | --tensor-model-parallel-size $TP \ 62 | --pipeline-model-parallel-size $PP \ 63 | --num-layers $NLAYERS \ 64 | --hidden-size $HIDDEN \ 65 | --num-attention-heads $HEADS \ 66 | --seq-length $SEQ \ 67 | --max-position-embeddings $SEQ \ 68 | --micro-batch-size $MICRO_BATCH \ 69 | --global-batch-size $GLOBAL_BATCH \ 70 | --train-iters $TRAIN_ITER \ 71 | --lr 0.00015 \ 72 | --lr-warmup-fraction .01 \ 73 | --lr-decay-iters 320000 \ 74 | --lr-decay-style cosine \ 75 | --log-interval 1 \ 76 | --eval-iters 100 \ 77 | --eval-interval 100 \ 78 | --data-path $DATA_PATH \ 79 | --vocab-file $VOCAB_FILE \ 80 | --merge-file $MERGE_FILE \ 81 | --save-interval 500 \ 82 | --split 100,0,0 \ 83 | --$DTYPE \ 84 | --checkpoint-activations \ 85 | --deepspeed-activation-checkpointing 86 | $ds_args \ 87 | --no-masked-softmax-fusion \ 88 | --no-bias-gelu-fusion \ 89 | --no-bias-dropout-fusion \ 90 | --no-gradient-accumulation-fusion \ 91 | --distributed-backend $CCL \ 92 | --num-workers 0 \ 93 | $custom_args \ 94 | |& tee $OUTPUT_DIR/output.log 95 | " 96 | 97 | echo ${run_cmd} 98 | eval ${run_cmd} 99 | set +x 100 | -------------------------------------------------------------------------------- /examples/hostfile_deepspeed: -------------------------------------------------------------------------------- 1 | localhost slots=12 2 | -------------------------------------------------------------------------------- /examples/hostfile_mpich: -------------------------------------------------------------------------------- 1 | localhost 2 | -------------------------------------------------------------------------------- /examples/llm_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DTYPE=${DTYPE:-float16} 4 | MODEL_PATH=${MODEL_PATH:-/home/username/model_path} 5 | MODEL_NAME=${MODEL_NAME:-llama2-70b} 6 | OUTPUT_DIR=logs/${MODEL_NAME}_`date +%m%d%H%M%S`_${HOSTNAME} 7 | mkdir -p $OUTPUT_DIR 8 | 9 | # Hostfile path 10 | hostfile_deepspeed=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_deepspeed 11 | hostfile_mpich=$LLM_DK_DIR/intel-extension-for-deepspeed/examples/hostfile_mpich 12 | 13 | # launcher setting 14 | LAUNCHER=${LAUNCHER:-MPICH} 15 | if [[ $LAUNCHER == "deepspeed" ]]; then 16 | launcher="" 17 | else 18 | launcher="--force_multi --hostfile $hostfile_deepspeed --launcher=${LAUNCHER} --launcher_args='-hostfile ${hostfile_mpich}'" 19 | fi 20 | 21 | CCL=${CCL:-ccl} 22 | 23 | run_cmd=" 24 | deepspeed $launcher run_generation_with_deepspeed.py \ 25 | --device xpu \ 26 | --ipex \ 27 | --dtype $DTYPE \ 28 | --input-tokens 1024 \ 29 | --max-new-tokens 128 \ 30 | --num-beam 1 \ 31 | --batch-size 1 \ 32 | --token-latency \ 33 | --benchmark \ 34 | -m $MODEL_PATH \ 35 | --sub-model-name $MODEL_NAME\ 36 | |& tee 
$OUTPUT_DIR/output.log 37 | " 38 | 39 | echo ${run_cmd} 40 | eval ${run_cmd} 41 | set +x 42 | -------------------------------------------------------------------------------- /examples/run10p175b.sh: -------------------------------------------------------------------------------- 1 | export WORLD_SIZE=${WORLD_SIZE:-48} 2 | export MICRO_BATCH=${MICRO_BATCH:-1} 3 | export NLAYERS=${NLAYERS:-10} 4 | export HIDDEN=${HIDDEN:-12288} 5 | export HEADS=${HEADS:-96} 6 | export SEQ=${SEQ:-2048} 7 | export TRAIN_ITER=${TRAIN_ITER:-20} 8 | export ZERO_STAGE=${ZERO_STAGE:-3} 9 | export DTYPE=${DTYPE:-bf16} 10 | export TP=${TP:-1} 11 | export PP=${PP:-1} 12 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 13 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 14 | 15 | echo "!!!please use generate_hostfile.sh to set hostfile for $((${WORLD_SIZE}/12)) nodes before training" 16 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ 17 | -------------------------------------------------------------------------------- /examples/run175b.sh: -------------------------------------------------------------------------------- 1 | echo "!!!please use generate_hostfile.sh to set hostfile for 18 nodes before training" 2 | export WORLD_SIZE=${WORLD_SIZE:-216} 3 | export MICRO_BATCH=${MICRO_BATCH:-1} 4 | export NLAYERS=${NLAYERS:-96} 5 | export HIDDEN=${HIDDEN:-12288} 6 | export HEADS=${HEADS:-96} 7 | export SEQ=${SEQ:-2048} 8 | export TRAIN_ITER=${TRAIN_ITER:-20} 9 | export ZERO_STAGE=${ZERO_STAGE:-3} 10 | export DTYPE=${DTYPE:-bf16} 11 | export TP=${TP:-1} 12 | export PP=${PP:-1} 13 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 14 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 15 | 16 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ 17 | -------------------------------------------------------------------------------- /examples/run20b.sh: -------------------------------------------------------------------------------- 1 | export WORLD_SIZE=${WORLD_SIZE:-48} 2 | export MICRO_BATCH=${MICRO_BATCH:-1} 3 | export NLAYERS=${NLAYERS:-44} 4 | export HIDDEN=${HIDDEN:-6144} 5 | export HEADS=${HEADS:-64} 6 | export SEQ=${SEQ:-2048} 7 | export TRAIN_ITER=${TRAIN_ITER:-20} 8 | export ZERO_STAGE=${ZERO_STAGE:-3} 9 | export DTYPE=${DTYPE:-bf16} 10 | export TP=${TP:-1} 11 | export PP=${PP:-1} 12 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 13 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 14 | 15 | echo "!!!please use generate_hostfile.sh to set hostfile for $((${WORLD_SIZE}/12)) nodes before training" 16 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh $@ 17 | -------------------------------------------------------------------------------- /examples/run3.6b.sh: -------------------------------------------------------------------------------- 1 | echo "!!!please makes sure the content of hostfile for single node is localhost" 2 | export WORLD_SIZE=${WORLD_SIZE:-12} 3 | export MICRO_BATCH=${MICRO_BATCH:-8} 4 | export NLAYERS=${NLAYERS:-30} 5 | export HIDDEN=${HIDDEN:-3072} 6 | export HEADS=${HEADS:-32} 7 | export SEQ=${SEQ:-2048} 8 | export TRAIN_ITER=${TRAIN_ITER:-50} 9 | export ZERO_STAGE=${ZERO_STAGE:-2} 10 | export DTYPE=${DTYPE:-bf16} 11 | export TP=${TP:-1} 12 | export PP=${PP:-1} 13 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 14 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 15 | 16 | bash 
$LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh --no-query-key-layer-scaling $@ 17 | -------------------------------------------------------------------------------- /examples/run_llama.sh: -------------------------------------------------------------------------------- 1 | # please make sure the content of hostfile for single node is localhost 2 | export WORLD_SIZE=${WORLD_SIZE:-48} 3 | export MICRO_BATCH=${MICRO_BATCH:-1} 4 | export NLAYERS=${NLAYERS:-32} 5 | export HIDDEN=${HIDDEN:-4096} 6 | export HEADS=${HEADS:-32} 7 | export SEQ=${SEQ:-2048} 8 | export NUM_KV_HEADS=${NUM_KV_HEADS:-32} 9 | export FFN_HIDDEN_SIZE=${FFN_HIDDEN_SIZE:-11008} 10 | export TRAIN_ITER=${TRAIN_ITER:-50} 11 | export ZERO_STAGE=${ZERO_STAGE:-3} 12 | export DTYPE=${DTYPE:-bf16} 13 | export TP=${TP:-1} 14 | export PP=${PP:-1} 15 | export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1} 16 | export GLOBAL_BATCH=$(( $WORLD_SIZE * $MICRO_BATCH * $GRAD_ACC_STEPS / $TP / $PP )) 17 | 18 | bash $LLM_DK_DIR/intel-extension-for-deepspeed/examples/gpt.sh --no-query-key-layer-scaling \ 19 | --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --disable-bias-linear \ 20 | --normalization rmsnorm --attention-dropout 0 --hidden-dropout 0 --use-flash-attn-builder \ 21 | --ffn-hidden-size $FFN_HIDDEN_SIZE --num-key-value-heads $NUM_KV_HEADS $@ 22 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/__init__.py: -------------------------------------------------------------------------------- 1 | from .xpu_accelerator import XPU_Accelerator 2 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import OpBuilder 2 | from .cpu_adam import CPUAdamBuilder 3 | from .cpu_adagrad import CPUAdagradBuilder 4 | from .fused_adam import FusedAdamBuilder 5 | from .transformer_inference import InferenceBuilder 6 | from .quantizer import QuantizerBuilder 7 | from .utils import UtilsBuilder 8 | from .async_io import AsyncIOBuilder 9 | from .flash_attn import FlashAttentionBuilder 10 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/async_io.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import distutils.spawn 7 | import subprocess 8 | import torch 9 | 10 | from deepspeed.ops.op_builder.builder import OpBuilder, TORCH_MAJOR, TORCH_MINOR 11 | 12 | class AsyncIOBuilder(OpBuilder): 13 | BUILD_VAR = "DS_BUILD_AIO" 14 | NAME = "async_io" 15 | 16 | def __init__(self): 17 | super().__init__(name=self.NAME) 18 | 19 | def absolute_name(self): 20 | return f'deepspeed.ops.aio.{self.NAME}_op' 21 | 22 | def sources(self): 23 | return [ 24 | 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', 25 | 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 26 | 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', 27 | 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', 28 | 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' 29 | ] 30 | 31 | def include_paths(self): 32 | return ['csrc/aio/py_lib', 'csrc/aio/common'] 33 | 34 | def cxx_args(self): 35 | # -O0 for improved debugging, since performance is bound by I/O 36 | CPU_ARCH = self.cpu_arch() 37 | SIMD_WIDTH = self.simd_width() 38 | TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) 39 | if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: 40 | CPP_STD = '-std=c++17' 41 | else: 42 | CPP_STD = '-std=c++14' 43 | return [ 44 | '-g', 45 | '-Wall', 46 | '-O0', 47 | CPP_STD, 48 | '-shared', 49 | '-fPIC', 50 | '-Wno-reorder', 51 | CPU_ARCH, 52 | '-fopenmp', 53 | SIMD_WIDTH, 54 | '-laio', 55 | ] 56 | 57 | def extra_ldflags(self): 58 | return ['-laio'] 59 | 60 | def check_for_libaio_pkg(self): 61 | libs = dict( 62 | dpkg=["-l", "libaio-dev", "apt"], 63 | pacman=["-Q", "libaio", "pacman"], 64 | rpm=["-q", "libaio-devel", "yum"], 65 | ) 66 | 67 | found = False 68 | for pkgmgr, data in libs.items(): 69 | flag, lib, tool = data 70 | path = distutils.spawn.find_executable(pkgmgr) 71 | if path is not None: 72 | cmd = f"{pkgmgr} {flag} {lib}" 73 | result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 74 | if result.wait() == 0: 75 | found = True 76 | else: 77 | self.warning(f"{self.NAME}: please install the {lib} package with {tool}") 78 | break 79 | return found 80 | 81 | def is_compatible(self, verbose=True): 82 | # Check for the existence of libaio by using distutils 83 | # to compile and link a test program that calls io_submit, 84 | # which is a function provided by libaio that is used in the async_io op. 85 | # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS 86 | # respectively to specify the directories for libaio.h and libaio.so. 87 | aio_compatible = self.has_function('io_pgetevents', ('aio', )) 88 | if verbose and not aio_compatible: 89 | self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") 90 | 91 | # Check for the libaio package via known package managers 92 | # to print suggestions on which package to install. 93 | self.check_for_libaio_pkg() 94 | 95 | self.warning( 96 | "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." 
97 | ) 98 | return super().is_compatible(verbose) and aio_compatible 99 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | import os 5 | import time 6 | import importlib 7 | import shutil 8 | from pathlib import Path 9 | from deepspeed.ops.op_builder.builder import OpBuilder, TORCH_MAJOR, TORCH_MINOR 10 | 11 | class SYCLOpBuilder(OpBuilder): 12 | def builder(self): 13 | try: 14 | from intel_extension_for_pytorch.xpu.cpp_extension import DPCPPExtension 15 | except ImportError: 16 | from intel_extension_for_pytorch.xpu.utils import DPCPPExtension 17 | 18 | print("dpcpp sources = {}".format(self.sources())) 19 | dpcpp_ext = DPCPPExtension( 20 | name=self.absolute_name(), 21 | sources=self.strip_empty_entries(self.sources()), 22 | include_dirs=self.strip_empty_entries(self.include_paths()), 23 | extra_compile_args={ 24 | 'cxx': self.strip_empty_entries(self.cxx_args()), 25 | }, 26 | extra_link_args=self.strip_empty_entries(self.fixed_aotflags())) 27 | return dpcpp_ext 28 | 29 | def version_dependent_macros(self): 30 | # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 31 | version_ge_1_1 = [] 32 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): 33 | version_ge_1_1 = ['-DVERSION_GE_1_1'] 34 | version_ge_1_3 = [] 35 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): 36 | version_ge_1_3 = ['-DVERSION_GE_1_3'] 37 | version_ge_1_5 = [] 38 | if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): 39 | version_ge_1_5 = ['-DVERSION_GE_1_5'] 40 | return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 41 | 42 | def cxx_args(self): 43 | cxx_flags = ['-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64', '-fno-strict-aliasing'] 44 | if os.environ.get('USE_MKL_GEMM'): 45 | cxx_flags.append('-DUSE_MKL_GEMM') 46 | return cxx_flags 47 | 48 | def extra_ldflags(self): 49 | return ['-fPIC', '-fsycl', '-fsycl-targets=spir64_gen', '-fsycl-max-parallel-link-jobs=8', '-Xs "-options -cl-poison-unsupported-fp64-kernels,cl-intel-enable-auto-large-GRF-mode"', '-Xs "-device pvc"', '-Wl,-export-dynamic'] 50 | 51 | def fixed_aotflags(self): 52 | return ['-fsycl', '-fsycl-targets=spir64_gen', '-fsycl-max-parallel-link-jobs=8', '-Xs', "-options -cl-poison-unsupported-fp64-kernels,cl-intel-enable-auto-large-GRF-mode", '-Xs', "-device pvc"] 53 | 54 | def load(self, verbose=True): 55 | from deepspeed.git_version_info import installed_ops, torch_info # noqa: F401 56 | if installed_ops.get(self.name, False): 57 | return importlib.import_module(self.absolute_name()) 58 | else: 59 | return self.jit_load(verbose) 60 | 61 | def jit_load(self, verbose=True): 62 | if not self.is_compatible(verbose): 63 | raise RuntimeError( 64 | f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}" 65 | ) 66 | try: 67 | import ninja # noqa: F401 68 | except ImportError: 69 | raise RuntimeError( 70 | f"Unable to JIT load the {self.name} op due to ninja not being installed." 
71 | ) 72 | 73 | self.jit_mode = True 74 | from intel_extension_for_pytorch.xpu.cpp_extension import load 75 | 76 | start_build = time.time() 77 | # Recognize relative paths as absolute paths for jit load 78 | 79 | sources = [self.deepspeed_src_path(path) for path in self.sources()] 80 | extra_include_paths = [ 81 | self.deepspeed_src_path(path) for path in self.include_paths() 82 | ] 83 | 84 | # Torch will try and apply whatever CCs are in the arch list at compile time, 85 | # we have already set the intended targets ourselves we know that will be 86 | # needed at runtime. This prevents CC collisions such as multiple __half 87 | # implementations. Stash arch list to reset after build. 88 | ''' 89 | torch_arch_list = None 90 | if "TORCH_CUDA_ARCH_LIST" in os.environ: 91 | torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") 92 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 93 | ''' 94 | 95 | op_module = load( 96 | name=self.name, 97 | sources=self.strip_empty_entries(sources), 98 | extra_include_paths=self.strip_empty_entries(extra_include_paths), 99 | extra_cflags=self.strip_empty_entries(self.cxx_args()), 100 | # extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), 101 | extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), 102 | verbose=verbose) 103 | 104 | build_duration = time.time() - start_build 105 | if verbose: 106 | print(f"Time to load {self.name} op: {build_duration} seconds") 107 | ''' 108 | # Reset arch list so we are not silently removing it for other possible use cases 109 | if torch_arch_list: 110 | os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list 111 | ''' 112 | return op_module 113 | 114 | 115 | def sycl_kernel_path(code_path): 116 | # Always return a path like "SYCL_KERNEL_PATH/..." 117 | SYCL_KERNEL_PATH = "third-party" 118 | abs_source_path = os.path.join(Path(__file__).parent.absolute(), code_path) 119 | rel_target_path = os.path.join(SYCL_KERNEL_PATH, code_path) 120 | 121 | # Jit_load mode require absolute path. Use abs path for copy 122 | # To get the absolute path of deepspeed 123 | # We use a non-abstract builder class instance to call deepspeed_src_path() 124 | # FusedAdamBuilder is one of such class instance 125 | from .fused_adam import FusedAdamBuilder 126 | abs_target_path = FusedAdamBuilder().deepspeed_src_path(rel_target_path) 127 | 128 | sycl_link_path = os.path.join( 129 | os.path.dirname(FusedAdamBuilder().deepspeed_src_path("")), 130 | SYCL_KERNEL_PATH) 131 | if not os.path.exists(sycl_link_path): 132 | # Create directory and link for sycl kernel: 133 | # deepspeed/ops/SYCL_KERNEL_PATH-->../../SYCL_KERNEL_PATH 134 | sycl_dir_path = os.path.join(os.path.dirname(sycl_link_path), 135 | "../../" + SYCL_KERNEL_PATH) 136 | 137 | os.makedirs(sycl_dir_path, exist_ok=True) 138 | os.symlink("../../" + SYCL_KERNEL_PATH, sycl_link_path, True) 139 | print("Create directory and link for sycl kernel:{}-->{}".format( 140 | sycl_link_path, 141 | sycl_dir_path)) 142 | 143 | import filecmp 144 | if (os.path.exists(abs_target_path) and filecmp.cmp(abs_target_path, 145 | abs_source_path)): 146 | print("skip copy, {} and {} have the same content".format( 147 | abs_source_path, 148 | abs_target_path)) 149 | return rel_target_path 150 | 151 | print("Copying SYCL kernel file from {} to {}".format(abs_source_path, 152 | abs_target_path)) 153 | os.makedirs(os.path.dirname(abs_target_path), exist_ok=True) 154 | shutil.copyfile(abs_source_path, abs_target_path) 155 | 156 | # Prebuild install mode require paths relative to the setup.py directory. 
Use the relative path. 157 | return rel_target_path 158 | 159 | 160 | def sycl_kernel_include(code_path): 161 | import intel_extension_for_pytorch # noqa: F401 162 | abs_path = os.path.join(Path(__file__).parent.absolute(), code_path) 163 | return abs_path 164 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/cpu_adagrad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 5 | 6 | 7 | class CPUAdagradBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" 9 | NAME = "cpu_adagrad" 10 | 11 | def __init__(self): 12 | super().__init__(name=self.NAME) 13 | 14 | def absolute_name(self): 15 | return f'deepspeed.ops.adagrad.{self.NAME}_op' 16 | 17 | def sources(self): 18 | return [ 19 | sycl_kernel_path('csrc/adagrad/cpu_adagrad.cpp'), 20 | sycl_kernel_path('csrc/common/custom_cuda_kernel.dp.cpp'), 21 | ] 22 | 23 | def include_paths(self): 24 | return [ 25 | sycl_kernel_include('csrc/includes'), 26 | ] 27 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/cpu_adam.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 5 | 6 | 7 | class CPUAdamBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_CPU_ADAM" 9 | NAME = "cpu_adam" 10 | 11 | def __init__(self): 12 | super().__init__(name=self.NAME) 13 | 14 | def absolute_name(self): 15 | return f'deepspeed.ops.adam.{self.NAME}_op' 16 | 17 | def sources(self): 18 | return [ 19 | sycl_kernel_path('csrc/adam/cpu_adam.cpp'), 20 | sycl_kernel_path('csrc/adam/cpu_adam_impl.cpp'), 21 | sycl_kernel_path('csrc/common/custom_cuda_kernel.dp.cpp'), 22 | ] 23 | 24 | def libraries_args(self): 25 | args = super().libraries_args() 26 | return args 27 | 28 | def include_paths(self): 29 | return [ 30 | sycl_kernel_include('csrc/includes'), 31 | ] 32 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adagrad/cpu_adagrad.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include "cpu_adagrad.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #if defined(__ENABLE_CUDA__) 15 | #include 16 | #include "cublas_v2.h" 17 | #include "cuda.h" 18 | #include "curand.h" 19 | #include "custom_cuda_layers.h" 20 | #endif 21 | 22 | static std::unordered_map> s_optimizers; 23 | 24 | // C++ interface 25 | 26 | void Adagrad_Optimizer::Step_1(float* _params, 27 | float* grads, 28 | float* _exp_avg_sq, 29 | size_t _param_size, 30 | ds_half_precision_t* dev_params, 31 | bool half_precision) 32 | { 33 | size_t rounded_size = 0; 34 | #if defined(__AVX512__) or defined(__AVX256__) 35 | Step_AVX<1>( 36 | &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); 37 | #endif 38 | if (_param_size > rounded_size) { 39 | float step_size = -1 * _alpha; 40 | ds_half_precision_t* grads_cast_h; 41 | ds_half_precision_t* params_cast_h; 42 | if (half_precision) { 43 | grads_cast_h = reinterpret_cast(grads); 44 | params_cast_h = reinterpret_cast(_params); 45 | } 46 | for (size_t t = rounded_size; t < _param_size; t += TILE) { 47 | size_t copy_size = TILE; 48 | if ((t + TILE) > _param_size) copy_size = _param_size - t; 49 | size_t offset = copy_size + t; 50 | #if defined(__ENABLE_CUDA__) 51 | if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } 52 | #elif defined(__ENABLE_CANN__) 53 | if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } 54 | #endif 55 | #pragma omp parallel for 56 | for (size_t k = t; k < offset; k++) { 57 | float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; 58 | float param = half_precision ? (float)params_cast_h[k] : _params[k]; 59 | float momentum = grads[k]; 60 | float variance = _exp_avg_sq[k]; 61 | if (_weight_decay > 0) { grad = param * _weight_decay + grad; } 62 | 63 | variance += grad * grad; 64 | 65 | grad = sqrt(variance); 66 | grad += _eps; 67 | grad = momentum / grad; 68 | param = grad * step_size + param; 69 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 70 | if (dev_params) _doubled_buffer[_buf_index][k - t] = param; 71 | #endif 72 | if (half_precision) 73 | params_cast_h[k] = (ds_half_precision_t)param; 74 | else 75 | _params[k] = param; 76 | // STORE UPDATE TERM TO GRAD'S MEMORY 77 | grads[k] = grad * step_size; 78 | _exp_avg_sq[k] = variance; 79 | } 80 | #if defined(__ENABLE_CUDA__) 81 | if (dev_params) { 82 | launch_param_update( 83 | _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); 84 | _buf_index = !_buf_index; 85 | } 86 | #elif defined(__ENABLE_CANN__) 87 | if (dev_params) { 88 | size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); 89 | aclrtMemcpy(dev_params + t, 90 | memcpy_size, 91 | _doubled_buffer[_buf_index], 92 | memcpy_size, 93 | aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); 94 | 95 | _buf_index = !_buf_index; 96 | } 97 | #endif 98 | } 99 | } 100 | } 101 | 102 | void Adagrad_Optimizer::Step_4(float* _params, 103 | float* grads, 104 | float* _exp_avg_sq, 105 | size_t _param_size, 106 | ds_half_precision_t* dev_params, 107 | bool half_precision) 108 | { 109 | size_t rounded_size = 0; 110 | #if defined(__AVX512__) or defined(__AVX256__) 111 | Step_AVX<4>( 112 | &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); 113 | #endif 114 | if (_param_size > rounded_size) 115 | Step_1((_params + rounded_size), 116 | (grads + rounded_size), 117 | (_exp_avg_sq + 
rounded_size), 118 | (_param_size - rounded_size), 119 | (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), 120 | half_precision); 121 | } 122 | 123 | int create_adagrad_optimizer(int optimizer_id, 124 | float alpha = 1e-2, 125 | float eps = 1e-8, 126 | float weight_decay = 0, 127 | bool should_log = false) 128 | { 129 | auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay); 130 | 131 | s_optimizers[optimizer_id] = opt; 132 | 133 | if (should_log) { 134 | std::string avx_type = ""; 135 | #if defined(__AVX512__) 136 | avx_type = "AVX512"; 137 | #else 138 | #if defined(__AVX256__) 139 | avx_type = "AVX2"; 140 | #else 141 | avx_type = "scalar"; 142 | #endif 143 | #endif 144 | 145 | printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n", 146 | optimizer_id, 147 | avx_type.c_str()); 148 | printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay); 149 | } 150 | 151 | return 0; 152 | } 153 | 154 | void Adagrad_Optimizer::Step_8(float* _params, 155 | float* grads, 156 | float* _exp_avg_sq, 157 | size_t _param_size, 158 | ds_half_precision_t* dev_params, 159 | bool half_precision) 160 | { 161 | size_t rounded_size = 0; 162 | #if defined(__AVX512__) or defined(__AVX256__) 163 | Step_AVX<8>( 164 | &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); 165 | #endif 166 | if (_param_size > rounded_size) 167 | Step_4((_params + rounded_size), 168 | (grads + rounded_size), 169 | (_exp_avg_sq + rounded_size), 170 | (_param_size - rounded_size), 171 | (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), 172 | half_precision); 173 | } 174 | 175 | int ds_adagrad_step(int optimizer_id, 176 | size_t step, 177 | float lr, 178 | float epsilon, 179 | float weight_decay, 180 | torch::Tensor& params, 181 | torch::Tensor& grads, 182 | torch::Tensor& exp_avg_sq) 183 | { 184 | auto params_c = params.contiguous(); 185 | auto grads_c = grads.contiguous(); 186 | auto exp_avg_sq_c = exp_avg_sq.contiguous(); 187 | 188 | float* params_ptr = (float*)params_c.data_ptr(); 189 | float* grads_ptr = (float*)grads_c.data_ptr(); 190 | float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); 191 | 192 | std::shared_ptr<Adagrad_Optimizer> opt = 193 | std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]); 194 | opt->IncrementStep(step); 195 | opt->update_state(lr, epsilon, weight_decay); 196 | opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel()); 197 | 198 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 199 | opt->SynchronizeStreams(); 200 | #endif 201 | return 0; 202 | } 203 | 204 | int ds_adagrad_step_plus_copy(int optimizer_id, 205 | size_t step, 206 | float lr, 207 | float epsilon, 208 | float weight_decay, 209 | torch::Tensor& params, 210 | torch::Tensor& grads, 211 | torch::Tensor& exp_avg_sq, 212 | torch::Tensor& gpu_params) 213 | { 214 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 215 | auto params_c = params.contiguous(); 216 | auto gpu_params_c = gpu_params.contiguous(); 217 | auto exp_avg_sq_c = exp_avg_sq.contiguous(); 218 | auto grads_c = grads.contiguous(); 219 | 220 | float* params_ptr = (float*)params_c.data_ptr(); 221 | float* grads_ptr = (float*)grads_c.data_ptr(); 222 | ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); 223 | float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); 224 | 225 | std::shared_ptr<Adagrad_Optimizer> opt = 226 | std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]); 227 | opt->IncrementStep(step); 228 | opt->update_state(lr, epsilon, weight_decay); 229 |
opt->Step_8(params_ptr, 230 | grads_ptr, 231 | exp_avg_sq_ptr, 232 | params_c.numel(), 233 | gpu_params_ptr, 234 | (params.options().dtype() == at::kHalf)); 235 | 236 | opt->SynchronizeStreams(); 237 | #else 238 | assert(false); 239 | #endif 240 | return 0; 241 | } 242 | 243 | int destroy_adagrad_optimizer(int optimizer_id) 244 | { 245 | s_optimizers.erase(optimizer_id); 246 | 247 | return 0; 248 | } 249 | 250 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 251 | { 252 | m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)"); 253 | m.def("adagrad_update_copy", 254 | &ds_adagrad_step_plus_copy, 255 | "DeepSpeed CPU Adagrad update and param copy (C++)"); 256 | m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); 257 | m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); 258 | } 259 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/cpu_adam.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include "cpu_adam.h" 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 9 | { 10 | m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); 11 | m.def("adam_update_copy", 12 | &ds_adam_step_plus_copy, 13 | "DeepSpeed CPU Adam update and param copy (C++)"); 14 | m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); 15 | m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); 16 | } 17 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/fused_adam_frontend.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | 8 | void multi_tensor_adam_cuda(int chunk_size, 9 | at::Tensor noop_flag, 10 | std::vector> tensor_lists, 11 | const float lr, 12 | const float beta1, 13 | const float beta2, 14 | const float epsilon, 15 | const int step, 16 | const int mode, 17 | const int bias_correction, 18 | const float weight_decay); 19 | 20 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 21 | { 22 | m.def("multi_tensor_adam", 23 | &multi_tensor_adam_cuda, 24 | "Compute and apply gradient update to parameters for Adam optimizer"); 25 | } 26 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_adam.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Copyright NVIDIA/apex 8 | This file is adapted from fused adam in NVIDIA/apex, commit a109f85 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | // #include 16 | // #include 17 | // Another possibility: 18 | // #include 19 | 20 | #include 21 | 22 | #include "multi_tensor_apply.dp.hpp" 23 | #include "type_shim.h" 24 | #include 25 | 26 | #define BLOCK_SIZE 512 27 | #define ILP 4 28 | 29 | typedef enum { 30 | ADAM_MODE_0 = 0, // L2 regularization mode 31 | ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) 32 | } adamMode_t; 33 | 34 | using MATH_T = float; 35 | 36 | template 37 | struct AdamFunctor { 38 | /* 39 | DPCT1110:4: The total declared local variable size in device function operator() exceeds 128 40 | bytes and may cause high register pressure. Consult with your hardware vendor to find the total 41 | register size available and adjust the code, or use smaller sub-group size to avoid high 42 | register pressure. 43 | */ 44 | __dpct_inline__ void operator()(int chunk_size, 45 | volatile int* noop_gmem, 46 | TensorListMetadata<4>& tl, 47 | const float beta1, 48 | const float beta2, 49 | const float beta1_correction, 50 | const float beta2_correction, 51 | const float epsilon, 52 | const float lr, 53 | adamMode_t mode, 54 | const float decay) 55 | { 56 | // I'd like this kernel to propagate infs/nans. 57 | // if(*noop_gmem == 1) 58 | // return; 59 | 60 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 61 | int tensor_loc = tl.block_to_tensor[item_ct1.get_group(2)]; 62 | 63 | // potentially use to pass in list of scalar 64 | // int tensor_num = tl.start_tensor_this_launch + tensor_loc; 65 | 66 | int chunk_idx = tl.block_to_chunk[item_ct1.get_group(2)]; 67 | int n = tl.sizes[tensor_loc]; 68 | 69 | T* g = (T*)tl.addresses[0][tensor_loc]; 70 | g += chunk_idx * chunk_size; 71 | 72 | T* p = (T*)tl.addresses[1][tensor_loc]; 73 | p += chunk_idx * chunk_size; 74 | 75 | T* m = (T*)tl.addresses[2][tensor_loc]; 76 | m += chunk_idx * chunk_size; 77 | 78 | T* v = (T*)tl.addresses[3][tensor_loc]; 79 | v += chunk_idx * chunk_size; 80 | 81 | n -= chunk_idx * chunk_size; 82 | 83 | // see note in multi_tensor_scale_kernel.cu 84 | for (int i_start = 0; i_start < n && i_start < chunk_size; 85 | i_start += item_ct1.get_local_range(2) * ILP) { 86 | MATH_T r_g[ILP]; 87 | MATH_T r_p[ILP]; 88 | MATH_T r_m[ILP]; 89 | MATH_T r_v[ILP]; 90 | #pragma unroll 91 | for (int ii = 0; ii < ILP; ii++) { 92 | int i = i_start + item_ct1.get_local_id(2) + ii * item_ct1.get_local_range(2); 93 | if (i < n && i < chunk_size) { 94 | r_g[ii] = g[i]; 95 | r_p[ii] = p[i]; 96 | r_m[ii] = m[i]; 97 | r_v[ii] = v[i]; 98 | } else { 99 | r_g[ii] = MATH_T(0); 100 | r_p[ii] = MATH_T(0); 101 | r_m[ii] = MATH_T(0); 102 | r_v[ii] = MATH_T(0); 103 | } 104 | } 105 | #pragma unroll 106 | for (int ii = 0; ii < ILP; ii++) { 107 | if (mode == ADAM_MODE_0) { // L2 108 | r_g[ii] = r_g[ii] + (decay * r_p[ii]); 109 | r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; 110 | r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; 111 | MATH_T next_m_unbiased = r_m[ii] / beta1_correction; 112 | MATH_T next_v_unbiased = r_v[ii] / beta2_correction; 113 | MATH_T denom = sycl::sqrt(next_v_unbiased) + epsilon; 114 | MATH_T update = next_m_unbiased / denom; 115 | r_p[ii] = r_p[ii] - (lr * update); 116 | } else { // weight decay 117 | r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; 118 | r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * 
r_g[ii] * r_g[ii]; 119 | MATH_T next_m_unbiased = r_m[ii] / beta1_correction; 120 | MATH_T next_v_unbiased = r_v[ii] / beta2_correction; 121 | MATH_T denom = sycl::sqrt(next_v_unbiased) + epsilon; 122 | MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]); 123 | r_p[ii] = r_p[ii] - (lr * update); 124 | } 125 | } 126 | #pragma unroll 127 | for (int ii = 0; ii < ILP; ii++) { 128 | int i = i_start + item_ct1.get_local_id(2) + ii * item_ct1.get_local_range(2); 129 | if (i < n && i < chunk_size) { 130 | p[i] = r_p[ii]; 131 | m[i] = r_m[ii]; 132 | v[i] = r_v[ii]; 133 | } 134 | } 135 | } 136 | } 137 | }; 138 | 139 | void multi_tensor_adam_cuda(int chunk_size, 140 | at::Tensor noop_flag, 141 | std::vector> tensor_lists, 142 | const float lr, 143 | const float beta1, 144 | const float beta2, 145 | const float epsilon, 146 | const int step, 147 | const int mode, 148 | const int bias_correction, 149 | const float weight_decay) 150 | { 151 | using namespace at; 152 | 153 | // Handle bias correction mode 154 | float bias_correction1 = 1.0f, bias_correction2 = 1.0f; 155 | if (bias_correction == 1) { 156 | bias_correction1 = 1 - std::pow(beta1, step); 157 | bias_correction2 = 1 - std::pow(beta2, step); 158 | } 159 | 160 | // Assume single type across p,g,m1,m2 now 161 | DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 162 | 0, 163 | "adam", 164 | multi_tensor_apply<4>(BLOCK_SIZE, 165 | chunk_size, 166 | noop_flag, 167 | tensor_lists, 168 | AdamFunctor(), 169 | beta1, 170 | beta2, 171 | bias_correction1, 172 | bias_correction2, 173 | epsilon, 174 | lr, 175 | (adamMode_t)mode, 176 | weight_decay);) 177 | 178 | /* 179 | DPCT1010:9: SYCL uses exceptions to report errors and does not use the error codes. The call was 180 | replaced with 0. You need to rewrite this code. 181 | */ 182 | // AT_CUDA_CHECK(0); 183 | } 184 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/adam/multi_tensor_apply.dp.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Copyright NVIDIA/apex 8 | This file is adapted from fused adam in NVIDIA/apex, commit a109f85 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | // #include 16 | // #include 17 | #include 18 | #include "compat.h" 19 | #include "context.h" 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | // This header is the one-stop shop for all your multi-tensor apply needs. 26 | 27 | // TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) 28 | constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; 29 | constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; 30 | 31 | template 32 | struct TensorListMetadata { 33 | void* addresses[n][depth_to_max_tensors[n - 1]]; 34 | int sizes[depth_to_max_tensors[n - 1]]; 35 | unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; 36 | int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. 37 | int start_tensor_this_launch; 38 | }; 39 | 40 | template 41 | class multi_tensor_apply_kernel { 42 | public: 43 | multi_tensor_apply_kernel(int chunk_size, 44 | volatile int* noop_flag, 45 | T tl, 46 | U callable, 47 | ArgTypes... args) 48 | : chunk_size(chunk_size), noop_flag(noop_flag), tl(tl), callable(callable), args(args...) 
49 | { 50 | } 51 | 52 | // This should be identical to original __global__ function 53 | static void inline __global__function(int chunk_size, 54 | volatile int* noop_flag, 55 | T tl, 56 | U callable, 57 | ArgTypes... args) 58 | { 59 | callable(chunk_size, noop_flag, tl, args...); 60 | } 61 | 62 | // If global function template contains parameter pack, 63 | // we only deal with parameter pack at the end of template parameter list 64 | template 65 | static void inline __tuple_expand_driver(int chunk_size, 66 | volatile int* noop_flag, 67 | T tl, 68 | U callable, 69 | Tuple args, 70 | std::index_sequence) 71 | { 72 | __global__function(chunk_size, noop_flag, tl, callable, std::get(args)...); 73 | } 74 | 75 | // 76 | // Because __global__ function can't really use any reference types, we can sure that args 77 | // are all good behaviors 78 | // 79 | void operator()(sycl::nd_item<3>) const 80 | { 81 | __tuple_expand_driver(chunk_size, 82 | noop_flag, 83 | tl, 84 | callable, 85 | args, 86 | std::make_index_sequence()); 87 | } 88 | 89 | private: 90 | int chunk_size; 91 | volatile int* noop_flag; 92 | T tl; 93 | U callable; 94 | std::tuple args; 95 | }; 96 | 97 | template 98 | void multi_tensor_apply(int block_size, 99 | int chunk_size, 100 | const at::Tensor& noop_flag, 101 | const std::vector>& tensor_lists, 102 | T callable, 103 | ArgTypes... args) 104 | { 105 | TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); 106 | int len0 = tensor_lists[0].size(); 107 | TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); 108 | auto ref_device = tensor_lists[0][0].device(); 109 | TORCH_CHECK(ref_device.type() == at::kXPU, "expected input to be on cuda"); 110 | for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices 111 | { 112 | TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); 113 | for (int t = 0; t < tensor_lists[l].size(); t++) { 114 | // TODO: Print which tensor fails. 
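// The checks below enforce the multi_tensor_apply preconditions: every tensor list has the same length as list 0, and each tensor is contiguous (or channels-last on PyTorch >= 1.5), lives on the same device as the first tensor, and matches the numel of its counterpart in list 0.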
115 | bool contiguous_memory = tensor_lists[l][t].is_contiguous(); 116 | #ifdef VERSION_GE_1_5 117 | contiguous_memory = (contiguous_memory || 118 | tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); 119 | #endif 120 | TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); 121 | TORCH_CHECK(tensor_lists[l][t].device() == ref_device, 122 | "A tensor was not on the same device as the first tensor"); 123 | TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); 124 | } 125 | } 126 | 127 | int ntensors = tensor_lists[0].size(); 128 | 129 | TensorListMetadata tl; 130 | 131 | /* const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); */ 132 | auto stream = at::cuda::getCurrentCUDAStream(); 133 | 134 | tl.start_tensor_this_launch = 0; 135 | int loc_block_info = 0; 136 | int loc_tensor_info = 0; 137 | for (int t = 0; t < ntensors; t++) { 138 | tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); 139 | for (int d = 0; d < depth; d++) 140 | tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); 141 | loc_tensor_info++; 142 | 143 | int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; 144 | 145 | for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { 146 | // std::cout << chunks_this_tensor << std::endl; 147 | tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; 148 | tl.block_to_chunk[loc_block_info] = chunk; 149 | loc_block_info++; 150 | 151 | bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && 152 | chunk == chunks_this_tensor - 1); 153 | bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); 154 | bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); 155 | if (tensors_full || blocks_full || last_chunk) { 156 | // using accscalar_t = acc_type; 157 | /* 158 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To 159 | get the device limit, query info::device::max_work_group_size. Adjust the work-group 160 | size if needed. 161 | */ 162 | /* multi_tensor_apply_kernel, T, ArgTypes...> 163 | * fn(chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); */ 164 | if constexpr (sizeof(multi_tensor_apply_kernel( 165 | chunk_size, noop_flag.DATA_PTR(), tl, callable, args...)) < 166 | 2048) { 167 | ((sycl::queue*)(stream)) 168 | ->parallel_for( 169 | sycl::nd_range<3>(sycl::range<3>(1, 1, loc_block_info) * 170 | sycl::range<3>(1, 1, block_size), 171 | sycl::range<3>(1, 1, block_size)), 172 | multi_tensor_apply_kernel( 173 | chunk_size, noop_flag.DATA_PTR(), tl, callable, args...)); 174 | } else { 175 | auto capture = multi_tensor_apply_kernel( 176 | chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); 177 | sycl::buffer params(const_cast(&capture), 178 | sycl::range<1>(1)); 179 | stream->submit([&](sycl::handler& cgh) { 180 | auto device_params = 181 | params.template get_access(cgh); 183 | cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, loc_block_info) * 184 | sycl::range<3>(1, 1, block_size), 185 | sycl::range<3>(1, 1, block_size)), 186 | [=](sycl::nd_item<3> item) { device_params[0](item); }); 187 | }); 188 | } 189 | /* 190 | DPCT1010:5: SYCL uses exceptions to report errors and does not use the error codes. 191 | The call was replaced with 0. You need to rewrite this code. 192 | */ 193 | 0; 194 | 195 | // Reset. The control flow possibilities here make my brain hurt. 
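// After a launch is issued, the bookkeeping below either starts completely fresh from the next tensor (when the launch consumed this tensor's final chunk) or copies the current tensor's size and addresses into slot 0 so its remaining chunks are carried into the next launch.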
196 | loc_block_info = 0; 197 | if (chunk == chunks_this_tensor - 1) { 198 | // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << 199 | // std::endl; 200 | loc_tensor_info = 0; 201 | tl.start_tensor_this_launch = t + 1; 202 | } else { 203 | // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << 204 | // std::endl; 205 | tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; 206 | for (int d = 0; d < depth; d++) 207 | tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; 208 | loc_tensor_info = 1; 209 | tl.start_tensor_this_launch = t; 210 | } 211 | } 212 | } 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/common/custom_cuda_kernel.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "custom_cuda_layers.h" 9 | 10 | void param_update_kernel(const float* input, sycl::half* output, int size) 11 | { 12 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 13 | int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); 14 | 15 | if (id < size) { output[id] = (sycl::half)input[id]; } 16 | } 17 | 18 | void launch_param_update(const float* input, sycl::half* output, int size, dpct::queue_ptr stream) 19 | { 20 | int threads = 1024; 21 | 22 | sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); 23 | sycl::range<3> block_dim(1, 1, threads); 24 | 25 | /* 26 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the 27 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 28 | */ 29 | { 30 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 31 | stream->parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), 32 | [=](sycl::nd_item<3> item_ct1) { 33 | param_update_kernel(input, output, size); 34 | }); 35 | } 36 | } 37 | 38 | void param_update_kernel_half(const float* input, sycl::half* output, int size) 39 | { 40 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 41 | int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); 42 | sycl::half2* output_cast = reinterpret_cast(output); 43 | if (id < size) { 44 | float input_f = input[id]; 45 | sycl::half2* input_h = reinterpret_cast(&input_f); 46 | output_cast[id] = *input_h; 47 | } 48 | } 49 | 50 | void launch_param_update_half(const float* input, 51 | sycl::half* output, 52 | int size, 53 | dpct::queue_ptr stream) 54 | { 55 | int threads = 1024; 56 | size /= 2; 57 | sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); 58 | sycl::range<3> block_dim(1, 1, threads); 59 | 60 | /* 61 | DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the limit. To get the 62 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
63 | */ 64 | { 65 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 66 | stream->parallel_for(sycl::nd_range<3>(grid_dim * block_dim, block_dim), 67 | [=](sycl::nd_item<3> item_ct1) { 68 | param_update_kernel_half(input, output, size); 69 | }); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/flash_attn.dp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "context.h" 3 | #include "flash_attn.h" 4 | 5 | // [Bs, Hn, Sl, Hs] 6 | std::vector flash_attn_fwd(torch::Tensor &q, 7 | torch::Tensor &k, 8 | torch::Tensor &v, 9 | uint32_t bs, 10 | uint32_t head_number, 11 | uint32_t seqlens, 12 | uint32_t head_size, 13 | float softmax_scale, 14 | float dropout_prob, 15 | uint64_t dropout_rand_seed, 16 | bool is_causal, 17 | bool is_training, 18 | bool is_dropout) { 19 | torch::Tensor q_ = q.transpose(1, 2).contiguous().transpose(1, 2); 20 | torch::Tensor k_ = k.transpose(1, 2).contiguous().transpose(1, 2); 21 | torch::Tensor v_ = v.transpose(1, 2).contiguous().transpose(1, 2); 22 | torch::Tensor output = torch::empty_like(q_); 23 | torch::Tensor softmax_L, dropout_mask; 24 | softmax_L = torch::empty({bs * head_number, 1, seqlens}, q.options()).to(at::kFloat); 25 | 26 | void *q_ptr = (void *)q_.data_ptr(); 27 | void *k_ptr = (void *)k_.data_ptr(); 28 | void *v_ptr = (void *)v_.data_ptr(); 29 | void *output_ptr = (void *)output.data_ptr(); 30 | void *softmax_L_ptr = (void *)softmax_L.data_ptr(); 31 | void *drop_mask_ptr = nullptr; 32 | uint64_t dropout_rand_offset = 123; 33 | 34 | sycl::queue* stream = ::TrainingContext::Instance().GetCurrentStream(); 35 | FlashAttention _flash_attn = FlashAttention(); 36 | _flash_attn.Forward( 37 | *stream, 38 | output_ptr, 39 | softmax_L_ptr, 40 | bs, 41 | head_number, 42 | head_size, 43 | seqlens, 44 | seqlens, 45 | softmax_scale, 46 | q_ptr, 47 | k_ptr, 48 | v_ptr, 49 | drop_mask_ptr, 50 | dropout_prob, 51 | dropout_rand_seed, 52 | dropout_rand_offset, 53 | is_causal, 54 | is_training, 55 | is_dropout 56 | ); 57 | return {output, softmax_L}; 58 | } 59 | 60 | std::vector flash_attn_bwd(torch::Tensor &gradout, 61 | torch::Tensor &q, 62 | torch::Tensor &k, 63 | torch::Tensor &v, 64 | torch::Tensor &out, 65 | uint32_t bs, 66 | uint32_t head_number, 67 | uint32_t seqlens, 68 | uint32_t head_size, 69 | float softmax_scale, 70 | float dropout_prob, 71 | uint64_t dropout_rand_seed, 72 | bool is_causal, 73 | bool is_dropout, 74 | torch::Tensor &softmax_L) { 75 | torch::Tensor q_ = q.transpose(1, 2).contiguous().transpose(1, 2); 76 | torch::Tensor k_ = k.transpose(1, 2).contiguous().transpose(1, 2); 77 | torch::Tensor v_ = v.transpose(1, 2).contiguous().transpose(1, 2); 78 | torch::Tensor out_ = out.transpose(1, 2).contiguous().transpose(1, 2); 79 | torch::Tensor grad_out_ = gradout.transpose(1, 2).contiguous().transpose(1, 2); 80 | 81 | torch::Tensor dq = torch::zeros_like(q_); 82 | torch::Tensor dk = torch::empty_like(k_); 83 | torch::Tensor dv = torch::empty_like(v_); 84 | torch::Tensor d_buffer = torch::empty_like(softmax_L); 85 | void *gradout_ptr = (void *)grad_out_.data_ptr(); 86 | void *q_ptr = (void *)q_.data_ptr(); 87 | void *k_ptr = (void *)k_.data_ptr(); 88 | void *v_ptr = (void *)v_.data_ptr(); 89 | void *out_ptr = (void *)out_.data_ptr(); 90 | void *dq_ptr = (void *)dq.data_ptr(); 91 | void *dk_ptr = (void *)dk.data_ptr(); 92 | void *dv_ptr = (void *)dv.data_ptr(); 93 | 
void *softmax_L_ptr = (void *)softmax_L.data_ptr(); 94 | void *d_buffer_ptr = (void *)d_buffer.data_ptr(); 95 | void *drop_mask_ptr = nullptr; 96 | uint64_t dropout_rand_offset = 123; 97 | 98 | sycl::queue* stream = ::TrainingContext::Instance().GetCurrentStream(); 99 | FlashAttention _flash_attn = FlashAttention(); 100 | _flash_attn.Backward( 101 | *stream, 102 | dq_ptr, 103 | dk_ptr, 104 | dv_ptr, 105 | out_ptr, 106 | gradout_ptr, 107 | softmax_L_ptr, 108 | d_buffer_ptr, 109 | bs, 110 | head_number, 111 | head_size, 112 | seqlens, 113 | seqlens, 114 | softmax_scale, 115 | q_ptr, 116 | k_ptr, 117 | v_ptr, 118 | drop_mask_ptr, 119 | dropout_prob, 120 | dropout_rand_seed, 121 | dropout_rand_offset, 122 | is_causal, 123 | is_dropout 124 | ); 125 | return {dq, dk, dv}; 126 | } 127 | 128 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 129 | { 130 | m.def("flash_attn_fwd", 131 | &flash_attn_fwd, 132 | "Flash attention forward"); 133 | m.def("flash_attn_bwd", 134 | &flash_attn_bwd, 135 | "Flash attention backward"); 136 | } 137 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/flash_attn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | class FlashAttention { 7 | public: 8 | virtual ~FlashAttention() {} 9 | 10 | bool Forward(sycl::queue &stream, 11 | void* output, 12 | void* softmax_L, 13 | uint32_t num_batches, 14 | uint32_t num_heads, 15 | uint32_t head_size, 16 | uint32_t num_queries, 17 | uint32_t num_keys, 18 | float hs_rsqrt_scale, 19 | void* q_ptr, 20 | void* k_ptr, 21 | void* v_ptr, 22 | void* dropout_mask = nullptr, 23 | float dropout_prob = 0.0, 24 | uint64_t rand_seed = 0, 25 | uint64_t rank_offset = 0, 26 | bool is_causal = true, 27 | bool is_training = true, 28 | bool is_dropout = true) { 29 | RECORD_FUNCTION("flash_scaled_attn_bf16_fwd", c10::ArrayRef({})); 30 | gpu::xetla::fmha_forward_kernel( 31 | gpu::xetla::XetlaType::bf16, 32 | stream, 33 | q_ptr, 34 | k_ptr, 35 | v_ptr, 36 | dropout_mask, 37 | output, 38 | softmax_L, 39 | hs_rsqrt_scale, 40 | dropout_prob, 41 | num_batches, 42 | num_heads, 43 | head_size, 44 | num_queries, 45 | num_keys, 46 | num_keys, 47 | is_causal, 48 | is_training, 49 | is_dropout, 50 | rand_seed, 51 | rank_offset 52 | ); 53 | 54 | return true; 55 | } 56 | 57 | bool Backward(sycl::queue &stream, 58 | void* dq, 59 | void* dk, 60 | void* dv, 61 | void* out, // [Bs, Hn, Sl, Hs] 62 | void* gradout, 63 | void* softmax_workspace, // [Bs*Hn, 1, Sl]: row_max + log(row_sum) 64 | void* d_buffer, // temp buffer for D = O pointmul dO [Bs*Hn, 1, Sl] 65 | uint32_t num_batches, 66 | uint32_t num_heads, 67 | uint32_t head_size, 68 | uint32_t num_queries, 69 | uint32_t num_keys, 70 | float hs_rsqrt_scale, 71 | void* q_ptr, 72 | void* k_ptr, 73 | void* v_ptr, 74 | void* dropout_mask = nullptr, 75 | float dropout_prob = 0.0, 76 | uint64_t rand_seed = 0, 77 | uint64_t rank_offset = 0, 78 | bool is_causal = true, 79 | bool is_dropout = true) { 80 | RECORD_FUNCTION("flash_scaled_attn_bf16_bwd", c10::ArrayRef({})); 81 | gpu::xetla::fmha_backward_kernel( 82 | gpu::xetla::XetlaType::bf16, 83 | stream, 84 | gradout, 85 | q_ptr, 86 | k_ptr, 87 | v_ptr, 88 | out, 89 | softmax_workspace, 90 | d_buffer, 91 | hs_rsqrt_scale, 92 | dropout_prob, 93 | dq, 94 | dk, 95 | dv, 96 | num_batches, 97 | num_heads, 98 | head_size, 99 | num_queries, 100 | num_keys, 101 | num_keys, 102 | is_causal, 103 | is_dropout, 104 | rand_seed, 105 
| rank_offset 106 | ); 107 | return true; 108 | } 109 | }; 110 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/fmha_policy.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "xetla.hpp" 4 | 5 | namespace gpu::xetla { 6 | 7 | struct fmha_policy_base { 8 | static constexpr uint32_t accum_step = 16; 9 | static constexpr uint32_t stages = 3; 10 | static constexpr uint32_t sync_freq = 0; 11 | }; 12 | 13 | /* 14 | Note: 15 | kHm / kSgHm == kBc / kSgBc 16 | kSgHm and kSgBc should be a multiple of 16 17 | kSgBr should be a multiple of 8 18 | */ 19 | 20 | struct fmha_policy_128x128x64 : fmha_policy_base { 21 | static constexpr uint32_t kBr = 128; 22 | static constexpr uint32_t kSgBr = 16; 23 | static constexpr uint32_t kBc = 128; 24 | static constexpr uint32_t kSgBc = 32; 25 | static constexpr uint32_t kBcHm_SgBc = 16; 26 | static constexpr uint32_t kHm = 64; 27 | static constexpr uint32_t kSgHm = 16; 28 | static constexpr uint32_t thread_num = (kBr / kSgBr) * (kBc / kSgBc); 29 | }; 30 | 31 | struct fmha_policy_128x128x128 : fmha_policy_base { 32 | static constexpr uint32_t kBr = 128; 33 | static constexpr uint32_t kSgBr = 16; 34 | static constexpr uint32_t kBc = 128; 35 | static constexpr uint32_t kSgBc = 32; 36 | static constexpr uint32_t kBcHm_SgBc = 16; 37 | static constexpr uint32_t kHm = 128; 38 | static constexpr uint32_t kSgHm = 32; 39 | static constexpr uint32_t thread_num = (kBr / kSgBr) * (kBc / kSgBc); 40 | }; 41 | 42 | struct fmha_policy_128x128x256 : fmha_policy_base { 43 | static constexpr uint32_t kBr = 128; 44 | static constexpr uint32_t kSgBr = 16; 45 | static constexpr uint32_t kBc = 128; 46 | static constexpr uint32_t kSgBc = 32; 47 | static constexpr uint32_t kBcHm_SgBc = 16; 48 | static constexpr uint32_t kHm = 256; 49 | static constexpr uint32_t kSgHm = 64; 50 | static constexpr uint32_t thread_num = (kBr / kSgBr) * (kBc / kSgBc); 51 | }; 52 | 53 | } // namespace gpu::xetla -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/flash_attn/mha.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if __has_include() 4 | #include 5 | #elif __has_include() 6 | #include 7 | #else 8 | #error "Unsupported compiler" 9 | #endif 10 | 11 | #include 12 | #include 13 | #include 14 | #include "xetla.hpp" 15 | 16 | #define DPCPP_Q_CGF(h) [&](sycl::handler & h) 17 | 18 | #define DPCPP_Q_SUBMIT(q, cgf, ...) 
\ 19 | { \ 20 | auto e = (q).submit((cgf), ##__VA_ARGS__); \ 21 | (q).throw_asynchronous(); \ 22 | xpu::profiler_record("dpcpp_kernel", e); \ 23 | } 24 | 25 | namespace gpu::xetla { 26 | 27 | enum class XetlaType { 28 | fp16, 29 | bf16, 30 | }; 31 | 32 | void fmha_forward_kernel( 33 | XetlaType xeType, 34 | sycl::queue& q, 35 | void* query, 36 | void* key, 37 | void* value, 38 | void* dropout, 39 | void* out, 40 | void* log_sumexp, 41 | float alpha, 42 | float dropout_prob, 43 | uint32_t num_batches, 44 | uint32_t num_heads, 45 | uint32_t head_size, 46 | uint32_t num_queries, 47 | uint32_t num_keys, 48 | uint32_t attn_mask_padded_block_size, 49 | bool is_causal, 50 | bool is_training, 51 | bool is_dropout, 52 | uint64_t seed_t, 53 | uint64_t offset_t); 54 | 55 | void fmha_backward_kernel( 56 | XetlaType xeType, 57 | sycl::queue& q, 58 | void* grad_out, 59 | void* query, 60 | void* key, 61 | void* value, 62 | void* out, 63 | void* log_sumexp, 64 | void* workspace, 65 | float alpha, 66 | float dropout_prob, 67 | void* grad_query, 68 | void* grad_key, 69 | void* grad_value, 70 | uint32_t num_batches, 71 | uint32_t num_heads, 72 | uint32_t head_size, 73 | uint32_t num_queries, 74 | uint32_t num_keys, 75 | uint32_t attn_mask_padding, 76 | bool is_causal, 77 | bool is_dropout, 78 | uint64_t seed_t, 79 | uint64_t offset_t); 80 | 81 | } // namespace gpu::xetla 82 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/StopWatch.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | #ifdef _WIN32 8 | #include 9 | #else 10 | #include 11 | #endif 12 | 13 | #ifdef _WIN32 14 | 15 | class Stopwatch { 16 | private: 17 | double m_total_time; 18 | LARGE_INTEGER m_start_time; 19 | 20 | public: 21 | Stopwatch() { m_total_time = 0.0; } 22 | 23 | ~Stopwatch() {} 24 | 25 | void Reset() { m_total_time = 0.0; } 26 | 27 | void Start() { QueryPerformanceCounter(&m_start_time); } 28 | 29 | void Restart() 30 | { 31 | m_total_time = 0.0; 32 | QueryPerformanceCounter(&m_start_time); 33 | } 34 | 35 | void Stop() 36 | { 37 | LARGE_INTEGER frequency; 38 | LARGE_INTEGER stop_time; 39 | QueryPerformanceFrequency(&frequency); 40 | QueryPerformanceCounter(&stop_time); 41 | m_total_time += 42 | ((double)(stop_time.QuadPart - m_start_time.QuadPart) / (double)frequency.QuadPart); 43 | } 44 | 45 | double GetTimeInSeconds() { return m_total_time; } 46 | }; 47 | 48 | #else 49 | 50 | class Stopwatch { 51 | private: 52 | double m_total_time; 53 | struct timespec m_start_time; 54 | bool m_is_started; 55 | 56 | public: 57 | Stopwatch() 58 | { 59 | m_total_time = 0.0; 60 | m_is_started = false; 61 | } 62 | 63 | ~Stopwatch() {} 64 | 65 | void Reset() { m_total_time = 0.0; } 66 | 67 | void Start() 68 | { 69 | clock_gettime(CLOCK_MONOTONIC, &m_start_time); 70 | m_is_started = true; 71 | } 72 | 73 | void Restart() 74 | { 75 | m_total_time = 0.0; 76 | clock_gettime(CLOCK_MONOTONIC, &m_start_time); 77 | m_is_started = true; 78 | } 79 | 80 | void Stop() 81 | { 82 | if (m_is_started) { 83 | m_is_started = false; 84 | 85 | struct timespec end_time; 86 | clock_gettime(CLOCK_MONOTONIC, &end_time); 87 | 88 | m_total_time += (double)(end_time.tv_sec - m_start_time.tv_sec) + 89 | (double)(end_time.tv_nsec - m_start_time.tv_nsec) / 1e9; 90 | } 91 | } 92 | 93 | double GetTimeInSeconds() 94 | { 95 | if 
(m_is_started) { 96 | Stop(); 97 | Start(); 98 | } 99 | return m_total_time; 100 | } 101 | }; 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/compat.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Copyright NVIDIA/apex 8 | This file is adapted from fused adam in NVIDIA/apex, commit a109f85 9 | */ 10 | 11 | #ifndef TORCH_CHECK 12 | #define TORCH_CHECK AT_CHECK 13 | #endif 14 | 15 | #ifdef VERSION_GE_1_3 16 | #define DATA_PTR data_ptr 17 | #else 18 | #define DATA_PTR data 19 | #endif 20 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/context.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | /* #include */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include "gemm_test.h" 19 | 20 | #include 21 | 22 | #ifndef SYCL_CUDA_STREAM 23 | #define SYCL_CUDA_STREAM 24 | namespace at { 25 | namespace cuda { 26 | inline dpct::queue_ptr getCurrentCUDAStream() { 27 | auto device_type = c10::DeviceType::XPU; 28 | c10::impl::VirtualGuardImpl impl(device_type); 29 | c10::Stream c10_stream = impl.getStream(c10::Device(device_type)); 30 | auto& queue = xpu::get_queue_from_stream(c10_stream); 31 | return &queue; 32 | } 33 | 34 | inline dpct::queue_ptr getStreamFromPool(bool) { 35 | // not implemented 36 | return nullptr; 37 | } 38 | 39 | inline dpct::queue_ptr getStreamFromPool() { 40 | // not implemented 41 | return nullptr; 42 | } 43 | } 44 | } 45 | #endif 46 | 47 | #define WARP_SIZE 32 48 | 49 | #define CUDA_CHECK(callstr) \ 50 | { \ 51 | cudaError_t error_code = callstr; \ 52 | if (error_code != cudaSuccess) { \ 53 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 54 | assert(0); \ 55 | } \ 56 | } 57 | 58 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 59 | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) 60 | 61 | #define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ 62 | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ 63 | for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) 64 | 65 | #define DS_CUDA_NUM_THREADS 512 66 | #define DS_MAXIMUM_NUM_BLOCKS 262144 67 | 68 | inline int DS_GET_BLOCKS(const int N) 69 | { 70 | return (std::max)( 71 | (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), 72 | // Use at least 1 block, since CUDA does not allow empty block 73 | 1); 74 | } 75 | 76 | class TrainingContext { 77 | public: 78 | TrainingContext() try : _workspace(nullptr), _seed(42), _curr_offset(0) { 79 | _gen = dpct::rng::create_host_rng(dpct::rng::random_engine_type::mcg59); 80 | _gen->set_seed(123); 81 | int stat = DPCT_CHECK_ERROR(_cublasHandle = &dpct::get_in_order_queue()); 82 | if (stat != 0) { 83 | // It would be nice to use cublasGetStatusName and 84 | // cublasGetStatusString, but they were only added in CUDA 11.4.2. 
85 | auto message = std::string("Failed to create cublas handle: cublasStatus_t was ") + 86 | std::to_string(stat); 87 | std::cerr << message << std::endl; 88 | throw std::runtime_error(message); 89 | } 90 | } 91 | catch (sycl::exception const& exc) { 92 | std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ 93 | << std::endl; 94 | std::exit(1); 95 | } 96 | 97 | virtual ~TrainingContext() 98 | { 99 | _cublasHandle = nullptr; 100 | sycl::free(_workspace, dpct::get_in_order_queue()); 101 | } 102 | 103 | static TrainingContext& Instance() 104 | { 105 | static TrainingContext _ctx; 106 | return _ctx; 107 | } 108 | 109 | void SetWorkSpace(void* workspace) 110 | { 111 | if (!workspace) { throw std::runtime_error("Workspace is null."); } 112 | _workspace = workspace; 113 | } 114 | 115 | void* GetWorkSpace() { return _workspace; } 116 | 117 | dpct::rng::host_rng_ptr& GetRandGenerator() { return _gen; } 118 | 119 | dpct::queue_ptr GetCurrentStream() 120 | { 121 | // get current pytorch stream. 122 | dpct::queue_ptr stream = at::cuda::getCurrentCUDAStream(); 123 | return stream; 124 | } 125 | 126 | dpct::queue_ptr GetNewStream() { return at::cuda::getStreamFromPool(); } 127 | 128 | dpct::queue_ptr GetCublasHandle() { return _cublasHandle; } 129 | 130 | std::pair IncrementOffset(uint64_t offset_inc) 131 | { 132 | uint64_t offset = _curr_offset; 133 | _curr_offset += offset_inc; 134 | return std::pair(_seed, offset); 135 | } 136 | 137 | void SetSeed(uint64_t new_seed) { _seed = new_seed; } 138 | 139 | void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head) 140 | { 141 | // avoid rerun. 142 | if (_gemm_algos.size() > 0) return; 143 | 144 | if (test_gemm) { 145 | dpct::queue_ptr handle = GetCublasHandle(); 146 | 147 | std::unique_ptr> test_qkv_fw( 148 | new GemmTest(batch_size * seq_len, // M 149 | head_num * size_per_head, // N 150 | head_num * size_per_head, // K 151 | oneapi::mkl::transpose::trans, 152 | oneapi::mkl::transpose::nontrans, 153 | handle)); 154 | 155 | std::unique_ptr> test_inter( 156 | new GemmTest(batch_size * seq_len, // M 157 | 4 * head_num * size_per_head, // N 158 | head_num * size_per_head, // K 159 | oneapi::mkl::transpose::trans, 160 | oneapi::mkl::transpose::nontrans, 161 | handle)); 162 | 163 | std::unique_ptr> test_output( 164 | new GemmTest(batch_size * seq_len, // M 165 | head_num * size_per_head, // N 166 | 4 * head_num * size_per_head, // K 167 | oneapi::mkl::transpose::trans, 168 | oneapi::mkl::transpose::nontrans, 169 | handle)); 170 | 171 | std::unique_ptr> test_attn_scores( 172 | new StridedGemmTest(batch_size * head_num, // batch 173 | seq_len, // M 174 | seq_len, // N 175 | size_per_head, // K 176 | oneapi::mkl::transpose::trans, 177 | oneapi::mkl::transpose::nontrans, 178 | handle)); 179 | 180 | std::unique_ptr> test_attn_context( 181 | new StridedGemmTest(batch_size * head_num, // batch 182 | size_per_head, // M 183 | seq_len, // N 184 | seq_len, // K 185 | oneapi::mkl::transpose::nontrans, 186 | oneapi::mkl::transpose::nontrans, 187 | handle)); 188 | 189 | _gemm_algos.push_back(test_qkv_fw->TestAlgo(100)); 190 | _gemm_algos.push_back(test_inter->TestAlgo(100)); 191 | _gemm_algos.push_back(test_output->TestAlgo(100)); 192 | _gemm_algos.push_back(test_attn_scores->TestAlgo(100)); 193 | _gemm_algos.push_back(test_attn_context->TestAlgo(100)); 194 | } else { 195 | // Use default algo. 
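// Fallback path: push one default {99, 99, 99} algorithm triple for each of the five GEMMs probed above (QKV, intermediate, output, attention scores, attention context).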
196 | _gemm_algos.push_back(std::array({99, 99, 99})); 197 | _gemm_algos.push_back(std::array({99, 99, 99})); 198 | _gemm_algos.push_back(std::array({99, 99, 99})); 199 | _gemm_algos.push_back(std::array({99, 99, 99})); 200 | _gemm_algos.push_back(std::array({99, 99, 99})); 201 | } 202 | } 203 | 204 | const std::vector>& GetGemmAlgos() const { return _gemm_algos; } 205 | 206 | private: 207 | dpct::rng::host_rng_ptr _gen; 208 | dpct::queue_ptr _cublasHandle; 209 | void* _workspace; 210 | uint64_t _seed; 211 | uint64_t _curr_offset; 212 | std::vector> _gemm_algos; 213 | }; 214 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/cpu_adagrad.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #define NOMINMAX // Windows idiosyncrasy 9 | // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c 10 | 11 | #include 12 | #include 13 | #include "simd.h" 14 | 15 | #if defined(__ENABLE_CUDA__) 16 | #include 17 | #include 18 | #include "cuda.h" 19 | #include "custom_cuda_layers.h" 20 | typedef sycl::half ds_half_precision_t; 21 | #elif defined(__ENABLE_CANN__) 22 | #include "acl/acl.h" 23 | #include "torch_npu/csrc/core/npu/NPUStream.h" 24 | typedef c10::Half ds_half_precision_t; 25 | #else 26 | typedef unsigned short ds_half_precision_t; 27 | #endif 28 | 29 | #define STEP(SPAN) \ 30 | void Step_##SPAN(float* _params, \ 31 | float* grads, \ 32 | float* _exp_avg_sq, \ 33 | size_t _param_size, \ 34 | ds_half_precision_t* dev_param = nullptr, \ 35 | bool half_precision = false); 36 | 37 | class Adagrad_Optimizer { 38 | public: 39 | Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) 40 | : _alpha(alpha), _eps(eps), _weight_decay(weight_decay) 41 | { 42 | #if defined(__ENABLE_CUDA__) 43 | cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); 44 | cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); 45 | 46 | _streams[0] = TrainingContext::Instance().GetCurrentStream(); 47 | _streams[1] = TrainingContext::Instance().GetNewStream(); 48 | _buf_index = false; 49 | #elif defined(__ENABLE_CANN__) 50 | aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); 51 | aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); 52 | 53 | _buf_index = false; 54 | #endif 55 | } 56 | ~Adagrad_Optimizer() 57 | { 58 | #if defined(__ENABLE_CUDA__) 59 | cudaFreeHost(_doubled_buffer[0]); 60 | cudaFreeHost(_doubled_buffer[1]); 61 | #elif defined(__ENABLE_CANN__) 62 | aclrtFreeHost(_doubled_buffer[0]); 63 | aclrtFreeHost(_doubled_buffer[1]); 64 | #endif 65 | } 66 | #if defined(__AVX512__) or defined(__AVX256__) 67 | template 68 | void Step_AVX(size_t* rounded_size, 69 | float* _params, 70 | float* grads, 71 | float* _exp_avg_sq, 72 | size_t param_size, 73 | ds_half_precision_t* dev_param = nullptr, 74 | bool half_precision = false); 75 | #endif 76 | STEP(1) 77 | STEP(4) 78 | STEP(8) 79 | #if defined(__ENABLE_CUDA__) 80 | inline void SynchronizeStreams() 81 | { 82 | for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); 83 | } 84 | #elif defined(__ENABLE_CANN__) 85 | inline void SynchronizeStreams() 86 | { 87 | for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); 88 | } 89 | #endif 90 | inline void IncrementStep(size_t step) 91 | { 92 | 
_step++; 93 | if (_step != step) { _step = step; } 94 | } 95 | inline void update_state(float lr, float epsilon, float weight_decay) 96 | { 97 | _alpha = lr; 98 | _eps = epsilon; 99 | _weight_decay = weight_decay; 100 | } 101 | 102 | private: 103 | float _alpha; 104 | float _eps; 105 | float _weight_decay; 106 | 107 | float _betta1_t; 108 | float _betta2_t; 109 | size_t _step; 110 | 111 | #if defined(__ENABLE_CUDA__) 112 | bool _buf_index; 113 | float* _doubled_buffer[2]; 114 | cudaStream_t _streams[2]; 115 | #elif defined(__ENABLE_CANN__) 116 | float* _doubled_buffer[2]; 117 | c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), 118 | c10_npu::getNPUStreamFromPool()}; 119 | bool _buf_index; 120 | #endif 121 | }; 122 | 123 | #if defined(__AVX512__) or defined(__AVX256__) 124 | template <int span> 125 | void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, 126 | float* _params, 127 | float* grads, 128 | float* _exp_avg_sq, 129 | size_t _param_size, 130 | ds_half_precision_t* dev_params, 131 | bool half_precision) 132 | { 133 | size_t new_rounded_size = 0; 134 | AVX_Data eps_4; 135 | eps_4.data = SIMD_SET(_eps); 136 | 137 | float step_size = -1 * _alpha; 138 | AVX_Data step_size_4; 139 | step_size_4.data = SIMD_SET(step_size); 140 | 141 | AVX_Data weight_decay4; 142 | if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay); 143 | new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); 144 | for (size_t t = 0; t < new_rounded_size; t += TILE) { 145 | size_t copy_size = TILE; 146 | if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; 147 | size_t offset = copy_size + t; 148 | #if defined(__ENABLE_CUDA__) 149 | if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } 150 | #elif defined(__ENABLE_CANN__) 151 | if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } 152 | #endif 153 | #pragma omp parallel for 154 | for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { 155 | AVX_Data grad_4[span]; 156 | simd_load<span>(grad_4, grads + i, half_precision); 157 | 158 | AVX_Data momentum_4[span]; 159 | simd_load<span>(momentum_4, grads + i, false); 160 | 161 | AVX_Data variance_4[span]; 162 | simd_load<span>(variance_4, _exp_avg_sq + i, false); 163 | 164 | AVX_Data param_4[span]; 165 | simd_load<span>(param_4, _params + i, half_precision); 166 | 167 | if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); } 168 | 169 | simd_fma<span>(variance_4, grad_4, grad_4, variance_4); 170 | simd_sqrt<span>(grad_4, variance_4); 171 | simd_add<span>(grad_4, grad_4, eps_4); 172 | simd_div<span>(grad_4, momentum_4, grad_4); 173 | simd_fma<span>(param_4, grad_4, step_size_4, param_4); 174 | 175 | simd_store<span>(_params + i, param_4, half_precision); 176 | #if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) 177 | if (dev_params) { 178 | simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); 179 | } 180 | #endif 181 | simd_store<span>(_exp_avg_sq + i, variance_4, false); 182 | } 183 | #if defined(__ENABLE_CUDA__) 184 | if (dev_params) { 185 | if (half_precision) 186 | launch_param_update_half( 187 | _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); 188 | else 189 | launch_param_update( 190 | _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); 191 | 192 | _buf_index = !_buf_index; 193 | } 194 | #elif defined(__ENABLE_CANN__) 195 | if (dev_params) { 196 | size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); 197 | if (half_precision) memcpy_size /= 2; 198 | aclrtMemcpy(dev_params + t,
199 | memcpy_size, 200 | _doubled_buffer[_buf_index], 201 | memcpy_size, 202 | aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); 203 | 204 | _buf_index = !_buf_index; 205 | #endif 206 | } 207 | *rounded_size = new_rounded_size; 208 | } 209 | #endif 210 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/cublas_wrappers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #ifndef __HIP_PLATFORM_AMD__ 14 | #endif 15 | #ifdef __HIP_PLATFORM_AMD__ 16 | #include 17 | #endif 18 | #include 19 | 20 | int cublas_gemm_ex(dpct::queue_ptr handle, 21 | oneapi::mkl::transpose transa, 22 | oneapi::mkl::transpose transb, 23 | int m, 24 | int n, 25 | int k, 26 | const float* alpha, 27 | const float* beta, 28 | const float* A, 29 | const float* B, 30 | float* C, 31 | #ifdef __HIP_PLATFORM_AMD__ 32 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 33 | #else 34 | int algo = -1); 35 | #endif 36 | 37 | int cublas_gemm_ex(dpct::queue_ptr handle, 38 | oneapi::mkl::transpose transa, 39 | oneapi::mkl::transpose transb, 40 | int m, 41 | int n, 42 | int k, 43 | const float* alpha, 44 | const float* beta, 45 | const sycl::half* A, 46 | const sycl::half* B, 47 | sycl::half* C, 48 | #ifdef __HIP_PLATFORM_AMD__ 49 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 50 | #else 51 | int algo = 99); 52 | #endif 53 | 54 | int cublas_strided_batched_gemm(dpct::queue_ptr handle, 55 | int m, 56 | int n, 57 | int k, 58 | const float* alpha, 59 | const float* beta, 60 | const float* A, 61 | const float* B, 62 | float* C, 63 | oneapi::mkl::transpose op_A, 64 | oneapi::mkl::transpose op_B, 65 | int stride_A, 66 | int stride_B, 67 | int stride_C, 68 | int batch, 69 | #ifdef __HIP_PLATFORM_AMD__ 70 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 71 | #else 72 | int algo = -1); 73 | #endif 74 | 75 | int cublas_strided_batched_gemm(dpct::queue_ptr handle, 76 | int m, 77 | int n, 78 | int k, 79 | const float* alpha, 80 | const float* beta, 81 | const sycl::half* A, 82 | const sycl::half* B, 83 | sycl::half* C, 84 | oneapi::mkl::transpose op_A, 85 | oneapi::mkl::transpose op_B, 86 | int stride_A, 87 | int stride_B, 88 | int stride_C, 89 | int batch, 90 | #ifdef __HIP_PLATFORM_AMD__ 91 | rocblas_gemm_algo algo = rocblas_gemm_algo_standard); 92 | #else 93 | int algo = 99); 94 | #endif 95 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dequantization_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "ds_kernel_utils.h" 10 | #include "quantization.h" 11 | #include "quantization_utils.h" 12 | 13 | #pragma once 14 | 15 | namespace dequantize { 16 | using Type = quantize::Type; 17 | 18 | template 19 | using Params = quantize::Params; 20 | 21 | constexpr int granularity = quantize::granularity; 22 | using PackedInt4 = quantize::PackedInt4; 23 | 24 | constexpr int h_per_chunk = granularity / sizeof(sycl::half); 25 | constexpr int h2_per_chunk = granularity / sizeof(sycl::half2); 26 | 27 | /* 28 | Device function that reads quantized data from global memory, dequantizes 29 | it, and stores it to global memory. 30 | Template Arguments : 31 | numBits - Number of bits in quantized element. int: 4, 8 32 | qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric 33 | unroll - Number of load steps to internally unroll int 34 | threads - Number of threads to perform dequant int 35 | Function arguments: 36 | global_output - sycl::half pointer in global memory 37 | data - Quantized data in global memory 38 | global_params - Quantization parameters in global memory 39 | elems_per_group - Number of elements in each quantization group 40 | total_elems - Tensor size (note, does not need to be multiple of elems_per_group) 41 | */ 42 | template 43 | DS_D_INLINE void to_global(sycl::half* global_output, 44 | const int8_t* data, 45 | const float* global_params, 46 | const int elems_per_group, 47 | const int total_elems); 48 | 49 | /* 50 | Device function that quantizes 16 bytes of sycl::half type input data. 51 | Template Arguments : 52 | numBits - Number of bits in quantized element. int : 8 or 4 53 | qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric 54 | Function Arguments : 55 | local_output - Local array to store dequantized data sycl::half* or sycl::half2* 56 | data - Pointer to quantized input data. int8_t* 57 | Params - Parameters for quantization. Params 58 | */ 59 | template 60 | DS_D_INLINE void chunk(sycl::half2* local_output, 61 | const int8_t* data, 62 | Params q_params); 63 | 64 | template 65 | DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params q_params); 66 | 67 | /**************** Implementations ******************/ 68 | 69 | template 70 | DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params q_params) 71 | { 72 | constexpr int32_t num_elems_packed = 8 / numBits; 73 | constexpr int32_t iters = h_per_chunk / num_elems_packed; 74 | 75 | #pragma unroll 76 | for (int i = 0; i < iters; i++) { 77 | if constexpr (num_elems_packed == 1) { 78 | local_output[i] = q_params.template dequantize(data[i]); 79 | } else { 80 | auto accessible_data = *(PackedInt4*)(&data[i]); 81 | local_output[2 * i] = q_params.template dequantize(accessible_data.low); 82 | local_output[2 * i + 1] = q_params.template dequantize(accessible_data.high); 83 | } 84 | } 85 | } 86 | 87 | template 88 | DS_D_INLINE void chunk(sycl::half2* local_output, 89 | const int8_t* data, 90 | Params q_params) 91 | { 92 | sycl::half* local_output_cast = reinterpret_cast(local_output); 93 | chunk(local_output_cast, data, q_params); 94 | } 95 | 96 | template 97 | /* 98 | DPCT1110:46: The total declared local variable size in device function _to_global exceeds 128 bytes 99 | and may cause high register pressure. 
Consult with your hardware vendor to find the total register 100 | size available and adjust the code, or use smaller sub-group size to avoid high register pressure. 101 | */ 102 | DS_D_INLINE void _to_global(T* global_output, 103 | const int8_t* data, 104 | const float* global_params, 105 | const int elems_per_group, 106 | const int total_elems) 107 | { 108 | sycl::group<3> tb = sycl::ext::oneapi::experimental::this_group<3>(); 109 | sycl::sub_group warp = sycl::ext::oneapi::experimental::this_sub_group(); 110 | 111 | // Load constants 112 | // TODO(cmikeh2): Refactor into functions? 113 | constexpr int load_granularity = (granularity / (sizeof(T))) / (numBits == 8 ? 1 : 2); 114 | constexpr int load_step_stride = load_granularity * threads; 115 | constexpr int load_block_stride = load_step_stride * unroll; 116 | 117 | // Store constants 118 | constexpr int T_per_chunk = granularity / sizeof(T); 119 | constexpr int store_step_stride = T_per_chunk * threads; 120 | constexpr int store_block_stride = store_step_stride * unroll; 121 | 122 | // Load offsets 123 | const int load_block_offset = tb.get_group_id()[2] * load_block_stride; 124 | // Note: we can use `load_granularity` since the dtype is `int8_t`. 125 | const int load_thread_offset = tb.get_local_id()[2] * load_granularity; 126 | const int8_t* load_base = data + load_block_offset + load_thread_offset; 127 | 128 | // Store offsets 129 | const int store_block_offset = tb.get_group_id()[2] * store_block_stride; 130 | const int store_thread_offset = tb.get_local_id()[2] * T_per_chunk; 131 | const int elem_id_base = store_block_offset + store_thread_offset; 132 | 133 | int8_t local_load_buffer[load_granularity * unroll]; 134 | T local_dequant_buffer[T_per_chunk * unroll]; 135 | 136 | /* 137 | Note: Splitting this loop in half gave about 3-5% performance increase for reasons that aren't 138 | totally clear to me, so this is a deliberately weird code structure. 139 | */ 140 | #pragma unroll 141 | for (int i = 0; i < unroll; i++) { 142 | const int elem_id_iter = elem_id_base + i * store_step_stride; 143 | 144 | if (elem_id_iter < total_elems) { 145 | mem_access::load_global(local_load_buffer + i * load_granularity, 146 | load_base + i * load_step_stride); 147 | } 148 | } 149 | 150 | #pragma unroll 151 | for (int i = 0; i < unroll; i++) { 152 | const int elem_id_iter = elem_id_base + i * store_step_stride; 153 | if (elem_id_iter < total_elems) { 154 | // TODO(cmikeh2): Can we amortize this division? Perform once on the first iteration and 155 | // use indexing math to do division free interpolation of the successive groups? 
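// The integer division below maps each element back to its quantization group so the matching Params (scales/offsets) can be constructed for the dequantize call.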
156 | const int group_index = elem_id_iter / elems_per_group; 157 | Params q_params(global_params, group_index); 158 | 159 | chunk(local_dequant_buffer + i * T_per_chunk, 160 | local_load_buffer + i * load_granularity, 161 | q_params); 162 | mem_access::store_global(global_output + elem_id_iter, 163 | local_dequant_buffer + i * T_per_chunk); 164 | } 165 | } 166 | } 167 | 168 | template 169 | DS_D_INLINE void to_global(T* global_output, 170 | const int8_t* data, 171 | const float* global_params, 172 | const int elems_per_group, 173 | const int total_elems) 174 | { 175 | if constexpr (numBits == 4 || numBits == 8) { 176 | _to_global( 177 | global_output, data, global_params, elems_per_group, total_elems); 178 | } else if constexpr (numBits == 3) { 179 | // TODO(cmikeh2): Need this implementation 180 | assert(false); 181 | } else { 182 | assert(false); 183 | } 184 | } 185 | 186 | } // namespace dequantize 187 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dpct/dpct.hpp: -------------------------------------------------------------------------------- 1 | //==---- dpct.hpp ---------------------------------*- C++ -*----------------==// 2 | // 3 | // Copyright (C) Intel Corporation 4 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 | // See https://llvm.org/LICENSE.txt for license information. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #ifndef __DPCT_HPP__ 10 | #define __DPCT_HPP__ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | template class dpct_kernel_name; 18 | template class dpct_kernel_scalar; 19 | 20 | #include "atomic.hpp" 21 | #include "device.hpp" 22 | #include "image.hpp" 23 | #include "kernel.hpp" 24 | #include "math.hpp" 25 | #include "memory.hpp" 26 | #include "util.hpp" 27 | 28 | #if defined(_MSC_VER) 29 | #define __dpct_align__(n) __declspec(align(n)) 30 | #define __dpct_inline__ __forceinline 31 | #else 32 | #define __dpct_align__(n) __attribute__((aligned(n))) 33 | #define __dpct_inline__ __inline__ __attribute__((always_inline)) 34 | #endif 35 | 36 | #if defined(_MSC_VER) 37 | #define __dpct_noinline__ __declspec(noinline) 38 | #else 39 | #define __dpct_noinline__ __attribute__((noinline)) 40 | #endif 41 | 42 | #define DPCT_COMPATIBILITY_TEMP (600) 43 | 44 | namespace dpct{ 45 | enum error_code { success = 0, default_error = 999 }; 46 | } 47 | 48 | #define DPCT_CHECK_ERROR(expr) \ 49 | [&]() { \ 50 | try { \ 51 | expr; \ 52 | return dpct::success; \ 53 | } catch (std::exception const &e) { \ 54 | std::cerr << e.what() << std::endl; \ 55 | return dpct::default_error; \ 56 | } \ 57 | }() 58 | 59 | #define DPCT_PI_F (3.14159274101257f) 60 | #define DPCT_PI (3.141592653589793115998) 61 | 62 | #endif // __DPCT_HPP__ 63 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dpct/dpl_utils.hpp: -------------------------------------------------------------------------------- 1 | //==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==// 2 | // 3 | // Copyright (C) Intel Corporation 4 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 | // See https://llvm.org/LICENSE.txt for license information. 
6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #ifndef __DPCT_DPL_UTILS_HPP__ 10 | #define __DPCT_DPL_UTILS_HPP__ 11 | 12 | #define ONEDPL_USE_DPCPP_BACKEND 1 13 | #define __USE_DPCT 1 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "dpl_extras/memory.h" 20 | #include "dpl_extras/algorithm.h" 21 | #include "dpl_extras/numeric.h" 22 | #include "dpl_extras/iterators.h" 23 | #include "dpl_extras/vector.h" 24 | #include "dpl_extras/dpcpp_extensions.h" 25 | 26 | #endif // __DPCT_DPL_UTILS_HPP__ 27 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/dpct/lib_common_utils.hpp: -------------------------------------------------------------------------------- 1 | //==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==// 2 | // 3 | // Copyright (C) Intel Corporation 4 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 5 | // See https://llvm.org/LICENSE.txt for license information. 6 | // 7 | //===----------------------------------------------------------------------===// 8 | 9 | #ifndef __DPCT_LIB_COMMON_UTILS_HPP__ 10 | #define __DPCT_LIB_COMMON_UTILS_HPP__ 11 | 12 | #include 13 | #include 14 | #include "memory.hpp" 15 | #include "util.hpp" 16 | 17 | namespace dpct { 18 | namespace detail { 19 | template inline auto get_memory(const void *x) { 20 | T *new_x = reinterpret_cast(const_cast(x)); 21 | #ifdef DPCT_USM_LEVEL_NONE 22 | return dpct::get_buffer>(new_x); 23 | #else 24 | return new_x; 25 | #endif 26 | } 27 | 28 | template 29 | inline typename DataType::T2 get_value(const T *s, sycl::queue &q) { 30 | using Ty = typename DataType::T2; 31 | Ty s_h; 32 | if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) 33 | detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host) 34 | .wait(); 35 | else 36 | s_h = *reinterpret_cast(s); 37 | return s_h; 38 | } 39 | } // namespace detail 40 | 41 | enum class version_field : int { major, minor, update, patch }; 42 | 43 | /// Returns the requested field of Intel(R) oneAPI Math Kernel Library version. 44 | /// \param field The version information field (major, minor, update or patch). 45 | /// \param result The result value. 
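/// \par Example (illustrative; requires building against oneMKL so that __INTEL_MKL__ is defined)
/// \code
/// int mkl_major = 0;
/// dpct::mkl_get_version(dpct::version_field::major, &mkl_major);
/// \endcode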
46 | inline void mkl_get_version(version_field field, int *result) { 47 | #ifndef __INTEL_MKL__ 48 | throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " 49 | "Project does not support this API."); 50 | #else 51 | MKLVersion version; 52 | mkl_get_version(&version); 53 | if (version_field::major == field) { 54 | *result = version.MajorVersion; 55 | } else if (version_field::minor == field) { 56 | *result = version.MinorVersion; 57 | } else if (version_field::update == field) { 58 | *result = version.UpdateVersion; 59 | } else if (version_field::patch == field) { 60 | *result = 0; 61 | } else { 62 | throw std::runtime_error("unknown field"); 63 | } 64 | #endif 65 | } 66 | 67 | enum class library_data_t : unsigned char { 68 | real_float = 0, 69 | complex_float, 70 | real_double, 71 | complex_double, 72 | real_half, 73 | complex_half, 74 | real_bfloat16, 75 | complex_bfloat16, 76 | real_int4, 77 | complex_int4, 78 | real_uint4, 79 | complex_uint4, 80 | real_int8, 81 | complex_int8, 82 | real_uint8, 83 | complex_uint8, 84 | real_int16, 85 | complex_int16, 86 | real_uint16, 87 | complex_uint16, 88 | real_int32, 89 | complex_int32, 90 | real_uint32, 91 | complex_uint32, 92 | real_int64, 93 | complex_int64, 94 | real_uint64, 95 | complex_uint64, 96 | real_int8_4, 97 | real_int8_32, 98 | real_uint8_4, 99 | library_data_t_size 100 | }; 101 | 102 | namespace detail { 103 | template 104 | inline constexpr std::uint64_t get_type_combination_id(ArgT Val) { 105 | static_assert((unsigned char)library_data_t::library_data_t_size <= 106 | std::numeric_limits::max() && 107 | "library_data_t size exceeds limit."); 108 | static_assert(std::is_same_v, "Unsupported ArgT"); 109 | return (std::uint64_t)Val; 110 | } 111 | 112 | template 113 | inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal, 114 | RestT... RestVal) { 115 | static_assert((std::uint8_t)library_data_t::library_data_t_size <= 116 | std::numeric_limits::max() && 117 | "library_data_t size exceeds limit."); 118 | static_assert(sizeof...(RestT) <= 8 && "Too many parameters"); 119 | static_assert(std::is_same_v, "Unsupported FirstT"); 120 | return get_type_combination_id(RestVal...) 
<< 8 | ((std::uint64_t)FirstVal); 121 | } 122 | 123 | inline constexpr std::size_t library_data_size[] = { 124 | 8 * sizeof(float), // real_float 125 | 8 * sizeof(std::complex), // complex_float 126 | 8 * sizeof(double), // real_double 127 | 8 * sizeof(std::complex), // complex_double 128 | 8 * sizeof(sycl::half), // real_half 129 | 8 * sizeof(std::complex), // complex_half 130 | 16, // real_bfloat16 131 | 16 * 2, // complex_bfloat16 132 | 4, // real_int4 133 | 4 * 2, // complex_int4 134 | 4, // real_uint4 135 | 4 * 2, // complex_uint4 136 | 8, // real_int8 137 | 8 * 2, // complex_int8 138 | 8, // real_uint8 139 | 8 * 2, // complex_uint8 140 | 16, // real_int16 141 | 16 * 2, // complex_int16 142 | 16, // real_uint16 143 | 16 * 2, // complex_uint16 144 | 32, // real_int32 145 | 32 * 2, // complex_int32 146 | 32, // real_uint32 147 | 32 * 2, // complex_uint32 148 | 64, // real_int64 149 | 64 * 2, // complex_int64 150 | 64, // real_uint64 151 | 64 * 2, // complex_uint64 152 | 8, // real_int8_4 153 | 8, // real_int8_32 154 | 8 // real_uint8_4 155 | }; 156 | } // namespace detail 157 | } // namespace dpct 158 | 159 | #endif // __DPCT_LIB_COMMON_UTILS_HPP__ 160 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/ds_kernel_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* 7 | Centralized header file for preprocessor macros and constants 8 | used throughout the codebase. 9 | */ 10 | 11 | #pragma once 12 | 13 | #include 14 | #include 15 | 16 | #ifdef BF16_AVAILABLE 17 | #endif 18 | 19 | #define DS_HD_INLINE __dpct_inline__ 20 | #define DS_D_INLINE __dpct_inline__ 21 | 22 | #ifdef __HIP_PLATFORM_AMD__ 23 | 24 | // constexpr variant of warpSize for templating 25 | constexpr int hw_warp_size = 64; 26 | #define HALF_PRECISION_AVAILABLE = 1 27 | #include 28 | #include 29 | 30 | #else // !__HIP_PLATFORM_AMD__ 31 | 32 | // constexpr variant of warpSize for templating 33 | constexpr int hw_warp_size = 32; 34 | 35 | #if DPCT_COMPATIBILITY_TEMP >= 530 36 | #define HALF_PRECISION_AVAILABLE = 1 37 | // #define PTX_AVAILABLE 38 | #endif // __CUDA_ARCH__ >= 530 39 | 40 | #if DPCT_COMPATIBILITY_TEMP >= 800 41 | #define ASYNC_COPY_AVAILABLE 42 | #endif // __CUDA_ARCH__ >= 800 43 | 44 | #endif //__HIP_PLATFORM_AMD__ 45 | 46 | inline int next_pow2(const int val) 47 | { 48 | int rounded_val = val - 1; 49 | rounded_val |= rounded_val >> 1; 50 | rounded_val |= rounded_val >> 2; 51 | rounded_val |= rounded_val >> 4; 52 | rounded_val |= rounded_val >> 8; 53 | return rounded_val + 1; 54 | } 55 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/quantization.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include "ds_kernel_utils.h" 11 | 12 | namespace quantize { 13 | 14 | enum class Type { Symmetric, Asymmetric }; 15 | 16 | struct PackedInt4 { 17 | int8_t high : 4; 18 | int8_t low : 4; 19 | }; 20 | 21 | DS_HD_INLINE bool requires_offset(Type qType) { return qType == Type::Asymmetric; } 22 | 23 | } // namespace quantize 24 | 25 | void launch_quant(int8_t* output_data, 26 | float* params, 27 | const sycl::half* input_data, 28 | const int groups, 29 | const int elems_per_group, 30 | const int num_bits, 31 | const quantize::Type quant_type, 32 | dpct::queue_ptr stream); 33 | 34 | template 35 | void launch_dequantize_kernel(T* dequant_data, 36 | const int8_t* q_data, 37 | const float* q_params, 38 | quantize::Type q_type, 39 | int num_bits, 40 | int elems_per_group, 41 | int total_elems, 42 | dpct::queue_ptr stream); 43 | 44 | void launch_swizzled_quant(int8_t* q_data, 45 | float* q_scales, 46 | const sycl::half* input_data, 47 | int num_bits, 48 | quantize::Type q_type, 49 | int groups, 50 | int elems_per_group, 51 | int pipelining, 52 | int nodes, 53 | int devices_per_node, 54 | dpct::queue_ptr stream); 55 | 56 | void launch_dequant_reduce(int8_t* reduced_data, 57 | float* reduced_scales, 58 | const int8_t* input_data, 59 | const float* input_scales, 60 | int num_gpus, 61 | int num_bits, 62 | quantize::Type quant_type, 63 | int out_groups, 64 | int elems_per_out_group, 65 | int elems_per_in_tensor, 66 | int groups_per_in_tensor, 67 | int elems_per_in_group, 68 | dpct::queue_ptr stream); 69 | 70 | template 71 | void launch_fake_quantize_kernel(T* vals, 72 | int total_count, 73 | int group_num, 74 | int num_bits, 75 | dpct::queue_ptr stream); 76 | template 77 | void launch_sr_fake_quantize_kernel(T* vals, 78 | int total_count, 79 | int group_num, 80 | int num_bits, 81 | dpct::queue_ptr stream); 82 | template 83 | void launch_fake_quantize_kernel_asym(T* vals, 84 | int total_count, 85 | int group_num, 86 | int num_bits, 87 | dpct::queue_ptr stream); 88 | template 89 | void launch_sr_fake_quantize_kernel_asym(T* vals, 90 | int total_count, 91 | int group_num, 92 | int num_bits, 93 | dpct::queue_ptr stream); 94 | 95 | void launch_dequantize_int4_to_half_experimental(uint8_t* data_in, 96 | sycl::half* data_out, 97 | sycl::half* scale_buffer, 98 | sycl::half* min_val_buffer, 99 | int num_group, 100 | int group_size, 101 | dpct::queue_ptr stream); 102 | 103 | void launch_dequantize_int8_to_half_experimental(uint8_t* data_in, 104 | sycl::half* data_out, 105 | sycl::half* scale_buffer, 106 | sycl::half* min_val_buffer, 107 | int num_group, 108 | int group_size, 109 | dpct::queue_ptr stream); 110 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/simd.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #if (__x86_64__ || __i386__) 9 | #include 10 | #include 11 | #endif 12 | 13 | #define TILE (128 * 1024 * 1024) 14 | #if defined(__AVX512__) or defined(__AVX256__) 15 | 16 | #define ROUND_DOWN(size, step) ((size) & ~((step)-1)) 17 | 18 | #if defined(__AVX512__) 19 | #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) 20 | #define SIMD_LOAD(x) _mm512_loadu_ps(x) 21 | #define SIMD_SET(x) _mm512_set1_ps(x) 22 | #define SIMD_ADD(x, y) _mm512_add_ps(x, y) 23 | #define SIMD_MUL(x, y) _mm512_mul_ps(x, y) 24 | #define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) 25 | #define SIMD_SQRT(x) _mm512_sqrt_ps(x) 26 | #define SIMD_DIV(x, y) _mm512_div_ps(x, y) 27 | #define SIMD_AND(x, y) _mm512_and_ps(x, y) 28 | #define SIMD_ANDNOT(x, y) _mm512_andnot_ps(x, y) 29 | #define SIMD_OR(x, y) _mm512_or_ps(x, y) 30 | #define SIMD_XOR(x, y) _mm512_xor_ps(x, y) 31 | #define SIMD_WIDTH 16 32 | 33 | #define SIMD_LOAD2(x, h) \ 34 | ((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) 35 | #define SIMD_STORE2(x, d, h) \ 36 | ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ 37 | : _mm512_storeu_ps(x, d)) 38 | 39 | #define INTV __m256i 40 | #elif defined(__AVX256__) 41 | #define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) 42 | #define SIMD_LOAD(x) _mm256_loadu_ps(x) 43 | #define SIMD_SET(x) _mm256_set1_ps(x) 44 | #define SIMD_ADD(x, y) _mm256_add_ps(x, y) 45 | #define SIMD_MUL(x, y) _mm256_mul_ps(x, y) 46 | #define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) 47 | #define SIMD_SQRT(x) _mm256_sqrt_ps(x) 48 | #define SIMD_DIV(x, y) _mm256_div_ps(x, y) 49 | #define SIMD_AND(x, y) _mm256_and_ps(x, y) 50 | #define SIMD_ANDNOT(x, y) _mm256_andnot_ps(x, y) 51 | #define SIMD_OR(x, y) _mm256_or_ps(x, y) 52 | #define SIMD_XOR(x, y) _mm256_xor_ps(x, y) 53 | #define SIMD_WIDTH 8 54 | 55 | #define SIMD_LOAD2(x, h) \ 56 | ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) 57 | #define SIMD_STORE2(x, d, h) \ 58 | ((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ 59 | : _mm256_storeu_ps(x, d)) 60 | 61 | #define INTV __m128i 62 | #endif 63 | 64 | union AVX_Data { 65 | #if defined(__AVX512__) 66 | __m512 data; 67 | #elif defined(__AVX256__) 68 | __m256 data; 69 | #endif 70 | // float data_f[16]; 71 | }; 72 | 73 | template 74 | inline void simd_store(float* dst, AVX_Data* src, bool half_precision) 75 | { 76 | size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); 77 | #pragma unroll 78 | for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); } 79 | } 80 | template 81 | inline void simd_load(AVX_Data* dst, float* src, bool half_precision) 82 | { 83 | size_t width = (half_precision ? 
1 : SIMD_WIDTH); 84 | #pragma unroll 85 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } 86 | } 87 | template 88 | inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) 89 | { 90 | #pragma unroll 91 | for (size_t i = 0; i < span; ++i) { 92 | dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a[i].data); 93 | } 94 | } 95 | template 96 | inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a) 97 | { 98 | #pragma unroll 99 | for (size_t i = 0; i < span; ++i) { 100 | dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a.data); 101 | } 102 | } 103 | template 104 | inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a) 105 | { 106 | #pragma unroll 107 | for (size_t i = 0; i < span; ++i) { 108 | dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r[i].data, src_a[i].data); 109 | } 110 | } 111 | template 112 | inline void simd_sqrt(AVX_Data* dst, AVX_Data* src) 113 | { 114 | #pragma unroll 115 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_SQRT(src[i].data); } 116 | } 117 | template 118 | inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 119 | { 120 | #pragma unroll 121 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r.data); } 122 | } 123 | template 124 | inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 125 | { 126 | #pragma unroll 127 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r[i].data); } 128 | } 129 | template 130 | inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 131 | { 132 | #pragma unroll 133 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r.data); } 134 | } 135 | template 136 | inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 137 | { 138 | #pragma unroll 139 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r[i].data); } 140 | } 141 | template 142 | inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 143 | { 144 | #pragma unroll 145 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); } 146 | } 147 | template 148 | inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 149 | { 150 | #pragma unroll 151 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r.data); } 152 | } 153 | template 154 | inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 155 | { 156 | #pragma unroll 157 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r[i].data); } 158 | } 159 | template 160 | inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 161 | { 162 | #pragma unroll 163 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r.data); } 164 | } 165 | template 166 | inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 167 | { 168 | #pragma unroll 169 | for (size_t i = 0; i < span; ++i) { 170 | dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r[i].data); 171 | } 172 | } 173 | template 174 | inline void simd_or(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 175 | { 176 | #pragma unroll 177 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r.data); } 178 | } 179 | template 180 | inline void simd_or(AVX_Data* dst, AVX_Data* 
src_a_l, AVX_Data* src_a_r) 181 | { 182 | #pragma unroll 183 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r[i].data); } 184 | } 185 | template 186 | inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) 187 | { 188 | #pragma unroll 189 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r.data); } 190 | } 191 | template 192 | inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) 193 | { 194 | #pragma unroll 195 | for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r[i].data); } 196 | } 197 | 198 | #endif 199 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/includes/type_shim.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ 7 | #include 8 | #include 9 | #include 10 | 11 | // Forward/backward compatibility hack around 12 | // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288 13 | // pending more future-proof guidance from upstream. 14 | // struct TypeShim 15 | // { 16 | // const at::Type& payload; 17 | // TypeShim(const at::Type& type) : payload(type) {} 18 | // // Enable trivial conversion to a const at::Type& for pre-3aeb78 19 | // operator const at::Type&(){ return payload; }; 20 | // // Enable dispatch switch statements to take *this directly for post-3aeb78 21 | // //operator at::ScalarType(){ return payload.; }; 22 | // }; 23 | 24 | #define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ 25 | switch (TYPE) { \ 26 | case at::ScalarType::Float: { \ 27 | using scalar_t_##LEVEL = float; \ 28 | __VA_ARGS__; \ 29 | break; \ 30 | } \ 31 | case at::ScalarType::Half: { \ 32 | using scalar_t_##LEVEL = at::Half; \ 33 | __VA_ARGS__; \ 34 | break; \ 35 | } \ 36 | case at::ScalarType::BFloat16: { \ 37 | using scalar_t_##LEVEL = at::BFloat16; \ 38 | __VA_ARGS__; \ 39 | break; \ 40 | } \ 41 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 42 | } 43 | 44 | #define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ 45 | switch (TYPE) { \ 46 | case at::ScalarType::Double: { \ 47 | using scalar_t_##LEVEL = double; \ 48 | __VA_ARGS__; \ 49 | break; \ 50 | } \ 51 | case at::ScalarType::Float: { \ 52 | using scalar_t_##LEVEL = float; \ 53 | __VA_ARGS__; \ 54 | break; \ 55 | } \ 56 | case at::ScalarType::Half: { \ 57 | using scalar_t_##LEVEL = at::Half; \ 58 | __VA_ARGS__; \ 59 | break; \ 60 | } \ 61 | case at::ScalarType::BFloat16: { \ 62 | using scalar_t_##LEVEL = at::BFloat16; \ 63 | __VA_ARGS__; \ 64 | break; \ 65 | } \ 66 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 67 | } 68 | 69 | #define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \ 70 | switch (TYPE) { \ 71 | case at::ScalarType::Double: { \ 72 | using scalar_t_##LEVEL = double; \ 73 | __VA_ARGS__; \ 74 | break; \ 75 | } \ 76 | case at::ScalarType::Float: { \ 77 | using scalar_t_##LEVEL = float; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 82 | } 83 | 84 | template 85 | __dpct_inline__ T 86 | reduce_block_into_lanes(T* x, 87 | T val, 88 | int lanes = 1, 89 | bool share_result = false) // lanes is intended to be <= 32. 
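// Block-wide tree reduction: `val` is reduced through the shared buffer `x` and finished with a
// sub-group shuffle; the result lands in the first `lanes` lanes and, when `share_result` is
// true, it is written back to `x` so every thread in the block can read it.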
90 | { 91 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 92 | int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); 93 | int blockSize = item_ct1.get_local_range(2) * 94 | item_ct1.get_local_range(1); // blockSize is intended to be a multiple of 32. 95 | 96 | if (blockSize >= 64) { 97 | x[tid] = val; 98 | /* 99 | DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control 100 | flow. You may need to adjust the code. 101 | */ 102 | /* 103 | DPCT1065:6: Consider replacing sycl::nd_item::barrier() with 104 | sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if 105 | there is no access to global memory. 106 | */ 107 | item_ct1.barrier(); 108 | } 109 | 110 | #pragma unroll 111 | for (int i = (blockSize >> 1); i >= 64; i >>= 1) { 112 | if (tid < i) x[tid] = x[tid] + x[tid + i]; 113 | /* 114 | DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control 115 | flow. You may need to adjust the code. 116 | */ 117 | /* 118 | DPCT1065:7: Consider replacing sycl::nd_item::barrier() with 119 | sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if 120 | there is no access to global memory. 121 | */ 122 | item_ct1.barrier(); 123 | } 124 | 125 | T final; 126 | 127 | if (tid < 32) { 128 | if (blockSize >= 64) 129 | final = x[tid] + x[tid + 32]; 130 | else 131 | final = val; 132 | // __SYNCWARP(); 133 | 134 | #pragma unroll 135 | for (int i = 16; i >= lanes; i >>= 1) 136 | final = final + __shfl_down_sync(0xffffffff, final, i); 137 | } 138 | 139 | if (share_result) { 140 | if (tid < lanes) x[tid] = final; // EpilogueOp 141 | // Make sure the smem result is visible to all warps. 142 | /* 143 | DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control 144 | flow. You may need to adjust the code. 145 | */ 146 | /* 147 | DPCT1065:8: Consider replacing sycl::nd_item::barrier() with 148 | sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if 149 | there is no access to global memory. 150 | */ 151 | item_ct1.barrier(); 152 | } 153 | 154 | return final; 155 | } 156 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/quantization/dequantize.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "dequantization_utils.h" 9 | #include "memory_access_utils.h" 10 | 11 | template 12 | void dequantize_kernel(T* __restrict__ dequant_data, 13 | const int8_t* __restrict__ q_data, 14 | const float* __restrict__ q_params, 15 | int elems_per_group, 16 | int total_elems) 17 | { 18 | dequantize::to_global( 19 | dequant_data, q_data, q_params, elems_per_group, total_elems); 20 | } 21 | 22 | /* 23 | DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device 24 | limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
25 | */ 26 | #define LAUNCH_DEQUANT_KERNEL(num_bits, q_type) \ 27 | { \ 28 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp64, sycl::aspect::fp16}); \ 29 | stream->submit([&](sycl::handler& cgh) { \ 30 | T* dequant_data_ct0 = dequant_data; \ 31 | const int8_t* q_data_ct1 = q_data; \ 32 | const float* q_params_ct2 = q_params; \ 33 | auto elems_per_group_ct3 = elems_per_group; \ 34 | auto total_elems_ct4 = total_elems; \ 35 | \ 36 | cgh.parallel_for( \ 37 | sycl::nd_range<3>(grid * block, block), \ 38 | [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ 39 | dequantize_kernel( \ 40 | dequant_data_ct0, q_data_ct1, q_params_ct2, elems_per_group_ct3, total_elems_ct4); \ 41 | }); \ 42 | }); \ 43 | } 44 | 45 | template 46 | void launch_dequantize_kernel(T* dequant_data, 47 | const int8_t* q_data, 48 | const float* q_params, 49 | quantize::Type q_type, 50 | int num_bits, 51 | int elems_per_group, 52 | int total_elems, 53 | dpct::queue_ptr stream) 54 | { 55 | constexpr int unroll = 8; 56 | constexpr int threads = 512; 57 | constexpr int elems_per_block = unroll * threads * dequantize::granularity / (sizeof(T)); 58 | 59 | const sycl::range<3> block(1, 1, threads); 60 | const sycl::range<3> grid(1, 1, (total_elems + elems_per_block - 1) / elems_per_block); 61 | 62 | // TODO(cmikeh2): It may make sense to tune unroll, there is perf benefit for large 63 | // problem sizes with this large unroll value. 64 | if (num_bits == 8 && q_type == quantize::Type::Symmetric) { 65 | LAUNCH_DEQUANT_KERNEL(8, quantize::Type::Symmetric); 66 | } else if (num_bits == 8 && q_type == quantize::Type::Asymmetric) { 67 | LAUNCH_DEQUANT_KERNEL(8, quantize::Type::Asymmetric); 68 | } else if (num_bits == 4 && q_type == quantize::Type::Symmetric) { 69 | LAUNCH_DEQUANT_KERNEL(4, quantize::Type::Symmetric); 70 | } else if (num_bits == 4 && q_type == quantize::Type::Asymmetric) { 71 | LAUNCH_DEQUANT_KERNEL(4, quantize::Type::Asymmetric); 72 | } 73 | } 74 | 75 | template void launch_dequantize_kernel(sycl::half* dequant_data, 76 | const int8_t* q_data, 77 | const float* q_params, 78 | quantize::Type q_type, 79 | int num_bits, 80 | int elems_per_group, 81 | int total_elems, 82 | dpct::queue_ptr stream); 83 | 84 | template void launch_dequantize_kernel(float* dequant_data, 85 | const int8_t* q_data, 86 | const float* q_params, 87 | quantize::Type q_type, 88 | int num_bits, 89 | int elems_per_group, 90 | int total_elems, 91 | dpct::queue_ptr stream); 92 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/quantization/quantize.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "ds_kernel_utils.h" 9 | #include "memory_access_utils.h" 10 | #include "quantization.h" 11 | #include "quantization_utils.h" 12 | #include "reduction_utils.h" 13 | 14 | /* 15 | Pure quantization kernel with no fusion. 16 | */ 17 | template 23 | /* 24 | DPCT1110:46: The total declared local variable size in device function cached_quantization exceeds 25 | 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total 26 | register size available and adjust the code, or use smaller sub-group size to avoid high register 27 | pressure. 
28 | */ 29 | void cached_quantization(int8_t* __restrict__ output_data, 30 | float* __restrict__ params, 31 | const sycl::half* __restrict__ input_data, 32 | int groups, 33 | int elems_per_group) 34 | { 35 | sycl::group<3> tb = sycl::ext::oneapi::experimental::this_group<3>(); 36 | sycl::sub_group warp = sycl::ext::oneapi::experimental::this_sub_group(); 37 | 38 | // Indexing offsets 39 | const int block_offset = 40 | (tb.get_group_id()[2] * (max_threads / threads_per_group) * elems_per_group) + 41 | (tb.get_local_id()[1] * elems_per_group); 42 | const int elem_offset = tb.get_local_id()[2] * quantize::h_per_load; 43 | const int base_offset = block_offset + elem_offset; 44 | const int stride = sycl::ext::oneapi::experimental::this_group<3>().get_local_linear_range() * 45 | quantize::h_per_load; 46 | 47 | const sycl::half* input_base = input_data + base_offset; //.. 48 | 49 | sycl::half2 local_buffer[UNROLL * internal_unroll * quantize::h2_per_load]; 50 | 51 | #pragma unroll 52 | for (int i = 0; i < UNROLL; i++) { 53 | // Convenience helper, should resolve to register indices and not realize. 54 | sycl::half2* iteration_buffer = local_buffer + i * internal_unroll * quantize::h2_per_load; 55 | #pragma unroll 56 | for (int j = 0; j < internal_unroll; j++) { 57 | const int iteration = i * internal_unroll + j; 58 | mem_access::load_global( 59 | iteration_buffer + j * quantize::h2_per_load, 60 | input_base + iteration * stride, 61 | elem_offset + iteration * stride < elems_per_group); 62 | } 63 | } 64 | 65 | quantize:: 66 | local_array( 67 | local_buffer, params, output_data, elems_per_group, groups); 68 | } 69 | 70 | /********* Launcher methods ***********/ 71 | /* 72 | DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device 73 | limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
74 | */ 75 | #define LAUNCH_CACHED_QUANT_CALL(q_bits, quant_type) \ 76 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp64, sycl::aspect::fp16}); \ 77 | stream->submit([&](sycl::handler& cgh) { \ 78 | int8_t* output_data_ct0 = output_data; \ 79 | float* params_ct1 = params; \ 80 | const sycl::half* input_data_ct2 = input_data; \ 81 | int groups_ct3 = groups; \ 82 | int elems_per_group_ct4 = elems_per_group; \ 83 | \ 84 | cgh.parallel_for( \ 85 | sycl::nd_range<3>(grid * block, block), \ 86 | [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { \ 87 | cached_quantization( \ 93 | output_data_ct0, params_ct1, input_data_ct2, groups_ct3, elems_per_group_ct4); \ 94 | }); \ 95 | }); 96 | 97 | #define LAUNCH_CACHED_QUANT( \ 98 | q_bits, quant_type, unroll_factor_in, internal_unroll_in, threads_per_group_in) \ 99 | const int unroll_factor = unroll_factor_in; \ 100 | const int internal_unroll_l = internal_unroll_in; \ 101 | const int threads_per_group = threads_per_group_in; \ 102 | if (q_bits == 4) { \ 103 | if (quant_type == quantize::Type::Asymmetric) { \ 104 | LAUNCH_CACHED_QUANT_CALL(4, quantize::Type::Asymmetric) \ 105 | } else { \ 106 | LAUNCH_CACHED_QUANT_CALL(4, quantize::Type::Symmetric) \ 107 | } \ 108 | } else { \ 109 | if (quant_type == quantize::Type::Asymmetric) { \ 110 | LAUNCH_CACHED_QUANT_CALL(8, quantize::Type::Asymmetric) \ 111 | } else { \ 112 | LAUNCH_CACHED_QUANT_CALL(8, quantize::Type::Symmetric) \ 113 | } \ 114 | } 115 | 116 | void launch_quant(int8_t* output_data, 117 | float* params, 118 | const sycl::half* input_data, 119 | const int groups, 120 | const int elems_per_group, 121 | const int num_bits, 122 | const quantize::Type quant_type, 123 | dpct::queue_ptr stream) 124 | { 125 | constexpr int max_threads = 256; 126 | 127 | constexpr int internal_unroll = 2; 128 | 129 | const bool is_subblock_schedule = (elems_per_group <= 128) ? true : false; 130 | const int h_per_step = is_subblock_schedule ? quantize::h_per_load 131 | : quantize::h_per_load * internal_unroll; 132 | 133 | // Scheduling concern: may be slightly faster for some inputs to assign multiple stages of 134 | // warp-sized blocks rather than stepping up to 64/96 threads 135 | const int one_step_threads = next_pow2((elems_per_group + h_per_step - 1) / h_per_step); 136 | const int threads_per_group = (one_step_threads < max_threads) ? one_step_threads : max_threads; 137 | 138 | const int groups_per_block = 139 | is_subblock_schedule ? 
(max_threads + threads_per_group - 1) / threads_per_group : 1; 140 | const int groups_launch = (groups_per_block + groups - 1) / groups_per_block; 141 | 142 | sycl::range<3> block(1, groups_per_block, threads_per_group); 143 | sycl::range<3> grid(1, 1, groups_launch); 144 | 145 | const int elems_per_step = threads_per_group * h_per_step; 146 | const int external_unroll = (elems_per_group + elems_per_step - 1) / elems_per_step; 147 | 148 | if (is_subblock_schedule) { 149 | // <=128 150 | if (threads_per_group == 1) { 151 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 1); 152 | } else if (threads_per_group == 2) { 153 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 2); 154 | } else if (threads_per_group == 4) { 155 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 4); 156 | } else if (threads_per_group == 8) { 157 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 8); 158 | } else if (threads_per_group == 16) { 159 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, 1, 16); 160 | } 161 | } else if (external_unroll == 1) { 162 | // 129 - 4096 elems 163 | // (this can launch with 1-7 warps as well) 164 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 1, internal_unroll, max_threads); 165 | } else if (external_unroll == 2) { 166 | // 4097 - 8192 elems 167 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 2, internal_unroll, max_threads); 168 | } else if (external_unroll == 3) { 169 | // 8193 - 12288 elems 170 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 3, internal_unroll, max_threads); 171 | } else if (external_unroll == 4) { 172 | // 12289 - 16384 elems 173 | LAUNCH_CACHED_QUANT(num_bits, quant_type, 4, internal_unroll, max_threads); 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/quantization/quantize_intX.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include 9 | #include "memory_access_utils.h" 10 | #include 11 | 12 | template 13 | struct alignas(sizeof(T) * N) AlignedArray { 14 | using Element = T; 15 | static const int kElements = N; 16 | 17 | AlignedArray() {} 18 | 19 | AlignedArray(const T& rhs) 20 | { 21 | #pragma unroll 22 | for (int idx = 0; idx < kElements; ++idx) { this->at(idx) = rhs; } 23 | } 24 | 25 | T& operator[](int offset) 26 | { 27 | return reinterpret_cast(this->buffer[offset]); 28 | } 29 | 30 | const T& operator[](int offset) const 31 | { 32 | return reinterpret_cast(this->buffer[offset]); 33 | } 34 | 35 | T& at(int offset) { return reinterpret_cast(this->buffer[offset]); } 36 | 37 | const T& at(int offset) const 38 | { 39 | return reinterpret_cast(this->buffer[offset]); 40 | } 41 | 42 | AlignedArray operator+(const AlignedArray& rhs) const 43 | { 44 | AlignedArray ret; 45 | 46 | #pragma unroll 47 | for (int idx = 0; idx < kElements; ++idx) { ret[idx] = this->at(idx) + rhs.at(idx); } 48 | 49 | return ret; 50 | } 51 | 52 | __dpct_inline__ void clear() 53 | { 54 | #pragma unroll 55 | for (int idx = 0; idx < kElements; ++idx) { this->at(idx) = Element(0); } 56 | } 57 | 58 | Element buffer[N]; 59 | }; 60 | 61 | template 62 | struct reduce_max { 63 | __dpct_inline__ T operator()(const T& lhs, const T& rhs) 64 | { 65 | return lhs > rhs ? lhs : rhs; 66 | } 67 | }; 68 | 69 | template 70 | struct reduce_min { 71 | __dpct_inline__ T operator()(const T& lhs, const T& rhs) 72 | { 73 | return lhs < rhs ? 
lhs : rhs; 74 | } 75 | }; 76 | 77 | template 78 | struct subtract { 79 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 80 | { 81 | AlignedArray ret; 82 | 83 | #pragma unroll 84 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] - rhs; } 85 | 86 | return ret; 87 | } 88 | }; 89 | 90 | template 91 | struct plus { 92 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 93 | { 94 | AlignedArray ret; 95 | 96 | #pragma unroll 97 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] + rhs; } 98 | 99 | return ret; 100 | } 101 | }; 102 | 103 | template 104 | struct multiply { 105 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 106 | { 107 | AlignedArray ret; 108 | 109 | #pragma unroll 110 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] * rhs; } 111 | 112 | return ret; 113 | } 114 | }; 115 | 116 | template 117 | struct clamp { 118 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, 119 | const T& min_val, 120 | const T& max_val) 121 | { 122 | AlignedArray ret; 123 | 124 | #pragma unroll 125 | for (int idx = 0; idx < N; ++idx) { 126 | ret[idx] = reduce_max()(reduce_min()(lhs[idx], max_val), min_val); 127 | } 128 | 129 | return ret; 130 | } 131 | }; 132 | 133 | template 134 | struct round_int; 135 | 136 | template 137 | struct round_int { 138 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs) 139 | { 140 | AlignedArray ret; 141 | 142 | #pragma unroll 143 | for (int idx = 0; idx < N; ++idx) { ret[idx] = hrint(lhs[idx]); } 144 | 145 | return ret; 146 | } 147 | }; 148 | 149 | template 150 | struct divide { 151 | __dpct_inline__ AlignedArray operator()(const AlignedArray& lhs, const T& rhs) 152 | { 153 | AlignedArray ret; 154 | 155 | #pragma unroll 156 | for (int idx = 0; idx < N; ++idx) { ret[idx] = lhs[idx] / rhs; } 157 | 158 | return ret; 159 | } 160 | }; 161 | 162 | template 163 | __dpct_inline__ T to_scalar(const AlignedArray& data) 164 | { 165 | Reducer re; 166 | T res = data[0]; 167 | 168 | #pragma unroll 169 | for (int idx = 1; idx < N; ++idx) { res = re(res, data[idx]); } 170 | 171 | return res; 172 | } 173 | 174 | template 175 | __dpct_inline__ AlignedArray int4_to_half(const AlignedArray& data) 176 | { 177 | AlignedArray ret; 178 | 179 | #pragma unroll 180 | for (int idx = 0; idx < N * 2; idx += 2) { 181 | ret[idx] = sycl::half(int(data[idx / 2] >> 4)); 182 | ret[idx + 1] = sycl::half(int(data[idx / 2] & 0xf)); 183 | } 184 | 185 | return ret; 186 | } 187 | 188 | void dequantize_int4_to_half(uint8_t* data_in, 189 | sycl::half* data_out, 190 | sycl::half* scale_buffer, 191 | sycl::half* min_val_buffer, 192 | int num_group, 193 | int group_size) 194 | { 195 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 196 | using AccessType = AlignedArray; 197 | using AccessTypeOut = AlignedArray; 198 | 199 | for (int idx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); 200 | idx < num_group * group_size / 8; 201 | idx += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { 202 | int id_group = idx / (group_size / 8); 203 | AccessType value = reinterpret_cast(data_in)[idx]; 204 | sycl::half scale = scale_buffer[id_group]; 205 | sycl::half min_value = min_val_buffer[id_group]; 206 | 207 | AccessTypeOut output = int4_to_half(value); 208 | output = divide()(output, scale); 209 | output = plus()(output, min_value); 210 | 211 | reinterpret_cast(data_out)[idx] = output; 212 | } 213 | } 214 | 215 | void 
launch_dequantize_int4_to_half_experimental(uint8_t* data_in, 216 | sycl::half* data_out, 217 | sycl::half* scale_buffer, 218 | sycl::half* min_val_buffer, 219 | int num_group, 220 | int group_size, 221 | dpct::queue_ptr stream) 222 | { 223 | int num_warp = num_group / 4; 224 | int num_block = num_warp / 8; // 256 trd / block 225 | 226 | { 227 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 228 | stream->parallel_for( 229 | sycl::nd_range<3>(sycl::range<3>(1, 1, num_block) * sycl::range<3>(1, 1, 256), 230 | sycl::range<3>(1, 1, 256)), 231 | [=](sycl::nd_item<3> item_ct1) { 232 | dequantize_int4_to_half( 233 | data_in, data_out, scale_buffer, min_val_buffer, num_group, group_size); 234 | }); 235 | } 236 | } 237 | 238 | template 239 | __dpct_inline__ AlignedArray int8_to_half(const AlignedArray& data) 240 | { 241 | AlignedArray ret; 242 | 243 | #pragma unroll 244 | for (int idx = 0; idx < N; idx += 1) { ret[idx] = sycl::half(int(data[idx])); } 245 | 246 | return ret; 247 | } 248 | 249 | void dequantize_int8_to_half(uint8_t* data_in, 250 | sycl::half* data_out, 251 | sycl::half* scale_buffer, 252 | sycl::half* min_val_buffer, 253 | int num_group, 254 | int group_size) 255 | { 256 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 257 | using AccessType = AlignedArray; 258 | using AccessTypeOut = AlignedArray; 259 | 260 | for (int idx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); 261 | idx < num_group * group_size / 8; 262 | idx += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { 263 | int id_group = idx / (group_size / 8); 264 | AccessType value = reinterpret_cast(data_in)[idx]; 265 | sycl::half scale = scale_buffer[id_group]; 266 | sycl::half min_value = min_val_buffer[id_group]; 267 | 268 | AccessTypeOut output = int8_to_half(value); 269 | output = divide()(output, scale); 270 | output = plus()(output, min_value); 271 | 272 | reinterpret_cast(data_out)[idx] = output; 273 | } 274 | } 275 | 276 | void launch_dequantize_int8_to_half_experimental(uint8_t* data_in, 277 | sycl::half* data_out, 278 | sycl::half* scale_buffer, 279 | sycl::half* min_val_buffer, 280 | int num_group, 281 | int group_size, 282 | dpct::queue_ptr stream) 283 | { 284 | int num_warp = num_group / 4; 285 | int num_block = num_warp / 8; // 256 trd / block 286 | 287 | { 288 | dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); 289 | stream->parallel_for( 290 | sycl::nd_range<3>(sycl::range<3>(1, 1, num_block) * sycl::range<3>(1, 1, 256), 291 | sycl::range<3>(1, 1, 256)), 292 | [=](sycl::nd_item<3> item_ct1) { 293 | dequantize_int8_to_half( 294 | data_in, data_out, scale_buffer, min_val_buffer, num_group, group_size); 295 | }); 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/csrc/dequantize.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "inference_cuda_layers.h" 10 | 11 | #define MAX_QUANTIZE_GROUPING 1024 12 | 13 | #define loop_unroll 1 14 | #define loop_unroll_bits 1 15 | 16 | template 17 | void dequantize_kernel(T* output, 18 | const int8_t* input, 19 | const float* qscale, 20 | int output_size, 21 | int hidden_dim, 22 | int groups, 23 | int merge_count) 24 | { 25 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 26 | unsigned merge_hidden = hidden_dim >> merge_count; 27 | unsigned quantization_stride = (merge_hidden * output_size) / groups; 28 | 29 | unsigned bid = item_ct1.get_group(2); 30 | unsigned tid = item_ct1.get_local_id(2); 31 | 32 | while (tid < output_size) { 33 | unsigned w_index = bid / merge_hidden; 34 | unsigned q_index = tid + bid * output_size; 35 | 36 | auto q = input[q_index]; 37 | 38 | unsigned merge_hidden_total = w_index * merge_hidden; 39 | unsigned scale_index = 40 | ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) 41 | << merge_count) + 42 | w_index; 43 | 44 | float scale_data = qscale[scale_index]; 45 | 46 | output[q_index] = conversion::to(scale_data * (float)q); 47 | tid += item_ct1.get_local_range(2); 48 | } 49 | } 50 | 51 | template 52 | void launch_dequantize(T* output, 53 | const int8_t* input, 54 | const float* qscale, 55 | unsigned output_size, 56 | unsigned hidden_dim, 57 | unsigned groups, 58 | unsigned merge_count, 59 | dpct::queue_ptr stream) 60 | { 61 | unsigned threads = 1024; 62 | sycl::range<3> block_dims(1, 1, threads); 63 | sycl::range<3> grid_dims(1, 1, hidden_dim); 64 | 65 | /* 66 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the 67 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
68 | */ 69 | { 70 | dpct::has_capability_or_fail(stream->get_device(), 71 | {sycl::aspect::fp64, sycl::aspect::fp16}); 72 | stream->parallel_for( 73 | sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { 74 | dequantize_kernel( 75 | output, input, qscale, output_size, hidden_dim, groups, merge_count); 76 | }); 77 | } 78 | } 79 | 80 | #define INSTANTIATE_DEQUANTIZE_MERGE(T) \ 81 | template void launch_dequantize( \ 82 | T*, const int8_t*, const float*, unsigned, unsigned, unsigned, unsigned, dpct::queue_ptr); 83 | 84 | INSTANTIATE_DEQUANTIZE_MERGE(float); 85 | #ifdef BF16_AVAILABLE 86 | INSTANTIATE_DEQUANTIZE_MERGE(sycl::ext::oneapi::bfloat16); 87 | #endif 88 | INSTANTIATE_DEQUANTIZE_MERGE(sycl::half); 89 | 90 | void dequantize_kernel(float* output, 91 | const int8_t* input, 92 | const float* qscale, 93 | int hidden_dim, 94 | unsigned merge_hidden, 95 | int cnt) 96 | { 97 | } 98 | 99 | template 100 | void dequantize_kernel(T* output, 101 | const int8_t* input, 102 | const float* qscale, 103 | unsigned hidden_dim, 104 | unsigned merge_hidden, 105 | int cnt) 106 | { 107 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 108 | unsigned bid = item_ct1.get_group(2) * item_ct1.get_group_range(1) + item_ct1.get_group(1); 109 | unsigned tid = item_ct1.get_local_id(2); 110 | 111 | float local_scale = qscale[item_ct1.get_group(2)]; 112 | 113 | const float* input_cast = reinterpret_cast(input); 114 | sycl::float2* output_cast = reinterpret_cast(output); 115 | 116 | input_cast += bid * merge_hidden; 117 | output_cast += bid * merge_hidden; 118 | 119 | for (int c = 0; c < cnt; c++) { 120 | if (tid < merge_hidden) { 121 | float q = input_cast[tid]; 122 | int8_t* q_int8 = (int8_t*)&q; 123 | 124 | sycl::float2 q_f; 125 | T* q_h = (T*)&q_f; 126 | 127 | q_h[0] = conversion::to(local_scale * (float)q_int8[0]); 128 | q_h[1] = conversion::to(local_scale * (float)q_int8[1]); 129 | q_h[2] = conversion::to(local_scale * (float)q_int8[2]); 130 | q_h[3] = conversion::to(local_scale * (float)q_int8[3]); 131 | output_cast[tid] = q_f; 132 | tid += item_ct1.get_local_range(2); 133 | } 134 | } 135 | } 136 | 137 | template 138 | void launch_dequantize(T* output, 139 | const int8_t* input, 140 | const float* qscale, 141 | unsigned output_size, 142 | unsigned hidden_dim, 143 | unsigned groups, 144 | dpct::queue_ptr stream) 145 | { 146 | unsigned threads = 1024; 147 | hidden_dim /= 4; 148 | unsigned thd_cnt = (hidden_dim - 1) / threads + 1; 149 | 150 | assert(output_size % groups == 0); 151 | unsigned blocks = output_size / groups; 152 | 153 | sycl::range<3> block_dims(1, 1, threads); 154 | sycl::range<3> grid_dims(1, blocks, groups); 155 | 156 | /* 157 | DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the limit. To get the 158 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
159 | */ 160 | { 161 | dpct::has_capability_or_fail(stream->get_device(), 162 | {sycl::aspect::fp64, sycl::aspect::fp16}); 163 | stream->parallel_for( 164 | sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { 165 | dequantize_kernel(output, input, qscale, hidden_dim, hidden_dim, thd_cnt); 166 | }); 167 | } 168 | } 169 | 170 | #define INSTANTIATE_DEQUANTIZE_NO_MERGE(T) \ 171 | template void launch_dequantize( \ 172 | T*, const int8_t*, const float*, unsigned, unsigned, unsigned, dpct::queue_ptr); 173 | 174 | INSTANTIATE_DEQUANTIZE_NO_MERGE(float); 175 | #ifdef BF16_AVAILABLE 176 | INSTANTIATE_DEQUANTIZE_NO_MERGE(sycl::ext::oneapi::bfloat16); 177 | #endif 178 | INSTANTIATE_DEQUANTIZE_NO_MERGE(sycl::half); 179 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/csrc/pointwise_ops.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "ds_kernel_utils.h" 10 | #include "memory_access_utils.h" 11 | 12 | namespace pwise { 13 | constexpr int granularity = 16; 14 | constexpr int unroll = 4; 15 | constexpr int threads = 256; 16 | } // namespace pwise 17 | 18 | template 19 | void vector_add_kernel(T* out, const T* a, const T* b, float gamma, int num_elems) 20 | { 21 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 22 | constexpr int T_per_access = pwise::granularity / sizeof(T); 23 | 24 | const int block_offset = item_ct1.get_group(2) * pwise::threads * pwise::unroll * T_per_access; 25 | const int thread_offset = item_ct1.get_local_id(2) * T_per_access; 26 | const int total_offset = block_offset + thread_offset; 27 | constexpr int stride = pwise::threads * T_per_access; 28 | 29 | #pragma unroll 30 | for (int i = 0; i < pwise::unroll; i++) { 31 | T temp_buf_a[T_per_access], temp_buf_b[T_per_access]; 32 | 33 | const int iter_idx = total_offset + i * stride; 34 | 35 | mem_access::load_global(temp_buf_a, a + iter_idx, iter_idx < num_elems); 36 | mem_access::load_global(temp_buf_b, b + iter_idx, iter_idx < num_elems); 37 | 38 | #pragma unroll 39 | for (int j = 0; j < T_per_access; j++) { 40 | float up_cast_a = conversion::to(temp_buf_a[j]); 41 | float up_cast_b = conversion::to(temp_buf_b[j]); 42 | temp_buf_a[j] = conversion::to((gamma * up_cast_a) + up_cast_b); 43 | } 44 | 45 | if (iter_idx < num_elems) { 46 | mem_access::store_global(out + iter_idx, temp_buf_a); 47 | } 48 | } 49 | } 50 | 51 | template 52 | void launch_vector_add(T* out, 53 | const T* a, 54 | const T* b, 55 | float gamma, 56 | int num_elems, 57 | dpct::queue_ptr stream) 58 | { 59 | constexpr int T_per_access = pwise::granularity / sizeof(T); 60 | constexpr int T_per_block = pwise::threads * T_per_access * pwise::unroll; 61 | 62 | sycl::range<3> block(1, 1, pwise::threads); 63 | sycl::range<3> grid(1, 1, (num_elems + T_per_block - 1) / T_per_block); 64 | 65 | { 66 | dpct::has_capability_or_fail(stream->get_device(), 67 | {sycl::aspect::fp64, sycl::aspect::fp16}); 68 | stream->parallel_for(sycl::nd_range<3>(grid * block, block), 69 | [=](sycl::nd_item<3> item_ct1) { 70 | vector_add_kernel(out, a, b, gamma, num_elems); 71 | }); 72 | } 73 | } 74 | 75 | #define INSTANTIATE_VECTOR_ADD(T) \ 76 | template void launch_vector_add( \ 77 | T * out, const T* a, const T* b, 
float gamma, int num_elems, dpct::queue_ptr stream); 78 | 79 | INSTANTIATE_VECTOR_ADD(float) 80 | INSTANTIATE_VECTOR_ADD(sycl::half) 81 | #ifdef BF16_AVAILABLE 82 | INSTANTIATE_VECTOR_ADD(sycl::ext::oneapi::bfloat16) 83 | #endif 84 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/csrc/relu.dp.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #include 7 | #include 8 | #include "conversion_utils.h" 9 | #include "inference_cuda_layers.h" 10 | #include "memory_access_utils.h" 11 | 12 | #define MAX_CAP 4 13 | #define MAX_SEQ 2048 14 | 15 | inline float relu(const float x) { return x < 0 ? 0 : x; } 16 | 17 | /* 18 | In-place relu(biasAdd(x)) for channels last 19 | */ 20 | template 21 | void fused_bias_relu(T* input, const T* bias, int total_count, int intermediate_size) 22 | { 23 | // Input restriction: intermediate_size % vals_per_access == 0 24 | auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); 25 | constexpr int granularity = 16; 26 | constexpr int values_per_access = granularity / sizeof(T); 27 | const int offset = 28 | (item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2)) * 29 | values_per_access; 30 | 31 | if (offset < total_count) { 32 | T data[values_per_access]; 33 | T data_bias[values_per_access]; 34 | mem_access::load_global(data, input + offset); 35 | mem_access::load_global( 36 | data_bias, bias + (offset % intermediate_size), bias != nullptr); 37 | 38 | #pragma unroll 39 | for (int i = 0; i < values_per_access; i++) { 40 | float data_f = conversion::to(data[i]); 41 | float bias_f = conversion::to(data_bias[i]); 42 | data[i] = conversion::to(relu(data_f + bias_f)); 43 | } 44 | 45 | mem_access::store_global(input + offset, data); 46 | } 47 | } 48 | 49 | template 50 | void launch_bias_relu(T* input, 51 | const T* bias, 52 | int intermediate_size, 53 | int batch_size, 54 | dpct::queue_ptr stream) 55 | { 56 | constexpr int threads = 1024; 57 | constexpr int granularity = 16; 58 | 59 | const int total_count = batch_size * intermediate_size; 60 | const int elems_per_block = threads * (granularity / sizeof(T)); 61 | sycl::range<3> block_dims(1, 1, threads); 62 | sycl::range<3> grid_dims(1, 1, (total_count + elems_per_block - 1) / elems_per_block); 63 | 64 | /* 65 | DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the limit. To get the 66 | device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
67 | */ 68 | { 69 | dpct::has_capability_or_fail(stream->get_device(), 70 | {sycl::aspect::fp64, sycl::aspect::fp16}); 71 | stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), 72 | [=](sycl::nd_item<3> item_ct1) { 73 | fused_bias_relu(input, bias, total_count, intermediate_size); 74 | }); 75 | } 76 | } 77 | 78 | #define INSTANTIATE_LAUNCH_BIAS_RELU(T) \ 79 | template void launch_bias_relu(T*, const T*, int, int, dpct::queue_ptr); 80 | 81 | INSTANTIATE_LAUNCH_BIAS_RELU(float) 82 | #ifdef BF16_AVAILABLE 83 | INSTANTIATE_LAUNCH_BIAS_RELU(sycl::ext::oneapi::bfloat16) 84 | #endif 85 | INSTANTIATE_LAUNCH_BIAS_RELU(sycl::half) 86 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/csrc/transformer/inference/includes/inference_cuda_layers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // DeepSpeed Team 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include "ds_kernel_utils.h" 11 | 12 | #ifdef BF16_AVAILABLE 13 | #endif 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define MAX_WARP_NUM 32 20 | #define WARP_SIZE 32 21 | 22 | #define MAX_THREADS 1024 23 | #define SMs 80 24 | 25 | #define MAX_REGISTERS 256 26 | 27 | template 28 | void launch_attn_softmax_v2(T* vals, 29 | T* mask, 30 | T* alibi, 31 | float layer_scale, 32 | bool triangular, 33 | bool recompute, 34 | bool local_attention, 35 | int window_size, 36 | int batch_size, 37 | int heads, 38 | int num_seq, 39 | int sequence_length, 40 | int offset, 41 | int mask_stride, 42 | int mp_size, 43 | dpct::queue_ptr stream); 44 | 45 | // Fused bias add with gelu activation 46 | template 47 | void launch_bias_gelu(T* input, 48 | const T* bias, 49 | int intermediate_size, 50 | int batch_size, 51 | dpct::queue_ptr stream); 52 | 53 | template 54 | void launch_gated_activation(T* output, 55 | const T* activation, 56 | const T* bias, 57 | int rows, 58 | int output_stride, 59 | int elems_per_row, 60 | bool use_gelu, 61 | dpct::queue_ptr stream); 62 | 63 | // Fused bias add with relu activation 64 | template 65 | void launch_bias_relu(T* input, 66 | const T* bias, 67 | int intermediate_size, 68 | int batch_size, 69 | dpct::queue_ptr stream); 70 | 71 | template 72 | void launch_bias_add(T* input, 73 | const T* bias, 74 | int hidden_size, 75 | int batch_size, 76 | dpct::queue_ptr stream); 77 | 78 | template 79 | void launch_bias_residual(T* input, 80 | T* output, 81 | T* attn, 82 | T* bias, 83 | T* attn_bias, 84 | int batch, 85 | int hidden_dim, 86 | int mp_size, 87 | bool preln, 88 | dpct::queue_ptr stream); 89 | 90 | template 91 | void launch_fused_ln(T* output, 92 | const T* vals, 93 | const T* gamma, 94 | const T* beta, 95 | float epsilon, 96 | int rows, 97 | int elems_per_row, 98 | dpct::queue_ptr stream); 99 | 100 | template 101 | void launch_fused_residual_ln(T* output, 102 | const T* vals, 103 | const T* residual, 104 | const T* bias, 105 | const T* gamma, 106 | const T* beta, 107 | float epsilon, 108 | int rows, 109 | int elems_per_row, 110 | dpct::queue_ptr stream); 111 | 112 | template 113 | void launch_fused_residual_ln_store_pre_ln_res(T* norm_output, 114 | T* res_output, 115 | const T* vals, 116 | const T* residual, 117 | const T* bias, 118 | const T* gamma, 119 | const T* beta, 120 | float epsilon, 121 | int rows, 122 | int elems_per_row, 123 | dpct::queue_ptr stream); 124 | 125 | template 126 
| void launch_rms_norm(T* norm_output, 127 | T* res_output, 128 | const T* vals, 129 | const T* residual, 130 | const T* gamma, 131 | float epsilon, 132 | int rows, 133 | int elems_per_row, 134 | dpct::queue_ptr stream); 135 | 136 | template <typename T> 137 | void launch_dequantize(T* output, 138 | const int8_t* input, 139 | const float* qscale, 140 | unsigned output_size, 141 | unsigned hidden_dim, 142 | unsigned groups, 143 | unsigned merge_count, 144 | dpct::queue_ptr stream); 145 | 146 | template <typename T> 147 | void launch_dequantize(T* output, 148 | const int8_t* input, 149 | const float* qscale, 150 | unsigned output_size, 151 | unsigned hidden_dim, 152 | unsigned groups, 153 | dpct::queue_ptr stream); 154 | template <typename T> 155 | void launch_gptj_residual_add(T* input, 156 | T* output, 157 | T* attn, 158 | T* bias, 159 | T* attn_bias, 160 | int batch, 161 | int head_size, 162 | int mp_size, 163 | dpct::queue_ptr stream); 164 | 165 | template <typename T> 166 | void launch_apply_rotary_pos_emb(T* mixed_query, 167 | T* key_layer, 168 | unsigned head_size, 169 | unsigned seq_len, 170 | unsigned rotary_dim, 171 | unsigned offset, 172 | unsigned num_heads, 173 | unsigned batch, 174 | float rope_theta, 175 | dpct::queue_ptr stream, 176 | int max_out_tokens); 177 | 178 | template <typename T> 179 | void launch_moe_res_matmul(T* residual, 180 | T* coef, 181 | T* mlp_out, 182 | int seq_len, 183 | int hidden_dim, 184 | dpct::queue_ptr stream); 185 | 186 | // 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3] 187 | template <typename T> 188 | void launch_transform4d_0213(T* out, 189 | const T* in, 190 | int batch_size, 191 | int heads, 192 | int seq_length, 193 | int hidden_dim, 194 | dpct::queue_ptr stream, 195 | int trans_count); 196 | template <typename T> 197 | void launch_bias_add_transform_0213(T* outputs, 198 | T* vals, 199 | T* vals1, 200 | const T* vals2, 201 | const T* bias, 202 | int batch_size, 203 | int seq_length, 204 | unsigned seq_offset, 205 | int seq_length1, 206 | int hidden_dim, 207 | int heads, 208 | int num_kv, 209 | int rotary_dim, 210 | bool rotate_half, 211 | bool rotate_every_two, 212 | dpct::queue_ptr stream, 213 | int trans_count, 214 | int max_out_tokens, 215 | float rope_theta); 216 | template <typename T> 217 | void pad_data(T* padded_output, 218 | T* output, 219 | int bsz, 220 | int head_size, 221 | int padded_head_size, 222 | dpct::queue_ptr stream); 223 | 224 | template <typename T> 225 | void pad_head_seq(T* padded_output, 226 | T* output, 227 | int bsz, 228 | int seq_len, 229 | int padded_seq_len, 230 | int head_size, 231 | int padded_head_size, 232 | dpct::queue_ptr stream); 233 | 234 | template <typename T> 235 | void launch_pad_add_transform_0213(T* output, 236 | const T* vals, 237 | int batch_size, 238 | int hidden_dim, 239 | int seq_length, 240 | int padded_seq_len, 241 | int heads, 242 | int padded_head_size, 243 | dpct::queue_ptr stream); 244 | 245 | template <typename T> 246 | void launch_vector_add(T* out, 247 | const T* a, 248 | const T* b, 249 | float gamma, 250 | int num_elems, 251 | dpct::queue_ptr stream); 252 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/flash_attn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | from torch.autograd import Function 7 | from deepspeed.ops.op_builder.builder import OpBuilder, TORCH_MAJOR, TORCH_MINOR 8 | import fmha_module 9 | 10 | 11 | class FlashAttnFunc(Function): 12 | 13 | @staticmethod 14 | def forward(ctx, query, key,
value, dropout_p, softmax_scale, is_causal): 15 | """ 16 | Shape of qkv and out: [Bs, Hn, Sl, Hs] 17 | Bs: batch size 18 | Hn: head number 19 | Sl: sequence length 20 | Hs: head size 21 | """ 22 | bs, hn, sl, hs = query.shape 23 | if softmax_scale is None: 24 | softmax_scale = hs ** (-0.5) 25 | dropout_seed = torch.seed() 26 | is_training = True 27 | is_dropout = (dropout_p != 0) 28 | 29 | out, softmax_L = fmha_module.flash_attn_fwd( 30 | query, key, value, bs, hn, sl, hs, softmax_scale, 31 | dropout_p, dropout_seed, 32 | is_causal, is_training, is_dropout 33 | ) 34 | 35 | ctx.save_for_backward(query, key, value, out, softmax_L) 36 | ctx.dropout_p = dropout_p 37 | ctx.dropout_seed = dropout_seed 38 | ctx.softmax_scale = softmax_scale 39 | ctx.is_causal = is_causal 40 | ctx.is_dropout = is_dropout 41 | 42 | return out 43 | 44 | @staticmethod 45 | def backward(ctx, dout, *args): 46 | q, k, v, out, softmax_L = ctx.saved_tensors 47 | bs, hn, sl, hs = q.shape 48 | 49 | dq, dk, dv = fmha_module.flash_attn_bwd( 50 | dout, q, k, v, out, bs, hn, sl, hs, ctx.softmax_scale, 51 | ctx.dropout_p, ctx.dropout_seed, 52 | ctx.is_causal, ctx.is_dropout, softmax_L 53 | ) 54 | return dq, dk, dv, None, None, None 55 | 56 | 57 | class FlashAttentionBuilderObject(): 58 | def __init__(self): 59 | pass 60 | 61 | # general functions 62 | def flash_attn_func_v2(self, q, k, v, 63 | dropout_p, softmax_scale, is_causal): 64 | if q.shape[-1] <= 256: 65 | return FlashAttnFunc.apply(q, k, v, dropout_p, softmax_scale, is_causal) 66 | else: 67 | return self.flash_attn_fwd_func(q, k, v, dropout_p) 68 | 69 | # forward functions 70 | def flash_attn_fwd_func(self, q, k, v, dropout_p): 71 | hs_rsqrt_scale = q.shape[-1] ** (-0.5) 72 | attention_scores = torch.matmul(q, k.transpose(-1, -2)) 73 | attention_scores = attention_scores * hs_rsqrt_scale 74 | 75 | triu_mask = (torch.triu(torch.ones_like(attention_scores), diagonal=1) == 1) 76 | attention_scores.masked_fill_(triu_mask, -torch.inf) 77 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 78 | 79 | attention_probs = nn.Dropout(dropout_p)(attention_probs) 80 | 81 | context_layer = torch.matmul(attention_probs, v) 82 | return context_layer 83 | 84 | 85 | 86 | class FlashAttentionBuilder(OpBuilder): 87 | BUILD_VAR = "DS_BUILD_FlashAttention" 88 | NAME = "flash_attn" 89 | 90 | def __init__(self): 91 | super().__init__(name=self.NAME) 92 | 93 | def absolute_name(self): 94 | return f'deepspeed.ops.{self.NAME}_op' 95 | 96 | def sources(self): 97 | return [ 98 | sycl_kernel_path('csrc/flash_attn/flash_attn.dp.cpp'), 99 | sycl_kernel_path('csrc/flash_attn/fmha_fwd.cpp'), 100 | sycl_kernel_path('csrc/flash_attn/fmha_bwd.cpp'), 101 | ] 102 | 103 | def include_paths(self): 104 | return [] 105 | 106 | def extra_ldflags(self): 107 | return [] 108 | 109 | def cxx_args(self): 110 | return [] 111 | 112 | def load(self): 113 | return FlashAttentionBuilderObject() 114 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/fused_adam.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 5 | 6 | 7 | class FusedAdamBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_FUSED_ADAM" 9 | NAME = "fused_adam" 10 | 11 | def __init__(self): 12 | super().__init__(name=self.NAME) 13 | 14 | def absolute_name(self): 15 | return f'deepspeed.ops.adam.{self.NAME}_op' 16 | 
17 | def sources(self): 18 | return [ 19 | sycl_kernel_path('csrc/adam/fused_adam_frontend.cpp'), 20 | sycl_kernel_path('csrc/adam/multi_tensor_adam.dp.cpp'), 21 | ] 22 | 23 | def include_paths(self): 24 | return [ 25 | sycl_kernel_include('csrc/includes'), 26 | sycl_kernel_include('csrc/adam'), 27 | ] 28 | 29 | def cxx_args(self): 30 | args = super().cxx_args() 31 | return args + self.version_dependent_macros() 32 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/quantizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 7 | 8 | 9 | class QuantizerBuilder(SYCLOpBuilder): 10 | BUILD_VAR = "DS_BUILD_QUANTIZER" 11 | NAME = "quantizer" 12 | 13 | def __init__(self, name=None): 14 | name = self.NAME if name is None else name 15 | super().__init__(name=name) 16 | 17 | def absolute_name(self): 18 | return f'deepspeed.ops.quantizer.{self.NAME}_op' 19 | 20 | def sources(self): 21 | return [ 22 | sycl_kernel_path('csrc/quantization/pt_binding.cpp'), 23 | sycl_kernel_path('csrc/quantization/fake_quantizer.dp.cpp'), 24 | sycl_kernel_path('csrc/quantization/quantize.dp.cpp'), 25 | sycl_kernel_path('csrc/quantization/quantize_intX.dp.cpp'), 26 | sycl_kernel_path('csrc/quantization/dequantize.dp.cpp'), 27 | sycl_kernel_path('csrc/quantization/swizzled_quantize.dp.cpp'), 28 | sycl_kernel_path('csrc/quantization/quant_reduce.dp.cpp'), 29 | ] 30 | 31 | def include_paths(self): 32 | return [sycl_kernel_include('csrc/includes')] 33 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/transformer_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include 7 | 8 | 9 | class InferenceBuilder(SYCLOpBuilder): 10 | BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" 11 | NAME = "transformer_inference" 12 | 13 | def __init__(self, name=None): 14 | name = self.NAME if name is None else name 15 | super().__init__(name=name) 16 | 17 | def absolute_name(self): 18 | return f'deepspeed.ops.transformer.inference.{self.NAME}_op' 19 | 20 | def is_compatible(self, verbose=True): 21 | return super().is_compatible(verbose) 22 | 23 | def cxx_args(self): 24 | args = super().cxx_args() 25 | args.append('-DBF16_AVAILABLE') 26 | return args 27 | 28 | def sources(self): 29 | return [ 30 | sycl_kernel_path('csrc/transformer/inference/csrc/pt_binding.cpp'), 31 | sycl_kernel_path('csrc/transformer/inference/csrc/gelu.dp.cpp'), 32 | sycl_kernel_path('csrc/transformer/inference/csrc/relu.dp.cpp'), 33 | sycl_kernel_path('csrc/transformer/inference/csrc/layer_norm.dp.cpp'), 34 | sycl_kernel_path('csrc/transformer/inference/csrc/rms_norm.dp.cpp'), 35 | sycl_kernel_path('csrc/transformer/inference/csrc/softmax.dp.cpp'), 36 | sycl_kernel_path('csrc/transformer/inference/csrc/dequantize.dp.cpp'), 37 | sycl_kernel_path('csrc/transformer/inference/csrc/apply_rotary_pos_emb.dp.cpp'), 38 | sycl_kernel_path('csrc/transformer/inference/csrc/transform.dp.cpp'), 39 | sycl_kernel_path('csrc/transformer/inference/csrc/pointwise_ops.dp.cpp'), 40 | ] 41 | 42 | def include_paths(self): 43 | includes = [ 44 | sycl_kernel_include('csrc/transformer/inference/includes'), 45 | sycl_kernel_include('csrc/includes'), 46 | ] 47 | return includes 48 | 49 | -------------------------------------------------------------------------------- /intel_extension_for_deepspeed/op_builder/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | """ 4 | from .builder import SYCLOpBuilder, sycl_kernel_path, sycl_kernel_include # noqa: F401 5 | 6 | 7 | class UtilsBuilder(SYCLOpBuilder): 8 | BUILD_VAR = "DS_BUILD_UTILS" 9 | NAME = "utils" 10 | 11 | def __init__(self, name=None): 12 | name = self.NAME if name is None else name 13 | super().__init__(name=name) 14 | 15 | def absolute_name(self): 16 | return f'deepspeed.ops.{self.NAME}_op' 17 | 18 | def sources(self): 19 | return ['csrc/utils/flatten_unflatten.cpp'] 20 | 21 | def include_paths(self): 22 | return [] 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import subprocess 3 | import os 4 | import intel_extension_for_pytorch 5 | from torch.xpu.cpp_extension import DPCPPExtension, DpcppBuildExtension 6 | 7 | PACKAGE_NAME="intel_extension_for_deepspeed" 8 | 9 | version_str = "0.9.4" 10 | git_branch_cmd = "git rev-parse --abbrev-ref HEAD" 11 | git_hash_cmd = "git rev-parse --short HEAD" 12 | 13 | def get_project_dir(): 14 | project_root_dir = os.path.dirname(__file__) 15 | return os.path.abspath(project_root_dir) 16 | 17 | def get_csrc_dir(op_name=""): 18 | project_root_dir = os.path.join(get_project_dir(), PACKAGE_NAME + "/op_builder/csrc/" + str(op_name)) 19 | return 
os.path.abspath(project_root_dir) 20 | 21 | def get_xetla_dir(): 22 | project_root_dir = os.path.join(get_project_dir(), "third_party/xetla/include") 23 | return os.path.abspath(project_root_dir) 24 | 25 | def create_ext_modules(op_name=""): 26 | cpp_files = [] 27 | include_dirs = [] 28 | 29 | for path, dir_list, file_list in os.walk(get_csrc_dir(op_name)): 30 | for file_name in file_list: 31 | if file_name.endswith('.cpp'): 32 | cpp_files += [os.path.join(path, file_name)] 33 | for path, dir_list, file_list in os.walk(get_csrc_dir()): 34 | for file_name in file_list: 35 | if file_name.endswith('.hpp') or file_name.endswith('.h'): 36 | include_dirs += [path] 37 | break 38 | include_dirs += [get_xetla_dir()] 39 | cxx_flags = [ 40 | '-fsycl', '-O3', '-std=c++20', '-w', '-fPIC', '-DMKL_ILP64', 41 | '-fsycl-targets=spir64_gen', 42 | "-Xs \"-device pvc -options '-vc-disable-indvars-opt -vc-codegen -doubleGRF -Xfinalizer -printregusage -Xfinalizer -enableBCR -DPASTokenReduction '\" " 43 | ] 44 | extra_ldflags = [ 45 | '-fsycl', '-fPIC', '-Wl,-export-dynamic', '-fsycl-targets=spir64_gen', 46 | "-Xs \"-device pvc -options '-vc-disable-indvars-opt -vc-codegen -doubleGRF -Xfinalizer -printregusage -Xfinalizer -enableBCR -DPASTokenReduction '\" " 47 | ] 48 | ext_modules = [ 49 | DPCPPExtension(name="fmha_module", 50 | sources=cpp_files, 51 | include_dirs=include_dirs, 52 | extra_compile_args={'cxx': cxx_flags}, 53 | extra_link_args=extra_ldflags) 54 | ] 55 | return ext_modules 56 | 57 | def command_exists(cmd): 58 | result = subprocess.Popen(f'type {cmd}', 59 | stdout=subprocess.PIPE, 60 | shell=True) 61 | return result.wait() == 0 62 | 63 | 64 | if command_exists('git'): 65 | try: 66 | result = subprocess.check_output(git_hash_cmd, shell=True) 67 | git_hash = result.decode('utf-8').strip() 68 | result = subprocess.check_output(git_branch_cmd, shell=True) 69 | git_branch = result.decode('utf-8').strip() 70 | except subprocess.CalledProcessError: 71 | git_hash = "unknown" 72 | git_branch = "unknown" 73 | else: 74 | git_hash = "unknown" 75 | git_branch = "unknown" 76 | 77 | 78 | def _build_installation_dependency(): 79 | install_requires = [] 80 | install_requires.append("setuptools") 81 | return install_requires 82 | 83 | def _check_env_flag(name, default=""): 84 | return os.getenv(name, default).upper() in ["Y", "1"]; 85 | 86 | print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") 87 | 88 | if _check_env_flag("GIT_VERSIONED_BUILD", default="1"): 89 | version_str += f'+{git_hash}' 90 | 91 | ext_modules = create_ext_modules("flash_attn") 92 | cmdclass = {'build_ext': DpcppBuildExtension} 93 | 94 | long_description = "" 95 | currentdir = os.path.abspath(os.path.dirname(__file__)) 96 | with open(os.path.join(currentdir, "README.md"), encoding="utf-8") as f: 97 | long_description = f.read() 98 | 99 | setup(name=PACKAGE_NAME, 100 | version=version_str, 101 | description="Intel® Extension for DeepSpeed*", 102 | long_description=long_description, 103 | long_description_content_type="text/markdown", 104 | url="https://github.com/intel/intel-extension-for-deepspeed", 105 | author="Intel Corporation", 106 | install_requires=_build_installation_dependency(), 107 | include_package_data=True, 108 | packages=[PACKAGE_NAME], 109 | ext_modules=ext_modules, 110 | cmdclass=cmdclass, 111 | license="https://opensource.org/license/mit") 112 | --------------------------------------------------------------------------------
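
A minimal usage sketch (not part of the repository) for the flash-attention path in op_builder/flash_attn.py. It assumes the fmha_module extension produced by setup.py has already been built and that an XPU device is available through intel_extension_for_pytorch; the sizes, dtype, and device below are illustrative assumptions. Tensors follow the [Bs, Hn, Sl, Hs] layout documented in FlashAttnFunc.forward, and flash_attn_func_v2 dispatches to the fused fmha_module kernels for head sizes up to 256, falling back to the eager flash_attn_fwd_func otherwise.

import torch
import intel_extension_for_pytorch  # noqa: F401  (assumed available; registers the XPU backend)
# Importing flash_attn also imports fmha_module, so the extension must be built first.
from intel_extension_for_deepspeed.op_builder.flash_attn import FlashAttentionBuilder

bs, hn, sl, hs = 2, 16, 1024, 64  # batch size, head number, sequence length, head size (illustrative)
q = torch.randn(bs, hn, sl, hs, device="xpu", dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

# load() returns a FlashAttentionBuilderObject rather than a compiled op.
flash_attn = FlashAttentionBuilder().load()
out = flash_attn.flash_attn_func_v2(q, k, v, 0.0, None, True)  # dropout_p, softmax_scale, is_causal
out.sum().backward()  # gradients flow through fmha_module.flash_attn_bwd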
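
A similarly hedged sketch for the other op builders above (FusedAdamBuilder, QuantizerBuilder, InferenceBuilder, UtilsBuilder): each one lists sources() and include_paths(), and DeepSpeed's OpBuilder.load() is expected to either import a pre-installed op or JIT-compile those sources on first use. The exact build behavior depends on the installed DeepSpeed version, so treat this as an assumption rather than a guarantee.

from intel_extension_for_deepspeed.op_builder.fused_adam import FusedAdamBuilder
from intel_extension_for_deepspeed.op_builder.transformer_inference import InferenceBuilder

fused_adam_op = FusedAdamBuilder().load()  # compiles the csrc/adam sources against csrc/includes
inference_op = InferenceBuilder().load()   # compiles the csrc/transformer/inference kernels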