├── .clang-format ├── .clang-tidy ├── .gitignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── benchmark_example.md ├── cmake ├── arch.cmake ├── clang-format.cmake ├── compilation-flags.cmake ├── sources.cmake ├── test_aarch64_sha_ni.c ├── test_endianess.c ├── test_x86_64_avx2.c ├── test_x86_64_avx512.c └── test_x86_64_sha_ni.c ├── include ├── internal │ ├── avx2_defs.h │ ├── avx512_defs.h │ ├── avx_defs.h │ ├── defs.h │ ├── measurements.h │ ├── neon_defs.h │ ├── sha256_defs.h │ └── sha512_defs.h └── sha.h ├── src ├── openssl │ ├── README.md │ ├── linux │ │ ├── sha256-armv8.S │ │ ├── sha256-x86_64.s │ │ ├── sha512-armv8.S │ │ └── sha512-x86_64.s │ ├── macos │ │ ├── sha256-x86_64.s │ │ └── sha512-x86_64.s │ └── openssl_cpu_globals.c ├── sha256.c ├── sha256_compress_aarch64_sha_ext.c ├── sha256_compress_generic.c ├── sha256_compress_x86_64_avx.c ├── sha256_compress_x86_64_avx2.c ├── sha256_compress_x86_64_avx512.c ├── sha256_compress_x86_64_avx_helper.c ├── sha256_compress_x86_64_sha_ext.c ├── sha256_consts.c ├── sha512.c ├── sha512_compress_generic.c ├── sha512_compress_x86_64_avx.c ├── sha512_compress_x86_64_avx2.c ├── sha512_compress_x86_64_avx512.c ├── sha512_compress_x86_64_avx_helper.c └── sha512_consts.c └── tests ├── main_speed.c ├── main_tests.c ├── pre-commit-script.sh └── test.h /.clang-format: -------------------------------------------------------------------------------- 1 | AlignAfterOpenBracket: true 2 | AlignConsecutiveMacros: true 3 | AlignConsecutiveAssignments: true 4 | AlignConsecutiveDeclarations: true 5 | AlignEscapedNewlines: Left 6 | AlignTrailingComments: true 7 | AllowAllParametersOfDeclarationOnNextLine: true 8 | AllowAllArgumentsOnNextLine: false 9 | AllowShortCaseLabelsOnASingleLine: true 10 | AllowShortFunctionsOnASingleLine: true 11 | AllowShortIfStatementsOnASingleLine: true 12 | AllowShortLoopsOnASingleLine: true 13 | AlwaysBreakBeforeMultilineStrings: false 14 | AlwaysBreakAfterReturnType: None 15 | BinPackParameters: false 16 | BreakBeforeBraces: Custom 17 | BraceWrapping: 18 | AfterCaseLabel: false 19 | AfterControlStatement: false 20 | AfterEnum: true 21 | AfterExternBlock: false 22 | AfterFunction: true 23 | AfterNamespace: false 24 | AfterStruct: false 25 | AfterUnion: false 26 | BeforeElse: false 27 | SplitEmptyFunction: false 28 | BreakBeforeBinaryOperators: false 29 | ColumnLimit: 82 30 | ContinuationIndentWidth: 2 31 | DerivePointerAlignment: false 32 | IndentCaseLabels: true 33 | IndentPPDirectives: AfterHash 34 | IndentWidth: 2 35 | IndentWrappedFunctionNames: false 36 | MaxEmptyLinesToKeep: 1 37 | NamespaceIndentation: None 38 | PenaltyReturnTypeOnItsOwnLine: 25 39 | PointerAlignment: Right 40 | ReflowComments: true 41 | SpaceAfterCStyleCast: false 42 | SpaceBeforeAssignmentOperators: true 43 | SpaceBeforeParens: Never 44 | SpaceInEmptyParentheses: false 45 | SpacesBeforeTrailingComments: 1 46 | SpacesInContainerLiterals: false 47 | SortIncludes: true 48 | UseTab: Never 49 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | # We remove the cert* checks that are related to rand() and srand() 2 | 3 | Checks: '-*, 4 | bugprone-*, 5 | cert-*, 6 | -cert-msc50-cpp, 7 | -cert-msc51-cpp, 8 | -cert-msc30-c, 9 | -cert-msc32-c, 10 | darwin-*, 11 | hicpp-*, 12 | -hicpp-signed-bitwise, 13 | -hicpp-no-assembler, 14 | misc-*, 15 | readability-*' 16 | 17 | WarningsAsErrors: '*' 
18 | HeaderFilterRegex: '.*' 19 | FormatStyle: 'file' 20 | CheckOptions: 21 | - key: bugprone-argument-comment.StrictMode 22 | value: '1' 23 | - key: bugprone-argument-comment.CommentBoolLiterals 24 | value: '1' 25 | - key: bugprone-argument-comment.CommentIntegerLiterals 26 | value: '0' 27 | - key: bugprone-argument-comment.CommentFloatLiterals 28 | value: '1' 29 | - key: bugprone-argument-comment.CommentCharacterLiterals 30 | value: '1' 31 | - key: bugprone-argument-comment.CommentUserDefinedLiterals 32 | value: '1' 33 | - key: bugprone-argument-comment.CommentNullPtrs 34 | value: '1' 35 | - key: bugprone-misplaced-widening-cast.CheckImplicitCasts 36 | value: '1' 37 | - key: bugprone-sizeof-expression.WarnOnSizeOfConstant 38 | value: '1' 39 | - key: bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression 40 | value: '1' 41 | - key: bugprone-sizeof-expression.WarnOnSizeOfCompareToConstant 42 | value: '1' 43 | - key: bugprone-suspicious-string-compare.WarnOnImplicitComparison 44 | value: '1' 45 | - key: bugprone-suspicious-string-compare.WarnOnLogicalNotComparison 46 | value: '1' 47 | - key: bugprone-suspicious-string-compare.StringCompareLikeFunctions 48 | value: '1' 49 | - key: google-runtime-int.TypeSuffix 50 | value: '_t' 51 | - key: readability-magic-numbers.IgnoredIntegerValues 52 | value: '0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15' 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc.
Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | # CMake compilation dir 55 | build 56 | 57 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0.0) 2 | project (sha-with-intrinsic C ASM) 3 | 4 | set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) 5 | set(SRC_DIR ${PROJECT_SOURCE_DIR}/src) 6 | set(TESTS_DIR ${PROJECT_SOURCE_DIR}/tests) 7 | 8 | include_directories(${INCLUDE_DIR}) 9 | include_directories(${INCLUDE_DIR}/internal) 10 | 11 | include(cmake/arch.cmake) 12 | 13 | include(cmake/compilation-flags.cmake) 14 | 15 | # Depends on SRC_DIR 16 | # and on arch.cmake 17 | include(cmake/sources.cmake) 18 | 19 | include(cmake/clang-format.cmake) 20 | 21 | set(OPENSSL_USE_STATIC_LIBS TRUE) 22 | find_package(OpenSSL REQUIRED) 23 | 24 | add_executable(${PROJECT_NAME} 25 | 26 | ${SHA_SOURCES} 27 | ${OPENSSL_SOURCES} 28 | ${MAIN_SOURCE} 29 | ) 30 | target_link_libraries(${PROJECT_NAME} OpenSSL::Crypto) 31 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. 
You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to work on. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License.
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sha2-with-intrinsic 2 | 3 | This sample code package provides optimized implementations of SHA256 and SHA512. 4 | 5 | The code is written by Nir Drucker and Shay Gueron, AWS Cryptographic Algorithms Group. 6 | 7 | While C code is easier to maintain and review, the performance obtained by compilation (e.g., with gcc-9 and clang-9) is often slower than that of hand-written assembly code (e.g., the code in this example). This sample code is made publicly available to help compiler designers understand this use case by reviewing the code and its generated assembly. We hope this information will improve compilers' ability to generate efficient assembly. 8 | 9 | This sample code provides testing binaries but no shared or static libraries. This is because the code is designed to be used for benchmarking purposes only, not in final products. 10 | 11 | The x86-64 AVX code is based on the paper: 12 | - Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the computations of hash functions.
J Cryptogr Eng 2, 241–253 (2012). https://doi.org/10.1007/s13389-012-0037-z 13 | Some parts of the code were translated from (Perl) assembly (OpenSSL commit [13c5d744](https://github.com/openssl/openssl/tree/e32c608e0733d5b295c9aa119153133413c5d744)) to C. 14 | 15 | The code version that uses Intel SHA Extensions instructions is based on the following reference: 16 | - https://software.intel.com/en-us/articles/intel-sha-extensions 17 | 18 | ## License 19 | 20 | This project is licensed under the Apache-2.0 License. 21 | 22 | Dependencies 23 | ----- 24 | This package requires 25 | - CMake 3 and above 26 | - A compiler that supports the required C intrinsics (e.g., AVX/AVX2/AVX512/SHA_NI on x86-64 machines). For example, GCC-9 and Clang-9. 27 | - An installation of OpenSSL for testing 28 | 29 | BUILD 30 | ----- 31 | 32 | To build, first create a working directory 33 | ``` 34 | mkdir build 35 | cd build 36 | ``` 37 | 38 | Then, run CMake and compile 39 | ``` 40 | cmake -DCMAKE_BUILD_TYPE=Release .. 41 | make 42 | ``` 43 | 44 | Additional CMake compilation flags: 45 | - TEST_SPEED - Measures and prints the performance in cycles 46 | - ALTERNATIVE_AVX512_IMPL - The X86-64 AVX512 extension provides a rotate intrinsic. Setting this flag tells the AVX/AVX2/AVX512 implementations to use this intrinsic; to test this implementation, the binary must be compiled with this flag set. 47 | - DONT_USE_UNROLL_PRAGMA - The code uses the unroll pragma by default. Use this flag to disable it. 48 | - ASAN/MSAN/TSAN/UBSAN - Compile with the Address/Memory/Thread/Undefined-Behaviour sanitizer, respectively. 49 | - MONTE_CARLO_NUM_OF_TESTS - Set the number of Monte Carlo tests (default: 100,000) 50 | 51 | To clean - remove the `build` directory. Note that a "clean" is required prior to compilation with modified flags. 52 | 53 | To format (`clang-format-9` or above is required): 54 | 55 | `make format` 56 | 57 | To use clang-tidy (`clang-tidy-9` is required): 58 | 59 | ``` 60 | CC=clang-9 cmake -DCMAKE_C_CLANG_TIDY="clang-tidy-9;--fix-errors;--format-style=file" .. 61 | make 62 | ``` 63 | 64 | Before committing code, please test it using 65 | `tests/pre-commit-script.sh` 66 | This will run all the sanitizers and also `clang-format` and `clang-tidy` (requires clang-9 to be installed). 67 | 68 | The package was compiled and tested with gcc-9 and clang-9 in 64-bit mode. 69 | Tests were run on a Linux (Ubuntu 18.04.4 LTS) OS on x86-64 and AARCH64 machines. 70 | Compilation on other platforms may require some adjustments. 71 | 72 | Performance measurements 73 | ------------------------ 74 | When using the TEST_SPEED flag the performance measurements are reported in processor cycles (per single core). The results are obtained using the following methodology. Each measured function was isolated, run 25 times (warm-up), followed by 100 iterations that were clocked and averaged. To minimize the effect of background tasks running on the system, every experiment was repeated 10 times, and the minimum result is reported. 75 | 76 | The library reports results only for code supported by the OS/compiler. It also compares the results of the C-with-intrinsics code to the assembly code of OpenSSL commit [13c5d744](https://github.com/openssl/openssl/tree/e32c608e0733d5b295c9aa119153133413c5d744) (see [here](/src/openssl/README.md) for more details). 77 | 78 | A benchmark example is found [here](benchmark_example.md). 79 |
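For example, a typical benchmarking configuration (a sketch; TEST_SPEED and ALTERNATIVE_AVX512_IMPL are the CMake flags listed above, passed as cache variables) is:
```
cmake -DCMAKE_BUILD_TYPE=Release -DTEST_SPEED=1 -DALTERNATIVE_AVX512_IMPL=1 ..
make
```
80 | Testing 81 | ------- 82 | - The library uses OpenSSL for its testing.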
It compares the results of running its SHA256/SHA512 implementation to the OpenSSL results on strings of different lengths (0-1000 bytes). 83 | - The library was run using Address/Memory/Thread/Undefined-Behaviour sanitizers. 84 | -------------------------------------------------------------------------------- /benchmark_example.md: -------------------------------------------------------------------------------- 1 | A benchmark example on a Dell XPS 13 7390 2-in-1 laptop. It has a 10th generation Intel(R) Core(TM) processor (microarchitecture codename "Ice Lake"[ICL]). The specifics are Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz. This platform has 16 GB RAM, 48K L1d cache, 32K L1i cache, 512K L2 cache, and 8MiB L3 cache. The Intel(R) Turbo Boost Technology was turned off. 2 | The code was compiled with clang-9 and run on Ubuntu 18.04.2 LTS. 3 | The results are in CPU cycles. 4 | 5 | SHA-256 Benchmark: 6 | ------------------ 7 | ``` 8 | msg generic avx (C) avx (ossl) avx2 (C) avx2 (ossl) avx512 (C) sha ext (C) sha ext (ossl) 9 | 1 bytes 1024 791 754 821 724 879 287 288 10 | 2 bytes 1026 792 754 823 724 882 287 288 11 | 4 bytes 1024 791 753 823 725 878 288 288 12 | 8 bytes 1027 792 753 824 724 877 287 288 13 | 16 bytes 1023 777 728 820 702 877 288 283 14 | 32 bytes 1023 782 726 814 699 879 281 280 15 | 64 bytes 1992 1518 1394 1582 1342 1708 424 446 16 | 128 bytes 2906 2244 2051 2247 1918 2360 605 644 17 | 256 bytes 4721 3693 3363 3620 3121 3621 956 1027 18 | 512 bytes 8381 6595 5987 6373 5522 6318 1666 1797 19 | 1024 bytes 15566 12409 11236 11892 10339 11728 3104 3340 20 | 2048 bytes 29955 24038 21753 22925 19961 22507 5968 6424 21 | 4096 bytes 58970 47377 42843 45037 39326 44219 11692 12594 22 | 8192 bytes 116991 94007 84981 89160 78060 87494 23148 24936 23 | 16384 bytes 232664 187280 169477 177774 154960 174157 45780 49741 24 | 32768 bytes 464254 373359 337247 354136 309742 346105 91667 99088 25 | 65536 bytes 927237 747417 675843 709146 620729 694132 183685 198528 26 | ``` 27 | 28 | SHA-512 Benchmark: 29 | ------------------ 30 | ``` 31 | msg generic avx (C) avx (ossl) avx2 (C) avx2 (ossl) avx512 (C) 32 | 1 bytes 1428 1026 972 1062 961 1139 33 | 2 bytes 1432 1023 969 1066 957 1138 34 | 4 bytes 1432 1025 973 1063 955 1138 35 | 8 bytes 1433 1026 973 1063 955 1138 36 | 16 bytes 1431 1018 969 1063 955 1137 37 | 32 bytes 1420 1019 967 1060 951 1135 38 | 64 bytes 1418 1019 962 1058 949 1132 39 | 128 bytes 2766 1963 1843 2042 1823 2211 40 | 256 bytes 4084 2891 2692 2951 2537 3114 41 | 512 bytes 6710 4738 4388 4725 4096 4810 42 | 1024 bytes 11932 8436 7779 8291 7206 8379 43 | 2048 bytes 22472 15837 14570 15389 13456 15458 44 | 4096 bytes 43792 30662 28283 29798 26022 29482 45 | 8192 bytes 85757 60456 55429 58261 50945 57625 46 | 16384 bytes 169996 119761 109960 114952 101207 113801 47 | 32768 bytes 338123 238652 219153 228957 201284 226530 48 | 65536 bytes 675827 476242 437028 456221 402204 450687 49 | ```
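For scale, the 65536-byte SHA-256 row above corresponds to roughly 927237/65536 ≈ 14.1 cycles per byte for the generic C code, versus 183685/65536 ≈ 2.8 cycles per byte for the SHA-extension code.
50 | -------------------------------------------------------------------------------- /cmake/arch.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.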
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|amd64|AMD64)$") 5 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DX86_64") 6 | set(X86_64 1) 7 | elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(aarch64|arm64|arm64e)$") 8 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAARCH64 -DNEON_SUPPORT") 9 | set(AARCH64 1) 10 | endif() 11 | 12 | # Only little endian systems are supported 13 | try_run(RUN_RESULT COMPILE_RESULT 14 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_endianess.c" 15 | COMPILE_DEFINITIONS "-Werror -Wall -Wpedantic" 16 | OUTPUT_VARIABLE OUTPUT 17 | ) 18 | 19 | if((NOT ${COMPILE_RESULT}) OR (NOT RUN_RESULT EQUAL 0)) 20 | message(FATAL_ERROR "Only little endian systems are supported") 21 | endif() 22 | 23 | if(X86_64) 24 | # Test AVX2 25 | try_run(RUN_RESULT COMPILE_RESULT 26 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_x86_64_avx2.c" 27 | COMPILE_DEFINITIONS "-march=native -Werror -Wall -Wpedantic" 28 | OUTPUT_VARIABLE OUTPUT 29 | ) 30 | 31 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 32 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAVX2_SUPPORT") 33 | set(AVX2 1) 34 | else() 35 | message(STATUS "The AVX2 implementation is not supported") 36 | endif() 37 | 38 | # Test AVX512 39 | try_run(RUN_RESULT COMPILE_RESULT 40 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_x86_64_avx512.c" 41 | COMPILE_DEFINITIONS "-march=native -Werror -Wall -Wpedantic" 42 | OUTPUT_VARIABLE OUTPUT 43 | ) 44 | 45 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 46 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAVX512_SUPPORT") 47 | set(AVX512 1) 48 | else() 49 | message(STATUS "The AVX512 implementation is not supported") 50 | endif() 51 | 52 | # Test SHA extension 53 | try_run(RUN_RESULT COMPILE_RESULT 54 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_x86_64_sha_ni.c" 55 | COMPILE_DEFINITIONS "-march=native -Werror -Wall -Wpedantic" 56 | OUTPUT_VARIABLE OUTPUT 57 | ) 58 | 59 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 60 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DX86_64_SHA_SUPPORT") 61 | set(SHA_EXT 1) 62 | else() 63 | message(STATUS "The SHA_EXT implementation is not supported") 64 | endif() 65 | endif() 66 | 67 | if(AARCH64) 68 | # Test SHA extensions 69 | try_run(RUN_RESULT COMPILE_RESULT 70 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_aarch64_sha_ni.c" 71 | COMPILE_DEFINITIONS "-I${INCLUDE_DIR}/internal -mcpu=native -Werror -Wall -Wpedantic" 72 | OUTPUT_VARIABLE OUTPUT 73 | ) 74 | 75 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 76 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAARCH64_SHA_SUPPORT") 77 | set(SHA_EXT 1) 78 | else() 79 | message(STATUS "The SHA_EXT implementation is not supported") 80 | endif() 81 | endif() 82 | -------------------------------------------------------------------------------- /cmake/clang-format.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Some of the definitions in .clang-format require clang-format-9 and above.
5 | find_program(CLANG_FORMAT 6 | NAMES 7 | clang-format-11 8 | clang-format-10 9 | clang-format-9 10 | clang-format) 11 | 12 | IF(CLANG_FORMAT) 13 | # Get the major version of clang-format 14 | # CLANG_FORMAT_VERSION should be in the format "clang-format version [Major].[Minor].[Patch] " 15 | exec_program(${CLANG_FORMAT} ${CMAKE_CURRENT_SOURCE_DIR} ARGS --version OUTPUT_VARIABLE CLANG_FORMAT_VERSION) 16 | STRING(REGEX REPLACE ".* ([0-9]+)\\.[0-9]+\\.[0-9]+.*" "\\1" CLANG_FORMAT_MAJOR_VERSION ${CLANG_FORMAT_VERSION}) 17 | 18 | message(STATUS "Found version ${CLANG_FORMAT_MAJOR_VERSION} of clang-format.") 19 | if(${CLANG_FORMAT_MAJOR_VERSION} LESS "9") 20 | message(STATUS "To run the format target clang-format version >= 9 is required.") 21 | else() 22 | set(CLANG_FORMAT_FILE_TYPES ${CLANG_FORMAT_FILE_TYPES} ) 23 | file(GLOB_RECURSE CF_FILES1 ${SRC_DIR}/*.c ${SRC_DIR}/crypto/*.h) 24 | file(GLOB_RECURSE CF_FILES2 ${INCLUDE_DIR}/*.h ${INCLUDE_DIR}/internal/*.h) 25 | file(GLOB_RECURSE CF_FILES3 ${TESTS_DIR}/*.c ${TESTS_DIR}/crypto/*.h) 26 | set(FILES_TO_FORMAT "${CF_FILES1}" "${CF_FILES2}" "${CF_FILES3}") 27 | 28 | ADD_CUSTOM_TARGET( 29 | format 30 | COMMAND ${CLANG_FORMAT} -i -style=file ${FILES_TO_FORMAT} 31 | COMMENT "Clang-formatting all (*.c/*.h) source files" 32 | ) 33 | endif() 34 | else() 35 | message(STATUS "Did not find clang-format.") 36 | endif() 37 | -------------------------------------------------------------------------------- /cmake/compilation-flags.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | if(CMAKE_C_COMPILER_ID MATCHES "Clang") 5 | set(CLANG 1) 6 | endif() 7 | 8 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ggdb -O3 -fPIC -std=c99") 9 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden -Wall -Wextra -Werror -Wpedantic") 10 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wunused -Wcomment -Wchar-subscripts -Wuninitialized -Wshadow") 11 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wwrite-strings -Wformat-security -Wcast-qual -Wunused-result") 12 | 13 | if(X86_64) 14 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mno-red-zone") 15 | else() 16 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native") 17 | endif() 18 | 19 | # Avoiding GCC 4.8 bug 20 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-missing-braces -Wno-missing-field-initializers") 21 | 22 | if(CLANG) 23 | # CMake passes the `-isystem` flag to clang for assembly files. 24 | # Currently clang does not recognize it.
25 | set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wno-error=unused-command-line-argument") 26 | endif () 27 | 28 | set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ggdb -fPIC -Wall -Wextra -Werror -Wpedantic") 29 | 30 | if(MSAN) 31 | if(NOT CLANG) 32 | message(FATAL_ERROR "Cannot enable MSAN unless using Clang") 33 | endif() 34 | 35 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins -fno-omit-frame-pointer") 36 | endif() 37 | 38 | if(ASAN) 39 | if(NOT CLANG) 40 | message(FATAL_ERROR "Cannot enable ASAN unless using Clang") 41 | endif() 42 | 43 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize-address-use-after-scope -fno-omit-frame-pointer") 44 | endif() 45 | 46 | if(TSAN) 47 | if(NOT CLANG) 48 | message(FATAL_ERROR "Cannot enable TSAN unless using Clang") 49 | endif() 50 | 51 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") 52 | endif() 53 | 54 | if(UBSAN) 55 | if(NOT CLANG) 56 | message(FATAL_ERROR "Cannot enable UBSAN unless using Clang") 57 | endif() 58 | 59 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") 60 | endif() 61 | 62 | if(TEST_SPEED) 63 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTEST_SPEED -DRTDSC") 64 | endif() 65 | 66 | if(ALTERNATIVE_AVX512_IMPL) 67 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DALTERNATIVE_AVX512_IMPL") 68 | endif() 69 | 70 | if(DONT_USE_UNROLL_PRAGMA) 71 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDONT_USE_UNROLL_PRAGMA") 72 | endif() 73 | 74 | if(MONTE_CARLO_NUM_OF_TESTS) 75 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMONTE_CARLO_NUM_OF_TESTS=${MONTE_CARLO_NUM_OF_TESTS}") 76 | endif() 77 | -------------------------------------------------------------------------------- /cmake/sources.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set(SHA_SOURCES 5 | ${SRC_DIR}/sha256.c 6 | ${SRC_DIR}/sha256_consts.c 7 | ${SRC_DIR}/sha256_compress_generic.c 8 | 9 | ${SRC_DIR}/sha512.c 10 | ${SRC_DIR}/sha512_consts.c 11 | ${SRC_DIR}/sha512_compress_generic.c 12 | ) 13 | 14 | set(OPENSSL_DIR ${SRC_DIR}/openssl) 15 | 16 | if(APPLE) 17 | set(OPENSSL_ASM_DIR ${OPENSSL_DIR}/macos) 18 | else() 19 | set(OPENSSL_ASM_DIR ${OPENSSL_DIR}/linux) 20 | endif() 21 | 22 | set(OPENSSL_SOURCES 23 | ${OPENSSL_DIR}/openssl_cpu_globals.c 24 | ) 25 | 26 | if(X86_64) 27 | set(SHA_SOURCES ${SHA_SOURCES} 28 | ${SRC_DIR}/sha256_compress_x86_64_avx.c 29 | ${SRC_DIR}/sha512_compress_x86_64_avx.c 30 | ) 31 | 32 | if(AVX2) 33 | set(SHA_SOURCES ${SHA_SOURCES} 34 | ${SRC_DIR}/sha256_compress_x86_64_avx2.c 35 | ${SRC_DIR}/sha512_compress_x86_64_avx2.c 36 | ) 37 | endif() 38 | 39 | if(AVX512) 40 | set(SHA_SOURCES ${SHA_SOURCES} 41 | ${SRC_DIR}/sha256_compress_x86_64_avx512.c 42 | ${SRC_DIR}/sha512_compress_x86_64_avx512.c 43 | ) 44 | endif() 45 | 46 | if(SHA_EXT) 47 | set(SHA_SOURCES ${SHA_SOURCES} 48 | ${SRC_DIR}/sha256_compress_x86_64_sha_ext.c 49 | ) 50 | endif() 51 | 52 | set(OPENSSL_SOURCES ${OPENSSL_SOURCES} 53 | ${OPENSSL_ASM_DIR}/sha256-x86_64.s 54 | ${OPENSSL_ASM_DIR}/sha512-x86_64.s 55 | ) 56 | endif() 57 | 58 | if(AARCH64) 59 | if(SHA_EXT) 60 | set(SHA_SOURCES ${SHA_SOURCES} 61 | ${SRC_DIR}/sha256_compress_aarch64_sha_ext.c 62 | ) 63 | endif() 64 | 65 | set(OPENSSL_SOURCES ${OPENSSL_SOURCES} 66 | ${OPENSSL_ASM_DIR}/sha256-armv8.S 67 | ${OPENSSL_ASM_DIR}/sha512-armv8.S 68 | ) 69 | endif() 70 | 71 | if(TEST_SPEED) 72 | set(MAIN_SOURCE ${TESTS_DIR}/main_speed.c) 73 | else() 74 | set(MAIN_SOURCE ${TESTS_DIR}/main_tests.c) 75 | endif() 76 | -------------------------------------------------------------------------------- /cmake/test_aarch64_sha_ni.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdint.h> 5 | #include "neon_defs.h" 6 | 7 | int main(void) 8 | { 9 | uint8_t data[8*16*4]; 10 | uint32x4_t TMP[2] = {0}; 11 | 12 | // Check for vld1q_u8_x4 intrinsic 13 | uint8x16x4_t d = vld1q_u8_x4(data); 14 | TMP[0] = vreinterpretq_u32_u8(vrev32q_u8(d.val[0])); 15 | 16 | uint8x16x2_t d0 = vld1q_u8_x2(data); 17 | TMP[1] = vreinterpretq_u32_u8(vrev32q_u8(d0.val[0])); 18 | 19 | // Check for vsha256h2q_u32 intrinsic 20 | vsha256h2q_u32(TMP[0], TMP[1], TMP[0]); 21 | } 22 | -------------------------------------------------------------------------------- /cmake/test_endianess.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdint.h> 5 | #include <stdio.h> 6 | 7 | int main(void) 8 | { 9 | uint16_t uint_with_2_bytes = 0x0001; 10 | if (sizeof(uint_with_2_bytes) != 2) { 11 | printf("Undefined behaviour.\n"); 12 | return 1; 13 | } 14 | 15 | uint8_t *byte_array = (uint8_t*)&uint_with_2_bytes; 16 | if (byte_array[0] != 1) { 17 | printf("The code does not support big endian systems.\n"); 18 | return 1; 19 | } 20 | 21 | printf("A little endian system.\n"); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /cmake/test_x86_64_avx2.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates.
All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <immintrin.h> 5 | #include <stdint.h> 6 | 7 | int main(void) 8 | { 9 | __m256i reg; 10 | uint64_t mem[4]; 11 | reg = _mm256_loadu_si256((const __m256i*)mem); 12 | _mm256_storeu_si256((__m256i*)mem, reg); 13 | } 14 | -------------------------------------------------------------------------------- /cmake/test_x86_64_avx512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <immintrin.h> 5 | #include <stdint.h> 6 | 7 | int main(void) 8 | { 9 | __m512i reg; 10 | uint64_t mem[8]; 11 | reg = _mm512_loadu_si512((const __m512i*)mem); 12 | _mm512_storeu_si512((__m512i*)mem, reg); 13 | } 14 | -------------------------------------------------------------------------------- /cmake/test_x86_64_sha_ni.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <immintrin.h> 5 | 6 | int main(void) 7 | { 8 | __m128i a = _mm_setzero_si128(); 9 | _mm_sha256msg1_epu32(a, a); 10 | } 11 | -------------------------------------------------------------------------------- /include/internal/avx2_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <immintrin.h> 7 | 8 | typedef __m256i vec_t; 9 | 10 | #define ADD32(a, b) (_mm256_add_epi32(a, b)) 11 | #define ADD64(a, b) (_mm256_add_epi64(a, b)) 12 | #define ALIGNR8(a, b, mask) (_mm256_alignr_epi8(a, b, mask)) 13 | #define LOAD(mem) (_mm256_loadu_si256((const __m256i *)(mem))) 14 | #define MADD32(src, imm8, a, b) (_mm256_mask_add_epi32(src, imm8, a, b)) 15 | #define ROR32(a, imm8) (_mm256_ror_epi32(a, imm8)) 16 | #define ROR64(a, imm8) (_mm256_ror_epi64(a, imm8)) 17 | #define SHUF8(a, mask) (_mm256_shuffle_epi8(a, mask)) 18 | #define SHUF32(a, mask) (_mm256_shuffle_epi32(a, mask)) 19 | #define SLL32(a, imm8) (_mm256_slli_epi32(a, imm8)) 20 | #define SLL64(a, imm8) (_mm256_slli_epi64(a, imm8)) 21 | #define SRL32(a, imm8) (_mm256_srli_epi32(a, imm8)) 22 | #define SRL64(a, imm8) (_mm256_srli_epi64(a, imm8)) 23 | #define STORE(mem, reg) (_mm256_store_si256((__m256i *)(mem), reg)) 24 | 25 | #define LOAD128(mem) (_mm_loadu_si128((const __m128i *)(mem))) 26 | #define STORE128(mem, reg) (_mm_store_si128((__m128i *)(mem), reg)) 27 | 28 | // The _mm256_storeu2_m128i and _mm256_loadu2_m128i APIs are defined in Clang but 29 | // not in GCC 30 | #if defined(__clang__) 31 | # define STOREU2(hi_mem, lo_mem, reg) \ 32 | (_mm256_storeu2_m128i((__m128i *)(hi_mem), (__m128i *)(lo_mem), reg)) 33 | 34 | # define LOADU2(hi_mem, lo_mem, reg) \ 35 | ((reg) = _mm256_loadu2_m128i((const __m128i *)(hi_mem), \ 36 | (const __m128i *)(lo_mem))) 37 | 38 | #else 39 | # define STOREU2(hi_mem, lo_mem, reg) \ 40 | do { \ 41 | STORE128(lo_mem, _mm256_extracti128_si256(reg, 0)); \ 42 | STORE128(hi_mem, _mm256_extracti128_si256(reg, 1)); \ 43 | } while(0) 44 | 45 | # define LOADU2(hi_mem, lo_mem, reg) \ 46 | do { \ 47 | reg = _mm256_insertf128_si256(reg, LOAD128(hi_mem), 1); \ 48 | reg = _mm256_insertf128_si256(reg, LOAD128(lo_mem), 0); \ 49 | } while(0) 50 | #endif 51 |
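// Usage sketch (illustrative only, not used by the library): gather the same
// 16-byte row of two message blocks, located at lo_mem and hi_mem, into one
// 256-bit register, as is done when compressing two blocks in parallel.
static inline __attribute__((unused)) vec_t loadu2_usage_sketch(const void *hi_mem,
                                                                const void *lo_mem)
{
  vec_t w = _mm256_setzero_si256(); // initialized to keep -Wuninitialized quiet
  LOADU2(hi_mem, lo_mem, w);
  return w;
}

52 | // In every 128-bit value choose the two lowest 32-bit values.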
53 | #define LOW32X2_MASK (0x33) 54 | // In every 128-bit value choose the two highest 32-bit values. 55 | #define HIGH32X2_MASK (0xcc) 56 | -------------------------------------------------------------------------------- /include/internal/avx512_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <immintrin.h> 7 | 8 | typedef __m512i vec_t; 9 | 10 | #define ADD64(a, b) (_mm512_add_epi64(a, b)) 11 | #define ADD32(a, b) (_mm512_add_epi32(a, b)) 12 | #define ALIGNR8(a, b, mask) (_mm512_alignr_epi8(a, b, mask)) 13 | #define LOAD(mem) (_mm512_loadu_si512((const vec_t *)(mem))) 14 | #define MADD32(src, imm8, a, b) (_mm512_mask_add_epi32(src, imm8, a, b)) 15 | #define ROR32(a, imm8) (_mm512_ror_epi32(a, imm8)) 16 | #define ROR64(a, imm8) (_mm512_ror_epi64(a, imm8)) 17 | #define SHUF32(a, mask) (_mm512_shuffle_epi32(a, mask)) 18 | #define SHUF8(a, mask) (_mm512_shuffle_epi8(a, mask)) 19 | #define SLL32(a, imm8) (_mm512_slli_epi32(a, imm8)) 20 | #define SLL64(a, imm8) (_mm512_slli_epi64(a, imm8)) 21 | #define SRL32(a, imm8) (_mm512_srli_epi32(a, imm8)) 22 | #define SRL64(a, imm8) (_mm512_srli_epi64(a, imm8)) 23 | #define STORE(mem, reg) (_mm512_store_si512((vec_t *)(mem), reg)) 24 | 25 | #define LOAD128(mem) (_mm_loadu_si128((const __m128i *)(mem))) 26 | #define STORE128(mem, reg) (_mm_store_si128((__m128i *)(mem), reg)) 27 | 28 | #define STOREU4(mem3, mem2, mem1, mem0, reg) \ 29 | do { \ 30 | STORE128(mem0, _mm512_extracti32x4_epi32(reg, 0)); \ 31 | STORE128(mem1, _mm512_extracti32x4_epi32(reg, 1)); \ 32 | STORE128(mem2, _mm512_extracti32x4_epi32(reg, 2)); \ 33 | STORE128(mem3, _mm512_extracti32x4_epi32(reg, 3)); \ 34 | } while(0) 35 | 36 | #define LOADU4(mem3, mem2, mem1, mem0, reg) \ 37 | do { \ 38 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem0), 0); \ 39 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem1), 1); \ 40 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem2), 2); \ 41 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem3), 3); \ 42 | } while(0) 43 | 44 | // In every 128-bit value choose the two lowest 32-bit values. 45 | #define LOW32X2_MASK (0x3333) 46 | // In every 128-bit value choose the two highest 32-bit values. 47 | #define HIGH32X2_MASK (0xcccc) 48 | -------------------------------------------------------------------------------- /include/internal/avx_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <immintrin.h> 7 | 8 | typedef __m128i vec_t; 9 | 10 | #define ADD32(a, b) (_mm_add_epi32(a, b)) 11 | #define ADD64(a, b) (_mm_add_epi64(a, b)) 12 | #define ALIGNR8(a, b, mask) (_mm_alignr_epi8(a, b, mask)) 13 | #define BLEND16(a, b, mask) (_mm_blend_epi16(a, b, mask)) 14 | #define LOAD(mem) (_mm_loadu_si128((const __m128i *)(mem))) 15 | #define MADD32(src, imm8, a, b) (_mm_mask_add_epi32(src, imm8, a, b)) 16 | #define ROR32(a, imm8) (_mm_ror_epi32(a, imm8)) 17 | #define ROR64(a, imm8) (_mm_ror_epi64(a, imm8)) 18 | #define SETR32(e0, e1, e2, e3) (_mm_setr_epi32(e0, e1, e2, e3)) 19 | #define SET64(e1, e0) (_mm_set_epi64x(e1, e0)) 20 | #define SHUF8(a, mask) (_mm_shuffle_epi8(a, mask)) 21 | #define SHUF32(a, mask) (_mm_shuffle_epi32(a, mask)) 22 | #define SLL32(a, imm8) (_mm_slli_epi32(a, imm8)) 23 | #define SLL64(a, imm8) (_mm_slli_epi64(a, imm8)) 24 | #define SRL32(a, imm8) (_mm_srli_epi32(a, imm8)) 25 | #define SRL64(a, imm8) (_mm_srli_epi64(a, imm8)) 26 | #define STORE(mem, reg) (_mm_store_si128((__m128i *)(mem), reg)) 27 | 28 | #define LOW32X2_MASK (0x3) 29 | #define HIGH32X2_MASK (0xc) 30 | -------------------------------------------------------------------------------- /include/internal/defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <stdint.h> 7 | #include <stdlib.h> 8 | #include <string.h> 9 | 10 | #define IN 11 | #define OUT 12 | 13 | #define _INLINE_ static inline 14 | #define ALIGN(n) __attribute__((aligned(n))) 15 | 16 | #if defined(__GNUC__) || defined(__clang__) 17 | # define UNUSED __attribute__((unused)) 18 | #else 19 | # define UNUSED 20 | #endif 21 | 22 | #define LSB1(x) ((x)&0x1) 23 | #define LSB2(x) ((x)&0x3) 24 | #define LSB4(x) ((x)&0xf) 25 | 26 | #define ROTR16(x, s) (((x) >> (s)) | (x) << (16 - (s))) 27 | #define ROTR32(x, s) (((x) >> (s)) | (x) << (32 - (s))) 28 | #define ROTR64(x, s) (((x) >> (s)) | (x) << (64 - (s))) 29 | 30 | #if defined(__GNUC__) && __GNUC__ >= 2 31 | _INLINE_ uint64_t bswap_64(uint64_t x) { return __builtin_bswap64(x); } 32 | _INLINE_ uint32_t bswap_32(uint32_t x) { return __builtin_bswap32(x); } 33 | #else 34 | _INLINE_ uint32_t bswap_32(uint32_t x) 35 | { 36 | x = ROTR16(x, 16); 37 | x = ((x & UINT32_C(0xff00ff00)) >> 8) | ((x & UINT32_C(0x00ff00ff)) << 8); 38 | return x; 39 | } 40 | 41 | _INLINE_ uint64_t bswap_64(uint64_t x) 42 | { 43 | return bswap_32(x >> 32) | (((uint64_t)bswap_32(x)) << 32); 44 | } 45 | #endif 46 | 47 | #if defined(__GNUC__) && (__GNUC__ >= 8) 48 | # define GCC_SUPPORT_UNROLL_PRAGMA 49 | #endif 50 | 51 | // A better macro should have the form 52 | // #define PRAGMA_LOOP_UNROLL(x) _Pragma("GCC unroll x") 53 | // But apparently this is hard to achieve with different compilers (the pragma string must be built by stringizing, and Clang spells the pragma "unroll" where GCC expects "GCC unroll N") 54 | #if defined(DONT_USE_UNROLL_PRAGMA) 55 | # define PRAGMA_LOOP_UNROLL_2 56 | # define PRAGMA_LOOP_UNROLL_4 57 | # define PRAGMA_LOOP_UNROLL_8 58 | # define PRAGMA_LOOP_UNROLL_12 59 | # define PRAGMA_LOOP_UNROLL_16 60 | # define PRAGMA_LOOP_UNROLL_48 61 | # define PRAGMA_LOOP_UNROLL_64 62 | # define PRAGMA_LOOP_UNROLL_80 63 | #else 64 | # if defined(GCC_SUPPORT_UNROLL_PRAGMA) 65 | # define PRAGMA_LOOP_UNROLL_2 _Pragma("GCC unroll 2") 66 | # define PRAGMA_LOOP_UNROLL_4 _Pragma("GCC unroll 4") 67 | # define PRAGMA_LOOP_UNROLL_8 _Pragma("GCC unroll 8") 68 | # define PRAGMA_LOOP_UNROLL_12 _Pragma("GCC unroll 12") 69 | # define
PRAGMA_LOOP_UNROLL_16 _Pragma("GCC unroll 16") 70 | # define PRAGMA_LOOP_UNROLL_48 _Pragma("GCC unroll 48") 71 | # define PRAGMA_LOOP_UNROLL_64 _Pragma("GCC unroll 64") 72 | # define PRAGMA_LOOP_UNROLL_80 _Pragma("GCC unroll 80") 73 | # elif defined(__clang__) 74 | # define PRAGMA_LOOP_UNROLL_2 _Pragma("unroll") 75 | # define PRAGMA_LOOP_UNROLL_4 _Pragma("unroll") 76 | # define PRAGMA_LOOP_UNROLL_8 _Pragma("unroll") 77 | # define PRAGMA_LOOP_UNROLL_12 _Pragma("unroll") 78 | # define PRAGMA_LOOP_UNROLL_16 _Pragma("unroll") 79 | # define PRAGMA_LOOP_UNROLL_48 _Pragma("unroll") 80 | # define PRAGMA_LOOP_UNROLL_64 _Pragma("unroll") 81 | # define PRAGMA_LOOP_UNROLL_80 _Pragma("unroll") 82 | # else 83 | # define PRAGMA_LOOP_UNROLL_2 84 | # define PRAGMA_LOOP_UNROLL_4 85 | # define PRAGMA_LOOP_UNROLL_8 86 | # define PRAGMA_LOOP_UNROLL_12 87 | # define PRAGMA_LOOP_UNROLL_16 88 | # define PRAGMA_LOOP_UNROLL_48 89 | # define PRAGMA_LOOP_UNROLL_64 90 | # define PRAGMA_LOOP_UNROLL_80 91 | # endif 92 | #endif 93 | 94 | ////////////////////////// 95 | // Helper functions 96 | /////////////////////////// 97 | 98 | // my_memcpy avoids the undefined behaviour of memcpy when byte_len=0 99 | _INLINE_ void *my_memcpy(void *dst, const void *src, size_t byte_len) 100 | { 101 | if(byte_len == 0) { 102 | return dst; 103 | } 104 | 105 | return memcpy(dst, src, byte_len); 106 | } 107 | 108 | // my_memset avoids the undefined behaviour of memset when byte_len=0 109 | _INLINE_ void *my_memset(void *dst, const int ch, size_t byte_len) 110 | { 111 | if(byte_len == 0) { 112 | return dst; 113 | } 114 | 115 | return memset(dst, ch, byte_len); 116 | } 117 | 118 | _INLINE_ void secure_clean(OUT void *p, IN const size_t byte_len) 119 | { 120 | typedef void *(*memset_t)(void *, int, size_t); 121 | static volatile memset_t memset_func = my_memset; 122 | memset_func(p, 0, byte_len); 123 | } 124 | 125 | /////////////////////////////////////////// 126 | // Controlling the OpenSSL borrowed code 127 | /////////////////////////////////////////// 128 | 129 | #if defined(X86_64) 130 | // In OpenSSL the OPENSSL_ia32cap_P array holds the return values (in 131 | // RAX, RBX, RCX, RDX registers) of executing the Intel CPUID leaf 7 instruction. 132 | // The assembly code chooses the relevant SHA implementation according to this 133 | // array.
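// For example, a test harness can force the borrowed OpenSSL assembly to take
// its AVX path like this (a usage sketch based on the macros defined below;
// sha256_block_data_order_local is declared in sha256_defs.h):
//
//   RUN_OPENSSL_CODE_WITH_AVX(
//     sha256_block_data_order_local(state, data, blocks_num);
//   );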
134 | 135 | extern unsigned int OPENSSL_ia32cap_P_local[4]; 136 | 137 | # define CLEAR_OPENSSL_CAP_ARRAY \ 138 | do { \ 139 | OPENSSL_ia32cap_P_local[0] = 0; \ 140 | OPENSSL_ia32cap_P_local[1] = 0; \ 141 | OPENSSL_ia32cap_P_local[2] = 0; \ 142 | OPENSSL_ia32cap_P_local[3] = 0; \ 143 | } while(0) 144 | 145 | // RAX[30] - Intel CPU bit 146 | // RBX[9] - SSSE3 bit 147 | // RBX[28] - AVX bit 148 | # define RUN_OPENSSL_CODE_WITH_AVX(x) \ 149 | do { \ 150 | OPENSSL_ia32cap_P_local[0] |= (1 << 30); \ 151 | OPENSSL_ia32cap_P_local[1] |= ((1 << 9) | (1 << 28)); \ 152 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 153 | } while(0) 154 | 155 | // RCX[3] - BMI1 bit 156 | // RCX[5] - AVX2 bit 157 | // RCX[8] - BMI2 bit 158 | # define RUN_OPENSSL_CODE_WITH_AVX2(x) \ 159 | do { \ 160 | OPENSSL_ia32cap_P_local[2] |= ((1 << 8) | (1 << 5) | (1 << 3)); \ 161 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 162 | } while(0) 163 | 164 | // RCX[29] - SHA_NI (EXT) bit 165 | # define RUN_OPENSSL_CODE_WITH_SHA_EXT(x) \ 166 | do { \ 167 | OPENSSL_ia32cap_P_local[2] |= (1 << 29); \ 168 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 169 | } while(0) 170 | 171 | #endif 172 | 173 | #if defined(AARCH64) 174 | 175 | extern unsigned int OPENSSL_armcap_P_local; 176 | 177 | # define CLEAR_OPENSSL_CAP_ARRAY \ 178 | do { \ 179 | OPENSSL_armcap_P_local = 0; \ 180 | } while(0) 181 | 182 | # define ARMV7_NEON (1 << 0) 183 | # define ARMV8_SHA256 (1 << 4) 184 | # define ARMV8_SHA512 (1 << 6) 185 | 186 | # define RUN_OPENSSL_CODE_WITH_NEON(x) \ 187 | do { \ 188 | OPENSSL_armcap_P_local |= ARMV7_NEON; \ 189 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 190 | } while(0) 191 | 192 | # define RUN_OPENSSL_CODE_WITH_SHA256_EXT(x) \ 193 | do { \ 194 | OPENSSL_armcap_P_local |= ARMV8_SHA256; \ 195 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 196 | } while(0) 197 | 198 | # define RUN_OPENSSL_CODE_WITH_SHA512_EXT(x) \ 199 | do { \ 200 | OPENSSL_armcap_P_local |= ARMV8_SHA512; \ 201 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 202 | } while(0) 203 | #endif 204 | -------------------------------------------------------------------------------- /include/internal/measurements.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <float.h> 7 | #include <stdio.h> 8 | 9 | #ifndef REPEAT 10 | # define REPEAT 100 11 | #endif 12 | 13 | #ifndef OUTER_REPEAT 14 | # define OUTER_REPEAT 10 15 | #endif 16 | 17 | #ifndef WARMUP 18 | # define WARMUP (REPEAT / 4) 19 | #endif 20 | 21 | uint64_t start_clk, end_clk; 22 | double total_clk; 23 | double temp_clk; 24 | size_t rdtsc_itr; 25 | size_t rdtsc_outer_itr; 26 | 27 | #define HALF_GPR_SIZE UINT8_C(32) 28 | 29 | #if defined(X86_64) 30 | inline static uint64_t get_Clks(void) 31 | { 32 | uint64_t hi; 33 | uint64_t lo; 34 | __asm__ __volatile__("rdtscp\n\t" : "=a"(lo), "=d"(hi)::"rcx"); 35 | return lo ^ (hi << HALF_GPR_SIZE); 36 | } 37 | #endif 38 | 39 | #if defined(AARCH64) 40 | inline static uint64_t get_Clks(void) 41 | { 42 | uint64_t value; 43 | __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(value)); 44 | return value; 45 | } 46 | #endif 47 | 48 | 52 | // This MACRO measures the number of cycles "x" runs. This is the flow: 53 | // 1) it repeats "x" WARMUP times, in order to warm the cache.
54 | // 2) it reads the Time Stamp Counter at the beginning of the test. 55 | // 3) it repeats "x" REPEAT number of times. 56 | // 4) it reads the Time Stamp Counter again at the end of the test. 57 | // 5) it calculates the average number of cycles per one iteration of "x", by 58 | // calculating the total number of cycles, and dividing it by REPEAT 59 | #define RDTSC_MEASURE(x) \ 60 | for(rdtsc_itr = 0; rdtsc_itr < WARMUP; rdtsc_itr++) { \ 61 | {x}; \ 62 | } \ 63 | total_clk = DBL_MAX; \ 64 | for(rdtsc_outer_itr = 0; rdtsc_outer_itr < OUTER_REPEAT; rdtsc_outer_itr++) { \ 65 | start_clk = get_Clks(); \ 66 | for(rdtsc_itr = 0; rdtsc_itr < REPEAT; rdtsc_itr++) { \ 67 | {x}; \ 68 | } \ 69 | end_clk = get_Clks(); \ 70 | temp_clk = (double)(end_clk - start_clk) / REPEAT; \ 71 | if(total_clk > temp_clk) total_clk = temp_clk; \ 72 | } \ 73 | printf("%12.0f ", total_clk); 74 | 75 | #define MEASURE(x) RDTSC_MEASURE(x) 76 | -------------------------------------------------------------------------------- /include/internal/neon_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #if defined(__ARM_NEON) 5 | # include <arm_neon.h> 6 | #endif 7 | 8 | #if !defined(__clang__) 9 | static inline uint8x16x4_t vld1q_u8_x4(const uint8_t *mem) 10 | { 11 | uint8x16x2_t d0 = vld1q_u8_x2(mem); 12 | uint8x16x2_t d1 = vld1q_u8_x2(&mem[32]); 13 | 14 | uint8x16x4_t ret; 15 | ret.val[0] = d0.val[0]; 16 | ret.val[1] = d0.val[1]; 17 | ret.val[2] = d1.val[0]; 18 | ret.val[3] = d1.val[1]; 19 | return ret; 20 | } 21 | 22 | static inline void vst1q_u32_x2(uint32_t *mem, const uint32x4x2_t v) 23 | { 24 | vst1q_u32(mem, v.val[0]); 25 | vst1q_u32(mem + 4, v.val[1]); 26 | } 27 | #endif // __clang__ 28 | -------------------------------------------------------------------------------- /include/internal/sha256_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-------------------------------------------------------------------------------- /include/internal/sha256_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include "sha.h" 7 | 8 | typedef uint32_t sha256_word_t; 9 | 10 | #define SHA256_BLOCK_BYTE_LEN 64 11 | #define SHA256_ROUNDS_NUM 64 12 | #define SHA256_MSG_END_SYMBOL (0x80) 13 | #define SHA256_HASH_WORDS_NUM (SHA256_HASH_BYTE_LEN / sizeof(sha256_word_t)) 14 | #define SHA256_BLOCK_WORDS_NUM (SHA256_BLOCK_BYTE_LEN / sizeof(sha256_word_t)) 15 | 16 | #define SHA256_FINAL_ROUND_START_IDX 48 17 | 18 | // The SHA state: parameters a-h 19 | typedef ALIGN(64) struct sha256_state_st { 20 | sha256_word_t w[SHA256_HASH_WORDS_NUM]; 21 | } sha256_state_t; 22 | 23 | typedef ALIGN(64) struct sha256_msg_schedule_st { 24 | sha256_word_t w[SHA256_BLOCK_WORDS_NUM]; 25 | } sha256_msg_schedule_t; 26 | 27 | #define Sigma0_0 2 28 | #define Sigma0_1 13 29 | #define Sigma0_2 22 30 | #define Sigma1_0 6 31 | #define Sigma1_1 11 32 | #define Sigma1_2 25 33 | 34 | #define sigma0_0 7 35 | #define sigma0_1 18 36 | #define sigma0_2 3 37 | #define sigma1_0 17 38 | #define sigma1_1 19 39 | #define sigma1_2 10 40 | 41 | #define DUP2(x, y, z, w) x, y, z, w, x, y, z, w // NOLINT 42 | #define DUP4(x, y, z, w) x, y, z, w, x, y, z, w, x, y, z, w, x, y, z, w // NOLINT 43 | 44 | #define ROTR(x, v) ROTR32(x, v) 45 | #define Sigma0(x) (ROTR(x, Sigma0_0) ^ ROTR(x, Sigma0_1) ^ ROTR(x, Sigma0_2)) 46 | #define Sigma1(x) (ROTR(x, Sigma1_0) ^ ROTR(x, Sigma1_1) ^ ROTR(x, Sigma1_2)) 47 | #define sigma0(x) (ROTR(x, sigma0_0) ^ ROTR(x, sigma0_1) ^ ((x) >> sigma0_2)) 48 | #define sigma1(x) (ROTR(x, sigma1_0) ^ ROTR(x, sigma1_1) ^ ((x) >> sigma1_2)) 49 | #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 50 | #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) 51 | 52 | // In the AVX* implementations we operate on 1/2/4 blocks in parallel. 53 | // In these cases, it is faster to duplicate the same line in memory 54 | // and load it instead of broadcasting it.
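// For illustration, a sketch of how the duplicated tables are presumably laid
// out in src/sha256_consts.c (not their verbatim contents): each row repeats a
// group of round constants via the DUP2/DUP4 macros above, so a multi-block
// kernel can fetch its constants with a single aligned load. The first four
// K256 words are the standard SHA-256 values:
//
//   ALIGN(64) const sha256_word_t K256x2[2 * SHA256_ROUNDS_NUM] = {
//     DUP2(0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5),
//     ...
//   };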
55 | ALIGN(64) extern const sha256_word_t K256[SHA256_ROUNDS_NUM]; 56 | ALIGN(64) extern const sha256_word_t K256x2[2 * SHA256_ROUNDS_NUM]; 57 | ALIGN(64) extern const sha256_word_t K256x4[4 * SHA256_ROUNDS_NUM]; 58 | 59 | #define ROTATE_STATE(s) \ 60 | do { \ 61 | const sha256_word_t tmp = (s)->w[7]; \ 62 | (s)->w[7] = (s)->w[6]; \ 63 | (s)->w[6] = (s)->w[5]; \ 64 | (s)->w[5] = (s)->w[4]; \ 65 | (s)->w[4] = (s)->w[3]; \ 66 | (s)->w[3] = (s)->w[2]; \ 67 | (s)->w[2] = (s)->w[1]; \ 68 | (s)->w[1] = (s)->w[0]; \ 69 | (s)->w[0] = tmp; \ 70 | } while(0) 71 | 72 | _INLINE_ void sha_round(IN OUT sha256_state_t *s, 73 | IN const sha256_word_t x, 74 | IN const sha256_word_t k) 75 | { 76 | sha256_word_t t = x + s->w[7] + Sigma1(s->w[4]); 77 | 78 | t += Ch(s->w[4], s->w[5], s->w[6]) + k; 79 | s->w[7] = t + Sigma0(s->w[0]) + Maj(s->w[0], s->w[1], s->w[2]); 80 | s->w[3] += t; 81 | ROTATE_STATE(s); 82 | } 83 | 84 | _INLINE_ void accumulate_state(IN OUT sha256_state_t *dst, 85 | IN const sha256_state_t *src) 86 | { 87 | for(size_t i = 0; i < SHA256_HASH_WORDS_NUM; i++) { 88 | dst->w[i] += src->w[i]; 89 | } 90 | } 91 | 92 | void sha256_compress_generic(IN OUT sha256_state_t *state, 93 | IN const uint8_t *data, 94 | IN size_t blocks_num); 95 | 96 | #if defined(X86_64) 97 | 98 | void sha256_compress_x86_64_avx(IN OUT sha256_state_t *state, 99 | IN const uint8_t *data, 100 | IN size_t blocks_num); 101 | 102 | void sha256_compress_x86_64_avx2(IN OUT sha256_state_t *state, 103 | IN const uint8_t *data, 104 | IN size_t blocks_num); 105 | 106 | void sha256_compress_x86_64_avx512(IN OUT sha256_state_t *state, 107 | IN const uint8_t *data, 108 | IN size_t blocks_num); 109 | 110 | void sha256_compress_x86_64_sha_ext(IN OUT sha256_state_t *state, 111 | IN const uint8_t *data, 112 | IN size_t blocks_num); 113 | #endif // X86_64 114 | 115 | #if defined(AARCH64) 116 | void sha256_compress_aarch64_sha_ext(IN OUT sha256_state_t *state, 117 | IN const uint8_t *data, 118 | IN size_t blocks_num); 119 | #endif 120 | 121 | // This ASM code was borrowed from OpenSSL as-is. 122 | extern void sha256_block_data_order_local(IN OUT sha256_word_t *state, 123 | IN const uint8_t *data, 124 | IN size_t blocks_num); 125 |
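The helpers above are enough to reconstruct the shape of the scalar compression loop. A minimal sketch, assuming a precomputed message schedule (the real sha256_compress_generic in src/sha256_compress_generic.c expands the schedule on the fly; compress_block_sketch and its W argument are hypothetical):

```
#include <stddef.h>

// One 64-byte block; W[] holds the fully expanded message schedule.
static void compress_block_sketch(sha256_state_t *state,
                                  const sha256_word_t W[SHA256_ROUNDS_NUM])
{
  sha256_state_t tmp = *state; // work on a copy of a..h

  for(size_t i = 0; i < SHA256_ROUNDS_NUM; i++) {
    // Each call consumes W[i] and K256[i], then rotates the state words.
    sha_round(&tmp, W[i], K256[i]);
  }

  accumulate_state(state, &tmp); // feed-forward into the chaining value
}
```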
-------------------------------------------------------------------------------- /include/internal/sha512_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include "sha.h" 7 | 8 | typedef uint64_t sha512_word_t; 9 | 10 | #define SHA512_BLOCK_BYTE_LEN 128 11 | #define SHA512_ROUNDS_NUM 80 12 | #define SHA512_MSG_END_SYMBOL (0x80) 13 | #define SHA512_HASH_WORDS_NUM (SHA512_HASH_BYTE_LEN / sizeof(sha512_word_t)) 14 | #define SHA512_BLOCK_WORDS_NUM (SHA512_BLOCK_BYTE_LEN / sizeof(sha512_word_t)) 15 | 16 | #define SHA512_FINAL_ROUND_START_IDX 64 17 | 18 | // The SHA state: parameters a-h 19 | typedef struct sha512_state_st { 20 | ALIGN(64) sha512_word_t w[SHA512_HASH_WORDS_NUM]; 21 | } sha512_state_t; 22 | 23 | typedef struct sha512_msg_schedule_st { 24 | ALIGN(64) sha512_word_t w[SHA512_BLOCK_WORDS_NUM]; 25 | } sha512_msg_schedule_t; 26 | 27 | #define Sigma0_0 28 28 | #define Sigma0_1 34 29 | #define Sigma0_2 39 30 | #define Sigma1_0 14 31 | #define Sigma1_1 18 32 | #define Sigma1_2 41 33 | 34 | #define sigma0_0 1 35 | #define sigma0_1 8 36 | #define sigma0_2 7 37 | #define sigma1_0 19 38 | #define sigma1_1 61 39 | #define sigma1_2 6 40 | 41 | #define DUP2(x, y) x, y, x, y // NOLINT 42 | #define DUP4(x, y) x, y, x, y, x, y, x, y // NOLINT 43 | 44 | #define ROTR(x, v) ROTR64(x, v) 45 | #define Sigma0(x) (ROTR(x, Sigma0_0) ^ ROTR(x, Sigma0_1) ^ ROTR(x, Sigma0_2)) 46 | #define Sigma1(x) (ROTR(x, Sigma1_0) ^ ROTR(x, Sigma1_1) ^ ROTR(x, Sigma1_2)) 47 | #define sigma0(x) (ROTR(x, sigma0_0) ^ ROTR(x, sigma0_1) ^ ((x) >> sigma0_2)) 48 | #define sigma1(x) (ROTR(x, sigma1_0) ^ ROTR(x, sigma1_1) ^ ((x) >> sigma1_2)) 49 | #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 50 | #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) 51 | 52 | // In the AVX* implementations we operate on 1/2/4 blocks in parallel. 53 | // In these cases, it is faster to duplicate the same line in memory 54 | // and load it instead of broadcasting it.
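// For illustration, a sketch of the presumable layout in src/sha512_consts.c
// (not its verbatim contents): here DUP2 repeats a pair of 64-bit round
// constants. The first two K512 words are the standard SHA-512 values, and
// they can be cross-checked against the .LK512 table in the OpenSSL assembly
// further below:
//
//   ALIGN(64) const sha512_word_t K512x2[2 * SHA512_ROUNDS_NUM] = {
//     DUP2(0x428a2f98d728ae22, 0x7137449123ef65cd),
//     ...
//   };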
55 | ALIGN(64) extern const sha512_word_t K512[SHA512_ROUNDS_NUM]; 56 | ALIGN(64) extern const sha512_word_t K512x2[2 * SHA512_ROUNDS_NUM]; 57 | ALIGN(64) extern const sha512_word_t K512x4[4 * SHA512_ROUNDS_NUM]; 58 | 59 | #define ROTATE_STATE(s) \ 60 | do { \ 61 | const sha512_word_t tmp = (s)->w[7]; \ 62 | (s)->w[7] = (s)->w[6]; \ 63 | (s)->w[6] = (s)->w[5]; \ 64 | (s)->w[5] = (s)->w[4]; \ 65 | (s)->w[4] = (s)->w[3]; \ 66 | (s)->w[3] = (s)->w[2]; \ 67 | (s)->w[2] = (s)->w[1]; \ 68 | (s)->w[1] = (s)->w[0]; \ 69 | (s)->w[0] = tmp; \ 70 | } while(0) 71 | 72 | _INLINE_ void sha_round(IN OUT sha512_state_t *s, 73 | IN const sha512_word_t x, 74 | IN const sha512_word_t k) 75 | { 76 | sha512_word_t t = x + s->w[7] + Sigma1(s->w[4]); 77 | 78 | t += Ch(s->w[4], s->w[5], s->w[6]) + k; 79 | s->w[7] = t + Sigma0(s->w[0]) + Maj(s->w[0], s->w[1], s->w[2]); 80 | s->w[3] += t; 81 | ROTATE_STATE(s); 82 | } 83 | 84 | _INLINE_ void accumulate_state(IN OUT sha512_state_t *dst, 85 | IN const sha512_state_t *src) 86 | { 87 | for(size_t i = 0; i < SHA512_HASH_WORDS_NUM; i++) { 88 | dst->w[i] += src->w[i]; 89 | } 90 | } 91 | 92 | void sha512_compress_generic(IN OUT sha512_state_t *state, 93 | IN const uint8_t *data, 94 | IN size_t blocks_num); 95 | 96 | #if defined(X86_64) 97 | void sha512_compress_x86_64_avx(IN OUT sha512_state_t *state, 98 | IN const uint8_t *data, 99 | IN size_t blocks_num); 100 | 101 | void sha512_compress_x86_64_avx2(IN OUT sha512_state_t *state, 102 | IN const uint8_t *data, 103 | IN size_t blocks_num); 104 | 105 | void sha512_compress_x86_64_avx512(IN OUT sha512_state_t *state, 106 | IN const uint8_t *data, 107 | IN size_t blocks_num); 108 | #endif // X86_64 109 | 110 | // This ASM code was borrowed from OpenSSL as-is. 111 | extern void sha512_block_data_order_local(IN OUT sha512_word_t *state, 112 | IN const uint8_t *data, 113 | IN size_t blocks_num); 114 |
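Since the borrowed routine dispatches on OPENSSL_armcap_P_local (or OPENSSL_ia32cap_P_local on x86-64), it is presumably always invoked through the RUN_OPENSSL_CODE_WITH_* wrappers shown earlier. A hedged sketch for the AArch64 SHA-512 path (the wrapper function itself is hypothetical):

```
// Force the ARMv8 SHA-512 capability bit, run the OpenSSL kernel, then
// clear the capability word again (the macro does all three steps).
static void sha512_openssl_hw_sketch(sha512_state_t *state,
                                     const uint8_t *data,
                                     size_t blocks_num)
{
  RUN_OPENSSL_CODE_WITH_SHA512_EXT(
    sha512_block_data_order_local(state->w, data, blocks_num);
  );
}
```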
-------------------------------------------------------------------------------- /include/sha.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include "internal/defs.h" 7 | 8 | typedef enum sha_impl_e 9 | { 10 | GENERIC_IMPL, 11 | 12 | #if defined(X86_64) 13 | AVX_IMPL, 14 | OPENSSL_AVX_IMPL, 15 | #endif 16 | 17 | #if defined(AVX2_SUPPORT) 18 | AVX2_IMPL, 19 | OPENSSL_AVX2_IMPL, 20 | #endif 21 | 22 | #if defined(AVX512_SUPPORT) 23 | AVX512_IMPL, 24 | #endif 25 | 26 | #if defined(X86_64_SHA_SUPPORT) 27 | SHA_EXT_IMPL, 28 | OPENSSL_SHA_EXT_IMPL, 29 | #endif 30 | 31 | #if defined(NEON_SUPPORT) 32 | NEON_IMPL, 33 | OPENSSL_NEON_IMPL, 34 | #endif 35 | 36 | #if defined(AARCH64_SHA_SUPPORT) 37 | SHA_EXT_IMPL, 38 | OPENSSL_SHA_EXT_IMPL, 39 | #endif 40 | 41 | } sha_impl_t; 42 | 43 | #define SHA256_HASH_BYTE_LEN 32 44 | #define SHA512_HASH_BYTE_LEN 64 45 | 46 | void sha256(OUT uint8_t *dgst, 47 | IN const uint8_t *data, 48 | IN size_t byte_len, 49 | IN sha_impl_t impl); 50 | 51 | void sha512(OUT uint8_t *dgst, 52 | IN const uint8_t *data, 53 | IN size_t byte_len, 54 | IN sha_impl_t impl); 55 |
-------------------------------------------------------------------------------- /src/openssl/README.md: -------------------------------------------------------------------------------- 1 | The code in this directory was copied from the compilation artifacts of OpenSSL commit [13c5d744](https://github.com/openssl/openssl/tree/e32c608e0733d5b295c9aa119153133413c5d744) Feb 24, 2020. 2 | 3 | To reproduce on a platform equipped with an Intel 10th generation CPU: 4 | 5 | ``` 6 | git clone https://github.com/openssl/openssl 7 | cd openssl 8 | git checkout e32c608e0733d5b295c9aa119153133413c5d744 9 | ./config 10 | make 11 | ``` 12 | 13 | and the files are found in: 14 | 15 | ``` 16 | ./crypto/sha/sha256-x86_64.s 17 | ./crypto/sha/sha512-x86_64.s 18 | ``` 19 | 20 | These files include several implementations of SHA256 and SHA512 in x86-64 assembly. In particular, they include AVX/AVX2 implementations and, for SHA256, also an implementation that uses the new SHA extension that is available on Intel's 10th generation CPUs. 21 | 22 | The relevant implementation is chosen according to the value of the OPENSSL_ia32cap_P array. 23 | 24 | On an AARCH64 machine, the files are found in: 25 | 26 | ``` 27 | ./crypto/sha/sha256-armv8.S 28 | ./crypto/sha/sha512-armv8.S 29 | ``` 30 | The relevant implementation is chosen according to the value of the OPENSSL_armcap_P array. 31 | 32 | To avoid symbol conflicts/mistakes, the functions `sha256_block_data_order`/`sha512_block_data_order` were renamed `sha256_block_data_order_local`/`sha512_block_data_order_local`, the symbol `OPENSSL_ia32cap_P` was renamed `OPENSSL_ia32cap_P_local`, the symbol `OPENSSL_armcap_P` was renamed `OPENSSL_armcap_P_local`, and in the aarch64 files the include files (dependencies) were removed. 33 |
-------------------------------------------------------------------------------- /src/openssl/linux/sha512-armv8.S: -------------------------------------------------------------------------------- 1 | // Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License 2.0 (the "License"). You may not use 4 | // this file except in compliance with the License. You can obtain a copy 5 | // in the file LICENSE in the source distribution or at 6 | // https://www.openssl.org/source/license.html 7 | 8 | // ==================================================================== 9 | // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 10 | // project.
The module is, however, dual licensed under OpenSSL and 11 | // CRYPTOGAMS licenses depending on where you obtain it. For further 12 | // details see http://www.openssl.org/~appro/cryptogams/. 13 | // 14 | // Permission to use under GPLv2 terms is granted. 15 | // ==================================================================== 16 | // 17 | // SHA256/512 for ARMv8. 18 | // 19 | // Performance in cycles per processed byte and improvement coefficient 20 | // over code generated with "default" compiler: 21 | // 22 | // SHA256-hw SHA256(*) SHA512 23 | // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) 24 | // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) 25 | // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) 26 | // Denver 2.01 10.5 (+26%) 6.70 (+8%) 27 | // X-Gene 20.0 (+100%) 12.8 (+300%(***)) 28 | // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) 29 | // Kryo 1.92 17.4 (+30%) 11.2 (+8%) 30 | // ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) 31 | // 32 | // (*) Software SHA256 results are of lesser relevance, presented 33 | // mostly for informational purposes. 34 | // (**) The result is a trade-off: it's possible to improve it by 35 | // 10% (or by 1 cycle per round), but at the cost of 20% loss 36 | // on Cortex-A53 (or by 4 cycles per round). 37 | // (***) Super-impressive coefficients over gcc-generated code are 38 | // indication of some compiler "pathology", most notably code 39 | // generated with -mgeneral-regs-only is significantly faster 40 | // and the gap is only 40-90%. 41 | // 42 | // October 2016. 43 | // 44 | // Originally it was reckoned that it makes no sense to implement NEON 45 | // version of SHA256 for 64-bit processors. This is because performance 46 | // improvement on most wide-spread Cortex-A5x processors was observed 47 | // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was 48 | // observed that 32-bit NEON SHA256 performs significantly better than 49 | // 64-bit scalar version on *some* of the more recent processors. As 50 | // result 64-bit NEON version of SHA256 was added to provide best 51 | // all-round performance. For example it executes ~30% faster on X-Gene 52 | // and Mongoose. [For reference, NEON version of SHA512 is bound to 53 | // deliver much less improvement, likely *negative* on Cortex-A5x. 54 | // Which is why NEON support is limited to SHA256.] 55 | 56 | // $output is the last argument if it looks like a file (it has an extension) 57 | // $flavour is the first argument if it doesn't look like a file 58 | # define ARMV7_NEON (1<<0) 59 | # define ARMV8_SHA256 (1<<4) 60 | # define ARMV8_SHA512 (1<<6) 61 | 62 | .text 63 | 64 | .globl sha512_block_data_order_local 65 | .type sha512_block_data_order_local,%function 66 | .align 6 67 | sha512_block_data_order_local: 68 | #ifndef __KERNEL__ 69 | adrp x16,OPENSSL_armcap_P_local 70 | ldr w16,[x16,#:lo12:OPENSSL_armcap_P_local] 71 | tst w16,#ARMV8_SHA512 72 | b.ne .Lv8_entry 73 | #endif 74 | .inst 0xd503233f // paciasp 75 | stp x29,x30,[sp,#-128]! 
76 | add x29,sp,#0 77 | 78 | stp x19,x20,[sp,#16] 79 | stp x21,x22,[sp,#32] 80 | stp x23,x24,[sp,#48] 81 | stp x25,x26,[sp,#64] 82 | stp x27,x28,[sp,#80] 83 | sub sp,sp,#4*8 84 | 85 | ldp x20,x21,[x0] // load context 86 | ldp x22,x23,[x0,#2*8] 87 | ldp x24,x25,[x0,#4*8] 88 | add x2,x1,x2,lsl#7 // end of input 89 | ldp x26,x27,[x0,#6*8] 90 | adr x30,.LK512 91 | stp x0,x2,[x29,#96] 92 | 93 | .Loop: 94 | ldp x3,x4,[x1],#2*8 95 | ldr x19,[x30],#8 // *K++ 96 | eor x28,x21,x22 // magic seed 97 | str x1,[x29,#112] 98 | #ifndef __AARCH64EB__ 99 | rev x3,x3 // 0 100 | #endif 101 | ror x16,x24,#14 102 | add x27,x27,x19 // h+=K[i] 103 | eor x6,x24,x24,ror#23 104 | and x17,x25,x24 105 | bic x19,x26,x24 106 | add x27,x27,x3 // h+=X[i] 107 | orr x17,x17,x19 // Ch(e,f,g) 108 | eor x19,x20,x21 // a^b, b^c in next round 109 | eor x16,x16,x6,ror#18 // Sigma1(e) 110 | ror x6,x20,#28 111 | add x27,x27,x17 // h+=Ch(e,f,g) 112 | eor x17,x20,x20,ror#5 113 | add x27,x27,x16 // h+=Sigma1(e) 114 | and x28,x28,x19 // (b^c)&=(a^b) 115 | add x23,x23,x27 // d+=h 116 | eor x28,x28,x21 // Maj(a,b,c) 117 | eor x17,x6,x17,ror#34 // Sigma0(a) 118 | add x27,x27,x28 // h+=Maj(a,b,c) 119 | ldr x28,[x30],#8 // *K++, x19 in next round 120 | //add x27,x27,x17 // h+=Sigma0(a) 121 | #ifndef __AARCH64EB__ 122 | rev x4,x4 // 1 123 | #endif 124 | ldp x5,x6,[x1],#2*8 125 | add x27,x27,x17 // h+=Sigma0(a) 126 | ror x16,x23,#14 127 | add x26,x26,x28 // h+=K[i] 128 | eor x7,x23,x23,ror#23 129 | and x17,x24,x23 130 | bic x28,x25,x23 131 | add x26,x26,x4 // h+=X[i] 132 | orr x17,x17,x28 // Ch(e,f,g) 133 | eor x28,x27,x20 // a^b, b^c in next round 134 | eor x16,x16,x7,ror#18 // Sigma1(e) 135 | ror x7,x27,#28 136 | add x26,x26,x17 // h+=Ch(e,f,g) 137 | eor x17,x27,x27,ror#5 138 | add x26,x26,x16 // h+=Sigma1(e) 139 | and x19,x19,x28 // (b^c)&=(a^b) 140 | add x22,x22,x26 // d+=h 141 | eor x19,x19,x20 // Maj(a,b,c) 142 | eor x17,x7,x17,ror#34 // Sigma0(a) 143 | add x26,x26,x19 // h+=Maj(a,b,c) 144 | ldr x19,[x30],#8 // *K++, x28 in next round 145 | //add x26,x26,x17 // h+=Sigma0(a) 146 | #ifndef __AARCH64EB__ 147 | rev x5,x5 // 2 148 | #endif 149 | add x26,x26,x17 // h+=Sigma0(a) 150 | ror x16,x22,#14 151 | add x25,x25,x19 // h+=K[i] 152 | eor x8,x22,x22,ror#23 153 | and x17,x23,x22 154 | bic x19,x24,x22 155 | add x25,x25,x5 // h+=X[i] 156 | orr x17,x17,x19 // Ch(e,f,g) 157 | eor x19,x26,x27 // a^b, b^c in next round 158 | eor x16,x16,x8,ror#18 // Sigma1(e) 159 | ror x8,x26,#28 160 | add x25,x25,x17 // h+=Ch(e,f,g) 161 | eor x17,x26,x26,ror#5 162 | add x25,x25,x16 // h+=Sigma1(e) 163 | and x28,x28,x19 // (b^c)&=(a^b) 164 | add x21,x21,x25 // d+=h 165 | eor x28,x28,x27 // Maj(a,b,c) 166 | eor x17,x8,x17,ror#34 // Sigma0(a) 167 | add x25,x25,x28 // h+=Maj(a,b,c) 168 | ldr x28,[x30],#8 // *K++, x19 in next round 169 | //add x25,x25,x17 // h+=Sigma0(a) 170 | #ifndef __AARCH64EB__ 171 | rev x6,x6 // 3 172 | #endif 173 | ldp x7,x8,[x1],#2*8 174 | add x25,x25,x17 // h+=Sigma0(a) 175 | ror x16,x21,#14 176 | add x24,x24,x28 // h+=K[i] 177 | eor x9,x21,x21,ror#23 178 | and x17,x22,x21 179 | bic x28,x23,x21 180 | add x24,x24,x6 // h+=X[i] 181 | orr x17,x17,x28 // Ch(e,f,g) 182 | eor x28,x25,x26 // a^b, b^c in next round 183 | eor x16,x16,x9,ror#18 // Sigma1(e) 184 | ror x9,x25,#28 185 | add x24,x24,x17 // h+=Ch(e,f,g) 186 | eor x17,x25,x25,ror#5 187 | add x24,x24,x16 // h+=Sigma1(e) 188 | and x19,x19,x28 // (b^c)&=(a^b) 189 | add x20,x20,x24 // d+=h 190 | eor x19,x19,x26 // Maj(a,b,c) 191 | eor x17,x9,x17,ror#34 // Sigma0(a) 192 | add x24,x24,x19 // 
h+=Maj(a,b,c) 193 | ldr x19,[x30],#8 // *K++, x28 in next round 194 | //add x24,x24,x17 // h+=Sigma0(a) 195 | #ifndef __AARCH64EB__ 196 | rev x7,x7 // 4 197 | #endif 198 | add x24,x24,x17 // h+=Sigma0(a) 199 | ror x16,x20,#14 200 | add x23,x23,x19 // h+=K[i] 201 | eor x10,x20,x20,ror#23 202 | and x17,x21,x20 203 | bic x19,x22,x20 204 | add x23,x23,x7 // h+=X[i] 205 | orr x17,x17,x19 // Ch(e,f,g) 206 | eor x19,x24,x25 // a^b, b^c in next round 207 | eor x16,x16,x10,ror#18 // Sigma1(e) 208 | ror x10,x24,#28 209 | add x23,x23,x17 // h+=Ch(e,f,g) 210 | eor x17,x24,x24,ror#5 211 | add x23,x23,x16 // h+=Sigma1(e) 212 | and x28,x28,x19 // (b^c)&=(a^b) 213 | add x27,x27,x23 // d+=h 214 | eor x28,x28,x25 // Maj(a,b,c) 215 | eor x17,x10,x17,ror#34 // Sigma0(a) 216 | add x23,x23,x28 // h+=Maj(a,b,c) 217 | ldr x28,[x30],#8 // *K++, x19 in next round 218 | //add x23,x23,x17 // h+=Sigma0(a) 219 | #ifndef __AARCH64EB__ 220 | rev x8,x8 // 5 221 | #endif 222 | ldp x9,x10,[x1],#2*8 223 | add x23,x23,x17 // h+=Sigma0(a) 224 | ror x16,x27,#14 225 | add x22,x22,x28 // h+=K[i] 226 | eor x11,x27,x27,ror#23 227 | and x17,x20,x27 228 | bic x28,x21,x27 229 | add x22,x22,x8 // h+=X[i] 230 | orr x17,x17,x28 // Ch(e,f,g) 231 | eor x28,x23,x24 // a^b, b^c in next round 232 | eor x16,x16,x11,ror#18 // Sigma1(e) 233 | ror x11,x23,#28 234 | add x22,x22,x17 // h+=Ch(e,f,g) 235 | eor x17,x23,x23,ror#5 236 | add x22,x22,x16 // h+=Sigma1(e) 237 | and x19,x19,x28 // (b^c)&=(a^b) 238 | add x26,x26,x22 // d+=h 239 | eor x19,x19,x24 // Maj(a,b,c) 240 | eor x17,x11,x17,ror#34 // Sigma0(a) 241 | add x22,x22,x19 // h+=Maj(a,b,c) 242 | ldr x19,[x30],#8 // *K++, x28 in next round 243 | //add x22,x22,x17 // h+=Sigma0(a) 244 | #ifndef __AARCH64EB__ 245 | rev x9,x9 // 6 246 | #endif 247 | add x22,x22,x17 // h+=Sigma0(a) 248 | ror x16,x26,#14 249 | add x21,x21,x19 // h+=K[i] 250 | eor x12,x26,x26,ror#23 251 | and x17,x27,x26 252 | bic x19,x20,x26 253 | add x21,x21,x9 // h+=X[i] 254 | orr x17,x17,x19 // Ch(e,f,g) 255 | eor x19,x22,x23 // a^b, b^c in next round 256 | eor x16,x16,x12,ror#18 // Sigma1(e) 257 | ror x12,x22,#28 258 | add x21,x21,x17 // h+=Ch(e,f,g) 259 | eor x17,x22,x22,ror#5 260 | add x21,x21,x16 // h+=Sigma1(e) 261 | and x28,x28,x19 // (b^c)&=(a^b) 262 | add x25,x25,x21 // d+=h 263 | eor x28,x28,x23 // Maj(a,b,c) 264 | eor x17,x12,x17,ror#34 // Sigma0(a) 265 | add x21,x21,x28 // h+=Maj(a,b,c) 266 | ldr x28,[x30],#8 // *K++, x19 in next round 267 | //add x21,x21,x17 // h+=Sigma0(a) 268 | #ifndef __AARCH64EB__ 269 | rev x10,x10 // 7 270 | #endif 271 | ldp x11,x12,[x1],#2*8 272 | add x21,x21,x17 // h+=Sigma0(a) 273 | ror x16,x25,#14 274 | add x20,x20,x28 // h+=K[i] 275 | eor x13,x25,x25,ror#23 276 | and x17,x26,x25 277 | bic x28,x27,x25 278 | add x20,x20,x10 // h+=X[i] 279 | orr x17,x17,x28 // Ch(e,f,g) 280 | eor x28,x21,x22 // a^b, b^c in next round 281 | eor x16,x16,x13,ror#18 // Sigma1(e) 282 | ror x13,x21,#28 283 | add x20,x20,x17 // h+=Ch(e,f,g) 284 | eor x17,x21,x21,ror#5 285 | add x20,x20,x16 // h+=Sigma1(e) 286 | and x19,x19,x28 // (b^c)&=(a^b) 287 | add x24,x24,x20 // d+=h 288 | eor x19,x19,x22 // Maj(a,b,c) 289 | eor x17,x13,x17,ror#34 // Sigma0(a) 290 | add x20,x20,x19 // h+=Maj(a,b,c) 291 | ldr x19,[x30],#8 // *K++, x28 in next round 292 | //add x20,x20,x17 // h+=Sigma0(a) 293 | #ifndef __AARCH64EB__ 294 | rev x11,x11 // 8 295 | #endif 296 | add x20,x20,x17 // h+=Sigma0(a) 297 | ror x16,x24,#14 298 | add x27,x27,x19 // h+=K[i] 299 | eor x14,x24,x24,ror#23 300 | and x17,x25,x24 301 | bic x19,x26,x24 302 | add 
x27,x27,x11 // h+=X[i] 303 | orr x17,x17,x19 // Ch(e,f,g) 304 | eor x19,x20,x21 // a^b, b^c in next round 305 | eor x16,x16,x14,ror#18 // Sigma1(e) 306 | ror x14,x20,#28 307 | add x27,x27,x17 // h+=Ch(e,f,g) 308 | eor x17,x20,x20,ror#5 309 | add x27,x27,x16 // h+=Sigma1(e) 310 | and x28,x28,x19 // (b^c)&=(a^b) 311 | add x23,x23,x27 // d+=h 312 | eor x28,x28,x21 // Maj(a,b,c) 313 | eor x17,x14,x17,ror#34 // Sigma0(a) 314 | add x27,x27,x28 // h+=Maj(a,b,c) 315 | ldr x28,[x30],#8 // *K++, x19 in next round 316 | //add x27,x27,x17 // h+=Sigma0(a) 317 | #ifndef __AARCH64EB__ 318 | rev x12,x12 // 9 319 | #endif 320 | ldp x13,x14,[x1],#2*8 321 | add x27,x27,x17 // h+=Sigma0(a) 322 | ror x16,x23,#14 323 | add x26,x26,x28 // h+=K[i] 324 | eor x15,x23,x23,ror#23 325 | and x17,x24,x23 326 | bic x28,x25,x23 327 | add x26,x26,x12 // h+=X[i] 328 | orr x17,x17,x28 // Ch(e,f,g) 329 | eor x28,x27,x20 // a^b, b^c in next round 330 | eor x16,x16,x15,ror#18 // Sigma1(e) 331 | ror x15,x27,#28 332 | add x26,x26,x17 // h+=Ch(e,f,g) 333 | eor x17,x27,x27,ror#5 334 | add x26,x26,x16 // h+=Sigma1(e) 335 | and x19,x19,x28 // (b^c)&=(a^b) 336 | add x22,x22,x26 // d+=h 337 | eor x19,x19,x20 // Maj(a,b,c) 338 | eor x17,x15,x17,ror#34 // Sigma0(a) 339 | add x26,x26,x19 // h+=Maj(a,b,c) 340 | ldr x19,[x30],#8 // *K++, x28 in next round 341 | //add x26,x26,x17 // h+=Sigma0(a) 342 | #ifndef __AARCH64EB__ 343 | rev x13,x13 // 10 344 | #endif 345 | add x26,x26,x17 // h+=Sigma0(a) 346 | ror x16,x22,#14 347 | add x25,x25,x19 // h+=K[i] 348 | eor x0,x22,x22,ror#23 349 | and x17,x23,x22 350 | bic x19,x24,x22 351 | add x25,x25,x13 // h+=X[i] 352 | orr x17,x17,x19 // Ch(e,f,g) 353 | eor x19,x26,x27 // a^b, b^c in next round 354 | eor x16,x16,x0,ror#18 // Sigma1(e) 355 | ror x0,x26,#28 356 | add x25,x25,x17 // h+=Ch(e,f,g) 357 | eor x17,x26,x26,ror#5 358 | add x25,x25,x16 // h+=Sigma1(e) 359 | and x28,x28,x19 // (b^c)&=(a^b) 360 | add x21,x21,x25 // d+=h 361 | eor x28,x28,x27 // Maj(a,b,c) 362 | eor x17,x0,x17,ror#34 // Sigma0(a) 363 | add x25,x25,x28 // h+=Maj(a,b,c) 364 | ldr x28,[x30],#8 // *K++, x19 in next round 365 | //add x25,x25,x17 // h+=Sigma0(a) 366 | #ifndef __AARCH64EB__ 367 | rev x14,x14 // 11 368 | #endif 369 | ldp x15,x0,[x1],#2*8 370 | add x25,x25,x17 // h+=Sigma0(a) 371 | str x6,[sp,#24] 372 | ror x16,x21,#14 373 | add x24,x24,x28 // h+=K[i] 374 | eor x6,x21,x21,ror#23 375 | and x17,x22,x21 376 | bic x28,x23,x21 377 | add x24,x24,x14 // h+=X[i] 378 | orr x17,x17,x28 // Ch(e,f,g) 379 | eor x28,x25,x26 // a^b, b^c in next round 380 | eor x16,x16,x6,ror#18 // Sigma1(e) 381 | ror x6,x25,#28 382 | add x24,x24,x17 // h+=Ch(e,f,g) 383 | eor x17,x25,x25,ror#5 384 | add x24,x24,x16 // h+=Sigma1(e) 385 | and x19,x19,x28 // (b^c)&=(a^b) 386 | add x20,x20,x24 // d+=h 387 | eor x19,x19,x26 // Maj(a,b,c) 388 | eor x17,x6,x17,ror#34 // Sigma0(a) 389 | add x24,x24,x19 // h+=Maj(a,b,c) 390 | ldr x19,[x30],#8 // *K++, x28 in next round 391 | //add x24,x24,x17 // h+=Sigma0(a) 392 | #ifndef __AARCH64EB__ 393 | rev x15,x15 // 12 394 | #endif 395 | add x24,x24,x17 // h+=Sigma0(a) 396 | str x7,[sp,#0] 397 | ror x16,x20,#14 398 | add x23,x23,x19 // h+=K[i] 399 | eor x7,x20,x20,ror#23 400 | and x17,x21,x20 401 | bic x19,x22,x20 402 | add x23,x23,x15 // h+=X[i] 403 | orr x17,x17,x19 // Ch(e,f,g) 404 | eor x19,x24,x25 // a^b, b^c in next round 405 | eor x16,x16,x7,ror#18 // Sigma1(e) 406 | ror x7,x24,#28 407 | add x23,x23,x17 // h+=Ch(e,f,g) 408 | eor x17,x24,x24,ror#5 409 | add x23,x23,x16 // h+=Sigma1(e) 410 | and x28,x28,x19 // 
(b^c)&=(a^b) 411 | add x27,x27,x23 // d+=h 412 | eor x28,x28,x25 // Maj(a,b,c) 413 | eor x17,x7,x17,ror#34 // Sigma0(a) 414 | add x23,x23,x28 // h+=Maj(a,b,c) 415 | ldr x28,[x30],#8 // *K++, x19 in next round 416 | //add x23,x23,x17 // h+=Sigma0(a) 417 | #ifndef __AARCH64EB__ 418 | rev x0,x0 // 13 419 | #endif 420 | ldp x1,x2,[x1] 421 | add x23,x23,x17 // h+=Sigma0(a) 422 | str x8,[sp,#8] 423 | ror x16,x27,#14 424 | add x22,x22,x28 // h+=K[i] 425 | eor x8,x27,x27,ror#23 426 | and x17,x20,x27 427 | bic x28,x21,x27 428 | add x22,x22,x0 // h+=X[i] 429 | orr x17,x17,x28 // Ch(e,f,g) 430 | eor x28,x23,x24 // a^b, b^c in next round 431 | eor x16,x16,x8,ror#18 // Sigma1(e) 432 | ror x8,x23,#28 433 | add x22,x22,x17 // h+=Ch(e,f,g) 434 | eor x17,x23,x23,ror#5 435 | add x22,x22,x16 // h+=Sigma1(e) 436 | and x19,x19,x28 // (b^c)&=(a^b) 437 | add x26,x26,x22 // d+=h 438 | eor x19,x19,x24 // Maj(a,b,c) 439 | eor x17,x8,x17,ror#34 // Sigma0(a) 440 | add x22,x22,x19 // h+=Maj(a,b,c) 441 | ldr x19,[x30],#8 // *K++, x28 in next round 442 | //add x22,x22,x17 // h+=Sigma0(a) 443 | #ifndef __AARCH64EB__ 444 | rev x1,x1 // 14 445 | #endif 446 | ldr x6,[sp,#24] 447 | add x22,x22,x17 // h+=Sigma0(a) 448 | str x9,[sp,#16] 449 | ror x16,x26,#14 450 | add x21,x21,x19 // h+=K[i] 451 | eor x9,x26,x26,ror#23 452 | and x17,x27,x26 453 | bic x19,x20,x26 454 | add x21,x21,x1 // h+=X[i] 455 | orr x17,x17,x19 // Ch(e,f,g) 456 | eor x19,x22,x23 // a^b, b^c in next round 457 | eor x16,x16,x9,ror#18 // Sigma1(e) 458 | ror x9,x22,#28 459 | add x21,x21,x17 // h+=Ch(e,f,g) 460 | eor x17,x22,x22,ror#5 461 | add x21,x21,x16 // h+=Sigma1(e) 462 | and x28,x28,x19 // (b^c)&=(a^b) 463 | add x25,x25,x21 // d+=h 464 | eor x28,x28,x23 // Maj(a,b,c) 465 | eor x17,x9,x17,ror#34 // Sigma0(a) 466 | add x21,x21,x28 // h+=Maj(a,b,c) 467 | ldr x28,[x30],#8 // *K++, x19 in next round 468 | //add x21,x21,x17 // h+=Sigma0(a) 469 | #ifndef __AARCH64EB__ 470 | rev x2,x2 // 15 471 | #endif 472 | ldr x7,[sp,#0] 473 | add x21,x21,x17 // h+=Sigma0(a) 474 | str x10,[sp,#24] 475 | ror x16,x25,#14 476 | add x20,x20,x28 // h+=K[i] 477 | ror x9,x4,#1 478 | and x17,x26,x25 479 | ror x8,x1,#19 480 | bic x28,x27,x25 481 | ror x10,x21,#28 482 | add x20,x20,x2 // h+=X[i] 483 | eor x16,x16,x25,ror#18 484 | eor x9,x9,x4,ror#8 485 | orr x17,x17,x28 // Ch(e,f,g) 486 | eor x28,x21,x22 // a^b, b^c in next round 487 | eor x16,x16,x25,ror#41 // Sigma1(e) 488 | eor x10,x10,x21,ror#34 489 | add x20,x20,x17 // h+=Ch(e,f,g) 490 | and x19,x19,x28 // (b^c)&=(a^b) 491 | eor x8,x8,x1,ror#61 492 | eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) 493 | add x20,x20,x16 // h+=Sigma1(e) 494 | eor x19,x19,x22 // Maj(a,b,c) 495 | eor x17,x10,x21,ror#39 // Sigma0(a) 496 | eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) 497 | add x3,x3,x12 498 | add x24,x24,x20 // d+=h 499 | add x20,x20,x19 // h+=Maj(a,b,c) 500 | ldr x19,[x30],#8 // *K++, x28 in next round 501 | add x3,x3,x9 502 | add x20,x20,x17 // h+=Sigma0(a) 503 | add x3,x3,x8 504 | .Loop_16_xx: 505 | ldr x8,[sp,#8] 506 | str x11,[sp,#0] 507 | ror x16,x24,#14 508 | add x27,x27,x19 // h+=K[i] 509 | ror x10,x5,#1 510 | and x17,x25,x24 511 | ror x9,x2,#19 512 | bic x19,x26,x24 513 | ror x11,x20,#28 514 | add x27,x27,x3 // h+=X[i] 515 | eor x16,x16,x24,ror#18 516 | eor x10,x10,x5,ror#8 517 | orr x17,x17,x19 // Ch(e,f,g) 518 | eor x19,x20,x21 // a^b, b^c in next round 519 | eor x16,x16,x24,ror#41 // Sigma1(e) 520 | eor x11,x11,x20,ror#34 521 | add x27,x27,x17 // h+=Ch(e,f,g) 522 | and x28,x28,x19 // (b^c)&=(a^b) 523 | eor x9,x9,x2,ror#61 524 | eor 
x10,x10,x5,lsr#7 // sigma0(X[i+1]) 525 | add x27,x27,x16 // h+=Sigma1(e) 526 | eor x28,x28,x21 // Maj(a,b,c) 527 | eor x17,x11,x20,ror#39 // Sigma0(a) 528 | eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) 529 | add x4,x4,x13 530 | add x23,x23,x27 // d+=h 531 | add x27,x27,x28 // h+=Maj(a,b,c) 532 | ldr x28,[x30],#8 // *K++, x19 in next round 533 | add x4,x4,x10 534 | add x27,x27,x17 // h+=Sigma0(a) 535 | add x4,x4,x9 536 | ldr x9,[sp,#16] 537 | str x12,[sp,#8] 538 | ror x16,x23,#14 539 | add x26,x26,x28 // h+=K[i] 540 | ror x11,x6,#1 541 | and x17,x24,x23 542 | ror x10,x3,#19 543 | bic x28,x25,x23 544 | ror x12,x27,#28 545 | add x26,x26,x4 // h+=X[i] 546 | eor x16,x16,x23,ror#18 547 | eor x11,x11,x6,ror#8 548 | orr x17,x17,x28 // Ch(e,f,g) 549 | eor x28,x27,x20 // a^b, b^c in next round 550 | eor x16,x16,x23,ror#41 // Sigma1(e) 551 | eor x12,x12,x27,ror#34 552 | add x26,x26,x17 // h+=Ch(e,f,g) 553 | and x19,x19,x28 // (b^c)&=(a^b) 554 | eor x10,x10,x3,ror#61 555 | eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) 556 | add x26,x26,x16 // h+=Sigma1(e) 557 | eor x19,x19,x20 // Maj(a,b,c) 558 | eor x17,x12,x27,ror#39 // Sigma0(a) 559 | eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) 560 | add x5,x5,x14 561 | add x22,x22,x26 // d+=h 562 | add x26,x26,x19 // h+=Maj(a,b,c) 563 | ldr x19,[x30],#8 // *K++, x28 in next round 564 | add x5,x5,x11 565 | add x26,x26,x17 // h+=Sigma0(a) 566 | add x5,x5,x10 567 | ldr x10,[sp,#24] 568 | str x13,[sp,#16] 569 | ror x16,x22,#14 570 | add x25,x25,x19 // h+=K[i] 571 | ror x12,x7,#1 572 | and x17,x23,x22 573 | ror x11,x4,#19 574 | bic x19,x24,x22 575 | ror x13,x26,#28 576 | add x25,x25,x5 // h+=X[i] 577 | eor x16,x16,x22,ror#18 578 | eor x12,x12,x7,ror#8 579 | orr x17,x17,x19 // Ch(e,f,g) 580 | eor x19,x26,x27 // a^b, b^c in next round 581 | eor x16,x16,x22,ror#41 // Sigma1(e) 582 | eor x13,x13,x26,ror#34 583 | add x25,x25,x17 // h+=Ch(e,f,g) 584 | and x28,x28,x19 // (b^c)&=(a^b) 585 | eor x11,x11,x4,ror#61 586 | eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) 587 | add x25,x25,x16 // h+=Sigma1(e) 588 | eor x28,x28,x27 // Maj(a,b,c) 589 | eor x17,x13,x26,ror#39 // Sigma0(a) 590 | eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) 591 | add x6,x6,x15 592 | add x21,x21,x25 // d+=h 593 | add x25,x25,x28 // h+=Maj(a,b,c) 594 | ldr x28,[x30],#8 // *K++, x19 in next round 595 | add x6,x6,x12 596 | add x25,x25,x17 // h+=Sigma0(a) 597 | add x6,x6,x11 598 | ldr x11,[sp,#0] 599 | str x14,[sp,#24] 600 | ror x16,x21,#14 601 | add x24,x24,x28 // h+=K[i] 602 | ror x13,x8,#1 603 | and x17,x22,x21 604 | ror x12,x5,#19 605 | bic x28,x23,x21 606 | ror x14,x25,#28 607 | add x24,x24,x6 // h+=X[i] 608 | eor x16,x16,x21,ror#18 609 | eor x13,x13,x8,ror#8 610 | orr x17,x17,x28 // Ch(e,f,g) 611 | eor x28,x25,x26 // a^b, b^c in next round 612 | eor x16,x16,x21,ror#41 // Sigma1(e) 613 | eor x14,x14,x25,ror#34 614 | add x24,x24,x17 // h+=Ch(e,f,g) 615 | and x19,x19,x28 // (b^c)&=(a^b) 616 | eor x12,x12,x5,ror#61 617 | eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) 618 | add x24,x24,x16 // h+=Sigma1(e) 619 | eor x19,x19,x26 // Maj(a,b,c) 620 | eor x17,x14,x25,ror#39 // Sigma0(a) 621 | eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) 622 | add x7,x7,x0 623 | add x20,x20,x24 // d+=h 624 | add x24,x24,x19 // h+=Maj(a,b,c) 625 | ldr x19,[x30],#8 // *K++, x28 in next round 626 | add x7,x7,x13 627 | add x24,x24,x17 // h+=Sigma0(a) 628 | add x7,x7,x12 629 | ldr x12,[sp,#8] 630 | str x15,[sp,#0] 631 | ror x16,x20,#14 632 | add x23,x23,x19 // h+=K[i] 633 | ror x14,x9,#1 634 | and x17,x21,x20 635 | ror x13,x6,#19 636 | bic x19,x22,x20 637 | ror x15,x24,#28 638 | 
add x23,x23,x7 // h+=X[i] 639 | eor x16,x16,x20,ror#18 640 | eor x14,x14,x9,ror#8 641 | orr x17,x17,x19 // Ch(e,f,g) 642 | eor x19,x24,x25 // a^b, b^c in next round 643 | eor x16,x16,x20,ror#41 // Sigma1(e) 644 | eor x15,x15,x24,ror#34 645 | add x23,x23,x17 // h+=Ch(e,f,g) 646 | and x28,x28,x19 // (b^c)&=(a^b) 647 | eor x13,x13,x6,ror#61 648 | eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) 649 | add x23,x23,x16 // h+=Sigma1(e) 650 | eor x28,x28,x25 // Maj(a,b,c) 651 | eor x17,x15,x24,ror#39 // Sigma0(a) 652 | eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) 653 | add x8,x8,x1 654 | add x27,x27,x23 // d+=h 655 | add x23,x23,x28 // h+=Maj(a,b,c) 656 | ldr x28,[x30],#8 // *K++, x19 in next round 657 | add x8,x8,x14 658 | add x23,x23,x17 // h+=Sigma0(a) 659 | add x8,x8,x13 660 | ldr x13,[sp,#16] 661 | str x0,[sp,#8] 662 | ror x16,x27,#14 663 | add x22,x22,x28 // h+=K[i] 664 | ror x15,x10,#1 665 | and x17,x20,x27 666 | ror x14,x7,#19 667 | bic x28,x21,x27 668 | ror x0,x23,#28 669 | add x22,x22,x8 // h+=X[i] 670 | eor x16,x16,x27,ror#18 671 | eor x15,x15,x10,ror#8 672 | orr x17,x17,x28 // Ch(e,f,g) 673 | eor x28,x23,x24 // a^b, b^c in next round 674 | eor x16,x16,x27,ror#41 // Sigma1(e) 675 | eor x0,x0,x23,ror#34 676 | add x22,x22,x17 // h+=Ch(e,f,g) 677 | and x19,x19,x28 // (b^c)&=(a^b) 678 | eor x14,x14,x7,ror#61 679 | eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) 680 | add x22,x22,x16 // h+=Sigma1(e) 681 | eor x19,x19,x24 // Maj(a,b,c) 682 | eor x17,x0,x23,ror#39 // Sigma0(a) 683 | eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) 684 | add x9,x9,x2 685 | add x26,x26,x22 // d+=h 686 | add x22,x22,x19 // h+=Maj(a,b,c) 687 | ldr x19,[x30],#8 // *K++, x28 in next round 688 | add x9,x9,x15 689 | add x22,x22,x17 // h+=Sigma0(a) 690 | add x9,x9,x14 691 | ldr x14,[sp,#24] 692 | str x1,[sp,#16] 693 | ror x16,x26,#14 694 | add x21,x21,x19 // h+=K[i] 695 | ror x0,x11,#1 696 | and x17,x27,x26 697 | ror x15,x8,#19 698 | bic x19,x20,x26 699 | ror x1,x22,#28 700 | add x21,x21,x9 // h+=X[i] 701 | eor x16,x16,x26,ror#18 702 | eor x0,x0,x11,ror#8 703 | orr x17,x17,x19 // Ch(e,f,g) 704 | eor x19,x22,x23 // a^b, b^c in next round 705 | eor x16,x16,x26,ror#41 // Sigma1(e) 706 | eor x1,x1,x22,ror#34 707 | add x21,x21,x17 // h+=Ch(e,f,g) 708 | and x28,x28,x19 // (b^c)&=(a^b) 709 | eor x15,x15,x8,ror#61 710 | eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) 711 | add x21,x21,x16 // h+=Sigma1(e) 712 | eor x28,x28,x23 // Maj(a,b,c) 713 | eor x17,x1,x22,ror#39 // Sigma0(a) 714 | eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) 715 | add x10,x10,x3 716 | add x25,x25,x21 // d+=h 717 | add x21,x21,x28 // h+=Maj(a,b,c) 718 | ldr x28,[x30],#8 // *K++, x19 in next round 719 | add x10,x10,x0 720 | add x21,x21,x17 // h+=Sigma0(a) 721 | add x10,x10,x15 722 | ldr x15,[sp,#0] 723 | str x2,[sp,#24] 724 | ror x16,x25,#14 725 | add x20,x20,x28 // h+=K[i] 726 | ror x1,x12,#1 727 | and x17,x26,x25 728 | ror x0,x9,#19 729 | bic x28,x27,x25 730 | ror x2,x21,#28 731 | add x20,x20,x10 // h+=X[i] 732 | eor x16,x16,x25,ror#18 733 | eor x1,x1,x12,ror#8 734 | orr x17,x17,x28 // Ch(e,f,g) 735 | eor x28,x21,x22 // a^b, b^c in next round 736 | eor x16,x16,x25,ror#41 // Sigma1(e) 737 | eor x2,x2,x21,ror#34 738 | add x20,x20,x17 // h+=Ch(e,f,g) 739 | and x19,x19,x28 // (b^c)&=(a^b) 740 | eor x0,x0,x9,ror#61 741 | eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) 742 | add x20,x20,x16 // h+=Sigma1(e) 743 | eor x19,x19,x22 // Maj(a,b,c) 744 | eor x17,x2,x21,ror#39 // Sigma0(a) 745 | eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) 746 | add x11,x11,x4 747 | add x24,x24,x20 // d+=h 748 | add x20,x20,x19 // h+=Maj(a,b,c) 749 
| ldr x19,[x30],#8 // *K++, x28 in next round 750 | add x11,x11,x1 751 | add x20,x20,x17 // h+=Sigma0(a) 752 | add x11,x11,x0 753 | ldr x0,[sp,#8] 754 | str x3,[sp,#0] 755 | ror x16,x24,#14 756 | add x27,x27,x19 // h+=K[i] 757 | ror x2,x13,#1 758 | and x17,x25,x24 759 | ror x1,x10,#19 760 | bic x19,x26,x24 761 | ror x3,x20,#28 762 | add x27,x27,x11 // h+=X[i] 763 | eor x16,x16,x24,ror#18 764 | eor x2,x2,x13,ror#8 765 | orr x17,x17,x19 // Ch(e,f,g) 766 | eor x19,x20,x21 // a^b, b^c in next round 767 | eor x16,x16,x24,ror#41 // Sigma1(e) 768 | eor x3,x3,x20,ror#34 769 | add x27,x27,x17 // h+=Ch(e,f,g) 770 | and x28,x28,x19 // (b^c)&=(a^b) 771 | eor x1,x1,x10,ror#61 772 | eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) 773 | add x27,x27,x16 // h+=Sigma1(e) 774 | eor x28,x28,x21 // Maj(a,b,c) 775 | eor x17,x3,x20,ror#39 // Sigma0(a) 776 | eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) 777 | add x12,x12,x5 778 | add x23,x23,x27 // d+=h 779 | add x27,x27,x28 // h+=Maj(a,b,c) 780 | ldr x28,[x30],#8 // *K++, x19 in next round 781 | add x12,x12,x2 782 | add x27,x27,x17 // h+=Sigma0(a) 783 | add x12,x12,x1 784 | ldr x1,[sp,#16] 785 | str x4,[sp,#8] 786 | ror x16,x23,#14 787 | add x26,x26,x28 // h+=K[i] 788 | ror x3,x14,#1 789 | and x17,x24,x23 790 | ror x2,x11,#19 791 | bic x28,x25,x23 792 | ror x4,x27,#28 793 | add x26,x26,x12 // h+=X[i] 794 | eor x16,x16,x23,ror#18 795 | eor x3,x3,x14,ror#8 796 | orr x17,x17,x28 // Ch(e,f,g) 797 | eor x28,x27,x20 // a^b, b^c in next round 798 | eor x16,x16,x23,ror#41 // Sigma1(e) 799 | eor x4,x4,x27,ror#34 800 | add x26,x26,x17 // h+=Ch(e,f,g) 801 | and x19,x19,x28 // (b^c)&=(a^b) 802 | eor x2,x2,x11,ror#61 803 | eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) 804 | add x26,x26,x16 // h+=Sigma1(e) 805 | eor x19,x19,x20 // Maj(a,b,c) 806 | eor x17,x4,x27,ror#39 // Sigma0(a) 807 | eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) 808 | add x13,x13,x6 809 | add x22,x22,x26 // d+=h 810 | add x26,x26,x19 // h+=Maj(a,b,c) 811 | ldr x19,[x30],#8 // *K++, x28 in next round 812 | add x13,x13,x3 813 | add x26,x26,x17 // h+=Sigma0(a) 814 | add x13,x13,x2 815 | ldr x2,[sp,#24] 816 | str x5,[sp,#16] 817 | ror x16,x22,#14 818 | add x25,x25,x19 // h+=K[i] 819 | ror x4,x15,#1 820 | and x17,x23,x22 821 | ror x3,x12,#19 822 | bic x19,x24,x22 823 | ror x5,x26,#28 824 | add x25,x25,x13 // h+=X[i] 825 | eor x16,x16,x22,ror#18 826 | eor x4,x4,x15,ror#8 827 | orr x17,x17,x19 // Ch(e,f,g) 828 | eor x19,x26,x27 // a^b, b^c in next round 829 | eor x16,x16,x22,ror#41 // Sigma1(e) 830 | eor x5,x5,x26,ror#34 831 | add x25,x25,x17 // h+=Ch(e,f,g) 832 | and x28,x28,x19 // (b^c)&=(a^b) 833 | eor x3,x3,x12,ror#61 834 | eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) 835 | add x25,x25,x16 // h+=Sigma1(e) 836 | eor x28,x28,x27 // Maj(a,b,c) 837 | eor x17,x5,x26,ror#39 // Sigma0(a) 838 | eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) 839 | add x14,x14,x7 840 | add x21,x21,x25 // d+=h 841 | add x25,x25,x28 // h+=Maj(a,b,c) 842 | ldr x28,[x30],#8 // *K++, x19 in next round 843 | add x14,x14,x4 844 | add x25,x25,x17 // h+=Sigma0(a) 845 | add x14,x14,x3 846 | ldr x3,[sp,#0] 847 | str x6,[sp,#24] 848 | ror x16,x21,#14 849 | add x24,x24,x28 // h+=K[i] 850 | ror x5,x0,#1 851 | and x17,x22,x21 852 | ror x4,x13,#19 853 | bic x28,x23,x21 854 | ror x6,x25,#28 855 | add x24,x24,x14 // h+=X[i] 856 | eor x16,x16,x21,ror#18 857 | eor x5,x5,x0,ror#8 858 | orr x17,x17,x28 // Ch(e,f,g) 859 | eor x28,x25,x26 // a^b, b^c in next round 860 | eor x16,x16,x21,ror#41 // Sigma1(e) 861 | eor x6,x6,x25,ror#34 862 | add x24,x24,x17 // h+=Ch(e,f,g) 863 | and x19,x19,x28 // 
(b^c)&=(a^b) 864 | eor x4,x4,x13,ror#61 865 | eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) 866 | add x24,x24,x16 // h+=Sigma1(e) 867 | eor x19,x19,x26 // Maj(a,b,c) 868 | eor x17,x6,x25,ror#39 // Sigma0(a) 869 | eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) 870 | add x15,x15,x8 871 | add x20,x20,x24 // d+=h 872 | add x24,x24,x19 // h+=Maj(a,b,c) 873 | ldr x19,[x30],#8 // *K++, x28 in next round 874 | add x15,x15,x5 875 | add x24,x24,x17 // h+=Sigma0(a) 876 | add x15,x15,x4 877 | ldr x4,[sp,#8] 878 | str x7,[sp,#0] 879 | ror x16,x20,#14 880 | add x23,x23,x19 // h+=K[i] 881 | ror x6,x1,#1 882 | and x17,x21,x20 883 | ror x5,x14,#19 884 | bic x19,x22,x20 885 | ror x7,x24,#28 886 | add x23,x23,x15 // h+=X[i] 887 | eor x16,x16,x20,ror#18 888 | eor x6,x6,x1,ror#8 889 | orr x17,x17,x19 // Ch(e,f,g) 890 | eor x19,x24,x25 // a^b, b^c in next round 891 | eor x16,x16,x20,ror#41 // Sigma1(e) 892 | eor x7,x7,x24,ror#34 893 | add x23,x23,x17 // h+=Ch(e,f,g) 894 | and x28,x28,x19 // (b^c)&=(a^b) 895 | eor x5,x5,x14,ror#61 896 | eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) 897 | add x23,x23,x16 // h+=Sigma1(e) 898 | eor x28,x28,x25 // Maj(a,b,c) 899 | eor x17,x7,x24,ror#39 // Sigma0(a) 900 | eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) 901 | add x0,x0,x9 902 | add x27,x27,x23 // d+=h 903 | add x23,x23,x28 // h+=Maj(a,b,c) 904 | ldr x28,[x30],#8 // *K++, x19 in next round 905 | add x0,x0,x6 906 | add x23,x23,x17 // h+=Sigma0(a) 907 | add x0,x0,x5 908 | ldr x5,[sp,#16] 909 | str x8,[sp,#8] 910 | ror x16,x27,#14 911 | add x22,x22,x28 // h+=K[i] 912 | ror x7,x2,#1 913 | and x17,x20,x27 914 | ror x6,x15,#19 915 | bic x28,x21,x27 916 | ror x8,x23,#28 917 | add x22,x22,x0 // h+=X[i] 918 | eor x16,x16,x27,ror#18 919 | eor x7,x7,x2,ror#8 920 | orr x17,x17,x28 // Ch(e,f,g) 921 | eor x28,x23,x24 // a^b, b^c in next round 922 | eor x16,x16,x27,ror#41 // Sigma1(e) 923 | eor x8,x8,x23,ror#34 924 | add x22,x22,x17 // h+=Ch(e,f,g) 925 | and x19,x19,x28 // (b^c)&=(a^b) 926 | eor x6,x6,x15,ror#61 927 | eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) 928 | add x22,x22,x16 // h+=Sigma1(e) 929 | eor x19,x19,x24 // Maj(a,b,c) 930 | eor x17,x8,x23,ror#39 // Sigma0(a) 931 | eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) 932 | add x1,x1,x10 933 | add x26,x26,x22 // d+=h 934 | add x22,x22,x19 // h+=Maj(a,b,c) 935 | ldr x19,[x30],#8 // *K++, x28 in next round 936 | add x1,x1,x7 937 | add x22,x22,x17 // h+=Sigma0(a) 938 | add x1,x1,x6 939 | ldr x6,[sp,#24] 940 | str x9,[sp,#16] 941 | ror x16,x26,#14 942 | add x21,x21,x19 // h+=K[i] 943 | ror x8,x3,#1 944 | and x17,x27,x26 945 | ror x7,x0,#19 946 | bic x19,x20,x26 947 | ror x9,x22,#28 948 | add x21,x21,x1 // h+=X[i] 949 | eor x16,x16,x26,ror#18 950 | eor x8,x8,x3,ror#8 951 | orr x17,x17,x19 // Ch(e,f,g) 952 | eor x19,x22,x23 // a^b, b^c in next round 953 | eor x16,x16,x26,ror#41 // Sigma1(e) 954 | eor x9,x9,x22,ror#34 955 | add x21,x21,x17 // h+=Ch(e,f,g) 956 | and x28,x28,x19 // (b^c)&=(a^b) 957 | eor x7,x7,x0,ror#61 958 | eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) 959 | add x21,x21,x16 // h+=Sigma1(e) 960 | eor x28,x28,x23 // Maj(a,b,c) 961 | eor x17,x9,x22,ror#39 // Sigma0(a) 962 | eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) 963 | add x2,x2,x11 964 | add x25,x25,x21 // d+=h 965 | add x21,x21,x28 // h+=Maj(a,b,c) 966 | ldr x28,[x30],#8 // *K++, x19 in next round 967 | add x2,x2,x8 968 | add x21,x21,x17 // h+=Sigma0(a) 969 | add x2,x2,x7 970 | ldr x7,[sp,#0] 971 | str x10,[sp,#24] 972 | ror x16,x25,#14 973 | add x20,x20,x28 // h+=K[i] 974 | ror x9,x4,#1 975 | and x17,x26,x25 976 | ror x8,x1,#19 977 | bic x28,x27,x25 978 | ror x10,x21,#28 979 
| add x20,x20,x2 // h+=X[i] 980 | eor x16,x16,x25,ror#18 981 | eor x9,x9,x4,ror#8 982 | orr x17,x17,x28 // Ch(e,f,g) 983 | eor x28,x21,x22 // a^b, b^c in next round 984 | eor x16,x16,x25,ror#41 // Sigma1(e) 985 | eor x10,x10,x21,ror#34 986 | add x20,x20,x17 // h+=Ch(e,f,g) 987 | and x19,x19,x28 // (b^c)&=(a^b) 988 | eor x8,x8,x1,ror#61 989 | eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) 990 | add x20,x20,x16 // h+=Sigma1(e) 991 | eor x19,x19,x22 // Maj(a,b,c) 992 | eor x17,x10,x21,ror#39 // Sigma0(a) 993 | eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) 994 | add x3,x3,x12 995 | add x24,x24,x20 // d+=h 996 | add x20,x20,x19 // h+=Maj(a,b,c) 997 | ldr x19,[x30],#8 // *K++, x28 in next round 998 | add x3,x3,x9 999 | add x20,x20,x17 // h+=Sigma0(a) 1000 | add x3,x3,x8 1001 | cbnz x19,.Loop_16_xx 1002 | 1003 | ldp x0,x2,[x29,#96] 1004 | ldr x1,[x29,#112] 1005 | sub x30,x30,#648 // rewind 1006 | 1007 | ldp x3,x4,[x0] 1008 | ldp x5,x6,[x0,#2*8] 1009 | add x1,x1,#14*8 // advance input pointer 1010 | ldp x7,x8,[x0,#4*8] 1011 | add x20,x20,x3 1012 | ldp x9,x10,[x0,#6*8] 1013 | add x21,x21,x4 1014 | add x22,x22,x5 1015 | add x23,x23,x6 1016 | stp x20,x21,[x0] 1017 | add x24,x24,x7 1018 | add x25,x25,x8 1019 | stp x22,x23,[x0,#2*8] 1020 | add x26,x26,x9 1021 | add x27,x27,x10 1022 | cmp x1,x2 1023 | stp x24,x25,[x0,#4*8] 1024 | stp x26,x27,[x0,#6*8] 1025 | b.ne .Loop 1026 | 1027 | ldp x19,x20,[x29,#16] 1028 | add sp,sp,#4*8 1029 | ldp x21,x22,[x29,#32] 1030 | ldp x23,x24,[x29,#48] 1031 | ldp x25,x26,[x29,#64] 1032 | ldp x27,x28,[x29,#80] 1033 | ldp x29,x30,[sp],#128 1034 | .inst 0xd50323bf // autiasp 1035 | ret 1036 | .size sha512_block_data_order_local,.-sha512_block_data_order_local 1037 | 1038 | .align 6 1039 | .type .LK512,%object 1040 | .LK512: 1041 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd 1042 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 1043 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 1044 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 1045 | .quad 0xd807aa98a3030242,0x12835b0145706fbe 1046 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 1047 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 1048 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 1049 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 1050 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 1051 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 1052 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 1053 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 1054 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 1055 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 1056 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 1057 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 1058 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 1059 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 1060 | .quad 0x81c2c92e47edaee6,0x92722c851482353b 1061 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 1062 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 1063 | .quad 0xd192e819d6ef5218,0xd69906245565a910 1064 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 1065 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 1066 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 1067 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 1068 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 1069 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 1070 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 1071 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 1072 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 1073 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 1074 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 1075 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 1076 | .quad 
0x113f9804bef90dae,0x1b710b35131c471b 1077 | .quad 0x28db77f523047d84,0x32caab7b40c72493 1078 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 1079 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 1080 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 1081 | .quad 0 // terminator 1082 | .size .LK512,.-.LK512 1083 | .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 1084 | .align 2 1085 | .align 2 1086 | #ifndef __KERNEL__ 1087 | .type sha512_block_armv8,%function 1088 | .align 6 1089 | sha512_block_armv8: 1090 | .Lv8_entry: 1091 | stp x29,x30,[sp,#-16]! 1092 | add x29,sp,#0 1093 | 1094 | ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input 1095 | ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 1096 | 1097 | ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context 1098 | adr x3,.LK512 1099 | 1100 | rev64 v16.16b,v16.16b 1101 | rev64 v17.16b,v17.16b 1102 | rev64 v18.16b,v18.16b 1103 | rev64 v19.16b,v19.16b 1104 | rev64 v20.16b,v20.16b 1105 | rev64 v21.16b,v21.16b 1106 | rev64 v22.16b,v22.16b 1107 | rev64 v23.16b,v23.16b 1108 | b .Loop_hw 1109 | 1110 | .align 4 1111 | .Loop_hw: 1112 | ld1 {v24.2d},[x3],#16 1113 | subs x2,x2,#1 1114 | sub x4,x1,#128 1115 | orr v26.16b,v0.16b,v0.16b // offload 1116 | orr v27.16b,v1.16b,v1.16b 1117 | orr v28.16b,v2.16b,v2.16b 1118 | orr v29.16b,v3.16b,v3.16b 1119 | csel x1,x1,x4,ne // conditional rewind 1120 | add v24.2d,v24.2d,v16.2d 1121 | ld1 {v25.2d},[x3],#16 1122 | ext v24.16b,v24.16b,v24.16b,#8 1123 | ext v5.16b,v2.16b,v3.16b,#8 1124 | ext v6.16b,v1.16b,v2.16b,#8 1125 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1126 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1127 | ext v7.16b,v20.16b,v21.16b,#8 1128 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1129 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1130 | add v4.2d,v1.2d,v3.2d // "D + T1" 1131 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1132 | add v25.2d,v25.2d,v17.2d 1133 | ld1 {v24.2d},[x3],#16 1134 | ext v25.16b,v25.16b,v25.16b,#8 1135 | ext v5.16b,v4.16b,v2.16b,#8 1136 | ext v6.16b,v0.16b,v4.16b,#8 1137 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1138 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1139 | ext v7.16b,v21.16b,v22.16b,#8 1140 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1141 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1142 | add v1.2d,v0.2d,v2.2d // "D + T1" 1143 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1144 | add v24.2d,v24.2d,v18.2d 1145 | ld1 {v25.2d},[x3],#16 1146 | ext v24.16b,v24.16b,v24.16b,#8 1147 | ext v5.16b,v1.16b,v4.16b,#8 1148 | ext v6.16b,v3.16b,v1.16b,#8 1149 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1150 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1151 | ext v7.16b,v22.16b,v23.16b,#8 1152 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1153 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1154 | add v0.2d,v3.2d,v4.2d // "D + T1" 1155 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1156 | add v25.2d,v25.2d,v19.2d 1157 | ld1 {v24.2d},[x3],#16 1158 | ext v25.16b,v25.16b,v25.16b,#8 1159 | ext v5.16b,v0.16b,v1.16b,#8 1160 | ext v6.16b,v2.16b,v0.16b,#8 1161 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1162 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1163 | ext v7.16b,v23.16b,v16.16b,#8 1164 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1165 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1166 | add v3.2d,v2.2d,v1.2d // "D 
+ T1" 1167 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1168 | add v24.2d,v24.2d,v20.2d 1169 | ld1 {v25.2d},[x3],#16 1170 | ext v24.16b,v24.16b,v24.16b,#8 1171 | ext v5.16b,v3.16b,v0.16b,#8 1172 | ext v6.16b,v4.16b,v3.16b,#8 1173 | add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1174 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1175 | ext v7.16b,v16.16b,v17.16b,#8 1176 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1177 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1178 | add v2.2d,v4.2d,v0.2d // "D + T1" 1179 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1180 | add v25.2d,v25.2d,v21.2d 1181 | ld1 {v24.2d},[x3],#16 1182 | ext v25.16b,v25.16b,v25.16b,#8 1183 | ext v5.16b,v2.16b,v3.16b,#8 1184 | ext v6.16b,v1.16b,v2.16b,#8 1185 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1186 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1187 | ext v7.16b,v17.16b,v18.16b,#8 1188 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1189 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1190 | add v4.2d,v1.2d,v3.2d // "D + T1" 1191 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1192 | add v24.2d,v24.2d,v22.2d 1193 | ld1 {v25.2d},[x3],#16 1194 | ext v24.16b,v24.16b,v24.16b,#8 1195 | ext v5.16b,v4.16b,v2.16b,#8 1196 | ext v6.16b,v0.16b,v4.16b,#8 1197 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1198 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1199 | ext v7.16b,v18.16b,v19.16b,#8 1200 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1201 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1202 | add v1.2d,v0.2d,v2.2d // "D + T1" 1203 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1204 | add v25.2d,v25.2d,v23.2d 1205 | ld1 {v24.2d},[x3],#16 1206 | ext v25.16b,v25.16b,v25.16b,#8 1207 | ext v5.16b,v1.16b,v4.16b,#8 1208 | ext v6.16b,v3.16b,v1.16b,#8 1209 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1210 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1211 | ext v7.16b,v19.16b,v20.16b,#8 1212 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1213 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1214 | add v0.2d,v3.2d,v4.2d // "D + T1" 1215 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1216 | add v24.2d,v24.2d,v16.2d 1217 | ld1 {v25.2d},[x3],#16 1218 | ext v24.16b,v24.16b,v24.16b,#8 1219 | ext v5.16b,v0.16b,v1.16b,#8 1220 | ext v6.16b,v2.16b,v0.16b,#8 1221 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1222 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1223 | ext v7.16b,v20.16b,v21.16b,#8 1224 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1225 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1226 | add v3.2d,v2.2d,v1.2d // "D + T1" 1227 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1228 | add v25.2d,v25.2d,v17.2d 1229 | ld1 {v24.2d},[x3],#16 1230 | ext v25.16b,v25.16b,v25.16b,#8 1231 | ext v5.16b,v3.16b,v0.16b,#8 1232 | ext v6.16b,v4.16b,v3.16b,#8 1233 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1234 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1235 | ext v7.16b,v21.16b,v22.16b,#8 1236 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1237 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1238 | add v2.2d,v4.2d,v0.2d // "D + T1" 1239 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1240 | add v24.2d,v24.2d,v18.2d 1241 | ld1 {v25.2d},[x3],#16 1242 | ext v24.16b,v24.16b,v24.16b,#8 1243 | ext v5.16b,v2.16b,v3.16b,#8 1244 | ext v6.16b,v1.16b,v2.16b,#8 1245 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1246 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1247 | ext v7.16b,v22.16b,v23.16b,#8 1248 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 
1249 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1250 | add v4.2d,v1.2d,v3.2d // "D + T1" 1251 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1252 | add v25.2d,v25.2d,v19.2d 1253 | ld1 {v24.2d},[x3],#16 1254 | ext v25.16b,v25.16b,v25.16b,#8 1255 | ext v5.16b,v4.16b,v2.16b,#8 1256 | ext v6.16b,v0.16b,v4.16b,#8 1257 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1258 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1259 | ext v7.16b,v23.16b,v16.16b,#8 1260 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1261 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1262 | add v1.2d,v0.2d,v2.2d // "D + T1" 1263 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1264 | add v24.2d,v24.2d,v20.2d 1265 | ld1 {v25.2d},[x3],#16 1266 | ext v24.16b,v24.16b,v24.16b,#8 1267 | ext v5.16b,v1.16b,v4.16b,#8 1268 | ext v6.16b,v3.16b,v1.16b,#8 1269 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1270 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1271 | ext v7.16b,v16.16b,v17.16b,#8 1272 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1273 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1274 | add v0.2d,v3.2d,v4.2d // "D + T1" 1275 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1276 | add v25.2d,v25.2d,v21.2d 1277 | ld1 {v24.2d},[x3],#16 1278 | ext v25.16b,v25.16b,v25.16b,#8 1279 | ext v5.16b,v0.16b,v1.16b,#8 1280 | ext v6.16b,v2.16b,v0.16b,#8 1281 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1282 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1283 | ext v7.16b,v17.16b,v18.16b,#8 1284 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1285 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1286 | add v3.2d,v2.2d,v1.2d // "D + T1" 1287 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1288 | add v24.2d,v24.2d,v22.2d 1289 | ld1 {v25.2d},[x3],#16 1290 | ext v24.16b,v24.16b,v24.16b,#8 1291 | ext v5.16b,v3.16b,v0.16b,#8 1292 | ext v6.16b,v4.16b,v3.16b,#8 1293 | add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1294 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1295 | ext v7.16b,v18.16b,v19.16b,#8 1296 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1297 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1298 | add v2.2d,v4.2d,v0.2d // "D + T1" 1299 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1300 | add v25.2d,v25.2d,v23.2d 1301 | ld1 {v24.2d},[x3],#16 1302 | ext v25.16b,v25.16b,v25.16b,#8 1303 | ext v5.16b,v2.16b,v3.16b,#8 1304 | ext v6.16b,v1.16b,v2.16b,#8 1305 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1306 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1307 | ext v7.16b,v19.16b,v20.16b,#8 1308 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1309 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1310 | add v4.2d,v1.2d,v3.2d // "D + T1" 1311 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1312 | add v24.2d,v24.2d,v16.2d 1313 | ld1 {v25.2d},[x3],#16 1314 | ext v24.16b,v24.16b,v24.16b,#8 1315 | ext v5.16b,v4.16b,v2.16b,#8 1316 | ext v6.16b,v0.16b,v4.16b,#8 1317 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1318 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1319 | ext v7.16b,v20.16b,v21.16b,#8 1320 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1321 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1322 | add v1.2d,v0.2d,v2.2d // "D + T1" 1323 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1324 | add v25.2d,v25.2d,v17.2d 1325 | ld1 {v24.2d},[x3],#16 1326 | ext v25.16b,v25.16b,v25.16b,#8 1327 | ext v5.16b,v1.16b,v4.16b,#8 1328 | ext v6.16b,v3.16b,v1.16b,#8 1329 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1330 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 
1331 | ext v7.16b,v21.16b,v22.16b,#8 1332 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1333 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1334 | add v0.2d,v3.2d,v4.2d // "D + T1" 1335 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1336 | add v24.2d,v24.2d,v18.2d 1337 | ld1 {v25.2d},[x3],#16 1338 | ext v24.16b,v24.16b,v24.16b,#8 1339 | ext v5.16b,v0.16b,v1.16b,#8 1340 | ext v6.16b,v2.16b,v0.16b,#8 1341 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1342 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1343 | ext v7.16b,v22.16b,v23.16b,#8 1344 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1345 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1346 | add v3.2d,v2.2d,v1.2d // "D + T1" 1347 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1348 | add v25.2d,v25.2d,v19.2d 1349 | ld1 {v24.2d},[x3],#16 1350 | ext v25.16b,v25.16b,v25.16b,#8 1351 | ext v5.16b,v3.16b,v0.16b,#8 1352 | ext v6.16b,v4.16b,v3.16b,#8 1353 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1354 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1355 | ext v7.16b,v23.16b,v16.16b,#8 1356 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1357 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1358 | add v2.2d,v4.2d,v0.2d // "D + T1" 1359 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1360 | add v24.2d,v24.2d,v20.2d 1361 | ld1 {v25.2d},[x3],#16 1362 | ext v24.16b,v24.16b,v24.16b,#8 1363 | ext v5.16b,v2.16b,v3.16b,#8 1364 | ext v6.16b,v1.16b,v2.16b,#8 1365 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1366 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1367 | ext v7.16b,v16.16b,v17.16b,#8 1368 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1369 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1370 | add v4.2d,v1.2d,v3.2d // "D + T1" 1371 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1372 | add v25.2d,v25.2d,v21.2d 1373 | ld1 {v24.2d},[x3],#16 1374 | ext v25.16b,v25.16b,v25.16b,#8 1375 | ext v5.16b,v4.16b,v2.16b,#8 1376 | ext v6.16b,v0.16b,v4.16b,#8 1377 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1378 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1379 | ext v7.16b,v17.16b,v18.16b,#8 1380 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1381 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1382 | add v1.2d,v0.2d,v2.2d // "D + T1" 1383 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1384 | add v24.2d,v24.2d,v22.2d 1385 | ld1 {v25.2d},[x3],#16 1386 | ext v24.16b,v24.16b,v24.16b,#8 1387 | ext v5.16b,v1.16b,v4.16b,#8 1388 | ext v6.16b,v3.16b,v1.16b,#8 1389 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1390 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1391 | ext v7.16b,v18.16b,v19.16b,#8 1392 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1393 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1394 | add v0.2d,v3.2d,v4.2d // "D + T1" 1395 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1396 | add v25.2d,v25.2d,v23.2d 1397 | ld1 {v24.2d},[x3],#16 1398 | ext v25.16b,v25.16b,v25.16b,#8 1399 | ext v5.16b,v0.16b,v1.16b,#8 1400 | ext v6.16b,v2.16b,v0.16b,#8 1401 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1402 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1403 | ext v7.16b,v19.16b,v20.16b,#8 1404 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1405 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1406 | add v3.2d,v2.2d,v1.2d // "D + T1" 1407 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1408 | add v24.2d,v24.2d,v16.2d 1409 | ld1 {v25.2d},[x3],#16 1410 | ext v24.16b,v24.16b,v24.16b,#8 1411 | ext v5.16b,v3.16b,v0.16b,#8 1412 | ext v6.16b,v4.16b,v3.16b,#8 1413 | add 
v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1414 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1415 | ext v7.16b,v20.16b,v21.16b,#8 1416 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1417 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1418 | add v2.2d,v4.2d,v0.2d // "D + T1" 1419 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1420 | add v25.2d,v25.2d,v17.2d 1421 | ld1 {v24.2d},[x3],#16 1422 | ext v25.16b,v25.16b,v25.16b,#8 1423 | ext v5.16b,v2.16b,v3.16b,#8 1424 | ext v6.16b,v1.16b,v2.16b,#8 1425 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1426 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1427 | ext v7.16b,v21.16b,v22.16b,#8 1428 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1429 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1430 | add v4.2d,v1.2d,v3.2d // "D + T1" 1431 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1432 | add v24.2d,v24.2d,v18.2d 1433 | ld1 {v25.2d},[x3],#16 1434 | ext v24.16b,v24.16b,v24.16b,#8 1435 | ext v5.16b,v4.16b,v2.16b,#8 1436 | ext v6.16b,v0.16b,v4.16b,#8 1437 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1438 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1439 | ext v7.16b,v22.16b,v23.16b,#8 1440 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1441 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1442 | add v1.2d,v0.2d,v2.2d // "D + T1" 1443 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1444 | add v25.2d,v25.2d,v19.2d 1445 | ld1 {v24.2d},[x3],#16 1446 | ext v25.16b,v25.16b,v25.16b,#8 1447 | ext v5.16b,v1.16b,v4.16b,#8 1448 | ext v6.16b,v3.16b,v1.16b,#8 1449 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1450 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1451 | ext v7.16b,v23.16b,v16.16b,#8 1452 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1453 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1454 | add v0.2d,v3.2d,v4.2d // "D + T1" 1455 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1456 | add v24.2d,v24.2d,v20.2d 1457 | ld1 {v25.2d},[x3],#16 1458 | ext v24.16b,v24.16b,v24.16b,#8 1459 | ext v5.16b,v0.16b,v1.16b,#8 1460 | ext v6.16b,v2.16b,v0.16b,#8 1461 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1462 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1463 | ext v7.16b,v16.16b,v17.16b,#8 1464 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1465 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1466 | add v3.2d,v2.2d,v1.2d // "D + T1" 1467 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1468 | add v25.2d,v25.2d,v21.2d 1469 | ld1 {v24.2d},[x3],#16 1470 | ext v25.16b,v25.16b,v25.16b,#8 1471 | ext v5.16b,v3.16b,v0.16b,#8 1472 | ext v6.16b,v4.16b,v3.16b,#8 1473 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1474 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1475 | ext v7.16b,v17.16b,v18.16b,#8 1476 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1477 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1478 | add v2.2d,v4.2d,v0.2d // "D + T1" 1479 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1480 | add v24.2d,v24.2d,v22.2d 1481 | ld1 {v25.2d},[x3],#16 1482 | ext v24.16b,v24.16b,v24.16b,#8 1483 | ext v5.16b,v2.16b,v3.16b,#8 1484 | ext v6.16b,v1.16b,v2.16b,#8 1485 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1486 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1487 | ext v7.16b,v18.16b,v19.16b,#8 1488 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1489 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1490 | add v4.2d,v1.2d,v3.2d // "D + T1" 1491 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1492 | add v25.2d,v25.2d,v23.2d 1493 | ld1 {v24.2d},[x3],#16 1494 | ext 
v25.16b,v25.16b,v25.16b,#8 1495 | ext v5.16b,v4.16b,v2.16b,#8 1496 | ext v6.16b,v0.16b,v4.16b,#8 1497 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1498 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1499 | ext v7.16b,v19.16b,v20.16b,#8 1500 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1501 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1502 | add v1.2d,v0.2d,v2.2d // "D + T1" 1503 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1504 | ld1 {v25.2d},[x3],#16 1505 | add v24.2d,v24.2d,v16.2d 1506 | ld1 {v16.16b},[x1],#16 // load next input 1507 | ext v24.16b,v24.16b,v24.16b,#8 1508 | ext v5.16b,v1.16b,v4.16b,#8 1509 | ext v6.16b,v3.16b,v1.16b,#8 1510 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1511 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1512 | rev64 v16.16b,v16.16b 1513 | add v0.2d,v3.2d,v4.2d // "D + T1" 1514 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1515 | ld1 {v24.2d},[x3],#16 1516 | add v25.2d,v25.2d,v17.2d 1517 | ld1 {v17.16b},[x1],#16 // load next input 1518 | ext v25.16b,v25.16b,v25.16b,#8 1519 | ext v5.16b,v0.16b,v1.16b,#8 1520 | ext v6.16b,v2.16b,v0.16b,#8 1521 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1522 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1523 | rev64 v17.16b,v17.16b 1524 | add v3.2d,v2.2d,v1.2d // "D + T1" 1525 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1526 | ld1 {v25.2d},[x3],#16 1527 | add v24.2d,v24.2d,v18.2d 1528 | ld1 {v18.16b},[x1],#16 // load next input 1529 | ext v24.16b,v24.16b,v24.16b,#8 1530 | ext v5.16b,v3.16b,v0.16b,#8 1531 | ext v6.16b,v4.16b,v3.16b,#8 1532 | add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1533 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1534 | rev64 v18.16b,v18.16b 1535 | add v2.2d,v4.2d,v0.2d // "D + T1" 1536 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1537 | ld1 {v24.2d},[x3],#16 1538 | add v25.2d,v25.2d,v19.2d 1539 | ld1 {v19.16b},[x1],#16 // load next input 1540 | ext v25.16b,v25.16b,v25.16b,#8 1541 | ext v5.16b,v2.16b,v3.16b,#8 1542 | ext v6.16b,v1.16b,v2.16b,#8 1543 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1544 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1545 | rev64 v19.16b,v19.16b 1546 | add v4.2d,v1.2d,v3.2d // "D + T1" 1547 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1548 | ld1 {v25.2d},[x3],#16 1549 | add v24.2d,v24.2d,v20.2d 1550 | ld1 {v20.16b},[x1],#16 // load next input 1551 | ext v24.16b,v24.16b,v24.16b,#8 1552 | ext v5.16b,v4.16b,v2.16b,#8 1553 | ext v6.16b,v0.16b,v4.16b,#8 1554 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1555 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1556 | rev64 v20.16b,v20.16b 1557 | add v1.2d,v0.2d,v2.2d // "D + T1" 1558 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1559 | ld1 {v24.2d},[x3],#16 1560 | add v25.2d,v25.2d,v21.2d 1561 | ld1 {v21.16b},[x1],#16 // load next input 1562 | ext v25.16b,v25.16b,v25.16b,#8 1563 | ext v5.16b,v1.16b,v4.16b,#8 1564 | ext v6.16b,v3.16b,v1.16b,#8 1565 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1566 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1567 | rev64 v21.16b,v21.16b 1568 | add v0.2d,v3.2d,v4.2d // "D + T1" 1569 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1570 | ld1 {v25.2d},[x3],#16 1571 | add v24.2d,v24.2d,v22.2d 1572 | ld1 {v22.16b},[x1],#16 // load next input 1573 | ext v24.16b,v24.16b,v24.16b,#8 1574 | ext v5.16b,v0.16b,v1.16b,#8 1575 | ext v6.16b,v2.16b,v0.16b,#8 1576 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1577 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1578 | rev64 v22.16b,v22.16b 1579 | add v3.2d,v2.2d,v1.2d // "D + T1" 1580 
| .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1581 | sub x3,x3,#80*8 // rewind 1582 | add v25.2d,v25.2d,v23.2d 1583 | ld1 {v23.16b},[x1],#16 // load next input 1584 | ext v25.16b,v25.16b,v25.16b,#8 1585 | ext v5.16b,v3.16b,v0.16b,#8 1586 | ext v6.16b,v4.16b,v3.16b,#8 1587 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1588 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1589 | rev64 v23.16b,v23.16b 1590 | add v2.2d,v4.2d,v0.2d // "D + T1" 1591 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1592 | add v0.2d,v0.2d,v26.2d // accumulate 1593 | add v1.2d,v1.2d,v27.2d 1594 | add v2.2d,v2.2d,v28.2d 1595 | add v3.2d,v3.2d,v29.2d 1596 | 1597 | cbnz x2,.Loop_hw 1598 | 1599 | st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context 1600 | 1601 | ldr x29,[sp],#16 1602 | ret 1603 | .size sha512_block_armv8,.-sha512_block_armv8 1604 | #endif 1605 | #if !defined(__KERNEL__) && !defined(_WIN64) 1606 | .comm OPENSSL_armcap_P_local,4,4 1607 | #endif 1608 | -------------------------------------------------------------------------------- /src/openssl/openssl_cpu_globals.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #if defined(X86_64) 5 | // In OpenSSL the OPENSSL_ia32cap_P array holds the return values (in 6 | // RAX,RBX,RCX,RDX registers) of executing the Intel CPUID leaf 7 instruction. 7 | // The assembly code chooses the relevant SHA implementation according to this 8 | // array. 9 | unsigned int OPENSSL_ia32cap_P_local[4] = {0}; 10 | #endif 11 | 12 | #if defined(AARCH64) 13 | unsigned int OPENSSL_armcap_P_local = 0; 14 | #endif 15 | -------------------------------------------------------------------------------- /src/sha256.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <assert.h> 5 | 6 | #include "sha256_defs.h" 7 | 8 | #define LAST_BLOCK_BYTE_LEN (2 * SHA256_BLOCK_BYTE_LEN) 9 | 10 | typedef struct sha256_hash_s { 11 | ALIGN(64) sha256_state_t state; 12 | uint64_t len; 13 | 14 | ALIGN(64) uint8_t data[LAST_BLOCK_BYTE_LEN]; 15 | 16 | sha256_word_t rem; 17 | sha_impl_t impl; 18 | } sha256_ctx_t; 19 | 20 | _INLINE_ void sha256_init(OUT sha256_ctx_t *ctx) 21 | { 22 | ctx->state.w[0] = UINT32_C(0x6a09e667); 23 | ctx->state.w[1] = UINT32_C(0xbb67ae85); 24 | ctx->state.w[2] = UINT32_C(0x3c6ef372); 25 | ctx->state.w[3] = UINT32_C(0xa54ff53a); 26 | ctx->state.w[4] = UINT32_C(0x510e527f); 27 | ctx->state.w[5] = UINT32_C(0x9b05688c); 28 | ctx->state.w[6] = UINT32_C(0x1f83d9ab); 29 | ctx->state.w[7] = UINT32_C(0x5be0cd19); 30 | } 31 | 32 | _INLINE_ void sha256_compress(IN OUT sha256_ctx_t *ctx, 33 | IN const uint8_t *data, 34 | IN const size_t blocks_num) 35 | { 36 | assert((ctx != NULL) && (data != NULL)); 37 | 38 | // OpenSSL code can crash without this check 39 | if(blocks_num == 0) { 40 | return; 41 | } 42 | 43 | switch(ctx->impl) { 44 | #if defined(X86_64) 45 | case AVX_IMPL: 46 | sha256_compress_x86_64_avx(&ctx->state, data, blocks_num); 47 | break; 48 | 49 | case OPENSSL_AVX_IMPL: 50 | RUN_OPENSSL_CODE_WITH_AVX( 51 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 52 | break; 53 | #endif 54 | 55 | #if defined(AVX2_SUPPORT) 56 | case AVX2_IMPL: 57 | sha256_compress_x86_64_avx2(&ctx->state, data, blocks_num); 58 | break; 59 | 60 | case OPENSSL_AVX2_IMPL: 61 | RUN_OPENSSL_CODE_WITH_AVX2( 62 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 63 | break; 64 | #endif 65 | 66 | #if defined(AVX512_SUPPORT) 67 | case AVX512_IMPL: 68 | sha256_compress_x86_64_avx512(&ctx->state, data, blocks_num); 69 | break; 70 | #endif 71 | 72 | #if defined(X86_64_SHA_SUPPORT) 73 | case SHA_EXT_IMPL: 74 | sha256_compress_x86_64_sha_ext(&ctx->state, data, blocks_num); 75 | break; 76 | 77 | case OPENSSL_SHA_EXT_IMPL: 78 | RUN_OPENSSL_CODE_WITH_SHA_EXT( 79 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 80 | break; 81 | #endif 82 | 83 | #if defined(NEON_SUPPORT) 84 | case OPENSSL_NEON_IMPL: 85 | RUN_OPENSSL_CODE_WITH_NEON( 86 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 87 | break; 88 | #endif 89 | 90 | #if defined(AARCH64_SHA_SUPPORT) 91 | case SHA_EXT_IMPL: 92 | sha256_compress_aarch64_sha_ext(&ctx->state, data, blocks_num); 93 | break; 94 | 95 | case OPENSSL_SHA_EXT_IMPL: 96 | RUN_OPENSSL_CODE_WITH_SHA256_EXT( 97 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 98 | break; 99 | #endif 100 | default: sha256_compress_generic(&ctx->state, data, blocks_num); break; 101 | } 102 | } 103 | 104 | _INLINE_ void sha256_update(IN OUT sha256_ctx_t *ctx, 105 | IN const uint8_t *data, 106 | IN size_t byte_len) 107 | { 108 | // On exiting this function ctx->rem < SHA256_BLOCK_BYTE_LEN 109 | 110 | assert((ctx != NULL) && (data != NULL)); 111 | 112 | if(byte_len == 0) { 113 | return; 114 | } 115 | 116 | // Accumulate the overall size 117 | ctx->len += byte_len; 118 | 119 | // Less than a block.
Store the data in a temporary buffer 120 | if((ctx->rem != 0) && ((ctx->rem + byte_len) < SHA256_BLOCK_BYTE_LEN)) { 121 | my_memcpy(&ctx->data[ctx->rem], data, byte_len); 122 | ctx->rem += byte_len; 123 | return; 124 | } 125 | 126 | // Complete and compress a previously stored block 127 | if(ctx->rem != 0) { 128 | const size_t clen = SHA256_BLOCK_BYTE_LEN - ctx->rem; 129 | my_memcpy(&ctx->data[ctx->rem], data, clen); 130 | sha256_compress(ctx, ctx->data, 1); 131 | 132 | data += clen; 133 | byte_len -= clen; 134 | 135 | ctx->rem = 0; 136 | secure_clean(ctx->data, SHA256_BLOCK_BYTE_LEN); 137 | } 138 | 139 | // Compress full blocks 140 | if(byte_len >= SHA256_BLOCK_BYTE_LEN) { 141 | const size_t blocks_num = (byte_len >> 6); 142 | const size_t full_blocks_byte_len = (blocks_num << 6); 143 | 144 | sha256_compress(ctx, data, blocks_num); 145 | 146 | data += full_blocks_byte_len; 147 | byte_len -= full_blocks_byte_len; 148 | } 149 | 150 | // Store the remainder 151 | my_memcpy(ctx->data, data, byte_len); 152 | ctx->rem = byte_len; 153 | } 154 | 155 | _INLINE_ void sha256_final(OUT uint8_t *dgst, IN OUT sha256_ctx_t *ctx) 156 | { 157 | assert((ctx != NULL) && (dgst != NULL)); 158 | assert(ctx->rem < SHA256_BLOCK_BYTE_LEN); 159 | 160 | // Byteswap the length in bits of the hashed message 161 | const uint64_t bswap_len = bswap_64(8 * ctx->len); 162 | const size_t last_block_num = (ctx->rem < 56) ? 1 : 2; 163 | const size_t last_qw_pos = 164 | (last_block_num * SHA256_BLOCK_BYTE_LEN) - sizeof(bswap_len); 165 | 166 | ctx->data[ctx->rem++] = SHA256_MSG_END_SYMBOL; 167 | 168 | // Reset the rest of the data buffer 169 | my_memset(&ctx->data[ctx->rem], 0, sizeof(ctx->data) - ctx->rem); 170 | my_memcpy(&ctx->data[last_qw_pos], (const uint8_t *)&bswap_len, 171 | sizeof(bswap_len)); 172 | 173 | // Compress the final block 174 | sha256_compress(ctx, ctx->data, last_block_num); 175 | 176 | // This implementation assumes running on a little-endian machine 177 | ctx->state.w[0] = bswap_32(ctx->state.w[0]); 178 | ctx->state.w[1] = bswap_32(ctx->state.w[1]); 179 | ctx->state.w[2] = bswap_32(ctx->state.w[2]); 180 | ctx->state.w[3] = bswap_32(ctx->state.w[3]); 181 | ctx->state.w[4] = bswap_32(ctx->state.w[4]); 182 | ctx->state.w[5] = bswap_32(ctx->state.w[5]); 183 | ctx->state.w[6] = bswap_32(ctx->state.w[6]); 184 | ctx->state.w[7] = bswap_32(ctx->state.w[7]); 185 | my_memcpy(dgst, &ctx->state, SHA256_HASH_BYTE_LEN); 186 | 187 | secure_clean(ctx, sizeof(*ctx)); 188 | } 189 | 190 | void sha256(OUT uint8_t *dgst, 191 | IN const uint8_t * data, 192 | IN const size_t byte_len, 193 | IN const sha_impl_t impl) 194 | { 195 | assert((data != NULL) && (dgst != NULL)); 196 | 197 | sha256_ctx_t ctx = {0}; 198 | ctx.impl = impl; 199 | sha256_init(&ctx); 200 | sha256_update(&ctx, data, byte_len); 201 | sha256_final(dgst, &ctx); 202 | } 203 | -------------------------------------------------------------------------------- /src/sha256_compress_aarch64_sha_ext.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // An implementation of the compress function of SHA256 using the AARCH64 SHA 5 | // extension. It was translated from assembly (OpenSSL) to C by 6 | // 7 | // Nir Drucker and Shay Gueron 8 | // AWS Cryptographic Algorithms Group.
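//
// A minimal caller's sketch for the one-shot sha256() API above (an
// illustration, not part of the library: it assumes include/sha.h declares
// sha256(), sha_impl_t and SHA256_HASH_BYTE_LEN, and the helper name
// sha256_example is hypothetical; any impl value not handled by the switch
// in sha256_compress falls through to the generic compressor).
//
#include <stdint.h>
#include <stdio.h>

#include "sha.h"

int sha256_example(void)
{
  const uint8_t msg[3] = {'a', 'b', 'c'};
  uint8_t       dgst[SHA256_HASH_BYTE_LEN] = {0};

  // (sha_impl_t)0 is used as an "unspecified" value that reaches the default
  // (generic) case; real callers would pick an enum value from sha.h after
  // checking CPU features.
  sha256(dgst, msg, sizeof(msg), (sha_impl_t)0);

  // Print the digest as lowercase hex
  for(size_t i = 0; i < sizeof(dgst); i++) {
    printf("%02x", dgst[i]);
  }
  printf("\n");
  return 0;
}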
9 | // (ndrucker@amazon.com, gueron@amazon.com) 10 | 11 | #include "neon_defs.h" 12 | #include "sha256_defs.h" 13 | 14 | _INLINE_ void load_data(uint32x4_t ms[4], const uint8_t *data) 15 | { 16 | uint8x16x4_t d = vld1q_u8_x4(data); 17 | ms[0] = vreinterpretq_u32_u8(vrev32q_u8(d.val[0])); 18 | ms[1] = vreinterpretq_u32_u8(vrev32q_u8(d.val[1])); 19 | ms[2] = vreinterpretq_u32_u8(vrev32q_u8(d.val[2])); 20 | ms[3] = vreinterpretq_u32_u8(vrev32q_u8(d.val[3])); 21 | } 22 | 23 | _INLINE_ void rotate_ms(uint32x4_t ms[4]) 24 | { 25 | uint32x4_t tmp = ms[0]; 26 | ms[0] = ms[1]; 27 | ms[1] = ms[2]; 28 | ms[2] = ms[3]; 29 | ms[3] = tmp; 30 | } 31 | 32 | void sha256_compress_aarch64_sha_ext(IN OUT sha256_state_t *state, 33 | IN const uint8_t *data, 34 | IN size_t blocks_num) 35 | { 36 | uint32x4_t ms[4]; 37 | uint32x4_t tmp[3]; 38 | uint32x4x2_t st; 39 | uint32x4x2_t st_save; 40 | 41 | st = vld1q_u32_x2(state->w); 42 | 43 | for(size_t j = 0; j < blocks_num; j++) { 44 | // Save current state 45 | st_save = st; 46 | 47 | load_data(ms, data); 48 | 49 | tmp[0] = vaddq_u32(ms[0], vld1q_u32(&K256[0])); 50 | 51 | // Rounds 0-47 52 | PRAGMA_LOOP_UNROLL_12 53 | 54 | for(size_t i = 0; i < 12; i++) { 55 | ms[0] = vsha256su0q_u32(ms[0], ms[1]); 56 | tmp[2] = st.val[0]; 57 | tmp[1] = vaddq_u32(ms[1], vld1q_u32(&K256[4 * (i + 1)])); 58 | st.val[0] = vsha256hq_u32(st.val[0], st.val[1], tmp[0]); 59 | st.val[1] = vsha256h2q_u32(st.val[1], tmp[2], tmp[0]); 60 | ms[0] = vsha256su1q_u32(ms[0], ms[2], ms[3]); 61 | 62 | rotate_ms(ms); 63 | 64 | uint32x4_t t = tmp[0]; 65 | tmp[0] = tmp[1]; 66 | tmp[1] = t; 67 | } 68 | 69 | // Rounds 48-59 70 | PRAGMA_LOOP_UNROLL_4 71 | 72 | for(size_t i = 0; i < 3; i++) { 73 | tmp[2] = st.val[0]; 74 | tmp[LSB1(i + 1)] = 75 | vaddq_u32(ms[LSB2(i + 1)], vld1q_u32(&K256[4 * (i + 13)])); 76 | st.val[0] = vsha256hq_u32(st.val[0], st.val[1], tmp[LSB1(i)]); 77 | st.val[1] = vsha256h2q_u32(st.val[1], tmp[2], tmp[LSB1(i)]); 78 | } 79 | 80 | // Rounds 60-63 81 | tmp[2] = st.val[0]; 82 | st.val[0] = vsha256hq_u32(st.val[0], st.val[1], tmp[1]); 83 | st.val[1] = vsha256h2q_u32(st.val[1], tmp[2], tmp[1]); 84 | 85 | // Accumulate state 86 | st.val[0] = vaddq_u32(st.val[0], st_save.val[0]); 87 | st.val[1] = vaddq_u32(st.val[1], st_save.val[1]); 88 | 89 | data += SHA256_BLOCK_BYTE_LEN; 90 | } 91 | 92 | // Store state 93 | vst1q_u32_x2(state->w, st); 94 | } 95 | -------------------------------------------------------------------------------- /src/sha256_compress_generic.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
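//
// A note on the intrinsics above (per the Armv8-A Cryptographic Extension
// semantics): st.val[0] and st.val[1] hold the {A,B,C,D} and {E,F,G,H}
// halves of the working state, each vsha256hq_u32/vsha256h2q_u32 pair
// advances the compression by four rounds using one vector of schedule words
// pre-summed with the round constants (tmp[...]), and the
// vsha256su0q_u32/vsha256su1q_u32 pair extends the message schedule four
// words at a time.
//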
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha256_defs.h" 5 | 6 | // In the generic implementation we use memcpy to avoid align issues 7 | _INLINE_ sha256_word_t load_be32(IN const void *ptr) 8 | { 9 | sha256_word_t ret; 10 | my_memcpy(&ret, ptr, sizeof(ret)); 11 | return bswap_32(ret); 12 | } 13 | 14 | _INLINE_ void load_data_and_rounds_00_15(OUT sha256_msg_schedule_t *ms, 15 | IN OUT sha256_state_t *cur_state, 16 | IN const uint8_t *data) 17 | { 18 | PRAGMA_LOOP_UNROLL_4 19 | 20 | for(size_t i = 0; i < SHA256_BLOCK_WORDS_NUM; i++) { 21 | ms->w[i] = load_be32(&data[sizeof(sha256_word_t) * i]); 22 | sha_round(cur_state, ms->w[i], K256[i]); 23 | } 24 | } 25 | 26 | _INLINE_ void rounds_16_63(IN OUT sha256_state_t *cur_state, 27 | IN OUT sha256_msg_schedule_t *ms) 28 | { 29 | PRAGMA_LOOP_UNROLL_48 30 | 31 | for(size_t i = SHA256_BLOCK_WORDS_NUM; i < SHA256_ROUNDS_NUM; i++) { 32 | const sha256_word_t x1 = ms->w[LSB4(i + 1)]; 33 | const sha256_word_t x9 = ms->w[LSB4(i + 9)]; 34 | const sha256_word_t x14 = ms->w[LSB4(i + 14)]; 35 | 36 | ms->w[LSB4(i)] += sigma0(x1) + sigma1(x14) + x9; 37 | sha_round(cur_state, ms->w[LSB4(i)], K256[i]); 38 | } 39 | } 40 | 41 | void sha256_compress_generic(IN OUT sha256_state_t *state, 42 | IN const uint8_t *data, 43 | IN size_t blocks_num) 44 | { 45 | sha256_state_t cur_state; 46 | sha256_msg_schedule_t ms; 47 | 48 | while(blocks_num--) { 49 | my_memcpy(&cur_state, state, sizeof(cur_state)); 50 | 51 | load_data_and_rounds_00_15(&ms, &cur_state, data); 52 | data += SHA256_BLOCK_BYTE_LEN; 53 | 54 | rounds_16_63(&cur_state, &ms); 55 | accumulate_state(state, &cur_state); 56 | } 57 | 58 | secure_clean(&cur_state, sizeof(cur_state)); 59 | secure_clean(&ms, sizeof(ms)); 60 | } 61 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using avx 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 
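//
// Why the LSB4(...) indexing in rounds_16_63 above works (assuming LSB4
// extracts the low four bits): the schedule keeps only a 16-word sliding
// window, so at round i the slot w[(i + k) & 15] holds W[i + k - 16]. The
// update is therefore the FIPS 180-4 recurrence
// W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
// with x1 = W[i-15], x9 = W[i-7] and x14 = W[i-2].
//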
12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx_defs.h" 15 | #include "sha256_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD32, ALIGNR8, SRL32, 18 | // SLL32, SRL64 that are defined in avx_defs.h 19 | #include "sha256_compress_x86_64_avx_helper.c" 20 | 21 | #define MS_VEC_NUM (SHA256_BLOCK_BYTE_LEN / sizeof(vec_t)) 22 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha256_word_t)) 23 | 24 | _INLINE_ void load_data(OUT vec_t x[MS_VEC_NUM], 25 | IN OUT sha256_msg_schedule_t *ms, 26 | IN const uint8_t *data) 27 | { 28 | // 32 bits (4 bytes) swap masks 29 | const vec_t shuf_mask = 30 | _mm_setr_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); 31 | 32 | PRAGMA_LOOP_UNROLL_4 33 | 34 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 35 | x[i] = LOAD(&data[sizeof(vec_t) * i]); 36 | x[i] = SHUF8(x[i], shuf_mask); 37 | STORE(&ms->w[WORDS_IN_VEC * i], ADD32(x[i], LOAD(&K256[WORDS_IN_VEC * i]))); 38 | } 39 | } 40 | 41 | _INLINE_ void rounds_0_47(sha256_state_t * cur_state, 42 | vec_t x[MS_VEC_NUM], 43 | sha256_msg_schedule_t *ms) 44 | { 45 | const vec_t lo_mask = _mm_setr_epi32(0x03020100, 0x0b0a0908, -1, -1); 46 | const vec_t hi_mask = _mm_setr_epi32(-1, -1, 0x03020100, 0x0b0a0908); 47 | 48 | // The first SHA256_BLOCK_WORDS_NUM entries of K256 were loaded in 49 | // load_data(...). 50 | size_t k256_idx = SHA256_BLOCK_WORDS_NUM; 51 | 52 | // Rounds 0-47 (0-15, 16-31, 32-47) 53 | for(size_t i = 0; i < 3; i++) { 54 | 55 | PRAGMA_LOOP_UNROLL_4 56 | 57 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 58 | const size_t pos = WORDS_IN_VEC * j; 59 | 60 | const vec_t y = sha256_update_x_avx(x, &K256[k256_idx], lo_mask, hi_mask); 61 | 62 | sha_round(cur_state, ms->w[pos + 0], 0); 63 | sha_round(cur_state, ms->w[pos + 1], 0); 64 | sha_round(cur_state, ms->w[pos + 2], 0); 65 | sha_round(cur_state, ms->w[pos + 3], 0); 66 | 67 | STORE(&ms->w[pos], y); 68 | k256_idx += WORDS_IN_VEC; 69 | } 70 | } 71 | } 72 | 73 | _INLINE_ void rounds_48_63(sha256_state_t * cur_state, 74 | const sha256_msg_schedule_t *ms) 75 | { 76 | PRAGMA_LOOP_UNROLL_16 77 | 78 | for(size_t i = SHA256_FINAL_ROUND_START_IDX; i < SHA256_ROUNDS_NUM; i++) { 79 | sha_round(cur_state, ms->w[LSB4(i)], 0); 80 | } 81 | } 82 | 83 | void sha256_compress_x86_64_avx(sha256_state_t *state, 84 | const uint8_t * data, 85 | size_t blocks_num) 86 | { 87 | sha256_state_t cur_state; 88 | sha256_msg_schedule_t ms; 89 | vec_t x[MS_VEC_NUM]; 90 | 91 | while(blocks_num--) { 92 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 93 | 94 | load_data(x, &ms, data); 95 | data += SHA256_BLOCK_BYTE_LEN; 96 | 97 | rounds_0_47(&cur_state, x, &ms); 98 | rounds_48_63(&cur_state, &ms); 99 | accumulate_state(state, &cur_state); 100 | } 101 | 102 | secure_clean(&cur_state, sizeof(cur_state)); 103 | secure_clean(&ms, sizeof(ms)); 104 | } 105 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx2.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using avx2 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 
8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx2_defs.h" 15 | #include "sha256_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD32, ALIGNR8, SRL32, 18 | // SLL32, SRL64 that are defined in avx2_defs.h 19 | #include "sha256_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 2 blocks in parallel 22 | #define MS_VEC_NUM ((2 * SHA256_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha256_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha256_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha256_msg_schedule_t *ms, 28 | sha256_word_t t2[SHA256_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 32 bits (4 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm256_setr_epi32(DUP2(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)); 34 | 35 | PRAGMA_LOOP_UNROLL_4 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 2) * i; 39 | const size_t pos1 = pos0 + SHA256_BLOCK_BYTE_LEN; 40 | 41 | LOADU2(&data[pos1], &data[pos0], x[i]); 42 | x[i] = SHUF8(x[i], shuf_mask); 43 | vec_t y = ADD32(x[i], LOAD(&K256x2[8 * i])); 44 | STOREU2(&t2[4 * i], &ms->w[4 * i], y); 45 | } 46 | } 47 | 48 | _INLINE_ void rounds_0_47(sha256_state_t * cur_state, 49 | vec_t x[MS_VEC_NUM], 50 | sha256_msg_schedule_t *ms, 51 | sha256_word_t t2[SHA256_ROUNDS_NUM]) 52 | { 53 | const vec_t lo_mask = _mm256_setr_epi32(DUP2(0x03020100, 0x0b0a0908, -1, -1)); 54 | const vec_t hi_mask = _mm256_setr_epi32(DUP2(-1, -1, 0x03020100, 0x0b0a0908)); 55 | 56 | // The first SHA256_BLOCK_WORDS_NUM entries of K256 were loaded in 57 | // load_data(...). 
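// (A reading note: k256_idx below starts at 2 * SHA256_BLOCK_WORDS_NUM
// because K256x2 stores every four-constant group twice, once per 128-bit
// lane. Each STOREU2 in the loop writes one lane of the updated schedule,
// already summed with the constants, to ms->w for the block processed
// immediately, and the other lane to t2 for process_second_block to replay.)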
58 | size_t k256_idx = 2 * SHA256_BLOCK_WORDS_NUM; 59 | 60 | // Rounds 0-47 (0-15, 16-31, 32-47) 61 | for(size_t i = 1; i < 4; i++) { 62 | 63 | PRAGMA_LOOP_UNROLL_4 64 | 65 | for(size_t j = 0; j < WORDS_IN_128_BIT_VEC; j++) { 66 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 67 | 68 | const vec_t y = sha256_update_x_avx(x, &K256x2[k256_idx], lo_mask, hi_mask); 69 | 70 | sha_round(cur_state, ms->w[pos + 0], 0); 71 | sha_round(cur_state, ms->w[pos + 1], 0); 72 | sha_round(cur_state, ms->w[pos + 2], 0); 73 | sha_round(cur_state, ms->w[pos + 3], 0); 74 | STOREU2(&t2[(16 * i) + pos], &ms->w[pos], y); 75 | 76 | k256_idx += WORDS_IN_VEC; 77 | } 78 | } 79 | } 80 | 81 | _INLINE_ void rounds_48_63(sha256_state_t * cur_state, 82 | const sha256_msg_schedule_t *ms) 83 | { 84 | PRAGMA_LOOP_UNROLL_16 85 | 86 | for(size_t i = SHA256_FINAL_ROUND_START_IDX; i < SHA256_ROUNDS_NUM; i++) { 87 | sha_round(cur_state, ms->w[LSB4(i)], 0); 88 | } 89 | } 90 | 91 | _INLINE_ void process_second_block(sha256_state_t * cur_state, 92 | const sha256_word_t t2[SHA256_ROUNDS_NUM]) 93 | { 94 | PRAGMA_LOOP_UNROLL_64 95 | 96 | for(size_t i = 0; i < SHA256_ROUNDS_NUM; i++) { 97 | sha_round(cur_state, t2[i], 0); 98 | } 99 | } 100 | 101 | void sha256_compress_x86_64_avx2(sha256_state_t *state, 102 | const uint8_t * data, 103 | size_t blocks_num) 104 | { 105 | ALIGN(64) sha256_msg_schedule_t ms; 106 | ALIGN(64) sha256_word_t t2[SHA256_ROUNDS_NUM]; 107 | sha256_state_t cur_state; 108 | vec_t x[MS_VEC_NUM]; 109 | 110 | if(blocks_num & 1) { 111 | sha256_compress_x86_64_avx(state, data, 1); 112 | data += SHA256_BLOCK_BYTE_LEN; 113 | blocks_num--; 114 | } 115 | 116 | // Perform two blocks in parallel 117 | // Here blocks_num is even 118 | for(size_t b = blocks_num; b != 0; b -= 2) { 119 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 120 | 121 | load_data(x, &ms, t2, data); 122 | data += 2 * SHA256_BLOCK_BYTE_LEN; 123 | 124 | // First block 125 | rounds_0_47(&cur_state, x, &ms, t2); 126 | rounds_48_63(&cur_state, &ms); 127 | accumulate_state(state, &cur_state); 128 | 129 | // Second block 130 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 131 | process_second_block(&cur_state, t2); 132 | accumulate_state(state, &cur_state); 133 | } 134 | 135 | secure_clean(&cur_state, sizeof(cur_state)); 136 | secure_clean(&ms, sizeof(ms)); 137 | secure_clean(t2, sizeof(t2)); 138 | } 139 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using avx512 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 
12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx512_defs.h" 15 | #include "sha256_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD32, ALIGNR8, SRL32, 18 | // SLL32, SRL64 that are defined in avx512_defs.h 19 | #include "sha256_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 4 blocks in parallel 22 | #define MS_VEC_NUM ((4 * SHA256_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha256_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha256_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha256_msg_schedule_t *ms, 28 | sha256_word_t x2_4[][SHA256_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 32 bits (4 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm512_set_epi32(DUP4(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203)); 34 | 35 | PRAGMA_LOOP_UNROLL_4 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 4) * i; 39 | const size_t pos1 = pos0 + SHA256_BLOCK_BYTE_LEN; 40 | const size_t pos2 = pos1 + SHA256_BLOCK_BYTE_LEN; 41 | const size_t pos3 = pos2 + SHA256_BLOCK_BYTE_LEN; 42 | 43 | LOADU4(&data[pos3], &data[pos2], &data[pos1], &data[pos0], x[i]); 44 | 45 | x[i] = SHUF8(x[i], shuf_mask); 46 | vec_t y = ADD32(x[i], LOAD(&K256x4[16 * i])); 47 | 48 | STOREU4(&x2_4[2][4 * i], &x2_4[1][4 * i], &x2_4[0][4 * i], &ms->w[4 * i], y); 49 | } 50 | } 51 | 52 | _INLINE_ void rounds_0_47(sha256_state_t * cur_state, 53 | vec_t x[MS_VEC_NUM], 54 | sha256_msg_schedule_t *ms, 55 | sha256_word_t x2_4[][SHA256_ROUNDS_NUM]) 56 | { 57 | const vec_t lo_mask = _mm512_set_epi32(DUP4(-1, -1, 0x0b0a0908, 0x03020100)); 58 | const vec_t hi_mask = _mm512_set_epi32(DUP4(0x0b0a0908, 0x03020100, -1, -1)); 59 | 60 | // The first SHA256_BLOCK_WORDS_NUM entries of K256 were loaded in 61 | // load_data(...). 
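// (Likewise, but widened to four blocks: K256x4 repeats each four-constant
// group four times, so k256_idx below starts at 4 * SHA256_BLOCK_WORDS_NUM,
// and each STOREU4 spreads the four 128-bit lanes of one schedule vector
// across ms->w, for the block processed now, and x2_4[0..2], which
// process_extra_block replays afterwards.)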
62 | size_t k256_idx = 4 * SHA256_BLOCK_WORDS_NUM; 63 | 64 | // Rounds 0-47 (0-15, 16-31, 32-47) 65 | for(size_t i = 1; i < 4; i++) { 66 | 67 | PRAGMA_LOOP_UNROLL_4 68 | 69 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 70 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 71 | 72 | const vec_t y = sha256_update_x_avx(x, &K256x4[k256_idx], lo_mask, hi_mask); 73 | 74 | sha_round(cur_state, ms->w[pos + 0], 0); 75 | sha_round(cur_state, ms->w[pos + 1], 0); 76 | sha_round(cur_state, ms->w[pos + 2], 0); 77 | sha_round(cur_state, ms->w[pos + 3], 0); 78 | const size_t idx = (k256_idx >> 2); 79 | 80 | STOREU4(&x2_4[2][idx], &x2_4[1][idx], &x2_4[0][idx], &ms->w[pos], y); 81 | k256_idx += WORDS_IN_VEC; 82 | } 83 | } 84 | } 85 | 86 | _INLINE_ void rounds_48_63(sha256_state_t * cur_state, 87 | const sha256_msg_schedule_t *ms) 88 | { 89 | PRAGMA_LOOP_UNROLL_16 90 | 91 | for(size_t i = SHA256_FINAL_ROUND_START_IDX; i < SHA256_ROUNDS_NUM; i++) { 92 | sha_round(cur_state, ms->w[LSB4(i)], 0); 93 | } 94 | } 95 | 96 | _INLINE_ void process_extra_block(sha256_state_t * cur_state, 97 | const sha256_word_t t[SHA256_ROUNDS_NUM]) 98 | { 99 | PRAGMA_LOOP_UNROLL_64 100 | 101 | for(size_t i = 0; i < SHA256_ROUNDS_NUM; i++) { 102 | sha_round(cur_state, t[i], 0); 103 | } 104 | } 105 | 106 | void sha256_compress_x86_64_avx512(sha256_state_t *state, 107 | const uint8_t * data, 108 | size_t blocks_num) 109 | { 110 | ALIGN(64) sha256_msg_schedule_t ms; 111 | ALIGN(64) sha256_word_t x2_4[3][SHA256_ROUNDS_NUM]; 112 | sha256_state_t cur_state; 113 | vec_t x[MS_VEC_NUM]; 114 | 115 | const size_t rem = LSB2(blocks_num); 116 | if(rem != 0) { 117 | sha256_compress_x86_64_avx2(state, data, rem); 118 | data += rem * SHA256_BLOCK_BYTE_LEN; 119 | blocks_num -= rem; 120 | } 121 | 122 | // Process four blocks in parallel 123 | // Here blocks_num is divisible by 4 124 | for(size_t b = blocks_num; b != 0; b -= 4) { 125 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 126 | 127 | load_data(x, &ms, x2_4, data); 128 | data += 4 * SHA256_BLOCK_BYTE_LEN; 129 | 130 | // First block 131 | rounds_0_47(&cur_state, x, &ms, x2_4); 132 | rounds_48_63(&cur_state, &ms); 133 | accumulate_state(state, &cur_state); 134 | 135 | for(size_t i = 0; i <= 2; i++) { 136 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 137 | process_extra_block(&cur_state, x2_4[i]); 138 | accumulate_state(state, &cur_state); 139 | } 140 | } 141 | 142 | secure_clean(&cur_state, sizeof(cur_state)); 143 | secure_clean(&ms, sizeof(ms)); 144 | secure_clean(x2_4, sizeof(x2_4)); 145 | } 146 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx_helper.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // An implementation of the compress function of SHA256 using avx/avx2/avx512 5 | // It was translated from assembly (OpenSSL) to C by 6 | // 7 | // Nir Drucker and Shay Gueron 8 | // AWS Cryptographic Algorithms Group.
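// For reference, a scalar sketch of the small-sigma functions that the
// vector code below emulates lane-wise. The rotation amounts are the FIPS
// 180-4 values, which the sigma0_*/sigma1_* shift constants used below are
// assumed to encode; the ref_* names are illustrative only.

#include <stdint.h>

static inline uint32_t ref_rotr32(uint32_t x, unsigned n)
{
  return (x >> n) | (x << (32 - n));
}

// sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ (x >> 3)
static inline uint32_t ref_sigma0(uint32_t x)
{
  return ref_rotr32(x, 7) ^ ref_rotr32(x, 18) ^ (x >> 3);
}

// sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ (x >> 10)
static inline uint32_t ref_sigma1(uint32_t x)
{
  return ref_rotr32(x, 17) ^ ref_rotr32(x, 19) ^ (x >> 10);
}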
9 | // (ndrucker@amazon.com, gueron@amazon.com) 10 | 11 | // This file depends on vec_t and on the following macros: 12 | // LOAD, ADD32, ALIGNR8, SRL32, SLL32, SRL64 13 | 14 | #define SHA256_WORD_BIT_LEN (8 * sizeof(sha256_word_t)) 15 | 16 | _INLINE_ void rotate_x(vec_t x[4]) 17 | { 18 | const vec_t tmp = x[0]; 19 | x[0] = x[1]; 20 | x[1] = x[2]; 21 | x[2] = x[3]; 22 | x[3] = tmp; 23 | } 24 | 25 | #ifndef ALTERNATIVE_AVX512_IMPL 26 | 27 | _INLINE_ vec_t sha256_update_x_avx(vec_t x[4], 28 | const sha256_word_t *k256_p, 29 | const vec_t lo_mask, 30 | const vec_t hi_mask) 31 | { 32 | vec_t t[4]; 33 | 34 | // This function receives 4 128-bit registers x[3:0]=d[15:0] and calculates: 35 | // s0 = sigma0(d[(i + 1) % 16]) 36 | // s1 = sigma1(d[(i + 14) % 16]) 37 | // d[i % 16] += s0 + s1 + d[(i + 9) % 16] 38 | // 39 | // For x[0]=d[3:0] 40 | // 41 | // This means that 42 | // res[0] depends on d[1] (for s0) d[14] (for s1) and d[9] 43 | // res[1] depends on d[2] (for s0) d[15] (for s1) and d[10] 44 | // res[2] depends on d[3] (for s0) res[0] (for s1) and d[11] 45 | // res[3] depends on d[4] (for s0) res[1] (for s1) and d[12] 46 | 47 | t[0] = ALIGNR8(x[1], x[0], 4); // d[4:1] 48 | t[3] = ALIGNR8(x[3], x[2], 4); // d[12:9] 49 | t[2] = SRL32(t[0], sigma0_0); // d[4:1] >> s0[0] 50 | x[0] = ADD32(x[0], t[3]); // d[3:0] + d[12:9] 51 | 52 | t[3] = SRL32(t[0], sigma0_2); // d[4:1] >> s0[2] 53 | t[1] = SLL32(t[0], SHA256_WORD_BIT_LEN - sigma0_1); // d[4:1] << (32 - s0[1]) 54 | t[0] = t[3] ^ t[2]; // (d[4:1] >> s0[2]) ^ 55 | // (d[4:1] >> s0[0]) 56 | t[3] = SHUF32(x[3], 0xfa); // d[15,15,14,14] 57 | t[2] = SRL32(t[2], sigma0_1 - sigma0_0); // d[4:1] >> s0[1] 58 | t[0] ^= t[1] ^ t[2]; // ROTR(d[4:1], s0[1]) ^ 59 | // (d[4:1] >> s0[2]) ^ 60 | // (d[4:1] >> s0[0]) 61 | t[1] = SLL32(t[1], sigma0_1 - sigma0_0); // d[4:1] << (32 - s0[0]) 62 | t[2] = SRL32(t[3], sigma1_2); // d[15,15,14,14] >> s1[2] 63 | t[3] = SRL64(t[3], sigma1_0); // ROTR(d[-,15,-,14], s1[0]) 64 | x[0] = ADD32(x[0], t[0] ^ t[1]); // d[3:0] + sigma0(d[4:1]) 65 | 66 | t[2] ^= t[3]; // d[15,15,14,14] >> s1[2] ^ ROTR(d[-,15,-,14], s1[0]) 67 | t[3] = SRL64(t[3], sigma1_1 - sigma1_0); // ROTR(d[-,15,-,14], s1[1]) 68 | t[2] = SHUF8(t[2] ^ t[3], lo_mask); // sigma1(d[Zero,Zero,15,14]) 69 | x[0] = ADD32(x[0], t[2]); // d[3:0] + sigma0(d[4:1]) + 70 | // sigma1(d[-,-,15,14]) + d[12:9] 71 | 72 | // When calculating s1 = sigma1(s1) for the upper dwords 73 | // we use the already updated d[1:0] 74 | t[3] = SHUF32(x[0], 0x50); // d[1,1,0,0] 75 | t[2] = SRL32(t[3], sigma1_2); // d[1,1,0,0] >> s1[2] 76 | t[3] = SRL64(t[3], sigma1_0); // ROTR(d[-,1,-,0]) >> s1[0] 77 | t[2] ^= t[3]; // ROTR(d[-,1,-,0]) >> s1[0] ^ 78 | // d[1,1,0,0] >> s1[2] 79 | t[3] = SRL64(t[3], sigma1_1 - sigma1_0); // ROTR(d[-,1,-,0]) >> s1[1] 80 | 81 | // sigma1(d[0,x[1],0,x[0]]) 82 | // sigma1(d[x[1],x[0],Zero,Zero]) 83 | x[0] = ADD32(x[0], SHUF8(t[2] ^ t[3], hi_mask)); 84 | 85 | rotate_x(x); 86 | 87 | return ADD32(x[3], LOAD(k256_p)); 88 | } 89 | 90 | #else 91 | 92 | _INLINE_ vec_t sha256_update_x_avx(vec_t x[4], 93 | const sha256_word_t *k256_p, 94 | UNUSED const vec_t lo_mask, 95 | UNUSED const vec_t hi_mask) 96 | { 97 | vec_t t[2]; 98 | vec_t s0; 99 | vec_t s1; 100 | 101 | // This function receives 4 128-bit registers x[3:0]=d[15:0] and calculates: 102 | // s0 = sigma0(d[(i + 1) % 16]) 103 | // s1 = sigma1(d[(i + 14) % 16]) 104 | // d[i % 16] += s0 + s1 + d[(i + 9) % 16] 105 | // 106 | // For x[0]=d[3:0] 107 | // 108 | // This means that 109 | // res[0] depends on d[1] (for s0) d[14]
(for s1) and d[9] 110 | // res[1] depends on d[2] (for s0) d[15] (for s1) and d[10] 111 | // res[2] depends on d[3] (for s0) res[0] (for s1) and d[11] 112 | // res[3] depends on d[4] (for s0) res[1] (for s1) and d[12] 113 | 114 | t[0] = ALIGNR8(x[1], x[0], 4); // d[4:1] 115 | t[1] = ALIGNR8(x[3], x[2], 4); // d[12:9] 116 | x[0] = ADD32(x[0], t[1]); // d[3:0] + d[12:9] 117 | s0 = ROR32(t[0], sigma0_0) ^ ROR32(t[0], sigma0_1) ^ SRL32(t[0], sigma0_2); 118 | x[0] = ADD32(x[0], s0); // d[3:0] + d[12:9] + sigma0(d[4:1]) 119 | 120 | t[1] = SHUF32(x[3], 0xfe); // d[-,-,15,14] 121 | s1 = ROR32(t[1], sigma1_0) ^ ROR32(t[1], sigma1_1) ^ SRL32(t[1], sigma1_2); 122 | x[0] = MADD32(x[0], LOW32X2_MASK, x[0], s1); 123 | 124 | t[1] = SHUF32(x[0], 0x40); 125 | s1 = ROR32(t[1], sigma1_0) ^ ROR32(t[1], sigma1_1) ^ SRL32(t[1], sigma1_2); 126 | x[0] = MADD32(x[0], HIGH32X2_MASK, x[0], s1); 127 | 128 | rotate_x(x); 129 | 130 | return ADD32(x[3], LOAD(k256_p)); 131 | } 132 | 133 | #endif 134 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_sha_ext.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using the SHA extension 5 | // The implementation is based on: 6 | // https://software.intel.com/en-us/articles/intel-sha-extensions 7 | // 8 | // Written by Nir Drucker and Shay Gueron 9 | // AWS Cryptographic Algorithms Group. 10 | // (ndrucker@amazon.com, gueron@amazon.com) 11 | 12 | #include "avx_defs.h" 13 | #include "sha256_defs.h" 14 | 15 | #define RND2(s0, s1, data) (_mm_sha256rnds2_epu32(s0, s1, data)) 16 | #define SHAMSG1(m1, m2) (_mm_sha256msg1_epu32(m1, m2)) 17 | #define SHAMSG2(m1, m2) (_mm_sha256msg2_epu32(m1, m2)) 18 | 19 | #define SET_K(i) \ 20 | (SETR32(K256[4 * (i)], K256[(4 * (i)) + 1], K256[(4 * (i)) + 2], \ 21 | K256[(4 * (i)) + 3])) 22 | 23 | void sha256_compress_x86_64_sha_ext(IN OUT sha256_state_t *state, 24 | IN const uint8_t *data, 25 | IN size_t blocks_num) 26 | { 27 | vec_t state0; 28 | vec_t state1; 29 | vec_t msg; 30 | vec_t tmp; 31 | vec_t msgtmp[4]; 32 | vec_t ABEF_SAVE; 33 | vec_t CDGH_SAVE; 34 | 35 | const vec_t shuf_mask = 36 | SET64(UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203)); 37 | 38 | tmp = SHUF32(LOAD(&state->w[0]), 0xB1); // CDAB 39 | state1 = SHUF32(LOAD(&state->w[4]), 0x1B); // EFGH 40 | state0 = ALIGNR8(tmp, state1, 8); // ABEF 41 | state1 = BLEND16(state1, tmp, 0xF0); // CDGH 42 | 43 | while(blocks_num--) { 44 | // Save the current state 45 | ABEF_SAVE = state0; 46 | CDGH_SAVE = state1; 47 | 48 | // Rounds 0-3 49 | msgtmp[0] = SHUF8(LOAD(&data[0]), shuf_mask); 50 | msg = ADD32(msgtmp[0], SET_K(0)); 51 | state1 = RND2(state1, state0, msg); 52 | msg = SHUF32(msg, 0x0E); 53 | state0 = RND2(state0, state1, msg); 54 | 55 | PRAGMA_LOOP_UNROLL_2 56 | 57 | // Rounds 4-7 (i=1) 58 | // Rounds 8-11 (i=2) 59 | for(size_t i = 1; i <= 2; i++) { 60 | msgtmp[i] = SHUF8(LOAD(&data[16 * i]), shuf_mask); 61 | msg = ADD32(msgtmp[i], SET_K(i)); 62 | state1 = RND2(state1, state0, msg); 63 | msg = SHUF32(msg, 0x0E); 64 | state0 = RND2(state0, state1, msg); 65 | msgtmp[i - 1] = SHAMSG1(msgtmp[i - 1], msgtmp[i]); 66 | } 67 | 68 | // Rounds 12-59 in blocks of 4 (12 multi-rounds) 69 | msgtmp[3] = SHUF8(LOAD(&data[48]), shuf_mask); 70 | 71 | PRAGMA_LOOP_UNROLL_12 72 | 73 | for(size_t i = 3; i <= 14; i++) { 74 | 
const size_t prev = LSB2(i - 1); 75 | const size_t curr = LSB2(i); 76 | const size_t next = LSB2(i + 1); 77 | 78 | msg = ADD32(msgtmp[curr], SET_K(i)); 79 | state1 = RND2(state1, state0, msg); 80 | tmp = ALIGNR8(msgtmp[curr], msgtmp[prev], 4); 81 | msgtmp[next] = ADD32(msgtmp[next], tmp); 82 | msgtmp[next] = SHAMSG2(msgtmp[next], msgtmp[curr]); 83 | msg = SHUF32(msg, 0x0E); 84 | state0 = RND2(state0, state1, msg); 85 | msgtmp[prev] = SHAMSG1(msgtmp[prev], msgtmp[curr]); 86 | } 87 | 88 | // Rounds 60-63 89 | msg = ADD32(msgtmp[3], SET_K(15)); 90 | state1 = RND2(state1, state0, msg); 91 | msg = SHUF32(msg, 0x0E); 92 | state0 = RND2(state0, state1, msg); 93 | 94 | // Accumulate state 95 | state0 = ADD32(state0, ABEF_SAVE); 96 | state1 = ADD32(state1, CDGH_SAVE); 97 | 98 | data += SHA256_BLOCK_BYTE_LEN; 99 | } 100 | 101 | tmp = SHUF32(state0, 0x1B); // FEBA 102 | state1 = SHUF32(state1, 0xB1); // DCHG 103 | state0 = BLEND16(tmp, state1, 0xF0); // DCBA 104 | state1 = ALIGNR8(state1, tmp, 8); // HGFE 105 | 106 | STORE((vec_t *)&state->w[0], state0); 107 | STORE((vec_t *)&state->w[4], state1); 108 | } 109 | -------------------------------------------------------------------------------- /src/sha256_consts.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha256_defs.h" 5 | 6 | #define K256_0 UINT32_C(0x428a2f98) 7 | #define K256_1 UINT32_C(0x71374491) 8 | #define K256_2 UINT32_C(0xb5c0fbcf) 9 | #define K256_3 UINT32_C(0xe9b5dba5) 10 | #define K256_4 UINT32_C(0x3956c25b) 11 | #define K256_5 UINT32_C(0x59f111f1) 12 | #define K256_6 UINT32_C(0x923f82a4) 13 | #define K256_7 UINT32_C(0xab1c5ed5) 14 | #define K256_8 UINT32_C(0xd807aa98) 15 | #define K256_9 UINT32_C(0x12835b01) 16 | #define K256_10 UINT32_C(0x243185be) 17 | #define K256_11 UINT32_C(0x550c7dc3) 18 | #define K256_12 UINT32_C(0x72be5d74) 19 | #define K256_13 UINT32_C(0x80deb1fe) 20 | #define K256_14 UINT32_C(0x9bdc06a7) 21 | #define K256_15 UINT32_C(0xc19bf174) 22 | #define K256_16 UINT32_C(0xe49b69c1) 23 | #define K256_17 UINT32_C(0xefbe4786) 24 | #define K256_18 UINT32_C(0x0fc19dc6) 25 | #define K256_19 UINT32_C(0x240ca1cc) 26 | #define K256_20 UINT32_C(0x2de92c6f) 27 | #define K256_21 UINT32_C(0x4a7484aa) 28 | #define K256_22 UINT32_C(0x5cb0a9dc) 29 | #define K256_23 UINT32_C(0x76f988da) 30 | #define K256_24 UINT32_C(0x983e5152) 31 | #define K256_25 UINT32_C(0xa831c66d) 32 | #define K256_26 UINT32_C(0xb00327c8) 33 | #define K256_27 UINT32_C(0xbf597fc7) 34 | #define K256_28 UINT32_C(0xc6e00bf3) 35 | #define K256_29 UINT32_C(0xd5a79147) 36 | #define K256_30 UINT32_C(0x06ca6351) 37 | #define K256_31 UINT32_C(0x14292967) 38 | #define K256_32 UINT32_C(0x27b70a85) 39 | #define K256_33 UINT32_C(0x2e1b2138) 40 | #define K256_34 UINT32_C(0x4d2c6dfc) 41 | #define K256_35 UINT32_C(0x53380d13) 42 | #define K256_36 UINT32_C(0x650a7354) 43 | #define K256_37 UINT32_C(0x766a0abb) 44 | #define K256_38 UINT32_C(0x81c2c92e) 45 | #define K256_39 UINT32_C(0x92722c85) 46 | #define K256_40 UINT32_C(0xa2bfe8a1) 47 | #define K256_41 UINT32_C(0xa81a664b) 48 | #define K256_42 UINT32_C(0xc24b8b70) 49 | #define K256_43 UINT32_C(0xc76c51a3) 50 | #define K256_44 UINT32_C(0xd192e819) 51 | #define K256_45 UINT32_C(0xd6990624) 52 | #define K256_46 UINT32_C(0xf40e3585) 53 | #define K256_47 UINT32_C(0x106aa070) 54 | #define K256_48 UINT32_C(0x19a4c116) 55 | #define K256_49 UINT32_C(0x1e376c08) 56 | #define
K256_50 UINT32_C(0x2748774c) 57 | #define K256_51 UINT32_C(0x34b0bcb5) 58 | #define K256_52 UINT32_C(0x391c0cb3) 59 | #define K256_53 UINT32_C(0x4ed8aa4a) 60 | #define K256_54 UINT32_C(0x5b9cca4f) 61 | #define K256_55 UINT32_C(0x682e6ff3) 62 | #define K256_56 UINT32_C(0x748f82ee) 63 | #define K256_57 UINT32_C(0x78a5636f) 64 | #define K256_58 UINT32_C(0x84c87814) 65 | #define K256_59 UINT32_C(0x8cc70208) 66 | #define K256_60 UINT32_C(0x90befffa) 67 | #define K256_61 UINT32_C(0xa4506ceb) 68 | #define K256_62 UINT32_C(0xbef9a3f7) 69 | #define K256_63 UINT32_C(0xc67178f2) 70 | 71 | ALIGN(64) 72 | const sha256_word_t K256[SHA256_ROUNDS_NUM] = { 73 | K256_0, K256_1, K256_2, K256_3, K256_4, K256_5, K256_6, K256_7, 74 | K256_8, K256_9, K256_10, K256_11, K256_12, K256_13, K256_14, K256_15, 75 | K256_16, K256_17, K256_18, K256_19, K256_20, K256_21, K256_22, K256_23, 76 | K256_24, K256_25, K256_26, K256_27, K256_28, K256_29, K256_30, K256_31, 77 | K256_32, K256_33, K256_34, K256_35, K256_36, K256_37, K256_38, K256_39, 78 | K256_40, K256_41, K256_42, K256_43, K256_44, K256_45, K256_46, K256_47, 79 | K256_48, K256_49, K256_50, K256_51, K256_52, K256_53, K256_54, K256_55, 80 | K256_56, K256_57, K256_58, K256_59, K256_60, K256_61, K256_62, K256_63}; 81 | 82 | ALIGN(64) 83 | const sha256_word_t K256x2[2 * SHA256_ROUNDS_NUM] = { 84 | DUP2(K256_0, K256_1, K256_2, K256_3), 85 | DUP2(K256_4, K256_5, K256_6, K256_7), 86 | DUP2(K256_8, K256_9, K256_10, K256_11), 87 | DUP2(K256_12, K256_13, K256_14, K256_15), 88 | DUP2(K256_16, K256_17, K256_18, K256_19), 89 | DUP2(K256_20, K256_21, K256_22, K256_23), 90 | DUP2(K256_24, K256_25, K256_26, K256_27), 91 | DUP2(K256_28, K256_29, K256_30, K256_31), 92 | DUP2(K256_32, K256_33, K256_34, K256_35), 93 | DUP2(K256_36, K256_37, K256_38, K256_39), 94 | DUP2(K256_40, K256_41, K256_42, K256_43), 95 | DUP2(K256_44, K256_45, K256_46, K256_47), 96 | DUP2(K256_48, K256_49, K256_50, K256_51), 97 | DUP2(K256_52, K256_53, K256_54, K256_55), 98 | DUP2(K256_56, K256_57, K256_58, K256_59), 99 | DUP2(K256_60, K256_61, K256_62, K256_63)}; 100 | 101 | ALIGN(64) 102 | const sha256_word_t K256x4[4 * SHA256_ROUNDS_NUM] = { 103 | DUP4(K256_0, K256_1, K256_2, K256_3), 104 | DUP4(K256_4, K256_5, K256_6, K256_7), 105 | DUP4(K256_8, K256_9, K256_10, K256_11), 106 | DUP4(K256_12, K256_13, K256_14, K256_15), 107 | DUP4(K256_16, K256_17, K256_18, K256_19), 108 | DUP4(K256_20, K256_21, K256_22, K256_23), 109 | DUP4(K256_24, K256_25, K256_26, K256_27), 110 | DUP4(K256_28, K256_29, K256_30, K256_31), 111 | DUP4(K256_32, K256_33, K256_34, K256_35), 112 | DUP4(K256_36, K256_37, K256_38, K256_39), 113 | DUP4(K256_40, K256_41, K256_42, K256_43), 114 | DUP4(K256_44, K256_45, K256_46, K256_47), 115 | DUP4(K256_48, K256_49, K256_50, K256_51), 116 | DUP4(K256_52, K256_53, K256_54, K256_55), 117 | DUP4(K256_56, K256_57, K256_58, K256_59), 118 | DUP4(K256_60, K256_61, K256_62, K256_63)}; 119 | -------------------------------------------------------------------------------- /src/sha512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <assert.h> 5 | 6 | #include "sha512_defs.h" 7 | 8 | #define LAST_BLOCK_BYTE_LEN (2 * SHA512_BLOCK_BYTE_LEN) 9 | 10 | typedef struct sha512_hash_s { 11 | ALIGN(64) sha512_state_t state; 12 | uint64_t len; 13 | 14 | ALIGN(64) uint8_t data[LAST_BLOCK_BYTE_LEN]; 15 | 16 | sha512_word_t rem; 17 | sha_impl_t impl; 18 | } sha512_ctx_t; 19 | 20 | _INLINE_ void sha512_init(OUT sha512_ctx_t *ctx) 21 | { 22 | ctx->state.w[0] = UINT64_C(0x6a09e667f3bcc908); 23 | ctx->state.w[1] = UINT64_C(0xbb67ae8584caa73b); 24 | ctx->state.w[2] = UINT64_C(0x3c6ef372fe94f82b); 25 | ctx->state.w[3] = UINT64_C(0xa54ff53a5f1d36f1); 26 | ctx->state.w[4] = UINT64_C(0x510e527fade682d1); 27 | ctx->state.w[5] = UINT64_C(0x9b05688c2b3e6c1f); 28 | ctx->state.w[6] = UINT64_C(0x1f83d9abfb41bd6b); 29 | ctx->state.w[7] = UINT64_C(0x5be0cd19137e2179); 30 | } 31 | 32 | _INLINE_ void sha512_compress(IN OUT sha512_ctx_t *ctx, 33 | IN const uint8_t *data, 34 | IN const size_t blocks_num) 35 | { 36 | assert((ctx != NULL) && (data != NULL)); 37 | 38 | // OpenSSL code can crash without this check 39 | if(blocks_num == 0) { 40 | return; 41 | } 42 | 43 | switch(ctx->impl) { 44 | #if defined(X86_64) 45 | case AVX_IMPL: 46 | sha512_compress_x86_64_avx(&ctx->state, data, blocks_num); 47 | break; 48 | 49 | case OPENSSL_AVX_IMPL: 50 | RUN_OPENSSL_CODE_WITH_AVX( 51 | sha512_block_data_order_local(ctx->state.w, data, blocks_num);); 52 | break; 53 | #endif 54 | 55 | #if defined(AVX2_SUPPORT) 56 | case AVX2_IMPL: 57 | sha512_compress_x86_64_avx2(&ctx->state, data, blocks_num); 58 | break; 59 | 60 | case OPENSSL_AVX2_IMPL: 61 | RUN_OPENSSL_CODE_WITH_AVX2( 62 | sha512_block_data_order_local(ctx->state.w, data, blocks_num);); 63 | break; 64 | #endif 65 | 66 | #if defined(AVX512_SUPPORT) 67 | case AVX512_IMPL: 68 | sha512_compress_x86_64_avx512(&ctx->state, data, blocks_num); 69 | break; 70 | #endif 71 | 72 | #if defined(NEON_SUPPORT) 73 | case OPENSSL_NEON_IMPL: 74 | RUN_OPENSSL_CODE_WITH_NEON( 75 | sha512_block_data_order_local(ctx->state.w, data, blocks_num);); 76 | break; 77 | #endif 78 | 79 | default: sha512_compress_generic(&ctx->state, data, blocks_num); break; 80 | } 81 | } 82 | 83 | _INLINE_ void sha512_update(IN OUT sha512_ctx_t *ctx, 84 | IN const uint8_t *data, 85 | IN size_t byte_len) 86 | { 87 | // On exiting this function ctx->rem < SHA512_BLOCK_BYTE_LEN 88 | 89 | assert((ctx != NULL) && (data != NULL)); 90 | 91 | if(byte_len == 0) { 92 | return; 93 | } 94 | 95 | // Accumulate the overall size 96 | ctx->len += byte_len; 97 | 98 | // Less than a block.
Store the data in a temporary buffer 99 | if((ctx->rem != 0) && (ctx->rem + byte_len < SHA512_BLOCK_BYTE_LEN)) { 100 | my_memcpy(&ctx->data[ctx->rem], data, byte_len); 101 | ctx->rem += byte_len; 102 | return; 103 | } 104 | 105 | // Complete and compress a previously stored block 106 | if(ctx->rem != 0) { 107 | const size_t clen = SHA512_BLOCK_BYTE_LEN - ctx->rem; 108 | my_memcpy(&ctx->data[ctx->rem], data, clen); 109 | sha512_compress(ctx, ctx->data, 1); 110 | 111 | data += clen; 112 | byte_len -= clen; 113 | 114 | ctx->rem = 0; 115 | secure_clean(ctx->data, SHA512_BLOCK_BYTE_LEN); 116 | } 117 | 118 | // Compress full blocks 119 | if(byte_len >= SHA512_BLOCK_BYTE_LEN) { 120 | const size_t blocks_num = (byte_len >> 7); 121 | const size_t full_blocks_byte_len = (blocks_num << 7); 122 | 123 | sha512_compress(ctx, data, blocks_num); 124 | 125 | data += full_blocks_byte_len; 126 | byte_len -= full_blocks_byte_len; 127 | } 128 | 129 | // Store the remainder 130 | my_memcpy(ctx->data, data, byte_len); 131 | ctx->rem = byte_len; 132 | } 133 | 134 | _INLINE_ void sha512_final(OUT uint8_t *dgst, IN OUT sha512_ctx_t *ctx) 135 | { 136 | assert((ctx != NULL) && (dgst != NULL)); 137 | assert(ctx->rem < SHA512_BLOCK_BYTE_LEN); 138 | 139 | // Byteswap the length in bits of the hashed message 140 | const uint64_t bswap_len = bswap_64(8 * ctx->len); 141 | const size_t last_block_num = (ctx->rem < 112) ? 1 : 2; 142 | const size_t last_qw_pos = 143 | (last_block_num * SHA512_BLOCK_BYTE_LEN) - sizeof(bswap_len); 144 | 145 | ctx->data[ctx->rem++] = SHA512_MSG_END_SYMBOL; 146 | 147 | // Reset the rest of the data buffer 148 | my_memset(&ctx->data[ctx->rem], 0, sizeof(ctx->data) - ctx->rem); 149 | my_memcpy(&ctx->data[last_qw_pos], (const uint8_t *)&bswap_len, 150 | sizeof(bswap_len)); 151 | 152 | // Compress the final block 153 | sha512_compress(ctx, ctx->data, last_block_num); 154 | 155 | // This implementation assumes running on a little-endian machine 156 | ctx->state.w[0] = bswap_64(ctx->state.w[0]); 157 | ctx->state.w[1] = bswap_64(ctx->state.w[1]); 158 | ctx->state.w[2] = bswap_64(ctx->state.w[2]); 159 | ctx->state.w[3] = bswap_64(ctx->state.w[3]); 160 | ctx->state.w[4] = bswap_64(ctx->state.w[4]); 161 | ctx->state.w[5] = bswap_64(ctx->state.w[5]); 162 | ctx->state.w[6] = bswap_64(ctx->state.w[6]); 163 | ctx->state.w[7] = bswap_64(ctx->state.w[7]); 164 | my_memcpy(dgst, ctx->state.w, SHA512_HASH_BYTE_LEN); 165 | 166 | secure_clean(ctx, sizeof(*ctx)); 167 | } 168 | 169 | void sha512(OUT uint8_t *dgst, 170 | IN const uint8_t * data, 171 | IN const size_t byte_len, 172 | IN const sha_impl_t impl) 173 | { 174 | assert((data != NULL) && (dgst != NULL)); 175 | 176 | sha512_ctx_t ctx = {0}; 177 | ctx.impl = impl; 178 | sha512_init(&ctx); 179 | sha512_update(&ctx, data, byte_len); 180 | sha512_final(dgst, &ctx); 181 | } 182 | -------------------------------------------------------------------------------- /src/sha512_compress_generic.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
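//
// A note on the padding threshold in sha512_final above: FIPS 180-4 reserves
// the last 16 bytes of the final block for a 128-bit message length, hence
// the (ctx->rem < 112) test, i.e. 112 = 128 - 16. Only the low 64 bits of the
// length are written at last_qw_pos; the high 64 bits stay zero via the
// preceding my_memset, which is equivalent for any message shorter than
// 2^64 bits.
//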
/src/sha512_compress_generic.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha512_defs.h" 5 | 6 | // In the generic implementation we use memcpy to avoid alignment issues 7 | _INLINE_ sha512_word_t load_be64(IN const void *ptr) 8 | { 9 | sha512_word_t ret; 10 | my_memcpy(&ret, ptr, sizeof(ret)); 11 | return bswap_64(ret); 12 | } 13 | 14 | _INLINE_ void load_data_and_rounds_00_15(OUT sha512_msg_schedule_t *ms, 15 | IN OUT sha512_state_t *cur_state, 16 | IN const uint8_t *data) 17 | { 18 | PRAGMA_LOOP_UNROLL_4 19 | 20 | for(size_t i = 0; i < SHA512_BLOCK_WORDS_NUM; i++) { 21 | ms->w[i] = load_be64(&data[sizeof(sha512_word_t) * i]); 22 | sha_round(cur_state, ms->w[i], K512[i]); 23 | } 24 | } 25 | 26 | _INLINE_ void rounds_16_79(IN OUT sha512_state_t *cur_state, 27 | IN OUT sha512_msg_schedule_t *ms) 28 | { 29 | PRAGMA_LOOP_UNROLL_64 30 | 31 | for(size_t i = SHA512_BLOCK_WORDS_NUM; i < SHA512_ROUNDS_NUM; i++) { 32 | const sha512_word_t x1 = ms->w[LSB4(i + 1)]; 33 | const sha512_word_t x9 = ms->w[LSB4(i + 9)]; 34 | const sha512_word_t x14 = ms->w[LSB4(i + 14)]; 35 | 36 | ms->w[LSB4(i)] += sigma0(x1) + sigma1(x14) + x9; 37 | sha_round(cur_state, ms->w[LSB4(i)], K512[i]); 38 | } 39 | } 40 | 41 | void sha512_compress_generic(IN OUT sha512_state_t *state, 42 | IN const uint8_t *data, 43 | IN size_t blocks_num) 44 | { 45 | sha512_state_t cur_state; 46 | sha512_msg_schedule_t ms; 47 | 48 | while(blocks_num--) { 49 | my_memcpy(&cur_state, state, sizeof(cur_state)); 50 | 51 | load_data_and_rounds_00_15(&ms, &cur_state, data); 52 | data += SHA512_BLOCK_BYTE_LEN; 53 | 54 | rounds_16_79(&cur_state, &ms); 55 | accumulate_state(state, &cur_state); 56 | } 57 | 58 | secure_clean(&cur_state, sizeof(cur_state)); 59 | secure_clean(&ms, sizeof(ms)); 60 | } 61 | --------------------------------------------------------------------------------
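For reference, the sigma0() and sigma1() used by rounds_16_79() above are assumed to be the FIPS 180-4 SHA-512 small-sigma functions (rotations by 1/8 plus a shift by 7, and rotations by 19/61 plus a shift by 6). A minimal scalar sketch; ROTR64, sigma0_ref, and sigma1_ref are illustrative names, not part of this repository:

#include <stdint.h>

#define ROTR64(x, s) (((x) >> (s)) | ((x) << (64 - (s))))

// FIPS 180-4, section 4.1.3: the SHA-512 message-schedule sigma functions.
static inline uint64_t sigma0_ref(uint64_t x)
{
  return ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7);
}

static inline uint64_t sigma1_ref(uint64_t x)
{
  return ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6);
}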
/src/sha512_compress_x86_64_avx.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA512 using avx 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx_defs.h" 15 | #include "sha512_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD64, ALIGNR8, SRL64, SLL64 18 | // that are defined in avx_defs.h 19 | #include "sha512_compress_x86_64_avx_helper.c" 20 | 21 | #define MS_VEC_NUM (SHA512_BLOCK_BYTE_LEN / sizeof(vec_t)) 22 | #define WORDS_IN_VEC (16 / sizeof(sha512_word_t)) 23 | 24 | _INLINE_ void load_data(OUT vec_t x[MS_VEC_NUM], 25 | IN OUT sha512_msg_schedule_t *ms, 26 | IN const uint8_t *data) 27 | { 28 | // 64 bits (8 bytes) swap masks 29 | const vec_t shuf_mask = 30 | _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b); 31 | 32 | PRAGMA_LOOP_UNROLL_8 33 | 34 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 35 | const size_t pos = WORDS_IN_VEC * i; 36 | 37 | x[i] = LOAD(&data[sizeof(vec_t) * i]); 38 | x[i] = SHUF8(x[i], shuf_mask); 39 | STORE(&ms->w[pos], ADD64(x[i], LOAD(&K512[pos]))); 40 | } 41 | } 42 | 43 | _INLINE_ void rounds_0_63(sha512_state_t * cur_state, 44 | vec_t x[MS_VEC_NUM], 45 | sha512_msg_schedule_t *ms) 46 | { 47 | // The first SHA512_BLOCK_WORDS_NUM entries of K512 were loaded in 48 | // load_data(...). 49 | size_t k512_idx = SHA512_BLOCK_WORDS_NUM; 50 | 51 | // Rounds 0-63 (0-15, 16-31, 32-47, 48-63) 52 | for(size_t i = 0; i < 4; i++) { 53 | 54 | PRAGMA_LOOP_UNROLL_8 55 | 56 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 57 | const size_t pos = WORDS_IN_VEC * j; 58 | 59 | const vec_t y = sha512_update_x_avx(x, &K512[k512_idx]); 60 | 61 | sha_round(cur_state, ms->w[pos], 0); 62 | sha_round(cur_state, ms->w[pos + 1], 0); 63 | 64 | STORE(&ms->w[pos], y); 65 | k512_idx += WORDS_IN_VEC; 66 | } 67 | } 68 | } 69 | 70 | _INLINE_ void rounds_64_79(sha512_state_t * cur_state, 71 | const sha512_msg_schedule_t *ms) 72 | { 73 | PRAGMA_LOOP_UNROLL_16 74 | 75 | for(size_t i = SHA512_FINAL_ROUND_START_IDX; i < SHA512_ROUNDS_NUM; i++) { 76 | sha_round(cur_state, ms->w[LSB4(i)], 0); 77 | } 78 | } 79 | 80 | void sha512_compress_x86_64_avx(sha512_state_t *state, 81 | const uint8_t * data, 82 | size_t blocks_num) 83 | { 84 | sha512_state_t cur_state; 85 | sha512_msg_schedule_t ms; 86 | vec_t x[MS_VEC_NUM]; 87 | 88 | while(blocks_num--) { 89 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 90 | 91 | load_data(x, &ms, data); 92 | data += SHA512_BLOCK_BYTE_LEN; 93 | 94 | rounds_0_63(&cur_state, x, &ms); 95 | rounds_64_79(&cur_state, &ms); 96 | accumulate_state(state, &cur_state); 97 | } 98 | 99 | secure_clean(&cur_state, sizeof(cur_state)); 100 | secure_clean(&ms, sizeof(ms)); 101 | } 102 | --------------------------------------------------------------------------------
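The AVX2 variant that follows keeps two message blocks in flight: each 256-bit vector holds a pair of schedule words from block 0 in one 128-bit lane and the corresponding pair from block 1 in the other, so the round constants are consumed from the duplicated K512x2 table instead of K512. A sketch of the intended layout, assuming DUP2(a, b) expands to a, b, a, b (lane order is an assumption):

// one 256-bit vector, word-pair index i:
//   x[i]           = [ block0 w[2i], w[2i+1] | block1 w[2i], w[2i+1] ]
//   K512x2[4*i..]  = [ K512_2i, K512_2i+1    | K512_2i, K512_2i+1    ]

The constant-added words for the first block feed the rounds immediately through ms->w, while the second block's words are spilled to t2[] and replayed later by process_second_block().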
/src/sha512_compress_x86_64_avx2.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA512 using avx2 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx2_defs.h" 15 | #include "sha512_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD64, ALIGNR8, SRL64, SLL64 18 | // that are defined in avx2_defs.h 19 | #include "sha512_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 2 blocks in parallel 22 | #define MS_VEC_NUM ((2 * SHA512_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha512_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha512_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha512_msg_schedule_t *ms, 28 | sha512_word_t t2[SHA512_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 64 bits (8 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm256_set_epi64x(DUP2(0x08090a0b0c0d0e0f, 0x0001020304050607)); 34 | 35 | PRAGMA_LOOP_UNROLL_8 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 2) * i; 39 | const size_t pos1 = pos0 + SHA512_BLOCK_BYTE_LEN; 40 | 41 | LOADU2(&data[pos1], &data[pos0], x[i]); 42 | x[i] = SHUF8(x[i], shuf_mask); 43 | vec_t y = ADD64(x[i], LOAD(&K512x2[4 * i])); 44 | STOREU2(&t2[2 * i], &ms->w[2 * i], y); 45 | } 46 | } 47 | 48 | _INLINE_ void rounds_0_63(sha512_state_t * cur_state, 49 | vec_t x[MS_VEC_NUM], 50 | sha512_msg_schedule_t *ms, 51 | sha512_word_t t2[SHA512_ROUNDS_NUM]) 52 | { 53 | // The first SHA512_BLOCK_WORDS_NUM entries of K512 were loaded in 54 | // load_data(...). 55 | size_t k512_idx = 2 * SHA512_BLOCK_WORDS_NUM; 56 | 57 | // Rounds 0-63 (0-15, 16-31, 32-47, 48-63) 58 | for(size_t i = 1; i < 5; i++) { 59 | 60 | PRAGMA_LOOP_UNROLL_8 61 | 62 | for(size_t j = 0; j < 8; j++) { 63 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 64 | 65 | const vec_t y = sha512_update_x_avx(x, &K512x2[k512_idx]); 66 | 67 | sha_round(cur_state, ms->w[pos], 0); 68 | sha_round(cur_state, ms->w[pos + 1], 0); 69 | STOREU2(&t2[(16 * i) + pos], &ms->w[pos], y); 70 | k512_idx += WORDS_IN_VEC; 71 | } 72 | } 73 | } 74 | 75 | _INLINE_ void rounds_64_79(sha512_state_t * cur_state, 76 | const sha512_msg_schedule_t *ms) 77 | { 78 | PRAGMA_LOOP_UNROLL_16 79 | 80 | for(size_t i = SHA512_FINAL_ROUND_START_IDX; i < SHA512_ROUNDS_NUM; i++) { 81 | sha_round(cur_state, ms->w[LSB4(i)], 0); 82 | } 83 | } 84 | 85 | _INLINE_ void process_second_block(sha512_state_t * cur_state, 86 | const sha512_word_t t2[SHA512_ROUNDS_NUM]) 87 | { 88 | PRAGMA_LOOP_UNROLL_80 89 | 90 | for(size_t i = 0; i < SHA512_ROUNDS_NUM; i++) { 91 | sha_round(cur_state, t2[i], 0); 92 | } 93 | } 94 | 95 | void sha512_compress_x86_64_avx2(sha512_state_t *state, 96 | const uint8_t * data, 97 | size_t blocks_num) 98 | { 99 | ALIGN(64) sha512_msg_schedule_t ms; 100 | ALIGN(64) sha512_word_t t2[SHA512_ROUNDS_NUM]; 101 | sha512_state_t cur_state; 102 | vec_t x[MS_VEC_NUM]; 103 | 104 | if(LSB1(blocks_num)) { 105 | sha512_compress_x86_64_avx(state, data, 1); 106 | data += SHA512_BLOCK_BYTE_LEN; 107 | blocks_num--; 108 | } 109 | 110 | // Process two blocks in parallel 111 | // Here blocks_num is even 112 | for(size_t b = blocks_num; b != 0; b -= 2) { 113 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 114 | 115 | load_data(x, &ms, t2, data); 116 | data += 2 * SHA512_BLOCK_BYTE_LEN; 117 | 118 | // First block 119 | rounds_0_63(&cur_state, x, &ms, t2); 120 | rounds_64_79(&cur_state, &ms); 121 | accumulate_state(state, &cur_state); 122 | 123 | // Second block 124 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 125 | process_second_block(&cur_state,
t2); 126 | accumulate_state(state, &cur_state); 127 | } 128 | 129 | secure_clean(&cur_state, sizeof(cur_state)); 130 | secure_clean(&ms, sizeof(ms)); 131 | secure_clean(t2, sizeof(t2)); 132 | } 133 | -------------------------------------------------------------------------------- /src/sha512_compress_x86_64_avx512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA512 using avx512 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx512_defs.h" 15 | #include "sha512_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD64, ALIGNR8, SRL64, SLL64 18 | // that are defined in avx512_defs.h 19 | #include "sha512_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 4 blocks in parallel 22 | #define MS_VEC_NUM ((4 * SHA512_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha512_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha512_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha512_msg_schedule_t *ms, 28 | sha512_word_t x2_4[][SHA512_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 64 bits (8 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm512_set_epi64(DUP4(0x08090a0b0c0d0e0f, 0x0001020304050607)); 34 | 35 | PRAGMA_LOOP_UNROLL_8 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 4) * i; 39 | const size_t pos1 = pos0 + SHA512_BLOCK_BYTE_LEN; 40 | const size_t pos2 = pos1 + SHA512_BLOCK_BYTE_LEN; 41 | const size_t pos3 = pos2 + SHA512_BLOCK_BYTE_LEN; 42 | LOADU4(&data[pos3], &data[pos2], &data[pos1], &data[pos0], x[i]); 43 | 44 | x[i] = SHUF8(x[i], shuf_mask); 45 | vec_t y = ADD64(x[i], LOAD(&K512x4[8 * i])); 46 | 47 | STOREU4(&x2_4[2][2 * i], &x2_4[1][2 * i], &x2_4[0][2 * i], &ms->w[2 * i], y); 48 | } 49 | } 50 | 51 | _INLINE_ void rounds_0_63(sha512_state_t * cur_state, 52 | vec_t x[MS_VEC_NUM], 53 | sha512_msg_schedule_t *ms, 54 | sha512_word_t x2_4[][SHA512_ROUNDS_NUM]) 55 | { 56 | // The first SHA512_BLOCK_WORDS_NUM entries of K512 were loaded in 57 | // load_data(...). 
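// K512x4 holds every round constant duplicated four times (one copy per
// 128-bit lane), so k512_idx below starts at the round-16 constants
// (4 * SHA512_BLOCK_WORDS_NUM words in) and advances by WORDS_IN_VEC (8)
// words per 512-bit vector; (k512_idx >> 2) recovers the per-block round
// index used for the x2_4 spill slots.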
58 | size_t k512_idx = 4 * SHA512_BLOCK_WORDS_NUM; 59 | 60 | // Rounds 0-63 (0-15, 16-31, 32-47, 48-63) 61 | for(size_t i = 1; i < 5; i++) { 62 | 63 | PRAGMA_LOOP_UNROLL_8 64 | 65 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 66 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 67 | const vec_t y = sha512_update_x_avx(x, &K512x4[k512_idx]); 68 | 69 | sha_round(cur_state, ms->w[pos], 0); 70 | sha_round(cur_state, ms->w[pos + 1], 0); 71 | const size_t idx = k512_idx >> 2; 72 | 73 | STOREU4(&x2_4[2][idx], &x2_4[1][idx], &x2_4[0][idx], &ms->w[pos], y); 74 | k512_idx += WORDS_IN_VEC; 75 | } 76 | } 77 | } 78 | 79 | _INLINE_ void rounds_64_79(sha512_state_t * cur_state, 80 | const sha512_msg_schedule_t *ms) 81 | { 82 | PRAGMA_LOOP_UNROLL_16 83 | 84 | for(size_t i = SHA512_FINAL_ROUND_START_IDX; i < SHA512_ROUNDS_NUM; i++) { 85 | sha_round(cur_state, ms->w[LSB4(i)], 0); 86 | } 87 | } 88 | 89 | _INLINE_ void process_extra_block(sha512_state_t * cur_state, 90 | const sha512_word_t t[SHA512_ROUNDS_NUM]) 91 | { 92 | PRAGMA_LOOP_UNROLL_80 93 | 94 | for(size_t i = 0; i < SHA512_ROUNDS_NUM; i++) { 95 | sha_round(cur_state, t[i], 0); 96 | } 97 | } 98 | 99 | void sha512_compress_x86_64_avx512(sha512_state_t *state, 100 | const uint8_t * data, 101 | size_t blocks_num) 102 | { 103 | ALIGN(64) sha512_msg_schedule_t ms; 104 | ALIGN(64) sha512_word_t x2_4[3][SHA512_ROUNDS_NUM]; 105 | sha512_state_t cur_state; 106 | vec_t x[MS_VEC_NUM]; 107 | 108 | const size_t rem = LSB2(blocks_num); 109 | if(rem != 0) { 110 | sha512_compress_x86_64_avx2(state, data, rem); 111 | data += rem * SHA512_BLOCK_BYTE_LEN; 112 | blocks_num -= rem; 113 | } 114 | 115 | // Process four blocks in parallel 116 | // Here blocks_num is divisible by 4 117 | for(size_t b = blocks_num; b != 0; b -= 4) { 118 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 119 | 120 | load_data(x, &ms, x2_4, data); 121 | data += 4 * SHA512_BLOCK_BYTE_LEN; 122 | 123 | // First block 124 | rounds_0_63(&cur_state, x, &ms, x2_4); 125 | rounds_64_79(&cur_state, &ms); 126 | accumulate_state(state, &cur_state); 127 | 128 | for(size_t i = 0; i <= 2; i++) { 129 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 130 | process_extra_block(&cur_state, x2_4[i]); 131 | accumulate_state(state, &cur_state); 132 | } 133 | } 134 | 135 | secure_clean(&cur_state, sizeof(cur_state)); 136 | secure_clean(&ms, sizeof(ms)); 137 | secure_clean(x2_4, sizeof(x2_4)); 138 | } 139 | -------------------------------------------------------------------------------- /src/sha512_compress_x86_64_avx_helper.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // An implementation of the compress function of SHA512 using avx/avx2/avx512 5 | // It was translated from assembly (OpenSSL) to C by 6 | // 7 | // Nir Drucker and Shay Gueron 8 | // AWS Cryptographic Algorithms Group.
9 | // (ndrucker@amazon.com, gueron@amazon.com) 10 | 11 | // This file depends on vec_t and on the following macros: 12 | // LOAD, ADD64, ALIGNR8, SRL64, SLL64 13 | 14 | #define SHA512_WORD_BIT_LEN (8 * sizeof(sha512_word_t)) 15 | 16 | _INLINE_ void rotate_x(vec_t x[8]) 17 | { 18 | const vec_t tmp = x[0]; 19 | 20 | for(size_t i = 0; i < 7; i++) { 21 | x[i] = x[i + 1]; 22 | } 23 | 24 | x[7] = tmp; 25 | } 26 | 27 | #ifndef ALTERNATIVE_AVX512_IMPL 28 | 29 | _INLINE_ vec_t sha512_update_x_avx(vec_t x[8], const sha512_word_t *K512_p) 30 | { 31 | vec_t t[4]; 32 | 33 | // This function receives 8 128-bit registers X[7:0]=q[15:0] and calculates: 34 | // s0 = sigma0(q[(i + 1) % 16]) 35 | // s1 = sigma1(q[(i + 14) % 16]) 36 | // q[i % 16] += s0 + s1 + q[(i + 9) % 16] 37 | // 38 | // For X[0]=q[3:0] 39 | // 40 | // This means that 41 | // res[0] depends on q[1] (for s0) q[14] (for s1) and q[9] 42 | // res[1] depends on q[2] (for s0) q[15] (for s1) and q[10] 43 | // res[2] depends on q[3] (for s0) res[0] (for s1) and q[11] 44 | // res[3] depends on q[4] (for s0) res[1] (for s1) and q[12] 45 | 46 | t[0] = ALIGNR8(x[1], x[0], 8); // q[2:1] 47 | t[3] = ALIGNR8(x[5], x[4], 8); // q[10:9] 48 | t[2] = SRL64(t[0], sigma0_0); // q[2:1] >> s0[0] 49 | x[0] = ADD64(x[0], t[3]); // q[1:0] + q[10:9] 50 | t[3] = SRL64(t[0], sigma0_2); // q[2:1] >> s0[2] 51 | t[1] = SLL64(t[0], SHA512_WORD_BIT_LEN - sigma0_1); // q[2:1] << (64 - s0[1]) 52 | t[0] = t[3] ^ t[2]; // (q[2:1] >> s0[2]) ^ 53 | // (q[2:1] >> s0[0]) 54 | t[2] = SRL64(t[2], sigma0_1 - sigma0_0); // q[2:1] >> s0[1] 55 | t[0] ^= t[1]; // (q[2:1] >> s0[2]) ^ 56 | // (q[2:1] >> s0[0]) ^ 57 | // q[2:1] << (64 - s0[1]) 58 | t[1] = SLL64(t[1], sigma0_1 - sigma0_0); // q[2:1] << (64 - s0[0]) 59 | t[0] ^= t[2] ^ t[1]; // sigma0(q[2:1]) 60 | t[3] = SRL64(x[7], sigma1_2); // q[15:14] >> s1[2] 61 | t[2] = SLL64(x[7], SHA512_WORD_BIT_LEN - sigma1_1); // q[15:14] << (64 - s1[1]) 62 | x[0] = ADD64(x[0], t[0]); // q[1:0] + sigma0(q[2:1]) 63 | t[1] = SRL64(x[7], sigma1_0); // q[15:14] >> s1[0] 64 | t[3] ^= t[2]; // q[15:14] >> s1[2] ^ 65 | // q[15:14] << (64 - s1[1]) 66 | t[2] = SLL64(t[2], sigma1_1 - sigma1_0); // q[15:14] << (64 - s1[0]) 67 | t[3] ^= t[1]; // q[15:14] >> s1[2] ^ 68 | // q[15:14] << (64 - s1[1]) ^ 69 | // q[15:14] >> s1[0] 70 | t[1] = SRL64(t[1], sigma1_1 - sigma1_0); // q[15:14] >> s1[1] 71 | t[3] ^= t[2] ^ t[1]; // sigma1(q[15:14]) 72 | 73 | // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1]) 74 | x[0] = ADD64(x[0], t[3]); 75 | 76 | rotate_x(x); 77 | 78 | return ADD64(x[7], LOAD(K512_p)); 79 | } 80 | 81 | #else 82 | 83 | _INLINE_ vec_t sha512_update_x_avx(vec_t x[8], const sha512_word_t *k512_p) 84 | { 85 | vec_t t[2]; 86 | vec_t s0; 87 | vec_t s1; 88 | 89 | // This function receives 8 wide registers X[7:0]=q[15:0] and calculates: 90 | // s0 = sigma0(q[2:1]) 91 | // s1 = sigma1(q[15:14]) 92 | // q[1:0] += s0 + s1 + q[10:9] 93 | 94 | t[0] = ALIGNR8(x[1], x[0], 8); // q[2:1] 95 | t[1] = ALIGNR8(x[5], x[4], 8); // q[10:9] 96 | s0 = ROR64(t[0], sigma0_0) ^ ROR64(t[0], sigma0_1) ^ SRL64(t[0], sigma0_2); 97 | s1 = ROR64(x[7], sigma1_0) ^ ROR64(x[7], sigma1_1) ^ SRL64(x[7], sigma1_2); 98 | x[0] = ADD64(ADD64(ADD64(x[0], s1), s0), t[1]); 99 | 100 | rotate_x(x); 101 | 102 | return ADD64(x[7], LOAD(k512_p)); 103 | } 104 | 105 | #endif 106 | -------------------------------------------------------------------------------- /src/sha512_consts.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc.
or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha512_defs.h" 5 | 6 | #define K512_0 UINT64_C(0x428a2f98d728ae22) 7 | #define K512_1 UINT64_C(0x7137449123ef65cd) 8 | #define K512_2 UINT64_C(0xb5c0fbcfec4d3b2f) 9 | #define K512_3 UINT64_C(0xe9b5dba58189dbbc) 10 | #define K512_4 UINT64_C(0x3956c25bf348b538) 11 | #define K512_5 UINT64_C(0x59f111f1b605d019) 12 | #define K512_6 UINT64_C(0x923f82a4af194f9b) 13 | #define K512_7 UINT64_C(0xab1c5ed5da6d8118) 14 | #define K512_8 UINT64_C(0xd807aa98a3030242) 15 | #define K512_9 UINT64_C(0x12835b0145706fbe) 16 | #define K512_10 UINT64_C(0x243185be4ee4b28c) 17 | #define K512_11 UINT64_C(0x550c7dc3d5ffb4e2) 18 | #define K512_12 UINT64_C(0x72be5d74f27b896f) 19 | #define K512_13 UINT64_C(0x80deb1fe3b1696b1) 20 | #define K512_14 UINT64_C(0x9bdc06a725c71235) 21 | #define K512_15 UINT64_C(0xc19bf174cf692694) 22 | #define K512_16 UINT64_C(0xe49b69c19ef14ad2) 23 | #define K512_17 UINT64_C(0xefbe4786384f25e3) 24 | #define K512_18 UINT64_C(0x0fc19dc68b8cd5b5) 25 | #define K512_19 UINT64_C(0x240ca1cc77ac9c65) 26 | #define K512_20 UINT64_C(0x2de92c6f592b0275) 27 | #define K512_21 UINT64_C(0x4a7484aa6ea6e483) 28 | #define K512_22 UINT64_C(0x5cb0a9dcbd41fbd4) 29 | #define K512_23 UINT64_C(0x76f988da831153b5) 30 | #define K512_24 UINT64_C(0x983e5152ee66dfab) 31 | #define K512_25 UINT64_C(0xa831c66d2db43210) 32 | #define K512_26 UINT64_C(0xb00327c898fb213f) 33 | #define K512_27 UINT64_C(0xbf597fc7beef0ee4) 34 | #define K512_28 UINT64_C(0xc6e00bf33da88fc2) 35 | #define K512_29 UINT64_C(0xd5a79147930aa725) 36 | #define K512_30 UINT64_C(0x06ca6351e003826f) 37 | #define K512_31 UINT64_C(0x142929670a0e6e70) 38 | #define K512_32 UINT64_C(0x27b70a8546d22ffc) 39 | #define K512_33 UINT64_C(0x2e1b21385c26c926) 40 | #define K512_34 UINT64_C(0x4d2c6dfc5ac42aed) 41 | #define K512_35 UINT64_C(0x53380d139d95b3df) 42 | #define K512_36 UINT64_C(0x650a73548baf63de) 43 | #define K512_37 UINT64_C(0x766a0abb3c77b2a8) 44 | #define K512_38 UINT64_C(0x81c2c92e47edaee6) 45 | #define K512_39 UINT64_C(0x92722c851482353b) 46 | #define K512_40 UINT64_C(0xa2bfe8a14cf10364) 47 | #define K512_41 UINT64_C(0xa81a664bbc423001) 48 | #define K512_42 UINT64_C(0xc24b8b70d0f89791) 49 | #define K512_43 UINT64_C(0xc76c51a30654be30) 50 | #define K512_44 UINT64_C(0xd192e819d6ef5218) 51 | #define K512_45 UINT64_C(0xd69906245565a910) 52 | #define K512_46 UINT64_C(0xf40e35855771202a) 53 | #define K512_47 UINT64_C(0x106aa07032bbd1b8) 54 | #define K512_48 UINT64_C(0x19a4c116b8d2d0c8) 55 | #define K512_49 UINT64_C(0x1e376c085141ab53) 56 | #define K512_50 UINT64_C(0x2748774cdf8eeb99) 57 | #define K512_51 UINT64_C(0x34b0bcb5e19b48a8) 58 | #define K512_52 UINT64_C(0x391c0cb3c5c95a63) 59 | #define K512_53 UINT64_C(0x4ed8aa4ae3418acb) 60 | #define K512_54 UINT64_C(0x5b9cca4f7763e373) 61 | #define K512_55 UINT64_C(0x682e6ff3d6b2b8a3) 62 | #define K512_56 UINT64_C(0x748f82ee5defb2fc) 63 | #define K512_57 UINT64_C(0x78a5636f43172f60) 64 | #define K512_58 UINT64_C(0x84c87814a1f0ab72) 65 | #define K512_59 UINT64_C(0x8cc702081a6439ec) 66 | #define K512_60 UINT64_C(0x90befffa23631e28) 67 | #define K512_61 UINT64_C(0xa4506cebde82bde9) 68 | #define K512_62 UINT64_C(0xbef9a3f7b2c67915) 69 | #define K512_63 UINT64_C(0xc67178f2e372532b) 70 | #define K512_64 UINT64_C(0xca273eceea26619c) 71 | #define K512_65 UINT64_C(0xd186b8c721c0c207) 72 | #define K512_66 UINT64_C(0xeada7dd6cde0eb1e) 73 | #define K512_67 UINT64_C(0xf57d4f7fee6ed178) 74 | #define K512_68 
UINT64_C(0x06f067aa72176fba) 75 | #define K512_69 UINT64_C(0x0a637dc5a2c898a6) 76 | #define K512_70 UINT64_C(0x113f9804bef90dae) 77 | #define K512_71 UINT64_C(0x1b710b35131c471b) 78 | #define K512_72 UINT64_C(0x28db77f523047d84) 79 | #define K512_73 UINT64_C(0x32caab7b40c72493) 80 | #define K512_74 UINT64_C(0x3c9ebe0a15c9bebc) 81 | #define K512_75 UINT64_C(0x431d67c49c100d4c) 82 | #define K512_76 UINT64_C(0x4cc5d4becb3e42b6) 83 | #define K512_77 UINT64_C(0x597f299cfc657e2a) 84 | #define K512_78 UINT64_C(0x5fcb6fab3ad6faec) 85 | #define K512_79 UINT64_C(0x6c44198c4a475817) 86 | 87 | ALIGN(64) 88 | const sha512_word_t K512[SHA512_ROUNDS_NUM] = { 89 | K512_0, K512_1, K512_2, K512_3, K512_4, K512_5, K512_6, K512_7, K512_8, 90 | K512_9, K512_10, K512_11, K512_12, K512_13, K512_14, K512_15, K512_16, K512_17, 91 | K512_18, K512_19, K512_20, K512_21, K512_22, K512_23, K512_24, K512_25, K512_26, 92 | K512_27, K512_28, K512_29, K512_30, K512_31, K512_32, K512_33, K512_34, K512_35, 93 | K512_36, K512_37, K512_38, K512_39, K512_40, K512_41, K512_42, K512_43, K512_44, 94 | K512_45, K512_46, K512_47, K512_48, K512_49, K512_50, K512_51, K512_52, K512_53, 95 | K512_54, K512_55, K512_56, K512_57, K512_58, K512_59, K512_60, K512_61, K512_62, 96 | K512_63, K512_64, K512_65, K512_66, K512_67, K512_68, K512_69, K512_70, K512_71, 97 | K512_72, K512_73, K512_74, K512_75, K512_76, K512_77, K512_78, K512_79, 98 | }; 99 | 100 | ALIGN(64) 101 | const sha512_word_t K512x2[2 * SHA512_ROUNDS_NUM] = { 102 | DUP2(K512_0, K512_1), DUP2(K512_2, K512_3), DUP2(K512_4, K512_5), 103 | DUP2(K512_6, K512_7), DUP2(K512_8, K512_9), DUP2(K512_10, K512_11), 104 | DUP2(K512_12, K512_13), DUP2(K512_14, K512_15), DUP2(K512_16, K512_17), 105 | DUP2(K512_18, K512_19), DUP2(K512_20, K512_21), DUP2(K512_22, K512_23), 106 | DUP2(K512_24, K512_25), DUP2(K512_26, K512_27), DUP2(K512_28, K512_29), 107 | DUP2(K512_30, K512_31), DUP2(K512_32, K512_33), DUP2(K512_34, K512_35), 108 | DUP2(K512_36, K512_37), DUP2(K512_38, K512_39), DUP2(K512_40, K512_41), 109 | DUP2(K512_42, K512_43), DUP2(K512_44, K512_45), DUP2(K512_46, K512_47), 110 | DUP2(K512_48, K512_49), DUP2(K512_50, K512_51), DUP2(K512_52, K512_53), 111 | DUP2(K512_54, K512_55), DUP2(K512_56, K512_57), DUP2(K512_58, K512_59), 112 | DUP2(K512_60, K512_61), DUP2(K512_62, K512_63), DUP2(K512_64, K512_65), 113 | DUP2(K512_66, K512_67), DUP2(K512_68, K512_69), DUP2(K512_70, K512_71), 114 | DUP2(K512_72, K512_73), DUP2(K512_74, K512_75), DUP2(K512_76, K512_77), 115 | DUP2(K512_78, K512_79), 116 | }; 117 | 118 | ALIGN(64) 119 | const sha512_word_t K512x4[4 * SHA512_ROUNDS_NUM] = { 120 | DUP4(K512_0, K512_1), DUP4(K512_2, K512_3), DUP4(K512_4, K512_5), 121 | DUP4(K512_6, K512_7), DUP4(K512_8, K512_9), DUP4(K512_10, K512_11), 122 | DUP4(K512_12, K512_13), DUP4(K512_14, K512_15), DUP4(K512_16, K512_17), 123 | DUP4(K512_18, K512_19), DUP4(K512_20, K512_21), DUP4(K512_22, K512_23), 124 | DUP4(K512_24, K512_25), DUP4(K512_26, K512_27), DUP4(K512_28, K512_29), 125 | DUP4(K512_30, K512_31), DUP4(K512_32, K512_33), DUP4(K512_34, K512_35), 126 | DUP4(K512_36, K512_37), DUP4(K512_38, K512_39), DUP4(K512_40, K512_41), 127 | DUP4(K512_42, K512_43), DUP4(K512_44, K512_45), DUP4(K512_46, K512_47), 128 | DUP4(K512_48, K512_49), DUP4(K512_50, K512_51), DUP4(K512_52, K512_53), 129 | DUP4(K512_54, K512_55), DUP4(K512_56, K512_57), DUP4(K512_58, K512_59), 130 | DUP4(K512_60, K512_61), DUP4(K512_62, K512_63), DUP4(K512_64, K512_65), 131 | DUP4(K512_66, K512_67), DUP4(K512_68, K512_69), DUP4(K512_70, K512_71), 132 | 
DUP4(K512_72, K512_73), DUP4(K512_74, K512_75), DUP4(K512_76, K512_77), 133 | DUP4(K512_78, K512_79), 134 | }; 135 | -------------------------------------------------------------------------------- /tests/main_speed.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | 7 | #include "measurements.h" 8 | #include "sha.h" 9 | #include "test.h" 10 | 11 | #define MAX_MSG_BYTE_LEN (65536UL) 12 | 13 | _INLINE_ void speed_sha256(void) 14 | { 15 | uint8_t dgst[SHA256_HASH_BYTE_LEN] = {0}; 16 | uint8_t data[MAX_MSG_BYTE_LEN] = {0}; 17 | 18 | // Use a deterministic seed. 19 | srand(0); 20 | rand_data(data, sizeof(data)); 21 | 22 | printf("\nSHA-256 Benchmark:"); 23 | printf("\n------------------\n"); 24 | printf(" msg generic"); 25 | 26 | // X86-64 specific options 27 | RUN_X86_64(printf(" avx (C) avx (ossl)");); 28 | RUN_AVX2(printf(" avx2 (C) avx2 (ossl)");); 29 | RUN_AVX512(printf(" avx512 (C)");); 30 | RUN_X86_64_SHA_EXT(printf(" sha ext (C) sha ext (ossl) \n");); 31 | 32 | // Aarch64 specific options 33 | RUN_NEON(printf(" neon (ossl)");); 34 | RUN_AARCH64_SHA_EXT(printf(" sha ext (C) sha ext (ossl) \n");); 35 | 36 | printf("\n"); 37 | for(size_t msg_byte_len = 1; msg_byte_len <= MAX_MSG_BYTE_LEN; 38 | msg_byte_len <<= 1) { 39 | 40 | printf("%5ld bytes", msg_byte_len); 41 | MEASURE(sha256(dgst, data, msg_byte_len, GENERIC_IMPL);); 42 | 43 | // X86-64 specific options 44 | RUN_X86_64(MEASURE(sha256(dgst, data, msg_byte_len, AVX_IMPL););); 45 | RUN_X86_64(MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_AVX_IMPL););); 46 | RUN_AVX2(MEASURE(sha256(dgst, data, msg_byte_len, AVX2_IMPL););); 47 | RUN_AVX2(MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_AVX2_IMPL););); 48 | RUN_AVX512(MEASURE(sha256(dgst, data, msg_byte_len, AVX512_IMPL););); 49 | RUN_X86_64_SHA_EXT(MEASURE(sha256(dgst, data, msg_byte_len, SHA_EXT_IMPL););); 50 | RUN_X86_64_SHA_EXT( 51 | MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_SHA_EXT_IMPL););); 52 | 53 | // Aarch64 specific options 54 | RUN_NEON(MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_NEON_IMPL););); 55 | RUN_AARCH64_SHA_EXT( 56 | MEASURE(sha256(dgst, data, msg_byte_len, SHA_EXT_IMPL););); 57 | RUN_AARCH64_SHA_EXT( 58 | MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_SHA_EXT_IMPL););); 59 | 60 | printf("\n"); 61 | } 62 | } 63 | 64 | _INLINE_ void speed_sha512(void) 65 | { 66 | uint8_t dgst[SHA512_HASH_BYTE_LEN] = {0}; 67 | uint8_t data[MAX_MSG_BYTE_LEN] = {0}; 68 | 69 | // Use a deterministic seed.
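// (A fixed seed makes every run hash the same pseudo-random buffer, so the
// reported cycle counts stay comparable across runs and machines.)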
70 | srand(0); 71 | rand_data(data, sizeof(data)); 72 | 73 | printf("\nSHA-512 Benchmark:"); 74 | printf("\n------------------\n"); 75 | printf(" msg generic"); 76 | 77 | // X86-64 specific options 78 | RUN_X86_64(printf(" avx (C) avx (ossl)");); 79 | RUN_AVX2(printf(" avx2 (C) avx2 (ossl)");); 80 | RUN_AVX512(printf(" avx512 (C)");); 81 | 82 | // Aarch64 specific options 83 | RUN_NEON(printf(" neon (ossl)");); 84 | 85 | printf("\n"); 86 | 87 | for(size_t msg_byte_len = 1; msg_byte_len <= MAX_MSG_BYTE_LEN; 88 | msg_byte_len <<= 1) { 89 | 90 | printf("%5ld bytes", msg_byte_len); 91 | MEASURE(sha512(dgst, data, msg_byte_len, GENERIC_IMPL);); 92 | 93 | // X86-64 specific options 94 | RUN_X86_64(MEASURE(sha512(dgst, data, msg_byte_len, AVX_IMPL););); 95 | RUN_X86_64(MEASURE(sha512(dgst, data, msg_byte_len, OPENSSL_AVX_IMPL););); 96 | RUN_AVX2(MEASURE(sha512(dgst, data, msg_byte_len, AVX2_IMPL););); 97 | RUN_AVX2(MEASURE(sha512(dgst, data, msg_byte_len, OPENSSL_AVX2_IMPL););); 98 | RUN_AVX512(MEASURE(sha512(dgst, data, msg_byte_len, AVX512_IMPL););); 99 | 100 | // Aarch64 specific options 101 | RUN_NEON(MEASURE(sha512(dgst, data, msg_byte_len, OPENSSL_NEON_IMPL););); 102 | 103 | printf("\n"); 104 | } 105 | } 106 | 107 | int main(void) 108 | { 109 | speed_sha256(); 110 | speed_sha512(); 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /tests/main_tests.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | 8 | #include <openssl/sha.h> 9 | 10 | #include "sha.h" 11 | #include "test.h" 12 | 13 | #define SHA256_TEST_MAX_MSG_BYTE_LEN (6400) 14 | #define SHA512_TEST_MAX_MSG_BYTE_LEN (12800) 15 | 16 | #if !defined(MONTE_CARLO_NUM_OF_TESTS) 17 | # define MONTE_CARLO_NUM_OF_TESTS (100000) 18 | #endif 19 | 20 | _INLINE_ int test_sha256_impl(IN const sha_impl_t impl, 21 | IN const uint8_t *data, 22 | IN const uint8_t *ref_dgst, 23 | IN const size_t byte_len) 24 | { 25 | uint8_t tst_dgst[SHA256_HASH_BYTE_LEN] = {0}; 26 | sha256(tst_dgst, data, byte_len, impl); 27 | 28 | if(0 != memcmp(ref_dgst, tst_dgst, SHA256_HASH_BYTE_LEN)) { 29 | printf("Digest mismatch for impl=%d and size=%ld\n", impl, byte_len); 30 | print(ref_dgst, SHA256_HASH_BYTE_LEN); 31 | print(tst_dgst, SHA256_HASH_BYTE_LEN); 32 | return FAILURE; 33 | } 34 | 35 | return SUCCESS; 36 | } 37 | 38 | _INLINE_ int test_sha256() 39 | { 40 | uint8_t ref_dgst[SHA256_HASH_BYTE_LEN] = {0}; 41 | uint8_t data[SHA256_TEST_MAX_MSG_BYTE_LEN] = {0}; 42 | 43 | // Use a deterministic seed.
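// (A fixed seed means a failing case can be reproduced exactly by rerunning
// the binary.)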
44 | srand(0); 45 | rand_data(data, sizeof(data)); 46 | 47 | printf("Testing SHA256 Short/Long tests\n"); 48 | 49 | for(size_t byte_len = 0; byte_len <= sizeof(data); byte_len++) { 50 | SHA256(data, byte_len, ref_dgst); 51 | 52 | GUARD(test_sha256_impl(GENERIC_IMPL, data, ref_dgst, byte_len)); 53 | 54 | // X86-64 specific options 55 | RUN_X86_64(GUARD(test_sha256_impl(AVX_IMPL, data, ref_dgst, byte_len));); 56 | RUN_AVX2(GUARD(test_sha256_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 57 | RUN_AVX512(GUARD(test_sha256_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 58 | RUN_X86_64_SHA_EXT( 59 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 60 | 61 | // Aarch64 specific options 62 | RUN_AARCH64_SHA_EXT( 63 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 64 | } 65 | 66 | printf("Testing SHA256 Monte Carlo tests\n"); 67 | 68 | // Perform 100,000 Monte Carlo tests. 69 | for(size_t i = 0; i < MONTE_CARLO_NUM_OF_TESTS; i++) { 70 | 71 | printf("\rTesting case=%ld", i); 72 | 73 | // Generate a random message and a reference digest. 74 | size_t byte_len = rand() % sizeof(data); 75 | rand_data(data, byte_len); 76 | SHA256(data, byte_len, ref_dgst); 77 | 78 | // X86-64 specific options 79 | RUN_X86_64(GUARD(test_sha256_impl(AVX_IMPL, data, ref_dgst, byte_len));); 80 | RUN_AVX2(GUARD(test_sha256_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 81 | RUN_AVX512(GUARD(test_sha256_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 82 | RUN_X86_64_SHA_EXT( 83 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 84 | 85 | // Aarch64 specific options 86 | RUN_AARCH64_SHA_EXT( 87 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 88 | } 89 | 90 | printf("\n"); 91 | return SUCCESS; 92 | } 93 | 94 | _INLINE_ int test_sha512_impl(IN const sha_impl_t impl, 95 | IN const uint8_t *data, 96 | IN const uint8_t *ref_dgst, 97 | IN const size_t byte_len) 98 | { 99 | uint8_t tst_dgst[SHA512_HASH_BYTE_LEN] = {0}; 100 | sha512(tst_dgst, data, byte_len, impl); 101 | 102 | if(0 != memcmp(ref_dgst, tst_dgst, SHA512_HASH_BYTE_LEN)) { 103 | printf("Digest mismatch for impl=%d and size=%ld\n", impl, byte_len); 104 | print(ref_dgst, SHA512_HASH_BYTE_LEN); 105 | print(tst_dgst, SHA512_HASH_BYTE_LEN); 106 | return FAILURE; 107 | } 108 | 109 | return SUCCESS; 110 | } 111 | 112 | _INLINE_ int test_sha512() 113 | { 114 | uint8_t ref_dgst[SHA512_HASH_BYTE_LEN] = {0}; 115 | uint8_t data[SHA512_TEST_MAX_MSG_BYTE_LEN] = {0}; 116 | 117 | // Use a deterministic seed. 118 | srand(0); 119 | rand_data(data, sizeof(data)); 120 | 121 | printf("Testing SHA512 Short/Long tests\n"); 122 | 123 | for(size_t byte_len = 0; byte_len <= sizeof(data); byte_len++) { 124 | SHA512(data, byte_len, ref_dgst); 125 | 126 | GUARD(test_sha512_impl(GENERIC_IMPL, data, ref_dgst, byte_len)); 127 | 128 | // X86-64 specific options 129 | RUN_X86_64(GUARD(test_sha512_impl(AVX_IMPL, data, ref_dgst, byte_len));); 130 | RUN_AVX2(GUARD(test_sha512_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 131 | RUN_AVX512(GUARD(test_sha512_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 132 | } 133 | 134 | printf("Testing SHA512 Monte Carlo tests\n"); 135 | 136 | // Perform 100,000 Monte Carlo tests. 137 | for(size_t i = 0; i < MONTE_CARLO_NUM_OF_TESTS; i++) { 138 | 139 | printf("\rTesting case=%ld", i); 140 | 141 | // Generate a random message and a reference digest. 
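// byte_len is drawn from [0, sizeof(data)), so zero-length messages are
// exercised as well.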
142 | size_t byte_len = rand() % sizeof(data); 143 | rand_data(data, byte_len); 144 | SHA512(data, byte_len, ref_dgst); 145 | 146 | GUARD(test_sha512_impl(GENERIC_IMPL, data, ref_dgst, byte_len)); 147 | 148 | // X86-64 specific options 149 | RUN_X86_64(GUARD(test_sha512_impl(AVX_IMPL, data, ref_dgst, byte_len));); 150 | RUN_AVX2(GUARD(test_sha512_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 151 | RUN_AVX512(GUARD(test_sha512_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 152 | } 153 | 154 | printf("\n"); 155 | return SUCCESS; 156 | } 157 | 158 | int main(void) 159 | { 160 | GUARD(test_sha256()); 161 | GUARD(test_sha512()); 162 | 163 | return 0; 164 | } 165 | -------------------------------------------------------------------------------- /tests/pre-commit-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Avoid removing the "build" directory if the script does not run from the 6 | # package root directory 7 | basedir=`pwd` 8 | if [[ ! -f "$basedir/tests/pre-commit-script.sh" ]]; then 9 | >&2 echo "Script does not run from the root directory" 10 | exit 0 11 | fi 12 | 13 | if [ $# -ne 0 ]; then 14 | # To speed up testing, when any argument is passed we set the number of 15 | # Monte Carlo tests to 10. This must not be used when committing code. 16 | monte="-DMONTE_CARLO_NUM_OF_TESTS=10" 17 | else 18 | # Use the default (100,000) 19 | monte="" 20 | fi 21 | 22 | # Clean previous build content 23 | rm -rf build; 24 | 25 | mkdir build; 26 | cd build; 27 | 28 | # Test clang-format 29 | cmake ..; make format; 30 | rm -rf * 31 | 32 | for method in "" "-DALTERNATIVE_AVX512_IMPL=1"; do 33 | # Test clang-tidy 34 | CC=clang-9 cmake $method -DCMAKE_C_CLANG_TIDY="clang-tidy-9;--fix-errors;--format-style=file" .. 35 | make -j20 36 | rm -rf * 37 | 38 | for flag in "" "-DTEST_SPEED=1" "-DASAN=1" "-DMSAN=1" "-DTSAN=1" "-DUBSAN=1" ; do 39 | CC=clang-9 cmake $method $flag $monte ..; 40 | make -j20 41 | ./sha-with-intrinsic 42 | rm -rf * 43 | done 44 | done 45 | --------------------------------------------------------------------------------
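A usage sketch for the script above (the "quick" argument is arbitrary; the script only checks whether any argument is present):

# From the repository root:
./tests/pre-commit-script.sh          # full run, 100,000 Monte Carlo tests
./tests/pre-commit-script.sh quick    # any argument: 10 Monte Carlo tests per build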
/tests/test.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #define SUCCESS 0 7 | #define FAILURE (-1) 8 | #define GUARD(x) \ 9 | do { \ 10 | if(SUCCESS != (x)) { \ 11 | return FAILURE; \ 12 | } \ 13 | } while(0) 14 | 15 | ///////////////////////////// 16 | // X86_64 specific options 17 | ///////////////////////////// 18 | 19 | #if defined(X86_64) 20 | # define RUN_X86_64(x) \ 21 | do { \ 22 | x \ 23 | } while(0) 24 | #else 25 | # define RUN_X86_64(x) 26 | #endif 27 | 28 | #if defined(AVX2_SUPPORT) 29 | # define RUN_AVX2(x) \ 30 | do { \ 31 | x \ 32 | } while(0) 33 | #else 34 | # define RUN_AVX2(x) 35 | #endif 36 | 37 | #if defined(AVX512_SUPPORT) 38 | # define RUN_AVX512(x) \ 39 | do { \ 40 | x \ 41 | } while(0) 42 | #else 43 | # define RUN_AVX512(x) 44 | #endif 45 | 46 | #if defined(X86_64_SHA_SUPPORT) 47 | # define RUN_X86_64_SHA_EXT(x) \ 48 | do { \ 49 | x \ 50 | } while(0) 51 | #else 52 | # define RUN_X86_64_SHA_EXT(x) 53 | #endif 54 | 55 | ///////////////////////////// 56 | // AARCH64 specific options 57 | ///////////////////////////// 58 | 59 | #if defined(NEON_SUPPORT) 60 | # define RUN_NEON(x) \ 61 | do { \ 62 | x \ 63 | } while(0) 64 | #else 65 | # define RUN_NEON(x) 66 | #endif 67 | 68 | #if defined(AARCH64_SHA_SUPPORT) 69 | # define RUN_AARCH64_SHA_EXT(x) \ 70 | do { \ 71 | x \ 72 | } while(0) 73 | #else 74 | # define RUN_AARCH64_SHA_EXT(x) 75 | #endif 76 | 77 | ///////////////////////////// 78 | // Inline utilities 79 | ///////////////////////////// 80 | 81 | _INLINE_ void print(const uint8_t *a, const int byte_len) 82 | { 83 | for(int i = byte_len - 1; i >= 0; i--) { 84 | printf("%.2x", a[i]); 85 | } 86 | printf("\n\n"); 87 | } 88 | 89 | _INLINE_ void rand_data(OUT uint8_t *out, IN const size_t byte_len) 90 | { 91 | for(size_t i = 0; i < byte_len; i++) { 92 | out[i] = rand(); 93 | } 94 | } 95 | --------------------------------------------------------------------------------
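A minimal caller sketch for the public one-shot API in include/sha.h (the sha512() signature, SHA512_HASH_BYTE_LEN, and GENERIC_IMPL all appear in the sources above; the snippet itself is illustrative and not part of the repository):

#include <stdio.h>
#include <string.h>

#include "sha.h"

int main(void)
{
  const uint8_t msg[] = "abc";
  uint8_t       dgst[SHA512_HASH_BYTE_LEN] = {0};

  // Hash the 3-byte message "abc" with the portable implementation.
  sha512(dgst, msg, strlen((const char *)msg), GENERIC_IMPL);

  for(size_t i = 0; i < SHA512_HASH_BYTE_LEN; i++) {
    printf("%02x", dgst[i]);
  }
  printf("\n");

  return 0;
}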