├── .clang-format ├── .clang-tidy ├── .gitignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── benchmark_example.md ├── cmake ├── arch.cmake ├── clang-format.cmake ├── compilation-flags.cmake ├── sources.cmake ├── test_aarch64_sha_ni.c ├── test_endianess.c ├── test_x86_64_avx2.c ├── test_x86_64_avx512.c └── test_x86_64_sha_ni.c ├── include ├── internal │ ├── avx2_defs.h │ ├── avx512_defs.h │ ├── avx_defs.h │ ├── defs.h │ ├── measurements.h │ ├── neon_defs.h │ ├── sha256_defs.h │ └── sha512_defs.h └── sha.h ├── src ├── openssl │ ├── README.md │ ├── linux │ │ ├── sha256-armv8.S │ │ ├── sha256-x86_64.s │ │ ├── sha512-armv8.S │ │ └── sha512-x86_64.s │ ├── macos │ │ ├── sha256-x86_64.s │ │ └── sha512-x86_64.s │ └── openssl_cpu_globals.c ├── sha256.c ├── sha256_compress_aarch64_sha_ext.c ├── sha256_compress_generic.c ├── sha256_compress_x86_64_avx.c ├── sha256_compress_x86_64_avx2.c ├── sha256_compress_x86_64_avx512.c ├── sha256_compress_x86_64_avx_helper.c ├── sha256_compress_x86_64_sha_ext.c ├── sha256_consts.c ├── sha512.c ├── sha512_compress_generic.c ├── sha512_compress_x86_64_avx.c ├── sha512_compress_x86_64_avx2.c ├── sha512_compress_x86_64_avx512.c ├── sha512_compress_x86_64_avx_helper.c └── sha512_consts.c └── tests ├── main_speed.c ├── main_tests.c ├── pre-commit-script.sh └── test.h /.clang-format: -------------------------------------------------------------------------------- 1 | AlignAfterOpenBracket: true 2 | AlignConsecutiveMacros: true 3 | AlignConsecutiveAssignments: true 4 | AlignConsecutiveDeclarations: true 5 | AlignEscapedNewlines: Left 6 | AlignTrailingComments: true 7 | AllowAllParametersOfDeclarationOnNextLine: true 8 | AllowAllArgumentsOnNextLine: false 9 | AllowShortCaseLabelsOnASingleLine: true 10 | AllowShortFunctionsOnASingleLine: true 11 | AllowShortIfStatementsOnASingleLine: true 12 | AllowShortLoopsOnASingleLine: true 13 | AlwaysBreakBeforeMultilineStrings: false 14 | AlwaysBreakAfterReturnType: None 15 | BinPackParameters: false 16 | BreakBeforeBraces: Custom 17 | BraceWrapping: 18 | AfterCaseLabel: false 19 | AfterControlStatement: false 20 | AfterEnum: true 21 | AfterExternBlock: false 22 | AfterFunction: true 23 | AfterNamespace: false 24 | AfterStruct: false 25 | AfterUnion: false 26 | BeforeElse: false 27 | SplitEmptyFunction: false 28 | BreakBeforeBinaryOperators: false 29 | ColumnLimit: 82 30 | ContinuationIndentWidth: 2 31 | DerivePointerAlignment: false 32 | IndentCaseLabels: true 33 | IndentPPDirectives: AfterHash 34 | IndentWidth: 2 35 | IndentWrappedFunctionNames: false 36 | MaxEmptyLinesToKeep: 1 37 | NamespaceIndentation: None 38 | PenaltyReturnTypeOnItsOwnLine: 25 39 | PointerAlignment: Right 40 | ReflowComments: true 41 | SpaceAfterCStyleCast: false 42 | SpaceBeforeAssignmentOperators: true 43 | SpaceBeforeParens: Never 44 | SpaceInEmptyParentheses: false 45 | SpacesBeforeTrailingComments: 1 46 | SpacesInContainerLiterals: false 47 | SortIncludes: true 48 | UseTab: Never 49 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | # We remove the cert* checks that are related to rand() and srand() 2 | 3 | Checks: '-*, 4 | bugprone-*, 5 | cert-*, 6 | -cert-msc50-cpp, 7 | -cert-msc51-cpp, 8 | -cert-msc30-c, 9 | -cert-msc32-c, 10 | darwin-*, 11 | hicpp-*, 12 | -hicpp-signed-bitwise, 13 | -hicpp-no-assembler, 14 | misc-*, 15 | readability-*' 16 | 17 | WarningsAsErrors: '*' 
18 | HeaderFilterRegex: '.*' 19 | FormatStyle: 'file' 20 | CheckOptions: 21 | - key: bugprone-argument-comment.StrictMode 22 | value: '1' 23 | - key: bugprone-argument-comment.CommentBoolLiterals 24 | value: '1' 25 | - key: bugprone-argument-comment.CommentIntegerLiterals 26 | value: '0' 27 | - key: bugprone-argument-comment.CommentFloatLiterals 28 | value: '1' 29 | - key: bugprone-argument-comment.CommentCharacterLiterals 30 | value: '1' 31 | - key: bugprone-argument-comment.CommentUserDefinedLiterals 32 | value: '1' 33 | - key: bugprone-argument-comment.CommentNullPtrs 34 | value: '1' 35 | - key: bugprone-misplaced-widening-cast.CheckImplicitCasts 36 | value: '1' 37 | - key: bugprone-sizeof-expression.WarnOnSizeOfConstant 38 | value: '1' 39 | - key: bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression 40 | value: '1' 41 | - key: bugprone-sizeof-expression.WarnOnSizeOfCompareToConstant 42 | value: '1' 43 | - key: bugprone-suspicious-string-compare.WarnOnImplicitComparison 44 | value: '1' 45 | - key: bugprone-suspicious-string-compare.WarnOnLogicalNotComparison 46 | value: '1' 47 | - key: bugprone-suspicious-string-compare.StringCompareLikeFunctions 48 | value: '1' 49 | - key: google-runtime-int.TypeSuffix 50 | value: '_t' 51 | - key: readability-magic-numbers.IgnoredIntegerValues 52 | value: '0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15' 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc.
Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | # CMake compilation dir 55 | build 56 | 57 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0.0) 2 | project (sha-with-intrinsic C ASM) 3 | 4 | set(INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) 5 | set(SRC_DIR ${PROJECT_SOURCE_DIR}/src) 6 | set(TESTS_DIR ${PROJECT_SOURCE_DIR}/tests) 7 | 8 | include_directories(${INCLUDE_DIR}) 9 | include_directories(${INCLUDE_DIR}/internal) 10 | 11 | include(cmake/arch.cmake) 12 | 13 | include(cmake/compilation-flags.cmake) 14 | 15 | # Depends on SRC_DIR 16 | # and on arch.cmake 17 | include(cmake/sources.cmake) 18 | 19 | include(cmake/clang-format.cmake) 20 | 21 | set(OPENSSL_USE_STATIC_LIBS TRUE) 22 | find_package(OpenSSL REQUIRED) 23 | 24 | add_executable(${PROJECT_NAME} 25 | 26 | ${SHA_SOURCES} 27 | ${OPENSSL_SOURCES} 28 | ${MAIN_SOURCE} 29 | ) 30 | target_link_libraries(${PROJECT_NAME} OpenSSL::Crypto) 31 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. 
You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to work on. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License.
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sha2-with-intrinsic 2 | 3 | This sample code package provides optimized implementations of SHA256 and SHA512. 4 | 5 | The code is written by Nir Drucker and Shay Gueron, AWS Cryptographic Algorithms Group. 6 | 7 | While C code is easier to maintain and review, the performance obtained by compilation (e.g., with gcc-9 and clang-9) is often slower than that of hand-written assembly code (e.g., the code in this example). This sample code is made publicly available to help compiler designers understand this use case by reviewing the code and its generated assembly. We hope this information will improve compilers' ability to generate efficient assembly. 8 | 9 | This sample code provides testing binaries but no shared or static libraries. This is because the code is designed to be used for benchmarking purposes only, not in final products. 10 | 11 | The x86-64 AVX code is based on the paper: 12 | - Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the computations of hash functions.
J Cryptogr Eng 2, 241–253 (2012). https://doi.org/10.1007/s13389-012-0037-z 13 | Some parts of the code were translated from (Perl) assembly (OpenSSL commit [13c5d744](https://github.com/openssl/openssl/tree/e32c608e0733d5b295c9aa119153133413c5d744)) to C. 14 | 15 | The code version that uses Intel SHA Extensions instructions is based on the following reference: 16 | - https://software.intel.com/en-us/articles/intel-sha-extensions 17 | 18 | ## License 19 | 20 | This project is licensed under the Apache-2.0 License. 21 | 22 | Dependencies 23 | ----- 24 | This package requires 25 | - CMake 3 and above 26 | - A compiler that supports the required C intrinsics (e.g., AVX/AVX2/AVX512/SHA_NI on x86-64 machines). For example, GCC-9 and Clang-9. 27 | - An installation of OpenSSL for testing 28 | 29 | BUILD 30 | ----- 31 | 32 | To build, first create a working directory 33 | ``` 34 | mkdir build 35 | cd build 36 | ``` 37 | 38 | Then, run CMake and compile 39 | ``` 40 | cmake -DCMAKE_BUILD_TYPE=Release .. 41 | make 42 | ``` 43 | 44 | Additional CMake compilation flags: 45 | - TEST_SPEED - Measures and prints the performance in cycles 46 | - ALTERNATIVE_AVX512_IMPL - The X86-64 AVX512 extension provides a rotate intrinsic. Setting this flag tells the AVX/AVX2/AVX512 implementations to use this intrinsic; to test this implementation, the binary must be compiled with this flag set. 47 | - DONT_USE_UNROLL_PRAGMA - The code uses the unroll pragma by default. Use this flag to disable it. 48 | - ASAN/MSAN/TSAN/UBSAN - Compile with the Address/Memory/Thread/Undefined-Behaviour sanitizer, respectively. 49 | - MONTE_CARLO_NUM_OF_TESTS - Set the number of Monte Carlo tests (default: 100,000) 50 | 51 | To clean - remove the `build` directory. Note that a "clean" is required prior to compilation with modified flags. 52 | 53 | To format (`clang-format-9` or above is required): 54 | 55 | `make format` 56 | 57 | To use clang-tidy (`clang-tidy-9` is required): 58 | 59 | ``` 60 | CC=clang-9 cmake -DCMAKE_C_CLANG_TIDY="clang-tidy-9;--fix-errors;--format-style=file" .. 61 | make 62 | ``` 63 | 64 | Before committing code, please test it using 65 | `tests/pre-commit-script.sh` 66 | This will run all the sanitizers and also `clang-format` and `clang-tidy` (requires clang-9 to be installed). 67 | 68 | The package was compiled and tested with gcc-9 and clang-9 in 64-bit mode. 69 | Tests were run on a Linux (Ubuntu 18.04.4 LTS) OS on x86-64 and AARCH64 machines. 70 | Compilation on other platforms may require some adjustments. 71 | 72 | Performance measurements 73 | ------------------------ 74 | When using the TEST_SPEED flag the performance measurements are reported in processor cycles (per single core). The results are obtained using the following methodology. Each measured function was isolated, run 25 times (warm-up), followed by 100 iterations that were clocked and averaged. To minimize the effect of background tasks running on the system, every experiment was repeated 10 times, and the minimum result is reported. 75 | 76 | The library reports results only for code supported by the OS/compiler. It also compares the results of the C-with-intrinsics code to the assembly code of OpenSSL commit [13c5d744](https://github.com/openssl/openssl/tree/e32c608e0733d5b295c9aa119153133413c5d744) (see [here](/src/openssl/README.md) for more details). 77 | 78 | A benchmark example is found [here](benchmark_example.md). 79 |
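For example, a typical benchmarking configuration (a sketch; TEST_SPEED and ALTERNATIVE_AVX512_IMPL are the CMake flags listed above, passed as cache variables) is:
```
cmake -DCMAKE_BUILD_TYPE=Release -DTEST_SPEED=1 -DALTERNATIVE_AVX512_IMPL=1 ..
make
```
80 | Testing 81 | ------- 82 | - The library uses OpenSSL for its testing.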
It compares the results of running its SHA256/SHA512 implementation to the OpenSSL results on strings of different lengths (0-1000 bytes). 83 | - The library was run using Address/Memory/Thread/Undefined-Behaviour sanitizers. 84 | -------------------------------------------------------------------------------- /benchmark_example.md: -------------------------------------------------------------------------------- 1 | A benchmark example on a Dell XPS 13 7390 2-in-1 laptop. It has a 10th generation Intel(R) Core(TM) processor (microarchitecture codename "Ice Lake"[ICL]). The specifics are Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz. This platform has 16 GB RAM, 48K L1d cache, 32K L1i cache, 512K L2 cache, and 8MiB L3 cache. The Intel(R) Turbo Boost Technology was turned off. 2 | The code was compiled with clang-9 and run on Ubuntu 18.04.2 LTS. 3 | The results are in CPU cycles. 4 | 5 | SHA-256 Benchmark: 6 | ------------------ 7 | ``` 8 | msg generic avx (C) avx (ossl) avx2 (C) avx2 (ossl) avx512 (C) sha ext (C) sha ext (ossl) 9 | 1 bytes 1024 791 754 821 724 879 287 288 10 | 2 bytes 1026 792 754 823 724 882 287 288 11 | 4 bytes 1024 791 753 823 725 878 288 288 12 | 8 bytes 1027 792 753 824 724 877 287 288 13 | 16 bytes 1023 777 728 820 702 877 288 283 14 | 32 bytes 1023 782 726 814 699 879 281 280 15 | 64 bytes 1992 1518 1394 1582 1342 1708 424 446 16 | 128 bytes 2906 2244 2051 2247 1918 2360 605 644 17 | 256 bytes 4721 3693 3363 3620 3121 3621 956 1027 18 | 512 bytes 8381 6595 5987 6373 5522 6318 1666 1797 19 | 1024 bytes 15566 12409 11236 11892 10339 11728 3104 3340 20 | 2048 bytes 29955 24038 21753 22925 19961 22507 5968 6424 21 | 4096 bytes 58970 47377 42843 45037 39326 44219 11692 12594 22 | 8192 bytes 116991 94007 84981 89160 78060 87494 23148 24936 23 | 16384 bytes 232664 187280 169477 177774 154960 174157 45780 49741 24 | 32768 bytes 464254 373359 337247 354136 309742 346105 91667 99088 25 | 65536 bytes 927237 747417 675843 709146 620729 694132 183685 198528 26 | ``` 27 | 28 | SHA-512 Benchmark: 29 | ------------------ 30 | ``` 31 | msg generic avx (C) avx (ossl) avx2 (C) avx2 (ossl) avx512 (C) 32 | 1 bytes 1428 1026 972 1062 961 1139 33 | 2 bytes 1432 1023 969 1066 957 1138 34 | 4 bytes 1432 1025 973 1063 955 1138 35 | 8 bytes 1433 1026 973 1063 955 1138 36 | 16 bytes 1431 1018 969 1063 955 1137 37 | 32 bytes 1420 1019 967 1060 951 1135 38 | 64 bytes 1418 1019 962 1058 949 1132 39 | 128 bytes 2766 1963 1843 2042 1823 2211 40 | 256 bytes 4084 2891 2692 2951 2537 3114 41 | 512 bytes 6710 4738 4388 4725 4096 4810 42 | 1024 bytes 11932 8436 7779 8291 7206 8379 43 | 2048 bytes 22472 15837 14570 15389 13456 15458 44 | 4096 bytes 43792 30662 28283 29798 26022 29482 45 | 8192 bytes 85757 60456 55429 58261 50945 57625 46 | 16384 bytes 169996 119761 109960 114952 101207 113801 47 | 32768 bytes 338123 238652 219153 228957 201284 226530 48 | 65536 bytes 675827 476242 437028 456221 402204 450687 49 | ```
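For scale, the 65536-byte SHA-256 row above corresponds to roughly 927237/65536 ≈ 14.1 cycles per byte for the generic C code, versus 183685/65536 ≈ 2.8 cycles per byte for the SHA-extension code.
50 | -------------------------------------------------------------------------------- /cmake/arch.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.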
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|amd64|AMD64)$") 5 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DX86_64") 6 | set(X86_64 1) 7 | elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(aarch64|arm64|arm64e)$") 8 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAARCH64 -DNEON_SUPPORT") 9 | set(AARCH64 1) 10 | endif() 11 | 12 | # Only little endian systems are supported 13 | try_run(RUN_RESULT COMPILE_RESULT 14 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_endianess.c" 15 | COMPILE_DEFINITIONS "-Werror -Wall -Wpedantic" 16 | OUTPUT_VARIABLE OUTPUT 17 | ) 18 | 19 | if((NOT ${COMPILE_RESULT}) OR (NOT RUN_RESULT EQUAL 0)) 20 | message(FATAL_ERROR "Only little endian systems are supported") 21 | endif() 22 | 23 | if(X86_64) 24 | # Test AVX2 25 | try_run(RUN_RESULT COMPILE_RESULT 26 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_x86_64_avx2.c" 27 | COMPILE_DEFINITIONS "-march=native -Werror -Wall -Wpedantic" 28 | OUTPUT_VARIABLE OUTPUT 29 | ) 30 | 31 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 32 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAVX2_SUPPORT") 33 | set(AVX2 1) 34 | else() 35 | message(STATUS "The AVX2 implementation is not supported") 36 | endif() 37 | 38 | # Test AVX512 39 | try_run(RUN_RESULT COMPILE_RESULT 40 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_x86_64_avx512.c" 41 | COMPILE_DEFINITIONS "-march=native -Werror -Wall -Wpedantic" 42 | OUTPUT_VARIABLE OUTPUT 43 | ) 44 | 45 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 46 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAVX512_SUPPORT") 47 | set(AVX512 1) 48 | else() 49 | message(STATUS "The AVX512 implementation is not supported") 50 | endif() 51 | 52 | # Test SHA extension 53 | try_run(RUN_RESULT COMPILE_RESULT 54 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_x86_64_sha_ni.c" 55 | COMPILE_DEFINITIONS "-march=native -Werror -Wall -Wpedantic" 56 | OUTPUT_VARIABLE OUTPUT 57 | ) 58 | 59 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 60 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DX86_64_SHA_SUPPORT") 61 | set(SHA_EXT 1) 62 | else() 63 | message(STATUS "The SHA_EXT implementation is not supported") 64 | endif() 65 | endif() 66 | 67 | if(AARCH64) 68 | # Test SHA extensions 69 | try_run(RUN_RESULT COMPILE_RESULT 70 | "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/cmake/test_aarch64_sha_ni.c" 71 | COMPILE_DEFINITIONS "-I${INCLUDE_DIR}/internal -mcpu=native -Werror -Wall -Wpedantic" 72 | OUTPUT_VARIABLE OUTPUT 73 | ) 74 | 75 | if(${COMPILE_RESULT} AND (RUN_RESULT EQUAL 0)) 76 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAARCH64_SHA_SUPPORT") 77 | set(SHA_EXT 1) 78 | else() 79 | message(STATUS "The SHA_EXT implementation is not supported") 80 | endif() 81 | endif() 82 | -------------------------------------------------------------------------------- /cmake/clang-format.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Some of the definitions in .clang-format require clang-format-9 and above.
5 | find_program(CLANG_FORMAT 6 | NAMES 7 | clang-format-11 8 | clang-format-10 9 | clang-format-9 10 | clang-format) 11 | 12 | IF(CLANG_FORMAT) 13 | # Get the major version of clang-format 14 | # CLANG_FORMAT_VERSION should be in the format "clang-format version [Major].[Minor].[Patch] " 15 | exec_program(${CLANG_FORMAT} ${CMAKE_CURRENT_SOURCE_DIR} ARGS --version OUTPUT_VARIABLE CLANG_FORMAT_VERSION) 16 | STRING(REGEX REPLACE ".* ([0-9]+)\\.[0-9]+\\.[0-9]+.*" "\\1" CLANG_FORMAT_MAJOR_VERSION ${CLANG_FORMAT_VERSION}) 17 | 18 | message(STATUS "Found version ${CLANG_FORMAT_MAJOR_VERSION} of clang-format.") 19 | if(${CLANG_FORMAT_MAJOR_VERSION} LESS "9") 20 | message(STATUS "To run the format target clang-format version >= 9 is required.") 21 | else() 22 | set(CLANG_FORMAT_FILE_TYPES ${CLANG_FORMAT_FILE_TYPES} ) 23 | file(GLOB_RECURSE CF_FILES1 ${SRC_DIR}/*.c ${SRC_DIR}/crypto/*.h) 24 | file(GLOB_RECURSE CF_FILES2 ${INCLUDE_DIR}/*.h ${INCLUDE_DIR}/internal/*.h) 25 | file(GLOB_RECURSE CF_FILES3 ${TESTS_DIR}/*.c ${TESTS_DIR}/crypto/*.h) 26 | set(FILES_TO_FORMAT "${CF_FILES1}" "${CF_FILES2}" "${CF_FILES3}") 27 | 28 | ADD_CUSTOM_TARGET( 29 | format 30 | COMMAND ${CLANG_FORMAT} -i -style=file ${FILES_TO_FORMAT} 31 | COMMENT "Clang-formatting all (*.c/*.h) source files" 32 | ) 33 | endif() 34 | else() 35 | message(STATUS "Did not find clang-format.") 36 | endif() 37 | -------------------------------------------------------------------------------- /cmake/compilation-flags.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | if(CMAKE_C_COMPILER_ID MATCHES "Clang") 5 | set(CLANG 1) 6 | endif() 7 | 8 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ggdb -O3 -fPIC -std=c99") 9 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden -Wall -Wextra -Werror -Wpedantic") 10 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wunused -Wcomment -Wchar-subscripts -Wuninitialized -Wshadow") 11 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wwrite-strings -Wformat-security -Wcast-qual -Wunused-result") 12 | 13 | if(X86_64) 14 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mno-red-zone") 15 | else() 16 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native") 17 | endif() 18 | 19 | # Avoiding GCC 4.8 bug 20 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-missing-braces -Wno-missing-field-initializers") 21 | 22 | if(CLANG) 23 | # CMake passes the `-isystem` flag to clang for assembly files. 24 | # Currently clang does not recognize it.
25 | set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wno-error=unused-command-line-argument") 26 | endif () 27 | 28 | set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ggdb -fPIC -Wall -Wextra -Werror -Wpedantic") 29 | 30 | if(MSAN) 31 | if(NOT CLANG) 32 | message(FATAL_ERROR "Cannot enable MSAN unless using Clang") 33 | endif() 34 | 35 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins -fno-omit-frame-pointer") 36 | endif() 37 | 38 | if(ASAN) 39 | if(NOT CLANG) 40 | message(FATAL_ERROR "Cannot enable ASAN unless using Clang") 41 | endif() 42 | 43 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fsanitize-address-use-after-scope -fno-omit-frame-pointer") 44 | endif() 45 | 46 | if(TSAN) 47 | if(NOT CLANG) 48 | message(FATAL_ERROR "Cannot enable TSAN unless using Clang") 49 | endif() 50 | 51 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") 52 | endif() 53 | 54 | if(UBSAN) 55 | if(NOT CLANG) 56 | message(FATAL_ERROR "Cannot enable UBSAN unless using Clang") 57 | endif() 58 | 59 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") 60 | endif() 61 | 62 | if(TEST_SPEED) 63 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTEST_SPEED -DRTDSC") 64 | endif() 65 | 66 | if(ALTERNATIVE_AVX512_IMPL) 67 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DALTERNATIVE_AVX512_IMPL") 68 | endif() 69 | 70 | if(DONT_USE_UNROLL_PRAGMA) 71 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DDONT_USE_UNROLL_PRAGMA") 72 | endif() 73 | 74 | if(MONTE_CARLO_NUM_OF_TESTS) 75 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DMONTE_CARLO_NUM_OF_TESTS=${MONTE_CARLO_NUM_OF_TESTS}") 76 | endif() 77 | -------------------------------------------------------------------------------- /cmake/sources.cmake: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | set(SHA_SOURCES 5 | ${SRC_DIR}/sha256.c 6 | ${SRC_DIR}/sha256_consts.c 7 | ${SRC_DIR}/sha256_compress_generic.c 8 | 9 | ${SRC_DIR}/sha512.c 10 | ${SRC_DIR}/sha512_consts.c 11 | ${SRC_DIR}/sha512_compress_generic.c 12 | ) 13 | 14 | set(OPENSSL_DIR ${SRC_DIR}/openssl) 15 | 16 | if(APPLE) 17 | set(OPENSSL_ASM_DIR ${OPENSSL_DIR}/macos) 18 | else() 19 | set(OPENSSL_ASM_DIR ${OPENSSL_DIR}/linux) 20 | endif() 21 | 22 | set(OPENSSL_SOURCES 23 | ${OPENSSL_DIR}/openssl_cpu_globals.c 24 | ) 25 | 26 | if(X86_64) 27 | set(SHA_SOURCES ${SHA_SOURCES} 28 | ${SRC_DIR}/sha256_compress_x86_64_avx.c 29 | ${SRC_DIR}/sha512_compress_x86_64_avx.c 30 | ) 31 | 32 | if(AVX2) 33 | set(SHA_SOURCES ${SHA_SOURCES} 34 | ${SRC_DIR}/sha256_compress_x86_64_avx2.c 35 | ${SRC_DIR}/sha512_compress_x86_64_avx2.c 36 | ) 37 | endif() 38 | 39 | if(AVX512) 40 | set(SHA_SOURCES ${SHA_SOURCES} 41 | ${SRC_DIR}/sha256_compress_x86_64_avx512.c 42 | ${SRC_DIR}/sha512_compress_x86_64_avx512.c 43 | ) 44 | endif() 45 | 46 | if(SHA_EXT) 47 | set(SHA_SOURCES ${SHA_SOURCES} 48 | ${SRC_DIR}/sha256_compress_x86_64_sha_ext.c 49 | ) 50 | endif() 51 | 52 | set(OPENSSL_SOURCES ${OPENSSL_SOURCES} 53 | ${OPENSSL_ASM_DIR}/sha256-x86_64.s 54 | ${OPENSSL_ASM_DIR}/sha512-x86_64.s 55 | ) 56 | endif() 57 | 58 | if(AARCH64) 59 | if(SHA_EXT) 60 | set(SHA_SOURCES ${SHA_SOURCES} 61 | ${SRC_DIR}/sha256_compress_aarch64_sha_ext.c 62 | ) 63 | endif() 64 | 65 | set(OPENSSL_SOURCES ${OPENSSL_SOURCES} 66 | ${OPENSSL_ASM_DIR}/sha256-armv8.S 67 | ${OPENSSL_ASM_DIR}/sha512-armv8.S 68 | ) 69 | endif() 70 | 71 | if(TEST_SPEED) 72 | set(MAIN_SOURCE ${TESTS_DIR}/main_speed.c) 73 | else() 74 | set(MAIN_SOURCE ${TESTS_DIR}/main_tests.c) 75 | endif() 76 | -------------------------------------------------------------------------------- /cmake/test_aarch64_sha_ni.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdint.h> 5 | #include "neon_defs.h" 6 | 7 | int main(void) 8 | { 9 | uint8_t data[8*16*4]; 10 | uint32x4_t TMP[2] = {0}; 11 | 12 | // Check for vld1q_u8_x4 intrinsic 13 | uint8x16x4_t d = vld1q_u8_x4(data); 14 | TMP[0] = vreinterpretq_u32_u8(vrev32q_u8(d.val[0])); 15 | 16 | uint8x16x2_t d0 = vld1q_u8_x2(data); 17 | TMP[1] = vreinterpretq_u32_u8(vrev32q_u8(d0.val[0])); 18 | 19 | // Check for vsha256h2q_u32 intrinsic 20 | vsha256h2q_u32(TMP[0], TMP[1], TMP[0]); 21 | } 22 | -------------------------------------------------------------------------------- /cmake/test_endianess.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdint.h> 5 | #include <stdio.h> 6 | 7 | int main(void) 8 | { 9 | uint16_t uint_with_2_bytes = 0x0001; 10 | if (sizeof(uint_with_2_bytes) != 2) { 11 | printf("Undefined behaviour.\n"); 12 | return 1; 13 | } 14 | 15 | uint8_t *byte_array = (uint8_t*)&uint_with_2_bytes; 16 | if (byte_array[0] != 1) { 17 | printf("The code does not support big endian systems.\n"); 18 | return 1; 19 | } 20 | 21 | printf("A little endian system.\n"); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /cmake/test_x86_64_avx2.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates.
All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <immintrin.h> 5 | #include <stdint.h> 6 | 7 | int main(void) 8 | { 9 | __m256i reg; 10 | uint64_t mem[4]; 11 | reg = _mm256_loadu_si256((const __m256i*)mem); 12 | _mm256_storeu_si256((__m256i*)mem, reg); 13 | } 14 | -------------------------------------------------------------------------------- /cmake/test_x86_64_avx512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <immintrin.h> 5 | #include <stdint.h> 6 | 7 | int main(void) 8 | { 9 | __m512i reg; 10 | uint64_t mem[8]; 11 | reg = _mm512_loadu_si512((const __m512i*)mem); 12 | _mm512_storeu_si512((__m512i*)mem, reg); 13 | } 14 | -------------------------------------------------------------------------------- /cmake/test_x86_64_sha_ni.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <immintrin.h> 5 | 6 | int main(void) 7 | { 8 | __m128i a = _mm_setzero_si128(); 9 | _mm_sha256msg1_epu32(a, a); 10 | } 11 | -------------------------------------------------------------------------------- /include/internal/avx2_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <immintrin.h> 7 | 8 | typedef __m256i vec_t; 9 | 10 | #define ADD32(a, b) (_mm256_add_epi32(a, b)) 11 | #define ADD64(a, b) (_mm256_add_epi64(a, b)) 12 | #define ALIGNR8(a, b, mask) (_mm256_alignr_epi8(a, b, mask)) 13 | #define LOAD(mem) (_mm256_loadu_si256((const __m256i *)(mem))) 14 | #define MADD32(src, imm8, a, b) (_mm256_mask_add_epi32(src, imm8, a, b)) 15 | #define ROR32(a, imm8) (_mm256_ror_epi32(a, imm8)) 16 | #define ROR64(a, imm8) (_mm256_ror_epi64(a, imm8)) 17 | #define SHUF8(a, mask) (_mm256_shuffle_epi8(a, mask)) 18 | #define SHUF32(a, mask) (_mm256_shuffle_epi32(a, mask)) 19 | #define SLL32(a, imm8) (_mm256_slli_epi32(a, imm8)) 20 | #define SLL64(a, imm8) (_mm256_slli_epi64(a, imm8)) 21 | #define SRL32(a, imm8) (_mm256_srli_epi32(a, imm8)) 22 | #define SRL64(a, imm8) (_mm256_srli_epi64(a, imm8)) 23 | #define STORE(mem, reg) (_mm256_store_si256((__m256i *)(mem), reg)) 24 | 25 | #define LOAD128(mem) (_mm_loadu_si128((const __m128i *)(mem))) 26 | #define STORE128(mem, reg) (_mm_store_si128((__m128i *)(mem), reg)) 27 | 28 | // The _mm256_storeu2_m128i and _mm256_loadu2_m128i APIs are defined in Clang but 29 | // not in GCC 30 | #if defined(__clang__) 31 | # define STOREU2(hi_mem, lo_mem, reg) \ 32 | (_mm256_storeu2_m128i((__m128i *)(hi_mem), (__m128i *)(lo_mem), reg)) 33 | 34 | # define LOADU2(hi_mem, lo_mem, reg) \ 35 | ((reg) = _mm256_loadu2_m128i((const __m128i *)(hi_mem), \ 36 | (const __m128i *)(lo_mem))) 37 | 38 | #else 39 | # define STOREU2(hi_mem, lo_mem, reg) \ 40 | do { \ 41 | STORE128(lo_mem, _mm256_extracti128_si256(reg, 0)); \ 42 | STORE128(hi_mem, _mm256_extracti128_si256(reg, 1)); \ 43 | } while(0) 44 | 45 | # define LOADU2(hi_mem, lo_mem, reg) \ 46 | do { \ 47 | reg = _mm256_insertf128_si256(reg, LOAD128(hi_mem), 1); \ 48 | reg = _mm256_insertf128_si256(reg, LOAD128(lo_mem), 0); \ 49 | } while(0) 50 | #endif 51 |
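// Usage sketch (illustrative only, not used by the library): gather the same
// 16-byte row of two message blocks, located at lo_mem and hi_mem, into one
// 256-bit register, as is done when compressing two blocks in parallel.
static inline __attribute__((unused)) vec_t loadu2_usage_sketch(const void *hi_mem,
                                                                const void *lo_mem)
{
  vec_t w = _mm256_setzero_si256(); // initialized to keep -Wuninitialized quiet
  LOADU2(hi_mem, lo_mem, w);
  return w;
}

52 | // In every 128-bit value choose the two lowest 32-bit values.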
53 | #define LOW32X2_MASK (0x33) 54 | // In every 128-bit value choose the two highest 32-bit values. 55 | #define HIGH32X2_MASK (0xcc) 56 | -------------------------------------------------------------------------------- /include/internal/avx512_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <immintrin.h> 7 | 8 | typedef __m512i vec_t; 9 | 10 | #define ADD64(a, b) (_mm512_add_epi64(a, b)) 11 | #define ADD32(a, b) (_mm512_add_epi32(a, b)) 12 | #define ALIGNR8(a, b, mask) (_mm512_alignr_epi8(a, b, mask)) 13 | #define LOAD(mem) (_mm512_loadu_si512((const vec_t *)(mem))) 14 | #define MADD32(src, imm8, a, b) (_mm512_mask_add_epi32(src, imm8, a, b)) 15 | #define ROR32(a, imm8) (_mm512_ror_epi32(a, imm8)) 16 | #define ROR64(a, imm8) (_mm512_ror_epi64(a, imm8)) 17 | #define SHUF32(a, mask) (_mm512_shuffle_epi32(a, mask)) 18 | #define SHUF8(a, mask) (_mm512_shuffle_epi8(a, mask)) 19 | #define SLL32(a, imm8) (_mm512_slli_epi32(a, imm8)) 20 | #define SLL64(a, imm8) (_mm512_slli_epi64(a, imm8)) 21 | #define SRL32(a, imm8) (_mm512_srli_epi32(a, imm8)) 22 | #define SRL64(a, imm8) (_mm512_srli_epi64(a, imm8)) 23 | #define STORE(mem, reg) (_mm512_store_si512((vec_t *)(mem), reg)) 24 | 25 | #define LOAD128(mem) (_mm_loadu_si128((const __m128i *)(mem))) 26 | #define STORE128(mem, reg) (_mm_store_si128((__m128i *)(mem), reg)) 27 | 28 | #define STOREU4(mem3, mem2, mem1, mem0, reg) \ 29 | do { \ 30 | STORE128(mem0, _mm512_extracti32x4_epi32(reg, 0)); \ 31 | STORE128(mem1, _mm512_extracti32x4_epi32(reg, 1)); \ 32 | STORE128(mem2, _mm512_extracti32x4_epi32(reg, 2)); \ 33 | STORE128(mem3, _mm512_extracti32x4_epi32(reg, 3)); \ 34 | } while(0) 35 | 36 | #define LOADU4(mem3, mem2, mem1, mem0, reg) \ 37 | do { \ 38 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem0), 0); \ 39 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem1), 1); \ 40 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem2), 2); \ 41 | (reg) = _mm512_inserti32x4(reg, LOAD128(mem3), 3); \ 42 | } while(0) 43 | 44 | // In every 128-bit value choose the two lowest 32-bit values. 45 | #define LOW32X2_MASK (0x3333) 46 | // In every 128-bit value choose the two highest 32-bit values. 47 | #define HIGH32X2_MASK (0xcccc) 48 | -------------------------------------------------------------------------------- /include/internal/avx_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <immintrin.h> 7 | 8 | typedef __m128i vec_t; 9 | 10 | #define ADD32(a, b) (_mm_add_epi32(a, b)) 11 | #define ADD64(a, b) (_mm_add_epi64(a, b)) 12 | #define ALIGNR8(a, b, mask) (_mm_alignr_epi8(a, b, mask)) 13 | #define BLEND16(a, b, mask) (_mm_blend_epi16(a, b, mask)) 14 | #define LOAD(mem) (_mm_loadu_si128((const __m128i *)(mem))) 15 | #define MADD32(src, imm8, a, b) (_mm_mask_add_epi32(src, imm8, a, b)) 16 | #define ROR32(a, imm8) (_mm_ror_epi32(a, imm8)) 17 | #define ROR64(a, imm8) (_mm_ror_epi64(a, imm8)) 18 | #define SETR32(e0, e1, e2, e3) (_mm_setr_epi32(e0, e1, e2, e3)) 19 | #define SET64(e1, e0) (_mm_set_epi64x(e1, e0)) 20 | #define SHUF8(a, mask) (_mm_shuffle_epi8(a, mask)) 21 | #define SHUF32(a, mask) (_mm_shuffle_epi32(a, mask)) 22 | #define SLL32(a, imm8) (_mm_slli_epi32(a, imm8)) 23 | #define SLL64(a, imm8) (_mm_slli_epi64(a, imm8)) 24 | #define SRL32(a, imm8) (_mm_srli_epi32(a, imm8)) 25 | #define SRL64(a, imm8) (_mm_srli_epi64(a, imm8)) 26 | #define STORE(mem, reg) (_mm_store_si128((__m128i *)(mem), reg)) 27 | 28 | #define LOW32X2_MASK (0x3) 29 | #define HIGH32X2_MASK (0xc) 30 | -------------------------------------------------------------------------------- /include/internal/defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <stdint.h> 7 | #include <stdlib.h> 8 | #include <string.h> 9 | 10 | #define IN 11 | #define OUT 12 | 13 | #define _INLINE_ static inline 14 | #define ALIGN(n) __attribute__((aligned(n))) 15 | 16 | #if defined(__GNUC__) || defined(__clang__) 17 | # define UNUSED __attribute__((unused)) 18 | #else 19 | # define UNUSED 20 | #endif 21 | 22 | #define LSB1(x) ((x)&0x1) 23 | #define LSB2(x) ((x)&0x3) 24 | #define LSB4(x) ((x)&0xf) 25 | 26 | #define ROTR16(x, s) (((x) >> (s)) | (x) << (16 - (s))) 27 | #define ROTR32(x, s) (((x) >> (s)) | (x) << (32 - (s))) 28 | #define ROTR64(x, s) (((x) >> (s)) | (x) << (64 - (s))) 29 | 30 | #if defined(__GNUC__) && __GNUC__ >= 2 31 | _INLINE_ uint64_t bswap_64(uint64_t x) { return __builtin_bswap64(x); } 32 | _INLINE_ uint32_t bswap_32(uint32_t x) { return __builtin_bswap32(x); } 33 | #else 34 | _INLINE_ uint32_t bswap_32(uint32_t x) 35 | { 36 | x = ROTR16(x, 16); 37 | x = ((x & UINT32_C(0xff00ff00)) >> 8) | ((x & UINT32_C(0x00ff00ff)) << 8); 38 | return x; 39 | } 40 | 41 | _INLINE_ uint64_t bswap_64(uint64_t x) 42 | { 43 | return bswap_32(x >> 32) | (((uint64_t)bswap_32(x)) << 32); 44 | } 45 | #endif 46 | 47 | #if defined(__GNUC__) && (__GNUC__ >= 8) 48 | # define GCC_SUPPORT_UNROLL_PRAGMA 49 | #endif 50 | 51 | // A better macro should have the form 52 | // #define PRAGMA_LOOP_UNROLL(x) _Pragma("GCC unroll x") 53 | // But apparently this is hard to achieve with different compilers (the pragma string must be built by stringizing, and Clang spells the pragma "unroll" where GCC expects "GCC unroll N") 54 | #if defined(DONT_USE_UNROLL_PRAGMA) 55 | # define PRAGMA_LOOP_UNROLL_2 56 | # define PRAGMA_LOOP_UNROLL_4 57 | # define PRAGMA_LOOP_UNROLL_8 58 | # define PRAGMA_LOOP_UNROLL_12 59 | # define PRAGMA_LOOP_UNROLL_16 60 | # define PRAGMA_LOOP_UNROLL_48 61 | # define PRAGMA_LOOP_UNROLL_64 62 | # define PRAGMA_LOOP_UNROLL_80 63 | #else 64 | # if defined(GCC_SUPPORT_UNROLL_PRAGMA) 65 | # define PRAGMA_LOOP_UNROLL_2 _Pragma("GCC unroll 2") 66 | # define PRAGMA_LOOP_UNROLL_4 _Pragma("GCC unroll 4") 67 | # define PRAGMA_LOOP_UNROLL_8 _Pragma("GCC unroll 8") 68 | # define PRAGMA_LOOP_UNROLL_12 _Pragma("GCC unroll 12") 69 | # define
PRAGMA_LOOP_UNROLL_16 _Pragma("GCC unroll 16") 70 | # define PRAGMA_LOOP_UNROLL_48 _Pragma("GCC unroll 48") 71 | # define PRAGMA_LOOP_UNROLL_64 _Pragma("GCC unroll 64") 72 | # define PRAGMA_LOOP_UNROLL_80 _Pragma("GCC unroll 80") 73 | # elif defined(__clang__) 74 | # define PRAGMA_LOOP_UNROLL_2 _Pragma("unroll") 75 | # define PRAGMA_LOOP_UNROLL_4 _Pragma("unroll") 76 | # define PRAGMA_LOOP_UNROLL_8 _Pragma("unroll") 77 | # define PRAGMA_LOOP_UNROLL_12 _Pragma("unroll") 78 | # define PRAGMA_LOOP_UNROLL_16 _Pragma("unroll") 79 | # define PRAGMA_LOOP_UNROLL_48 _Pragma("unroll") 80 | # define PRAGMA_LOOP_UNROLL_64 _Pragma("unroll") 81 | # define PRAGMA_LOOP_UNROLL_80 _Pragma("unroll") 82 | # else 83 | # define PRAGMA_LOOP_UNROLL_2 84 | # define PRAGMA_LOOP_UNROLL_4 85 | # define PRAGMA_LOOP_UNROLL_8 86 | # define PRAGMA_LOOP_UNROLL_12 87 | # define PRAGMA_LOOP_UNROLL_16 88 | # define PRAGMA_LOOP_UNROLL_48 89 | # define PRAGMA_LOOP_UNROLL_64 90 | # define PRAGMA_LOOP_UNROLL_80 91 | # endif 92 | #endif 93 | 94 | ////////////////////////// 95 | // Helper functions 96 | /////////////////////////// 97 | 98 | // my_memcpy avoids the undefined behaviour of memcpy when byte_len=0 99 | _INLINE_ void *my_memcpy(void *dst, const void *src, size_t byte_len) 100 | { 101 | if(byte_len == 0) { 102 | return dst; 103 | } 104 | 105 | return memcpy(dst, src, byte_len); 106 | } 107 | 108 | // my_memset avoids the undefined behaviour of memset when byte_len=0 109 | _INLINE_ void *my_memset(void *dst, const int ch, size_t byte_len) 110 | { 111 | if(byte_len == 0) { 112 | return dst; 113 | } 114 | 115 | return memset(dst, ch, byte_len); 116 | } 117 | 118 | _INLINE_ void secure_clean(OUT void *p, IN const size_t byte_len) 119 | { 120 | typedef void *(*memset_t)(void *, int, size_t); 121 | static volatile memset_t memset_func = my_memset; 122 | memset_func(p, 0, byte_len); 123 | } 124 | 125 | /////////////////////////////////////////// 126 | // Controlling the OpenSSL borrowed code 127 | /////////////////////////////////////////// 128 | 129 | #if defined(X86_64) 130 | // In OpenSSL the OPENSSL_ia32cap_P array holds the return values (in 131 | // RAX, RBX, RCX, RDX registers) of executing the Intel CPUID leaf 7 instruction. 132 | // The assembly code chooses the relevant SHA implementation according to this 133 | // array.
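// For example, a test harness can force the borrowed OpenSSL assembly to take
// its AVX path like this (a usage sketch based on the macros defined below;
// sha256_block_data_order_local is declared in sha256_defs.h):
//
//   RUN_OPENSSL_CODE_WITH_AVX(
//     sha256_block_data_order_local(state, data, blocks_num);
//   );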
134 | 135 | extern unsigned int OPENSSL_ia32cap_P_local[4]; 136 | 137 | # define CLEAR_OPENSSL_CAP_ARRAY \ 138 | do { \ 139 | OPENSSL_ia32cap_P_local[0] = 0; \ 140 | OPENSSL_ia32cap_P_local[1] = 0; \ 141 | OPENSSL_ia32cap_P_local[2] = 0; \ 142 | OPENSSL_ia32cap_P_local[3] = 0; \ 143 | } while(0) 144 | 145 | // RAX[30] - Intel CPU bit 146 | // RBX[9] - SSSE3 bit 147 | // RBX[28] - AVX bit 148 | # define RUN_OPENSSL_CODE_WITH_AVX(x) \ 149 | do { \ 150 | OPENSSL_ia32cap_P_local[0] |= (1 << 30); \ 151 | OPENSSL_ia32cap_P_local[1] |= ((1 << 9) | (1 << 28)); \ 152 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 153 | } while(0) 154 | 155 | // RCX[3] - BMI1 bit 156 | // RCX[5] - AVX2 bit 157 | // RCX[8] - BMI2 bit 158 | # define RUN_OPENSSL_CODE_WITH_AVX2(x) \ 159 | do { \ 160 | OPENSSL_ia32cap_P_local[2] |= ((1 << 8) | (1 << 5) | (1 << 3)); \ 161 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 162 | } while(0) 163 | 164 | // RCX[29] - SHA_NI (EXT) bit 165 | # define RUN_OPENSSL_CODE_WITH_SHA_EXT(x) \ 166 | do { \ 167 | OPENSSL_ia32cap_P_local[2] |= (1 << 29); \ 168 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 169 | } while(0) 170 | 171 | #endif 172 | 173 | #if defined(AARCH64) 174 | 175 | extern unsigned int OPENSSL_armcap_P_local; 176 | 177 | # define CLEAR_OPENSSL_CAP_ARRAY \ 178 | do { \ 179 | OPENSSL_armcap_P_local = 0; \ 180 | } while(0) 181 | 182 | # define ARMV7_NEON (1 << 0) 183 | # define ARMV8_SHA256 (1 << 4) 184 | # define ARMV8_SHA512 (1 << 6) 185 | 186 | # define RUN_OPENSSL_CODE_WITH_NEON(x) \ 187 | do { \ 188 | OPENSSL_armcap_P_local |= ARMV7_NEON; \ 189 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 190 | } while(0) 191 | 192 | # define RUN_OPENSSL_CODE_WITH_SHA256_EXT(x) \ 193 | do { \ 194 | OPENSSL_armcap_P_local |= ARMV8_SHA256; \ 195 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 196 | } while(0) 197 | 198 | # define RUN_OPENSSL_CODE_WITH_SHA512_EXT(x) \ 199 | do { \ 200 | OPENSSL_armcap_P_local |= ARMV8_SHA512; \ 201 | {x} CLEAR_OPENSSL_CAP_ARRAY; \ 202 | } while(0) 203 | #endif 204 | -------------------------------------------------------------------------------- /include/internal/measurements.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include <float.h> 7 | #include <stdio.h> 8 | 9 | #ifndef REPEAT 10 | # define REPEAT 100 11 | #endif 12 | 13 | #ifndef OUTER_REPEAT 14 | # define OUTER_REPEAT 10 15 | #endif 16 | 17 | #ifndef WARMUP 18 | # define WARMUP (REPEAT / 4) 19 | #endif 20 | 21 | uint64_t start_clk, end_clk; 22 | double total_clk; 23 | double temp_clk; 24 | size_t rdtsc_itr; 25 | size_t rdtsc_outer_itr; 26 | 27 | #define HALF_GPR_SIZE UINT8_C(32) 28 | 29 | #if defined(X86_64) 30 | inline static uint64_t get_Clks(void) 31 | { 32 | uint64_t hi; 33 | uint64_t lo; 34 | __asm__ __volatile__("rdtscp\n\t" : "=a"(lo), "=d"(hi)::"rcx"); 35 | return lo ^ (hi << HALF_GPR_SIZE); 36 | } 37 | #endif 38 | 39 | #if defined(AARCH64) 40 | inline static uint64_t get_Clks(void) 41 | { 42 | uint64_t value; 43 | __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(value)); 44 | return value; 45 | } 46 | #endif 47 | 48 | 52 | // This MACRO measures the number of cycles "x" runs. This is the flow: 53 | // 1) it repeats "x" WARMUP times, in order to warm the cache.
54 | // 2) it reads the Time Stamp Counter at the beginning of the test. 55 | // 3) it repeats "x" REPEAT number of times. 56 | // 4) it reads the Time Stamp Counter again at the end of the test. 57 | // 5) it calculates the average number of cycles per one iteration of "x", by 58 | // calculating the total number of cycles, and dividing it by REPEAT 59 | #define RDTSC_MEASURE(x) \ 60 | for(rdtsc_itr = 0; rdtsc_itr < WARMUP; rdtsc_itr++) { \ 61 | {x}; \ 62 | } \ 63 | total_clk = DBL_MAX; \ 64 | for(rdtsc_outer_itr = 0; rdtsc_outer_itr < OUTER_REPEAT; rdtsc_outer_itr++) { \ 65 | start_clk = get_Clks(); \ 66 | for(rdtsc_itr = 0; rdtsc_itr < REPEAT; rdtsc_itr++) { \ 67 | {x}; \ 68 | } \ 69 | end_clk = get_Clks(); \ 70 | temp_clk = (double)(end_clk - start_clk) / REPEAT; \ 71 | if(total_clk > temp_clk) total_clk = temp_clk; \ 72 | } \ 73 | printf("%12.0f ", total_clk); 74 | 75 | #define MEASURE(x) RDTSC_MEASURE(x) 76 | -------------------------------------------------------------------------------- /include/internal/neon_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #if defined(__ARM_NEON) 5 | # include <arm_neon.h> 6 | #endif 7 | 8 | #if !defined(__clang__) 9 | static inline uint8x16x4_t vld1q_u8_x4(const uint8_t *mem) 10 | { 11 | uint8x16x2_t d0 = vld1q_u8_x2(mem); 12 | uint8x16x2_t d1 = vld1q_u8_x2(&mem[32]); 13 | 14 | uint8x16x4_t ret; 15 | ret.val[0] = d0.val[0]; 16 | ret.val[1] = d0.val[1]; 17 | ret.val[2] = d1.val[0]; 18 | ret.val[3] = d1.val[1]; 19 | return ret; 20 | } 21 | 22 | static inline void vst1q_u32_x2(uint32_t *mem, const uint32x4x2_t v) 23 | { 24 | vst1q_u32(mem, v.val[0]); 25 | vst1q_u32(mem + 4, v.val[1]); 26 | } 27 | #endif // __clang__ 28 | -------------------------------------------------------------------------------- /include/internal/sha256_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-------------------------------------------------------------------------------- /include/internal/sha256_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include "sha.h" 7 | 8 | typedef uint32_t sha256_word_t; 9 | 10 | #define SHA256_BLOCK_BYTE_LEN 64 11 | #define SHA256_ROUNDS_NUM 64 12 | #define SHA256_MSG_END_SYMBOL (0x80) 13 | #define SHA256_HASH_WORDS_NUM (SHA256_HASH_BYTE_LEN / sizeof(sha256_word_t)) 14 | #define SHA256_BLOCK_WORDS_NUM (SHA256_BLOCK_BYTE_LEN / sizeof(sha256_word_t)) 15 | 16 | #define SHA256_FINAL_ROUND_START_IDX 48 17 | 18 | // The SHA state: parameters a-h 19 | typedef ALIGN(64) struct sha256_state_st { 20 | sha256_word_t w[SHA256_HASH_WORDS_NUM]; 21 | } sha256_state_t; 22 | 23 | typedef ALIGN(64) struct sha256_msg_schedule_st { 24 | sha256_word_t w[SHA256_BLOCK_WORDS_NUM]; 25 | } sha256_msg_schedule_t; 26 | 27 | #define Sigma0_0 2 28 | #define Sigma0_1 13 29 | #define Sigma0_2 22 30 | #define Sigma1_0 6 31 | #define Sigma1_1 11 32 | #define Sigma1_2 25 33 | 34 | #define sigma0_0 7 35 | #define sigma0_1 18 36 | #define sigma0_2 3 37 | #define sigma1_0 17 38 | #define sigma1_1 19 39 | #define sigma1_2 10 40 | 41 | #define DUP2(x, y, z, w) x, y, z, w, x, y, z, w // NOLINT 42 | #define DUP4(x, y, z, w) x, y, z, w, x, y, z, w, x, y, z, w, x, y, z, w // NOLINT 43 | 44 | #define ROTR(x, v) ROTR32(x, v) 45 | #define Sigma0(x) (ROTR(x, Sigma0_0) ^ ROTR(x, Sigma0_1) ^ ROTR(x, Sigma0_2)) 46 | #define Sigma1(x) (ROTR(x, Sigma1_0) ^ ROTR(x, Sigma1_1) ^ ROTR(x, Sigma1_2)) 47 | #define sigma0(x) (ROTR(x, sigma0_0) ^ ROTR(x, sigma0_1) ^ ((x) >> sigma0_2)) 48 | #define sigma1(x) (ROTR(x, sigma1_0) ^ ROTR(x, sigma1_1) ^ ((x) >> sigma1_2)) 49 | #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 50 | #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) 51 | 52 | // In the AVX* implementations we operate on 1/2/4 blocks in parallel. 53 | // In these cases, it is faster to duplicate the same line in memory 54 | // and load it instead of broadcasting it.
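// For illustration, a sketch of how the duplicated tables are presumably laid
// out in src/sha256_consts.c (not their verbatim contents): each row repeats a
// group of round constants via the DUP2/DUP4 macros above, so a multi-block
// kernel can fetch its constants with a single aligned load. The first four
// K256 words are the standard SHA-256 values:
//
//   ALIGN(64) const sha256_word_t K256x2[2 * SHA256_ROUNDS_NUM] = {
//     DUP2(0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5),
//     ...
//   };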
55 | ALIGN(64) extern const sha256_word_t K256[SHA256_ROUNDS_NUM]; 56 | ALIGN(64) extern const sha256_word_t K256x2[2 * SHA256_ROUNDS_NUM]; 57 | ALIGN(64) extern const sha256_word_t K256x4[4 * SHA256_ROUNDS_NUM]; 58 | 59 | #define ROTATE_STATE(s) \ 60 | do { \ 61 | const sha256_word_t tmp = (s)->w[7]; \ 62 | (s)->w[7] = (s)->w[6]; \ 63 | (s)->w[6] = (s)->w[5]; \ 64 | (s)->w[5] = (s)->w[4]; \ 65 | (s)->w[4] = (s)->w[3]; \ 66 | (s)->w[3] = (s)->w[2]; \ 67 | (s)->w[2] = (s)->w[1]; \ 68 | (s)->w[1] = (s)->w[0]; \ 69 | (s)->w[0] = tmp; \ 70 | } while(0) 71 | 72 | _INLINE_ void sha_round(IN OUT sha256_state_t *s, 73 | IN const sha256_word_t x, 74 | IN const sha256_word_t k) 75 | { 76 | sha256_word_t t = x + s->w[7] + Sigma1(s->w[4]); 77 | 78 | t += Ch(s->w[4], s->w[5], s->w[6]) + k; 79 | s->w[7] = t + Sigma0(s->w[0]) + Maj(s->w[0], s->w[1], s->w[2]); 80 | s->w[3] += t; 81 | ROTATE_STATE(s); 82 | } 83 | 84 | _INLINE_ void accumulate_state(IN OUT sha256_state_t *dst, 85 | IN const sha256_state_t *src) 86 | { 87 | for(size_t i = 0; i < SHA256_HASH_WORDS_NUM; i++) { 88 | dst->w[i] += src->w[i]; 89 | } 90 | } 91 | 92 | void sha256_compress_generic(IN OUT sha256_state_t *state, 93 | IN const uint8_t *data, 94 | IN size_t blocks_num); 95 | 96 | #if defined(X86_64) 97 | 98 | void sha256_compress_x86_64_avx(IN OUT sha256_state_t *state, 99 | IN const uint8_t *data, 100 | IN size_t blocks_num); 101 | 102 | void sha256_compress_x86_64_avx2(IN OUT sha256_state_t *state, 103 | IN const uint8_t *data, 104 | IN size_t blocks_num); 105 | 106 | void sha256_compress_x86_64_avx512(IN OUT sha256_state_t *state, 107 | IN const uint8_t *data, 108 | IN size_t blocks_num); 109 | 110 | void sha256_compress_x86_64_sha_ext(IN OUT sha256_state_t *state, 111 | IN const uint8_t *data, 112 | IN size_t blocks_num); 113 | #endif // X86_64 114 | 115 | #if defined(AARCH64) 116 | void sha256_compress_aarch64_sha_ext(IN OUT sha256_state_t *state, 117 | IN const uint8_t *data, 118 | IN size_t blocks_num); 119 | #endif 120 | 121 | // This ASM code was borrowed from OpenSSL as-is. 122 | extern void sha256_block_data_order_local(IN OUT sha256_word_t *state, 123 | IN const uint8_t *data, 124 | IN size_t blocks_num); 125 |
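The helpers above are enough to reconstruct the shape of the scalar compression loop. A minimal sketch, assuming a precomputed message schedule (the real sha256_compress_generic in src/sha256_compress_generic.c expands the schedule on the fly; compress_block_sketch and its W argument are hypothetical):

```
#include <stddef.h>

// One 64-byte block; W[] holds the fully expanded message schedule.
static void compress_block_sketch(sha256_state_t *state,
                                  const sha256_word_t W[SHA256_ROUNDS_NUM])
{
  sha256_state_t tmp = *state; // work on a copy of a..h

  for(size_t i = 0; i < SHA256_ROUNDS_NUM; i++) {
    // Each call consumes W[i] and K256[i], then rotates the state words.
    sha_round(&tmp, W[i], K256[i]);
  }

  accumulate_state(state, &tmp); // feed-forward into the chaining value
}
```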
-------------------------------------------------------------------------------- /include/internal/sha512_defs.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include "sha.h" 7 | 8 | typedef uint64_t sha512_word_t; 9 | 10 | #define SHA512_BLOCK_BYTE_LEN 128 11 | #define SHA512_ROUNDS_NUM 80 12 | #define SHA512_MSG_END_SYMBOL (0x80) 13 | #define SHA512_HASH_WORDS_NUM (SHA512_HASH_BYTE_LEN / sizeof(sha512_word_t)) 14 | #define SHA512_BLOCK_WORDS_NUM (SHA512_BLOCK_BYTE_LEN / sizeof(sha512_word_t)) 15 | 16 | #define SHA512_FINAL_ROUND_START_IDX 64 17 | 18 | // The SHA state: parameters a-h 19 | typedef struct sha512_state_st { 20 | ALIGN(64) sha512_word_t w[SHA512_HASH_WORDS_NUM]; 21 | } sha512_state_t; 22 | 23 | typedef struct sha512_msg_schedule_st { 24 | ALIGN(64) sha512_word_t w[SHA512_BLOCK_WORDS_NUM]; 25 | } sha512_msg_schedule_t; 26 | 27 | #define Sigma0_0 28 28 | #define Sigma0_1 34 29 | #define Sigma0_2 39 30 | #define Sigma1_0 14 31 | #define Sigma1_1 18 32 | #define Sigma1_2 41 33 | 34 | #define sigma0_0 1 35 | #define sigma0_1 8 36 | #define sigma0_2 7 37 | #define sigma1_0 19 38 | #define sigma1_1 61 39 | #define sigma1_2 6 40 | 41 | #define DUP2(x, y) x, y, x, y // NOLINT 42 | #define DUP4(x, y) x, y, x, y, x, y, x, y // NOLINT 43 | 44 | #define ROTR(x, v) ROTR64(x, v) 45 | #define Sigma0(x) (ROTR(x, Sigma0_0) ^ ROTR(x, Sigma0_1) ^ ROTR(x, Sigma0_2)) 46 | #define Sigma1(x) (ROTR(x, Sigma1_0) ^ ROTR(x, Sigma1_1) ^ ROTR(x, Sigma1_2)) 47 | #define sigma0(x) (ROTR(x, sigma0_0) ^ ROTR(x, sigma0_1) ^ ((x) >> sigma0_2)) 48 | #define sigma1(x) (ROTR(x, sigma1_0) ^ ROTR(x, sigma1_1) ^ ((x) >> sigma1_2)) 49 | #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 50 | #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) 51 | 52 | // In the AVX* implementations we operate on 1/2/4 blocks in parallel. 53 | // In these cases, it is faster to duplicate the same line in memory 54 | // and load it instead of broadcasting it.
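// For illustration, a sketch of the presumable layout in src/sha512_consts.c
// (not its verbatim contents): here DUP2 repeats a pair of 64-bit round
// constants. The first two K512 words are the standard SHA-512 values, and
// they can be cross-checked against the .LK512 table in the OpenSSL assembly
// further below:
//
//   ALIGN(64) const sha512_word_t K512x2[2 * SHA512_ROUNDS_NUM] = {
//     DUP2(0x428a2f98d728ae22, 0x7137449123ef65cd),
//     ...
//   };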
55 | ALIGN(64) extern const sha512_word_t K512[SHA512_ROUNDS_NUM]; 56 | ALIGN(64) extern const sha512_word_t K512x2[2 * SHA512_ROUNDS_NUM]; 57 | ALIGN(64) extern const sha512_word_t K512x4[4 * SHA512_ROUNDS_NUM]; 58 | 59 | #define ROTATE_STATE(s) \ 60 | do { \ 61 | const sha512_word_t tmp = (s)->w[7]; \ 62 | (s)->w[7] = (s)->w[6]; \ 63 | (s)->w[6] = (s)->w[5]; \ 64 | (s)->w[5] = (s)->w[4]; \ 65 | (s)->w[4] = (s)->w[3]; \ 66 | (s)->w[3] = (s)->w[2]; \ 67 | (s)->w[2] = (s)->w[1]; \ 68 | (s)->w[1] = (s)->w[0]; \ 69 | (s)->w[0] = tmp; \ 70 | } while(0) 71 | 72 | _INLINE_ void sha_round(IN OUT sha512_state_t *s, 73 | IN const sha512_word_t x, 74 | IN const sha512_word_t k) 75 | { 76 | sha512_word_t t = x + s->w[7] + Sigma1(s->w[4]); 77 | 78 | t += Ch(s->w[4], s->w[5], s->w[6]) + k; 79 | s->w[7] = t + Sigma0(s->w[0]) + Maj(s->w[0], s->w[1], s->w[2]); 80 | s->w[3] += t; 81 | ROTATE_STATE(s); 82 | } 83 | 84 | _INLINE_ void accumulate_state(IN OUT sha512_state_t *dst, 85 | IN const sha512_state_t *src) 86 | { 87 | for(size_t i = 0; i < SHA512_HASH_WORDS_NUM; i++) { 88 | dst->w[i] += src->w[i]; 89 | } 90 | } 91 | 92 | void sha512_compress_generic(IN OUT sha512_state_t *state, 93 | IN const uint8_t *data, 94 | IN size_t blocks_num); 95 | 96 | #if defined(X86_64) 97 | void sha512_compress_x86_64_avx(IN OUT sha512_state_t *state, 98 | IN const uint8_t *data, 99 | IN size_t blocks_num); 100 | 101 | void sha512_compress_x86_64_avx2(IN OUT sha512_state_t *state, 102 | IN const uint8_t *data, 103 | IN size_t blocks_num); 104 | 105 | void sha512_compress_x86_64_avx512(IN OUT sha512_state_t *state, 106 | IN const uint8_t *data, 107 | IN size_t blocks_num); 108 | #endif // X86_64 109 | 110 | // This ASM code was borrowed from OpenSSL as-is. 111 | extern void sha512_block_data_order_local(IN OUT sha512_word_t *state, 112 | IN const uint8_t *data, 113 | IN size_t blocks_num); 114 |
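Since the borrowed routine dispatches on OPENSSL_armcap_P_local (or OPENSSL_ia32cap_P_local on x86-64), it is presumably always invoked through the RUN_OPENSSL_CODE_WITH_* wrappers shown earlier. A hedged sketch for the AArch64 SHA-512 path (the wrapper function itself is hypothetical):

```
// Force the ARMv8 SHA-512 capability bit, run the OpenSSL kernel, then
// clear the capability word again (the macro does all three steps).
static void sha512_openssl_hw_sketch(sha512_state_t *state,
                                     const uint8_t *data,
                                     size_t blocks_num)
{
  RUN_OPENSSL_CODE_WITH_SHA512_EXT(
    sha512_block_data_order_local(state->w, data, blocks_num);
  );
}
```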
-------------------------------------------------------------------------------- /include/sha.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #include "internal/defs.h" 7 | 8 | typedef enum sha_impl_e 9 | { 10 | GENERIC_IMPL, 11 | 12 | #if defined(X86_64) 13 | AVX_IMPL, 14 | OPENSSL_AVX_IMPL, 15 | #endif 16 | 17 | #if defined(AVX2_SUPPORT) 18 | AVX2_IMPL, 19 | OPENSSL_AVX2_IMPL, 20 | #endif 21 | 22 | #if defined(AVX512_SUPPORT) 23 | AVX512_IMPL, 24 | #endif 25 | 26 | #if defined(X86_64_SHA_SUPPORT) 27 | SHA_EXT_IMPL, 28 | OPENSSL_SHA_EXT_IMPL, 29 | #endif 30 | 31 | #if defined(NEON_SUPPORT) 32 | NEON_IMPL, 33 | OPENSSL_NEON_IMPL, 34 | #endif 35 | 36 | #if defined(AARCH64_SHA_SUPPORT) 37 | SHA_EXT_IMPL, 38 | OPENSSL_SHA_EXT_IMPL, 39 | #endif 40 | 41 | } sha_impl_t; 42 | 43 | #define SHA256_HASH_BYTE_LEN 32 44 | #define SHA512_HASH_BYTE_LEN 64 45 | 46 | void sha256(OUT uint8_t *dgst, 47 | IN const uint8_t *data, 48 | IN size_t byte_len, 49 | IN sha_impl_t impl); 50 | 51 | void sha512(OUT uint8_t *dgst, 52 | IN const uint8_t *data, 53 | IN size_t byte_len, 54 | IN sha_impl_t impl); 55 |
-------------------------------------------------------------------------------- /src/openssl/README.md: -------------------------------------------------------------------------------- 1 | The code in this directory was copied from the compilation artifacts of OpenSSL commit [13c5d744](https://github.com/openssl/openssl/tree/e32c608e0733d5b295c9aa119153133413c5d744) Feb 24, 2020. 2 | 3 | To reproduce on a platform equipped with an Intel 10th generation CPU: 4 | 5 | ``` 6 | git clone https://github.com/openssl/openssl 7 | cd openssl 8 | git checkout e32c608e0733d5b295c9aa119153133413c5d744 9 | ./config 10 | make 11 | ``` 12 | 13 | and the files are found in: 14 | 15 | ``` 16 | ./crypto/sha/sha256-x86_64.s 17 | ./crypto/sha/sha512-x86_64.s 18 | ``` 19 | 20 | These files include several implementations of SHA256 and SHA512 in x86-64 assembly. In particular, they include AVX/AVX2 implementations and, for SHA256, also an implementation that uses the new SHA extension that is available on Intel's 10th generation CPUs. 21 | 22 | The relevant implementation is chosen according to the value of the OPENSSL_ia32cap_P array. 23 | 24 | On an AARCH64 machine, the files are found in: 25 | 26 | ``` 27 | ./crypto/sha/sha256-armv8.S 28 | ./crypto/sha/sha512-armv8.S 29 | ``` 30 | The relevant implementation is chosen according to the value of the OPENSSL_armcap_P array. 31 | 32 | To avoid symbol conflicts/mistakes, the functions `sha256_block_data_order`/`sha512_block_data_order` were renamed `sha256_block_data_order_local`/`sha512_block_data_order_local`, the symbol `OPENSSL_ia32cap_P` was renamed `OPENSSL_ia32cap_P_local`, the symbol `OPENSSL_armcap_P` was renamed `OPENSSL_armcap_P_local`, and in the aarch64 files the include files (dependencies) were removed. 33 |
-------------------------------------------------------------------------------- /src/openssl/linux/sha512-armv8.S: -------------------------------------------------------------------------------- 1 | // Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License 2.0 (the "License"). You may not use 4 | // this file except in compliance with the License. You can obtain a copy 5 | // in the file LICENSE in the source distribution or at 6 | // https://www.openssl.org/source/license.html 7 | 8 | // ==================================================================== 9 | // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 10 | // project.
The module is, however, dual licensed under OpenSSL and 11 | // CRYPTOGAMS licenses depending on where you obtain it. For further 12 | // details see http://www.openssl.org/~appro/cryptogams/. 13 | // 14 | // Permission to use under GPLv2 terms is granted. 15 | // ==================================================================== 16 | // 17 | // SHA256/512 for ARMv8. 18 | // 19 | // Performance in cycles per processed byte and improvement coefficient 20 | // over code generated with "default" compiler: 21 | // 22 | // SHA256-hw SHA256(*) SHA512 23 | // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) 24 | // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) 25 | // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) 26 | // Denver 2.01 10.5 (+26%) 6.70 (+8%) 27 | // X-Gene 20.0 (+100%) 12.8 (+300%(***)) 28 | // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) 29 | // Kryo 1.92 17.4 (+30%) 11.2 (+8%) 30 | // ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) 31 | // 32 | // (*) Software SHA256 results are of lesser relevance, presented 33 | // mostly for informational purposes. 34 | // (**) The result is a trade-off: it's possible to improve it by 35 | // 10% (or by 1 cycle per round), but at the cost of 20% loss 36 | // on Cortex-A53 (or by 4 cycles per round). 37 | // (***) Super-impressive coefficients over gcc-generated code are 38 | // indication of some compiler "pathology", most notably code 39 | // generated with -mgeneral-regs-only is significantly faster 40 | // and the gap is only 40-90%. 41 | // 42 | // October 2016. 43 | // 44 | // Originally it was reckoned that it makes no sense to implement NEON 45 | // version of SHA256 for 64-bit processors. This is because performance 46 | // improvement on most wide-spread Cortex-A5x processors was observed 47 | // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was 48 | // observed that 32-bit NEON SHA256 performs significantly better than 49 | // 64-bit scalar version on *some* of the more recent processors. As 50 | // result 64-bit NEON version of SHA256 was added to provide best 51 | // all-round performance. For example it executes ~30% faster on X-Gene 52 | // and Mongoose. [For reference, NEON version of SHA512 is bound to 53 | // deliver much less improvement, likely *negative* on Cortex-A5x. 54 | // Which is why NEON support is limited to SHA256.] 55 | 56 | // $output is the last argument if it looks like a file (it has an extension) 57 | // $flavour is the first argument if it doesn't look like a file 58 | # define ARMV7_NEON (1<<0) 59 | # define ARMV8_SHA256 (1<<4) 60 | # define ARMV8_SHA512 (1<<6) 61 | 62 | .text 63 | 64 | .globl sha512_block_data_order_local 65 | .type sha512_block_data_order_local,%function 66 | .align 6 67 | sha512_block_data_order_local: 68 | #ifndef __KERNEL__ 69 | adrp x16,OPENSSL_armcap_P_local 70 | ldr w16,[x16,#:lo12:OPENSSL_armcap_P_local] 71 | tst w16,#ARMV8_SHA512 72 | b.ne .Lv8_entry 73 | #endif 74 | .inst 0xd503233f // paciasp 75 | stp x29,x30,[sp,#-128]! 
76 | add x29,sp,#0 77 | 78 | stp x19,x20,[sp,#16] 79 | stp x21,x22,[sp,#32] 80 | stp x23,x24,[sp,#48] 81 | stp x25,x26,[sp,#64] 82 | stp x27,x28,[sp,#80] 83 | sub sp,sp,#4*8 84 | 85 | ldp x20,x21,[x0] // load context 86 | ldp x22,x23,[x0,#2*8] 87 | ldp x24,x25,[x0,#4*8] 88 | add x2,x1,x2,lsl#7 // end of input 89 | ldp x26,x27,[x0,#6*8] 90 | adr x30,.LK512 91 | stp x0,x2,[x29,#96] 92 | 93 | .Loop: 94 | ldp x3,x4,[x1],#2*8 95 | ldr x19,[x30],#8 // *K++ 96 | eor x28,x21,x22 // magic seed 97 | str x1,[x29,#112] 98 | #ifndef __AARCH64EB__ 99 | rev x3,x3 // 0 100 | #endif 101 | ror x16,x24,#14 102 | add x27,x27,x19 // h+=K[i] 103 | eor x6,x24,x24,ror#23 104 | and x17,x25,x24 105 | bic x19,x26,x24 106 | add x27,x27,x3 // h+=X[i] 107 | orr x17,x17,x19 // Ch(e,f,g) 108 | eor x19,x20,x21 // a^b, b^c in next round 109 | eor x16,x16,x6,ror#18 // Sigma1(e) 110 | ror x6,x20,#28 111 | add x27,x27,x17 // h+=Ch(e,f,g) 112 | eor x17,x20,x20,ror#5 113 | add x27,x27,x16 // h+=Sigma1(e) 114 | and x28,x28,x19 // (b^c)&=(a^b) 115 | add x23,x23,x27 // d+=h 116 | eor x28,x28,x21 // Maj(a,b,c) 117 | eor x17,x6,x17,ror#34 // Sigma0(a) 118 | add x27,x27,x28 // h+=Maj(a,b,c) 119 | ldr x28,[x30],#8 // *K++, x19 in next round 120 | //add x27,x27,x17 // h+=Sigma0(a) 121 | #ifndef __AARCH64EB__ 122 | rev x4,x4 // 1 123 | #endif 124 | ldp x5,x6,[x1],#2*8 125 | add x27,x27,x17 // h+=Sigma0(a) 126 | ror x16,x23,#14 127 | add x26,x26,x28 // h+=K[i] 128 | eor x7,x23,x23,ror#23 129 | and x17,x24,x23 130 | bic x28,x25,x23 131 | add x26,x26,x4 // h+=X[i] 132 | orr x17,x17,x28 // Ch(e,f,g) 133 | eor x28,x27,x20 // a^b, b^c in next round 134 | eor x16,x16,x7,ror#18 // Sigma1(e) 135 | ror x7,x27,#28 136 | add x26,x26,x17 // h+=Ch(e,f,g) 137 | eor x17,x27,x27,ror#5 138 | add x26,x26,x16 // h+=Sigma1(e) 139 | and x19,x19,x28 // (b^c)&=(a^b) 140 | add x22,x22,x26 // d+=h 141 | eor x19,x19,x20 // Maj(a,b,c) 142 | eor x17,x7,x17,ror#34 // Sigma0(a) 143 | add x26,x26,x19 // h+=Maj(a,b,c) 144 | ldr x19,[x30],#8 // *K++, x28 in next round 145 | //add x26,x26,x17 // h+=Sigma0(a) 146 | #ifndef __AARCH64EB__ 147 | rev x5,x5 // 2 148 | #endif 149 | add x26,x26,x17 // h+=Sigma0(a) 150 | ror x16,x22,#14 151 | add x25,x25,x19 // h+=K[i] 152 | eor x8,x22,x22,ror#23 153 | and x17,x23,x22 154 | bic x19,x24,x22 155 | add x25,x25,x5 // h+=X[i] 156 | orr x17,x17,x19 // Ch(e,f,g) 157 | eor x19,x26,x27 // a^b, b^c in next round 158 | eor x16,x16,x8,ror#18 // Sigma1(e) 159 | ror x8,x26,#28 160 | add x25,x25,x17 // h+=Ch(e,f,g) 161 | eor x17,x26,x26,ror#5 162 | add x25,x25,x16 // h+=Sigma1(e) 163 | and x28,x28,x19 // (b^c)&=(a^b) 164 | add x21,x21,x25 // d+=h 165 | eor x28,x28,x27 // Maj(a,b,c) 166 | eor x17,x8,x17,ror#34 // Sigma0(a) 167 | add x25,x25,x28 // h+=Maj(a,b,c) 168 | ldr x28,[x30],#8 // *K++, x19 in next round 169 | //add x25,x25,x17 // h+=Sigma0(a) 170 | #ifndef __AARCH64EB__ 171 | rev x6,x6 // 3 172 | #endif 173 | ldp x7,x8,[x1],#2*8 174 | add x25,x25,x17 // h+=Sigma0(a) 175 | ror x16,x21,#14 176 | add x24,x24,x28 // h+=K[i] 177 | eor x9,x21,x21,ror#23 178 | and x17,x22,x21 179 | bic x28,x23,x21 180 | add x24,x24,x6 // h+=X[i] 181 | orr x17,x17,x28 // Ch(e,f,g) 182 | eor x28,x25,x26 // a^b, b^c in next round 183 | eor x16,x16,x9,ror#18 // Sigma1(e) 184 | ror x9,x25,#28 185 | add x24,x24,x17 // h+=Ch(e,f,g) 186 | eor x17,x25,x25,ror#5 187 | add x24,x24,x16 // h+=Sigma1(e) 188 | and x19,x19,x28 // (b^c)&=(a^b) 189 | add x20,x20,x24 // d+=h 190 | eor x19,x19,x26 // Maj(a,b,c) 191 | eor x17,x9,x17,ror#34 // Sigma0(a) 192 | add x24,x24,x19 // 
h+=Maj(a,b,c) 193 | ldr x19,[x30],#8 // *K++, x28 in next round 194 | //add x24,x24,x17 // h+=Sigma0(a) 195 | #ifndef __AARCH64EB__ 196 | rev x7,x7 // 4 197 | #endif 198 | add x24,x24,x17 // h+=Sigma0(a) 199 | ror x16,x20,#14 200 | add x23,x23,x19 // h+=K[i] 201 | eor x10,x20,x20,ror#23 202 | and x17,x21,x20 203 | bic x19,x22,x20 204 | add x23,x23,x7 // h+=X[i] 205 | orr x17,x17,x19 // Ch(e,f,g) 206 | eor x19,x24,x25 // a^b, b^c in next round 207 | eor x16,x16,x10,ror#18 // Sigma1(e) 208 | ror x10,x24,#28 209 | add x23,x23,x17 // h+=Ch(e,f,g) 210 | eor x17,x24,x24,ror#5 211 | add x23,x23,x16 // h+=Sigma1(e) 212 | and x28,x28,x19 // (b^c)&=(a^b) 213 | add x27,x27,x23 // d+=h 214 | eor x28,x28,x25 // Maj(a,b,c) 215 | eor x17,x10,x17,ror#34 // Sigma0(a) 216 | add x23,x23,x28 // h+=Maj(a,b,c) 217 | ldr x28,[x30],#8 // *K++, x19 in next round 218 | //add x23,x23,x17 // h+=Sigma0(a) 219 | #ifndef __AARCH64EB__ 220 | rev x8,x8 // 5 221 | #endif 222 | ldp x9,x10,[x1],#2*8 223 | add x23,x23,x17 // h+=Sigma0(a) 224 | ror x16,x27,#14 225 | add x22,x22,x28 // h+=K[i] 226 | eor x11,x27,x27,ror#23 227 | and x17,x20,x27 228 | bic x28,x21,x27 229 | add x22,x22,x8 // h+=X[i] 230 | orr x17,x17,x28 // Ch(e,f,g) 231 | eor x28,x23,x24 // a^b, b^c in next round 232 | eor x16,x16,x11,ror#18 // Sigma1(e) 233 | ror x11,x23,#28 234 | add x22,x22,x17 // h+=Ch(e,f,g) 235 | eor x17,x23,x23,ror#5 236 | add x22,x22,x16 // h+=Sigma1(e) 237 | and x19,x19,x28 // (b^c)&=(a^b) 238 | add x26,x26,x22 // d+=h 239 | eor x19,x19,x24 // Maj(a,b,c) 240 | eor x17,x11,x17,ror#34 // Sigma0(a) 241 | add x22,x22,x19 // h+=Maj(a,b,c) 242 | ldr x19,[x30],#8 // *K++, x28 in next round 243 | //add x22,x22,x17 // h+=Sigma0(a) 244 | #ifndef __AARCH64EB__ 245 | rev x9,x9 // 6 246 | #endif 247 | add x22,x22,x17 // h+=Sigma0(a) 248 | ror x16,x26,#14 249 | add x21,x21,x19 // h+=K[i] 250 | eor x12,x26,x26,ror#23 251 | and x17,x27,x26 252 | bic x19,x20,x26 253 | add x21,x21,x9 // h+=X[i] 254 | orr x17,x17,x19 // Ch(e,f,g) 255 | eor x19,x22,x23 // a^b, b^c in next round 256 | eor x16,x16,x12,ror#18 // Sigma1(e) 257 | ror x12,x22,#28 258 | add x21,x21,x17 // h+=Ch(e,f,g) 259 | eor x17,x22,x22,ror#5 260 | add x21,x21,x16 // h+=Sigma1(e) 261 | and x28,x28,x19 // (b^c)&=(a^b) 262 | add x25,x25,x21 // d+=h 263 | eor x28,x28,x23 // Maj(a,b,c) 264 | eor x17,x12,x17,ror#34 // Sigma0(a) 265 | add x21,x21,x28 // h+=Maj(a,b,c) 266 | ldr x28,[x30],#8 // *K++, x19 in next round 267 | //add x21,x21,x17 // h+=Sigma0(a) 268 | #ifndef __AARCH64EB__ 269 | rev x10,x10 // 7 270 | #endif 271 | ldp x11,x12,[x1],#2*8 272 | add x21,x21,x17 // h+=Sigma0(a) 273 | ror x16,x25,#14 274 | add x20,x20,x28 // h+=K[i] 275 | eor x13,x25,x25,ror#23 276 | and x17,x26,x25 277 | bic x28,x27,x25 278 | add x20,x20,x10 // h+=X[i] 279 | orr x17,x17,x28 // Ch(e,f,g) 280 | eor x28,x21,x22 // a^b, b^c in next round 281 | eor x16,x16,x13,ror#18 // Sigma1(e) 282 | ror x13,x21,#28 283 | add x20,x20,x17 // h+=Ch(e,f,g) 284 | eor x17,x21,x21,ror#5 285 | add x20,x20,x16 // h+=Sigma1(e) 286 | and x19,x19,x28 // (b^c)&=(a^b) 287 | add x24,x24,x20 // d+=h 288 | eor x19,x19,x22 // Maj(a,b,c) 289 | eor x17,x13,x17,ror#34 // Sigma0(a) 290 | add x20,x20,x19 // h+=Maj(a,b,c) 291 | ldr x19,[x30],#8 // *K++, x28 in next round 292 | //add x20,x20,x17 // h+=Sigma0(a) 293 | #ifndef __AARCH64EB__ 294 | rev x11,x11 // 8 295 | #endif 296 | add x20,x20,x17 // h+=Sigma0(a) 297 | ror x16,x24,#14 298 | add x27,x27,x19 // h+=K[i] 299 | eor x14,x24,x24,ror#23 300 | and x17,x25,x24 301 | bic x19,x26,x24 302 | add 
x27,x27,x11 // h+=X[i] 303 | orr x17,x17,x19 // Ch(e,f,g) 304 | eor x19,x20,x21 // a^b, b^c in next round 305 | eor x16,x16,x14,ror#18 // Sigma1(e) 306 | ror x14,x20,#28 307 | add x27,x27,x17 // h+=Ch(e,f,g) 308 | eor x17,x20,x20,ror#5 309 | add x27,x27,x16 // h+=Sigma1(e) 310 | and x28,x28,x19 // (b^c)&=(a^b) 311 | add x23,x23,x27 // d+=h 312 | eor x28,x28,x21 // Maj(a,b,c) 313 | eor x17,x14,x17,ror#34 // Sigma0(a) 314 | add x27,x27,x28 // h+=Maj(a,b,c) 315 | ldr x28,[x30],#8 // *K++, x19 in next round 316 | //add x27,x27,x17 // h+=Sigma0(a) 317 | #ifndef __AARCH64EB__ 318 | rev x12,x12 // 9 319 | #endif 320 | ldp x13,x14,[x1],#2*8 321 | add x27,x27,x17 // h+=Sigma0(a) 322 | ror x16,x23,#14 323 | add x26,x26,x28 // h+=K[i] 324 | eor x15,x23,x23,ror#23 325 | and x17,x24,x23 326 | bic x28,x25,x23 327 | add x26,x26,x12 // h+=X[i] 328 | orr x17,x17,x28 // Ch(e,f,g) 329 | eor x28,x27,x20 // a^b, b^c in next round 330 | eor x16,x16,x15,ror#18 // Sigma1(e) 331 | ror x15,x27,#28 332 | add x26,x26,x17 // h+=Ch(e,f,g) 333 | eor x17,x27,x27,ror#5 334 | add x26,x26,x16 // h+=Sigma1(e) 335 | and x19,x19,x28 // (b^c)&=(a^b) 336 | add x22,x22,x26 // d+=h 337 | eor x19,x19,x20 // Maj(a,b,c) 338 | eor x17,x15,x17,ror#34 // Sigma0(a) 339 | add x26,x26,x19 // h+=Maj(a,b,c) 340 | ldr x19,[x30],#8 // *K++, x28 in next round 341 | //add x26,x26,x17 // h+=Sigma0(a) 342 | #ifndef __AARCH64EB__ 343 | rev x13,x13 // 10 344 | #endif 345 | add x26,x26,x17 // h+=Sigma0(a) 346 | ror x16,x22,#14 347 | add x25,x25,x19 // h+=K[i] 348 | eor x0,x22,x22,ror#23 349 | and x17,x23,x22 350 | bic x19,x24,x22 351 | add x25,x25,x13 // h+=X[i] 352 | orr x17,x17,x19 // Ch(e,f,g) 353 | eor x19,x26,x27 // a^b, b^c in next round 354 | eor x16,x16,x0,ror#18 // Sigma1(e) 355 | ror x0,x26,#28 356 | add x25,x25,x17 // h+=Ch(e,f,g) 357 | eor x17,x26,x26,ror#5 358 | add x25,x25,x16 // h+=Sigma1(e) 359 | and x28,x28,x19 // (b^c)&=(a^b) 360 | add x21,x21,x25 // d+=h 361 | eor x28,x28,x27 // Maj(a,b,c) 362 | eor x17,x0,x17,ror#34 // Sigma0(a) 363 | add x25,x25,x28 // h+=Maj(a,b,c) 364 | ldr x28,[x30],#8 // *K++, x19 in next round 365 | //add x25,x25,x17 // h+=Sigma0(a) 366 | #ifndef __AARCH64EB__ 367 | rev x14,x14 // 11 368 | #endif 369 | ldp x15,x0,[x1],#2*8 370 | add x25,x25,x17 // h+=Sigma0(a) 371 | str x6,[sp,#24] 372 | ror x16,x21,#14 373 | add x24,x24,x28 // h+=K[i] 374 | eor x6,x21,x21,ror#23 375 | and x17,x22,x21 376 | bic x28,x23,x21 377 | add x24,x24,x14 // h+=X[i] 378 | orr x17,x17,x28 // Ch(e,f,g) 379 | eor x28,x25,x26 // a^b, b^c in next round 380 | eor x16,x16,x6,ror#18 // Sigma1(e) 381 | ror x6,x25,#28 382 | add x24,x24,x17 // h+=Ch(e,f,g) 383 | eor x17,x25,x25,ror#5 384 | add x24,x24,x16 // h+=Sigma1(e) 385 | and x19,x19,x28 // (b^c)&=(a^b) 386 | add x20,x20,x24 // d+=h 387 | eor x19,x19,x26 // Maj(a,b,c) 388 | eor x17,x6,x17,ror#34 // Sigma0(a) 389 | add x24,x24,x19 // h+=Maj(a,b,c) 390 | ldr x19,[x30],#8 // *K++, x28 in next round 391 | //add x24,x24,x17 // h+=Sigma0(a) 392 | #ifndef __AARCH64EB__ 393 | rev x15,x15 // 12 394 | #endif 395 | add x24,x24,x17 // h+=Sigma0(a) 396 | str x7,[sp,#0] 397 | ror x16,x20,#14 398 | add x23,x23,x19 // h+=K[i] 399 | eor x7,x20,x20,ror#23 400 | and x17,x21,x20 401 | bic x19,x22,x20 402 | add x23,x23,x15 // h+=X[i] 403 | orr x17,x17,x19 // Ch(e,f,g) 404 | eor x19,x24,x25 // a^b, b^c in next round 405 | eor x16,x16,x7,ror#18 // Sigma1(e) 406 | ror x7,x24,#28 407 | add x23,x23,x17 // h+=Ch(e,f,g) 408 | eor x17,x24,x24,ror#5 409 | add x23,x23,x16 // h+=Sigma1(e) 410 | and x28,x28,x19 // 
(b^c)&=(a^b) 411 | add x27,x27,x23 // d+=h 412 | eor x28,x28,x25 // Maj(a,b,c) 413 | eor x17,x7,x17,ror#34 // Sigma0(a) 414 | add x23,x23,x28 // h+=Maj(a,b,c) 415 | ldr x28,[x30],#8 // *K++, x19 in next round 416 | //add x23,x23,x17 // h+=Sigma0(a) 417 | #ifndef __AARCH64EB__ 418 | rev x0,x0 // 13 419 | #endif 420 | ldp x1,x2,[x1] 421 | add x23,x23,x17 // h+=Sigma0(a) 422 | str x8,[sp,#8] 423 | ror x16,x27,#14 424 | add x22,x22,x28 // h+=K[i] 425 | eor x8,x27,x27,ror#23 426 | and x17,x20,x27 427 | bic x28,x21,x27 428 | add x22,x22,x0 // h+=X[i] 429 | orr x17,x17,x28 // Ch(e,f,g) 430 | eor x28,x23,x24 // a^b, b^c in next round 431 | eor x16,x16,x8,ror#18 // Sigma1(e) 432 | ror x8,x23,#28 433 | add x22,x22,x17 // h+=Ch(e,f,g) 434 | eor x17,x23,x23,ror#5 435 | add x22,x22,x16 // h+=Sigma1(e) 436 | and x19,x19,x28 // (b^c)&=(a^b) 437 | add x26,x26,x22 // d+=h 438 | eor x19,x19,x24 // Maj(a,b,c) 439 | eor x17,x8,x17,ror#34 // Sigma0(a) 440 | add x22,x22,x19 // h+=Maj(a,b,c) 441 | ldr x19,[x30],#8 // *K++, x28 in next round 442 | //add x22,x22,x17 // h+=Sigma0(a) 443 | #ifndef __AARCH64EB__ 444 | rev x1,x1 // 14 445 | #endif 446 | ldr x6,[sp,#24] 447 | add x22,x22,x17 // h+=Sigma0(a) 448 | str x9,[sp,#16] 449 | ror x16,x26,#14 450 | add x21,x21,x19 // h+=K[i] 451 | eor x9,x26,x26,ror#23 452 | and x17,x27,x26 453 | bic x19,x20,x26 454 | add x21,x21,x1 // h+=X[i] 455 | orr x17,x17,x19 // Ch(e,f,g) 456 | eor x19,x22,x23 // a^b, b^c in next round 457 | eor x16,x16,x9,ror#18 // Sigma1(e) 458 | ror x9,x22,#28 459 | add x21,x21,x17 // h+=Ch(e,f,g) 460 | eor x17,x22,x22,ror#5 461 | add x21,x21,x16 // h+=Sigma1(e) 462 | and x28,x28,x19 // (b^c)&=(a^b) 463 | add x25,x25,x21 // d+=h 464 | eor x28,x28,x23 // Maj(a,b,c) 465 | eor x17,x9,x17,ror#34 // Sigma0(a) 466 | add x21,x21,x28 // h+=Maj(a,b,c) 467 | ldr x28,[x30],#8 // *K++, x19 in next round 468 | //add x21,x21,x17 // h+=Sigma0(a) 469 | #ifndef __AARCH64EB__ 470 | rev x2,x2 // 15 471 | #endif 472 | ldr x7,[sp,#0] 473 | add x21,x21,x17 // h+=Sigma0(a) 474 | str x10,[sp,#24] 475 | ror x16,x25,#14 476 | add x20,x20,x28 // h+=K[i] 477 | ror x9,x4,#1 478 | and x17,x26,x25 479 | ror x8,x1,#19 480 | bic x28,x27,x25 481 | ror x10,x21,#28 482 | add x20,x20,x2 // h+=X[i] 483 | eor x16,x16,x25,ror#18 484 | eor x9,x9,x4,ror#8 485 | orr x17,x17,x28 // Ch(e,f,g) 486 | eor x28,x21,x22 // a^b, b^c in next round 487 | eor x16,x16,x25,ror#41 // Sigma1(e) 488 | eor x10,x10,x21,ror#34 489 | add x20,x20,x17 // h+=Ch(e,f,g) 490 | and x19,x19,x28 // (b^c)&=(a^b) 491 | eor x8,x8,x1,ror#61 492 | eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) 493 | add x20,x20,x16 // h+=Sigma1(e) 494 | eor x19,x19,x22 // Maj(a,b,c) 495 | eor x17,x10,x21,ror#39 // Sigma0(a) 496 | eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) 497 | add x3,x3,x12 498 | add x24,x24,x20 // d+=h 499 | add x20,x20,x19 // h+=Maj(a,b,c) 500 | ldr x19,[x30],#8 // *K++, x28 in next round 501 | add x3,x3,x9 502 | add x20,x20,x17 // h+=Sigma0(a) 503 | add x3,x3,x8 504 | .Loop_16_xx: 505 | ldr x8,[sp,#8] 506 | str x11,[sp,#0] 507 | ror x16,x24,#14 508 | add x27,x27,x19 // h+=K[i] 509 | ror x10,x5,#1 510 | and x17,x25,x24 511 | ror x9,x2,#19 512 | bic x19,x26,x24 513 | ror x11,x20,#28 514 | add x27,x27,x3 // h+=X[i] 515 | eor x16,x16,x24,ror#18 516 | eor x10,x10,x5,ror#8 517 | orr x17,x17,x19 // Ch(e,f,g) 518 | eor x19,x20,x21 // a^b, b^c in next round 519 | eor x16,x16,x24,ror#41 // Sigma1(e) 520 | eor x11,x11,x20,ror#34 521 | add x27,x27,x17 // h+=Ch(e,f,g) 522 | and x28,x28,x19 // (b^c)&=(a^b) 523 | eor x9,x9,x2,ror#61 524 | eor 
x10,x10,x5,lsr#7 // sigma0(X[i+1]) 525 | add x27,x27,x16 // h+=Sigma1(e) 526 | eor x28,x28,x21 // Maj(a,b,c) 527 | eor x17,x11,x20,ror#39 // Sigma0(a) 528 | eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) 529 | add x4,x4,x13 530 | add x23,x23,x27 // d+=h 531 | add x27,x27,x28 // h+=Maj(a,b,c) 532 | ldr x28,[x30],#8 // *K++, x19 in next round 533 | add x4,x4,x10 534 | add x27,x27,x17 // h+=Sigma0(a) 535 | add x4,x4,x9 536 | ldr x9,[sp,#16] 537 | str x12,[sp,#8] 538 | ror x16,x23,#14 539 | add x26,x26,x28 // h+=K[i] 540 | ror x11,x6,#1 541 | and x17,x24,x23 542 | ror x10,x3,#19 543 | bic x28,x25,x23 544 | ror x12,x27,#28 545 | add x26,x26,x4 // h+=X[i] 546 | eor x16,x16,x23,ror#18 547 | eor x11,x11,x6,ror#8 548 | orr x17,x17,x28 // Ch(e,f,g) 549 | eor x28,x27,x20 // a^b, b^c in next round 550 | eor x16,x16,x23,ror#41 // Sigma1(e) 551 | eor x12,x12,x27,ror#34 552 | add x26,x26,x17 // h+=Ch(e,f,g) 553 | and x19,x19,x28 // (b^c)&=(a^b) 554 | eor x10,x10,x3,ror#61 555 | eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) 556 | add x26,x26,x16 // h+=Sigma1(e) 557 | eor x19,x19,x20 // Maj(a,b,c) 558 | eor x17,x12,x27,ror#39 // Sigma0(a) 559 | eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) 560 | add x5,x5,x14 561 | add x22,x22,x26 // d+=h 562 | add x26,x26,x19 // h+=Maj(a,b,c) 563 | ldr x19,[x30],#8 // *K++, x28 in next round 564 | add x5,x5,x11 565 | add x26,x26,x17 // h+=Sigma0(a) 566 | add x5,x5,x10 567 | ldr x10,[sp,#24] 568 | str x13,[sp,#16] 569 | ror x16,x22,#14 570 | add x25,x25,x19 // h+=K[i] 571 | ror x12,x7,#1 572 | and x17,x23,x22 573 | ror x11,x4,#19 574 | bic x19,x24,x22 575 | ror x13,x26,#28 576 | add x25,x25,x5 // h+=X[i] 577 | eor x16,x16,x22,ror#18 578 | eor x12,x12,x7,ror#8 579 | orr x17,x17,x19 // Ch(e,f,g) 580 | eor x19,x26,x27 // a^b, b^c in next round 581 | eor x16,x16,x22,ror#41 // Sigma1(e) 582 | eor x13,x13,x26,ror#34 583 | add x25,x25,x17 // h+=Ch(e,f,g) 584 | and x28,x28,x19 // (b^c)&=(a^b) 585 | eor x11,x11,x4,ror#61 586 | eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) 587 | add x25,x25,x16 // h+=Sigma1(e) 588 | eor x28,x28,x27 // Maj(a,b,c) 589 | eor x17,x13,x26,ror#39 // Sigma0(a) 590 | eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) 591 | add x6,x6,x15 592 | add x21,x21,x25 // d+=h 593 | add x25,x25,x28 // h+=Maj(a,b,c) 594 | ldr x28,[x30],#8 // *K++, x19 in next round 595 | add x6,x6,x12 596 | add x25,x25,x17 // h+=Sigma0(a) 597 | add x6,x6,x11 598 | ldr x11,[sp,#0] 599 | str x14,[sp,#24] 600 | ror x16,x21,#14 601 | add x24,x24,x28 // h+=K[i] 602 | ror x13,x8,#1 603 | and x17,x22,x21 604 | ror x12,x5,#19 605 | bic x28,x23,x21 606 | ror x14,x25,#28 607 | add x24,x24,x6 // h+=X[i] 608 | eor x16,x16,x21,ror#18 609 | eor x13,x13,x8,ror#8 610 | orr x17,x17,x28 // Ch(e,f,g) 611 | eor x28,x25,x26 // a^b, b^c in next round 612 | eor x16,x16,x21,ror#41 // Sigma1(e) 613 | eor x14,x14,x25,ror#34 614 | add x24,x24,x17 // h+=Ch(e,f,g) 615 | and x19,x19,x28 // (b^c)&=(a^b) 616 | eor x12,x12,x5,ror#61 617 | eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) 618 | add x24,x24,x16 // h+=Sigma1(e) 619 | eor x19,x19,x26 // Maj(a,b,c) 620 | eor x17,x14,x25,ror#39 // Sigma0(a) 621 | eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) 622 | add x7,x7,x0 623 | add x20,x20,x24 // d+=h 624 | add x24,x24,x19 // h+=Maj(a,b,c) 625 | ldr x19,[x30],#8 // *K++, x28 in next round 626 | add x7,x7,x13 627 | add x24,x24,x17 // h+=Sigma0(a) 628 | add x7,x7,x12 629 | ldr x12,[sp,#8] 630 | str x15,[sp,#0] 631 | ror x16,x20,#14 632 | add x23,x23,x19 // h+=K[i] 633 | ror x14,x9,#1 634 | and x17,x21,x20 635 | ror x13,x6,#19 636 | bic x19,x22,x20 637 | ror x15,x24,#28 638 | 
add x23,x23,x7 // h+=X[i] 639 | eor x16,x16,x20,ror#18 640 | eor x14,x14,x9,ror#8 641 | orr x17,x17,x19 // Ch(e,f,g) 642 | eor x19,x24,x25 // a^b, b^c in next round 643 | eor x16,x16,x20,ror#41 // Sigma1(e) 644 | eor x15,x15,x24,ror#34 645 | add x23,x23,x17 // h+=Ch(e,f,g) 646 | and x28,x28,x19 // (b^c)&=(a^b) 647 | eor x13,x13,x6,ror#61 648 | eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) 649 | add x23,x23,x16 // h+=Sigma1(e) 650 | eor x28,x28,x25 // Maj(a,b,c) 651 | eor x17,x15,x24,ror#39 // Sigma0(a) 652 | eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) 653 | add x8,x8,x1 654 | add x27,x27,x23 // d+=h 655 | add x23,x23,x28 // h+=Maj(a,b,c) 656 | ldr x28,[x30],#8 // *K++, x19 in next round 657 | add x8,x8,x14 658 | add x23,x23,x17 // h+=Sigma0(a) 659 | add x8,x8,x13 660 | ldr x13,[sp,#16] 661 | str x0,[sp,#8] 662 | ror x16,x27,#14 663 | add x22,x22,x28 // h+=K[i] 664 | ror x15,x10,#1 665 | and x17,x20,x27 666 | ror x14,x7,#19 667 | bic x28,x21,x27 668 | ror x0,x23,#28 669 | add x22,x22,x8 // h+=X[i] 670 | eor x16,x16,x27,ror#18 671 | eor x15,x15,x10,ror#8 672 | orr x17,x17,x28 // Ch(e,f,g) 673 | eor x28,x23,x24 // a^b, b^c in next round 674 | eor x16,x16,x27,ror#41 // Sigma1(e) 675 | eor x0,x0,x23,ror#34 676 | add x22,x22,x17 // h+=Ch(e,f,g) 677 | and x19,x19,x28 // (b^c)&=(a^b) 678 | eor x14,x14,x7,ror#61 679 | eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) 680 | add x22,x22,x16 // h+=Sigma1(e) 681 | eor x19,x19,x24 // Maj(a,b,c) 682 | eor x17,x0,x23,ror#39 // Sigma0(a) 683 | eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) 684 | add x9,x9,x2 685 | add x26,x26,x22 // d+=h 686 | add x22,x22,x19 // h+=Maj(a,b,c) 687 | ldr x19,[x30],#8 // *K++, x28 in next round 688 | add x9,x9,x15 689 | add x22,x22,x17 // h+=Sigma0(a) 690 | add x9,x9,x14 691 | ldr x14,[sp,#24] 692 | str x1,[sp,#16] 693 | ror x16,x26,#14 694 | add x21,x21,x19 // h+=K[i] 695 | ror x0,x11,#1 696 | and x17,x27,x26 697 | ror x15,x8,#19 698 | bic x19,x20,x26 699 | ror x1,x22,#28 700 | add x21,x21,x9 // h+=X[i] 701 | eor x16,x16,x26,ror#18 702 | eor x0,x0,x11,ror#8 703 | orr x17,x17,x19 // Ch(e,f,g) 704 | eor x19,x22,x23 // a^b, b^c in next round 705 | eor x16,x16,x26,ror#41 // Sigma1(e) 706 | eor x1,x1,x22,ror#34 707 | add x21,x21,x17 // h+=Ch(e,f,g) 708 | and x28,x28,x19 // (b^c)&=(a^b) 709 | eor x15,x15,x8,ror#61 710 | eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) 711 | add x21,x21,x16 // h+=Sigma1(e) 712 | eor x28,x28,x23 // Maj(a,b,c) 713 | eor x17,x1,x22,ror#39 // Sigma0(a) 714 | eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) 715 | add x10,x10,x3 716 | add x25,x25,x21 // d+=h 717 | add x21,x21,x28 // h+=Maj(a,b,c) 718 | ldr x28,[x30],#8 // *K++, x19 in next round 719 | add x10,x10,x0 720 | add x21,x21,x17 // h+=Sigma0(a) 721 | add x10,x10,x15 722 | ldr x15,[sp,#0] 723 | str x2,[sp,#24] 724 | ror x16,x25,#14 725 | add x20,x20,x28 // h+=K[i] 726 | ror x1,x12,#1 727 | and x17,x26,x25 728 | ror x0,x9,#19 729 | bic x28,x27,x25 730 | ror x2,x21,#28 731 | add x20,x20,x10 // h+=X[i] 732 | eor x16,x16,x25,ror#18 733 | eor x1,x1,x12,ror#8 734 | orr x17,x17,x28 // Ch(e,f,g) 735 | eor x28,x21,x22 // a^b, b^c in next round 736 | eor x16,x16,x25,ror#41 // Sigma1(e) 737 | eor x2,x2,x21,ror#34 738 | add x20,x20,x17 // h+=Ch(e,f,g) 739 | and x19,x19,x28 // (b^c)&=(a^b) 740 | eor x0,x0,x9,ror#61 741 | eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) 742 | add x20,x20,x16 // h+=Sigma1(e) 743 | eor x19,x19,x22 // Maj(a,b,c) 744 | eor x17,x2,x21,ror#39 // Sigma0(a) 745 | eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) 746 | add x11,x11,x4 747 | add x24,x24,x20 // d+=h 748 | add x20,x20,x19 // h+=Maj(a,b,c) 749 
| ldr x19,[x30],#8 // *K++, x28 in next round 750 | add x11,x11,x1 751 | add x20,x20,x17 // h+=Sigma0(a) 752 | add x11,x11,x0 753 | ldr x0,[sp,#8] 754 | str x3,[sp,#0] 755 | ror x16,x24,#14 756 | add x27,x27,x19 // h+=K[i] 757 | ror x2,x13,#1 758 | and x17,x25,x24 759 | ror x1,x10,#19 760 | bic x19,x26,x24 761 | ror x3,x20,#28 762 | add x27,x27,x11 // h+=X[i] 763 | eor x16,x16,x24,ror#18 764 | eor x2,x2,x13,ror#8 765 | orr x17,x17,x19 // Ch(e,f,g) 766 | eor x19,x20,x21 // a^b, b^c in next round 767 | eor x16,x16,x24,ror#41 // Sigma1(e) 768 | eor x3,x3,x20,ror#34 769 | add x27,x27,x17 // h+=Ch(e,f,g) 770 | and x28,x28,x19 // (b^c)&=(a^b) 771 | eor x1,x1,x10,ror#61 772 | eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) 773 | add x27,x27,x16 // h+=Sigma1(e) 774 | eor x28,x28,x21 // Maj(a,b,c) 775 | eor x17,x3,x20,ror#39 // Sigma0(a) 776 | eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) 777 | add x12,x12,x5 778 | add x23,x23,x27 // d+=h 779 | add x27,x27,x28 // h+=Maj(a,b,c) 780 | ldr x28,[x30],#8 // *K++, x19 in next round 781 | add x12,x12,x2 782 | add x27,x27,x17 // h+=Sigma0(a) 783 | add x12,x12,x1 784 | ldr x1,[sp,#16] 785 | str x4,[sp,#8] 786 | ror x16,x23,#14 787 | add x26,x26,x28 // h+=K[i] 788 | ror x3,x14,#1 789 | and x17,x24,x23 790 | ror x2,x11,#19 791 | bic x28,x25,x23 792 | ror x4,x27,#28 793 | add x26,x26,x12 // h+=X[i] 794 | eor x16,x16,x23,ror#18 795 | eor x3,x3,x14,ror#8 796 | orr x17,x17,x28 // Ch(e,f,g) 797 | eor x28,x27,x20 // a^b, b^c in next round 798 | eor x16,x16,x23,ror#41 // Sigma1(e) 799 | eor x4,x4,x27,ror#34 800 | add x26,x26,x17 // h+=Ch(e,f,g) 801 | and x19,x19,x28 // (b^c)&=(a^b) 802 | eor x2,x2,x11,ror#61 803 | eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) 804 | add x26,x26,x16 // h+=Sigma1(e) 805 | eor x19,x19,x20 // Maj(a,b,c) 806 | eor x17,x4,x27,ror#39 // Sigma0(a) 807 | eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) 808 | add x13,x13,x6 809 | add x22,x22,x26 // d+=h 810 | add x26,x26,x19 // h+=Maj(a,b,c) 811 | ldr x19,[x30],#8 // *K++, x28 in next round 812 | add x13,x13,x3 813 | add x26,x26,x17 // h+=Sigma0(a) 814 | add x13,x13,x2 815 | ldr x2,[sp,#24] 816 | str x5,[sp,#16] 817 | ror x16,x22,#14 818 | add x25,x25,x19 // h+=K[i] 819 | ror x4,x15,#1 820 | and x17,x23,x22 821 | ror x3,x12,#19 822 | bic x19,x24,x22 823 | ror x5,x26,#28 824 | add x25,x25,x13 // h+=X[i] 825 | eor x16,x16,x22,ror#18 826 | eor x4,x4,x15,ror#8 827 | orr x17,x17,x19 // Ch(e,f,g) 828 | eor x19,x26,x27 // a^b, b^c in next round 829 | eor x16,x16,x22,ror#41 // Sigma1(e) 830 | eor x5,x5,x26,ror#34 831 | add x25,x25,x17 // h+=Ch(e,f,g) 832 | and x28,x28,x19 // (b^c)&=(a^b) 833 | eor x3,x3,x12,ror#61 834 | eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) 835 | add x25,x25,x16 // h+=Sigma1(e) 836 | eor x28,x28,x27 // Maj(a,b,c) 837 | eor x17,x5,x26,ror#39 // Sigma0(a) 838 | eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) 839 | add x14,x14,x7 840 | add x21,x21,x25 // d+=h 841 | add x25,x25,x28 // h+=Maj(a,b,c) 842 | ldr x28,[x30],#8 // *K++, x19 in next round 843 | add x14,x14,x4 844 | add x25,x25,x17 // h+=Sigma0(a) 845 | add x14,x14,x3 846 | ldr x3,[sp,#0] 847 | str x6,[sp,#24] 848 | ror x16,x21,#14 849 | add x24,x24,x28 // h+=K[i] 850 | ror x5,x0,#1 851 | and x17,x22,x21 852 | ror x4,x13,#19 853 | bic x28,x23,x21 854 | ror x6,x25,#28 855 | add x24,x24,x14 // h+=X[i] 856 | eor x16,x16,x21,ror#18 857 | eor x5,x5,x0,ror#8 858 | orr x17,x17,x28 // Ch(e,f,g) 859 | eor x28,x25,x26 // a^b, b^c in next round 860 | eor x16,x16,x21,ror#41 // Sigma1(e) 861 | eor x6,x6,x25,ror#34 862 | add x24,x24,x17 // h+=Ch(e,f,g) 863 | and x19,x19,x28 // 
(b^c)&=(a^b) 864 | eor x4,x4,x13,ror#61 865 | eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) 866 | add x24,x24,x16 // h+=Sigma1(e) 867 | eor x19,x19,x26 // Maj(a,b,c) 868 | eor x17,x6,x25,ror#39 // Sigma0(a) 869 | eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) 870 | add x15,x15,x8 871 | add x20,x20,x24 // d+=h 872 | add x24,x24,x19 // h+=Maj(a,b,c) 873 | ldr x19,[x30],#8 // *K++, x28 in next round 874 | add x15,x15,x5 875 | add x24,x24,x17 // h+=Sigma0(a) 876 | add x15,x15,x4 877 | ldr x4,[sp,#8] 878 | str x7,[sp,#0] 879 | ror x16,x20,#14 880 | add x23,x23,x19 // h+=K[i] 881 | ror x6,x1,#1 882 | and x17,x21,x20 883 | ror x5,x14,#19 884 | bic x19,x22,x20 885 | ror x7,x24,#28 886 | add x23,x23,x15 // h+=X[i] 887 | eor x16,x16,x20,ror#18 888 | eor x6,x6,x1,ror#8 889 | orr x17,x17,x19 // Ch(e,f,g) 890 | eor x19,x24,x25 // a^b, b^c in next round 891 | eor x16,x16,x20,ror#41 // Sigma1(e) 892 | eor x7,x7,x24,ror#34 893 | add x23,x23,x17 // h+=Ch(e,f,g) 894 | and x28,x28,x19 // (b^c)&=(a^b) 895 | eor x5,x5,x14,ror#61 896 | eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) 897 | add x23,x23,x16 // h+=Sigma1(e) 898 | eor x28,x28,x25 // Maj(a,b,c) 899 | eor x17,x7,x24,ror#39 // Sigma0(a) 900 | eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) 901 | add x0,x0,x9 902 | add x27,x27,x23 // d+=h 903 | add x23,x23,x28 // h+=Maj(a,b,c) 904 | ldr x28,[x30],#8 // *K++, x19 in next round 905 | add x0,x0,x6 906 | add x23,x23,x17 // h+=Sigma0(a) 907 | add x0,x0,x5 908 | ldr x5,[sp,#16] 909 | str x8,[sp,#8] 910 | ror x16,x27,#14 911 | add x22,x22,x28 // h+=K[i] 912 | ror x7,x2,#1 913 | and x17,x20,x27 914 | ror x6,x15,#19 915 | bic x28,x21,x27 916 | ror x8,x23,#28 917 | add x22,x22,x0 // h+=X[i] 918 | eor x16,x16,x27,ror#18 919 | eor x7,x7,x2,ror#8 920 | orr x17,x17,x28 // Ch(e,f,g) 921 | eor x28,x23,x24 // a^b, b^c in next round 922 | eor x16,x16,x27,ror#41 // Sigma1(e) 923 | eor x8,x8,x23,ror#34 924 | add x22,x22,x17 // h+=Ch(e,f,g) 925 | and x19,x19,x28 // (b^c)&=(a^b) 926 | eor x6,x6,x15,ror#61 927 | eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) 928 | add x22,x22,x16 // h+=Sigma1(e) 929 | eor x19,x19,x24 // Maj(a,b,c) 930 | eor x17,x8,x23,ror#39 // Sigma0(a) 931 | eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) 932 | add x1,x1,x10 933 | add x26,x26,x22 // d+=h 934 | add x22,x22,x19 // h+=Maj(a,b,c) 935 | ldr x19,[x30],#8 // *K++, x28 in next round 936 | add x1,x1,x7 937 | add x22,x22,x17 // h+=Sigma0(a) 938 | add x1,x1,x6 939 | ldr x6,[sp,#24] 940 | str x9,[sp,#16] 941 | ror x16,x26,#14 942 | add x21,x21,x19 // h+=K[i] 943 | ror x8,x3,#1 944 | and x17,x27,x26 945 | ror x7,x0,#19 946 | bic x19,x20,x26 947 | ror x9,x22,#28 948 | add x21,x21,x1 // h+=X[i] 949 | eor x16,x16,x26,ror#18 950 | eor x8,x8,x3,ror#8 951 | orr x17,x17,x19 // Ch(e,f,g) 952 | eor x19,x22,x23 // a^b, b^c in next round 953 | eor x16,x16,x26,ror#41 // Sigma1(e) 954 | eor x9,x9,x22,ror#34 955 | add x21,x21,x17 // h+=Ch(e,f,g) 956 | and x28,x28,x19 // (b^c)&=(a^b) 957 | eor x7,x7,x0,ror#61 958 | eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) 959 | add x21,x21,x16 // h+=Sigma1(e) 960 | eor x28,x28,x23 // Maj(a,b,c) 961 | eor x17,x9,x22,ror#39 // Sigma0(a) 962 | eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) 963 | add x2,x2,x11 964 | add x25,x25,x21 // d+=h 965 | add x21,x21,x28 // h+=Maj(a,b,c) 966 | ldr x28,[x30],#8 // *K++, x19 in next round 967 | add x2,x2,x8 968 | add x21,x21,x17 // h+=Sigma0(a) 969 | add x2,x2,x7 970 | ldr x7,[sp,#0] 971 | str x10,[sp,#24] 972 | ror x16,x25,#14 973 | add x20,x20,x28 // h+=K[i] 974 | ror x9,x4,#1 975 | and x17,x26,x25 976 | ror x8,x1,#19 977 | bic x28,x27,x25 978 | ror x10,x21,#28 979 
| add x20,x20,x2 // h+=X[i] 980 | eor x16,x16,x25,ror#18 981 | eor x9,x9,x4,ror#8 982 | orr x17,x17,x28 // Ch(e,f,g) 983 | eor x28,x21,x22 // a^b, b^c in next round 984 | eor x16,x16,x25,ror#41 // Sigma1(e) 985 | eor x10,x10,x21,ror#34 986 | add x20,x20,x17 // h+=Ch(e,f,g) 987 | and x19,x19,x28 // (b^c)&=(a^b) 988 | eor x8,x8,x1,ror#61 989 | eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) 990 | add x20,x20,x16 // h+=Sigma1(e) 991 | eor x19,x19,x22 // Maj(a,b,c) 992 | eor x17,x10,x21,ror#39 // Sigma0(a) 993 | eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) 994 | add x3,x3,x12 995 | add x24,x24,x20 // d+=h 996 | add x20,x20,x19 // h+=Maj(a,b,c) 997 | ldr x19,[x30],#8 // *K++, x28 in next round 998 | add x3,x3,x9 999 | add x20,x20,x17 // h+=Sigma0(a) 1000 | add x3,x3,x8 1001 | cbnz x19,.Loop_16_xx 1002 | 1003 | ldp x0,x2,[x29,#96] 1004 | ldr x1,[x29,#112] 1005 | sub x30,x30,#648 // rewind 1006 | 1007 | ldp x3,x4,[x0] 1008 | ldp x5,x6,[x0,#2*8] 1009 | add x1,x1,#14*8 // advance input pointer 1010 | ldp x7,x8,[x0,#4*8] 1011 | add x20,x20,x3 1012 | ldp x9,x10,[x0,#6*8] 1013 | add x21,x21,x4 1014 | add x22,x22,x5 1015 | add x23,x23,x6 1016 | stp x20,x21,[x0] 1017 | add x24,x24,x7 1018 | add x25,x25,x8 1019 | stp x22,x23,[x0,#2*8] 1020 | add x26,x26,x9 1021 | add x27,x27,x10 1022 | cmp x1,x2 1023 | stp x24,x25,[x0,#4*8] 1024 | stp x26,x27,[x0,#6*8] 1025 | b.ne .Loop 1026 | 1027 | ldp x19,x20,[x29,#16] 1028 | add sp,sp,#4*8 1029 | ldp x21,x22,[x29,#32] 1030 | ldp x23,x24,[x29,#48] 1031 | ldp x25,x26,[x29,#64] 1032 | ldp x27,x28,[x29,#80] 1033 | ldp x29,x30,[sp],#128 1034 | .inst 0xd50323bf // autiasp 1035 | ret 1036 | .size sha512_block_data_order_local,.-sha512_block_data_order_local 1037 | 1038 | .align 6 1039 | .type .LK512,%object 1040 | .LK512: 1041 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd 1042 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 1043 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 1044 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 1045 | .quad 0xd807aa98a3030242,0x12835b0145706fbe 1046 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 1047 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 1048 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 1049 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 1050 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 1051 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 1052 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 1053 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 1054 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 1055 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 1056 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 1057 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 1058 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 1059 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 1060 | .quad 0x81c2c92e47edaee6,0x92722c851482353b 1061 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 1062 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 1063 | .quad 0xd192e819d6ef5218,0xd69906245565a910 1064 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 1065 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 1066 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 1067 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 1068 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 1069 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 1070 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 1071 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 1072 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 1073 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 1074 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 1075 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 1076 | .quad 
0x113f9804bef90dae,0x1b710b35131c471b 1077 | .quad 0x28db77f523047d84,0x32caab7b40c72493 1078 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 1079 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 1080 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 1081 | .quad 0 // terminator 1082 | .size .LK512,.-.LK512 1083 | .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 1084 | .align 2 1085 | .align 2 1086 | #ifndef __KERNEL__ 1087 | .type sha512_block_armv8,%function 1088 | .align 6 1089 | sha512_block_armv8: 1090 | .Lv8_entry: 1091 | stp x29,x30,[sp,#-16]! 1092 | add x29,sp,#0 1093 | 1094 | ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input 1095 | ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 1096 | 1097 | ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context 1098 | adr x3,.LK512 1099 | 1100 | rev64 v16.16b,v16.16b 1101 | rev64 v17.16b,v17.16b 1102 | rev64 v18.16b,v18.16b 1103 | rev64 v19.16b,v19.16b 1104 | rev64 v20.16b,v20.16b 1105 | rev64 v21.16b,v21.16b 1106 | rev64 v22.16b,v22.16b 1107 | rev64 v23.16b,v23.16b 1108 | b .Loop_hw 1109 | 1110 | .align 4 1111 | .Loop_hw: 1112 | ld1 {v24.2d},[x3],#16 1113 | subs x2,x2,#1 1114 | sub x4,x1,#128 1115 | orr v26.16b,v0.16b,v0.16b // offload 1116 | orr v27.16b,v1.16b,v1.16b 1117 | orr v28.16b,v2.16b,v2.16b 1118 | orr v29.16b,v3.16b,v3.16b 1119 | csel x1,x1,x4,ne // conditional rewind 1120 | add v24.2d,v24.2d,v16.2d 1121 | ld1 {v25.2d},[x3],#16 1122 | ext v24.16b,v24.16b,v24.16b,#8 1123 | ext v5.16b,v2.16b,v3.16b,#8 1124 | ext v6.16b,v1.16b,v2.16b,#8 1125 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1126 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1127 | ext v7.16b,v20.16b,v21.16b,#8 1128 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1129 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1130 | add v4.2d,v1.2d,v3.2d // "D + T1" 1131 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1132 | add v25.2d,v25.2d,v17.2d 1133 | ld1 {v24.2d},[x3],#16 1134 | ext v25.16b,v25.16b,v25.16b,#8 1135 | ext v5.16b,v4.16b,v2.16b,#8 1136 | ext v6.16b,v0.16b,v4.16b,#8 1137 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1138 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1139 | ext v7.16b,v21.16b,v22.16b,#8 1140 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1141 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1142 | add v1.2d,v0.2d,v2.2d // "D + T1" 1143 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1144 | add v24.2d,v24.2d,v18.2d 1145 | ld1 {v25.2d},[x3],#16 1146 | ext v24.16b,v24.16b,v24.16b,#8 1147 | ext v5.16b,v1.16b,v4.16b,#8 1148 | ext v6.16b,v3.16b,v1.16b,#8 1149 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1150 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1151 | ext v7.16b,v22.16b,v23.16b,#8 1152 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1153 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1154 | add v0.2d,v3.2d,v4.2d // "D + T1" 1155 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1156 | add v25.2d,v25.2d,v19.2d 1157 | ld1 {v24.2d},[x3],#16 1158 | ext v25.16b,v25.16b,v25.16b,#8 1159 | ext v5.16b,v0.16b,v1.16b,#8 1160 | ext v6.16b,v2.16b,v0.16b,#8 1161 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1162 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1163 | ext v7.16b,v23.16b,v16.16b,#8 1164 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1165 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1166 | add v3.2d,v2.2d,v1.2d // "D 
+ T1" 1167 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1168 | add v24.2d,v24.2d,v20.2d 1169 | ld1 {v25.2d},[x3],#16 1170 | ext v24.16b,v24.16b,v24.16b,#8 1171 | ext v5.16b,v3.16b,v0.16b,#8 1172 | ext v6.16b,v4.16b,v3.16b,#8 1173 | add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1174 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1175 | ext v7.16b,v16.16b,v17.16b,#8 1176 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1177 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1178 | add v2.2d,v4.2d,v0.2d // "D + T1" 1179 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1180 | add v25.2d,v25.2d,v21.2d 1181 | ld1 {v24.2d},[x3],#16 1182 | ext v25.16b,v25.16b,v25.16b,#8 1183 | ext v5.16b,v2.16b,v3.16b,#8 1184 | ext v6.16b,v1.16b,v2.16b,#8 1185 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1186 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1187 | ext v7.16b,v17.16b,v18.16b,#8 1188 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1189 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1190 | add v4.2d,v1.2d,v3.2d // "D + T1" 1191 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1192 | add v24.2d,v24.2d,v22.2d 1193 | ld1 {v25.2d},[x3],#16 1194 | ext v24.16b,v24.16b,v24.16b,#8 1195 | ext v5.16b,v4.16b,v2.16b,#8 1196 | ext v6.16b,v0.16b,v4.16b,#8 1197 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1198 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1199 | ext v7.16b,v18.16b,v19.16b,#8 1200 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1201 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1202 | add v1.2d,v0.2d,v2.2d // "D + T1" 1203 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1204 | add v25.2d,v25.2d,v23.2d 1205 | ld1 {v24.2d},[x3],#16 1206 | ext v25.16b,v25.16b,v25.16b,#8 1207 | ext v5.16b,v1.16b,v4.16b,#8 1208 | ext v6.16b,v3.16b,v1.16b,#8 1209 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1210 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1211 | ext v7.16b,v19.16b,v20.16b,#8 1212 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1213 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1214 | add v0.2d,v3.2d,v4.2d // "D + T1" 1215 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1216 | add v24.2d,v24.2d,v16.2d 1217 | ld1 {v25.2d},[x3],#16 1218 | ext v24.16b,v24.16b,v24.16b,#8 1219 | ext v5.16b,v0.16b,v1.16b,#8 1220 | ext v6.16b,v2.16b,v0.16b,#8 1221 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1222 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1223 | ext v7.16b,v20.16b,v21.16b,#8 1224 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1225 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1226 | add v3.2d,v2.2d,v1.2d // "D + T1" 1227 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1228 | add v25.2d,v25.2d,v17.2d 1229 | ld1 {v24.2d},[x3],#16 1230 | ext v25.16b,v25.16b,v25.16b,#8 1231 | ext v5.16b,v3.16b,v0.16b,#8 1232 | ext v6.16b,v4.16b,v3.16b,#8 1233 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1234 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1235 | ext v7.16b,v21.16b,v22.16b,#8 1236 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1237 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1238 | add v2.2d,v4.2d,v0.2d // "D + T1" 1239 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1240 | add v24.2d,v24.2d,v18.2d 1241 | ld1 {v25.2d},[x3],#16 1242 | ext v24.16b,v24.16b,v24.16b,#8 1243 | ext v5.16b,v2.16b,v3.16b,#8 1244 | ext v6.16b,v1.16b,v2.16b,#8 1245 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1246 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1247 | ext v7.16b,v22.16b,v23.16b,#8 1248 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 
1249 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1250 | add v4.2d,v1.2d,v3.2d // "D + T1" 1251 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1252 | add v25.2d,v25.2d,v19.2d 1253 | ld1 {v24.2d},[x3],#16 1254 | ext v25.16b,v25.16b,v25.16b,#8 1255 | ext v5.16b,v4.16b,v2.16b,#8 1256 | ext v6.16b,v0.16b,v4.16b,#8 1257 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1258 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1259 | ext v7.16b,v23.16b,v16.16b,#8 1260 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1261 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1262 | add v1.2d,v0.2d,v2.2d // "D + T1" 1263 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1264 | add v24.2d,v24.2d,v20.2d 1265 | ld1 {v25.2d},[x3],#16 1266 | ext v24.16b,v24.16b,v24.16b,#8 1267 | ext v5.16b,v1.16b,v4.16b,#8 1268 | ext v6.16b,v3.16b,v1.16b,#8 1269 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1270 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1271 | ext v7.16b,v16.16b,v17.16b,#8 1272 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1273 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1274 | add v0.2d,v3.2d,v4.2d // "D + T1" 1275 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1276 | add v25.2d,v25.2d,v21.2d 1277 | ld1 {v24.2d},[x3],#16 1278 | ext v25.16b,v25.16b,v25.16b,#8 1279 | ext v5.16b,v0.16b,v1.16b,#8 1280 | ext v6.16b,v2.16b,v0.16b,#8 1281 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1282 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1283 | ext v7.16b,v17.16b,v18.16b,#8 1284 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1285 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1286 | add v3.2d,v2.2d,v1.2d // "D + T1" 1287 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1288 | add v24.2d,v24.2d,v22.2d 1289 | ld1 {v25.2d},[x3],#16 1290 | ext v24.16b,v24.16b,v24.16b,#8 1291 | ext v5.16b,v3.16b,v0.16b,#8 1292 | ext v6.16b,v4.16b,v3.16b,#8 1293 | add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1294 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1295 | ext v7.16b,v18.16b,v19.16b,#8 1296 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1297 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1298 | add v2.2d,v4.2d,v0.2d // "D + T1" 1299 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1300 | add v25.2d,v25.2d,v23.2d 1301 | ld1 {v24.2d},[x3],#16 1302 | ext v25.16b,v25.16b,v25.16b,#8 1303 | ext v5.16b,v2.16b,v3.16b,#8 1304 | ext v6.16b,v1.16b,v2.16b,#8 1305 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1306 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1307 | ext v7.16b,v19.16b,v20.16b,#8 1308 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1309 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1310 | add v4.2d,v1.2d,v3.2d // "D + T1" 1311 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1312 | add v24.2d,v24.2d,v16.2d 1313 | ld1 {v25.2d},[x3],#16 1314 | ext v24.16b,v24.16b,v24.16b,#8 1315 | ext v5.16b,v4.16b,v2.16b,#8 1316 | ext v6.16b,v0.16b,v4.16b,#8 1317 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1318 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1319 | ext v7.16b,v20.16b,v21.16b,#8 1320 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1321 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1322 | add v1.2d,v0.2d,v2.2d // "D + T1" 1323 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1324 | add v25.2d,v25.2d,v17.2d 1325 | ld1 {v24.2d},[x3],#16 1326 | ext v25.16b,v25.16b,v25.16b,#8 1327 | ext v5.16b,v1.16b,v4.16b,#8 1328 | ext v6.16b,v3.16b,v1.16b,#8 1329 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1330 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 
1331 | ext v7.16b,v21.16b,v22.16b,#8 1332 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1333 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1334 | add v0.2d,v3.2d,v4.2d // "D + T1" 1335 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1336 | add v24.2d,v24.2d,v18.2d 1337 | ld1 {v25.2d},[x3],#16 1338 | ext v24.16b,v24.16b,v24.16b,#8 1339 | ext v5.16b,v0.16b,v1.16b,#8 1340 | ext v6.16b,v2.16b,v0.16b,#8 1341 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1342 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1343 | ext v7.16b,v22.16b,v23.16b,#8 1344 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1345 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1346 | add v3.2d,v2.2d,v1.2d // "D + T1" 1347 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1348 | add v25.2d,v25.2d,v19.2d 1349 | ld1 {v24.2d},[x3],#16 1350 | ext v25.16b,v25.16b,v25.16b,#8 1351 | ext v5.16b,v3.16b,v0.16b,#8 1352 | ext v6.16b,v4.16b,v3.16b,#8 1353 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1354 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1355 | ext v7.16b,v23.16b,v16.16b,#8 1356 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1357 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1358 | add v2.2d,v4.2d,v0.2d // "D + T1" 1359 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1360 | add v24.2d,v24.2d,v20.2d 1361 | ld1 {v25.2d},[x3],#16 1362 | ext v24.16b,v24.16b,v24.16b,#8 1363 | ext v5.16b,v2.16b,v3.16b,#8 1364 | ext v6.16b,v1.16b,v2.16b,#8 1365 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1366 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1367 | ext v7.16b,v16.16b,v17.16b,#8 1368 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1369 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1370 | add v4.2d,v1.2d,v3.2d // "D + T1" 1371 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1372 | add v25.2d,v25.2d,v21.2d 1373 | ld1 {v24.2d},[x3],#16 1374 | ext v25.16b,v25.16b,v25.16b,#8 1375 | ext v5.16b,v4.16b,v2.16b,#8 1376 | ext v6.16b,v0.16b,v4.16b,#8 1377 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1378 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1379 | ext v7.16b,v17.16b,v18.16b,#8 1380 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1381 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1382 | add v1.2d,v0.2d,v2.2d // "D + T1" 1383 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1384 | add v24.2d,v24.2d,v22.2d 1385 | ld1 {v25.2d},[x3],#16 1386 | ext v24.16b,v24.16b,v24.16b,#8 1387 | ext v5.16b,v1.16b,v4.16b,#8 1388 | ext v6.16b,v3.16b,v1.16b,#8 1389 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1390 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1391 | ext v7.16b,v18.16b,v19.16b,#8 1392 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1393 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1394 | add v0.2d,v3.2d,v4.2d // "D + T1" 1395 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1396 | add v25.2d,v25.2d,v23.2d 1397 | ld1 {v24.2d},[x3],#16 1398 | ext v25.16b,v25.16b,v25.16b,#8 1399 | ext v5.16b,v0.16b,v1.16b,#8 1400 | ext v6.16b,v2.16b,v0.16b,#8 1401 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1402 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1403 | ext v7.16b,v19.16b,v20.16b,#8 1404 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1405 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1406 | add v3.2d,v2.2d,v1.2d // "D + T1" 1407 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1408 | add v24.2d,v24.2d,v16.2d 1409 | ld1 {v25.2d},[x3],#16 1410 | ext v24.16b,v24.16b,v24.16b,#8 1411 | ext v5.16b,v3.16b,v0.16b,#8 1412 | ext v6.16b,v4.16b,v3.16b,#8 1413 | add 
v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1414 | .inst 0xcec08230 //sha512su0 v16.16b,v17.16b 1415 | ext v7.16b,v20.16b,v21.16b,#8 1416 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1417 | .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b 1418 | add v2.2d,v4.2d,v0.2d // "D + T1" 1419 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1420 | add v25.2d,v25.2d,v17.2d 1421 | ld1 {v24.2d},[x3],#16 1422 | ext v25.16b,v25.16b,v25.16b,#8 1423 | ext v5.16b,v2.16b,v3.16b,#8 1424 | ext v6.16b,v1.16b,v2.16b,#8 1425 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1426 | .inst 0xcec08251 //sha512su0 v17.16b,v18.16b 1427 | ext v7.16b,v21.16b,v22.16b,#8 1428 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1429 | .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b 1430 | add v4.2d,v1.2d,v3.2d // "D + T1" 1431 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1432 | add v24.2d,v24.2d,v18.2d 1433 | ld1 {v25.2d},[x3],#16 1434 | ext v24.16b,v24.16b,v24.16b,#8 1435 | ext v5.16b,v4.16b,v2.16b,#8 1436 | ext v6.16b,v0.16b,v4.16b,#8 1437 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1438 | .inst 0xcec08272 //sha512su0 v18.16b,v19.16b 1439 | ext v7.16b,v22.16b,v23.16b,#8 1440 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1441 | .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b 1442 | add v1.2d,v0.2d,v2.2d // "D + T1" 1443 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1444 | add v25.2d,v25.2d,v19.2d 1445 | ld1 {v24.2d},[x3],#16 1446 | ext v25.16b,v25.16b,v25.16b,#8 1447 | ext v5.16b,v1.16b,v4.16b,#8 1448 | ext v6.16b,v3.16b,v1.16b,#8 1449 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1450 | .inst 0xcec08293 //sha512su0 v19.16b,v20.16b 1451 | ext v7.16b,v23.16b,v16.16b,#8 1452 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1453 | .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b 1454 | add v0.2d,v3.2d,v4.2d // "D + T1" 1455 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1456 | add v24.2d,v24.2d,v20.2d 1457 | ld1 {v25.2d},[x3],#16 1458 | ext v24.16b,v24.16b,v24.16b,#8 1459 | ext v5.16b,v0.16b,v1.16b,#8 1460 | ext v6.16b,v2.16b,v0.16b,#8 1461 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1462 | .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b 1463 | ext v7.16b,v16.16b,v17.16b,#8 1464 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1465 | .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b 1466 | add v3.2d,v2.2d,v1.2d // "D + T1" 1467 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1468 | add v25.2d,v25.2d,v21.2d 1469 | ld1 {v24.2d},[x3],#16 1470 | ext v25.16b,v25.16b,v25.16b,#8 1471 | ext v5.16b,v3.16b,v0.16b,#8 1472 | ext v6.16b,v4.16b,v3.16b,#8 1473 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1474 | .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b 1475 | ext v7.16b,v17.16b,v18.16b,#8 1476 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1477 | .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b 1478 | add v2.2d,v4.2d,v0.2d // "D + T1" 1479 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1480 | add v24.2d,v24.2d,v22.2d 1481 | ld1 {v25.2d},[x3],#16 1482 | ext v24.16b,v24.16b,v24.16b,#8 1483 | ext v5.16b,v2.16b,v3.16b,#8 1484 | ext v6.16b,v1.16b,v2.16b,#8 1485 | add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" 1486 | .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b 1487 | ext v7.16b,v18.16b,v19.16b,#8 1488 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1489 | .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b 1490 | add v4.2d,v1.2d,v3.2d // "D + T1" 1491 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1492 | add v25.2d,v25.2d,v23.2d 1493 | ld1 {v24.2d},[x3],#16 1494 | ext 
v25.16b,v25.16b,v25.16b,#8 1495 | ext v5.16b,v4.16b,v2.16b,#8 1496 | ext v6.16b,v0.16b,v4.16b,#8 1497 | add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" 1498 | .inst 0xcec08217 //sha512su0 v23.16b,v16.16b 1499 | ext v7.16b,v19.16b,v20.16b,#8 1500 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1501 | .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b 1502 | add v1.2d,v0.2d,v2.2d // "D + T1" 1503 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1504 | ld1 {v25.2d},[x3],#16 1505 | add v24.2d,v24.2d,v16.2d 1506 | ld1 {v16.16b},[x1],#16 // load next input 1507 | ext v24.16b,v24.16b,v24.16b,#8 1508 | ext v5.16b,v1.16b,v4.16b,#8 1509 | ext v6.16b,v3.16b,v1.16b,#8 1510 | add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" 1511 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1512 | rev64 v16.16b,v16.16b 1513 | add v0.2d,v3.2d,v4.2d // "D + T1" 1514 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1515 | ld1 {v24.2d},[x3],#16 1516 | add v25.2d,v25.2d,v17.2d 1517 | ld1 {v17.16b},[x1],#16 // load next input 1518 | ext v25.16b,v25.16b,v25.16b,#8 1519 | ext v5.16b,v0.16b,v1.16b,#8 1520 | ext v6.16b,v2.16b,v0.16b,#8 1521 | add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" 1522 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1523 | rev64 v17.16b,v17.16b 1524 | add v3.2d,v2.2d,v1.2d // "D + T1" 1525 | .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1526 | ld1 {v25.2d},[x3],#16 1527 | add v24.2d,v24.2d,v18.2d 1528 | ld1 {v18.16b},[x1],#16 // load next input 1529 | ext v24.16b,v24.16b,v24.16b,#8 1530 | ext v5.16b,v3.16b,v0.16b,#8 1531 | ext v6.16b,v4.16b,v3.16b,#8 1532 | add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" 1533 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1534 | rev64 v18.16b,v18.16b 1535 | add v2.2d,v4.2d,v0.2d // "D + T1" 1536 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1537 | ld1 {v24.2d},[x3],#16 1538 | add v25.2d,v25.2d,v19.2d 1539 | ld1 {v19.16b},[x1],#16 // load next input 1540 | ext v25.16b,v25.16b,v25.16b,#8 1541 | ext v5.16b,v2.16b,v3.16b,#8 1542 | ext v6.16b,v1.16b,v2.16b,#8 1543 | add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" 1544 | .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b 1545 | rev64 v19.16b,v19.16b 1546 | add v4.2d,v1.2d,v3.2d // "D + T1" 1547 | .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b 1548 | ld1 {v25.2d},[x3],#16 1549 | add v24.2d,v24.2d,v20.2d 1550 | ld1 {v20.16b},[x1],#16 // load next input 1551 | ext v24.16b,v24.16b,v24.16b,#8 1552 | ext v5.16b,v4.16b,v2.16b,#8 1553 | ext v6.16b,v0.16b,v4.16b,#8 1554 | add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" 1555 | .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b 1556 | rev64 v20.16b,v20.16b 1557 | add v1.2d,v0.2d,v2.2d // "D + T1" 1558 | .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b 1559 | ld1 {v24.2d},[x3],#16 1560 | add v25.2d,v25.2d,v21.2d 1561 | ld1 {v21.16b},[x1],#16 // load next input 1562 | ext v25.16b,v25.16b,v25.16b,#8 1563 | ext v5.16b,v1.16b,v4.16b,#8 1564 | ext v6.16b,v3.16b,v1.16b,#8 1565 | add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" 1566 | .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b 1567 | rev64 v21.16b,v21.16b 1568 | add v0.2d,v3.2d,v4.2d // "D + T1" 1569 | .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b 1570 | ld1 {v25.2d},[x3],#16 1571 | add v24.2d,v24.2d,v22.2d 1572 | ld1 {v22.16b},[x1],#16 // load next input 1573 | ext v24.16b,v24.16b,v24.16b,#8 1574 | ext v5.16b,v0.16b,v1.16b,#8 1575 | ext v6.16b,v2.16b,v0.16b,#8 1576 | add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" 1577 | .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b 1578 | rev64 v22.16b,v22.16b 1579 | add v3.2d,v2.2d,v1.2d // "D + T1" 1580 
| .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b 1581 | sub x3,x3,#80*8 // rewind 1582 | add v25.2d,v25.2d,v23.2d 1583 | ld1 {v23.16b},[x1],#16 // load next input 1584 | ext v25.16b,v25.16b,v25.16b,#8 1585 | ext v5.16b,v3.16b,v0.16b,#8 1586 | ext v6.16b,v4.16b,v3.16b,#8 1587 | add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" 1588 | .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b 1589 | rev64 v23.16b,v23.16b 1590 | add v2.2d,v4.2d,v0.2d // "D + T1" 1591 | .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b 1592 | add v0.2d,v0.2d,v26.2d // accumulate 1593 | add v1.2d,v1.2d,v27.2d 1594 | add v2.2d,v2.2d,v28.2d 1595 | add v3.2d,v3.2d,v29.2d 1596 | 1597 | cbnz x2,.Loop_hw 1598 | 1599 | st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context 1600 | 1601 | ldr x29,[sp],#16 1602 | ret 1603 | .size sha512_block_armv8,.-sha512_block_armv8 1604 | #endif 1605 | #if !defined(__KERNEL__) && !defined(_WIN64) 1606 | .comm OPENSSL_armcap_P_local,4,4 1607 | #endif 1608 | -------------------------------------------------------------------------------- /src/openssl/openssl_cpu_globals.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #if defined(X86_64) 5 | // In OpenSSL the OPENSSL_ia32cap_P array holds the return values (in 6 | // RAX,RBX,RCX,RDX registers) of executing the Intel CPUID leaf 7 instruction. 7 | // The assembly code chooses the relevant SHA implementation according to this 8 | // array. 9 | unsigned int OPENSSL_ia32cap_P_local[4] = {0}; 10 | #endif 11 | 12 | #if defined(AARCH64) 13 | unsigned int OPENSSL_armcap_P_local = 0; 14 | #endif 15 | -------------------------------------------------------------------------------- /src/sha256.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <assert.h> 5 | 6 | #include "sha256_defs.h" 7 | 8 | #define LAST_BLOCK_BYTE_LEN (2 * SHA256_BLOCK_BYTE_LEN) 9 | 10 | typedef struct sha256_hash_s { 11 | ALIGN(64) sha256_state_t state; 12 | uint64_t len; 13 | 14 | ALIGN(64) uint8_t data[LAST_BLOCK_BYTE_LEN]; 15 | 16 | sha256_word_t rem; 17 | sha_impl_t impl; 18 | } sha256_ctx_t; 19 | 20 | _INLINE_ void sha256_init(OUT sha256_ctx_t *ctx) 21 | { 22 | ctx->state.w[0] = UINT32_C(0x6a09e667); 23 | ctx->state.w[1] = UINT32_C(0xbb67ae85); 24 | ctx->state.w[2] = UINT32_C(0x3c6ef372); 25 | ctx->state.w[3] = UINT32_C(0xa54ff53a); 26 | ctx->state.w[4] = UINT32_C(0x510e527f); 27 | ctx->state.w[5] = UINT32_C(0x9b05688c); 28 | ctx->state.w[6] = UINT32_C(0x1f83d9ab); 29 | ctx->state.w[7] = UINT32_C(0x5be0cd19); 30 | } 31 | 32 | _INLINE_ void sha256_compress(IN OUT sha256_ctx_t *ctx, 33 | IN const uint8_t *data, 34 | IN const size_t blocks_num) 35 | { 36 | assert((ctx != NULL) && (data != NULL)); 37 | 38 | // OpenSSL code can crash without this check 39 | if(blocks_num == 0) { 40 | return; 41 | } 42 | 43 | switch(ctx->impl) { 44 | #if defined(X86_64) 45 | case AVX_IMPL: 46 | sha256_compress_x86_64_avx(&ctx->state, data, blocks_num); 47 | break; 48 | 49 | case OPENSSL_AVX_IMPL: 50 | RUN_OPENSSL_CODE_WITH_AVX( 51 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 52 | break; 53 | #endif 54 | 55 | #if defined(AVX2_SUPPORT) 56 | case AVX2_IMPL: 57 | sha256_compress_x86_64_avx2(&ctx->state, data, blocks_num); 58 | break; 59 | 60 | case OPENSSL_AVX2_IMPL: 61 | RUN_OPENSSL_CODE_WITH_AVX2( 62 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 63 | break; 64 | #endif 65 | 66 | #if defined(AVX512_SUPPORT) 67 | case AVX512_IMPL: 68 | sha256_compress_x86_64_avx512(&ctx->state, data, blocks_num); 69 | break; 70 | #endif 71 | 72 | #if defined(X86_64_SHA_SUPPORT) 73 | case SHA_EXT_IMPL: 74 | sha256_compress_x86_64_sha_ext(&ctx->state, data, blocks_num); 75 | break; 76 | 77 | case OPENSSL_SHA_EXT_IMPL: 78 | RUN_OPENSSL_CODE_WITH_SHA_EXT( 79 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 80 | break; 81 | #endif 82 | 83 | #if defined(NEON_SUPPORT) 84 | case OPENSSL_NEON_IMPL: 85 | RUN_OPENSSL_CODE_WITH_NEON( 86 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 87 | break; 88 | #endif 89 | 90 | #if defined(AARCH64_SHA_SUPPORT) 91 | case SHA_EXT_IMPL: 92 | sha256_compress_aarch64_sha_ext(&ctx->state, data, blocks_num); 93 | break; 94 | 95 | case OPENSSL_SHA_EXT_IMPL: 96 | RUN_OPENSSL_CODE_WITH_SHA256_EXT( 97 | sha256_block_data_order_local(ctx->state.w, data, blocks_num);); 98 | break; 99 | #endif 100 | default: sha256_compress_generic(&ctx->state, data, blocks_num); break; 101 | } 102 | } 103 | 104 | _INLINE_ void sha256_update(IN OUT sha256_ctx_t *ctx, 105 | IN const uint8_t *data, 106 | IN size_t byte_len) 107 | { 108 | // On exiting this function ctx->rem < SHA256_BLOCK_BYTE_LEN 109 | 110 | assert((ctx != NULL) && (data != NULL)); 111 | 112 | if(byte_len == 0) { 113 | return; 114 | } 115 | 116 | // Accumulate the overall size 117 | ctx->len += byte_len; 118 | 119 | // Less than a block.
Store the data in a temporary buffer 120 | if((ctx->rem != 0) && ((ctx->rem + byte_len) < SHA256_BLOCK_BYTE_LEN)) { 121 | my_memcpy(&ctx->data[ctx->rem], data, byte_len); 122 | ctx->rem += byte_len; 123 | return; 124 | } 125 | 126 | // Complete and compress a previously stored block 127 | if(ctx->rem != 0) { 128 | const size_t clen = SHA256_BLOCK_BYTE_LEN - ctx->rem; 129 | my_memcpy(&ctx->data[ctx->rem], data, clen); 130 | sha256_compress(ctx, ctx->data, 1); 131 | 132 | data += clen; 133 | byte_len -= clen; 134 | 135 | ctx->rem = 0; 136 | secure_clean(ctx->data, SHA256_BLOCK_BYTE_LEN); 137 | } 138 | 139 | // Compress full blocks 140 | if(byte_len >= SHA256_BLOCK_BYTE_LEN) { 141 | const size_t blocks_num = (byte_len >> 6); 142 | const size_t full_blocks_byte_len = (blocks_num << 6); 143 | 144 | sha256_compress(ctx, data, blocks_num); 145 | 146 | data += full_blocks_byte_len; 147 | byte_len -= full_blocks_byte_len; 148 | } 149 | 150 | // Store the remainder 151 | my_memcpy(ctx->data, data, byte_len); 152 | ctx->rem = byte_len; 153 | } 154 | 155 | _INLINE_ void sha256_final(OUT uint8_t *dgst, IN OUT sha256_ctx_t *ctx) 156 | { 157 | assert((ctx != NULL) && (dgst != NULL)); 158 | assert(ctx->rem < SHA256_BLOCK_BYTE_LEN); 159 | 160 | // Byteswap the length in bits of the hashed message 161 | const uint64_t bswap_len = bswap_64(8 * ctx->len); 162 | const size_t last_block_num = (ctx->rem < 56) ? 1 : 2; 163 | const size_t last_qw_pos = 164 | (last_block_num * SHA256_BLOCK_BYTE_LEN) - sizeof(bswap_len); 165 | 166 | ctx->data[ctx->rem++] = SHA256_MSG_END_SYMBOL; 167 | 168 | // Reset the rest of the data buffer 169 | my_memset(&ctx->data[ctx->rem], 0, sizeof(ctx->data) - ctx->rem); 170 | my_memcpy(&ctx->data[last_qw_pos], (const uint8_t *)&bswap_len, 171 | sizeof(bswap_len)); 172 | 173 | // Compress the final block 174 | sha256_compress(ctx, ctx->data, last_block_num); 175 | 176 | // This implementation assumes running on a little-endian machine 177 | ctx->state.w[0] = bswap_32(ctx->state.w[0]); 178 | ctx->state.w[1] = bswap_32(ctx->state.w[1]); 179 | ctx->state.w[2] = bswap_32(ctx->state.w[2]); 180 | ctx->state.w[3] = bswap_32(ctx->state.w[3]); 181 | ctx->state.w[4] = bswap_32(ctx->state.w[4]); 182 | ctx->state.w[5] = bswap_32(ctx->state.w[5]); 183 | ctx->state.w[6] = bswap_32(ctx->state.w[6]); 184 | ctx->state.w[7] = bswap_32(ctx->state.w[7]); 185 | my_memcpy(dgst, &ctx->state, SHA256_HASH_BYTE_LEN); 186 | 187 | secure_clean(ctx, sizeof(*ctx)); 188 | } 189 | 190 | void sha256(OUT uint8_t *dgst, 191 | IN const uint8_t * data, 192 | IN const size_t byte_len, 193 | IN const sha_impl_t impl) 194 | { 195 | assert((data != NULL) && (dgst != NULL)); 196 | 197 | sha256_ctx_t ctx = {0}; 198 | ctx.impl = impl; 199 | sha256_init(&ctx); 200 | sha256_update(&ctx, data, byte_len); 201 | sha256_final(dgst, &ctx); 202 | } 203 | -------------------------------------------------------------------------------- /src/sha256_compress_aarch64_sha_ext.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // An implementation of the compress function of SHA256 using the AARCH64 SHA 5 | // extension. It was translated from assembly (OpenSSL) to C by 6 | // 7 | // Nir Drucker and Shay Gueron 8 | // AWS Cryptographic Algorithms Group.
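//
// A minimal caller's sketch for the one-shot sha256() API above (an
// illustration, not part of the library: it assumes include/sha.h declares
// sha256(), sha_impl_t and SHA256_HASH_BYTE_LEN, and the helper name
// sha256_example is hypothetical; any impl value not handled by the switch
// in sha256_compress falls through to the generic compressor).
//
#include <stdint.h>
#include <stdio.h>

#include "sha.h"

int sha256_example(void)
{
  const uint8_t msg[3] = {'a', 'b', 'c'};
  uint8_t       dgst[SHA256_HASH_BYTE_LEN] = {0};

  // (sha_impl_t)0 is used as an "unspecified" value that reaches the default
  // (generic) case; real callers would pick an enum value from sha.h after
  // checking CPU features.
  sha256(dgst, msg, sizeof(msg), (sha_impl_t)0);

  // Print the digest as lowercase hex
  for(size_t i = 0; i < sizeof(dgst); i++) {
    printf("%02x", dgst[i]);
  }
  printf("\n");
  return 0;
}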
9 | // (ndrucker@amazon.com, gueron@amazon.com) 10 | 11 | #include "neon_defs.h" 12 | #include "sha256_defs.h" 13 | 14 | _INLINE_ void load_data(uint32x4_t ms[4], const uint8_t *data) 15 | { 16 | uint8x16x4_t d = vld1q_u8_x4(data); 17 | ms[0] = vreinterpretq_u32_u8(vrev32q_u8(d.val[0])); 18 | ms[1] = vreinterpretq_u32_u8(vrev32q_u8(d.val[1])); 19 | ms[2] = vreinterpretq_u32_u8(vrev32q_u8(d.val[2])); 20 | ms[3] = vreinterpretq_u32_u8(vrev32q_u8(d.val[3])); 21 | } 22 | 23 | _INLINE_ void rotate_ms(uint32x4_t ms[4]) 24 | { 25 | uint32x4_t tmp = ms[0]; 26 | ms[0] = ms[1]; 27 | ms[1] = ms[2]; 28 | ms[2] = ms[3]; 29 | ms[3] = tmp; 30 | } 31 | 32 | void sha256_compress_aarch64_sha_ext(IN OUT sha256_state_t *state, 33 | IN const uint8_t *data, 34 | IN size_t blocks_num) 35 | { 36 | uint32x4_t ms[4]; 37 | uint32x4_t tmp[3]; 38 | uint32x4x2_t st; 39 | uint32x4x2_t st_save; 40 | 41 | st = vld1q_u32_x2(state->w); 42 | 43 | for(size_t j = 0; j < blocks_num; j++) { 44 | // Save current state 45 | st_save = st; 46 | 47 | load_data(ms, data); 48 | 49 | tmp[0] = vaddq_u32(ms[0], vld1q_u32(&K256[0])); 50 | 51 | // Rounds 0-47 52 | PRAGMA_LOOP_UNROLL_12 53 | 54 | for(size_t i = 0; i < 12; i++) { 55 | ms[0] = vsha256su0q_u32(ms[0], ms[1]); 56 | tmp[2] = st.val[0]; 57 | tmp[1] = vaddq_u32(ms[1], vld1q_u32(&K256[4 * (i + 1)])); 58 | st.val[0] = vsha256hq_u32(st.val[0], st.val[1], tmp[0]); 59 | st.val[1] = vsha256h2q_u32(st.val[1], tmp[2], tmp[0]); 60 | ms[0] = vsha256su1q_u32(ms[0], ms[2], ms[3]); 61 | 62 | rotate_ms(ms); 63 | 64 | uint32x4_t t = tmp[0]; 65 | tmp[0] = tmp[1]; 66 | tmp[1] = t; 67 | } 68 | 69 | // Rounds 48-59 70 | PRAGMA_LOOP_UNROLL_4 71 | 72 | for(size_t i = 0; i < 3; i++) { 73 | tmp[2] = st.val[0]; 74 | tmp[LSB1(i + 1)] = 75 | vaddq_u32(ms[LSB2(i + 1)], vld1q_u32(&K256[4 * (i + 13)])); 76 | st.val[0] = vsha256hq_u32(st.val[0], st.val[1], tmp[LSB1(i)]); 77 | st.val[1] = vsha256h2q_u32(st.val[1], tmp[2], tmp[LSB1(i)]); 78 | } 79 | 80 | // Rounds 60-63 81 | tmp[2] = st.val[0]; 82 | st.val[0] = vsha256hq_u32(st.val[0], st.val[1], tmp[1]); 83 | st.val[1] = vsha256h2q_u32(st.val[1], tmp[2], tmp[1]); 84 | 85 | // Accumulate state 86 | st.val[0] = vaddq_u32(st.val[0], st_save.val[0]); 87 | st.val[1] = vaddq_u32(st.val[1], st_save.val[1]); 88 | 89 | data += SHA256_BLOCK_BYTE_LEN; 90 | } 91 | 92 | // Store state 93 | vst1q_u32_x2(state->w, st); 94 | } 95 | -------------------------------------------------------------------------------- /src/sha256_compress_generic.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
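//
// A note on the intrinsics above (per the Armv8-A Cryptographic Extension
// semantics): st.val[0] and st.val[1] hold the {A,B,C,D} and {E,F,G,H}
// halves of the working state, each vsha256hq_u32/vsha256h2q_u32 pair
// advances the compression by four rounds using one vector of schedule words
// pre-summed with the round constants (tmp[...]), and the
// vsha256su0q_u32/vsha256su1q_u32 pair extends the message schedule four
// words at a time.
//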
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha256_defs.h" 5 | 6 | // In the generic implementation we use memcpy to avoid align issues 7 | _INLINE_ sha256_word_t load_be32(IN const void *ptr) 8 | { 9 | sha256_word_t ret; 10 | my_memcpy(&ret, ptr, sizeof(ret)); 11 | return bswap_32(ret); 12 | } 13 | 14 | _INLINE_ void load_data_and_rounds_00_15(OUT sha256_msg_schedule_t *ms, 15 | IN OUT sha256_state_t *cur_state, 16 | IN const uint8_t *data) 17 | { 18 | PRAGMA_LOOP_UNROLL_4 19 | 20 | for(size_t i = 0; i < SHA256_BLOCK_WORDS_NUM; i++) { 21 | ms->w[i] = load_be32(&data[sizeof(sha256_word_t) * i]); 22 | sha_round(cur_state, ms->w[i], K256[i]); 23 | } 24 | } 25 | 26 | _INLINE_ void rounds_16_63(IN OUT sha256_state_t *cur_state, 27 | IN OUT sha256_msg_schedule_t *ms) 28 | { 29 | PRAGMA_LOOP_UNROLL_48 30 | 31 | for(size_t i = SHA256_BLOCK_WORDS_NUM; i < SHA256_ROUNDS_NUM; i++) { 32 | const sha256_word_t x1 = ms->w[LSB4(i + 1)]; 33 | const sha256_word_t x9 = ms->w[LSB4(i + 9)]; 34 | const sha256_word_t x14 = ms->w[LSB4(i + 14)]; 35 | 36 | ms->w[LSB4(i)] += sigma0(x1) + sigma1(x14) + x9; 37 | sha_round(cur_state, ms->w[LSB4(i)], K256[i]); 38 | } 39 | } 40 | 41 | void sha256_compress_generic(IN OUT sha256_state_t *state, 42 | IN const uint8_t *data, 43 | IN size_t blocks_num) 44 | { 45 | sha256_state_t cur_state; 46 | sha256_msg_schedule_t ms; 47 | 48 | while(blocks_num--) { 49 | my_memcpy(&cur_state, state, sizeof(cur_state)); 50 | 51 | load_data_and_rounds_00_15(&ms, &cur_state, data); 52 | data += SHA256_BLOCK_BYTE_LEN; 53 | 54 | rounds_16_63(&cur_state, &ms); 55 | accumulate_state(state, &cur_state); 56 | } 57 | 58 | secure_clean(&cur_state, sizeof(cur_state)); 59 | secure_clean(&ms, sizeof(ms)); 60 | } 61 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using avx 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 
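//
// Why the LSB4(...) indexing in rounds_16_63 above works (assuming LSB4
// extracts the low four bits): the schedule keeps only a 16-word sliding
// window, so at round i the slot w[(i + k) & 15] holds W[i + k - 16]. The
// update is therefore the FIPS 180-4 recurrence
// W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
// with x1 = W[i-15], x9 = W[i-7] and x14 = W[i-2].
//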
12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx_defs.h" 15 | #include "sha256_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD32, ALIGNR8, SRL32, 18 | // SLL32, SRL64 that are defined in avx_defs.h 19 | #include "sha256_compress_x86_64_avx_helper.c" 20 | 21 | #define MS_VEC_NUM (SHA256_BLOCK_BYTE_LEN / sizeof(vec_t)) 22 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha256_word_t)) 23 | 24 | _INLINE_ void load_data(OUT vec_t x[MS_VEC_NUM], 25 | IN OUT sha256_msg_schedule_t *ms, 26 | IN const uint8_t *data) 27 | { 28 | // 32 bits (4 bytes) swap masks 29 | const vec_t shuf_mask = 30 | _mm_setr_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); 31 | 32 | PRAGMA_LOOP_UNROLL_4 33 | 34 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 35 | x[i] = LOAD(&data[sizeof(vec_t) * i]); 36 | x[i] = SHUF8(x[i], shuf_mask); 37 | STORE(&ms->w[WORDS_IN_VEC * i], ADD32(x[i], LOAD(&K256[WORDS_IN_VEC * i]))); 38 | } 39 | } 40 | 41 | _INLINE_ void rounds_0_47(sha256_state_t * cur_state, 42 | vec_t x[MS_VEC_NUM], 43 | sha256_msg_schedule_t *ms) 44 | { 45 | const vec_t lo_mask = _mm_setr_epi32(0x03020100, 0x0b0a0908, -1, -1); 46 | const vec_t hi_mask = _mm_setr_epi32(-1, -1, 0x03020100, 0x0b0a0908); 47 | 48 | // The first SHA256_BLOCK_WORDS_NUM entries of K256 were loaded in 49 | // load_data(...). 50 | size_t k256_idx = SHA256_BLOCK_WORDS_NUM; 51 | 52 | // Rounds 0-47 (0-15, 16-31, 32-47) 53 | for(size_t i = 0; i < 3; i++) { 54 | 55 | PRAGMA_LOOP_UNROLL_4 56 | 57 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 58 | const size_t pos = WORDS_IN_VEC * j; 59 | 60 | const vec_t y = sha256_update_x_avx(x, &K256[k256_idx], lo_mask, hi_mask); 61 | 62 | sha_round(cur_state, ms->w[pos + 0], 0); 63 | sha_round(cur_state, ms->w[pos + 1], 0); 64 | sha_round(cur_state, ms->w[pos + 2], 0); 65 | sha_round(cur_state, ms->w[pos + 3], 0); 66 | 67 | STORE(&ms->w[pos], y); 68 | k256_idx += WORDS_IN_VEC; 69 | } 70 | } 71 | } 72 | 73 | _INLINE_ void rounds_48_63(sha256_state_t * cur_state, 74 | const sha256_msg_schedule_t *ms) 75 | { 76 | PRAGMA_LOOP_UNROLL_16 77 | 78 | for(size_t i = SHA256_FINAL_ROUND_START_IDX; i < SHA256_ROUNDS_NUM; i++) { 79 | sha_round(cur_state, ms->w[LSB4(i)], 0); 80 | } 81 | } 82 | 83 | void sha256_compress_x86_64_avx(sha256_state_t *state, 84 | const uint8_t * data, 85 | size_t blocks_num) 86 | { 87 | sha256_state_t cur_state; 88 | sha256_msg_schedule_t ms; 89 | vec_t x[MS_VEC_NUM]; 90 | 91 | while(blocks_num--) { 92 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 93 | 94 | load_data(x, &ms, data); 95 | data += SHA256_BLOCK_BYTE_LEN; 96 | 97 | rounds_0_47(&cur_state, x, &ms); 98 | rounds_48_63(&cur_state, &ms); 99 | accumulate_state(state, &cur_state); 100 | } 101 | 102 | secure_clean(&cur_state, sizeof(cur_state)); 103 | secure_clean(&ms, sizeof(ms)); 104 | } 105 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx2.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using avx2 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 
8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx2_defs.h" 15 | #include "sha256_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD32, ALIGNR8, SRL32, 18 | // SLL32, SRL64 that are defined in avx2_defs.h 19 | #include "sha256_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 2 blocks in parallel 22 | #define MS_VEC_NUM ((2 * SHA256_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha256_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha256_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha256_msg_schedule_t *ms, 28 | sha256_word_t t2[SHA256_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 32 bits (4 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm256_setr_epi32(DUP2(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)); 34 | 35 | PRAGMA_LOOP_UNROLL_4 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 2) * i; 39 | const size_t pos1 = pos0 + SHA256_BLOCK_BYTE_LEN; 40 | 41 | LOADU2(&data[pos1], &data[pos0], x[i]); 42 | x[i] = SHUF8(x[i], shuf_mask); 43 | vec_t y = ADD32(x[i], LOAD(&K256x2[8 * i])); 44 | STOREU2(&t2[4 * i], &ms->w[4 * i], y); 45 | } 46 | } 47 | 48 | _INLINE_ void rounds_0_47(sha256_state_t * cur_state, 49 | vec_t x[MS_VEC_NUM], 50 | sha256_msg_schedule_t *ms, 51 | sha256_word_t t2[SHA256_ROUNDS_NUM]) 52 | { 53 | const vec_t lo_mask = _mm256_setr_epi32(DUP2(0x03020100, 0x0b0a0908, -1, -1)); 54 | const vec_t hi_mask = _mm256_setr_epi32(DUP2(-1, -1, 0x03020100, 0x0b0a0908)); 55 | 56 | // The first SHA256_BLOCK_WORDS_NUM entries of K256 were loaded in 57 | // load_data(...). 
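// (A reading note: k256_idx below starts at 2 * SHA256_BLOCK_WORDS_NUM
// because K256x2 stores every four-constant group twice, once per 128-bit
// lane. Each STOREU2 in the loop writes one lane of the updated schedule,
// already summed with the constants, to ms->w for the block processed
// immediately, and the other lane to t2 for process_second_block to replay.)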
58 | size_t k256_idx = 2 * SHA256_BLOCK_WORDS_NUM; 59 | 60 | // Rounds 0-47 (0-15, 16-31, 32-47) 61 | for(size_t i = 1; i < 4; i++) { 62 | 63 | PRAGMA_LOOP_UNROLL_4 64 | 65 | for(size_t j = 0; j < WORDS_IN_128_BIT_VEC; j++) { 66 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 67 | 68 | const vec_t y = sha256_update_x_avx(x, &K256x2[k256_idx], lo_mask, hi_mask); 69 | 70 | sha_round(cur_state, ms->w[pos + 0], 0); 71 | sha_round(cur_state, ms->w[pos + 1], 0); 72 | sha_round(cur_state, ms->w[pos + 2], 0); 73 | sha_round(cur_state, ms->w[pos + 3], 0); 74 | STOREU2(&t2[(16 * i) + pos], &ms->w[pos], y); 75 | 76 | k256_idx += WORDS_IN_VEC; 77 | } 78 | } 79 | } 80 | 81 | _INLINE_ void rounds_48_63(sha256_state_t * cur_state, 82 | const sha256_msg_schedule_t *ms) 83 | { 84 | PRAGMA_LOOP_UNROLL_16 85 | 86 | for(size_t i = SHA256_FINAL_ROUND_START_IDX; i < SHA256_ROUNDS_NUM; i++) { 87 | sha_round(cur_state, ms->w[LSB4(i)], 0); 88 | } 89 | } 90 | 91 | _INLINE_ void process_second_block(sha256_state_t * cur_state, 92 | const sha256_word_t t2[SHA256_ROUNDS_NUM]) 93 | { 94 | PRAGMA_LOOP_UNROLL_64 95 | 96 | for(size_t i = 0; i < SHA256_ROUNDS_NUM; i++) { 97 | sha_round(cur_state, t2[i], 0); 98 | } 99 | } 100 | 101 | void sha256_compress_x86_64_avx2(sha256_state_t *state, 102 | const uint8_t * data, 103 | size_t blocks_num) 104 | { 105 | ALIGN(64) sha256_msg_schedule_t ms; 106 | ALIGN(64) sha256_word_t t2[SHA256_ROUNDS_NUM]; 107 | sha256_state_t cur_state; 108 | vec_t x[MS_VEC_NUM]; 109 | 110 | if(blocks_num & 1) { 111 | sha256_compress_x86_64_avx(state, data, 1); 112 | data += SHA256_BLOCK_BYTE_LEN; 113 | blocks_num--; 114 | } 115 | 116 | // Perform two blocks in parallel 117 | // Here blocks_num is even 118 | for(size_t b = blocks_num; b != 0; b -= 2) { 119 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 120 | 121 | load_data(x, &ms, t2, data); 122 | data += 2 * SHA256_BLOCK_BYTE_LEN; 123 | 124 | // First block 125 | rounds_0_47(&cur_state, x, &ms, t2); 126 | rounds_48_63(&cur_state, &ms); 127 | accumulate_state(state, &cur_state); 128 | 129 | // Second block 130 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 131 | process_second_block(&cur_state, t2); 132 | accumulate_state(state, &cur_state); 133 | } 134 | 135 | secure_clean(&cur_state, sizeof(cur_state)); 136 | secure_clean(&ms, sizeof(ms)); 137 | secure_clean(t2, sizeof(t2)); 138 | } 139 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using avx512 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 
12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx512_defs.h" 15 | #include "sha256_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD32, ALIGNR8, SRL32, 18 | // SLL32, SRL64 that are defined in avx512_defs.h 19 | #include "sha256_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 4 blocks in parallel 22 | #define MS_VEC_NUM ((4 * SHA256_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha256_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha256_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha256_msg_schedule_t *ms, 28 | sha256_word_t x2_4[][SHA256_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 32 bits (4 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm512_set_epi32(DUP4(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203)); 34 | 35 | PRAGMA_LOOP_UNROLL_4 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 4) * i; 39 | const size_t pos1 = pos0 + SHA256_BLOCK_BYTE_LEN; 40 | const size_t pos2 = pos1 + SHA256_BLOCK_BYTE_LEN; 41 | const size_t pos3 = pos2 + SHA256_BLOCK_BYTE_LEN; 42 | 43 | LOADU4(&data[pos3], &data[pos2], &data[pos1], &data[pos0], x[i]); 44 | 45 | x[i] = SHUF8(x[i], shuf_mask); 46 | vec_t y = ADD32(x[i], LOAD(&K256x4[16 * i])); 47 | 48 | STOREU4(&x2_4[2][4 * i], &x2_4[1][4 * i], &x2_4[0][4 * i], &ms->w[4 * i], y); 49 | } 50 | } 51 | 52 | _INLINE_ void rounds_0_47(sha256_state_t * cur_state, 53 | vec_t x[MS_VEC_NUM], 54 | sha256_msg_schedule_t *ms, 55 | sha256_word_t x2_4[][SHA256_ROUNDS_NUM]) 56 | { 57 | const vec_t lo_mask = _mm512_set_epi32(DUP4(-1, -1, 0x0b0a0908, 0x03020100)); 58 | const vec_t hi_mask = _mm512_set_epi32(DUP4(0x0b0a0908, 0x03020100, -1, -1)); 59 | 60 | // The first SHA256_BLOCK_WORDS_NUM entries of K256 were loaded in 61 | // load_data(...). 
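// (Likewise, but widened to four blocks: K256x4 repeats each four-constant
// group four times, so k256_idx below starts at 4 * SHA256_BLOCK_WORDS_NUM,
// and each STOREU4 spreads the four 128-bit lanes of one schedule vector
// across ms->w, for the block processed now, and x2_4[0..2], which
// process_extra_block replays afterwards.)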
62 | size_t k256_idx = 4 * SHA256_BLOCK_WORDS_NUM; 63 | 64 | // Rounds 0-47 (0-15, 16-31, 32-47) 65 | for(size_t i = 1; i < 4; i++) { 66 | 67 | PRAGMA_LOOP_UNROLL_4 68 | 69 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 70 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 71 | 72 | const vec_t y = sha256_update_x_avx(x, &K256x4[k256_idx], lo_mask, hi_mask); 73 | 74 | sha_round(cur_state, ms->w[pos + 0], 0); 75 | sha_round(cur_state, ms->w[pos + 1], 0); 76 | sha_round(cur_state, ms->w[pos + 2], 0); 77 | sha_round(cur_state, ms->w[pos + 3], 0); 78 | const size_t idx = (k256_idx >> 2); 79 | 80 | STOREU4(&x2_4[2][idx], &x2_4[1][idx], &x2_4[0][idx], &ms->w[pos], y); 81 | k256_idx += WORDS_IN_VEC; 82 | } 83 | } 84 | } 85 | 86 | _INLINE_ void rounds_48_63(sha256_state_t * cur_state, 87 | const sha256_msg_schedule_t *ms) 88 | { 89 | PRAGMA_LOOP_UNROLL_16 90 | 91 | for(size_t i = SHA256_FINAL_ROUND_START_IDX; i < SHA256_ROUNDS_NUM; i++) { 92 | sha_round(cur_state, ms->w[LSB4(i)], 0); 93 | } 94 | } 95 | 96 | _INLINE_ void process_extra_block(sha256_state_t * cur_state, 97 | const sha256_word_t t[SHA256_ROUNDS_NUM]) 98 | { 99 | PRAGMA_LOOP_UNROLL_64 100 | 101 | for(size_t i = 0; i < SHA256_ROUNDS_NUM; i++) { 102 | sha_round(cur_state, t[i], 0); 103 | } 104 | } 105 | 106 | void sha256_compress_x86_64_avx512(sha256_state_t *state, 107 | const uint8_t * data, 108 | size_t blocks_num) 109 | { 110 | ALIGN(64) sha256_msg_schedule_t ms; 111 | ALIGN(64) sha256_word_t x2_4[3][SHA256_ROUNDS_NUM]; 112 | sha256_state_t cur_state; 113 | vec_t x[MS_VEC_NUM]; 114 | 115 | const size_t rem = LSB2(blocks_num); 116 | if(rem != 0) { 117 | sha256_compress_x86_64_avx2(state, data, rem); 118 | data += rem * SHA256_BLOCK_BYTE_LEN; 119 | blocks_num -= rem; 120 | } 121 | 122 | // Process four blocks in parallel 123 | // Here blocks_num is divisible by 4 124 | for(size_t b = blocks_num; b != 0; b -= 4) { 125 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 126 | 127 | load_data(x, &ms, x2_4, data); 128 | data += 4 * SHA256_BLOCK_BYTE_LEN; 129 | 130 | // First block 131 | rounds_0_47(&cur_state, x, &ms, x2_4); 132 | rounds_48_63(&cur_state, &ms); 133 | accumulate_state(state, &cur_state); 134 | 135 | for(size_t i = 0; i <= 2; i++) { 136 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 137 | process_extra_block(&cur_state, x2_4[i]); 138 | accumulate_state(state, &cur_state); 139 | } 140 | } 141 | 142 | secure_clean(&cur_state, sizeof(cur_state)); 143 | secure_clean(&ms, sizeof(ms)); 144 | secure_clean(x2_4, sizeof(x2_4)); 145 | } 146 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_avx_helper.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // An implementation of the compress function of SHA256 using avx/avx2/avx512 5 | // It was translated from assembly (OpenSSL) to C by 6 | // 7 | // Nir Drucker and Shay Gueron 8 | // AWS Cryptographic Algorithms Group.
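// For reference, a scalar sketch of the small-sigma functions that the
// vector code below emulates lane-wise. The rotation amounts are the FIPS
// 180-4 values, which the sigma0_*/sigma1_* shift constants used below are
// assumed to encode; the ref_* names are illustrative only.

#include <stdint.h>

static inline uint32_t ref_rotr32(uint32_t x, unsigned n)
{
  return (x >> n) | (x << (32 - n));
}

// sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ (x >> 3)
static inline uint32_t ref_sigma0(uint32_t x)
{
  return ref_rotr32(x, 7) ^ ref_rotr32(x, 18) ^ (x >> 3);
}

// sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ (x >> 10)
static inline uint32_t ref_sigma1(uint32_t x)
{
  return ref_rotr32(x, 17) ^ ref_rotr32(x, 19) ^ (x >> 10);
}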
9 | // (ndrucker@amazon.com, gueron@amazon.com) 10 | 11 | // This file depends on vec_t and on the following macros: 12 | // LOAD, ADD32, ALIGNR8, SRL32, SLL32, SRL64 13 | 14 | #define SHA256_WORD_BIT_LEN (8 * sizeof(sha256_word_t)) 15 | 16 | _INLINE_ void rotate_x(vec_t x[4]) 17 | { 18 | const vec_t tmp = x[0]; 19 | x[0] = x[1]; 20 | x[1] = x[2]; 21 | x[2] = x[3]; 22 | x[3] = tmp; 23 | } 24 | 25 | #ifndef ALTERNATIVE_AVX512_IMPL 26 | 27 | _INLINE_ vec_t sha256_update_x_avx(vec_t x[4], 28 | const sha256_word_t *k256_p, 29 | const vec_t lo_mask, 30 | const vec_t hi_mask) 31 | { 32 | vec_t t[4]; 33 | 34 | // This function receives 4 128-bit registers x[3:0]=d[15:0] and calculates: 35 | // s0 = sigma0(d[(i + 1) % 16]) 36 | // s1 = sigma1(d[(i + 14) % 16]) 37 | // d[i % 16] += s0 + s1 + d[(i + 9) % 16] 38 | // 39 | // For x[0]=d[3:0] 40 | // 41 | // This means that 42 | // res[0] depends on d[1] (for s0) d[14] (for s1) and d[9] 43 | // res[1] depends on d[2] (for s0) d[15] (for s1) and d[10] 44 | // res[2] depends on d[3] (for s0) res[0] (for s1) and d[11] 45 | // res[3] depends on d[4] (for s0) res[1] (for s1) and d[12] 46 | 47 | t[0] = ALIGNR8(x[1], x[0], 4); // d[4:1] 48 | t[3] = ALIGNR8(x[3], x[2], 4); // d[12:9] 49 | t[2] = SRL32(t[0], sigma0_0); // d[4:1] >> s0[0] 50 | x[0] = ADD32(x[0], t[3]); // d[3:0] + d[12:9] 51 | 52 | t[3] = SRL32(t[0], sigma0_2); // d[4:1] >> s0[2] 53 | t[1] = SLL32(t[0], SHA256_WORD_BIT_LEN - sigma0_1); // d[4:1] << (32 - s0[1]) 54 | t[0] = t[3] ^ t[2]; // (d[4:1] >> s0[2]) ^ 55 | // (d[4:1] >> s0[0]) 56 | t[3] = SHUF32(x[3], 0xfa); // d[15,15,14,14] 57 | t[2] = SRL32(t[2], sigma0_1 - sigma0_0); // d[4:1] >> s0[1] 58 | t[0] ^= t[1] ^ t[2]; // ROTR(d[4:1], s0[1]) ^ 59 | // (d[4:1] >> s0[2]) ^ 60 | // (d[4:1] >> s0[0]) 61 | t[1] = SLL32(t[1], sigma0_1 - sigma0_0); // d[4:1] << (32 - s0[0]) 62 | t[2] = SRL32(t[3], sigma1_2); // d[15,15,14,14] >> s1[2] 63 | t[3] = SRL64(t[3], sigma1_0); // ROTR(d[-,15,-,14], s1[0]) 64 | x[0] = ADD32(x[0], t[0] ^ t[1]); // d[3:0] + sigma0(d[4:1]) 65 | 66 | t[2] ^= t[3]; // d[15,15,14,14] >> s1[2] ^ ROTR(d[-,15,-,14], s1[0]) 67 | t[3] = SRL64(t[3], sigma1_1 - sigma1_0); // ROTR(d[-,15,-,14], s1[1]) 68 | t[2] = SHUF8(t[2] ^ t[3], lo_mask); // sigma1(d[Zero,Zero,15,14]) 69 | x[0] = ADD32(x[0], t[2]); // d[3:0] + sigma0(d[4:1]) + 70 | // sigma1(d[-,-,15,14]) + d[12:9] 71 | 72 | // When calculating s1 = sigma1(s1) for the upper dwords 73 | // we use the already updated d[1:0] 74 | t[3] = SHUF32(x[0], 0x50); // d[1,1,0,0] 75 | t[2] = SRL32(t[3], sigma1_2); // d[1,1,0,0] >> s1[2] 76 | t[3] = SRL64(t[3], sigma1_0); // ROTR(d[-,1,-,0]) >> s1[0] 77 | t[2] ^= t[3]; // ROTR(d[-,1,-,0]) >> s1[0] ^ 78 | // d[1,1,0,0] >> s1[2] 79 | t[3] = SRL64(t[3], sigma1_1 - sigma1_0); // ROTR(d[-,1,-,0]) >> s1[1] 80 | 81 | // sigma1(d[0,x[1],0,x[0]]) 82 | // sigma1(d[x[1],x[0],Zero,Zero]) 83 | x[0] = ADD32(x[0], SHUF8(t[2] ^ t[3], hi_mask)); 84 | 85 | rotate_x(x); 86 | 87 | return ADD32(x[3], LOAD(k256_p)); 88 | } 89 | 90 | #else 91 | 92 | _INLINE_ vec_t sha256_update_x_avx(vec_t x[4], 93 | const sha256_word_t *k256_p, 94 | UNUSED const vec_t lo_mask, 95 | UNUSED const vec_t hi_mask) 96 | { 97 | vec_t t[2]; 98 | vec_t s0; 99 | vec_t s1; 100 | 101 | // This function receives 4 128-bit registers x[3:0]=d[15:0] and calculates: 102 | // s0 = sigma0(d[(i + 1) % 16]) 103 | // s1 = sigma1(d[(i + 14) % 16]) 104 | // d[i % 16] += s0 + s1 + d[(i + 9) % 16] 105 | // 106 | // For x[0]=d[3:0] 107 | // 108 | // This means that 109 | // res[0] depends on d[1] (for s0) d[14]
(for s1) and d[9] 110 | // res[1] depends on d[2] (for s0) d[15] (for s1) and d[10] 111 | // res[2] depends on d[3] (for s0) res[0] (for s1) and d[11] 112 | // res[3] depends on d[4] (for s0) res[1] (for s1) and d[12] 113 | 114 | t[0] = ALIGNR8(x[1], x[0], 4); // d[4:1] 115 | t[1] = ALIGNR8(x[3], x[2], 4); // d[12:9] 116 | x[0] = ADD32(x[0], t[1]); // d[3:0] + d[12:9] 117 | s0 = ROR32(t[0], sigma0_0) ^ ROR32(t[0], sigma0_1) ^ SRL32(t[0], sigma0_2); 118 | x[0] = ADD32(x[0], s0); // d[3:0] + d[12:9] + sigma0(d[4:1]) 119 | 120 | t[1] = SHUF32(x[3], 0xfe); // d[-,-,15,14] 121 | s1 = ROR32(t[1], sigma1_0) ^ ROR32(t[1], sigma1_1) ^ SRL32(t[1], sigma1_2); 122 | x[0] = MADD32(x[0], LOW32X2_MASK, x[0], s1); 123 | 124 | t[1] = SHUF32(x[0], 0x40); 125 | s1 = ROR32(t[1], sigma1_0) ^ ROR32(t[1], sigma1_1) ^ SRL32(t[1], sigma1_2); 126 | x[0] = MADD32(x[0], HIGH32X2_MASK, x[0], s1); 127 | 128 | rotate_x(x); 129 | 130 | return ADD32(x[3], LOAD(k256_p)); 131 | } 132 | 133 | #endif 134 | -------------------------------------------------------------------------------- /src/sha256_compress_x86_64_sha_ext.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA256 using the SHA extension 5 | // The implementation is based on: 6 | // https://software.intel.com/en-us/articles/intel-sha-extensions 7 | // 8 | // Written by Nir Drucker and Shay Gueron 9 | // AWS Cryptographic Algorithms Group. 10 | // (ndrucker@amazon.com, gueron@amazon.com) 11 | 12 | #include "avx_defs.h" 13 | #include "sha256_defs.h" 14 | 15 | #define RND2(s0, s1, data) (_mm_sha256rnds2_epu32(s0, s1, data)) 16 | #define SHAMSG1(m1, m2) (_mm_sha256msg1_epu32(m1, m2)) 17 | #define SHAMSG2(m1, m2) (_mm_sha256msg2_epu32(m1, m2)) 18 | 19 | #define SET_K(i) \ 20 | (SETR32(K256[4 * (i)], K256[(4 * (i)) + 1], K256[(4 * (i)) + 2], \ 21 | K256[(4 * (i)) + 3])) 22 | 23 | void sha256_compress_x86_64_sha_ext(IN OUT sha256_state_t *state, 24 | IN const uint8_t *data, 25 | IN size_t blocks_num) 26 | { 27 | vec_t state0; 28 | vec_t state1; 29 | vec_t msg; 30 | vec_t tmp; 31 | vec_t msgtmp[4]; 32 | vec_t ABEF_SAVE; 33 | vec_t CDGH_SAVE; 34 | 35 | const vec_t shuf_mask = 36 | SET64(UINT64_C(0x0c0d0e0f08090a0b), UINT64_C(0x0405060700010203)); 37 | 38 | tmp = SHUF32(LOAD(&state->w[0]), 0xB1); // CDAB 39 | state1 = SHUF32(LOAD(&state->w[4]), 0x1B); // EFGH 40 | state0 = ALIGNR8(tmp, state1, 8); // ABEF 41 | state1 = BLEND16(state1, tmp, 0xF0); // CDGH 42 | 43 | while(blocks_num--) { 44 | // Save the current state 45 | ABEF_SAVE = state0; 46 | CDGH_SAVE = state1; 47 | 48 | // Rounds 0-3 49 | msgtmp[0] = SHUF8(LOAD(&data[0]), shuf_mask); 50 | msg = ADD32(msgtmp[0], SET_K(0)); 51 | state1 = RND2(state1, state0, msg); 52 | msg = SHUF32(msg, 0x0E); 53 | state0 = RND2(state0, state1, msg); 54 | 55 | PRAGMA_LOOP_UNROLL_2 56 | 57 | // Rounds 4-7 (i=1) 58 | // Rounds 8-11 (i=2) 59 | for(size_t i = 1; i <= 2; i++) { 60 | msgtmp[i] = SHUF8(LOAD(&data[16 * i]), shuf_mask); 61 | msg = ADD32(msgtmp[i], SET_K(i)); 62 | state1 = RND2(state1, state0, msg); 63 | msg = SHUF32(msg, 0x0E); 64 | state0 = RND2(state0, state1, msg); 65 | msgtmp[i - 1] = SHAMSG1(msgtmp[i - 1], msgtmp[i]); 66 | } 67 | 68 | // Rounds 12-59 in blocks of 4 (12 multi-rounds) 69 | msgtmp[3] = SHUF8(LOAD(&data[48]), shuf_mask); 70 | 71 | PRAGMA_LOOP_UNROLL_12 72 | 73 | for(size_t i = 3; i <= 14; i++) { 74 | 
const size_t prev = LSB2(i - 1); 75 | const size_t curr = LSB2(i); 76 | const size_t next = LSB2(i + 1); 77 | 78 | msg = ADD32(msgtmp[curr], SET_K(i)); 79 | state1 = RND2(state1, state0, msg); 80 | tmp = ALIGNR8(msgtmp[curr], msgtmp[prev], 4); 81 | msgtmp[next] = ADD32(msgtmp[next], tmp); 82 | msgtmp[next] = SHAMSG2(msgtmp[next], msgtmp[curr]); 83 | msg = SHUF32(msg, 0x0E); 84 | state0 = RND2(state0, state1, msg); 85 | msgtmp[prev] = SHAMSG1(msgtmp[prev], msgtmp[curr]); 86 | } 87 | 88 | // Rounds 60-63 89 | msg = ADD32(msgtmp[3], SET_K(15)); 90 | state1 = RND2(state1, state0, msg); 91 | msg = SHUF32(msg, 0x0E); 92 | state0 = RND2(state0, state1, msg); 93 | 94 | // Accumulate state 95 | state0 = ADD32(state0, ABEF_SAVE); 96 | state1 = ADD32(state1, CDGH_SAVE); 97 | 98 | data += SHA256_BLOCK_BYTE_LEN; 99 | } 100 | 101 | tmp = SHUF32(state0, 0x1B); // FEBA 102 | state1 = SHUF32(state1, 0xB1); // DCHG 103 | state0 = BLEND16(tmp, state1, 0xF0); // DCBA 104 | state1 = ALIGNR8(state1, tmp, 8); // HGFE 105 | 106 | STORE((vec_t *)&state->w[0], state0); 107 | STORE((vec_t *)&state->w[4], state1); 108 | } 109 | -------------------------------------------------------------------------------- /src/sha256_consts.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha256_defs.h" 5 | 6 | #define K256_0 UINT32_C(0x428a2f98) 7 | #define K256_1 UINT32_C(0x71374491) 8 | #define K256_2 UINT32_C(0xb5c0fbcf) 9 | #define K256_3 UINT32_C(0xe9b5dba5) 10 | #define K256_4 UINT32_C(0x3956c25b) 11 | #define K256_5 UINT32_C(0x59f111f1) 12 | #define K256_6 UINT32_C(0x923f82a4) 13 | #define K256_7 UINT32_C(0xab1c5ed5) 14 | #define K256_8 UINT32_C(0xd807aa98) 15 | #define K256_9 UINT32_C(0x12835b01) 16 | #define K256_10 UINT32_C(0x243185be) 17 | #define K256_11 UINT32_C(0x550c7dc3) 18 | #define K256_12 UINT32_C(0x72be5d74) 19 | #define K256_13 UINT32_C(0x80deb1fe) 20 | #define K256_14 UINT32_C(0x9bdc06a7) 21 | #define K256_15 UINT32_C(0xc19bf174) 22 | #define K256_16 UINT32_C(0xe49b69c1) 23 | #define K256_17 UINT32_C(0xefbe4786) 24 | #define K256_18 UINT32_C(0x0fc19dc6) 25 | #define K256_19 UINT32_C(0x240ca1cc) 26 | #define K256_20 UINT32_C(0x2de92c6f) 27 | #define K256_21 UINT32_C(0x4a7484aa) 28 | #define K256_22 UINT32_C(0x5cb0a9dc) 29 | #define K256_23 UINT32_C(0x76f988da) 30 | #define K256_24 UINT32_C(0x983e5152) 31 | #define K256_25 UINT32_C(0xa831c66d) 32 | #define K256_26 UINT32_C(0xb00327c8) 33 | #define K256_27 UINT32_C(0xbf597fc7) 34 | #define K256_28 UINT32_C(0xc6e00bf3) 35 | #define K256_29 UINT32_C(0xd5a79147) 36 | #define K256_30 UINT32_C(0x06ca6351) 37 | #define K256_31 UINT32_C(0x14292967) 38 | #define K256_32 UINT32_C(0x27b70a85) 39 | #define K256_33 UINT32_C(0x2e1b2138) 40 | #define K256_34 UINT32_C(0x4d2c6dfc) 41 | #define K256_35 UINT32_C(0x53380d13) 42 | #define K256_36 UINT32_C(0x650a7354) 43 | #define K256_37 UINT32_C(0x766a0abb) 44 | #define K256_38 UINT32_C(0x81c2c92e) 45 | #define K256_39 UINT32_C(0x92722c85) 46 | #define K256_40 UINT32_C(0xa2bfe8a1) 47 | #define K256_41 UINT32_C(0xa81a664b) 48 | #define K256_42 UINT32_C(0xc24b8b70) 49 | #define K256_43 UINT32_C(0xc76c51a3) 50 | #define K256_44 UINT32_C(0xd192e819) 51 | #define K256_45 UINT32_C(0xd6990624) 52 | #define K256_46 UINT32_C(0xf40e3585) 53 | #define K256_47 UINT32_C(0x106aa070) 54 | #define K256_48 UINT32_C(0x19a4c116) 55 | #define K256_49 UINT32_C(0x1e376c08) 56 | #define
K256_50 UINT32_C(0x2748774c) 57 | #define K256_51 UINT32_C(0x34b0bcb5) 58 | #define K256_52 UINT32_C(0x391c0cb3) 59 | #define K256_53 UINT32_C(0x4ed8aa4a) 60 | #define K256_54 UINT32_C(0x5b9cca4f) 61 | #define K256_55 UINT32_C(0x682e6ff3) 62 | #define K256_56 UINT32_C(0x748f82ee) 63 | #define K256_57 UINT32_C(0x78a5636f) 64 | #define K256_58 UINT32_C(0x84c87814) 65 | #define K256_59 UINT32_C(0x8cc70208) 66 | #define K256_60 UINT32_C(0x90befffa) 67 | #define K256_61 UINT32_C(0xa4506ceb) 68 | #define K256_62 UINT32_C(0xbef9a3f7) 69 | #define K256_63 UINT32_C(0xc67178f2) 70 | 71 | ALIGN(64) 72 | const sha256_word_t K256[SHA256_ROUNDS_NUM] = { 73 | K256_0, K256_1, K256_2, K256_3, K256_4, K256_5, K256_6, K256_7, 74 | K256_8, K256_9, K256_10, K256_11, K256_12, K256_13, K256_14, K256_15, 75 | K256_16, K256_17, K256_18, K256_19, K256_20, K256_21, K256_22, K256_23, 76 | K256_24, K256_25, K256_26, K256_27, K256_28, K256_29, K256_30, K256_31, 77 | K256_32, K256_33, K256_34, K256_35, K256_36, K256_37, K256_38, K256_39, 78 | K256_40, K256_41, K256_42, K256_43, K256_44, K256_45, K256_46, K256_47, 79 | K256_48, K256_49, K256_50, K256_51, K256_52, K256_53, K256_54, K256_55, 80 | K256_56, K256_57, K256_58, K256_59, K256_60, K256_61, K256_62, K256_63}; 81 | 82 | ALIGN(64) 83 | const sha256_word_t K256x2[2 * SHA256_ROUNDS_NUM] = { 84 | DUP2(K256_0, K256_1, K256_2, K256_3), 85 | DUP2(K256_4, K256_5, K256_6, K256_7), 86 | DUP2(K256_8, K256_9, K256_10, K256_11), 87 | DUP2(K256_12, K256_13, K256_14, K256_15), 88 | DUP2(K256_16, K256_17, K256_18, K256_19), 89 | DUP2(K256_20, K256_21, K256_22, K256_23), 90 | DUP2(K256_24, K256_25, K256_26, K256_27), 91 | DUP2(K256_28, K256_29, K256_30, K256_31), 92 | DUP2(K256_32, K256_33, K256_34, K256_35), 93 | DUP2(K256_36, K256_37, K256_38, K256_39), 94 | DUP2(K256_40, K256_41, K256_42, K256_43), 95 | DUP2(K256_44, K256_45, K256_46, K256_47), 96 | DUP2(K256_48, K256_49, K256_50, K256_51), 97 | DUP2(K256_52, K256_53, K256_54, K256_55), 98 | DUP2(K256_56, K256_57, K256_58, K256_59), 99 | DUP2(K256_60, K256_61, K256_62, K256_63)}; 100 | 101 | ALIGN(64) 102 | const sha256_word_t K256x4[4 * SHA256_ROUNDS_NUM] = { 103 | DUP4(K256_0, K256_1, K256_2, K256_3), 104 | DUP4(K256_4, K256_5, K256_6, K256_7), 105 | DUP4(K256_8, K256_9, K256_10, K256_11), 106 | DUP4(K256_12, K256_13, K256_14, K256_15), 107 | DUP4(K256_16, K256_17, K256_18, K256_19), 108 | DUP4(K256_20, K256_21, K256_22, K256_23), 109 | DUP4(K256_24, K256_25, K256_26, K256_27), 110 | DUP4(K256_28, K256_29, K256_30, K256_31), 111 | DUP4(K256_32, K256_33, K256_34, K256_35), 112 | DUP4(K256_36, K256_37, K256_38, K256_39), 113 | DUP4(K256_40, K256_41, K256_42, K256_43), 114 | DUP4(K256_44, K256_45, K256_46, K256_47), 115 | DUP4(K256_48, K256_49, K256_50, K256_51), 116 | DUP4(K256_52, K256_53, K256_54, K256_55), 117 | DUP4(K256_56, K256_57, K256_58, K256_59), 118 | DUP4(K256_60, K256_61, K256_62, K256_63)}; 119 | -------------------------------------------------------------------------------- /src/sha512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <assert.h> 5 | 6 | #include "sha512_defs.h" 7 | 8 | #define LAST_BLOCK_BYTE_LEN (2 * SHA512_BLOCK_BYTE_LEN) 9 | 10 | typedef struct sha512_hash_s { 11 | ALIGN(64) sha512_state_t state; 12 | uint64_t len; 13 | 14 | ALIGN(64) uint8_t data[LAST_BLOCK_BYTE_LEN]; 15 | 16 | sha512_word_t rem; 17 | sha_impl_t impl; 18 | } sha512_ctx_t; 19 | 20 | _INLINE_ void sha512_init(OUT sha512_ctx_t *ctx) 21 | { 22 | ctx->state.w[0] = UINT64_C(0x6a09e667f3bcc908); 23 | ctx->state.w[1] = UINT64_C(0xbb67ae8584caa73b); 24 | ctx->state.w[2] = UINT64_C(0x3c6ef372fe94f82b); 25 | ctx->state.w[3] = UINT64_C(0xa54ff53a5f1d36f1); 26 | ctx->state.w[4] = UINT64_C(0x510e527fade682d1); 27 | ctx->state.w[5] = UINT64_C(0x9b05688c2b3e6c1f); 28 | ctx->state.w[6] = UINT64_C(0x1f83d9abfb41bd6b); 29 | ctx->state.w[7] = UINT64_C(0x5be0cd19137e2179); 30 | } 31 | 32 | _INLINE_ void sha512_compress(IN OUT sha512_ctx_t *ctx, 33 | IN const uint8_t *data, 34 | IN const size_t blocks_num) 35 | { 36 | assert((ctx != NULL) && (data != NULL)); 37 | 38 | // OpenSSL code can crash without this check 39 | if(blocks_num == 0) { 40 | return; 41 | } 42 | 43 | switch(ctx->impl) { 44 | #if defined(X86_64) 45 | case AVX_IMPL: 46 | sha512_compress_x86_64_avx(&ctx->state, data, blocks_num); 47 | break; 48 | 49 | case OPENSSL_AVX_IMPL: 50 | RUN_OPENSSL_CODE_WITH_AVX( 51 | sha512_block_data_order_local(ctx->state.w, data, blocks_num);); 52 | break; 53 | #endif 54 | 55 | #if defined(AVX2_SUPPORT) 56 | case AVX2_IMPL: 57 | sha512_compress_x86_64_avx2(&ctx->state, data, blocks_num); 58 | break; 59 | 60 | case OPENSSL_AVX2_IMPL: 61 | RUN_OPENSSL_CODE_WITH_AVX2( 62 | sha512_block_data_order_local(ctx->state.w, data, blocks_num);); 63 | break; 64 | #endif 65 | 66 | #if defined(AVX512_SUPPORT) 67 | case AVX512_IMPL: 68 | sha512_compress_x86_64_avx512(&ctx->state, data, blocks_num); 69 | break; 70 | #endif 71 | 72 | #if defined(NEON_SUPPORT) 73 | case OPENSSL_NEON_IMPL: 74 | RUN_OPENSSL_CODE_WITH_NEON( 75 | sha512_block_data_order_local(ctx->state.w, data, blocks_num);); 76 | break; 77 | #endif 78 | 79 | default: sha512_compress_generic(&ctx->state, data, blocks_num); break; 80 | } 81 | } 82 | 83 | _INLINE_ void sha512_update(IN OUT sha512_ctx_t *ctx, 84 | IN const uint8_t *data, 85 | IN size_t byte_len) 86 | { 87 | // On exiting this function ctx->rem < SHA512_BLOCK_BYTE_LEN 88 | 89 | assert((ctx != NULL) && (data != NULL)); 90 | 91 | if(byte_len == 0) { 92 | return; 93 | } 94 | 95 | // Accumulate the overall size 96 | ctx->len += byte_len; 97 | 98 | // Less than a block.
Store the data in a temporary buffer 99 | if((ctx->rem != 0) && (ctx->rem + byte_len < SHA512_BLOCK_BYTE_LEN)) { 100 | my_memcpy(&ctx->data[ctx->rem], data, byte_len); 101 | ctx->rem += byte_len; 102 | return; 103 | } 104 | 105 | // Complete and compress a previously stored block 106 | if(ctx->rem != 0) { 107 | const size_t clen = SHA512_BLOCK_BYTE_LEN - ctx->rem; 108 | my_memcpy(&ctx->data[ctx->rem], data, clen); 109 | sha512_compress(ctx, ctx->data, 1); 110 | 111 | data += clen; 112 | byte_len -= clen; 113 | 114 | ctx->rem = 0; 115 | secure_clean(ctx->data, SHA512_BLOCK_BYTE_LEN); 116 | } 117 | 118 | // Compress full blocks 119 | if(byte_len >= SHA512_BLOCK_BYTE_LEN) { 120 | const size_t blocks_num = (byte_len >> 7); 121 | const size_t full_blocks_byte_len = (blocks_num << 7); 122 | 123 | sha512_compress(ctx, data, blocks_num); 124 | 125 | data += full_blocks_byte_len; 126 | byte_len -= full_blocks_byte_len; 127 | } 128 | 129 | // Store the remainder 130 | my_memcpy(ctx->data, data, byte_len); 131 | ctx->rem = byte_len; 132 | } 133 | 134 | _INLINE_ void sha512_final(OUT uint8_t *dgst, IN OUT sha512_ctx_t *ctx) 135 | { 136 | assert((ctx != NULL) && (dgst != NULL)); 137 | assert(ctx->rem < SHA512_BLOCK_BYTE_LEN); 138 | 139 | // Byteswap the length in bits of the hashed message 140 | const uint64_t bswap_len = bswap_64(8 * ctx->len); 141 | const size_t last_block_num = (ctx->rem < 112) ? 1 : 2; 142 | const size_t last_qw_pos = 143 | (last_block_num * SHA512_BLOCK_BYTE_LEN) - sizeof(bswap_len); 144 | 145 | ctx->data[ctx->rem++] = SHA512_MSG_END_SYMBOL; 146 | 147 | // Reset the rest of the data buffer 148 | my_memset(&ctx->data[ctx->rem], 0, sizeof(ctx->data) - ctx->rem); 149 | my_memcpy(&ctx->data[last_qw_pos], (const uint8_t *)&bswap_len, 150 | sizeof(bswap_len)); 151 | 152 | // Compress the final block 153 | sha512_compress(ctx, ctx->data, last_block_num); 154 | 155 | // This implementation assumes running on a little-endian machine 156 | ctx->state.w[0] = bswap_64(ctx->state.w[0]); 157 | ctx->state.w[1] = bswap_64(ctx->state.w[1]); 158 | ctx->state.w[2] = bswap_64(ctx->state.w[2]); 159 | ctx->state.w[3] = bswap_64(ctx->state.w[3]); 160 | ctx->state.w[4] = bswap_64(ctx->state.w[4]); 161 | ctx->state.w[5] = bswap_64(ctx->state.w[5]); 162 | ctx->state.w[6] = bswap_64(ctx->state.w[6]); 163 | ctx->state.w[7] = bswap_64(ctx->state.w[7]); 164 | my_memcpy(dgst, ctx->state.w, SHA512_HASH_BYTE_LEN); 165 | 166 | secure_clean(ctx, sizeof(*ctx)); 167 | } 168 | 169 | void sha512(OUT uint8_t *dgst, 170 | IN const uint8_t * data, 171 | IN const size_t byte_len, 172 | IN const sha_impl_t impl) 173 | { 174 | assert((data != NULL) && (dgst != NULL)); 175 | 176 | sha512_ctx_t ctx = {0}; 177 | ctx.impl = impl; 178 | sha512_init(&ctx); 179 | sha512_update(&ctx, data, byte_len); 180 | sha512_final(dgst, &ctx); 181 | } 182 | -------------------------------------------------------------------------------- /src/sha512_compress_generic.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
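//
// A note on the padding threshold in sha512_final above: FIPS 180-4 reserves
// the last 16 bytes of the final block for a 128-bit message length, hence
// the (ctx->rem < 112) test, i.e. 112 = 128 - 16. Only the low 64 bits of the
// length are written at last_qw_pos; the high 64 bits stay zero via the
// preceding my_memset, which is equivalent for any message shorter than
// 2^64 bits.
//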
/src/sha512_compress_generic.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha512_defs.h" 5 | 6 | // In the generic implementation we use memcpy to avoid alignment issues 7 | _INLINE_ sha512_word_t load_be64(IN const void *ptr) 8 | { 9 | sha512_word_t ret; 10 | my_memcpy(&ret, ptr, sizeof(ret)); 11 | return bswap_64(ret); 12 | } 13 | 14 | _INLINE_ void load_data_and_rounds_00_15(OUT sha512_msg_schedule_t *ms, 15 | IN OUT sha512_state_t *cur_state, 16 | IN const uint8_t *data) 17 | { 18 | PRAGMA_LOOP_UNROLL_4 19 | 20 | for(size_t i = 0; i < SHA512_BLOCK_WORDS_NUM; i++) { 21 | ms->w[i] = load_be64(&data[sizeof(sha512_word_t) * i]); 22 | sha_round(cur_state, ms->w[i], K512[i]); 23 | } 24 | } 25 | 26 | _INLINE_ void rounds_16_79(IN OUT sha512_state_t *cur_state, 27 | IN OUT sha512_msg_schedule_t *ms) 28 | { 29 | PRAGMA_LOOP_UNROLL_64 30 | 31 | for(size_t i = SHA512_BLOCK_WORDS_NUM; i < SHA512_ROUNDS_NUM; i++) { 32 | const sha512_word_t x1 = ms->w[LSB4(i + 1)]; 33 | const sha512_word_t x9 = ms->w[LSB4(i + 9)]; 34 | const sha512_word_t x14 = ms->w[LSB4(i + 14)]; 35 | 36 | ms->w[LSB4(i)] += sigma0(x1) + sigma1(x14) + x9; 37 | sha_round(cur_state, ms->w[LSB4(i)], K512[i]); 38 | } 39 | } 40 | 41 | void sha512_compress_generic(IN OUT sha512_state_t *state, 42 | IN const uint8_t *data, 43 | IN size_t blocks_num) 44 | { 45 | sha512_state_t cur_state; 46 | sha512_msg_schedule_t ms; 47 | 48 | while(blocks_num--) { 49 | my_memcpy(&cur_state, state, sizeof(cur_state)); 50 | 51 | load_data_and_rounds_00_15(&ms, &cur_state, data); 52 | data += SHA512_BLOCK_BYTE_LEN; 53 | 54 | rounds_16_79(&cur_state, &ms); 55 | accumulate_state(state, &cur_state); 56 | } 57 | 58 | secure_clean(&cur_state, sizeof(cur_state)); 59 | secure_clean(&ms, sizeof(ms)); 60 | } 61 | --------------------------------------------------------------------------------
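For reference, the sigma0() and sigma1() used by rounds_16_79() above are assumed to be the FIPS 180-4 SHA-512 small-sigma functions (rotations by 1/8 plus a shift by 7, and rotations by 19/61 plus a shift by 6). A minimal scalar sketch; ROTR64, sigma0_ref, and sigma1_ref are illustrative names, not part of this repository:

#include <stdint.h>

#define ROTR64(x, s) (((x) >> (s)) | ((x) << (64 - (s))))

// FIPS 180-4, section 4.1.3: the SHA-512 message-schedule sigma functions.
static inline uint64_t sigma0_ref(uint64_t x)
{
  return ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7);
}

static inline uint64_t sigma1_ref(uint64_t x)
{
  return ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6);
}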
/src/sha512_compress_x86_64_avx.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA512 using avx 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx_defs.h" 15 | #include "sha512_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD64, ALIGNR8, SRL64, SLL64 18 | // that are defined in avx_defs.h 19 | #include "sha512_compress_x86_64_avx_helper.c" 20 | 21 | #define MS_VEC_NUM (SHA512_BLOCK_BYTE_LEN / sizeof(vec_t)) 22 | #define WORDS_IN_VEC (16 / sizeof(sha512_word_t)) 23 | 24 | _INLINE_ void load_data(OUT vec_t x[MS_VEC_NUM], 25 | IN OUT sha512_msg_schedule_t *ms, 26 | IN const uint8_t *data) 27 | { 28 | // 64 bits (8 bytes) swap masks 29 | const vec_t shuf_mask = 30 | _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b); 31 | 32 | PRAGMA_LOOP_UNROLL_8 33 | 34 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 35 | const size_t pos = WORDS_IN_VEC * i; 36 | 37 | x[i] = LOAD(&data[sizeof(vec_t) * i]); 38 | x[i] = SHUF8(x[i], shuf_mask); 39 | STORE(&ms->w[pos], ADD64(x[i], LOAD(&K512[pos]))); 40 | } 41 | } 42 | 43 | _INLINE_ void rounds_0_63(sha512_state_t * cur_state, 44 | vec_t x[MS_VEC_NUM], 45 | sha512_msg_schedule_t *ms) 46 | { 47 | // The first SHA512_BLOCK_WORDS_NUM entries of K512 were loaded in 48 | // load_data(...). 49 | size_t k512_idx = SHA512_BLOCK_WORDS_NUM; 50 | 51 | // Rounds 0-63 (0-15, 16-31, 32-47, 48-63) 52 | for(size_t i = 0; i < 4; i++) { 53 | 54 | PRAGMA_LOOP_UNROLL_8 55 | 56 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 57 | const size_t pos = WORDS_IN_VEC * j; 58 | 59 | const vec_t y = sha512_update_x_avx(x, &K512[k512_idx]); 60 | 61 | sha_round(cur_state, ms->w[pos], 0); 62 | sha_round(cur_state, ms->w[pos + 1], 0); 63 | 64 | STORE(&ms->w[pos], y); 65 | k512_idx += WORDS_IN_VEC; 66 | } 67 | } 68 | } 69 | 70 | _INLINE_ void rounds_64_79(sha512_state_t * cur_state, 71 | const sha512_msg_schedule_t *ms) 72 | { 73 | PRAGMA_LOOP_UNROLL_16 74 | 75 | for(size_t i = SHA512_FINAL_ROUND_START_IDX; i < SHA512_ROUNDS_NUM; i++) { 76 | sha_round(cur_state, ms->w[LSB4(i)], 0); 77 | } 78 | } 79 | 80 | void sha512_compress_x86_64_avx(sha512_state_t *state, 81 | const uint8_t * data, 82 | size_t blocks_num) 83 | { 84 | sha512_state_t cur_state; 85 | sha512_msg_schedule_t ms; 86 | vec_t x[MS_VEC_NUM]; 87 | 88 | while(blocks_num--) { 89 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 90 | 91 | load_data(x, &ms, data); 92 | data += SHA512_BLOCK_BYTE_LEN; 93 | 94 | rounds_0_63(&cur_state, x, &ms); 95 | rounds_64_79(&cur_state, &ms); 96 | accumulate_state(state, &cur_state); 97 | } 98 | 99 | secure_clean(&cur_state, sizeof(cur_state)); 100 | secure_clean(&ms, sizeof(ms)); 101 | } 102 | --------------------------------------------------------------------------------
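The AVX2 variant that follows keeps two message blocks in flight: each 256-bit vector holds a pair of schedule words from block 0 in one 128-bit lane and the corresponding pair from block 1 in the other, so the round constants are consumed from the duplicated K512x2 table instead of K512. A sketch of the intended layout, assuming DUP2(a, b) expands to a, b, a, b (lane order is an assumption):

// one 256-bit vector, word-pair index i:
//   x[i]           = [ block0 w[2i], w[2i+1] | block1 w[2i], w[2i+1] ]
//   K512x2[4*i..]  = [ K512_2i, K512_2i+1    | K512_2i, K512_2i+1    ]

The constant-added words for the first block feed the rounds immediately through ms->w, while the second block's words are spilled to t2[] and replayed later by process_second_block().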
/src/sha512_compress_x86_64_avx2.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA512 using avx2 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx2_defs.h" 15 | #include "sha512_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD64, ALIGNR8, SRL64, SLL64 18 | // that are defined in avx2_defs.h 19 | #include "sha512_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 2 blocks in parallel 22 | #define MS_VEC_NUM ((2 * SHA512_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha512_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha512_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha512_msg_schedule_t *ms, 28 | sha512_word_t t2[SHA512_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 64 bits (8 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm256_set_epi64x(DUP2(0x08090a0b0c0d0e0f, 0x0001020304050607)); 34 | 35 | PRAGMA_LOOP_UNROLL_8 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 2) * i; 39 | const size_t pos1 = pos0 + SHA512_BLOCK_BYTE_LEN; 40 | 41 | LOADU2(&data[pos1], &data[pos0], x[i]); 42 | x[i] = SHUF8(x[i], shuf_mask); 43 | vec_t y = ADD64(x[i], LOAD(&K512x2[4 * i])); 44 | STOREU2(&t2[2 * i], &ms->w[2 * i], y); 45 | } 46 | } 47 | 48 | _INLINE_ void rounds_0_63(sha512_state_t * cur_state, 49 | vec_t x[MS_VEC_NUM], 50 | sha512_msg_schedule_t *ms, 51 | sha512_word_t t2[SHA512_ROUNDS_NUM]) 52 | { 53 | // The first SHA512_BLOCK_WORDS_NUM entries of K512 were loaded in 54 | // load_data(...). 55 | size_t k512_idx = 2 * SHA512_BLOCK_WORDS_NUM; 56 | 57 | // Rounds 0-63 (0-15, 16-31, 32-47, 48-63) 58 | for(size_t i = 1; i < 5; i++) { 59 | 60 | PRAGMA_LOOP_UNROLL_8 61 | 62 | for(size_t j = 0; j < 8; j++) { 63 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 64 | 65 | const vec_t y = sha512_update_x_avx(x, &K512x2[k512_idx]); 66 | 67 | sha_round(cur_state, ms->w[pos], 0); 68 | sha_round(cur_state, ms->w[pos + 1], 0); 69 | STOREU2(&t2[(16 * i) + pos], &ms->w[pos], y); 70 | k512_idx += WORDS_IN_VEC; 71 | } 72 | } 73 | } 74 | 75 | _INLINE_ void rounds_64_79(sha512_state_t * cur_state, 76 | const sha512_msg_schedule_t *ms) 77 | { 78 | PRAGMA_LOOP_UNROLL_16 79 | 80 | for(size_t i = SHA512_FINAL_ROUND_START_IDX; i < SHA512_ROUNDS_NUM; i++) { 81 | sha_round(cur_state, ms->w[LSB4(i)], 0); 82 | } 83 | } 84 | 85 | _INLINE_ void process_second_block(sha512_state_t * cur_state, 86 | const sha512_word_t t2[SHA512_ROUNDS_NUM]) 87 | { 88 | PRAGMA_LOOP_UNROLL_80 89 | 90 | for(size_t i = 0; i < SHA512_ROUNDS_NUM; i++) { 91 | sha_round(cur_state, t2[i], 0); 92 | } 93 | } 94 | 95 | void sha512_compress_x86_64_avx2(sha512_state_t *state, 96 | const uint8_t * data, 97 | size_t blocks_num) 98 | { 99 | ALIGN(64) sha512_msg_schedule_t ms; 100 | ALIGN(64) sha512_word_t t2[SHA512_ROUNDS_NUM]; 101 | sha512_state_t cur_state; 102 | vec_t x[MS_VEC_NUM]; 103 | 104 | if(LSB1(blocks_num)) { 105 | sha512_compress_x86_64_avx(state, data, 1); 106 | data += SHA512_BLOCK_BYTE_LEN; 107 | blocks_num--; 108 | } 109 | 110 | // Process two blocks in parallel 111 | // Here blocks_num is even 112 | for(size_t b = blocks_num; b != 0; b -= 2) { 113 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 114 | 115 | load_data(x, &ms, t2, data); 116 | data += 2 * SHA512_BLOCK_BYTE_LEN; 117 | 118 | // First block 119 | rounds_0_63(&cur_state, x, &ms, t2); 120 | rounds_64_79(&cur_state, &ms); 121 | accumulate_state(state, &cur_state); 122 | 123 | // Second block 124 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 125 | process_second_block(&cur_state,
t2); 126 | accumulate_state(state, &cur_state); 127 | } 128 | 129 | secure_clean(&cur_state, sizeof(cur_state)); 130 | secure_clean(&ms, sizeof(ms)); 131 | secure_clean(t2, sizeof(t2)); 132 | } 133 | -------------------------------------------------------------------------------- /src/sha512_compress_x86_64_avx512.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | // 4 | // An implementation of the compress function of SHA512 using avx512 5 | // The implementation is based on: 6 | // Gueron, S., Krasnov, V. Parallelizing message schedules to accelerate the 7 | // computations of hash functions. J Cryptogr Eng 2, 241–253 (2012). 8 | // https://doi.org/10.1007/s13389-012-0037-z 9 | // 10 | // Written by Nir Drucker and Shay Gueron 11 | // AWS Cryptographic Algorithms Group. 12 | // (ndrucker@amazon.com, gueron@amazon.com) 13 | 14 | #include "internal/avx512_defs.h" 15 | #include "sha512_defs.h" 16 | 17 | // This file depends on vec_t and on the macros LOAD, ADD64, ALIGNR8, SRL64, SLL64 18 | // that are defined in avx512_defs.h 19 | #include "sha512_compress_x86_64_avx_helper.c" 20 | 21 | // Processing 4 blocks in parallel 22 | #define MS_VEC_NUM ((4 * SHA512_BLOCK_BYTE_LEN) / sizeof(vec_t)) 23 | #define WORDS_IN_128_BIT_VEC (16 / sizeof(sha512_word_t)) 24 | #define WORDS_IN_VEC (sizeof(vec_t) / sizeof(sha512_word_t)) 25 | 26 | _INLINE_ void load_data(vec_t x[MS_VEC_NUM], 27 | sha512_msg_schedule_t *ms, 28 | sha512_word_t x2_4[][SHA512_ROUNDS_NUM], 29 | const uint8_t * data) 30 | { 31 | // 64 bits (8 bytes) swap masks 32 | const vec_t shuf_mask = 33 | _mm512_set_epi64(DUP4(0x08090a0b0c0d0e0f, 0x0001020304050607)); 34 | 35 | PRAGMA_LOOP_UNROLL_8 36 | 37 | for(size_t i = 0; i < MS_VEC_NUM; i++) { 38 | const size_t pos0 = (sizeof(vec_t) / 4) * i; 39 | const size_t pos1 = pos0 + SHA512_BLOCK_BYTE_LEN; 40 | const size_t pos2 = pos1 + SHA512_BLOCK_BYTE_LEN; 41 | const size_t pos3 = pos2 + SHA512_BLOCK_BYTE_LEN; 42 | LOADU4(&data[pos3], &data[pos2], &data[pos1], &data[pos0], x[i]); 43 | 44 | x[i] = SHUF8(x[i], shuf_mask); 45 | vec_t y = ADD64(x[i], LOAD(&K512x4[8 * i])); 46 | 47 | STOREU4(&x2_4[2][2 * i], &x2_4[1][2 * i], &x2_4[0][2 * i], &ms->w[2 * i], y); 48 | } 49 | } 50 | 51 | _INLINE_ void rounds_0_63(sha512_state_t * cur_state, 52 | vec_t x[MS_VEC_NUM], 53 | sha512_msg_schedule_t *ms, 54 | sha512_word_t x2_4[][SHA512_ROUNDS_NUM]) 55 | { 56 | // The first SHA512_BLOCK_WORDS_NUM entries of K512 were loaded in 57 | // load_data(...). 
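// K512x4 holds every round constant duplicated four times (one copy per
// 128-bit lane), so k512_idx below starts at the round-16 constants
// (4 * SHA512_BLOCK_WORDS_NUM words in) and advances by WORDS_IN_VEC (8)
// words per 512-bit vector; (k512_idx >> 2) recovers the per-block round
// index used for the x2_4 spill slots.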
58 | size_t k512_idx = 4 * SHA512_BLOCK_WORDS_NUM; 59 | 60 | // Rounds 0-63 (0-15, 16-31, 32-47, 48-63) 61 | for(size_t i = 1; i < 5; i++) { 62 | 63 | PRAGMA_LOOP_UNROLL_8 64 | 65 | for(size_t j = 0; j < MS_VEC_NUM; j++) { 66 | const size_t pos = WORDS_IN_128_BIT_VEC * j; 67 | const vec_t y = sha512_update_x_avx(x, &K512x4[k512_idx]); 68 | 69 | sha_round(cur_state, ms->w[pos], 0); 70 | sha_round(cur_state, ms->w[pos + 1], 0); 71 | const size_t idx = k512_idx >> 2; 72 | 73 | STOREU4(&x2_4[2][idx], &x2_4[1][idx], &x2_4[0][idx], &ms->w[pos], y); 74 | k512_idx += WORDS_IN_VEC; 75 | } 76 | } 77 | } 78 | 79 | _INLINE_ void rounds_64_79(sha512_state_t * cur_state, 80 | const sha512_msg_schedule_t *ms) 81 | { 82 | PRAGMA_LOOP_UNROLL_16 83 | 84 | for(size_t i = SHA512_FINAL_ROUND_START_IDX; i < SHA512_ROUNDS_NUM; i++) { 85 | sha_round(cur_state, ms->w[LSB4(i)], 0); 86 | } 87 | } 88 | 89 | _INLINE_ void process_extra_block(sha512_state_t * cur_state, 90 | const sha512_word_t t[SHA512_ROUNDS_NUM]) 91 | { 92 | PRAGMA_LOOP_UNROLL_80 93 | 94 | for(size_t i = 0; i < SHA512_ROUNDS_NUM; i++) { 95 | sha_round(cur_state, t[i], 0); 96 | } 97 | } 98 | 99 | void sha512_compress_x86_64_avx512(sha512_state_t *state, 100 | const uint8_t * data, 101 | size_t blocks_num) 102 | { 103 | ALIGN(64) sha512_msg_schedule_t ms; 104 | ALIGN(64) sha512_word_t x2_4[3][SHA512_ROUNDS_NUM]; 105 | sha512_state_t cur_state; 106 | vec_t x[MS_VEC_NUM]; 107 | 108 | const size_t rem = LSB2(blocks_num); 109 | if(rem != 0) { 110 | sha512_compress_x86_64_avx2(state, data, rem); 111 | data += rem * SHA512_BLOCK_BYTE_LEN; 112 | blocks_num -= rem; 113 | } 114 | 115 | // Process four blocks in parallel 116 | // Here blocks_num is divisible by 4 117 | for(size_t b = blocks_num; b != 0; b -= 4) { 118 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 119 | 120 | load_data(x, &ms, x2_4, data); 121 | data += 4 * SHA512_BLOCK_BYTE_LEN; 122 | 123 | // First block 124 | rounds_0_63(&cur_state, x, &ms, x2_4); 125 | rounds_64_79(&cur_state, &ms); 126 | accumulate_state(state, &cur_state); 127 | 128 | for(size_t i = 0; i <= 2; i++) { 129 | my_memcpy(cur_state.w, state->w, sizeof(cur_state.w)); 130 | process_extra_block(&cur_state, x2_4[i]); 131 | accumulate_state(state, &cur_state); 132 | } 133 | } 134 | 135 | secure_clean(&cur_state, sizeof(cur_state)); 136 | secure_clean(&ms, sizeof(ms)); 137 | secure_clean(x2_4, sizeof(x2_4)); 138 | } 139 | -------------------------------------------------------------------------------- /src/sha512_compress_x86_64_avx_helper.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | // An implementation of the compress function of SHA512 using avx/avx2/avx512 5 | // It was translated from assembly (OpenSSL) to C by 6 | // 7 | // Nir Drucker and Shay Gueron 8 | // AWS Cryptographic Algorithms Group.
9 | // (ndrucker@amazon.com, gueron@amazon.com) 10 | 11 | // This file depends on vec_t and on the following macros: 12 | // LOAD, ADD64, ALIGNR8, SRL64, SLL64 13 | 14 | #define SHA512_WORD_BIT_LEN (8 * sizeof(sha512_word_t)) 15 | 16 | _INLINE_ void rotate_x(vec_t x[8]) 17 | { 18 | const vec_t tmp = x[0]; 19 | 20 | for(size_t i = 0; i < 7; i++) { 21 | x[i] = x[i + 1]; 22 | } 23 | 24 | x[7] = tmp; 25 | } 26 | 27 | #ifndef ALTERNATIVE_AVX512_IMPL 28 | 29 | _INLINE_ vec_t sha512_update_x_avx(vec_t x[8], const sha512_word_t *K512_p) 30 | { 31 | vec_t t[4]; 32 | 33 | // This function receives 8 128-bit registers X[7:0]=q[15:0] and calculates: 34 | // s0 = sigma0(q[(i + 1) % 16]) 35 | // s1 = sigma1(q[(i + 14) % 16]) 36 | // q[i % 16] += s0 + s1 + q[(i + 9) % 16] 37 | // 38 | // For X[0]=q[3:0] 39 | // 40 | // This means that 41 | // res[0] depends on q[1] (for s0) q[14] (for s1) and q[9] 42 | // res[1] depends on q[2] (for s0) q[15] (for s1) and q[10] 43 | // res[2] depends on q[3] (for s0) res[0] (for s1) and q[11] 44 | // res[3] depends on q[4] (for s0) res[1] (for s1) and q[12] 45 | 46 | t[0] = ALIGNR8(x[1], x[0], 8); // q[2:1] 47 | t[3] = ALIGNR8(x[5], x[4], 8); // q[10:9] 48 | t[2] = SRL64(t[0], sigma0_0); // q[2:1] >> s0[0] 49 | x[0] = ADD64(x[0], t[3]); // q[1:0] + q[10:9] 50 | t[3] = SRL64(t[0], sigma0_2); // q[2:1] >> s0[2] 51 | t[1] = SLL64(t[0], SHA512_WORD_BIT_LEN - sigma0_1); // q[2:1] << (64 - s0[1]) 52 | t[0] = t[3] ^ t[2]; // (q[2:1] >> s0[2]) ^ 53 | // (q[2:1] >> s0[0]) 54 | t[2] = SRL64(t[2], sigma0_1 - sigma0_0); // q[2:1] >> s0[1] 55 | t[0] ^= t[1]; // (q[2:1] >> s0[2]) ^ 56 | // (q[2:1] >> s0[0]) ^ 57 | // q[2:1] << (64 - s0[1]) 58 | t[1] = SLL64(t[1], sigma0_1 - sigma0_0); // q[2:1] << (64 - s0[0]) 59 | t[0] ^= t[2] ^ t[1]; // sigma0(q[2:1]) 60 | t[3] = SRL64(x[7], sigma1_2); // q[15:14] >> s1[2] 61 | t[2] = SLL64(x[7], SHA512_WORD_BIT_LEN - sigma1_1); // q[15:14] << (64 - s1[1]) 62 | x[0] = ADD64(x[0], t[0]); // q[1:0] + sigma0(q[2:1]) 63 | t[1] = SRL64(x[7], sigma1_0); // q[15:14] >> s1[0] 64 | t[3] ^= t[2]; // q[15:14] >> s1[2] ^ 65 | // q[15:14] << (64 - s1[1]) 66 | t[2] = SLL64(t[2], sigma1_1 - sigma1_0); // q[15:14] << (64 - s1[0]) 67 | t[3] ^= t[1]; // q[15:14] >> s1[2] ^ 68 | // q[15:14] << (64 - s1[1]) ^ 69 | // q[15:14] >> s1[0] 70 | t[1] = SRL64(t[1], sigma1_1 - sigma1_0); // q[15:14] >> s1[1] 71 | t[3] ^= t[2] ^ t[1]; // sigma1(q[15:14]) 72 | 73 | // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1]) 74 | x[0] = ADD64(x[0], t[3]); 75 | 76 | rotate_x(x); 77 | 78 | return ADD64(x[7], LOAD(K512_p)); 79 | } 80 | 81 | #else 82 | 83 | _INLINE_ vec_t sha512_update_x_avx(vec_t x[8], const sha512_word_t *k512_p) 84 | { 85 | vec_t t[2]; 86 | vec_t s0; 87 | vec_t s1; 88 | 89 | // This function receives 8 wide registers X[7:0]=q[15:0] and calculates: 90 | // s0 = sigma0(q[2:1]) 91 | // s1 = sigma1(q[15:14]) 92 | // q[1:0] += s0 + s1 + q[10:9] 93 | 94 | t[0] = ALIGNR8(x[1], x[0], 8); // q[2:1] 95 | t[1] = ALIGNR8(x[5], x[4], 8); // q[10:9] 96 | s0 = ROR64(t[0], sigma0_0) ^ ROR64(t[0], sigma0_1) ^ SRL64(t[0], sigma0_2); 97 | s1 = ROR64(x[7], sigma1_0) ^ ROR64(x[7], sigma1_1) ^ SRL64(x[7], sigma1_2); 98 | x[0] = ADD64(ADD64(ADD64(x[0], s1), s0), t[1]); 99 | 100 | rotate_x(x); 101 | 102 | return ADD64(x[7], LOAD(k512_p)); 103 | } 104 | 105 | #endif 106 | -------------------------------------------------------------------------------- /src/sha512_consts.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc.
or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include "sha512_defs.h" 5 | 6 | #define K512_0 UINT64_C(0x428a2f98d728ae22) 7 | #define K512_1 UINT64_C(0x7137449123ef65cd) 8 | #define K512_2 UINT64_C(0xb5c0fbcfec4d3b2f) 9 | #define K512_3 UINT64_C(0xe9b5dba58189dbbc) 10 | #define K512_4 UINT64_C(0x3956c25bf348b538) 11 | #define K512_5 UINT64_C(0x59f111f1b605d019) 12 | #define K512_6 UINT64_C(0x923f82a4af194f9b) 13 | #define K512_7 UINT64_C(0xab1c5ed5da6d8118) 14 | #define K512_8 UINT64_C(0xd807aa98a3030242) 15 | #define K512_9 UINT64_C(0x12835b0145706fbe) 16 | #define K512_10 UINT64_C(0x243185be4ee4b28c) 17 | #define K512_11 UINT64_C(0x550c7dc3d5ffb4e2) 18 | #define K512_12 UINT64_C(0x72be5d74f27b896f) 19 | #define K512_13 UINT64_C(0x80deb1fe3b1696b1) 20 | #define K512_14 UINT64_C(0x9bdc06a725c71235) 21 | #define K512_15 UINT64_C(0xc19bf174cf692694) 22 | #define K512_16 UINT64_C(0xe49b69c19ef14ad2) 23 | #define K512_17 UINT64_C(0xefbe4786384f25e3) 24 | #define K512_18 UINT64_C(0x0fc19dc68b8cd5b5) 25 | #define K512_19 UINT64_C(0x240ca1cc77ac9c65) 26 | #define K512_20 UINT64_C(0x2de92c6f592b0275) 27 | #define K512_21 UINT64_C(0x4a7484aa6ea6e483) 28 | #define K512_22 UINT64_C(0x5cb0a9dcbd41fbd4) 29 | #define K512_23 UINT64_C(0x76f988da831153b5) 30 | #define K512_24 UINT64_C(0x983e5152ee66dfab) 31 | #define K512_25 UINT64_C(0xa831c66d2db43210) 32 | #define K512_26 UINT64_C(0xb00327c898fb213f) 33 | #define K512_27 UINT64_C(0xbf597fc7beef0ee4) 34 | #define K512_28 UINT64_C(0xc6e00bf33da88fc2) 35 | #define K512_29 UINT64_C(0xd5a79147930aa725) 36 | #define K512_30 UINT64_C(0x06ca6351e003826f) 37 | #define K512_31 UINT64_C(0x142929670a0e6e70) 38 | #define K512_32 UINT64_C(0x27b70a8546d22ffc) 39 | #define K512_33 UINT64_C(0x2e1b21385c26c926) 40 | #define K512_34 UINT64_C(0x4d2c6dfc5ac42aed) 41 | #define K512_35 UINT64_C(0x53380d139d95b3df) 42 | #define K512_36 UINT64_C(0x650a73548baf63de) 43 | #define K512_37 UINT64_C(0x766a0abb3c77b2a8) 44 | #define K512_38 UINT64_C(0x81c2c92e47edaee6) 45 | #define K512_39 UINT64_C(0x92722c851482353b) 46 | #define K512_40 UINT64_C(0xa2bfe8a14cf10364) 47 | #define K512_41 UINT64_C(0xa81a664bbc423001) 48 | #define K512_42 UINT64_C(0xc24b8b70d0f89791) 49 | #define K512_43 UINT64_C(0xc76c51a30654be30) 50 | #define K512_44 UINT64_C(0xd192e819d6ef5218) 51 | #define K512_45 UINT64_C(0xd69906245565a910) 52 | #define K512_46 UINT64_C(0xf40e35855771202a) 53 | #define K512_47 UINT64_C(0x106aa07032bbd1b8) 54 | #define K512_48 UINT64_C(0x19a4c116b8d2d0c8) 55 | #define K512_49 UINT64_C(0x1e376c085141ab53) 56 | #define K512_50 UINT64_C(0x2748774cdf8eeb99) 57 | #define K512_51 UINT64_C(0x34b0bcb5e19b48a8) 58 | #define K512_52 UINT64_C(0x391c0cb3c5c95a63) 59 | #define K512_53 UINT64_C(0x4ed8aa4ae3418acb) 60 | #define K512_54 UINT64_C(0x5b9cca4f7763e373) 61 | #define K512_55 UINT64_C(0x682e6ff3d6b2b8a3) 62 | #define K512_56 UINT64_C(0x748f82ee5defb2fc) 63 | #define K512_57 UINT64_C(0x78a5636f43172f60) 64 | #define K512_58 UINT64_C(0x84c87814a1f0ab72) 65 | #define K512_59 UINT64_C(0x8cc702081a6439ec) 66 | #define K512_60 UINT64_C(0x90befffa23631e28) 67 | #define K512_61 UINT64_C(0xa4506cebde82bde9) 68 | #define K512_62 UINT64_C(0xbef9a3f7b2c67915) 69 | #define K512_63 UINT64_C(0xc67178f2e372532b) 70 | #define K512_64 UINT64_C(0xca273eceea26619c) 71 | #define K512_65 UINT64_C(0xd186b8c721c0c207) 72 | #define K512_66 UINT64_C(0xeada7dd6cde0eb1e) 73 | #define K512_67 UINT64_C(0xf57d4f7fee6ed178) 74 | #define K512_68 
UINT64_C(0x06f067aa72176fba) 75 | #define K512_69 UINT64_C(0x0a637dc5a2c898a6) 76 | #define K512_70 UINT64_C(0x113f9804bef90dae) 77 | #define K512_71 UINT64_C(0x1b710b35131c471b) 78 | #define K512_72 UINT64_C(0x28db77f523047d84) 79 | #define K512_73 UINT64_C(0x32caab7b40c72493) 80 | #define K512_74 UINT64_C(0x3c9ebe0a15c9bebc) 81 | #define K512_75 UINT64_C(0x431d67c49c100d4c) 82 | #define K512_76 UINT64_C(0x4cc5d4becb3e42b6) 83 | #define K512_77 UINT64_C(0x597f299cfc657e2a) 84 | #define K512_78 UINT64_C(0x5fcb6fab3ad6faec) 85 | #define K512_79 UINT64_C(0x6c44198c4a475817) 86 | 87 | ALIGN(64) 88 | const sha512_word_t K512[SHA512_ROUNDS_NUM] = { 89 | K512_0, K512_1, K512_2, K512_3, K512_4, K512_5, K512_6, K512_7, K512_8, 90 | K512_9, K512_10, K512_11, K512_12, K512_13, K512_14, K512_15, K512_16, K512_17, 91 | K512_18, K512_19, K512_20, K512_21, K512_22, K512_23, K512_24, K512_25, K512_26, 92 | K512_27, K512_28, K512_29, K512_30, K512_31, K512_32, K512_33, K512_34, K512_35, 93 | K512_36, K512_37, K512_38, K512_39, K512_40, K512_41, K512_42, K512_43, K512_44, 94 | K512_45, K512_46, K512_47, K512_48, K512_49, K512_50, K512_51, K512_52, K512_53, 95 | K512_54, K512_55, K512_56, K512_57, K512_58, K512_59, K512_60, K512_61, K512_62, 96 | K512_63, K512_64, K512_65, K512_66, K512_67, K512_68, K512_69, K512_70, K512_71, 97 | K512_72, K512_73, K512_74, K512_75, K512_76, K512_77, K512_78, K512_79, 98 | }; 99 | 100 | ALIGN(64) 101 | const sha512_word_t K512x2[2 * SHA512_ROUNDS_NUM] = { 102 | DUP2(K512_0, K512_1), DUP2(K512_2, K512_3), DUP2(K512_4, K512_5), 103 | DUP2(K512_6, K512_7), DUP2(K512_8, K512_9), DUP2(K512_10, K512_11), 104 | DUP2(K512_12, K512_13), DUP2(K512_14, K512_15), DUP2(K512_16, K512_17), 105 | DUP2(K512_18, K512_19), DUP2(K512_20, K512_21), DUP2(K512_22, K512_23), 106 | DUP2(K512_24, K512_25), DUP2(K512_26, K512_27), DUP2(K512_28, K512_29), 107 | DUP2(K512_30, K512_31), DUP2(K512_32, K512_33), DUP2(K512_34, K512_35), 108 | DUP2(K512_36, K512_37), DUP2(K512_38, K512_39), DUP2(K512_40, K512_41), 109 | DUP2(K512_42, K512_43), DUP2(K512_44, K512_45), DUP2(K512_46, K512_47), 110 | DUP2(K512_48, K512_49), DUP2(K512_50, K512_51), DUP2(K512_52, K512_53), 111 | DUP2(K512_54, K512_55), DUP2(K512_56, K512_57), DUP2(K512_58, K512_59), 112 | DUP2(K512_60, K512_61), DUP2(K512_62, K512_63), DUP2(K512_64, K512_65), 113 | DUP2(K512_66, K512_67), DUP2(K512_68, K512_69), DUP2(K512_70, K512_71), 114 | DUP2(K512_72, K512_73), DUP2(K512_74, K512_75), DUP2(K512_76, K512_77), 115 | DUP2(K512_78, K512_79), 116 | }; 117 | 118 | ALIGN(64) 119 | const sha512_word_t K512x4[4 * SHA512_ROUNDS_NUM] = { 120 | DUP4(K512_0, K512_1), DUP4(K512_2, K512_3), DUP4(K512_4, K512_5), 121 | DUP4(K512_6, K512_7), DUP4(K512_8, K512_9), DUP4(K512_10, K512_11), 122 | DUP4(K512_12, K512_13), DUP4(K512_14, K512_15), DUP4(K512_16, K512_17), 123 | DUP4(K512_18, K512_19), DUP4(K512_20, K512_21), DUP4(K512_22, K512_23), 124 | DUP4(K512_24, K512_25), DUP4(K512_26, K512_27), DUP4(K512_28, K512_29), 125 | DUP4(K512_30, K512_31), DUP4(K512_32, K512_33), DUP4(K512_34, K512_35), 126 | DUP4(K512_36, K512_37), DUP4(K512_38, K512_39), DUP4(K512_40, K512_41), 127 | DUP4(K512_42, K512_43), DUP4(K512_44, K512_45), DUP4(K512_46, K512_47), 128 | DUP4(K512_48, K512_49), DUP4(K512_50, K512_51), DUP4(K512_52, K512_53), 129 | DUP4(K512_54, K512_55), DUP4(K512_56, K512_57), DUP4(K512_58, K512_59), 130 | DUP4(K512_60, K512_61), DUP4(K512_62, K512_63), DUP4(K512_64, K512_65), 131 | DUP4(K512_66, K512_67), DUP4(K512_68, K512_69), DUP4(K512_70, K512_71), 132 | 
DUP4(K512_72, K512_73), DUP4(K512_74, K512_75), DUP4(K512_76, K512_77), 133 | DUP4(K512_78, K512_79), 134 | }; 135 | -------------------------------------------------------------------------------- /tests/main_speed.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | 7 | #include "measurements.h" 8 | #include "sha.h" 9 | #include "test.h" 10 | 11 | #define MAX_MSG_BYTE_LEN (65536UL) 12 | 13 | _INLINE_ void speed_sha256(void) 14 | { 15 | uint8_t dgst[SHA256_HASH_BYTE_LEN] = {0}; 16 | uint8_t data[MAX_MSG_BYTE_LEN] = {0}; 17 | 18 | // Use a deterministic seed. 19 | srand(0); 20 | rand_data(data, sizeof(data)); 21 | 22 | printf("\nSHA-256 Benchmark:"); 23 | printf("\n------------------\n"); 24 | printf(" msg generic"); 25 | 26 | // X86-64 specific options 27 | RUN_X86_64(printf(" avx (C) avx (ossl)");); 28 | RUN_AVX2(printf(" avx2 (C) avx2 (ossl)");); 29 | RUN_AVX512(printf(" avx512 (C)");); 30 | RUN_X86_64_SHA_EXT(printf(" sha ext (C) sha ext (ossl) \n");); 31 | 32 | // Aarch64 specific options 33 | RUN_NEON(printf(" neon (ossl)");); 34 | RUN_AARCH64_SHA_EXT(printf(" sha ext (C) sha ext (ossl) \n");); 35 | 36 | printf("\n"); 37 | for(size_t msg_byte_len = 1; msg_byte_len <= MAX_MSG_BYTE_LEN; 38 | msg_byte_len <<= 1) { 39 | 40 | printf("%5ld bytes", msg_byte_len); 41 | MEASURE(sha256(dgst, data, msg_byte_len, GENERIC_IMPL);); 42 | 43 | // X86-64 specific options 44 | RUN_X86_64(MEASURE(sha256(dgst, data, msg_byte_len, AVX_IMPL););); 45 | RUN_X86_64(MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_AVX_IMPL););); 46 | RUN_AVX2(MEASURE(sha256(dgst, data, msg_byte_len, AVX2_IMPL););); 47 | RUN_AVX2(MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_AVX2_IMPL););); 48 | RUN_AVX512(MEASURE(sha256(dgst, data, msg_byte_len, AVX512_IMPL););); 49 | RUN_X86_64_SHA_EXT(MEASURE(sha256(dgst, data, msg_byte_len, SHA_EXT_IMPL););); 50 | RUN_X86_64_SHA_EXT( 51 | MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_SHA_EXT_IMPL););); 52 | 53 | // Aarch64 specific options 54 | RUN_NEON(MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_NEON_IMPL););); 55 | RUN_AARCH64_SHA_EXT( 56 | MEASURE(sha256(dgst, data, msg_byte_len, SHA_EXT_IMPL););); 57 | RUN_AARCH64_SHA_EXT( 58 | MEASURE(sha256(dgst, data, msg_byte_len, OPENSSL_SHA_EXT_IMPL););); 59 | 60 | printf("\n"); 61 | } 62 | } 63 | 64 | _INLINE_ void speed_sha512(void) 65 | { 66 | uint8_t dgst[SHA512_HASH_BYTE_LEN] = {0}; 67 | uint8_t data[MAX_MSG_BYTE_LEN] = {0}; 68 | 69 | // Use a deterministic seed.
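// (A fixed seed makes every run hash the same pseudo-random buffer, so the
// reported cycle counts stay comparable across runs and machines.)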
70 | srand(0); 71 | rand_data(data, sizeof(data)); 72 | 73 | printf("\nSHA-512 Benchmark:"); 74 | printf("\n------------------\n"); 75 | printf(" msg generic"); 76 | 77 | // X86-64 specific options 78 | RUN_X86_64(printf(" avx (C) avx (ossl)");); 79 | RUN_AVX2(printf(" avx2 (C) avx2 (ossl)");); 80 | RUN_AVX512(printf(" avx512 (C)");); 81 | 82 | // Aarch64 specific options 83 | RUN_NEON(printf(" neon (ossl)");); 84 | 85 | printf("\n"); 86 | 87 | for(size_t msg_byte_len = 1; msg_byte_len <= MAX_MSG_BYTE_LEN; 88 | msg_byte_len <<= 1) { 89 | 90 | printf("%5ld bytes", msg_byte_len); 91 | MEASURE(sha512(dgst, data, msg_byte_len, GENERIC_IMPL);); 92 | 93 | // X86-64 specific options 94 | RUN_X86_64(MEASURE(sha512(dgst, data, msg_byte_len, AVX_IMPL););); 95 | RUN_X86_64(MEASURE(sha512(dgst, data, msg_byte_len, OPENSSL_AVX_IMPL););); 96 | RUN_AVX2(MEASURE(sha512(dgst, data, msg_byte_len, AVX2_IMPL););); 97 | RUN_AVX2(MEASURE(sha512(dgst, data, msg_byte_len, OPENSSL_AVX2_IMPL););); 98 | RUN_AVX512(MEASURE(sha512(dgst, data, msg_byte_len, AVX512_IMPL););); 99 | 100 | // Aarch64 specific options 101 | RUN_NEON(MEASURE(sha512(dgst, data, msg_byte_len, OPENSSL_NEON_IMPL););); 102 | 103 | printf("\n"); 104 | } 105 | } 106 | 107 | int main(void) 108 | { 109 | speed_sha256(); 110 | speed_sha512(); 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /tests/main_tests.c: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | #include <string.h> 7 | 8 | #include <openssl/sha.h> 9 | 10 | #include "sha.h" 11 | #include "test.h" 12 | 13 | #define SHA256_TEST_MAX_MSG_BYTE_LEN (6400) 14 | #define SHA512_TEST_MAX_MSG_BYTE_LEN (12800) 15 | 16 | #if !defined(MONTE_CARLO_NUM_OF_TESTS) 17 | # define MONTE_CARLO_NUM_OF_TESTS (100000) 18 | #endif 19 | 20 | _INLINE_ int test_sha256_impl(IN const sha_impl_t impl, 21 | IN const uint8_t *data, 22 | IN const uint8_t *ref_dgst, 23 | IN const size_t byte_len) 24 | { 25 | uint8_t tst_dgst[SHA256_HASH_BYTE_LEN] = {0}; 26 | sha256(tst_dgst, data, byte_len, impl); 27 | 28 | if(0 != memcmp(ref_dgst, tst_dgst, SHA256_HASH_BYTE_LEN)) { 29 | printf("Digest mismatch for impl=%d and size=%ld\n", impl, byte_len); 30 | print(ref_dgst, SHA256_HASH_BYTE_LEN); 31 | print(tst_dgst, SHA256_HASH_BYTE_LEN); 32 | return FAILURE; 33 | } 34 | 35 | return SUCCESS; 36 | } 37 | 38 | _INLINE_ int test_sha256() 39 | { 40 | uint8_t ref_dgst[SHA256_HASH_BYTE_LEN] = {0}; 41 | uint8_t data[SHA256_TEST_MAX_MSG_BYTE_LEN] = {0}; 42 | 43 | // Use a deterministic seed.
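// (A fixed seed means a failing case can be reproduced exactly by rerunning
// the binary.)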
44 | srand(0); 45 | rand_data(data, sizeof(data)); 46 | 47 | printf("Testing SHA256 Short/Long tests\n"); 48 | 49 | for(size_t byte_len = 0; byte_len <= sizeof(data); byte_len++) { 50 | SHA256(data, byte_len, ref_dgst); 51 | 52 | GUARD(test_sha256_impl(GENERIC_IMPL, data, ref_dgst, byte_len)); 53 | 54 | // X86-64 specific options 55 | RUN_X86_64(GUARD(test_sha256_impl(AVX_IMPL, data, ref_dgst, byte_len));); 56 | RUN_AVX2(GUARD(test_sha256_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 57 | RUN_AVX512(GUARD(test_sha256_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 58 | RUN_X86_64_SHA_EXT( 59 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 60 | 61 | // Aarch64 specific options 62 | RUN_AARCH64_SHA_EXT( 63 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 64 | } 65 | 66 | printf("Testing SHA256 Monte Carlo tests\n"); 67 | 68 | // Perform 100,000 Monte Carlo tests. 69 | for(size_t i = 0; i < MONTE_CARLO_NUM_OF_TESTS; i++) { 70 | 71 | printf("\rTesting case=%ld", i); 72 | 73 | // Generate a random message and a reference digest. 74 | size_t byte_len = rand() % sizeof(data); 75 | rand_data(data, byte_len); 76 | SHA256(data, byte_len, ref_dgst); 77 | 78 | // X86-64 specific options 79 | RUN_X86_64(GUARD(test_sha256_impl(AVX_IMPL, data, ref_dgst, byte_len));); 80 | RUN_AVX2(GUARD(test_sha256_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 81 | RUN_AVX512(GUARD(test_sha256_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 82 | RUN_X86_64_SHA_EXT( 83 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 84 | 85 | // Aarch64 specific options 86 | RUN_AARCH64_SHA_EXT( 87 | GUARD(test_sha256_impl(SHA_EXT_IMPL, data, ref_dgst, byte_len));); 88 | } 89 | 90 | printf("\n"); 91 | return SUCCESS; 92 | } 93 | 94 | _INLINE_ int test_sha512_impl(IN const sha_impl_t impl, 95 | IN const uint8_t *data, 96 | IN const uint8_t *ref_dgst, 97 | IN const size_t byte_len) 98 | { 99 | uint8_t tst_dgst[SHA512_HASH_BYTE_LEN] = {0}; 100 | sha512(tst_dgst, data, byte_len, impl); 101 | 102 | if(0 != memcmp(ref_dgst, tst_dgst, SHA512_HASH_BYTE_LEN)) { 103 | printf("Digest mismatch for impl=%d and size=%ld\n", impl, byte_len); 104 | print(ref_dgst, SHA512_HASH_BYTE_LEN); 105 | print(tst_dgst, SHA512_HASH_BYTE_LEN); 106 | return FAILURE; 107 | } 108 | 109 | return SUCCESS; 110 | } 111 | 112 | _INLINE_ int test_sha512() 113 | { 114 | uint8_t ref_dgst[SHA512_HASH_BYTE_LEN] = {0}; 115 | uint8_t data[SHA512_TEST_MAX_MSG_BYTE_LEN] = {0}; 116 | 117 | // Use a deterministic seed. 118 | srand(0); 119 | rand_data(data, sizeof(data)); 120 | 121 | printf("Testing SHA512 Short/Long tests\n"); 122 | 123 | for(size_t byte_len = 0; byte_len <= sizeof(data); byte_len++) { 124 | SHA512(data, byte_len, ref_dgst); 125 | 126 | GUARD(test_sha512_impl(GENERIC_IMPL, data, ref_dgst, byte_len)); 127 | 128 | // X86-64 specific options 129 | RUN_X86_64(GUARD(test_sha512_impl(AVX_IMPL, data, ref_dgst, byte_len));); 130 | RUN_AVX2(GUARD(test_sha512_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 131 | RUN_AVX512(GUARD(test_sha512_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 132 | } 133 | 134 | printf("Testing SHA512 Monte Carlo tests\n"); 135 | 136 | // Perform 100,000 Monte Carlo tests. 137 | for(size_t i = 0; i < MONTE_CARLO_NUM_OF_TESTS; i++) { 138 | 139 | printf("\rTesting case=%ld", i); 140 | 141 | // Generate a random message and a reference digest. 
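// byte_len is drawn from [0, sizeof(data)), so zero-length messages are
// exercised as well.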
142 | size_t byte_len = rand() % sizeof(data); 143 | rand_data(data, byte_len); 144 | SHA512(data, byte_len, ref_dgst); 145 | 146 | GUARD(test_sha512_impl(GENERIC_IMPL, data, ref_dgst, byte_len)); 147 | 148 | // X86-64 specific options 149 | RUN_X86_64(GUARD(test_sha512_impl(AVX_IMPL, data, ref_dgst, byte_len));); 150 | RUN_AVX2(GUARD(test_sha512_impl(AVX2_IMPL, data, ref_dgst, byte_len));); 151 | RUN_AVX512(GUARD(test_sha512_impl(AVX512_IMPL, data, ref_dgst, byte_len));); 152 | } 153 | 154 | printf("\n"); 155 | return SUCCESS; 156 | } 157 | 158 | int main(void) 159 | { 160 | GUARD(test_sha256()); 161 | GUARD(test_sha512()); 162 | 163 | return 0; 164 | } 165 | -------------------------------------------------------------------------------- /tests/pre-commit-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Avoid removing the "build" directory if the script does not run from the 6 | # package root directory 7 | basedir=`pwd` 8 | if [[ ! -f "$basedir/tests/pre-commit-script.sh" ]]; then 9 | >&2 echo "Script does not run from the root directory" 10 | exit 0 11 | fi 12 | 13 | if [ $# -ne 0 ]; then 14 | # To speed up testing, when any argument is passed we set the number of 15 | # Monte Carlo tests to 10. This must not be used when committing code. 16 | monte="-DMONTE_CARLO_NUM_OF_TESTS=10" 17 | else 18 | # Use the default (100,000) 19 | monte="" 20 | fi 21 | 22 | # Clean previous build content 23 | rm -rf build; 24 | 25 | mkdir build; 26 | cd build; 27 | 28 | # Test clang-format 29 | cmake ..; make format; 30 | rm -rf * 31 | 32 | for method in "" "-DALTERNATIVE_AVX512_IMPL=1"; do 33 | # Test clang-tidy 34 | CC=clang-9 cmake $method -DCMAKE_C_CLANG_TIDY="clang-tidy-9;--fix-errors;--format-style=file" .. 35 | make -j20 36 | rm -rf * 37 | 38 | for flag in "" "-DTEST_SPEED=1" "-DASAN=1" "-DMSAN=1" "-DTSAN=1" "-DUBSAN=1" ; do 39 | CC=clang-9 cmake $method $flag $monte ..; 40 | make -j20 41 | ./sha-with-intrinsic 42 | rm -rf * 43 | done 44 | done 45 | --------------------------------------------------------------------------------
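A usage sketch for the script above (the "quick" argument is arbitrary; the script only checks whether any argument is present):

# From the repository root:
./tests/pre-commit-script.sh          # full run, 100,000 Monte Carlo tests
./tests/pre-commit-script.sh quick    # any argument: 10 Monte Carlo tests per build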
/tests/test.h: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: Apache-2.0 3 | 4 | #pragma once 5 | 6 | #define SUCCESS 0 7 | #define FAILURE (-1) 8 | #define GUARD(x) \ 9 | do { \ 10 | if(SUCCESS != (x)) { \ 11 | return FAILURE; \ 12 | } \ 13 | } while(0) 14 | 15 | ///////////////////////////// 16 | // X86_64 specific options 17 | ///////////////////////////// 18 | 19 | #if defined(X86_64) 20 | # define RUN_X86_64(x) \ 21 | do { \ 22 | x \ 23 | } while(0) 24 | #else 25 | # define RUN_X86_64(x) 26 | #endif 27 | 28 | #if defined(AVX2_SUPPORT) 29 | # define RUN_AVX2(x) \ 30 | do { \ 31 | x \ 32 | } while(0) 33 | #else 34 | # define RUN_AVX2(x) 35 | #endif 36 | 37 | #if defined(AVX512_SUPPORT) 38 | # define RUN_AVX512(x) \ 39 | do { \ 40 | x \ 41 | } while(0) 42 | #else 43 | # define RUN_AVX512(x) 44 | #endif 45 | 46 | #if defined(X86_64_SHA_SUPPORT) 47 | # define RUN_X86_64_SHA_EXT(x) \ 48 | do { \ 49 | x \ 50 | } while(0) 51 | #else 52 | # define RUN_X86_64_SHA_EXT(x) 53 | #endif 54 | 55 | ///////////////////////////// 56 | // AARCH64 specific options 57 | ///////////////////////////// 58 | 59 | #if defined(NEON_SUPPORT) 60 | # define RUN_NEON(x) \ 61 | do { \ 62 | x \ 63 | } while(0) 64 | #else 65 | # define RUN_NEON(x) 66 | #endif 67 | 68 | #if defined(AARCH64_SHA_SUPPORT) 69 | # define RUN_AARCH64_SHA_EXT(x) \ 70 | do { \ 71 | x \ 72 | } while(0) 73 | #else 74 | # define RUN_AARCH64_SHA_EXT(x) 75 | #endif 76 | 77 | ///////////////////////////// 78 | // Inline utilities 79 | ///////////////////////////// 80 | 81 | _INLINE_ void print(const uint8_t *a, const int byte_len) 82 | { 83 | for(int i = byte_len - 1; i >= 0; i--) { 84 | printf("%.2x", a[i]); 85 | } 86 | printf("\n\n"); 87 | } 88 | 89 | _INLINE_ void rand_data(OUT uint8_t *out, IN const size_t byte_len) 90 | { 91 | for(size_t i = 0; i < byte_len; i++) { 92 | out[i] = rand(); 93 | } 94 | } 95 | --------------------------------------------------------------------------------
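A minimal caller sketch for the public one-shot API in include/sha.h (the sha512() signature, SHA512_HASH_BYTE_LEN, and GENERIC_IMPL all appear in the sources above; the snippet itself is illustrative and not part of the repository):

#include <stdio.h>
#include <string.h>

#include "sha.h"

int main(void)
{
  const uint8_t msg[] = "abc";
  uint8_t       dgst[SHA512_HASH_BYTE_LEN] = {0};

  // Hash the 3-byte message "abc" with the portable implementation.
  sha512(dgst, msg, strlen((const char *)msg), GENERIC_IMPL);

  for(size_t i = 0; i < SHA512_HASH_BYTE_LEN; i++) {
    printf("%02x", dgst[i]);
  }
  printf("\n");

  return 0;
}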