├── .gitignore
├── images
    ├── clockmed.jpg
    ├── clocksmall.jpg
    ├── clockxtrasmall.jpg
    ├── clockxtrasmall2.jpg
    ├── clockxtrasmall_border.jpg
    ├── clockxtrasmall_border2.jpg
    └── readme.txt
├── montgomery_arithmetic
    ├── include
    │   └── hurchalla
    │   │   └── montgomery_arithmetic
    │   │       ├── low_level_api
    │   │           ├── detail
    │   │           │   ├── integer_inverse.odt
    │   │           │   ├── integer_inverse.pdf
    │   │           │   ├── impl_inverse_mod_R.h
    │   │           │   └── platform_specific
    │   │           │   │   ├── impl_get_Rsquared_mod_n.h
    │   │           │   │   └── README_REDC_supplement.md
    │   │           ├── inverse_mod_R.h
    │   │           ├── get_R_mod_n.h
    │   │           └── get_Rsquared_mod_n.h
    │   │       └── detail
    │   │           ├── MontyTags.h
    │   │           ├── experimental
    │   │               ├── montgomery_two_pow
    │   │               │   ├── bench_reference.sh
    │   │               │   └── testbench.sh
    │   │               ├── montgomery_pow_2kary
    │   │               │   ├── filter_lines.py
    │   │               │   ├── remove_lines.py
    │   │               │   ├── firstline.py
    │   │               │   ├── timings_x64_Zen4
    │   │               │   │   └── partial_array_size_2
    │   │               │   │   │   ├── out.txt
    │   │               │   │   │   └── out2.txt
    │   │               │   ├── testbench_2kary.sh
    │   │               │   └── timings_ARM64_M2
    │   │               │   │   ├── 64_half_gcc_noasm_array.txt
    │   │               │   │   ├── 64_half_clang_noasm_array.txt
    │   │               │   │   └── 64_quarter_gcc_noasm_array.txt
    │   │               ├── README.md
    │   │               ├── montgomery_two_pow_API.h
    │   │               └── unit_testing_helpers
    │   │               │   └── AbstractMontgomeryWrapper.h
    │   │           ├── BaseMontgomeryValue.h
    │   │           ├── MontgomeryDefault.h
    │   │           ├── ImplMontgomeryForm.h
    │   │           └── MontgomeryFormExtensions.h
    └── CMakeLists.txt
├── examples
    ├── example_with_cmake
    │   ├── example.sh
    │   ├── CMakeLists.txt
    │   └── example.cpp
    └── example_without_cmake
    │   ├── example.sh
    │   └── example.cpp
├── .github
    └── workflows
    │   ├── devskim.yml
    │   ├── flawfinder.yml
    │   ├── cmake.yml
    │   └── codeql.yml
├── modular_arithmetic
    ├── include
    │   └── hurchalla
    │   │   └── modular_arithmetic
    │   │       ├── detail
    │   │           ├── optimization_tag_structs.h
    │   │           ├── impl_modular_pow.h
    │   │           └── impl_modular_multiplicative_inverse.h
    │   │       ├── modular_pow.h
    │   │       ├── absolute_value_difference.h
    │   │       ├── modular_multiplicative_inverse.h
    │   │       ├── modular_multiplication.h
    │   │       ├── modular_addition.h
    │   │       └── modular_subtraction.h
    ├── src
    │   └── platform_specific_MSVC_x86_64
    │   │   └── modular_multiply_uint64--x64_microsoft.asm
    └── CMakeLists.txt
├── msvc_build_tests.bat
├── test
    ├── modular_arithmetic
    │   ├── test_modular_addition_with_subtraction.cpp
    │   ├── test_absolute_value_difference.cpp
    │   ├── test_modular_addition.cpp
    │   ├── test_modular_multiplication.cpp
    │   └── test_modular_pow.cpp
    ├── CMakeLists.txt
    ├── montgomery_arithmetic
    │   ├── low_level_api
    │   │   ├── test_REDC.cpp
    │   │   ├── test_REDC_inline_asm.cpp
    │   │   ├── test_inverse_mod_R.cpp
    │   │   └── test_get_Rsquared_mod_n.cpp
    │   └── test_MontgomeryForm_extra.cpp
    └── FetchGoogleTest.cmake
├── CMakeLists.txt
├── macros_for_performance.md
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | [Bb]uild/*
2 | 


--------------------------------------------------------------------------------
/images/clockmed.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/images/clockmed.jpg


--------------------------------------------------------------------------------
/images/clocksmall.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/images/clocksmall.jpg


--------------------------------------------------------------------------------
/images/clockxtrasmall.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/images/clockxtrasmall.jpg


--------------------------------------------------------------------------------
/images/clockxtrasmall2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/images/clockxtrasmall2.jpg


--------------------------------------------------------------------------------
/images/clockxtrasmall_border.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/images/clockxtrasmall_border.jpg


--------------------------------------------------------------------------------
/images/clockxtrasmall_border2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/images/clockxtrasmall_border2.jpg


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/integer_inverse.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/integer_inverse.odt


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/integer_inverse.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hurchalla/modular_arithmetic/HEAD/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/integer_inverse.pdf


--------------------------------------------------------------------------------
/images/readme.txt:
--------------------------------------------------------------------------------
1 | The files clockmed.jpg, clocksmall.jpg, clockxtrasmall.jpg, clockxtrasmall2.jpg, clockxtrasmall_border.jpg, and clockxtrasmall_border2.jpg are cropped versions of the photograph
2 | https://commons.wikimedia.org/wiki/File:Clock_gears_in_the_St_Maximus_church_in_Magnac-Laval_03.jpg
3 | 
4 | The photo is by Krzysztof Golik, and licensed CC BY-SA 4.0 https://creativecommons.org/licenses/by-sa/4.0/deed.en
5 | 


--------------------------------------------------------------------------------
/examples/example_with_cmake/example.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright (c) 2020-2025 Jeffrey Hurchalla.
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 7 | 
 8 | 
 9 | # This example is meant to show how to use the modular arithmetic library within
10 | # a CMake project.  If you haven't already done so, you should follow the steps
11 | # in the README.md for "How to use the library" | "With CMake"
12 | 
13 | mkdir -p tmp
14 | cmake -S. -B./tmp -DCMAKE_BUILD_TYPE=Release || exit 1   # stop if configuring fails
15 | cmake --build ./tmp --config Release || exit 1           # stop if the build fails
16 | 
17 | echo
18 | echo Running example...
19 | echo
20 | ./tmp/modular_arithmetic_example
21 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyTags.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_MONTY_TAGS_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_MONTY_TAGS_H_INCLUDED
10 | 
11 | 
12 | 
13 | namespace hurchalla { namespace detail {
14 | 
15 | 
16 | struct TagMontyQuarterrange final {};  // IDs MontyQuarterRange independent of T
17 | struct TagMontyHalfrange final {};
18 | struct TagMontyFullrange final {};
19 | struct TagMontyWrappedmath final {};
20 | struct TagMontyFullrangeMasked final {};
21 | 
22 | 
23 | }} // end namespace
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/.github/workflows/devskim.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub.
 2 | # They are provided by a third-party and are governed by
 3 | # separate terms of service, privacy policy, and support
 4 | # documentation.
 5 | 
 6 | name: DevSkim
 7 | 
 8 | on:
 9 |   push:
10 |     branches: [ master ]
11 |   pull_request:
12 |     branches: [ master ]
13 |   schedule:
14 |     - cron: '27 3 * * 4'
15 | 
16 | jobs:
17 |   lint:
18 |     name: DevSkim
19 |     runs-on: ubuntu-latest  # the ubuntu-20.04 hosted image has been retired
20 |     permissions:
21 |       actions: read
22 |       contents: read
23 |       security-events: write  # required for the SARIF upload step below
24 |     steps:
25 |       - name: Checkout code
26 |         uses: actions/checkout@v4
27 | 
28 |       - name: Run DevSkim scanner
29 |         uses: microsoft/DevSkim-Action@v1
30 |         
31 |       - name: Upload DevSkim scan results to GitHub Security tab
32 |         uses: github/codeql-action/upload-sarif@v3  # v2 of the CodeQL action is deprecated
33 |         with:
34 |           sarif_file: devskim-results.sarif
35 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/bench_reference.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | ./testbench.sh clang++ O3 MontgomeryQuarter uint64_t 191 8 22 -DTEST_SCALAR -DHURCHALLA_MONTGOMERY_TWO_POW_USE_CSELECT_ON_BIT -DHURCHALLA_ALLOW_INLINE_ASM_ALL
 3 | 
 4 | ./testbench.sh clang++ O3 MontgomeryHalf uint64_t 191 8 22 -DTEST_SCALAR -DHURCHALLA_MONTGOMERY_TWO_POW_USE_CSELECT_ON_BIT -DHURCHALLA_ALLOW_INLINE_ASM_ALL
 5 | 
 6 | ./testbench.sh clang++ O3 MontgomeryFull uint64_t 191 8 22 -DTEST_SCALAR -DHURCHALLA_MONTGOMERY_TWO_POW_USE_CSELECT_ON_BIT -DHURCHALLA_ALLOW_INLINE_ASM_ALL
 7 | 
 8 | 
 9 | ./testbench.sh g++ O3 MontgomeryQuarter uint64_t 191 8 22 -DTEST_SCALAR -DHURCHALLA_MONTGOMERY_TWO_POW_USE_CSELECT_ON_BIT -DHURCHALLA_ALLOW_INLINE_ASM_ALL
10 | 
11 | ./testbench.sh g++ O3 MontgomeryHalf uint64_t 191 8 22 -DTEST_SCALAR -DHURCHALLA_MONTGOMERY_TWO_POW_USE_CSELECT_ON_BIT -DHURCHALLA_ALLOW_INLINE_ASM_ALL
12 | 
13 | ./testbench.sh g++ O3 MontgomeryFull uint64_t 191 8 22 -DTEST_SCALAR -DHURCHALLA_MONTGOMERY_TWO_POW_USE_CSELECT_ON_BIT -DHURCHALLA_ALLOW_INLINE_ASM_ALL
14 | 
15 | 


--------------------------------------------------------------------------------
/examples/example_with_cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | # This Source Code Form is subject to the terms of the Mozilla Public
 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 5 | 
 6 | 
 7 | cmake_minimum_required(VERSION 3.14)
 8 | 
 9 | project(modular_arithmetic_example VERSION 1.0.0 LANGUAGES CXX)
10 | 
11 | add_executable(modular_arithmetic_example)
12 | 
13 | target_sources(modular_arithmetic_example PRIVATE
14 |     example.cpp
15 |     )
16 | 
17 | # in add_subdirectory below, the first argument must be the path on your system
18 | # to the root of the cloned modular arithmetic repository.  In this example,
19 | # "../.." is used because (unless you move it) this CMakeLists.txt file has a
20 | # path that is within the repository, and that path is two directory levels
21 | # below the repo's root.
22 | add_subdirectory(../..  ${CMAKE_CURRENT_BINARY_DIR}/modular_arithmetic)
23 | 
24 | target_link_libraries(modular_arithmetic_example
25 |                       hurchalla_modular_arithmetic)
26 | 
27 | 
28 | 
29 | # To build this example, see the file example.sh.
30 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/detail/optimization_tag_structs.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_OPTIMIZATION_TAG_STRUCTS_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_OPTIMIZATION_TAG_STRUCTS_H_INCLUDED
10 | 
11 | 
12 | namespace hurchalla {
13 | 
14 | 
15 | // private optimization tag intended only for use by the implementation
16 | struct PrivateAnyTag {};
17 | 
18 | 
19 | // Public optimization tags:
20 | // ------------------------
21 | // LowlatencyTag potentially offers optimizations targeted toward lowering the
22 | // latency of functions.
23 | // LowuopsTag potentially offers optimizations targeted toward reducing the
24 | // number of instructions generated/executed by functions.
25 | 
26 | struct LowlatencyTag final : public PrivateAnyTag {};
27 | struct LowuopsTag final : public PrivateAnyTag {};
28 | 
29 | 
30 | } // end namespace
31 | 
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/filter_lines.py:
--------------------------------------------------------------------------------
 1 | # filter_lines.py
 2 | 
 3 | import sys
 4 | 
 5 | def filter_lines(input_filename, search_string, output_filename):
 6 |     """
 7 |     Reads lines from input_filename and writes to output_filename
 8 |     all lines that contain search_string.
 9 |     """
10 |     with open(input_filename, 'r', encoding='utf-8') as infile, \
11 |          open(output_filename, 'w', encoding='utf-8') as outfile:
12 |         
13 |         for line in infile:
14 |             if search_string in line:
15 |                 outfile.write(line)
16 | 
17 | def main():
18 |     # Expect exactly three command-line arguments
19 |     if len(sys.argv) != 4:
20 |         print("Usage: python filter_lines.py <input_file> <search_string> <output_file>")
21 |         sys.exit(1)
22 | 
23 |     input_filename = sys.argv[1]
24 |     search_string = sys.argv[2]
25 |     output_filename = sys.argv[3]
26 | 
27 |     filter_lines(input_filename, search_string, output_filename)
28 |     print(f"Lines containing '{search_string}' have been written to '{output_filename}'.")
29 | 
30 | if __name__ == "__main__":
31 |     main()


--------------------------------------------------------------------------------
/modular_arithmetic/src/platform_specific_MSVC_x86_64/modular_multiply_uint64--x64_microsoft.asm:
--------------------------------------------------------------------------------
 1 | ; Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | ; This Source Code Form is subject to the terms of the Mozilla Public
 3 | ; License, v. 2.0. If a copy of the MPL was not distributed with this
 4 | ; file, You can obtain one at https://mozilla.org/MPL/2.0/.
 5 | 
 6 | .code
 7 | 
 8 | ; This uses Microsoft x64 calling convention
 9 | 
10 | 
11 | ; extern "C" uint64_t modular_multiply_uint64_asm_UID7b5f83fc983(uint64_t a,
12 | ;                                              uint64_t b, uint64_t modulus);
13 | ; Preconditions: 0 <= a < modulus,  0 <= b < modulus,  modulus > 0
14 | ; Postconditions: returns (a*b)%modulus
15 | ;
16 | ; rcx == a, rdx == b, r8 == modulus
17 | ; return register is rax
18 | PUBLIC  modular_multiply_uint64_asm_UID7b5f83fc983
19 | modular_multiply_uint64_asm_UID7b5f83fc983  PROC
20 |     mov rax, rcx    ; RAX = a  (mul takes one factor implicitly from RAX)
21 |     mul rdx         ; RDX:RAX = RAX*RDX; high-order bits of the product in RDX
22 |     div r8          ; (quotient RAX, remainder RDX) = RDX:RAX/R8
23 |     mov rax, rdx    ; return the remainder
24 |     ret 0           ; no stack bytes to release on return
25 | modular_multiply_uint64_asm_UID7b5f83fc983  ENDP
26 | 
27 | 
28 | End
29 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/remove_lines.py:
--------------------------------------------------------------------------------
 1 | # remove_lines.py
 2 | 
 3 | import sys
 4 | 
 5 | def remove_lines(input_filename, search_string, output_filename):
 6 |     """
 7 |     Reads lines from input_filename and writes to output_filename
 8 |     all lines that do not contain search_string.
 9 |     """
10 |     with open(input_filename, 'r', encoding='utf-8') as infile, \
11 |          open(output_filename, 'w', encoding='utf-8') as outfile:
12 |         
13 |         for line in infile:
14 |             if search_string not in line:
15 |                 outfile.write(line)
16 | 
17 | def main():
18 |     # Expect exactly three command-line arguments
19 |     if len(sys.argv) != 4:
20 |         print("Usage: python remove_lines.py <input_file> <search_string> <output_file>")
21 |         sys.exit(1)
22 | 
23 |     input_filename = sys.argv[1]
24 |     search_string = sys.argv[2]
25 |     output_filename = sys.argv[3]
26 | 
27 |     remove_lines(input_filename, search_string, output_filename)
28 |     print(f"Lines containing '{search_string}' have been written to '{output_filename}'.")
29 | 
30 | if __name__ == "__main__":
31 |     main()


--------------------------------------------------------------------------------
/.github/workflows/flawfinder.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub.
 2 | # They are provided by a third-party and are governed by
 3 | # separate terms of service, privacy policy, and support
 4 | # documentation.
 5 | 
 6 | name: flawfinder
 7 | 
 8 | on:
 9 |   push:
10 |     branches: [ master ]
11 |   pull_request:
12 |     # The branches below must be a subset of the branches above
13 |     branches: [ master ]
14 |   schedule:
15 |     - cron: '39 7 * * 0'
16 | 
17 | jobs:
18 |   flawfinder:
19 |     name: Flawfinder
20 |     runs-on: ubuntu-latest
21 |     permissions:
22 |       actions: read
23 |       contents: read
24 |       security-events: write  # required for the SARIF upload step below
25 |     steps:
26 |       - name: Checkout code
27 |         uses: actions/checkout@v4
28 | 
29 |       - name: flawfinder_scan
30 |         uses: david-a-wheeler/flawfinder@8e4a779ad59dbfaee5da586aa9210853b701959c
31 |         with:
32 |           arguments: '--sarif ./'
33 |           output: 'flawfinder_results.sarif'
34 | 
35 |       - name: Upload analysis results to GitHub Security tab
36 |         uses: github/codeql-action/upload-sarif@v3  # v2 of the CodeQL action is deprecated
37 |         with:
38 |           sarif_file: ${{github.workspace}}/flawfinder_results.sarif
39 | 


--------------------------------------------------------------------------------
/examples/example_without_cmake/example.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright (c) 2020-2025 Jeffrey Hurchalla.
 4 | # This Source Code Form is subject to the terms of the Mozilla Public
 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 6 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 7 | 
 8 | 
 9 | # This example is intended for the case that you are not using CMake.
10 | # If you haven't already done so, you should follow the steps in the README.md
11 | # for "How to use the library" | "Without CMake"
12 | 
13 | 
14 | # --------------------------------------------------------------------------
15 | # You'll need to change the installed_path below, and you may need to change
16 | # the cpp_compiler.
17 | # --------------------------------------------------------------------------
18 | 
19 | # set installed_path to the directory where you installed the modular arithmetic
20 | # library
21 | installed_path=/home/jeff/Desktop
22 | include_path="${installed_path}/include"
23 | 
24 | # set the compiler to whatever you wish.  Below is gcc or clang.
25 | cpp_compiler=g++
26 | #cpp_compiler=clang++
27 | 
28 | 
29 | $cpp_compiler -std="c++17" \
30 |         -Wall -Wextra  -O2  \
31 |         -I"$include_path" \
32 |         -o example  example.cpp  || exit 1   # don't run a stale or missing binary
33 | 
34 | ./example  
35 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/firstline.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | def main():
 4 |     if len(sys.argv) != 2:
 5 |         print("Usage: python3 script.py <filename>")
 6 |         sys.exit(1)
 7 | 
 8 |     filename = sys.argv[1]
 9 | 
10 |     try:
11 |         with open(filename, 'r') as file:
12 |             lines = file.readlines()
13 |     except FileNotFoundError:
14 |         print(f"Error: File '{filename}' not found.")
15 |         sys.exit(1)
16 | 
17 |     # Find start and end markers
18 |     try:
19 |         start_index = next(i for i, line in enumerate(lines) if "OVERALL BEST:" in line)
20 |         end_index = next(i for i, line in enumerate(lines) if "Timings By Test Type:" in line)
21 |     except StopIteration:
22 |         print("Error: Could not find required markers in the file.")
23 |         sys.exit(1)
24 | 
25 |     # Process lines between the markers
26 |     for line in lines[start_index + 1:end_index]:
27 |         parts = line.strip().split()
28 |         if len(parts) != 7:
29 |             continue  # skip malformed lines
30 |         try:
31 |             third_field = int(parts[2])
32 |         except ValueError:
33 |             continue  # skip lines where the third field isn’t an integer
34 |         if third_field < 6:
35 |             print(line.strip())
36 |             return
37 | 
38 |     print("No line found where the third field is less than 6.")
39 | 
40 | if __name__ == "__main__":
41 |     main()
42 | 


--------------------------------------------------------------------------------
/msvc_build_tests.bat:
--------------------------------------------------------------------------------
 1 | 
 2 | @echo off
 3 | REM This Source Code Form is subject to the terms of the Mozilla Public
 4 | REM License, v. 2.0. If a copy of the MPL was not distributed with this
 5 | REM file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 | 
 7 | set build_dir=build\msvc
 8 | 
 9 | REM Example of how to use an earlier version of MSVC than the default:
10 | REM cmake --help   (will show the available Generators you can use)
11 | REM cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_MODULAR_ARITHMETIC=ON -G "Visual Studio 15"
12 | REM the above line appears to build x86-32.  To get x64:
13 | REM cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_MODULAR_ARITHMETIC=ON -G "Visual Studio 15 2017 Win64"
14 | 
15 | REM for Visual Studio 2019 and above, set the architecture with -A, for example:
16 | REM -G "Visual Studio 16 2019" -A Win32
17 | REM -G "Visual Studio 16 2019" -A x64
18 | REM -G "Visual Studio 16 2019" -A ARM
19 | REM -G "Visual Studio 16 2019" -A ARM64
20 | 
21 | cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_MODULAR_ARITHMETIC=ON -DHURCHALLA_TEST_MODULAR_ARITHMETIC_HEAVYWEIGHT=ON -G "Visual Studio 17 2022" -A x64
22 | if %errorlevel% neq 0 exit /b %errorlevel%
23 | cmake --build .\%build_dir% --config Release
24 | if %errorlevel% neq 0 exit /b %errorlevel%
25 | cmake --build .\%build_dir% --config Debug
26 | if %errorlevel% neq 0 exit /b %errorlevel%
27 | 
28 | 
29 | %build_dir%\Release\test_hurchalla_modular_arithmetic.exe
30 | if %errorlevel% neq 0 exit /b %errorlevel%
31 | 
32 | %build_dir%\Debug\test_hurchalla_modular_arithmetic.exe
33 | if %errorlevel% neq 0 exit /b %errorlevel%
34 | 


--------------------------------------------------------------------------------
/.github/workflows/cmake.yml:
--------------------------------------------------------------------------------
 1 | name: CMake
 2 | 
 3 | on:
 4 |  push:
 5 |   branches: [ master ]
 6 | 
 7 | env:
 8 |   # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
 9 |   BUILD_TYPE: Release
10 |   #other potential flags for CXX_FLAGS are  -DHURCHALLA_AVOID_CSELECT=1 -DHURCHALLA_ALLOW_INLINE_ASM_ALL=1
11 |   CXX_FLAGS: '-std=c++11'
12 | 
13 | jobs:
14 |   build:
15 |     # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
16 |     # You can convert this to a matrix build if you need cross-platform coverage.
17 |     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v4
22 | 
23 |     - name: Configure CMake
24 |       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
25 |       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
26 |       # CMAKE_CXX_FLAGS is quoted so that CXX_FLAGS may list several space-separated flags.
27 |       #-DCMAKE_CXX_COMPILER=clang++-10
28 |       run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DTEST_HURCHALLA_MODULAR_ARITHMETIC=ON -DCMAKE_CXX_FLAGS="${{env.CXX_FLAGS}}"
29 | 
30 |     - name: Build
31 |       # Build your program with the given configuration
32 |       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
33 | 
34 |     - name: Test Modular Arithmetic
35 |       working-directory: ${{github.workspace}}/build
36 |       run: ./test_hurchalla_modular_arithmetic
37 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/BaseMontgomeryValue.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_BASE_MONTGOMERY_VALUE_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_BASE_MONTGOMERY_VALUE_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/util/traits/ut_numeric_limits.h"
13 | #include "hurchalla/util/traits/extensible_make_unsigned.h"
14 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
15 | #include "hurchalla/util/conditional_select.h"
16 | #include "hurchalla/util/compiler_macros.h"
17 | #include <type_traits>
18 | 
19 | namespace hurchalla { namespace detail {
20 | 
21 | 
22 | template <typename T>
23 | class BaseMontgomeryValue {
24 |     static_assert(ut_numeric_limits<T>::is_integer, "");
25 |     T value;    // the wrapped integer; private, so only reachable via get()/cmov()
26 | protected:      // construction from / extraction to a raw T is for derived classes
27 |     HURCHALLA_FORCE_INLINE explicit BaseMontgomeryValue(T a) : value(a) {}
28 |     HURCHALLA_FORCE_INLINE T get() const { return value; }
29 | public:
30 |     // This next constructor purposely does not initialize 'value' - the
31 |     // contents are undefined until the object is assigned to.
32 |     HURCHALLA_FORCE_INLINE BaseMontgomeryValue() = default;
33 |     // cmov(): overwrites this object's value with v's value when cond is true.
34 |     template <class PerfTag = CSelectDefaultTag>
35 |     HURCHALLA_FORCE_INLINE void cmov(bool cond, BaseMontgomeryValue v)
36 |     {
37 |           // value = cond ? v.value : value
38 |         value = ::hurchalla::conditional_select<T, PerfTag>(cond,v.value,value);
39 |     }
40 | };
41 | 
42 | 
43 | }} // end namespace
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/modular_pow.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_MODULAR_POW_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_MODULAR_POW_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/impl_modular_pow.h"
13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
14 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
15 | 
16 | namespace hurchalla {
17 | 
18 | 
19 | // Alternatively, please consider using the MontgomeryForm class member function
20 | // pow() instead of this function modular_pow().  There's an excellent chance
21 | // that you will achieve much better performance using MontgomeryForm's pow -
22 | // though note that MontgomeryForm can only be used if your modulus is odd.
23 | 
24 | template <typename T, typename U>
25 | T modular_pow(T base, U exponent, T modulus)
26 | {
27 |     static_assert(ut_numeric_limits<T>::is_integer, "");     // T: an integer type...
28 |     static_assert(!(ut_numeric_limits<T>::is_signed), "");   // ...that is unsigned
29 |     static_assert(ut_numeric_limits<U>::is_integer, "");     // U: an integer type...
30 |     static_assert(!(ut_numeric_limits<U>::is_signed), "");   // ...that is unsigned
31 |     HPBC_CLOCKWORK_API_PRECONDITION(modulus > 1);
32 |     // The computation itself is delegated to detail::impl_modular_pow.
33 |     T result = detail::impl_modular_pow::call(base, exponent, modulus);
34 | 
35 |     // POSTCONDITION:
36 |     //  Returns the modular exponentiation of base to the exponent (mod modulus)
37 |     HPBC_CLOCKWORK_POSTCONDITION(result<modulus);
38 |     return result;
39 | }
40 | 
41 | 
42 | }  // end namespace
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/absolute_value_difference.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_ABSOLUTE_VALUE_DIFFERENCE_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_ABSOLUTE_VALUE_DIFFERENCE_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/platform_specific/impl_absolute_value_difference.h"
13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
14 | #include "hurchalla/util/compiler_macros.h"
15 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
16 | 
17 | namespace hurchalla {
18 | 
19 | 
20 | // This function returns absolute_value(a-b), calculated as if 'a' and 'b' are
21 | // infinite precision signed integers.  It requires a>=0 and b>=0.
22 | 
23 | template <typename T>  HURCHALLA_FORCE_INLINE
24 | T absolute_value_difference(T a, T b)
25 | {
26 |     static_assert(ut_numeric_limits<T>::is_integer, "");
27 |     HPBC_CLOCKWORK_API_PRECONDITION(a >= 0);
28 |     HPBC_CLOCKWORK_API_PRECONDITION(b >= 0);
29 |     // Delegate to the implementation struct for T to do the calculation.
30 |     using impl = detail::impl_absolute_value_difference<T>;
31 |     T result = impl::call(a, b);
32 |     HPBC_CLOCKWORK_POSTCONDITION(result >= 0);
33 |     HPBC_CLOCKWORK_POSTCONDITION(result == ((a>b) ? a-b : b-a));
34 |     return result;
35 | }
36 | 
37 | 
38 | // Performance note for RISC-V (and other uncommon CPU architectures that do not
39 | // have an instruction for conditional move or conditional select):
40 | //   On this architecture, this function may perform better when T is signed
41 | // than when it is unsigned.  Specifically, when HURCHALLA_AVOID_CSELECT is
42 | // defined (see hurchalla/util/compiler_macros.h), a signed type may perform
43 | // better; if it is not defined, you should expect no performance difference
44 | // between signed and unsigned.
45 | 
46 | 
47 | }  // end namespace
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/inverse_mod_R.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_INVERSE_MOD_R_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_INVERSE_MOD_R_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/montgomery_arithmetic/low_level_api/detail/impl_inverse_mod_R.h"
13 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
14 | #include "hurchalla/util/traits/ut_numeric_limits.h"
15 | #include "hurchalla/util/compiler_macros.h"
16 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
17 | 
18 | namespace hurchalla {
19 | 
20 | 
21 | // For discussion purposes, let type UP be a conceptually unlimited precision
22 | // unsigned integer type, and let the unlimited precision constant R represent
23 | // R = (UP)1 << ut_numeric_limits<T>::digits.  Equivalently,
24 | // R = (UP)ut_numeric_limits<T>::max + 1.  For example, if T is uint64_t, we
25 | // would have R = (UP)1 << 64.
26 | 
27 | // Returns the integer x satisfying  x*a ≡ 1 (mod R)
28 | // This function is constexpr when compiling for std=c++14 or higher
29 | template <typename T>
30 | HURCHALLA_CPP14_CONSTEXPR
31 | T inverse_mod_R(T a)
32 | {
33 |     // T must be an unsigned integer type with modulo (wraparound) arithmetic.
34 |     static_assert(ut_numeric_limits<T>::is_integer &&
35 |                   !(ut_numeric_limits<T>::is_signed) &&
36 |                   ut_numeric_limits<T>::is_modulo, "");
37 |     HPBC_CLOCKWORK_CONSTEXPR_PRECONDITION(a % 2 == 1);
38 |     T inv = detail::impl_inverse_mod_R::call<T,ut_numeric_limits<T>::digits>(a);
39 | 
40 |     // Verify inv*a ≡ 1 (mod R): multiply in type P, then truncate to T.
41 |     using P = typename safely_promote_unsigned<T>::type;
42 |     HPBC_CLOCKWORK_CONSTEXPR_POSTCONDITION(static_cast<T>(1) ==
43 |                        static_cast<T>(static_cast<P>(inv) * static_cast<P>(a)));
44 |     return inv;
45 | }
46 | 
47 | 
48 | } // end namespace
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/get_R_mod_n.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_GET_R_MOD_N_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_GET_R_MOD_N_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/util/traits/ut_numeric_limits.h"
13 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
14 | 
15 | namespace hurchalla {
16 | 
17 | 
18 | // For discussion purposes, let type UP be a conceptually unlimited precision
19 | // unsigned integer type, and let the unlimited precision constant R represent
20 | // R = (UP)1 << ut_numeric_limits<T>::digits.  Equivalently,
21 | // R = (UP)ut_numeric_limits<T>::max + 1.  For example, if T is uint64_t, we
22 | // would have R = (UP)1 << 64.
23 | 
24 | // Compute R % n
25 | template <typename T>
26 | T get_R_mod_n(T n)
27 | {
28 |     // T must be an unsigned integer type with modulo (wraparound) arithmetic.
29 |     static_assert(ut_numeric_limits<T>::is_integer &&
30 |                   !(ut_numeric_limits<T>::is_signed) &&
31 |                   ut_numeric_limits<T>::is_modulo, "");
32 |     HPBC_CLOCKWORK_PRECONDITION2(n % 2 == 1);
33 |     HPBC_CLOCKWORK_PRECONDITION2(n > 1);
34 | 
35 |     // Negate n within type T.  The cast back to T matters: in cases where 'n'
36 |     // would be promoted to type 'int', the raw expression (0 - n) would be
37 |     // negative and would produce a wrong answer.  With unsigned wraparound in
38 |     // T, the stored value equals R - n.
39 |     const T negated_n = static_cast<T>(static_cast<T>(0) - n);
40 |     // R and (R - n) differ by exactly n, so they have the same remainder:
41 |     //     R % n == (R - n) % n == negated_n % n
42 |     T rModN = static_cast<T>(negated_n % n);
43 |     // n is odd and > 1 and R is a power of 2, so n can not divide R.
44 |     // Thus, rModN != 0.
45 |     HPBC_CLOCKWORK_POSTCONDITION2(0 < rModN && rModN < n);
46 |     return rModN;
47 | }
48 | 
49 | 
50 | }
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------
/examples/example_with_cmake/example.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | 
 9 | // This example is intended for the case that you are using CMake.
10 | // If you haven't already, you need to follow the steps in the README.md
11 | // for "How to use the library" | "With CMake"
12 | #include "hurchalla/modular_arithmetic/modular_pow.h"
13 | #include "hurchalla/montgomery_arithmetic/MontgomeryForm.h"
14 | #include <iostream>
15 | #include <cassert>
16 | #include <cstdint>
17 | 
18 | 
19 | int main()
20 | {
21 |    namespace hc = ::hurchalla;
22 | 
23 |    // T may be any integer type that the compiler supports
24 |    // (including __uint128_t).
25 |    using T = uint64_t;
26 | 
27 |    const T modulus = 333333333;
28 |    const T base = 42;
29 |    const T exponent = 123456789;
30 | 
31 |    // ---- Demonstration of modular exponentiation ----
32 | 
33 |    // 1) Using Montgomery arithmetic, which always needs an odd modulus:
34 |    assert(modulus % 2 == 1);
35 |       // Build a MontgomeryForm object that does Montgomery arithmetic
36 |       // with the modulus we chose.
37 |    hc::MontgomeryForm<T> mf(modulus);
38 |       // Move base into Montgomery representation, raise it to the exponent
39 |       // there, and bring the result back to the normal integer domain.
40 |    auto mont_base = mf.convertIn(base);
41 |    auto mont_power = mf.pow(mont_base, exponent);
42 |    const T via_montgomery = mf.convertOut(mont_power);
43 | 
44 | 
45 |    // 2) Using standard arithmetic (note that Montgomery arithmetic is
46 |    // usually much faster):
47 |    const T via_standard = hc::modular_pow(base, exponent, modulus);
48 | 
49 | 
50 |    // Print both answers.
51 |    std::cout << "Example results for " << base << "^" << exponent
52 |              << " (mod " << modulus << ")\n"
53 |              << "---------\n"
54 |              << "using Montgomery arithmetic: " << via_montgomery << "\n"
55 |              << "using standard arithmetic: " << via_standard << "\n";
56 | 
57 |    return 0;
58 | }
59 | 


--------------------------------------------------------------------------------
/examples/example_without_cmake/example.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | 
 9 | // This example is intended for the case that you are not using CMake.
10 | // If you haven't already, you need to follow the steps in the README.md
11 | // for "How to use the library" | "Without CMake"
12 | #include "hurchalla/modular_arithmetic/modular_pow.h"
13 | #include "hurchalla/montgomery_arithmetic/MontgomeryForm.h"
14 | #include <iostream>
15 | #include <cassert>
16 | #include <cstdint>
17 | 
18 | 
19 | int main()
20 | {
21 |    namespace hc = ::hurchalla;
22 | 
23 |    // T may be any integer type that the compiler supports
24 |    // (including __uint128_t).
25 |    using T = uint64_t;
26 | 
27 |    const T modulus = 333333333;
28 |    const T base = 42;
29 |    const T exponent = 123456789;
30 | 
31 |    // ---- Demonstration of modular exponentiation ----
32 | 
33 |    // 1) Using Montgomery arithmetic, which always needs an odd modulus:
34 |    assert(modulus % 2 == 1);
35 |       // Build a MontgomeryForm object that does Montgomery arithmetic
36 |       // with the modulus we chose.
37 |    hc::MontgomeryForm<T> mf(modulus);
38 |       // Move base into Montgomery representation, raise it to the exponent
39 |       // there, and bring the result back to the normal integer domain.
40 |    auto mont_base = mf.convertIn(base);
41 |    auto mont_power = mf.pow(mont_base, exponent);
42 |    const T via_montgomery = mf.convertOut(mont_power);
43 | 
44 | 
45 |    // 2) Using standard arithmetic (note that Montgomery arithmetic is
46 |    // usually much faster):
47 |    const T via_standard = hc::modular_pow(base, exponent, modulus);
48 | 
49 | 
50 |    // Print both answers.
51 |    std::cout << "Example results for " << base << "^" << exponent
52 |              << " (mod " << modulus << ")\n"
53 |              << "---------\n"
54 |              << "using Montgomery arithmetic: " << via_montgomery << "\n"
55 |              << "using standard arithmetic: " << via_standard << "\n";
56 | 
57 |    return 0;
58 | }
59 | 


--------------------------------------------------------------------------------
/test/modular_arithmetic/test_modular_addition_with_subtraction.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | 
 9 | // This is an exhaustive test of modular addition using modular subtraction to
10 | // verify the addition results.  The test uses only type uint8_t, in order to
11 | // make it computationally feasible.
12 | 
13 | 
14 | // Strictly for testing purposes, we'll define HURCHALLA_ALLOW_INLINE_ASM_ALL
15 | // here in order to make modular addition use an inline asm function version if
16 | // it is available.  Internally, this inline asm function will also call the
17 | // generic template function version of modular addition inside a postcondition,
18 | // in order to make sure that the asm result is correct. Of course postcondition
19 | // checks must be enabled for this check to occur - the easiest way to ensure
20 | // postconditions are enabled is to define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS,
21 | // which is why we do so here.  The same applies to modular subtraction as well.
22 | #undef HURCHALLA_ALLOW_INLINE_ASM_ALL
23 | #define HURCHALLA_ALLOW_INLINE_ASM_ALL 1
24 | 
25 | #ifndef HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
26 | #  define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
27 | #endif
28 | 
29 | #include "hurchalla/modular_arithmetic/modular_addition.h"
30 | #include "hurchalla/modular_arithmetic/modular_subtraction.h"
31 | #include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
32 | #include "gtest/gtest.h"
33 | #include <cstdint>
34 | 
35 | namespace {
36 | 
37 | 
38 | TEST(ModularArithmetic, modular_addition_with_subtraction) {
39 |     namespace hc = ::hurchalla;
40 |     using T = std::uint8_t;
41 |     // Round trip: (a + b) - b must give back a, for both subtraction tags.
42 |     for (T modulus=255; modulus>0; --modulus) {   // every modulus in [1,255]
43 |         for (T a=0; a<modulus; ++a) {             // every prereduced a
44 |             for (T b=0; b<modulus; ++b) {         // every prereduced b
45 |                 T sum = hc::modular_addition_prereduced_inputs(a, b, modulus);
46 |                 EXPECT_TRUE(a == (hc::modular_subtraction_prereduced_inputs<T,hc::LowlatencyTag>(sum, b, modulus)));
47 |                 EXPECT_TRUE(a == (hc::modular_subtraction_prereduced_inputs<T,hc::LowuopsTag>(sum, b, modulus)));
48 |             }
49 |         }
50 |     }
51 | }
52 | 
53 | 
54 | } // end unnamed namespace
55 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/modular_multiplicative_inverse.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_MODULAR_MULTIPLICATIVE_INVERSE_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_MODULAR_MULTIPLICATIVE_INVERSE_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/impl_modular_multiplicative_inverse.h"
13 | #include "hurchalla/modular_arithmetic/modular_multiplication.h"
14 | #include "hurchalla/util/traits/ut_numeric_limits.h"
15 | #include "hurchalla/util/compiler_macros.h"
16 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
17 | 
18 | namespace hurchalla {
19 | 
20 | 
21 | // Returns the modular multiplicative inverse of 'a', mod the modulus.
22 | // Also assigns the gcd of 'a' and modulus to the reference parameter gcd.
23 | //
24 | // Note: Calling with a < modulus slightly improves performance.
25 | // [The multiplicative inverse is an integer > 0 and < modulus, such that
26 | //    a * multiplicative_inverse == 1 (mod modulus).   It is a unique number,
27 | //    but it exists if and only if 'a' and 'modulus' are coprime.]
28 | template <typename T>
29 | T modular_multiplicative_inverse(T a, T modulus, T& gcd)
30 | {
31 |     // T must be an unsigned integer type.
32 |     static_assert(ut_numeric_limits<T>::is_integer &&
33 |                   !(ut_numeric_limits<T>::is_signed), "");
34 |     HPBC_CLOCKWORK_API_PRECONDITION(modulus > 1);
35 |     T inv = detail::impl_modular_multiplicative_inverse::call(a, modulus, gcd);
36 |     // POSTCONDITIONS: the return value is reduced, and it is either 0 (when
37 |     //   no inverse exists) or the inverse itself, which is never 0 given that
38 |     //   modulus>1; in the latter case (a % modulus) * inv == 1 (mod modulus).
39 |     HPBC_CLOCKWORK_POSTCONDITION(inv < modulus);
40 |     HPBC_CLOCKWORK_POSTCONDITION(inv == 0 ||
41 |                        ::hurchalla::modular_multiplication_prereduced_inputs(
42 |                            static_cast<T>(a % modulus), inv, modulus) == 1);
43 |     return inv;
44 | }
45 | 
46 | // Same as the above function, except that it omits the gcd reference parameter.
47 | template <typename T>  HURCHALLA_FORCE_INLINE
48 | T modular_multiplicative_inverse(T a, T modulus)
49 | {
50 |     T discarded_gcd;  // written by the three-argument overload, then dropped
51 |     return modular_multiplicative_inverse(a, modulus, discarded_gcd);
52 | }
53 | 
54 | 
55 | }	// end namespace
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/README.md:
--------------------------------------------------------------------------------
1 | All files in this experimental directory are expected to be good and immediately usable if desired.  However, I consider them to be experiments that aren't useful enough to be moved outside of an "experimental" directory.
2 | 
3 | MontyFullRangeMasked.h:
4 | The class MontyFullRangeMasked is usable in the same situations and in the same way as MontyFullRange; i.e. any odd value is permissible for the modulus of the constructor.  It uses some interesting and unusual optimizations to the Montgomery arithmetic algorithms, in order to (in theory) perform faster multiply and square and fused-multiply/square-add/sub operations.  The speedup comes at the cost of slightly slower simple add and subtract operations.  The speedup also applies only to certain sizes of T.  For a type T that is the same size as the CPU integer registers (e.g. uint64_t on a 64 bit computer) or a type T that is smaller than the register size, there is a decent chance that MontyFullRangeMasked<T> will perform better overall than MontyFullRange<T>, when both are given the same modulus.  This is due to the improved multiply, square, and fused-multiply/square-add/sub functions.  However, the plain add() and subtract() functions in MontyFullRangeMasked<T> will usually be slower than those in MontyFullRange<T>.  For a type T that is larger than the CPU integer register size, you can usually expect MontyFullRangeMasked<T> to perform worse overall than MontyFullRange<T>, and to provide little or no benefit.  If your modulus is small enough to allow use of MontyQuarterRange<T> or MontyHalfRange<T>, you can usually expect those classes to perform better than either MontyFullRange<T> or MontyFullRangeMasked<T>, regardless of the size of T.
5 | To use MontyFullRangeMasked, you would ordinarily declare a variable (using an unsigned integral type T) as follows:
6 | MontgomeryForm<T, MontyFullRangeMasked<T>> mf;
7 | 
8 | The unit_testing_helpers subdirectory contains classes that provide a run-time polymorphic version of MontgomeryForm for potentially much faster compile times during unit testing.  These classes of course have a run-time performance penalty, so they're intended for use only in unit testing.  At the moment, the class NoForceInlineMontgomeryForm (in the main test folder) seems to improve the compile times for the unit tests sufficiently, and so these extra classes remain here as experimental.  Nevertheless, these extra classes compile correctly for me with clang16 (on macOS) and pass their tests in test_MontgomeryForm_extra.cpp.
9 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '16 9 * * 1'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'cpp' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 | 
39 |     steps:
40 |     - name: Checkout repository
41 |       uses: actions/checkout@v3
42 | 
43 |     # Initializes the CodeQL tools for scanning.
44 |     - name: Initialize CodeQL
45 |       uses: github/codeql-action/init@v2
46 |       with:
47 |         languages: ${{ matrix.language }}
48 |         # If you wish to specify custom queries, you can do so here or in a config file.
49 |         # By default, queries listed here will override any specified in a config file.
50 |         # Prefix the list here with "+" to use these queries and those in the config file.
51 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 | 
53 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
54 |     # If this step fails, then you should remove it and run the build manually (see below)
55 |     - name: Autobuild
56 |       uses: github/codeql-action/autobuild@v2
57 | 
58 |     # ℹ️ Command-line programs to run using the OS shell.
59 |     # 📚 https://git.io/JvXDl
60 | 
61 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 |     #    and modify them (or add more) to build your code if your project
63 |     #    uses a compiled language
64 | 
65 |     #- run: |
66 |     #   make bootstrap
67 |     #   make release
68 | 
69 |     - name: Perform CodeQL Analysis
70 |       uses: github/codeql-action/analyze@v2
71 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontgomeryDefault.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_MONTGOMERY_DEFAULT_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_MONTGOMERY_DEFAULT_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/montgomery_arithmetic/detail/MontyFullRange.h"
13 | #include "hurchalla/montgomery_arithmetic/detail/MontyHalfRange.h"
14 | #include "hurchalla/montgomery_arithmetic/detail/MontyQuarterRange.h"
15 | #include "hurchalla/util/traits/extensible_make_unsigned.h"
16 | #include "hurchalla/util/traits/ut_numeric_limits.h"
17 | #include "hurchalla/util/sized_uint.h"
18 | #include <type_traits>
19 | 
20 | namespace hurchalla { namespace detail {
21 | 
22 | 
23 | template <typename T>
24 | class MontgomeryDefault final {
25 |     static_assert(ut_numeric_limits<T>::is_integer, "");
26 |     static constexpr int bitsT = ut_numeric_limits<T>::digits;
27 |     static constexpr int target_bits = HURCHALLA_TARGET_BIT_WIDTH;
28 |     using U = typename extensible_make_unsigned<T>::type;
29 |     using NativeU = typename sized_uint<target_bits>::type;
30 |     // Candidates, selected by how far bitsT is below target_bits: at least 2
31 |     // bits -> quarter range, exactly 1 bit -> half range, else full range.
32 |     using Quarter = MontyQuarterRange<NativeU>;
33 |     using Half = MontyHalfRange<NativeU>;
34 |     using Full = MontyFullRange<U>;
35 | public:
36 |     using type = typename std::conditional<(bitsT <= target_bits - 2),
37 |                      Quarter, typename std::conditional<
38 |                          (bitsT <= target_bits - 1), Half, Full>::type>::type;
39 | };
40 | 
41 | // Implementation note: when bitsT > target_bits (e.g. T == __int128_t on a 64
42 | // bit system), we purposely never use MontyHalfRange above and instead default
43 | // to MontyFullRange, because MontyFullRange uses unsigned hi_lo mults, whereas
44 | // MontyHalfRange uses signed hi_lo multiplications...
45 | // When bitsT > target_bits we're forced to use a 'slow' hi_lo mult routine,
46 | // since there's no simple asm instruction that's applicable- e.g. on x86_64,
47 | // we need far more than a single MUL or IMUL.  And unfortunately we don't have
48 | // a signed routine that's as good as unsigned when bitsT > target_bits.  For
49 | // details see the comments for slow_signed_multiply_to_hilo_product() in
50 | // hurchalla/util/detail/platform_specific/impl_signed_multiply_to_hilo_product.h
51 | 
52 | 
53 | }} // end namespace
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/modular_multiplication.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_MODULAR_MULTIPLICATION_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_MODULAR_MULTIPLICATION_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_multiplication.h"
13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
14 | #include "hurchalla/util/compiler_macros.h"
15 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
16 | 
17 | namespace hurchalla {
18 | 
19 | 
20 | // Alternatively, please consider using the montgomery multiplication class
21 | // MontgomeryForm (specifically its multiply function) instead of this function
22 | // modular_multiplication_prereduced_inputs().  If you are heavily using modular
23 | // multiplication in your code, there's a very good chance that montgomery
24 | // multiplication will improve performance- often significantly.  It always
25 | // requires an odd modulus though.
26 | 
27 | template <typename T>
28 | T modular_multiplication_prereduced_inputs(T a, T b, T modulus)
29 | {
30 |     // T must be an unsigned integer type.
31 |     static_assert(ut_numeric_limits<T>::is_integer &&
32 |                   !(ut_numeric_limits<T>::is_signed), "");
33 |     HPBC_CLOCKWORK_API_PRECONDITION(modulus>0);
34 |     // Both inputs must be prereduced, i.e. already less than the modulus.
35 |     HPBC_CLOCKWORK_API_PRECONDITION(a<modulus);
36 |     HPBC_CLOCKWORK_API_PRECONDITION(b<modulus);
37 |     T result = detail::impl_modular_multiplication<T>::call(a, b, modulus);
38 |     // POSTCONDITION: result is (a*b)%modulus, as if computed at infinite
39 |     //                precision so that overflow cannot affect it.
40 |     HPBC_CLOCKWORK_POSTCONDITION(result<modulus);
41 |     return result;
42 | }
43 | 
44 | // modular_multiplication_has_slow_perf() can help when a calculation seems
45 | // borderline as to whether standard modular multiplication or montgomery
46 | // multiplication would perform better, in general across systems.  Its result
47 | // is usable at compile-time (e.g. as the condition of a constexpr if) to
48 | // select between a montgomery and a standard modmult.
49 | template <typename T>  HURCHALLA_FORCE_INLINE
50 | constexpr bool modular_multiplication_has_slow_perf()
51 | {
52 |     // has_slow_perf() is supplied by the implementation struct for type T.
53 |     return detail::impl_modular_multiplication<T>::has_slow_perf();
54 | }
55 | 
56 | 
57 | } // end namespace
58 | 
59 | #endif
60 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2024-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_IMPL_MONTGOMERY_FORM_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_IMPL_MONTGOMERY_FORM_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
14 | #include "hurchalla/util/compiler_macros.h"
15 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
16 | #include <type_traits>
17 | #include <cstddef>
18 | 
19 | namespace hurchalla { namespace detail {
20 | 
21 | 
22 | // The primary template below handles when InlineAll == true, and annotates
23 | // all class functions with a force inline attribute.
24 | // The template specialization handles InlineAll == false, and does not annotate
25 | // any of the class functions with a force inline attribute.
26 | //
27 | // Note: this is a rare case where ugly #define / #undef / #include hacking
28 | // seems to be the best way to make the code clear and maintainable.  Placing or
29 | // not placing an attribute on a function doesn't appear to be something we can
30 | // directly do with a template parameter.  So we work-around it by creating two
31 | // exact class duplicates (not counting the attribute, which is #defined or
32 | // #undef'd) by using #include, and we use a class specialization (as below)
33 | // to determine whether or not the class's functions get the attribute defined
34 | // and placed, or not.
35 | 
36 | 
37 | #define HURCHALLA_IMF_MAYBE_FORCE_INLINE HURCHALLA_FORCE_INLINE
38 | // Primary template, instantiated for InlineAll == true.
39 | // The class body is pulled in from ImplMontgomeryForm.contents while
40 | // HURCHALLA_IMF_MAYBE_FORCE_INLINE expands to HURCHALLA_FORCE_INLINE, so all
41 | // functions in this instantiation get a force inline annotation.
42 | template <class T, bool InlineAll, class MontyType>
43 | class ImplMontgomeryForm final {
44 | #include "hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.contents"
45 | };
46 | #undef HURCHALLA_IMF_MAYBE_FORCE_INLINE
47 | 
48 | 
49 | #define HURCHALLA_IMF_MAYBE_FORCE_INLINE
50 | // Partial specialization, selected when InlineAll == false.
51 | // It includes the same ImplMontgomeryForm.contents file as the class body,
52 | // but here HURCHALLA_IMF_MAYBE_FORCE_INLINE is defined as blank, so no
53 | // function gets a force inline annotation.  The macro is #undef'd right
54 | // after the class definition.
55 | template <class T, class MontyType>
56 | class ImplMontgomeryForm<T, false, MontyType> final {
57 | #include "hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.contents"
58 | };
59 | #undef HURCHALLA_IMF_MAYBE_FORCE_INLINE
60 | 
61 | 
62 | }} // end namespace
63 | 
64 | #endif
65 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | # This Source Code Form is subject to the terms of the Mozilla Public
 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 5 | 
 6 | 
# Guard against processing this file twice: if the test target already exists,
# there is nothing left to do.
if(TARGET test_hurchalla_modular_arithmetic)
    return()
endif()

# later versions are probably fine, but are untested
cmake_minimum_required(VERSION 3.14...4.03)


# Project-local helper scripts, located next to this CMakeLists.txt.
# EnableMaxWarnings.cmake provides the EnableMaxWarnings() command used below;
# FetchGoogleTest.cmake presumably makes the gtest_main target available
# (NOTE(review): confirm against that file).
include(FetchGoogleTest.cmake)
include(EnableMaxWarnings.cmake)


#set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/tests)
#set(CTEST_BINARY_DIRECTORY ${PROJECT_BINARY_DIR}/tests)


# needed for gtest_discover_tests()
include(GoogleTest)


# A single test executable is built from all of the test sources, for both the
# plain modular arithmetic functions and the Montgomery arithmetic functions.
add_executable(test_hurchalla_modular_arithmetic
               modular_arithmetic/test_absolute_value_difference.cpp
               modular_arithmetic/test_modular_addition.cpp
               modular_arithmetic/test_modular_addition_with_subtraction.cpp
               modular_arithmetic/test_modular_multiplication.cpp
               modular_arithmetic/test_modular_multiplicative_inverse.cpp
               modular_arithmetic/test_modular_pow.cpp
               modular_arithmetic/test_modular_subtraction.cpp
               montgomery_arithmetic/low_level_api/test_get_Rsquared_mod_n.cpp
               montgomery_arithmetic/low_level_api/test_inverse_mod_R.cpp
               montgomery_arithmetic/low_level_api/test_REDC.cpp
               montgomery_arithmetic/low_level_api/test_REDC_inline_asm.cpp
               montgomery_arithmetic/test_montgomery_pow.cpp
               montgomery_arithmetic/test_montgomery_two_pow.cpp
               montgomery_arithmetic/test_MontgomeryForm.cpp
               montgomery_arithmetic/test_MontgomeryFormExtensions.cpp
               montgomery_arithmetic/test_MontgomeryForm_extra.cpp
               )

EnableMaxWarnings(test_hurchalla_modular_arithmetic)


# Forward the CMake option to the test sources as a preprocessor macro of the
# same name; the sources use it to enable the extensive (slow) tests.
if(HURCHALLA_TEST_MODULAR_ARITHMETIC_HEAVYWEIGHT)
    target_compile_definitions(test_hurchalla_modular_arithmetic
                               PRIVATE
                               HURCHALLA_TEST_MODULAR_ARITHMETIC_HEAVYWEIGHT)
endif()
# NOTE(review): /bigobj is presumably needed because the heavily templated test
# sources exceed MSVC's default object file section limit - confirm.
if(MSVC)
    target_compile_options(test_hurchalla_modular_arithmetic PRIVATE /bigobj)
endif()


# Group the target under "Tests" in IDEs that display target folders.
set_target_properties(test_hurchalla_modular_arithmetic
                      PROPERTIES FOLDER "Tests")
# Link against the library under test and against googletest's main().
target_link_libraries(test_hurchalla_modular_arithmetic
                      hurchalla_modular_arithmetic
                      gtest_main)
#add_test(test_hurchalla_modular_arithmetic  test_hurchalla_modular_arithmetic)
66 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/detail/impl_modular_pow.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_POW_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_POW_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/modular_multiplication.h"
13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
14 | #include "hurchalla/util/cselect_on_bit.h"
15 | #include "hurchalla/util/compiler_macros.h"
16 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
17 | 
18 | namespace hurchalla { namespace detail {
19 | 
20 | 
21 | // Returns the modular exponentiation of base^exponent (mod modulus).
22 | // For details, see http://en.wikipedia.org/wiki/Modular_exponentiation
23 | // note: uses a static member function to disallow ADL.
24 | struct impl_modular_pow {
25 |   template <typename T, typename U>
26 |   HURCHALLA_FORCE_INLINE static T call(T base, U exponent, T modulus)
27 |   {
28 |     static_assert(ut_numeric_limits<T>::is_integer, "");
29 |     static_assert(!(ut_numeric_limits<T>::is_signed), "");
30 |     static_assert(ut_numeric_limits<U>::is_integer, "");
31 |     static_assert(!(ut_numeric_limits<U>::is_signed), "");
32 |     HPBC_CLOCKWORK_PRECONDITION2(modulus > 1);
33 | 
34 |     namespace hc = ::hurchalla;
35 |     if (base >= modulus)
36 |        base = static_cast<T>(base % modulus);
37 | /*
38 |     // Applied Handbook of Cryptography- http://cacr.uwaterloo.ca/hac/
39 |     // Algorithm 14.76, original unoptimized version
40 |     T result = 1;
41 |     while (exponent > 0)
42 |     {
43 |        if (exponent & 1u) {
44 |           result = hc::modular_multiplication_prereduced_inputs(
45 |                                                          result, base, modulus);
46 |        }
47 |        exponent = exponent >> 1;
48 |        base = hc::modular_multiplication_prereduced_inputs(base, base, modulus);
49 |     }
50 | */
51 |     // slightly optimized version
52 |        // T result = (exponent & 1u) ? base : 1;
53 |     T result = ::hurchalla::cselect_on_bit<0>::ne_0(
54 |                       static_cast<uint64_t>(exponent), base, static_cast<T>(1));
55 |     while (exponent > 1)
56 |     {
57 |        exponent = static_cast<U>(exponent >> 1);
58 |        base = hc::modular_multiplication_prereduced_inputs(base, base, modulus);
59 |        if (exponent & 1u) {
60 |           result = hc::modular_multiplication_prereduced_inputs(
61 |                                                          result, base, modulus);
62 |        }
63 |     }
64 |     return static_cast<T>(result);
65 |   }
66 | };
67 | 
68 | 
69 | }}  // end namespace
70 | 
71 | #endif  // include guard
72 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2025 Jeffrey Hurchalla.
 4 | #
 5 | # This Source Code Form is subject to the terms of the Mozilla Public
 6 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 7 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 8 | 
 9 | 
10 | 
11 | 
12 | # You need to clone the util, factoring, and modular_arithmetic repos
13 | # from https://github.com/hurchalla
14 | 
# SET repo_directory TO THE DIRECTORY WHERE YOU CLONED THE HURCHALLA GIT
# REPOSITORIES.  (or otherwise ensure the compiler -I flags correctly specify
# the needed hurchalla include directories)
18 | 
19 | repo_directory=/Users/jeffreyhurchalla/Desktop
20 | #repo_directory=/home/jeff/repos
21 | 
22 | 
# you would ordinarily use either g++ or clang++  for $1
cppcompiler=$1


# Pick the compiler-specific spelling of the "stop after 3 errors" flag.
# The test is an exact string match on "g++"; any other value of $1 (including
# clang++) takes the else branch.
# warn_nrvo is only set for g++.  In the else branch it stays unset, so it
# expands to nothing where it is used on the compile command line.
if [[ $cppcompiler == "g++" ]]; then
  error_limit=-fmax-errors=3
  warn_nrvo=-Wnrvo
else
  error_limit=-ferror-limit=3
fi
33 | 
34 | 
35 | 
36 | 
# Terminates the script with exit status 1 if the command that ran immediately
# before this function was called returned a non-zero status.  Otherwise it
# does nothing.
exit_on_failure () {
  local previous_status=$?
  [ "$previous_status" -eq 0 ] || exit 1
}
42 | 
43 | #optimization_level=O2
44 | #optimization_level=O3
45 | optimization_level=$2
46 | 
47 | #define_mont_type=-DDEF_MONT_TYPE=MontgomeryQuarter
48 | define_mont_type=-DDEF_MONT_TYPE=$3
49 | define_uint_type=-DDEF_UINT_TYPE=$4
50 | 
51 | # you must specify either -DTEST_ARRAY or -DTEST_SCALAR for $8
52 | define_test_type=$8
53 | 
54 | 
55 | cpp_standard=c++17
56 | 
57 | 
58 | # You can use arguments $9 and ${10} and ${11} etc to define macros such as
59 | # -DHURCHALLA_ALLOW_INLINE_ASM_ALL
60 | # for debugging, defining the following macros may be useful
61 | # -DHURCHALLA_CLOCKWORK_ENABLE_ASSERTS  -DHURCHALLA_UTIL_ENABLE_ASSERTS
62 | 
63 | 
64 | # we could also use  -g  to get debug symbols (for lldb/gdb, and objdump)
65 | 
66 | $cppcompiler   \
67 |         $error_limit   -$optimization_level \
68 |         $define_mont_type  $define_uint_type  $define_test_type \
69 |          $9 ${10} ${11} ${12} ${13} ${14} \
70 |         -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion $warn_nrvo \
71 |         -std=$cpp_standard \
72 |         -I${repo_directory}/modular_arithmetic/modular_arithmetic/include \
73 |         -I${repo_directory}/modular_arithmetic/montgomery_arithmetic/include \
74 |         -I${repo_directory}/util/include \
75 |         -c testbench_montgomery_two_pow.cpp
76 | 
77 | exit_on_failure
78 | 
79 | $cppcompiler  -$optimization_level  -std=$cpp_standard  -o testbench_montgomery_two_pow  testbench_montgomery_two_pow.o -lm
80 | 
81 | exit_on_failure
82 | 
83 | echo "compilation finished, now executing:"
84 | 
85 | 
86 | # argument $5 (if present), is the randomization seed for std::mt19937_64
87 | # argument $6 (if present), is max_modulus_bits_reduce
88 | # argument $7 (if present), is exponent_bits_reduce
89 | 
90 | ./testbench_montgomery_two_pow $5 $6 $7
91 | 
92 | # To give you an example of invoking this script at the command line:
93 | #   ./testbench.sh clang++ O3 MontgomeryFull __uint128_t 191 8 50  -DTEST_ARRAY -DHURCHALLA_ALLOW_INLINE_ASM_ALL
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow_API.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_EXPERIMENTAL_API_MONTGOMERY_TWO_POW_H_INCLUDED
 9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_EXPERIMENTAL_API_MONTGOMERY_TWO_POW_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/montgomery_arithmetic/detail/platform_specific/montgomery_two_pow.h"
13 | #include <cstddef>
14 | #include <array>
15 | 
16 | namespace experimental_api {
17 | 
18 | 
// Computes the integer pow(2, n), reduced modulo the modulus of mf.  The
// returned value is in MontgomeryForm representation.
//
// MF may be any MontgomeryForm type (see MontgomeryForm.h), and U may be any
// integer type.  The argument 'n' is the exponent.
//
// This function is only a thin convenience wrapper: rather than calling it, you
// could just directly call mf.two_pow(n).
//
// Implementation note: mf.two_pow(n) internally just delegates to
//    hurchalla::detail::montgomery_two_pow::call(mf, n);
// which uses novel optimizations of the k-ary exponentiation algorithm
// ( https://en.wikipedia.org/wiki/Exponentiation_by_squaring )
// that rely on a hard-coded base 2.
//
template <class MF, typename U>
auto montgomery_two_pow(const MF& mf, U n) -> typename MF::MontgomeryValue
{
    return mf.two_pow(n);
}
38 | 
39 | 
40 | // An array version of the above function - you can expect it to always have
41 | // significantly higher throughput than the above.  (In benchmarks I have
42 | // observed it to have a performance advantage of anywhere from 1.4x to 3x
43 | // higher throughput, depending on the CPU type and whether 64 or 128 bit
44 | // integer types are calculated)
45 | //
46 | // For each array index 'i' from 0 to ARRAY_SIZE-1, this function calculates
47 | // the integer result[i] = pow(2, n[i])  modulo the modulus of mf[i], and
48 | // returns this result array; the result array is in MontgomeryForm
49 | // representation.
50 | //
51 | // MF can be any MontgomeryForm type (see MontgomeryForm.h), and U can be any
52 | // integer type.
53 | //
54 | template <class MF, typename U, size_t ARRAY_SIZE>
55 | std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
56 | array_montgomery_two_pow(const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
57 | {
58 |     // Implementation note: at the moment this API function is the only easy way
59 |     // to get the array version of Montgomery two pow (MontgomeryForm.h does not
60 |     // have an *array* two_pow member function).
61 |     // At some point in the next 8 months I expect to create a SIMD version of
62 |     // MontgomeryForm, and at that time the SIMD MontgomeryForm will become the
63 |     // preferred API to use to access the (high throughput) array version of
64 |     // Montgomery two_pow.
65 | 
66 |     return hurchalla::detail::montgomery_two_pow::call(mf, n);
67 | }
68 | 
69 | 
70 | } // end namespace
71 | 
72 | #endif
73 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/detail/impl_modular_multiplicative_inverse.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_MULTIPLICATIVE_INV_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_MULTIPLICATIVE_INV_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/util/traits/extensible_make_signed.h"
13 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
14 | #include "hurchalla/util/traits/ut_numeric_limits.h"
15 | #include "hurchalla/util/conditional_select.h"
16 | #include "hurchalla/util/compiler_macros.h"
17 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
18 | #include <type_traits>
19 | 
20 | namespace hurchalla { namespace detail {
21 | 
22 | 
// Computes the modular multiplicative inverse of val (mod modulus) using a
// reduced form of the extended Euclidean algorithm, and also reports the
// greatest common divisor of val and modulus through the 'gcd' out-parameter.
//
// T must be an unsigned integer type, and the caller must ensure modulus > 1.
// Returns the inverse if it exists, and otherwise returns 0.
//
// note: uses a static member function to disallow ADL.
struct impl_modular_multiplicative_inverse {
  template <typename T>
  HURCHALLA_FORCE_INLINE static T call(T val, T modulus, T& gcd)
  {
    static_assert(ut_numeric_limits<T>::is_integer, "");
    static_assert(!(ut_numeric_limits<T>::is_signed), "");
    // I decided not to support modulus<=1, since it's not likely to be used and
    // it complicates the return type and adds conditional branches.
    HPBC_CLOCKWORK_PRECONDITION2(modulus > 1);

    // POSTCONDITION1: Returns 0 if the inverse doesn't exist. Otherwise returns
    //    the inverse (which is never 0, given that modulus>1).
    // POSTCONDITION2: Sets gcd to the greatest common divisor of val and
    //    modulus. Note that if the inverse exists, we will get gcd == 1.

    // U is the unsigned type used for the remainders and the quotient; S is
    // the signed type used for the coefficients, which can become negative.
    using U = typename safely_promote_unsigned<T>::type;
    using S = typename extensible_make_signed<U>::type;

    // The following algorithm is adapted from Figure 6 of
    // https://jeffhurchalla.com/2018/10/13/implementing-the-extended-euclidean-algorithm-with-unsigned-inputs/
    // calculating only what is needed for the modular multiplicative inverse.
    //
    // a1 and a2 hold two consecutive remainders of the Euclidean algorithm
    // (starting from modulus and val), and q holds the most recent quotient.
    // y0 and y1 hold the two most recent values of the coefficient sequence
    // that is updated with the recurrence  y2 = y0 - q*y1.
    // q is initialised to 0, so on the first pass through the loop y2 is simply
    // equal to y0.
    S y1=0;
    U a1=modulus;
    S y0=1;
    U a2=val;
    U q=0;
    while (a2 > 1) {
        // This coefficient update uses the q of the *previous* iteration (q is
        // only refreshed further down), so the coefficients lag one step
        // behind the remainders.  The last update is done after the loop.
        S y2 = static_cast<S>(y0 - static_cast<S>(q)*y1);
        y0=y1;
        y1=y2;
        U a0=a1;
        a1=a2;

        // Euclidean step:  q = floor(a0/a1),  a2 = a0 mod a1
        q = static_cast<U>(a0/a1);
        a2 = static_cast<U>(a0 - q*a1);
    }
    // The loop ends with a2 equal to either 0 or 1.
    HPBC_CLOCKWORK_ASSERT2(a1 > 1);

    if (a2 == 1) {
        // A remainder of 1 was reached, so val and modulus are coprime and the
        // inverse exists.
        gcd = 1;
        // Perform the coefficient update that the loop left pending.
        S y = static_cast<S>(y0 - static_cast<S>(q)*y1);
          // inv = (y<0) ? y+modulus : y
        U inv = ::hurchalla::conditional_select(y<0,
                                  static_cast<U>(static_cast<U>(y)+modulus),
                                  static_cast<U>(y));
        HPBC_CLOCKWORK_POSTCONDITION2(inv < modulus);
        return static_cast<T>(inv);
    }
    else {
        // a2 == 0:  the last nonzero remainder, a1, is the gcd.  It is greater
        // than 1 here, so no inverse exists.
        gcd = static_cast<T>(a1);
        HPBC_CLOCKWORK_ASSERT2(gcd > 1);
        return 0;
    }
  }
};
79 | 
80 | 
81 | }}  // end namespace
82 | 
83 | #endif
84 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020-2022 Jeffrey Hurchalla.
 2 | # This Source Code Form is subject to the terms of the Mozilla Public
 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 5 | 
 6 | 
# Guard against processing this file twice (for example when several projects
# each add this library as a subdirectory): if the target already exists,
# there is nothing left to do.
if(TARGET hurchalla_modular_arithmetic)
    return()
endif()

# later versions are probably fine, but are untested
cmake_minimum_required(VERSION 3.14...4.03)

project(hurchalla_modular_arithmetic VERSION 1.0.0 LANGUAGES CXX)


# if this is the top level CMakeLists.txt, let IDEs group projects into folders
if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    set_property(GLOBAL PROPERTY USE_FOLDERS ON)
endif()


# Default all build outputs to the top of the build tree, unless the including
# project (or the user) has already chosen output directories.
if(NOT DEFINED CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
    set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
endif()
if(NOT DEFINED CMAKE_LIBRARY_OUTPUT_DIRECTORY)
    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
endif()
if(NOT DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
endif()


# This is an INTERFACE target: it has no sources of its own to compile.
add_library(hurchalla_modular_arithmetic INTERFACE)


# The two subdirectories are expected to define the targets
# hurchalla_basic_modular_arithmetic and hurchalla_montgomery_arithmetic,
# which are linked into the umbrella target just below.
add_subdirectory(modular_arithmetic
                 ${CMAKE_CURRENT_BINARY_DIR}/modular_arithmetic)
add_subdirectory(montgomery_arithmetic
                 ${CMAKE_CURRENT_BINARY_DIR}/montgomery_arithmetic)

target_link_libraries(hurchalla_modular_arithmetic
                      INTERFACE hurchalla_basic_modular_arithmetic)
target_link_libraries(hurchalla_modular_arithmetic
                      INTERFACE hurchalla_montgomery_arithmetic)



# TODO:  The following may be overly simple, but works so far to install target
# include directories.  It assumes that the build step from the subdirectories
# montgomery_arithmetic and modular_arithmetic (which have build phase
# target_include_directories commands) provides the information to cmake which
# cmake then uses in the install phase target_include_directories below.  This
# has worked for the basic cmake install tests I've done so far...
# ---------------------
target_include_directories(hurchalla_modular_arithmetic
                           INTERFACE $<INSTALL_INTERFACE:include>)
# TODO: use this instead?
# ----------------
#target_include_directories(hurchalla_modular_arithmetic SYSTEM
#                      INTERFACE $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>)



# ***Tests***

# If this is the top level CMakeLists.txt, add testing options, and enable
# testing when testing options have been set to ON.

if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    option(TEST_HURCHALLA_MODULAR_ARITHMETIC
        "Build the tests for the Hurchalla modular arithmetic library project."
        OFF)
    option(HURCHALLA_TEST_MODULAR_ARITHMETIC_HEAVYWEIGHT
        "Include extensive and time consuming tests."
        OFF)
    # No initial value is given for this option; CMake's option() command
    # treats an omitted initial value as OFF.
    option(FORCE_TEST_HURCHALLA_CPP11_STANDARD
       "If testing this library, ensure we build googletest and tests using -std=c++11")

    # The test subdirectory is only added when tests were explicitly requested.
    if(TEST_HURCHALLA_MODULAR_ARITHMETIC)
        enable_testing()
        # include(CTest)
        add_subdirectory(test)
    endif()
endif()
86 | 


--------------------------------------------------------------------------------
/macros_for_performance.md:
--------------------------------------------------------------------------------
 1 | 
 2 | Optional macros you can define to tune performance
 3 | --------------------------------------------------
 4 | There are a number of macros you can optionally define when you compile, to tune the
 5 | performance on your system for the modular arithmetic functions.  It is
 6 | generally recommended not to do so, but in some cases you may find it useful.
 7 | You would define one or more of these macros when compiling *your* sources,
 8 | given that Clockwork is a header-only library.
 9 | 
10 | For example, if you are compiling using clang or gcc from the command line, you would
11 | specify the -D compilation flag, similarly to the following: 
12 | clang++ -DHURCHALLA_ALLOW_INLINE_ASM_ALL  ...more arguments...
13 | As another example, if you are using CMake you would add the command "target_compile_definitions"
14 | to your CMakeLists.txt, similarly to the following: 
15 | target_compile_definitions(&lt;your_target_name&gt;  PRIVATE  HURCHALLA_ALLOW_INLINE_ASM_ALL) 
16 | \
17 | \
18 | HURCHALLA_TARGET_ISA_HAS_NO_DIVIDE - define this macro if your target system's
19 | instruction set does not include division.  Although it is unusual, some
20 | microcontrollers do not have division, and defining this macro might improve
21 | performance in such a case.
22 | 
23 | HURCHALLA_AVOID_CSELECT - you may wish to define this macro if your target
24 | system's instruction set does not include conditional move or conditional
25 | select.  It may improve performance in such a case.  This macro is normally
26 | already defined for RISC-V.
27 | 
28 | HURCHALLA_ALLOW_INLINE_ASM_ALL - defining this macro will enable all
29 | available inline asm functions.  Although this is the easiest macro to use, you
30 | can more selectively enable inline asm for particular functions, using macros
31 | listed below.  In some cases HURCHALLA_ALLOW_INLINE_ASM_ALL may improve
32 | performance up to 20% (gcc often benefits), and in other cases it may make
33 | essentially no difference or harm performance (clang does not seem to benefit).
34 | It is not enabled by default because inline asm is extremely difficult to verify
35 | for correctness.  While I believe I'm skilled at writing high quality inline
36 | asm, I advise you to be skeptical of this and of any inline asm you see.
37 | Unit tests of inline asm are far less helpful than you might think - the ability
38 | of a unit test to detect a bug in inline asm often depends upon the register
39 | allocation choices the compiler makes for surrounding test code, which is mostly
40 | outside a programmer's control.  Generally speaking, it is [difficult to
41 | recommend inline asm](https://gcc.gnu.org/wiki/DontUseInlineAsm) unless there is
42 | a large performance benefit or performance is critical.
43 | 
44 | HURCHALLA_ALLOW_INLINE_ASM_REDC  
45 | HURCHALLA_ALLOW_INLINE_ASM_ABSDIFF  
46 | HURCHALLA_ALLOW_INLINE_ASM_MODADD  
47 | HURCHALLA_ALLOW_INLINE_ASM_MODSUB  
48 | HURCHALLA_ALLOW_INLINE_ASM_QUARTERRANGE_GET_CANONICAL  
49 | HURCHALLA_ALLOW_INLINE_ASM_HALFRANGE_GET_CANONICAL  
- these macros selectively enable inline asm for individual functions, rather
than for all of them at once.  They may or may not improve performance, and the
warnings above for HURCHALLA_ALLOW_INLINE_ASM_ALL apply here too.  To determine
whether they are useful on your system, you would need to compare performance
with the different ASM macros defined and not defined.  Generally you would want
to start with HURCHALLA_ALLOW_INLINE_ASM_REDC.
55 | 


--------------------------------------------------------------------------------
/test/montgomery_arithmetic/low_level_api/test_REDC.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | // Test the REDC function versions that contain no inline asm.
 9 | 
10 | #undef HURCHALLA_ALLOW_INLINE_ASM_ALL
11 | #undef HURCHALLA_ALLOW_INLINE_ASM_REDC
12 | 
13 | 
14 | #include "test_REDC.h"
15 | 
16 | 
17 | TEST(MontgomeryArithmetic, REDC8) {
18 |     std::vector<uint8_t> moduli { 3, 255, 19, 21, 211, 23, 171 };
19 |     for (auto n : moduli)
20 |         REDC_test_all(n);
21 | }
22 | TEST(MontgomeryArithmetic, REDC16) {
23 |     std::vector<uint16_t> moduli { 3, 17, UINT16_C(65535),
24 |                           UINT16_C(65533), UINT16_C(357), UINT16_C(32253),
25 |                           UINT16_C(11111) };
26 |     for (auto n : moduli)
27 |         REDC_test_all(n);
28 | }
29 | TEST(MontgomeryArithmetic, REDC32) {
30 |     std::vector<uint32_t> moduli { 3, 13, UINT32_C(4294967295),
31 |                           UINT32_C(4294967293), UINT32_C(2147483347),
32 |                           UINT32_C(246098243), UINT32_C(1111111) };
33 |     for (auto n : moduli)
34 |         REDC_test_all(n);
35 | }
36 | TEST(MontgomeryArithmetic, REDC64) {
37 |     std::vector<uint64_t> moduli { 3, 11, UINT64_C(18446744073709551615),
38 |                           UINT64_C(18446744073709551613),
39 |                           UINT64_C(4294967295),
40 |                           UINT64_C(3194806714689), UINT64_C(11111111311) };
41 |     for (auto n : moduli)
42 |         REDC_test_all(n);
43 | }
44 | 
45 | #if !defined(__GNUC__) || __GNUC__ >= 11 || defined(__INTEL_COMPILER) || \
46 |                                             defined(__clang__)
47 | // Older versions of GCC (most of them prior to v11) have a compiler bug that
48 | // causes an incorrect value of n to be produced and thus results in one of my
49 | // google test assertions failing. See
50 | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98474 .  The bug appears to have
51 | // been introduced as a regression to gcc in v5.1.  It exists up to the latest
52 | // released version (v10.2) of gcc at the time of this writing.  It's unclear
53 | // at the moment whether __uint128_t is safe to use with any version of gcc
54 | // between 5.1 and 10.2.  The patch appears to fix the bug, and it is scheduled
55 | // to be in the gcc 11 release.
56 | // The #if above disables the following tests on gcc prior to gcc v11, since
57 | // they will fail at optimization level -O1 or higher due to the compiler bug.
58 | # if HURCHALLA_COMPILER_HAS_UINT128_T()
59 | TEST(MontgomeryArithmetic, REDC128) {
60 |     __uint128_t zero = 0;
61 |     std::vector<__uint128_t> moduli { 3, 11, zero-1, zero-3,
62 |                   static_cast<__uint128_t>(UINT64_C(18446744073709551613)) *
63 |                                              UINT64_C(18446744073709551611),
64 |                   static_cast<__uint128_t>(UINT64_C(35698723439051265)) *
65 |                                                 UINT64_C(70945870135873583),
66 |                   static_cast<__uint128_t>(UINT64_C(34069834503)) *
67 |                                               UINT64_C(895835939) };
68 |     for (auto n : moduli)
69 |         REDC_test_all(n);
70 | }
71 | # endif
72 | #endif
73 | 


--------------------------------------------------------------------------------
/test/montgomery_arithmetic/test_MontgomeryForm_extra.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2024 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #include "test_MontgomeryForm.h"
 9 | #include "hurchalla/montgomery_arithmetic/detail/MontyWrappedStandardMath.h"
10 | #include "hurchalla/montgomery_arithmetic/detail/experimental/MontyFullRangeMasked.h"
11 | #include "hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryForm.h"
12 | #include "hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/ConcreteMontgomeryForm.h"
13 | #include "hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryWrapper.h"
14 | #include "hurchalla/util/compiler_macros.h"
15 | #include "gtest/gtest.h"
16 | 
17 | 
18 | namespace {
19 | 
20 | 
21 | // For unit testing, we want fast compile times, so it helps to use the version
22 | // of MontgomeryForm that generally doesn't do force inlining.
23 | #if 1
24 | constexpr bool forceInlineAllFunctions = false;
25 | #else
26 | constexpr bool forceInlineAllFunctions = true;
27 | #endif
28 | 
29 | template <class T, class Monty> using MF =
30 |     hurchalla::MontgomeryForm<T, forceInlineAllFunctions, Monty>;
31 | 
32 | 
33 | 
34 | // test the 'unusual' Montgomery types, which are MontyWrappedStandardMath and
35 | // the experimental class MontyFullRangeMasked.
36 | 
37 | TEST(MontgomeryArithmetic, MontyWrappedStandardMath) {
38 |     test_custom_monty<MF, hurchalla::detail::MontyWrappedStandardMath>();
39 | }
40 | 
41 | 
42 | #ifdef HURCHALLA_TEST_MODULAR_ARITHMETIC_HEAVYWEIGHT
43 | // MontyFullRangeMasked is experimental, so we skip it when we're not doing
44 | // extensive (heavyweight) testing.
45 | TEST(MontgomeryArithmetic, MontyFullRangeMasked) {
46 |     test_custom_monty<MF, hurchalla::detail::MontyFullRangeMasked>();
47 | }
48 | 
49 | // The group of classes: ConcreteMontgomeryForm, AbstractMontgomeryForm, and
50 | // AbstractMontgomeryWrapper, are experimental, so we skip testing them when
51 | // we're not doing extensive (heavyweight) testing.
52 | TEST(MontgomeryArithmetic, MontyVirtual) {
53 | #if HURCHALLA_COMPILER_HAS_UINT128_T()
54 |     {
55 |         using ConcreteMF = hurchalla::ConcreteMontgomeryForm<hurchalla::MontgomeryForm<__uint128_t>,
56 |                                                              TESTABLE_ARRAY_POW_SIZES()>;
57 |         using Wrapper = hurchalla::AbstractMontgomeryWrapper<ConcreteMF::Parent>;
58 |         test_MontgomeryForm<Wrapper, ConcreteMF>();
59 |     }
60 | #endif
61 |     {
62 |         using ConcreteMF = hurchalla::ConcreteMontgomeryForm<hurchalla::MontgomeryForm<uint32_t>,
63 |                                                              TESTABLE_ARRAY_POW_SIZES()>;
64 |         using Wrapper = hurchalla::AbstractMontgomeryWrapper<ConcreteMF::Parent>;
65 |         test_MontgomeryForm<Wrapper, ConcreteMF>();
66 |     }
67 |     {
68 |         using ConcreteMF = hurchalla::ConcreteMontgomeryForm<hurchalla::MontgomeryForm<int32_t>,
69 |                                                              TESTABLE_ARRAY_POW_SIZES()>;
70 |         using Wrapper = hurchalla::AbstractMontgomeryWrapper<ConcreteMF::Parent>;
71 |         test_MontgomeryForm<Wrapper, ConcreteMF>();
72 |     }
73 | }
74 | #endif
75 | 
76 | 
77 | } // end anonymous namespace
78 | 


--------------------------------------------------------------------------------
/test/montgomery_arithmetic/low_level_api/test_REDC_inline_asm.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | // Test the REDC function versions that contain inline asm.
 9 | 
10 | #undef HURCHALLA_ALLOW_INLINE_ASM_ALL
11 | #define HURCHALLA_ALLOW_INLINE_ASM_ALL
12 | 
13 | // For extra coverage, we also enable the asserts, so that the internal REDC
14 | // function postconditions call corresponding non-inline asm functions to
15 | // check their results.
16 | #undef HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
17 | #define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
18 | #undef HURCHALLA_CLOCKWORK_ASSERT_LEVEL
19 | #define HURCHALLA_CLOCKWORK_ASSERT_LEVEL 3
20 | 
21 | 
22 | #include "test_REDC.h"
23 | 
24 | 
25 | TEST(MontgomeryArithmetic, REDC8_inline_asm) {
26 |     std::vector<uint8_t> moduli { 3, 255, 19, 21, 211, 23, 171 };
27 |     for (auto n : moduli)
28 |         REDC_test_all(n);
29 | }
30 | TEST(MontgomeryArithmetic, REDC16_inline_asm) {   // 16 bit moduli, all odd
31 |     std::vector<uint16_t> moduli { 3, 17, UINT16_C(65535),   // 65535 is the uint16_t maximum
32 |                           UINT16_C(65533), UINT16_C(357), UINT16_C(32253),
33 |                           UINT16_C(11111) };
34 |     for (auto n : moduli)
35 |         REDC_test_all(n);   // not defined in this file; presumably provided by test_REDC.h
36 | }
37 | TEST(MontgomeryArithmetic, REDC32_inline_asm) {   // 32 bit moduli, all odd
38 |     std::vector<uint32_t> moduli { 3, 13, UINT32_C(4294967295),   // 4294967295 is the uint32_t maximum
39 |                           UINT32_C(4294967293), UINT32_C(2147483347),
40 |                           UINT32_C(246098243), UINT32_C(1111111) };
41 |     for (auto n : moduli)
42 |         REDC_test_all(n);   // not defined in this file; presumably provided by test_REDC.h
43 | }
44 | TEST(MontgomeryArithmetic, REDC64_inline_asm) {   // 64 bit moduli, all odd
45 |     std::vector<uint64_t> moduli { 3, 11, UINT64_C(18446744073709551615),   // the uint64_t maximum
46 |                           UINT64_C(18446744073709551613),
47 |                           UINT64_C(4294967295),   // 2^32 - 1
48 |                           UINT64_C(3194806714689), UINT64_C(11111111311) };
49 |     for (auto n : moduli)
50 |         REDC_test_all(n);   // not defined in this file; presumably provided by test_REDC.h
51 | }
52 | 
53 | #if !defined(__GNUC__) || __GNUC__ >= 11 || defined(__INTEL_COMPILER) || \
54 |                                             defined(__clang__)
55 | // Older versions of GCC (most of them prior to v11) have a compiler bug that
56 | // causes an incorrect value of n to be produced and thus results in one of my
57 | // google test assertions failing. See
58 | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98474 .  The bug appears to have
59 | // been introduced as a regression to gcc in v5.1.  It exists up to the latest
60 | // released version (v10.2) of gcc at the time of this writing.  It's unclear
61 | // at the moment whether __uint128_t is safe to use with any version of gcc
62 | // between 5.1 and 10.2.  The patch appears to fix the bug, and it is scheduled
63 | // to be in the gcc 11 release.
64 | // The #if above disables the following tests on gcc prior to gcc v11, since
65 | // they will fail at optimization level -O1 or higher due to the compiler bug.
66 | # if HURCHALLA_COMPILER_HAS_UINT128_T()
67 | TEST(MontgomeryArithmetic, REDC128_inline_asm) {   // 128 bit moduli, all odd
68 |     __uint128_t zero = 0;   // zero-1 and zero-3 below wrap around to 2^128 - 1 and 2^128 - 3
69 |     std::vector<__uint128_t> moduli { 3, 11, zero-1, zero-3,
70 |                   static_cast<__uint128_t>(UINT64_C(18446744073709551613)) *   // cast first so the multiply is done in 128 bits
71 |                                              UINT64_C(18446744073709551611),
72 |                   static_cast<__uint128_t>(UINT64_C(35698723439051265)) *
73 |                                                 UINT64_C(70945870135873583),
74 |                   static_cast<__uint128_t>(UINT64_C(34069834503)) *
75 |                                               UINT64_C(895835939) };
76 |     for (auto n : moduli)
77 |         REDC_test_all(n);   // not defined in this file; presumably provided by test_REDC.h
78 | }
79 | # endif
80 | #endif
81 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/modular_addition.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_MODULAR_ADDITION_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_MODULAR_ADDITION_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_addition.h"
13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
14 | #include "hurchalla/util/compiler_macros.h"
15 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
16 | 
17 | namespace hurchalla {
18 | 
19 | 
20 | // Modular addition of prereduced inputs.  Performance notes are given below this function
21 | template <typename T>  HURCHALLA_FORCE_INLINE
22 | T modular_addition_prereduced_inputs(T a, T b, T modulus)
23 | {
24 |     static_assert(ut_numeric_limits<T>::is_integer, "");   // T must be an integer type
25 |     HPBC_CLOCKWORK_API_PRECONDITION(modulus > 0);
26 |     HPBC_CLOCKWORK_API_PRECONDITION(0<=a && a<modulus);   // i.e. the input must be prereduced
27 |     HPBC_CLOCKWORK_API_PRECONDITION(0<=b && b<modulus);   // i.e. the input must be prereduced
28 |     // The computation itself is delegated to detail::impl_modular_addition.
29 |     T result = detail::impl_modular_addition<T>::call(a, b, modulus);
30 | 
31 |     // POSTCONDITION:
32 |     // Returns (a+b)%modulus, performed as if a and b have infinite precision
33 |     // and thus as if (a+b) is never subject to integer overflow.
34 |     HPBC_CLOCKWORK_POSTCONDITION(0<=result && result<modulus);   // the result is itself prereduced
35 |     return result;
36 | }
37 | 
38 | 
39 | // --Performance Notes--
40 | 
41 | // For this function to be able to complete at its lowest latency, you will need
42 | // to ensure in your calling code (if possible) that neither 'b' nor 'modulus'
43 | // was recently changed (or set) prior to your call of this function - note that
44 | // "recently" could be on a prior loop iteration.  Generally speaking, if either
45 | // 'b' or 'modulus' was changed on an immediately preceding (modular) arithmetic
46 | // instruction, or if one of those two variables was otherwise changed
47 | // immediately beforehand, then usually this function will need one more cycle
48 | // to complete than it would need at its ideal lowest latency.
49 | // If you wish to maximize throughput rather than minimize latency, then you may
50 | // find modular subtraction to be useful - modular subtraction by default has
51 | // fewer uops than modular addition (note that subtraction never has lower
52 | // latency).  Given that modular subtraction by default uses fewer uops, if you
53 | // need to do modular additions and you want to optimize for a low uop count,
54 | // *and* if you see that 'b' will remain constant over many of your modular
55 | // addition calls (typically due to you calling in a loop), then as an option,
56 | // you could calculate:
57 | // negative_b = (b == 0) ? 0 : (modulus - b),  and then, instead of calling
58 | // modular_addition_prereduced_inputs(a, b, modulus),  you can instead call
59 | // modular_subtraction_prereduced_inputs(a, negative_b, modulus).  If you
60 | // calculate negative_b once, and then use it over many calls of
61 | // modular_subtraction_prereduced_inputs, then potentially modular subtraction's
62 | // lowered uop count might increase your overall throughput slightly.
63 | 
64 | // Performance note for RISC-V (and other uncommon CPU architectures that do not
65 | // have an instruction for conditional move or conditional select):
66 | //   On this architecture, modular addition may perform better when T is signed
67 | // than when it is unsigned.  Specifically, when HURCHALLA_AVOID_CSELECT is
68 | // defined (see hurchalla/util/compiler_macros.h), a signed type may perform
69 | // better; if it is not defined, you should expect no performance difference
70 | // between signed and unsigned.
71 | 
72 | 
73 | }  // end namespace
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/modular_arithmetic/include/hurchalla/modular_arithmetic/modular_subtraction.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | /*
 3 |  * This Source Code Form is subject to the terms of the Mozilla Public
 4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 6 |  */
 7 | 
 8 | #ifndef HURCHALLA_MODULAR_ARITHMETIC_MODULAR_SUBTRACTION_H_INCLUDED
 9 | #define HURCHALLA_MODULAR_ARITHMETIC_MODULAR_SUBTRACTION_H_INCLUDED
10 | 
11 | 
12 | #include "hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_subtraction.h"
13 | #include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
14 | #include "hurchalla/util/traits/ut_numeric_limits.h"
15 | #include "hurchalla/util/compiler_macros.h"
16 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
17 | #include <type_traits>
18 | 
19 | namespace hurchalla {
20 | 
21 | 
22 | // Modular subtraction of prereduced inputs.  Performance recommendations are given below this function
23 | template <typename T, class PTAG = LowuopsTag>  HURCHALLA_FORCE_INLINE
24 | T modular_subtraction_prereduced_inputs(T a, T b, T modulus)
25 | {
26 |     static_assert(ut_numeric_limits<T>::is_integer, "");   // T must be an integer type
27 |     static_assert(std::is_same<PTAG,LowlatencyTag>::value ||
28 |                   std::is_same<PTAG,LowuopsTag>::value, "");   // PTAG must be one of the two performance tags
29 |     HPBC_CLOCKWORK_API_PRECONDITION(modulus > 0);
30 |     HPBC_CLOCKWORK_API_PRECONDITION(0<=a && a<modulus);   // i.e. the input must be prereduced
31 |     HPBC_CLOCKWORK_API_PRECONDITION(0<=b && b<modulus);   // i.e. the input must be prereduced
32 |     // The computation itself is delegated to detail::impl_modular_subtraction (parameterized on T and PTAG).
33 |     T result = detail::impl_modular_subtraction<T,PTAG>::call(a, b, modulus);
34 | 
35 |     // POSTCONDITION:
36 |     // Let a conceptual "%%" operator represent a modulo operator that always
37 |     // returns a non-negative remainder.
38 |     // This function returns (a-b) %% modulus, performed as if a and b are
39 |     // infinite precision signed ints (and thus as if it is impossible for the
40 |     // subtraction (a-b) to overflow).
41 |     HPBC_CLOCKWORK_POSTCONDITION(0<=result && result<modulus);   // the result is itself prereduced
42 |     return result;
43 | }
44 | 
45 | 
46 | // --Performance Suggestions--
47 | 
48 | // Note on the optional PTAG (performance tag) template parameter:
49 | //    This parameter does not affect functionality in any way; it affects only
50 | // performance.
51 | //    If you specify PTAG, you must choose either LowuopsTag or LowlatencyTag.
52 | //    If you want low latency, you should usually choose LowlatencyTag, provided
53 | // that you can see that both modulus and one of either 'a' or 'b' were not set
54 | // or modified recently before your call - note that a "recent" modify could be
55 | // on a prior loop iteration.  If they were recently modified, the compiler will
56 | // often be unable to provide any low latency benefit over LowuopsTag.  Note
57 | // that LowlatencyTag will typically use more uops and create more pressure on
58 | // the ALU than LowuopsTag.
59 | //    You should usually prefer LowuopsTag if you want to minimize the uop count
60 | // and ALU pressure (presumably for higher throughput), or if you see that
61 | // either 'modulus' or both of 'a' and 'b' were set/modified close to your call
62 | // of this function - note that "close" could be on a prior loop iteration.
63 | // LowuopsTag generally provides a lower uop count and lower ALU pressure than
64 | // LowlatencyTag.  We can note though that it is possible for LowlatencyTag to
65 | // effectively have similar uop count and ALU pressure as LowuopsTag, if the
66 | // compiler can loop hoist its extra instruction(s) involving 'a'(or 'b') and
67 | // 'modulus'.
68 | 
69 | // Performance note for RISC-V (and other uncommon CPU architectures that do not
70 | // have an instruction for conditional move or conditional select):
71 | //   On this architecture, modular subtraction may perform better when T is
72 | // signed than when it is unsigned.  Specifically, when HURCHALLA_AVOID_CSELECT
73 | // is defined (see hurchalla/util/compiler_macros.h), a signed type may perform
74 | // better; if it is not defined, you should expect no performance difference
75 | // between signed and unsigned.
76 | 
77 | 
78 | }  // end namespace
79 | 
80 | #endif
81 | 


--------------------------------------------------------------------------------
/test/FetchGoogleTest.cmake:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | # This Source Code Form is subject to the terms of the Mozilla Public
  3 | # License, v. 2.0. If a copy of the MPL was not distributed with this
  4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
  5 | 
  6 | 
  7 | # Use FetchContent to get Googletest
  8 | # https://cmake.org/cmake/help/git-master/module/FetchContent.html
  9 | 
 10 | # Also inspired by https://github.com/Crascit/DownloadProject/issues/29
 11 | # and https://github.com/CLIUtils/cmake/blob/master/AddGoogleTest.cmake
 12 | 
 13 | 
 14 | if (NOT TARGET gtest_main)
 15 |     # Prevent GoogleTest from overriding our compiler/linker options
 16 |     # when building with Visual Studio
 17 |     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 18 |     set(BUILD_SHARED_LIBS OFF)
 19 | 
 20 | 
 21 |     include(FetchContent)
 22 | 
 23 |     if(FORCE_TEST_HURCHALLA_CPP11_STANDARD)
 24 |         # googletest v1.12 is the final release that supports C++11
 25 |         FetchContent_Declare(
 26 |             googletest
 27 |             GIT_REPOSITORY https://github.com/google/googletest.git
 28 |             GIT_TAG        release-1.12.1
 29 |         )
 30 |         set(CMAKE_POLICY_VERSION_MINIMUM 3.10)
 31 |         #
 32 |         # When using GIT_TAG release-1.12.1, setting CMAKE_POLICY_VERSION_MINIMUM
 33 |         # to 3.10 avoids deprecation warnings (gtest v1.12's CMakeLists.txt
 34 |         # evidently sets cmake_minimum_required to something under 3.10).
 35 |     else()
 36 |         FetchContent_Declare(
 37 |             googletest
 38 |             GIT_REPOSITORY https://github.com/google/googletest.git
 39 |             GIT_TAG        main
 40 |         )
 41 |     endif()
 42 | 
 43 | 
 44 |     # For Windows: Prevent overriding the parent project's compiler/linker settings
 45 |     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 46 |     FetchContent_MakeAvailable(googletest)
 47 | 
 48 | #    OLD METHOD (deprecated):
 49 | #
 50 | #    FetchContent_GetProperties(googletest)
 51 | #    if(NOT googletest_POPULATED)
 52 | #        FetchContent_Populate(googletest)
 53 | #    endif()
 54 | 
 55 | 
 56 | #    set_directory_properties(PROPERTIES EXCLUDE_FROM_ALL ON)
 57 | #
 58 | #    # Needed because googletest project sets minimum CMake version < 3.0
 59 | #    set(CMAKE_POLICY_DEFAULT_CMP0048 OLD)
 60 | #
 61 | #    # Ignore undef warnings, issue with source file:
 62 | #    # googletest/include/gtest/internal/gtest-port.h:309:5: error: "_MSC_VER" is not defined
 63 | #    if(NOT MSVC)
 64 | #        add_compile_options( -Wno-undef )
 65 | #    endif()
 66 | #    set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME googletest)
 67 | 
 68 | #    add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
 69 | 
 70 | 
 71 | 
 72 | #    add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
 73 | 
 74 | 
 75 | 
 76 | # alternative
 77 | #    set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS 1 CACHE BOOL "")
 78 | #    add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
 79 | #    unset(CMAKE_SUPPRESS_DEVELOPER_WARNINGS)
 80 | 
 81 | 
 82 | 
 83 | #    add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} 
 84 | #            --force-new-ctest-process --output-on-failure)
 85 | #    set_target_properties(check PROPERTIES FOLDER "Scripts")
 86 | 
 87 |     mark_as_advanced(
 88 |     gmock_build_tests
 89 |     gtest_build_samples
 90 |     gtest_build_tests
 91 |     gtest_disable_pthreads
 92 |     gtest_force_shared_crt
 93 |     gtest_hide_internal_symbols
 94 |     BUILD_GMOCK
 95 |     BUILD_GTEST
 96 |     )
 97 | 
 98 |     set_target_properties(gtest gtest_main gmock gmock_main
 99 |         PROPERTIES FOLDER "Googletest")
100 | 
101 | #    if(MSVC AND MSVC_VERSION GREATER_EQUAL 1900)
102 | #        target_compile_definitions(gtest PUBLIC _SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING)
103 | #        target_compile_definitions(gtest_main PUBLIC _SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING)
104 | #        target_compile_definitions(gmock PUBLIC _SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING)
105 | #        target_compile_definitions(gmock_main PUBLIC _SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING)
106 | #    endif()
107 | 
108 | endif()
109 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/timings_x64_Zen4/partial_array_size_2/out.txt:
--------------------------------------------------------------------------------
  1 | 0.2885 2 09 x x 02 yxxttxt
  2 | 0.2885 2 09 x x 02 yxttttt
  3 | 0.2886 2 09 x x 02 yxtxttt
  4 | 0.2886 2 09 x x 02 yxxxttt
  5 | 0.2886 2 09 x x 02 yxxttxx
  6 | 0.2886 2 09 x x 02 yxxxttx
  7 | 0.2886 2 09 x x 02 uxtxttx
  8 | 0.2886 2 09 x x 02 yxxtttx
  9 | 0.2886 2 09 x x 02 uxtxtxx
 10 | 0.2886 2 09 x x 02 uxxxttx
 11 | 0.2886 2 09 x x 02 uxtxtxt
 12 | 0.2886 2 09 x x 02 yxtttxx
 13 | 0.2886 2 09 x x 02 uxxttxt
 14 | 0.2887 2 09 x x 02 uxtttxt
 15 | 0.2887 2 09 x x 02 uxxxtxx
 16 | 0.2887 2 09 x x 02 yxtttxt
 17 | 0.2887 2 09 x x 02 yxxtttt
 18 | 0.2887 2 09 x x 02 yxttttx
 19 | 0.2887 2 09 x x 02 yxxxtxx
 20 | 0.2887 2 09 x x 02 yxtxtxt
 21 | 0.2887 2 09 x x 02 uxxxtxt
 22 | 0.2887 2 09 x x 02 yxtxtxx
 23 | 0.2887 2 09 x x 02 yxtxttx
 24 | 0.2888 2 09 x x 02 yxxxtxt
 25 | 0.2889 2 09 x x 02 uxxtttt
 26 | 0.2889 2 09 x x 02 uxttttx
 27 | 0.2889 2 09 x x 02 uxxttxx
 28 | 0.2889 2 09 x x 02 uxtttxx
 29 | 0.2890 2 09 x x 02 uxxxttt
 30 | 0.2890 2 09 x x 02 uxxtttx
 31 | 0.2890 2 09 x x 02 uxttttt
 32 | 0.2890 2 09 x x 02 uxtxttt
 33 | 0.2896 2 09 x x 02 uttxttx
 34 | 0.2896 2 09 x x 02 uttxtxx
 35 | 0.2896 2 09 x x 02 uttttxt
 36 | 0.2897 2 09 x x 02 utxxtxx
 37 | 0.2897 2 09 x x 02 ytxttxt
 38 | 0.2897 2 09 x x 02 utxxttx
 39 | 0.2897 2 09 x x 02 ytttttt
 40 | 0.2897 2 09 x x 02 ytxxttx
 41 | 0.2897 2 09 x x 02 uttxtxt
 42 | 0.2897 2 09 x x 02 ytxxtxt
 43 | 0.2898 2 09 x x 02 utxxtxt
 44 | 0.2898 2 09 x x 02 ytxxtxx
 45 | 0.2898 2 09 x x 02 utxttxx
 46 | 0.2898 2 09 x x 02 yttxttx
 47 | 0.2898 2 09 x x 02 yttxtxx
 48 | 0.2898 2 09 x x 02 yttttxt
 49 | 0.2898 2 09 x x 02 utxtttx
 50 | 0.2898 2 09 x x 02 ytttttx
 51 | 0.2898 2 09 x x 02 yttxtxt
 52 | 0.2898 2 09 x x 02 yttttxx
 53 | 0.2898 2 09 x x 02 yttxttt
 54 | 0.2899 2 09 x x 02 utttttt
 55 | 0.2899 2 09 x x 02 utxtttt
 56 | 0.2899 2 09 x x 02 ytxttxx
 57 | 0.2900 2 09 x x 02 utxttxt
 58 | 0.2900 2 09 x x 02 utttttx
 59 | 0.2900 2 09 x x 02 uttttxx
 60 | 0.2900 2 09 x x 02 ytxxttt
 61 | 0.2901 2 09 x x 02 ytxtttx
 62 | 0.2902 2 09 x x 02 ytxtttt
 63 | 0.2902 2 09 x x 02 utxxttt
 64 | 0.2902 2 09 x x 02 uttxttt
 65 | 0.2904 2 09 x x 02 uttxxxt
 66 | 0.2907 2 09 x x 02 yxtxxtx
 67 | 0.2907 2 09 x x 02 uxxxxxt
 68 | 0.2907 2 09 x x 02 yxxxxtt
 69 | 0.2907 2 09 x x 02 yxttxtx
 70 | 0.2908 2 09 x x 02 yxxtxxx
 71 | 0.2908 2 09 x x 02 yxtxxtt
 72 | 0.2908 2 09 x x 02 yxxtxxt
 73 | 0.2908 2 09 x x 02 uxxtxxt
 74 | 0.2908 2 09 x x 02 uxtxxxt
 75 | 0.2908 2 09 x x 02 uxxtxtx
 76 | 0.2908 2 09 x x 02 yxttxxt
 77 | 0.2908 2 09 x x 02 uxxtxxx
 78 | 0.2908 2 09 x x 02 uxxtxtt
 79 | 0.2908 2 09 x x 02 uxttxxt
 80 | 0.2908 2 09 x x 02 uxttxtt
 81 | 0.2908 2 09 x x 02 yxxxxxx
 82 | 0.2908 2 09 x x 02 yxttxxx
 83 | 0.2908 2 09 x x 02 utxxxtt
 84 | 0.2908 2 09 x x 02 ytxtxtt
 85 | 0.2909 2 09 x x 02 uxttxtx
 86 | 0.2909 2 09 x x 02 ytxtxxx
 87 | 0.2909 2 09 x x 02 yxxtxtx
 88 | 0.2909 2 09 x x 02 uxttxxx
 89 | 0.2909 2 09 x x 02 utxtxxx
 90 | 0.2909 2 09 x x 02 ytxtxtx
 91 | 0.2909 2 09 x x 02 ytttxxt
 92 | 0.2909 2 09 x x 02 yxxxxxt
 93 | 0.2909 2 09 x x 02 yttxxxx
 94 | 0.2909 2 09 x x 02 yxxxxtx
 95 | 0.2909 2 09 x x 02 yttxxtx
 96 | 0.2909 2 09 x x 02 utxtxxt
 97 | 0.2909 2 09 x x 02 ytxxxtt
 98 | 0.2909 2 09 x x 02 yxtxxxx
 99 | 0.2909 2 09 x x 02 yttxxxt
100 | 0.2910 2 09 x x 02 utxxxxx
101 | 0.2910 2 09 x x 02 utttxtt
102 | 0.2910 2 09 x x 02 uxtxxxx
103 | 0.2910 2 09 x x 02 ytxxxxx
104 | 0.2910 2 09 x x 02 utttxtx
105 | 0.2910 2 09 x x 02 uttxxxx
106 | 0.2910 2 09 x x 02 uxxxxtx
107 | 0.2910 2 09 x x 02 uxxxxxx
108 | 0.2910 2 09 x x 02 ytxxxtx
109 | 0.2910 2 09 x x 02 utttxxx
110 | 0.2910 2 09 x x 02 uttxxtt
111 | 0.2910 2 09 x x 02 ytttxtt
112 | 0.2910 2 09 x x 02 yxxtxtt
113 | 0.2910 2 09 x x 02 uxxxxtt
114 | 0.2910 2 09 x x 02 ytxtxxt
115 | 0.2911 2 09 x x 02 utxxxtx
116 | 0.2911 2 09 x x 02 utxxxxt
117 | 0.2911 2 09 x x 02 ytttxxx
118 | 0.2911 2 09 x x 02 yxttxtt
119 | 0.2911 2 09 x x 02 ytttxtx
120 | 0.2911 2 09 x x 02 uxtxxtx
121 | 0.2911 2 09 x x 02 utttxxt
122 | 0.2912 2 09 x x 02 uxtxxtt
123 | 0.2912 2 09 x x 02 utxtxtt
124 | 0.2912 2 09 x x 02 utxtxtx
125 | 0.2912 2 09 x x 02 yttxxtt
126 | 0.2912 2 09 x x 02 uttxxtx
127 | 0.2913 2 09 x x 02 ytxxxxt
128 | 0.2913 2 09 x x 02 yxtxxxt
129 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/timings_x64_Zen4/partial_array_size_2/out2.txt:
--------------------------------------------------------------------------------
  1 | 0.2864 2 09 x x 02 uxxtttx
  2 | 0.2865 2 09 x x 02 yxxttxt
  3 | 0.2865 2 09 x x 02 uxtxtxx
  4 | 0.2865 2 09 x x 02 yxxtttt
  5 | 0.2865 2 09 x x 02 yxtxtxx
  6 | 0.2865 2 09 x x 02 uxxxtxt
  7 | 0.2865 2 09 x x 02 yxxttxx
  8 | 0.2865 2 09 x x 02 yxxxttx
  9 | 0.2865 2 09 x x 02 yxxxtxt
 10 | 0.2865 2 09 x x 02 uxttttx
 11 | 0.2865 2 09 x x 02 yxtxtxt
 12 | 0.2866 2 09 x x 02 yxtxttt
 13 | 0.2866 2 09 x x 02 uxxxtxx
 14 | 0.2866 2 09 x x 02 yxttttx
 15 | 0.2866 2 09 x x 02 yxxxttt
 16 | 0.2867 2 09 x x 02 uxxttxt
 17 | 0.2867 2 09 x x 02 yxttttt
 18 | 0.2867 2 09 x x 02 yxtttxx
 19 | 0.2867 2 09 x x 02 uxtxtxt
 20 | 0.2867 2 09 x x 02 uxttttt
 21 | 0.2867 2 09 x x 02 yxtxttx
 22 | 0.2868 2 09 x x 02 uxxttxx
 23 | 0.2868 2 09 x x 02 uxxtttt
 24 | 0.2868 2 09 x x 02 uxtttxx
 25 | 0.2868 2 09 x x 02 uxtxttx
 26 | 0.2868 2 09 x x 02 uxtttxt
 27 | 0.2868 2 09 x x 02 yxxtttx
 28 | 0.2868 2 09 x x 02 uxxxttt
 29 | 0.2869 2 09 x x 02 yxxxtxx
 30 | 0.2869 2 09 x x 02 uxxxttx
 31 | 0.2869 2 09 x x 02 yxtttxt
 32 | 0.2871 2 09 x x 02 uxtxttt
 33 | 0.2871 2 09 x x 02 ytxxtxx
 34 | 0.2871 2 09 x x 02 yttttxx
 35 | 0.2871 2 09 x x 02 utxxttt
 36 | 0.2871 2 09 x x 02 ytxtttt
 37 | 0.2871 2 09 x x 02 utxxtxt
 38 | 0.2871 2 09 x x 02 yttxttx
 39 | 0.2871 2 09 x x 02 ytxtttx
 40 | 0.2871 2 09 x x 02 uttxttx
 41 | 0.2872 2 09 x x 02 utxtttt
 42 | 0.2872 2 09 x x 02 ytxttxt
 43 | 0.2872 2 09 x x 02 utxttxt
 44 | 0.2872 2 09 x x 02 utxtttx
 45 | 0.2872 2 09 x x 02 yttxtxt
 46 | 0.2872 2 09 x x 02 uttttxx
 47 | 0.2872 2 09 x x 02 yttxttt
 48 | 0.2873 2 09 x x 02 utxxtxx
 49 | 0.2873 2 09 x x 02 ytttttt
 50 | 0.2873 2 09 x x 02 yttxtxx
 51 | 0.2873 2 09 x x 02 ytxttxx
 52 | 0.2874 2 09 x x 02 uttxttt
 53 | 0.2874 2 09 x x 02 ytxxttx
 54 | 0.2874 2 09 x x 02 utxttxx
 55 | 0.2874 2 09 x x 02 uttxtxt
 56 | 0.2874 2 09 x x 02 yttttxt
 57 | 0.2874 2 09 x x 02 ytttttx
 58 | 0.2875 2 09 x x 02 utttttt
 59 | 0.2876 2 09 x x 02 uttttxt
 60 | 0.2876 2 09 x x 02 uttxtxx
 61 | 0.2877 2 09 x x 02 utxxttx
 62 | 0.2877 2 09 x x 02 ytxxtxt
 63 | 0.2877 2 09 x x 02 ytxxttt
 64 | 0.2877 2 09 x x 02 utttttx
 65 | 0.2878 2 09 x x 02 yxxxxxx
 66 | 0.2878 2 09 x x 02 yxttxxt
 67 | 0.2878 2 09 x x 02 yxttxtt
 68 | 0.2878 2 09 x x 02 uxxtxtx
 69 | 0.2878 2 09 x x 02 yxtxxtx
 70 | 0.2878 2 09 x x 02 uxttxxx
 71 | 0.2878 2 09 x x 02 yxxtxxt
 72 | 0.2878 2 09 x x 02 yxxtxtt
 73 | 0.2881 2 09 x x 02 uxxtxtt
 74 | 0.2881 2 09 x x 02 yxtxxtt
 75 | 0.2882 2 09 x x 02 uxxxxxx
 76 | 0.2882 2 09 x x 02 yxxtxxx
 77 | 0.2882 2 09 x x 02 yxtxxxt
 78 | 0.2882 2 09 x x 02 uxtxxtx
 79 | 0.2882 2 09 x x 02 uxxtxxt
 80 | 0.2883 2 09 x x 02 uxttxxt
 81 | 0.2883 2 09 x x 02 yxttxtx
 82 | 0.2883 2 09 x x 02 uxttxtt
 83 | 0.2883 2 09 x x 02 yxxxxtt
 84 | 0.2883 2 09 x x 02 yxxxxxt
 85 | 0.2884 2 09 x x 02 yxxtxtx
 86 | 0.2885 2 09 x x 02 uxttxtx
 87 | 0.2885 2 09 x x 02 yxxxxtx
 88 | 0.2885 2 09 x x 02 uxtxxtt
 89 | 0.2885 2 09 x x 02 uxtxxxx
 90 | 0.2885 2 09 x x 02 uxxxxtx
 91 | 0.2885 2 09 x x 02 uxxxxxt
 92 | 0.2885 2 09 x x 02 yxttxxx
 93 | 0.2885 2 09 x x 02 yxtxxxx
 94 | 0.2885 2 09 x x 02 uxxtxxx
 95 | 0.2885 2 09 x x 02 uxtxxxt
 96 | 0.2886 2 09 x x 02 uxxxxtt
 97 | 0.2892 2 09 x x 02 ytxtxxt
 98 | 0.2892 2 09 x x 02 ytxtxtt
 99 | 0.2892 2 09 x x 02 uttxxtt
100 | 0.2892 2 09 x x 02 yttxxxt
101 | 0.2893 2 09 x x 02 utxxxxt
102 | 0.2893 2 09 x x 02 utxxxtx
103 | 0.2893 2 09 x x 02 ytttxtt
104 | 0.2893 2 09 x x 02 ytxxxtx
105 | 0.2893 2 09 x x 02 yttxxxx
106 | 0.2893 2 09 x x 02 ytttxxt
107 | 0.2894 2 09 x x 02 utxtxxt
108 | 0.2894 2 09 x x 02 utttxtx
109 | 0.2894 2 09 x x 02 ytxxxtt
110 | 0.2894 2 09 x x 02 utxtxxx
111 | 0.2894 2 09 x x 02 utxtxtt
112 | 0.2894 2 09 x x 02 uttxxxx
113 | 0.2894 2 09 x x 02 utttxxt
114 | 0.2894 2 09 x x 02 ytxxxxt
115 | 0.2894 2 09 x x 02 utttxtt
116 | 0.2894 2 09 x x 02 ytttxxx
117 | 0.2894 2 09 x x 02 yttxxtt
118 | 0.2895 2 09 x x 02 ytxtxtx
119 | 0.2895 2 09 x x 02 utxxxtt
120 | 0.2895 2 09 x x 02 utxtxtx
121 | 0.2895 2 09 x x 02 ytxxxxx
122 | 0.2895 2 09 x x 02 utttxxx
123 | 0.2895 2 09 x x 02 yttxxtx
124 | 0.2898 2 09 x x 02 utxxxxx
125 | 0.2898 2 09 x x 02 uttxxxt
126 | 0.2898 2 09 x x 02 uttxxtx
127 | 0.2899 2 09 x x 02 ytttxtx
128 | 0.2899 2 09 x x 02 ytxtxxx
129 | 


--------------------------------------------------------------------------------
/test/montgomery_arithmetic/low_level_api/test_inverse_mod_R.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | 
  9 | #include "hurchalla/montgomery_arithmetic/low_level_api/inverse_mod_R.h"
 10 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
 11 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 12 | #include "gtest/gtest.h"
 13 | #include <cstdint>
 14 | 
 15 | namespace {
 16 | 
 17 | 
 18 | namespace hc = ::hurchalla;
 19 | 
 20 | template <typename T>
 21 | void test_single_inverse(T a)
 22 | {
 23 |     using P = typename hc::safely_promote_unsigned<T>::type;   // type in which the product below is computed
 24 |     T one = static_cast<T>(1);
 25 |     // Compute the inverse of a, then check that inv*a, truncated back to T, equals 1.
 26 |     T inv = hc::inverse_mod_R(a);
 27 |     EXPECT_TRUE(static_cast<T>(static_cast<P>(inv) * static_cast<P>(a)) == one);
 28 | }
 29 | 
 30 | 
 31 | template <typename T>
 32 | void test_constexpr_inverse()
 33 | {
 34 |     // The #if is a slight hack, but inverse_mod_R is only constexpr for C++14
 35 |     // and above (C++11's support for constexpr functions was too primitive).
 36 | #if (__cplusplus >= 201402L) || \
 37 |         (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L && _MSC_VER >= 1910)
 38 |     // test constexpr use of inverse_mod_R (a static_assert requires compile-time evaluation)
 39 |     static_assert(hc::inverse_mod_R(static_cast<T>(1)) == 1, "");
 40 | 
 41 |     // Suppress a false positive warning MSVC++ 2017 issues when constexpr
 42 |     // compiling, regarding (unsigned) integral overflow that occurs inside
 43 |     // impl_inverse_mod_R.  Unfortunately suppressing it there doesn't work,
 44 |     // probably due to VC2017 awkwardly compiling constexpr functions.  Unsigned
 45 |     // overflow is well defined and correct there, and MS removed this false
 46 |     // warning in VC++ 2019.
 47 | #if defined(_MSC_VER)
 48 | #  pragma warning(push)
 49 | #  pragma warning(disable : 4307)
 50 | #endif  // defined(_MSC_VER)
 51 |     static_assert(static_cast<T>(3 * hc::inverse_mod_R(static_cast<T>(3)))
 52 |                   == 1, "");
 53 |     static_assert(static_cast<T>(251 * hc::inverse_mod_R(static_cast<T>(251)))
 54 |                   == 1, "");
 55 | #if defined(_MSC_VER)
 56 | #  pragma warning(pop)
 57 | #endif  // defined(_MSC_VER)
 58 | #endif  // C++14 or later
 59 | }
 60 | 
 61 | 
 62 | template <typename T>
 63 | void test_inverse_exhaustive()
 64 | {
 65 |     T tmax = hc::ut_numeric_limits<T>::max();
 66 |     T evenmax = static_cast<T>((tmax/2)*2);   // largest even value <= tmax
 67 |     T oddmax = (evenmax != tmax) ? tmax : static_cast<T>(tmax - 1);   // largest odd value <= tmax
 68 |     // Test every odd value from oddmax down to 3; the loop stops before a == 1, which is tested afterward.
 69 |     for (T a=oddmax; a>1; a=static_cast<T>(a-2))
 70 |         test_single_inverse(a);
 71 |     test_single_inverse(static_cast<T>(1));
 72 | }
 73 | 
 74 | 
 75 | template <typename T>
 76 | void test_inverse_mod_r()
 77 | {
 78 |     T tmax = hc::ut_numeric_limits<T>::max();
 79 |     T evenmax = static_cast<T>((tmax/2)*2);   // largest even value <= tmax
 80 |     T oddmax = (evenmax != tmax) ? tmax : static_cast<T>(tmax - 1);   // largest odd value <= tmax
 81 |     T oddhalfmax = static_cast<T>((tmax/4)*2 + 1);   // an odd value close to tmax/2
 82 | 
 83 |     // inverse_mod_R's preconditions require that its input is odd, so only odd values are tested.
 84 |     // The smallest odd values:
 85 |     test_single_inverse(static_cast<T>(1));
 86 |     test_single_inverse(static_cast<T>(3));
 87 |     test_single_inverse(static_cast<T>(5));
 88 |     test_single_inverse(static_cast<T>(7));
 89 |     // The largest odd values representable in T:
 90 |     test_single_inverse(static_cast<T>(oddmax));
 91 |     test_single_inverse(static_cast<T>(oddmax - 2));
 92 |     test_single_inverse(static_cast<T>(oddmax - 4));
 93 |     // Odd values around the middle of T's range:
 94 |     test_single_inverse(static_cast<T>(oddhalfmax));
 95 |     test_single_inverse(static_cast<T>(oddhalfmax + 2));
 96 |     test_single_inverse(static_cast<T>(oddhalfmax - 2));
 97 |     // Compile-time (constexpr) checks:
 98 |     test_constexpr_inverse<T>();
 99 | }
100 | 
101 | 
102 | 
103 | TEST(MontgomeryArithmetic, inverse_mod_r) {   // spot checks for each unsigned width, then exhaustive checks
104 |     test_inverse_mod_r<std::uint8_t>();
105 |     test_inverse_mod_r<std::uint16_t>();
106 |     test_inverse_mod_r<std::uint32_t>();
107 |     test_inverse_mod_r<std::uint64_t>();
108 | #if HURCHALLA_COMPILER_HAS_UINT128_T()
109 |     test_inverse_mod_r<__uint128_t>();   // only when the compiler provides __uint128_t
110 | #endif
111 |     // Exhaustive tests of every odd value are run for the 8 and 16 bit types only (presumably to keep run time small).
112 |     test_inverse_exhaustive<std::uint8_t>();
113 |     test_inverse_exhaustive<std::uint16_t>();
114 | }
115 | 
116 | 
117 | } // end unnamed namespace
118 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/testbench_2kary.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Copyright (c) 2025 Jeffrey Hurchalla.
  4 | #
  5 | # This Source Code Form is subject to the terms of the Mozilla Public
  6 | # License, v. 2.0. If a copy of the MPL was not distributed with this
  7 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
  8 | 
  9 | 
 10 | 
 11 | 
 12 | # You need to clone the util, factoring, and modular_arithmetic repos
 13 | # from https://github.com/hurchalla
 14 | 
 15 | # SET repo_directory TO THE DIRECTORY WHERE YOU CLONED THE HURCHALLA GIT
 16 | # REPOSITORIES.  (or otherwise ensure the compiler -I flags correctly specify
 17 | # the needed hurchalla include directories)
 18 | 
 19 | repo_directory=/Users/jeffreyhurchalla/Desktop
 20 | #repo_directory=/home/jeff/repos
 21 | 
 22 | 
 23 | # you would ordinarily use either g++ or clang++  for $1
 24 | cppcompiler=$1
 25 | 
 26 | 
 27 | if [[ $cppcompiler == "g++" ]]; then
 28 |   error_limit=-fmax-errors=3
 29 |   warn_nrvo=-Wnrvo
 30 | else
 31 |   error_limit=-ferror-limit=3
 32 | fi
 33 | 
 34 | 
 35 | 
 36 | 
 37 | exit_on_failure () {   # call right after a command: ends the script if that command failed
 38 |   if [ $? -ne 0 ]; then   # $? still holds the exit status of the command run just before this call
 39 |     exit 1
 40 |   fi
 41 | }
 42 | 
 43 | #optimization_level=O2
 44 | #optimization_level=O3
 45 | optimization_level=$2
 46 | 
 47 | #define_mont_type=-DDEF_MONT_TYPE=MontgomeryQuarter
 48 | define_mont_type=-DDEF_MONT_TYPE=$3
 49 | define_uint_type=-DDEF_UINT_TYPE=$4
 50 | 
 51 | # you must specify either -DTEST_ARRAY or -DTEST_SCALAR or -DTEST_PARTIAL_ARRAY for $8
 52 | define_test_type=$8
 53 | 
 54 | 
 55 | cpp_standard=c++17
 56 | 
 57 | 
 58 | # You can use arguments $9 and ${10} and ${11} etc to define macros such as
 59 | # -DHURCHALLA_ALLOW_INLINE_ASM_ALL
 60 | # For debugging, defining the following macros may be useful
 61 | # -DHURCHALLA_CLOCKWORK_ENABLE_ASSERTS  -DHURCHALLA_UTIL_ENABLE_ASSERTS
 62 | 
 63 | append_if_set() {   # usage: append_if_set ARRAY_NAME VALUE -- appends VALUE to the named array unless VALUE is empty
 64 |     local array_name="$1"   # name of the array variable to append to (not its contents)
 65 |     local value="$2"
 66 |     if [ -n "$value" ]; then
 67 |         eval "$array_name+=(\"\$value\")"   # \$value is escaped so it expands inside the eval, as one quoted element
 68 |     fi
 69 | }
 70 | extra_args=()
 71 | append_if_set extra_args "$9"
 72 | append_if_set extra_args "${10}"
 73 | append_if_set extra_args "${11}"
 74 | append_if_set extra_args "${12}"
 75 | append_if_set extra_args "${13}"
 76 | append_if_set extra_args "${14}"
 77 | append_if_set extra_args "${15}"
 78 | append_if_set extra_args "${16}"
 79 | append_if_set extra_args "${17}"
 80 | append_if_set extra_args "${18}"
 81 | append_if_set extra_args "${19}"
 82 | append_if_set extra_args "${20}"
 83 | append_if_set extra_args "${21}"
 84 | append_if_set extra_args "${22}"
 85 | append_if_set extra_args "${23}"
 86 | append_if_set extra_args "${24}"
 87 | append_if_set extra_args "${25}"
 88 | append_if_set extra_args "${26}"
 89 | append_if_set extra_args "${27}"
 90 | append_if_set extra_args "${28}"
 91 | append_if_set extra_args "${29}"
 92 | append_if_set extra_args "${30}"
 93 | 
 94 | 
 95 | # we could also use  -g  to get debug symbols (for lldb/gdb, and objdump)
 96 | 
 97 | $cppcompiler   \
 98 |         $error_limit   -$optimization_level \
 99 |         $define_mont_type  $define_uint_type  $define_test_type \
100 |          "${extra_args[@]}" \
101 |         -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion $warn_nrvo \
102 |         -std=$cpp_standard \
103 |         -I${repo_directory}/modular_arithmetic/modular_arithmetic/include \
104 |         -I${repo_directory}/modular_arithmetic/montgomery_arithmetic/include \
105 |         -I${repo_directory}/util/include \
106 |         -c testbench_montgomery_pow_2kary.cpp
107 | 
108 | exit_on_failure
109 | 
110 | $cppcompiler  -$optimization_level  -std=$cpp_standard  -o testbench_montgomery_pow_2kary  testbench_montgomery_pow_2kary.o -lm
111 | 
112 | exit_on_failure
113 | 
114 | echo "compilation finished, now executing:"
115 | 
116 | 
117 | # argument $5 (if present), is the randomization seed for std::mt19937_64
118 | # argument $6 (if present), is max_modulus_bits_reduce
119 | # argument $7 (if present), is exponent_bits_reduce
120 | 
121 | ./testbench_montgomery_pow_2kary $5 $6 $7
122 | 
123 | # To give you an example of invoking this script at the command line:
124 | #   ./testbench_2kary.sh clang++ O3 MontgomeryFull __uint128_t 191 8 50  -DTEST_ARRAY -DHURCHALLA_ALLOW_INLINE_ASM_ALL
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/modular_arithmetic/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | # This Source Code Form is subject to the terms of the Mozilla Public
 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 5 | 
 6 | # Skip the rest of this file if the library target has already been defined.
 7 | if(TARGET hurchalla_basic_modular_arithmetic)
 8 |     return()
 9 | endif()
10 | 
11 | # later versions are probably fine, but are untested
12 | cmake_minimum_required(VERSION 3.14...4.03)
13 | 
14 | project(hurchalla_basic_modular_arithmetic VERSION 1.0.0 LANGUAGES CXX)
15 | 
16 | # We need to detect if we're using MSVC for x86_64, prior to MSVC2019, since
17 | # these old MSVC versions need a separate asm file for modular multiplication.
18 | # (_MSC_VER < 1920 indicates Visual Studio 2017 or lower)
19 | if((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND (MSVC_VERSION LESS 1920) AND
20 |           (CMAKE_SYSTEM_PROCESSOR MATCHES "x86|X86|amd64|AMD64|EM64T") AND
21 |           (CMAKE_SIZEOF_VOID_P EQUAL 8))
22 |     set(HURCHALLA_MA_USING_OLD_MSVC_X64 TRUE)
23 |     enable_language(ASM_MASM)
24 | else()
25 |     set(HURCHALLA_MA_USING_OLD_MSVC_X64 FALSE)
26 | endif()
27 | 
28 | 
29 | # if this is the top level CMakeLists.txt, let IDEs group projects into folders
30 | if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
31 |     set_property(GLOBAL PROPERTY USE_FOLDERS ON)
32 | endif()
33 | 
34 | # The old-MSVC x64 case builds an asm source (added below), so it is STATIC.
35 | if(HURCHALLA_MA_USING_OLD_MSVC_X64)
36 |     add_library(hurchalla_basic_modular_arithmetic STATIC)
37 | else()
38 |     add_library(hurchalla_basic_modular_arithmetic INTERFACE)
39 | endif()
40 | 
41 | # Public headers, attached as INTERFACE sources (build interface only).
42 | target_sources(hurchalla_basic_modular_arithmetic INTERFACE
43 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/absolute_value_difference.h>
44 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/modular_addition.h>
45 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/modular_multiplication.h>
46 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/modular_multiplicative_inverse.h>
47 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/modular_pow.h>
48 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/modular_subtraction.h>
49 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h>
50 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/impl_modular_multiplicative_inverse.h>
51 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/impl_modular_pow.h>
52 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/optimization_tag_structs.h>
53 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_absolute_value_difference.h>
54 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_addition.h>
55 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_multiplication.h>
56 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_subtraction.h>
57 |     )
58 | if(HURCHALLA_MA_USING_OLD_MSVC_X64)
59 |     target_sources(hurchalla_basic_modular_arithmetic PRIVATE
60 |         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src/platform_specific_MSVC_x86_64/modular_multiply_uint64--x64_microsoft.asm>
61 |         )
62 | endif()
63 | 
64 | # Install rule for the contents of the include directory.
65 | install(DIRECTORY
66 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/>
67 |     DESTINATION include)
68 | 
69 | # Expose the include directory to targets that link this library.
70 | target_include_directories(hurchalla_basic_modular_arithmetic INTERFACE
71 |             $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
72 | 
73 | # Fetch the hurchalla util project at a pinned commit and link against it.
74 | include(FetchContent)
75 | FetchContent_Declare(
76 |     hurchalla_util
77 |     GIT_REPOSITORY https://github.com/hurchalla/util.git
78 |     GIT_TAG        8e03b87c7b6d5c3bf3c0e439a153768c59c512c5
79 | )
80 | FetchContent_MakeAvailable(hurchalla_util)
81 | 
82 | target_link_libraries(hurchalla_basic_modular_arithmetic
83 |                       INTERFACE hurchalla_util)
84 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/get_Rsquared_mod_n.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_GET_RSQUARED_MOD_N_H_INCLUDED
  9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_GET_RSQUARED_MOD_N_H_INCLUDED
 10 | 
 11 | 
 12 | #include "hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/impl_get_Rsquared_mod_n.h"
 13 | #include "hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/impl_array_get_Rsquared_mod_n.h"
 14 | #include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
 15 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
 16 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 17 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
 18 | #include <type_traits>
 19 | #include <array>
 20 | 
 21 | #if defined(_MSC_VER)
 22 | #  pragma warning(push)
 23 | #  pragma warning(disable : 4127)
 24 | #endif
 25 | 
 26 | namespace hurchalla {
 27 | 
 28 | 
 29 | // For discussion purposes, let type UP be a conceptually unlimited precision
 30 | // unsigned integer type, and let the unlimited precision constant R represent
 31 | // R = (UP)1 << ut_numeric_limits<T>::digits.  Equivalently,
 32 | // R = (UP)ut_numeric_limits<T>::max + 1.  For example, if T is uint64_t, we
 33 | // would have R = (UP)1 << 64.
 34 | 
 35 | // get_Rsquared_mod_n() computes and returns (R*R) % n.
 36 | // You can get the argument inverse_n_modR by calling inverse_mod_R().  You can
 37 | // get Rmod_n by calling get_R_mod_n().
 38 | 
 39 | // For the template arguments nIsGuaranteedLessThanRdiv4 and PTAG, it is
 40 | // easiest not to specify them, and accept the defaults.  Their purpose is
 41 | // solely to provide ways to improve performance.  These are their details:
 42 | //    For nIsGuaranteedLessThanRdiv4, if you can guarantee that n <= R/4, you
 43 | // can set it to true to improve performance.  Otherwise, accept the default
 44 | // of false.
 45 | //    For PTAG, if you prefer to have the lowest number of uops rather than
 46 | // lowest latency, then you can set it to LowuopsTag.  Otherwise accept the
 47 | // default of LowlatencyTag.
 48 | 
 49 | template <typename T,
 50 |           bool nIsGuaranteedLessThanRdiv4 = false,
 51 |           class PTAG = LowlatencyTag>
 52 | T get_Rsquared_mod_n(T n, T inverse_n_modR, T Rmod_n)
 53 | {
 54 |     using limits = ut_numeric_limits<T>;
 55 |     static_assert(limits::is_integer && !(limits::is_signed)
 56 |                   && limits::is_modulo, "");
 57 |     // REDC requires an odd modulus, and the modulus must be greater than 1.
 58 |     HPBC_CLOCKWORK_PRECONDITION2(n % 2 == 1);
 59 |     HPBC_CLOCKWORK_PRECONDITION2(n > 1);
 60 |     // inverse_n_modR has to satisfy  n * inverse_n_modR ≡ 1 (mod R)
 61 |     using P = typename safely_promote_unsigned<T>::type;
 62 |     HPBC_CLOCKWORK_PRECONDITION2(static_cast<T>(
 63 |                 static_cast<P>(n) * static_cast<P>(inverse_n_modR)) == 1);
 64 |     using Impl = detail::impl_get_Rsquared_mod_n
 65 |                                         <nIsGuaranteedLessThanRdiv4, PTAG>;
 66 |     T result = Impl::call(n, inverse_n_modR, Rmod_n);
 67 |     HPBC_CLOCKWORK_POSTCONDITION2(result < n);
 68 |     return result;
 69 | }
 70 | 
 71 | // std::array version of get_Rsquared_mod_n(), with PTAG defaulting to
 72 | // LowuopsTag.  You can usually get much better performance by using it when
 73 | // you need multiple calculations of different Rsquared mod Ns.
 74 | template <typename T, std::size_t ARRAY_SIZE,
 75 |           bool nIsGuaranteedLessThanRdiv4 = false,
 76 |           class PTAG = LowuopsTag>
 77 | std::array<T, ARRAY_SIZE>
 78 | get_Rsquared_mod_n(const std::array<T, ARRAY_SIZE>& n,
 79 |                    const std::array<T, ARRAY_SIZE>& inverse_n_modR,
 80 |                    const std::array<T, ARRAY_SIZE>& Rmod_n)
 81 | {
 82 |     static_assert(ut_numeric_limits<T>::is_integer, "");
 83 |     static_assert(!(ut_numeric_limits<T>::is_signed), "");
 84 |     static_assert(ut_numeric_limits<T>::is_modulo, "");
 85 | 
 86 |     using P = typename safely_promote_unsigned<T>::type;
 87 |     // Contract checks, element by element (only when assertions are active).
 88 |     if (HPBC_CLOCKWORK_PRECONDITION2_MACRO_IS_ACTIVE) {
 89 |         for (std::size_t i = 0; i < ARRAY_SIZE; ++i) {
 90 |             HPBC_CLOCKWORK_PRECONDITION2(n[i] % 2 == 1);  // REDC requires an odd modulus.
 91 |             HPBC_CLOCKWORK_PRECONDITION2(n[i] > 1);
 92 |             HPBC_CLOCKWORK_PRECONDITION2(static_cast<T>(static_cast<P>(n[i]) *
 93 |                                static_cast<P>(inverse_n_modR[i])) == 1);
 94 |         }
 95 |     }
 96 | 
 97 |     std::array<T, ARRAY_SIZE> result = detail::impl_array_get_Rsquared_mod_n
 98 |             <nIsGuaranteedLessThanRdiv4, PTAG>::call(n, inverse_n_modR, Rmod_n);
 99 |     // Every entry of the result must be fully reduced: result[i] < n[i].
100 |     if (HPBC_CLOCKWORK_POSTCONDITION2_MACRO_IS_ACTIVE) {
101 |         for (std::size_t i = 0; i < ARRAY_SIZE; ++i)
102 |             HPBC_CLOCKWORK_POSTCONDITION2(result[i] < n[i]);
103 |     }
104 |     return result;
105 | }
106 | 
107 | 
108 | } // end namespace
109 | 
110 | #if defined(_MSC_VER)
111 | #  pragma warning(pop)
112 | #endif
113 | 
114 | #endif
115 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/impl_inverse_mod_R.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_IMPL_INVERSE_MOD_R_H_INCLUDED
  9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_IMPL_INVERSE_MOD_R_H_INCLUDED
 10 | 
 11 | 
 12 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
 13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 14 | #include "hurchalla/util/sized_uint.h"
 15 | #include "hurchalla/util/compiler_macros.h"
 16 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
 17 | #include <type_traits>
 18 | 
 19 | namespace hurchalla { namespace detail {
 20 | 
 21 | 
 22 | // For discussion purposes, let type UP be a conceptually unlimited precision
 23 | // unsigned integer type, and let the unlimited precision constant R represent
 24 | // R = (UP)1 << ut_numeric_limits<T>::digits.  Equivalently,
 25 | // R = (UP)ut_numeric_limits<T>::max + 1.  For example, if T is uint64_t, we
 26 | // would have R = (UP)1 << 64.
 27 | 
 28 | // minor note: we use static member functions to disallow ADL.
 29 | // impl_inverse_mod_R::call<T, bits>(a) returns the inverse (mod R) of odd a.
 30 | struct impl_inverse_mod_R {
 31 | private:
 32 |     template <int n>   // internal helper constexpr function
 33 |     constexpr static int log2()
 34 |     {
 35 |         // PRECONDITION: n!=0 (this isn't possible to express via static_assert)
 36 |         static_assert(n>=0, "");
 37 |         static_assert(n==1 || (n/2)*2 == n, "");
 38 |         return (n<=1) ? 0 : 1 + log2<n/2>();
 39 |     }
 40 | public:
 41 |     // This algorithm for the inverse (mod R) is described in
 42 |     // https://arxiv.org/abs/2204.04342.  Note: it is a
 43 |     // generalized and slightly more efficient version of Dumas' algorithm (from
 44 |     // https://arxiv.org/abs/1209.6626), so we still call it Dumas' algorithm.
 45 |     //
 46 |     // Note: Dumas' alg only makes sense to use for the native integral types -
 47 |     // Newton's method becomes more efficient when larger types are required.
 48 | #ifndef HURCHALLA_TARGET_BIT_WIDTH
 49 | #  error "HURCHALLA_TARGET_BIT_WIDTH must be defined"
 50 | #endif
 51 |     template <typename T, int bits>
 52 |     static  HURCHALLA_FORCE_INLINE  HURCHALLA_CPP14_CONSTEXPR
 53 |     typename std::enable_if<(bits <= HURCHALLA_TARGET_BIT_WIDTH), T>::type
 54 |     call(T a)
 55 |     {
 56 |         static_assert(ut_numeric_limits<T>::is_integer, "");
 57 |         static_assert(!(ut_numeric_limits<T>::is_signed), "");
 58 | 
 59 |         static_assert(bits == ut_numeric_limits<T>::digits, "");
 60 |         static_assert(std::is_unsigned<T>::value, ""); //native unsigned integer
 61 |         HPBC_CLOCKWORK_CONSTEXPR_PRECONDITION(a % 2 == 1);
 62 | 
 63 |         // avoid undefined behavior that could result if T is an unsigned type
 64 |         // that would be promoted to (signed) 'int'.
 65 |         using P = typename safely_promote_unsigned<T>::type;
 66 |         P b = static_cast<P>(a);
 67 | 
 68 |         P x = (3u*b)^2u;  // good to 5 bits, but we'll treat it as good to 4
 69 |         constexpr int goodbits = 4;  // must be a power of 2
 70 |         P s = b*x;
 71 |         P y = 1-s;
 72 |         // b*x == 1-y holds here and after each pass; y*y doubles the good bits
 73 |         static_assert((bits/goodbits)*goodbits == bits, "");
 74 |         constexpr int iterations = log2<bits/goodbits>();
 75 |         // cause compile error if iterations isn't initialized at compile time
 76 |         static_assert(iterations != 0, "");
 77 |         HURCHALLA_REQUEST_UNROLL_LOOP
 78 |         for (int i=0; i<iterations; ++i) {
 79 |             P t = y+1;
 80 |             y = y*y;
 81 |             x = x*t;
 82 |         }
 83 |         return static_cast<T>(x);
 84 |     }
 85 | 
 86 |     // This is Newton's method algorithm for the inverse (mod R).
 87 |     // To get the starting bits of 'x' we recurse until we use Dumas' method
 88 |     // (it's more efficient than Newton's method for native integer types).
 89 |     template <typename T, int bits>
 90 |     static  HURCHALLA_FORCE_INLINE  HURCHALLA_CPP14_CONSTEXPR
 91 |     typename std::enable_if<!(bits <= HURCHALLA_TARGET_BIT_WIDTH), T>::type
 92 |     call(T a)
 93 |     {
 94 |         static_assert(ut_numeric_limits<T>::is_integer, "");
 95 |         static_assert(!(ut_numeric_limits<T>::is_signed), "");
 96 |         static_assert((bits/2)*2 == bits, "");
 97 |         constexpr bool is_valid_su = is_valid_sized_uint<bits/2>::value;
 98 |         using T2 = typename std::conditional<is_valid_su,
 99 |                                     typename sized_uint<bits/2>::type, T>::type;
100 |         HPBC_CLOCKWORK_CONSTEXPR_PRECONDITION(a % 2 == 1);
101 | 
102 |         // set x so that the lower ('bits'/2) half of the bits are good.
103 |         T x = static_cast<T>(call<T2, bits/2>(static_cast<T2>(a)));
104 | 
105 |         using P = typename safely_promote_unsigned<T>::type;
106 |         // use one step of the standard newtons method algorithm for the
107 |         // inverse to double the number of good bits.
108 |         return static_cast<T>(x * (2 - static_cast<P>(a)*x));
109 |     }
110 | };
111 | 
112 | 
113 | }} // end namespace
114 | 
115 | #endif
116 | 


--------------------------------------------------------------------------------
/test/modular_arithmetic/test_absolute_value_difference.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | 
  9 | // Strictly for testing purposes, we'll define HURCHALLA_ALLOW_INLINE_ASM_ALL
 10 | // here in order to make  absolute_value_difference() use an inline asm function
 11 | // version if it is available.
 12 | // Internally, this inline asm function will also call the generic template
 13 | // function version of absolute_value_difference inside a postcondition, in
 14 | // order to make sure that the asm result is correct.  Of course postcondition
 15 | // checks must be enabled for this check to occur - the easiest way to ensure
 16 | // postconditions are enabled is to define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS,
 17 | // which is why we do so here.  This is all strictly for testing purposes.
 18 | #undef HURCHALLA_ALLOW_INLINE_ASM_ALL
 19 | #define HURCHALLA_ALLOW_INLINE_ASM_ALL 1
 20 | 
 21 | #ifndef HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 22 | #  define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 23 | #endif
 24 | 
 25 | 
 26 | #include "hurchalla/modular_arithmetic/absolute_value_difference.h"
 27 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 28 | #include "hurchalla/util/compiler_macros.h"
 29 | #include "gtest/gtest.h"
 30 | #include <cstdint>
 31 | 
 32 | namespace {
 33 | 
 34 | // Checks hurchalla::absolute_value_difference for type T on non-negative inputs.
 35 | template <typename T>
 36 | void test_absolute_value_difference()
 37 | {
 38 |     namespace hc = ::hurchalla;
 39 |     // Each pair is checked in both argument orders, plus equal arguments.
 40 |     // Test with a few basic examples first
 41 |     T a = 5;
 42 |     T b = 12;
 43 |     EXPECT_TRUE(static_cast<T>(7) == hc::absolute_value_difference(a, b));
 44 |     EXPECT_TRUE(static_cast<T>(7) == hc::absolute_value_difference(b, a));
 45 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(b, b));
 46 |     a = 7; b = 6;
 47 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(a, b));
 48 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(b, a));
 49 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(b, b));
 50 | 
 51 |     // --------- Test possible edge cases --------
 52 |     // Smallest values: 0 and 1.
 53 |     a = 0; b = 0;
 54 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(a, b));
 55 |     a = 0; b = 1;
 56 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(a, b));
 57 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(b, a));
 58 |     a = 1; b = 1;
 59 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(a, b));
 60 |     // Differences against the maximum value of T.
 61 |     a = 0; b = hc::ut_numeric_limits<T>::max();
 62 |     EXPECT_TRUE(b == hc::absolute_value_difference(a, b));
 63 |     EXPECT_TRUE(b == hc::absolute_value_difference(b, a));
 64 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(b, b));
 65 |     a = 1;
 66 |     EXPECT_TRUE(static_cast<T>(b-1) == hc::absolute_value_difference(a, b));
 67 |     EXPECT_TRUE(static_cast<T>(b-1) == hc::absolute_value_difference(b, a));
 68 |     // The same checks against max - 1.
 69 |     a = 0; b = static_cast<T>(hc::ut_numeric_limits<T>::max() - 1);
 70 |     EXPECT_TRUE(b == hc::absolute_value_difference(a, b));
 71 |     EXPECT_TRUE(b == hc::absolute_value_difference(b, a));
 72 |     a = 1;
 73 |     EXPECT_TRUE(static_cast<T>(b-1) == hc::absolute_value_difference(a, b));
 74 |     EXPECT_TRUE(static_cast<T>(b-1) == hc::absolute_value_difference(b, a));
 75 |     // Adjacent values at the midpoint max/2 and just above it.
 76 |     a = static_cast<T>(hc::ut_numeric_limits<T>::max()/2);
 77 |     b = static_cast<T>(a + 1);
 78 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(a, b));
 79 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(b, a));
 80 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(a, a));
 81 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(b, b));
 82 |     // Step b and then a upward by one, re-checking after each step.
 83 |     b++;
 84 |     EXPECT_TRUE(static_cast<T>(2) == hc::absolute_value_difference(a, b));
 85 |     EXPECT_TRUE(static_cast<T>(2) == hc::absolute_value_difference(b, a));
 86 |     a++;
 87 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(a, b));
 88 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(b, a));
 89 |     // Adjacent values ending at the midpoint max/2.
 90 |     a = static_cast<T>(hc::ut_numeric_limits<T>::max()/2 - 1);
 91 |     b = static_cast<T>(a + 1);
 92 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(a, b));
 93 |     EXPECT_TRUE(static_cast<T>(1) == hc::absolute_value_difference(b, a));
 94 |     EXPECT_TRUE(static_cast<T>(0) == hc::absolute_value_difference(a, a));
 95 | }
 96 | 
 97 | 
 98 | // Runs the templated checks for each fixed-width unsigned and signed type.
 99 | TEST(ModularArithmetic, absolute_value_difference) {
100 |     test_absolute_value_difference<std::uint8_t>();
101 |     test_absolute_value_difference<std::uint16_t>();
102 |     test_absolute_value_difference<std::uint32_t>();
103 |     test_absolute_value_difference<std::uint64_t>();
104 | #if HURCHALLA_COMPILER_HAS_UINT128_T()
105 |     test_absolute_value_difference<__uint128_t>();
106 | #endif
107 |     // Signed types run the same checks; every value tested is non-negative.
108 |     test_absolute_value_difference<std::int8_t>();
109 |     test_absolute_value_difference<std::int16_t>();
110 |     test_absolute_value_difference<std::int32_t>();
111 |     test_absolute_value_difference<std::int64_t>();
112 | #if HURCHALLA_COMPILER_HAS_UINT128_T()
113 |     test_absolute_value_difference<__int128_t>();
114 | #endif
115 | }
116 | 
117 | 
118 | } // end unnamed namespace
119 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020-2025 Jeffrey Hurchalla.
 2 | # This Source Code Form is subject to the terms of the Mozilla Public
 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this
 4 | # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 5 | 
 6 | # Skip the rest of this file if the library target has already been defined.
 7 | if(TARGET hurchalla_montgomery_arithmetic)
 8 |     return()
 9 | endif()
10 | 
11 | # later versions are probably fine, but are untested
12 | cmake_minimum_required(VERSION 3.14...4.03)
13 | 
14 | project(hurchalla_montgomery_arithmetic VERSION 1.0.0 LANGUAGES CXX)
15 | 
16 | 
17 | # if this is the top level CMakeLists.txt, let IDEs group projects into folders
18 | if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
19 |     set_property(GLOBAL PROPERTY USE_FOLDERS ON)
20 | endif()
21 | 
22 | # INTERFACE library: nothing is compiled for this target itself.
23 | add_library(hurchalla_montgomery_arithmetic INTERFACE)
24 | 
25 | # Library files, attached as INTERFACE sources (build interface only).
26 | target_sources(hurchalla_montgomery_arithmetic INTERFACE
27 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/MontgomeryForm.h>
28 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/montgomery_form_aliases.h>
29 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/BaseMontgomeryValue.h>
30 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.contents>
31 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.h>
32 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/impl_montgomery_two_pow.h>
33 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontgomeryDefault.h>
34 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontgomeryFormExtensions.h>
35 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h>
36 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontyFullRange.h>
37 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontyHalfRange.h>
38 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontyTags.h>
39 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontyQuarterRange.h>
40 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/MontyWrappedStandardMath.h>
41 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary.h>
42 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow_API.h>
43 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/experimental/MontyFullRangeMasked.h>
44 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryForm.h>
45 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryWrapper.h>
46 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/ConcreteMontgomeryForm.h>
47 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/platform_specific/montgomery_pow.h>
48 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/platform_specific/montgomery_two_pow.h>
49 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/platform_specific/subtract_returning_difference_or_zero.h>
50 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/platform_specific/two_times_restricted.h>
51 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/platform_specific/quarterrange_get_canonical.h>
52 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/detail/platform_specific/halfrange_get_canonical.h>
53 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/get_R_mod_n.h>
54 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/get_Rsquared_mod_n.h>
55 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/inverse_mod_R.h>
56 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/REDC.h>
57 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/detail/impl_inverse_mod_R.h>
58 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/impl_array_get_Rsquared_mod_n.h>
59 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/impl_get_Rsquared_mod_n.h>
60 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/ImplRedc.h>
61 |     )
62 | 
63 | # Install rule for the contents of the include directory.
64 | install(DIRECTORY
65 |     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/>
66 |     DESTINATION include)
67 | 
68 | # Expose the include directory to targets that link this library.
69 | target_include_directories(hurchalla_montgomery_arithmetic
70 |             INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
71 | 
72 | # Add the sibling modular_arithmetic project (out-of-tree: explicit binary dir).
73 | add_subdirectory(../modular_arithmetic
74 |             ${CMAKE_CURRENT_BINARY_DIR}/modular_arithmetic)
75 | target_link_libraries(hurchalla_montgomery_arithmetic
76 |                       INTERFACE hurchalla_basic_modular_arithmetic)
77 | 
78 | # Fetch the hurchalla util project at a pinned commit and link against it.
79 | include(FetchContent)
80 | FetchContent_Declare(
81 |     hurchalla_util
82 |     GIT_REPOSITORY https://github.com/hurchalla/util.git
83 |     GIT_TAG        8e03b87c7b6d5c3bf3c0e439a153768c59c512c5
84 | )
85 | FetchContent_MakeAvailable(hurchalla_util)
86 | 
87 | target_link_libraries(hurchalla_montgomery_arithmetic
88 |                       INTERFACE hurchalla_util)
89 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontgomeryFormExtensions.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2025 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_MONTGOMERY_FORM_EXTENSIONS_H_INCLUDED
  9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_MONTGOMERY_FORM_EXTENSIONS_H_INCLUDED
 10 | 
 11 | 
 12 | #include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
 13 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 14 | #include "hurchalla/util/compiler_macros.h"
 15 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
 16 | #include <cstddef>
 17 | 
 18 | namespace hurchalla { namespace detail { 
 19 | 
 20 | 
 21 | // Implementation helper functions that shouldn't be exposed in the
 22 | // MontgomeryForm API.
 23 | 
 24 | 
 25 | template <class MF, class PTAG>
 26 | struct MontgomeryFormExtensions final {
 27 |     // Every member below is a static function that forwards to mf.impl.
 28 |     using RU = typename MF::MontType::uint_type;
 29 |     // conceptually, R = (UP)1 << ut_numeric_limits<RU>::digits, with UP as an
 30 |     // unlimited precision unsigned integer type.
 31 |     static_assert(ut_numeric_limits<RU>::is_integer, "");
 32 |     static_assert(!(ut_numeric_limits<RU>::is_signed), "");
 33 | 
 34 |     using CanonicalValue = typename MF::CanonicalValue;
 35 |     using MontgomeryValue = typename MF::MontgomeryValue;
 36 |     using SquaringValue = typename MF::MontType::squaringvalue_type;
 37 |     // Forwards to impl.convertInExtended<PTAG>(a), with a taken as type RU.
 38 |     HURCHALLA_FORCE_INLINE
 39 |     static MontgomeryValue convertInExtended(const MF& mf, RU a)
 40 |     {
 41 |         return mf.impl.template convertInExtended<PTAG>(a);
 42 |     }
 43 | 
 44 |     // note: montvalueR is the Montgomery representation of R.
 45 |     //       In normal integer form it is literally R squared mod N.
 46 |     HURCHALLA_FORCE_INLINE
 47 |     static CanonicalValue getMontvalueR(const MF& mf)
 48 |     {
 49 |         return mf.impl.getMontvalueR();
 50 |     }
 51 | 
 52 |     // twoPowLimited_times_x first shifts x by exponent, which is equivalent
 53 |     // to multiplying x by 2^exponent, and then it completes the mont mul as
 54 |     // usual by calling REDC.
 55 |     // -- IMPORTANT NOTE -- because (2^exponent) is an integer domain
 56 |     // value rather than a montgomery domain value, the returned
 57 |     // result viewed as an integer value is
 58 |     // REDC((x_int * R) * (2^exponent)) == (x_int * (2^exponent) * R) * R^(-1)
 59 |     // To counteract the inverse R factor, so that you get what you most
 60 |     // likely wanted, being just plain (x_int * (2^exponent) * R), x needs an
 61 |     // extra factor of R built into it, rather than just the normal single
 62 |     // factor of x_int * R.  To build it in, get  montR = getMontvalueR(mf),
 63 |     // and then do a normal montgomery multiply of x and montR.
 64 |     // The _v2 variant instead requires 0 < exponent <= digits of RU.
 65 |     HURCHALLA_FORCE_INLINE
 66 |     static MontgomeryValue twoPowLimited_times_x(const MF& mf, size_t exponent, CanonicalValue x)
 67 |     {
 68 |         HPBC_CLOCKWORK_PRECONDITION(exponent < ut_numeric_limits<RU>::digits);
 69 |         return mf.impl.template twoPowLimited_times_x<PTAG>(exponent, x);
 70 |     }
 71 |     HURCHALLA_FORCE_INLINE
 72 |     static MontgomeryValue twoPowLimited_times_x_v2(const MF& mf, size_t exponent, CanonicalValue x)
 73 |     {
 74 |         HPBC_CLOCKWORK_PRECONDITION(0 < exponent && exponent <= ut_numeric_limits<RU>::digits);
 75 |         return mf.impl.template twoPowLimited_times_x_v2<PTAG>(exponent, x);
 76 |     }
 77 | 
 78 |     // note: magicValue is R cubed mod N  (in normal integer form)
 79 |     HURCHALLA_FORCE_INLINE
 80 |     static RU getMagicValue(const MF& mf)
 81 |     {
 82 |         return mf.impl.template getMagicValue<PTAG>();
 83 |     }
 84 |     // magicValue must equal getMagicValue(mf); the precondition checks this.
 85 |     HURCHALLA_FORCE_INLINE
 86 |     static MontgomeryValue
 87 |     convertInExtended_aTimesR(const MF& mf, RU a, RU magicValue)
 88 |     {
 89 |         HPBC_CLOCKWORK_PRECONDITION(magicValue == getMagicValue(mf));
 90 |         return mf.impl.template convertInExtended_aTimesR<PTAG>(a, magicValue);
 91 |     }
 92 | 
 93 |     // this shifts RsquaredModN by exponent (rather than multiplying by
 94 |     // (1<<exponent)) before calling REDC as usual.
 95 |     // The amount RsquaredModN can be shifted is limited by the bit width of
 96 |     // RsquaredModN's type - shifting more would be undefined behavior.
 97 |     // Thus the (exponent) shift is limited to 0 <= shift < digitsR.
 98 |     HURCHALLA_FORCE_INLINE
 99 |     static MontgomeryValue twoPowLimited(const MF& mf, size_t exponent)
100 |     {
101 |         HPBC_CLOCKWORK_PRECONDITION(exponent < ut_numeric_limits<RU>::digits);
102 |         return mf.impl.template twoPowLimited<PTAG>(exponent);
103 |     }
104 |     // NOTE(review): magicValue is not checked against getMagicValue(mf) below.
105 |     // this shifts RcubedModN by exponent (rather than multiplying by
106 |     // (1<<exponent)) before calling REDC as usual.
107 |     // Similarly to twoPowLimited, the exponent shift must be limited
108 |     // to 0 <= shift < digitsR.
109 |     HURCHALLA_FORCE_INLINE
110 |     static MontgomeryValue
111 |     RTimesTwoPowLimited(const MF& mf, size_t exponent, RU magicValue)
112 |     {
113 |         HPBC_CLOCKWORK_PRECONDITION(exponent < ut_numeric_limits<RU>::digits);
114 |         return mf.impl.template RTimesTwoPowLimited<PTAG>(exponent, magicValue);
115 |     }
116 |     // The remaining functions forward the SquaringValue operations to mf.impl
117 |     // (SquaringValue is MF::MontType::squaringvalue_type).
118 |     HURCHALLA_FORCE_INLINE
119 |     static SquaringValue getSquaringValue(const MF& mf, MontgomeryValue x)
120 |     {
121 |         return mf.impl.getSquaringValue(x);
122 |     }
123 | 
124 |     HURCHALLA_FORCE_INLINE
125 |     static SquaringValue squareSV(const MF& mf, SquaringValue sv)
126 |     {
127 |         return mf.impl.template squareSV<PTAG>(sv);
128 |     }
129 | 
130 |     HURCHALLA_FORCE_INLINE
131 |     static MontgomeryValue
132 |     squareToMontgomeryValue(const MF& mf, SquaringValue sv)
133 |     {
134 |         return mf.impl.template squareToMontgomeryValue<PTAG>(sv);
135 |     }
136 | 
137 |     HURCHALLA_FORCE_INLINE
138 |     static MontgomeryValue getMontgomeryValue(const MF& mf, SquaringValue sv)
139 |     {
140 |         return mf.impl.getMontgomeryValue(sv);
141 |     }
142 | };
143 | 
144 | 
145 | }} // end namespace
146 | 
147 | #endif
148 | 


--------------------------------------------------------------------------------
/test/modular_arithmetic/test_modular_addition.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | 
  9 | // Strictly for testing purposes, we'll define HURCHALLA_ALLOW_INLINE_ASM_ALL
 10 | // here in order to make modular addition use an inline asm function version if
 11 | // it is available.  Internally, this inline asm function will also call the
 12 | // generic template function version of modular addition inside a postcondition,
 13 | // in order to make sure that the asm result is correct. Of course postcondition
 14 | // checks must be enabled for this check to occur - the easiest way to ensure
 15 | // postconditions are enabled is to define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS,
 16 | // which is why we do so here.  This is all strictly for testing purposes.
 17 | #undef HURCHALLA_ALLOW_INLINE_ASM_ALL
 18 | #define HURCHALLA_ALLOW_INLINE_ASM_ALL 1
 19 | 
 20 | #ifndef HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 21 | #  define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 22 | #endif
 23 | 
 24 | 
 25 | #include "hurchalla/modular_arithmetic/modular_addition.h"
 26 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 27 | #include "hurchalla/util/compiler_macros.h"
 28 | #include "gtest/gtest.h"
 29 | #include <cstdint>
 30 | 
 31 | namespace {
 32 | 
 33 | 
 34 | namespace hc = ::hurchalla;
 35 | 
 36 | template <typename T>
 37 | void test_modulus(T modulus)
 38 | {   // Checks modular_addition_prereduced_inputs with boundary operands.
 39 |     EXPECT_TRUE(modulus > 3); // if this fails, this test file has a bug
 40 |     // (modulus >= 4 keeps b = modulus - modulus/2 + 1, used below, < modulus)
 41 |     T a = 0;
 42 |     T b = 0;
 43 |     EXPECT_TRUE(static_cast<T>(0) ==
 44 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 45 |     a = 0; b = 1;
 46 |     EXPECT_TRUE(static_cast<T>(1) ==
 47 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 48 |     EXPECT_TRUE(static_cast<T>(1) ==
 49 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 50 |     a = 1; b = 1;
 51 |     EXPECT_TRUE(static_cast<T>(2) ==
 52 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 53 |     // largest valid operand (modulus - 1) added to 0 and to itself
 54 |     a = 0; b = static_cast<T>(modulus - 1);
 55 |     EXPECT_TRUE(b == hc::modular_addition_prereduced_inputs(a, b, modulus));
 56 |     EXPECT_TRUE(b == hc::modular_addition_prereduced_inputs(b, a, modulus));
 57 |     EXPECT_TRUE(static_cast<T>(modulus - 2) ==
 58 |                          hc::modular_addition_prereduced_inputs(b, b, modulus));
 59 |     // 1 + (modulus - 1) wraps around to 0
 60 |     a = 1; b = static_cast<T>(modulus - 1);
 61 |     EXPECT_TRUE(static_cast<T>(0) ==
 62 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 63 |     EXPECT_TRUE(static_cast<T>(0) ==
 64 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 65 |     // a + b == modulus exactly, so the sum wraps to 0
 66 |     a = static_cast<T>(modulus/2);
 67 |     b = static_cast<T>(modulus - a);
 68 |     EXPECT_TRUE(static_cast<T>(0) ==
 69 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 70 |     EXPECT_TRUE(static_cast<T>(0) ==
 71 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 72 |     // a + b == modulus + 1, and then modulus + 2 after a is incremented
 73 |     b++;
 74 |     EXPECT_TRUE(static_cast<T>(1) ==
 75 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 76 |     EXPECT_TRUE(static_cast<T>(1) ==
 77 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 78 |     a++;
 79 |     EXPECT_TRUE(static_cast<T>(2) ==
 80 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 81 |     EXPECT_TRUE(static_cast<T>(2) ==
 82 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 83 |     // sums just below the modulus (no wrap): modulus - 2, then modulus - 1
 84 |     a = static_cast<T>(modulus/2 - 1);
 85 |     b = static_cast<T>(modulus - a - 2);
 86 |     EXPECT_TRUE(static_cast<T>(modulus - 2) ==
 87 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 88 |     EXPECT_TRUE(static_cast<T>(modulus - 2) ==
 89 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 90 |     a++;
 91 |     EXPECT_TRUE(static_cast<T>(modulus - 1) ==
 92 |                          hc::modular_addition_prereduced_inputs(a, b, modulus));
 93 |     EXPECT_TRUE(static_cast<T>(modulus - 1) ==
 94 |                          hc::modular_addition_prereduced_inputs(b, a, modulus));
 95 | }
 96 | 
 97 | 
 98 | template <typename T>
 99 | void test_modular_addition()
100 | {
101 |     // Begin with a handful of hand-computed sums, all taken modulo 13.
102 |     T m = 13;
103 |     T x = 5;
104 |     T y = 12;
105 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(x, y, m) ==
106 |                                                          static_cast<T>(4));
107 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(y, x, m) ==
108 |                                                          static_cast<T>(4));
109 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(y, y, m) ==
110 |                                                          static_cast<T>(11));
111 |     x = 7;  y = 6;
112 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(x, y, m) ==
113 |                                                          static_cast<T>(0));
114 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(y, x, m) ==
115 |                                                          static_cast<T>(0));
116 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(y, y, m) ==
117 |                                                          static_cast<T>(12));
118 | 
119 |     test_modulus(m);
120 |     test_modulus(static_cast<T>(14));
121 | 
122 |     // --------- Moduli that are likely to be edge cases --------
123 | 
124 |     m = 1;
125 |     x = 0;  y = 0;
126 |     EXPECT_TRUE(hc::modular_addition_prereduced_inputs(x, y, m) ==
127 |                                                          static_cast<T>(0));
128 | 
129 |     m = hc::ut_numeric_limits<T>::max();
130 |     test_modulus(m);
131 |     --m;
132 |     test_modulus(m);
133 | 
134 |     m = hc::ut_numeric_limits<T>::max() / 2;
135 |     test_modulus(m);
136 |     ++m;
137 |     test_modulus(m);
138 | }
139 | 
140 | 
141 | 
142 | TEST(ModularArithmetic, modular_addition) {  // one run per integer type
143 |     test_modular_addition<std::uint8_t>();   // unsigned types first
144 |     test_modular_addition<std::uint16_t>();
145 |     test_modular_addition<std::uint32_t>();
146 |     test_modular_addition<std::uint64_t>();
147 | #if HURCHALLA_COMPILER_HAS_UINT128_T()
148 |     test_modular_addition<__uint128_t>();
149 | #endif
150 |     // the same widths again, as signed types
151 |     test_modular_addition<std::int8_t>();
152 |     test_modular_addition<std::int16_t>();
153 |     test_modular_addition<std::int32_t>();
154 |     test_modular_addition<std::int64_t>();
155 | #if HURCHALLA_COMPILER_HAS_UINT128_T()  // this macro also gates __int128_t
156 |     test_modular_addition<__int128_t>();
157 | #endif
158 | }
159 | 
160 | 
161 | } // end unnamed namespace
162 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/impl_get_Rsquared_mod_n.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2025 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_IMPL_GET_RSQUARED_MOD_N_H_INCLUDED
  9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_IMPL_GET_RSQUARED_MOD_N_H_INCLUDED
 10 | 
 11 | 
 12 | #include "hurchalla/montgomery_arithmetic/low_level_api/REDC.h"
 13 | #include "hurchalla/montgomery_arithmetic/detail/platform_specific/two_times_restricted.h"
 14 | #include "hurchalla/modular_arithmetic/modular_addition.h"
 15 | #include "hurchalla/modular_arithmetic/modular_multiplication.h"
 16 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 17 | #include "hurchalla/util/unsigned_square_to_hilo_product.h"
 18 | #include "hurchalla/util/compiler_macros.h"
 19 | #include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
 20 | 
 21 | #if defined(_MSC_VER)
 22 | #  pragma warning(push)
 23 | #  pragma warning(disable : 4127)
 24 | #endif
 25 | 
 26 | namespace hurchalla { namespace detail {
 27 | 
 28 | 
 29 | // For discussion purposes, let type UP be a conceptually unlimited precision
 30 | // unsigned integer type, and let the unlimited precision constant R represent
 31 | // R = (UP)1 << ut_numeric_limits<T>::digits.  Equivalently,
 32 | // R = (UP)ut_numeric_limits<T>::max + 1.  For example, if T is uint64_t, we
 33 | // would have R = (UP)1 << 64.
 34 | 
 35 | // Compute (R*R) % n
 36 | 
 37 | // Minor note: we use a static member function to disallow ADL.
 38 | template <bool nIsGuaranteedLessThanRdiv4, class PTAG>
 39 | struct impl_get_Rsquared_mod_n {
 40 |   // Primary template.  call() returns (R*R) % n, given Rmod_n == R % n.
 41 |   template <typename T>
 42 |   HURCHALLA_FORCE_INLINE static T call(T n, T inverse_n_modR, T Rmod_n)
 43 |   {
 44 |     static_assert(ut_numeric_limits<T>::is_integer, "");
 45 |     static_assert(!(ut_numeric_limits<T>::is_signed), "");
 46 |     // The modulus n must be odd and greater than 1.
 47 |     HPBC_CLOCKWORK_PRECONDITION2(n % 2 == 1);
 48 |     HPBC_CLOCKWORK_PRECONDITION2(n > 1);
 49 |     // inverse_n_modR is only used as the inverse argument passed to REDC below.
 50 |     namespace hc = ::hurchalla;
 51 |     T rSquaredModN;
 52 | #ifdef HURCHALLA_TESTING_RSQUARED_MOD_N  // forces the doubling/squaring path
 53 |     if (true) {
 54 | #else
 55 |     if HURCHALLA_CPP17_CONSTEXPR
 56 |             (hc::modular_multiplication_has_slow_perf<T>()) {
 57 | #endif
 58 |         HPBC_CLOCKWORK_ASSERT2(Rmod_n < n);
 59 |         T tmp = Rmod_n;   // Rmod_n ≡ 1*R (mod n)
 60 |         int i=0;   // invariant after each step:  tmp ≡ (2^i)*R (mod n)
 61 |         for (; i<8; ++i)
 62 |             tmp = hc::modular_addition_prereduced_inputs(tmp, tmp, n);
 63 |         // at this point,  tmp ≡ 256*R (mod n)
 64 |         constexpr int bitsT = ut_numeric_limits<T>::digits;
 65 |         for (; i<bitsT; i*=2) {
 66 |             // use montgomery multiplication to square tmp on each iteration
 67 |             T u_hi, u_lo;
 68 |             u_hi = hc::unsigned_square_to_hilo_product(u_lo, tmp);
 69 |             tmp = hc::REDC_standard(u_hi, u_lo, n, inverse_n_modR, PTAG());
 70 |         }
 71 |         HPBC_CLOCKWORK_ASSERT2(i == bitsT);  // needs bitsT == 8 * (a power of 2)
 72 |         // We should now have  tmp ≡ R*R (mod n).
 73 |         // REDC_standard's postcondition guarantees the following:
 74 |         HPBC_CLOCKWORK_ASSERT2(tmp < n);
 75 | 
 76 |         rSquaredModN = tmp;
 77 |         HPBC_CLOCKWORK_POSTCONDITION2(rSquaredModN ==
 78 |                hc::modular_multiplication_prereduced_inputs(Rmod_n, Rmod_n, n));
 79 |     } else {  // modmul is not slow for T: square Rmod_n (mod n) directly
 80 |         rSquaredModN = hc::modular_multiplication_prereduced_inputs(
 81 |                                                              Rmod_n, Rmod_n, n);
 82 |     }
 83 | 
 84 |     HPBC_CLOCKWORK_POSTCONDITION2(rSquaredModN < n);
 85 |     return rSquaredModN;
 86 |   }
 87 | };
 88 | 
 89 | 
 90 | template<class PTAG>
 91 | struct impl_get_Rsquared_mod_n<true, PTAG> {
 92 |   // Specialization for n < R/4.  call() returns (R*R) % n, given R % n.
 93 |   template <typename T>
 94 |   HURCHALLA_FORCE_INLINE static T call(T n, T inverse_n_modR, T Rmod_n)
 95 |   {
 96 |     static_assert(ut_numeric_limits<T>::is_integer, "");
 97 |     static_assert(!(ut_numeric_limits<T>::is_signed), "");
 98 |     // The modulus n must be odd and greater than 1,
 99 |     HPBC_CLOCKWORK_PRECONDITION2(n % 2 == 1);
100 |     HPBC_CLOCKWORK_PRECONDITION2(n > 1);
101 |     // and since the template param nIsGuaranteedLessThanRdiv4 == true,
102 |     constexpr T Rdiv4 = static_cast<T>(static_cast<T>(1) <<
103 |                                             (ut_numeric_limits<T>::digits - 2));
104 |     HPBC_CLOCKWORK_PRECONDITION2(n < Rdiv4);
105 | 
106 |     namespace hc = ::hurchalla;
107 |     T rSquaredModN;
108 | #ifdef HURCHALLA_TESTING_RSQUARED_MOD_N  // forces the doubling/squaring path
109 |     if (true) {
110 | #else
111 |     if HURCHALLA_CPP17_CONSTEXPR
112 |             (hc::modular_multiplication_has_slow_perf<T>()) {
113 | #endif
114 |         HPBC_CLOCKWORK_ASSERT2(Rmod_n < n);
115 |         T tmp = Rmod_n;   // Rmod_n ≡ 1*R (mod n)
116 |         int i=0;   // invariant after each step:  tmp ≡ (2^i)*R (mod n)
117 |         // NOTE(review): two_times_restricted presumably doubles tmp mod n - confirm
118 |         for (; i<4; ++i)
119 |             tmp = hc::detail::two_times_restricted<T>::call(tmp, n);
120 | 
121 |         // at this point,  tmp ≡ 16*R (mod n)
122 |         constexpr int bitsT = ut_numeric_limits<T>::digits;
123 |         // square until i reaches bitsT/2; the last squaring is done separately
124 |         for (; i<bitsT/2; i*=2) {
125 |             // use montgomery multiplication to square tmp on each iteration
126 |             T u_hi, u_lo;
127 |             u_hi = hc::unsigned_square_to_hilo_product(u_lo, tmp);
128 |             // use the same logic as MontyQuarterRange's montyREDC():
129 |             tmp = hc::REDC_incomplete(u_hi, u_lo, n, inverse_n_modR, PTAG());
130 |             tmp = static_cast<T>(tmp + n);
131 |             HPBC_CLOCKWORK_ASSERT2(0 < tmp && tmp < static_cast<T>(2*n));
132 |         }
133 |         HPBC_CLOCKWORK_ASSERT2(i == bitsT/2);  // needs bitsT == 8 * (a power of 2)
134 |         {
135 |             // This final iteration was unrolled from the loop above so we can
136 |             // use standard REDC, which will end with tmp in the range [0, n).
137 |             T u_hi, u_lo;
138 |             u_hi = hc::unsigned_square_to_hilo_product(u_lo, tmp);
139 |             tmp = hc::REDC_standard(u_hi, u_lo, n, inverse_n_modR, PTAG());
140 |         }
141 | 
142 |         // We should now have  tmp ≡ R*R (mod n).
143 |         // REDC_standard's postcondition guarantees the following:
144 |         HPBC_CLOCKWORK_ASSERT2(tmp < n);
145 | 
146 |         rSquaredModN = tmp;
147 |         HPBC_CLOCKWORK_POSTCONDITION2(rSquaredModN ==
148 |                hc::modular_multiplication_prereduced_inputs(Rmod_n, Rmod_n, n));
149 |     } else {  // modmul is not slow for T: square Rmod_n (mod n) directly
150 |         rSquaredModN = hc::modular_multiplication_prereduced_inputs(
151 |                                                              Rmod_n, Rmod_n, n);
152 |     }
153 | 
154 |     HPBC_CLOCKWORK_POSTCONDITION2(rSquaredModN < n);
155 |     return rSquaredModN;
156 |   }
157 | };
158 | 
159 | 
160 | }} // end namespace
161 | 
162 | #if defined(_MSC_VER)
163 | #  pragma warning(pop)
164 | #endif
165 | 
166 | 
167 | #endif
168 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/README_REDC_supplement.md:
--------------------------------------------------------------------------------
 1 | This file supplements the document [README_REDC.md](README_REDC.md).
 2 | <br><br>
 3 | 
 4 | The simplest and usually most effective way to implement the traditional REDC is to write a delegating function that calls the alternate REDC.  With inlining, its total uops will likely be lower than the low-uops asm version further below, and there is a decent chance that the compiler will loop hoist the calculation of invN if we are calling this function from a loop.  Thus this version could also achieve latency equal to the low-latency asm version further below.  In practice, even if the negation is not loop hoisted, REDC will most often be called during Montgomery multiplication, and the negation will not contribute to latency since its calculation will overlap with the preceding multiply in the Montgomery multiplication.  Yet another reason why we might prefer this implementation is that the delegate "REDC_alternate" function can be implemented effectively with just standard C, which would eliminate the chance of inline-asm related bugs, and will sometimes improve performance since inline-asm may hinder compiler optimizations.<br>
 5 | 
 6 | <pre>
 7 | // On Intel Skylake: ~9-10 cycles latency, ~8 fused uops
 8 | inline uint64_t REDC_traditional_delegating(uint64_t T_hi, uint64_t T_lo,
 9 |                                                    uint64_t N, uint64_t negInvN)
10 | {
11 |     uint64_t invN = -negInvN;
12 |     return REDC_alternate(T_hi, T_lo, N, invN);
13 | }
14 | </pre>
15 | <i>Delegating Function for the Traditional REDC</i>
16 | <br><br>
17 | 
18 | There is usually no need to read further unless you are curious.
19 | 
20 | We can improve upon the inline assembly we saw in the main document for the traditional REDC, though the code becomes harder to understand.  The improvements also can't be implemented well in standard C; none of the major compilers (gcc, clang, MSVC, icc) are able to compile standard C versions of the functions below without adding significant extra latency and uops, even with idiomatic use of the ternary operator for conditional move.
21 | 
22 | Since the alternate REDC function from [README_REDC.md](README_REDC.md) does better on uops and equals or betters the latency, all while being easier to understand, and friendlier for compilers if written in standard C, we should certainly prefer the alternate REDC to the functions that follow.  Nevertheless the functions below do improve the traditional REDC inline asm, so they could be useful as an easy drop-in replacement of an existing REDC function (which will almost certainly be traditional REDC with the negative inverse), or they might be interesting for anyone curious.
23 | 
24 | The improved functions below are correct and produce output equivalent to the previous inline asm we saw for the traditional REDC.  You can find a rough proof of correctness in comments of the C++ function ["REDC(T u_hi, T u_lo, T n, T neg_inv_n, FullrangeTag, InplaceLowlatencyTag)" of an old git commit](https://github.com/hurchalla/modular_arithmetic/blob/66281af1639031b04bdaf9b916e5d5638d3ded25/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/platform_specific/RedcLargeR.h#L365).
25 | 
26 | The first function below is optimized for lowest latency (REDC_traditional_improved1).  The second is optimized for lowest uops (REDC_traditional_improved2):<br>
27 | 
28 | 
29 | <pre>
30 | // On Intel Skylake: 9 cycles latency, 11 fused uops
31 | inline uint64_t REDC_traditional_improved1(uint64_t T_hi, uint64_t T_lo,
32 |                                                    uint64_t N, uint64_t negInvN)
33 | {
34 |     assert(T_hi < N);   // REDC requires T < NR, and this enforces it.
35 |     uint64_t rrax = T_lo;
36 |     uint64_t Thi = T_hi;
37 |     uint64_t tmp;
38 |     __asm__ (
39 |         "movq %%rax, %[tmp] \n\t"
40 |         "imulq %[inv], %%rax \n\t"    /* m = T_lo * negInvN */
41 |         "mulq %[N] \n\t"              /* mN = m * N */
42 |         "movq %[Thi], %%rax \n\t"
43 |         "subq %[N], %%rax \n\t"       /* diff = T_hi - N */
44 |         "negq %[tmp] \n\t"            /* Sets carry to (T_lo != 0) */
45 |         "adcq %%rdx, %[Thi] \n\t"     /* sum1 = addcarry(T_hi, mN_hi) */
46 |         "negq %[tmp] \n\t"            /* Sets carry to (T_lo != 0) */
47 |         "adcq %%rdx, %%rax \n\t"      /* sum2 = addcarry(diff, mN_hi) */
48 |         "cmovaeq %[Thi], %%rax \n\t"  /* rrax = (sum2 >= mN_hi) ? sum1 : sum2 */
49 |         : [Thi]"+r"(Thi), "+&a"(rrax), [tmp]"=&r"(tmp)
50 |         : [N]"r"(N), [inv]"r"(negInvN)
51 |         : "rdx", "cc");
52 |     return rrax;
53 | }
54 | </pre>
55 | <i>Improved Traditional REDC (low latency version)</i>
56 | 
57 | <br>
58 | 
59 | <pre>
60 | // On Intel Skylake: 10 cycles latency, 9 fused uops.
61 | inline uint64_t REDC_traditional_improved2(uint64_t T_hi, uint64_t T_lo,
62 |                                                    uint64_t N, uint64_t negInvN)
63 | {
64 |     assert(T_hi < N);   // REDC requires T < NR, and this enforces it.
65 |     uint64_t rrax = T_lo;
66 |     uint64_t Thi = T_hi;
67 |     uint64_t tmp;
68 |     __asm__ (
69 |         "movq %%rax, %[tmp] \n\t"
70 |         "imulq %[inv], %%rax \n\t"        /* m = T_lo * negInvN */
71 |         "mulq %[N] \n\t"                  /* mN = m * N */
72 |         "subq %[N], %[Thi] \n\t"          /* diff = T_hi - N */
73 |         "negq %[tmp] \n\t"                /* Sets carry to (T_lo != 0) */
74 |         "adcq %[Thi], %%rdx \n\t"         /* rdx = addcarry(diff, mN_hi) */
75 |         "leaq (%%rdx, %[N]), %%rax \n\t"  /* rax = rdx + N */
76 |         "cmovbq %%rdx, %%rax \n\t"        /* rrax = (rdx &lt; mN_hi) ? rdx : rax */
77 |         : [Thi]"+&r"(Thi), "+&a"(rrax), [tmp]"=&r"(tmp)
78 |         : [N]"r"(N), [inv]"r"(negInvN)
79 |         : "rdx", "cc");
80 |     return rrax;
81 | }
82 | </pre>
83 | <i>Improved Traditional REDC (low uops version)</i>
84 | 
85 | <br>
86 | 
87 | All code in this file is licensed under the MIT Open Source License:
88 | 
89 | Copyright (c) 2022 by Jeffrey Hurchalla.
90 | 
91 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
92 | 
93 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
94 | 
95 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
96 | 


--------------------------------------------------------------------------------
/test/modular_arithmetic/test_modular_multiplication.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | 
  9 | // We'll undefine HURCHALLA_DISALLOW_INLINE_ASM_MODMUL here in order to make
 10 | // modular multiplication use an inline asm function version if it is available.
 11 | // This shouldn't be strictly necessary, since there's no reason this macro
 12 | // would be defined at this point, and by default modular multiplication uses
 13 | // inline asm (if available) unless this macro is defined.
 14 | // Internally, the inline asm function will also call the generic template
 15 | // function version of modular multiplication inside a postcondition, in order
 16 | // to make sure that the asm result is correct.  Of course postcondition checks
 17 | // must be enabled for this check to occur - the easiest way to ensure
 18 | // postconditions are enabled is to define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS,
 19 | // which is why we do so here.  This is all strictly for testing purposes.
 20 | #undef HURCHALLA_DISALLOW_INLINE_ASM_MODMUL
 21 | 
 22 | #ifndef HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 23 | #  define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 24 | #endif
 25 | 
 26 | 
 27 | 
 28 | #include "hurchalla/modular_arithmetic/modular_multiplication.h"
 29 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 30 | #include "hurchalla/util/compiler_macros.h"
 31 | #include "gtest/gtest.h"
 32 | #include <cstdint>
 33 | 
 34 | namespace {
 35 | 
 36 | 
 37 | namespace hc = ::hurchalla;
 38 | 
 39 | template <typename T>
 40 | void test_modulus(T modulus)
 41 | {   // Checks modular_multiplication_prereduced_inputs with boundary operands.
 42 |     EXPECT_TRUE(modulus > 7); // if this fails, this test file has a bug
 43 |     T a = 0, b = 0;  // (modulus >= 8 keeps every operand used below prereduced)
 44 |     EXPECT_TRUE(static_cast<T>(0) ==
 45 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 46 |     a = 0; b = 1;
 47 |     EXPECT_TRUE(static_cast<T>(0) ==
 48 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 49 |     EXPECT_TRUE(static_cast<T>(0) ==
 50 |                    hc::modular_multiplication_prereduced_inputs(b, a, modulus));
 51 |     a = 1; b = 1;
 52 |     EXPECT_TRUE(static_cast<T>(1) ==
 53 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 54 |     // small products that need no reduction (this requires modulus > 6)
 55 |     a = 2; b = 3;
 56 |     EXPECT_TRUE(static_cast<T>(6) ==
 57 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 58 |     EXPECT_TRUE(static_cast<T>(6) ==
 59 |                    hc::modular_multiplication_prereduced_inputs(b, a, modulus));
 60 |     EXPECT_TRUE(static_cast<T>(4) ==
 61 |                    hc::modular_multiplication_prereduced_inputs(a, a, modulus));
 62 |     // modulus - 1 is congruent to -1, so its square is congruent to 1
 63 |     a = 0; b = static_cast<T>(modulus - 1);
 64 |     EXPECT_TRUE(static_cast<T>(0) ==
 65 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 66 |     EXPECT_TRUE(static_cast<T>(0) ==
 67 |                    hc::modular_multiplication_prereduced_inputs(b, a, modulus));
 68 |     EXPECT_TRUE(static_cast<T>(1) ==
 69 |                    hc::modular_multiplication_prereduced_inputs(b, b, modulus));
 70 |     // multiplying by 1 leaves modulus - 1 unchanged
 71 |     a = 1; b = static_cast<T>(modulus - 1);
 72 |     EXPECT_TRUE(static_cast<T>(modulus - 1) ==
 73 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 74 |     EXPECT_TRUE(static_cast<T>(modulus - 1) ==
 75 |                    hc::modular_multiplication_prereduced_inputs(b, a, modulus));
 76 |     // (-1)*(-2) is congruent to 2
 77 |     a = static_cast<T>(modulus - 1);
 78 |     b = static_cast<T>(modulus - 2);
 79 |     EXPECT_TRUE(static_cast<T>(2) ==
 80 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 81 |     EXPECT_TRUE(static_cast<T>(2) ==
 82 |                    hc::modular_multiplication_prereduced_inputs(b, a, modulus));
 83 |     // (-2)*(-3) is congruent to 6 (this requires modulus > 6)
 84 |     a = static_cast<T>(modulus - 2);
 85 |     b = static_cast<T>(modulus - 3);
 86 |     EXPECT_TRUE(static_cast<T>(6) ==
 87 |                    hc::modular_multiplication_prereduced_inputs(a, b, modulus));
 88 |     EXPECT_TRUE(static_cast<T>(6) ==
 89 |                    hc::modular_multiplication_prereduced_inputs(b, a, modulus));
 90 |     // with tmp divisible by 4, (tmp/2)^2 == tmp*(tmp/4) is a multiple of tmp
 91 |     T tmp = static_cast<T>((modulus/4)*4);  // make tmp == 4n for some integer n
 92 |     a = static_cast<T>(tmp/2);
 93 |     EXPECT_TRUE(static_cast<T>(0) ==
 94 |                    hc::modular_multiplication_prereduced_inputs(a, a, tmp));
 95 |     // tmp is even and, since modulus > 7, tmp > 6;  6*(tmp/2) == 3*tmp
 96 |     tmp = static_cast<T>((modulus/2)*2);
 97 |     a = static_cast<T>(tmp/2);
 98 |     b = static_cast<T>(6);
 99 |     EXPECT_TRUE(static_cast<T>(0) ==
100 |                    hc::modular_multiplication_prereduced_inputs(a, b, tmp));
101 |     EXPECT_TRUE(static_cast<T>(0) ==
102 |                    hc::modular_multiplication_prereduced_inputs(b, a, tmp));
103 |     // 5*(tmp/2) == 2*tmp + tmp/2, which is congruent to tmp/2 (mod tmp)
104 |     b = static_cast<T>(5);
105 |     EXPECT_TRUE(a == hc::modular_multiplication_prereduced_inputs(a, b, tmp));
106 |     EXPECT_TRUE(a == hc::modular_multiplication_prereduced_inputs(b, a, tmp));
107 | }
108 | 
109 | 
110 | template <typename T>
111 | void test_modular_multiplication()
112 | {
113 |     // Small hand-checked cases come first.  Mod 13: 5*12 == 8, 5*5 == 12,
114 |     // 12*12 == 1; mixed products are checked with the operands in both orders.
115 |     T modulus = 13;
116 |     T a = 5, b = 12;
117 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(a, b, modulus) ==
118 |                    static_cast<T>(8));
119 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(b, a, modulus) ==
120 |                    static_cast<T>(8));
121 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(a, a, modulus) ==
122 |                    static_cast<T>(12));
123 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(b, b, modulus) ==
124 |                    static_cast<T>(1));
125 | 
126 |     // Mod 14: 7*8 == 56 == 4*14, so the product reduces to zero.
127 |     modulus = 14;
128 |     a = 7; b = 8;
129 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(a, b, modulus) ==
130 |                    static_cast<T>(0));
131 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(b, a, modulus) ==
132 |                    static_cast<T>(0));
133 | 
134 |     test_modulus(static_cast<T>(14));
135 |     test_modulus(static_cast<T>(15));
136 | 
137 |     // --------- Test using moduli that are likely edge cases --------
138 | 
139 |     // With modulus 1 the only prereduced operand is 0.
140 |     modulus = 1; a = 0; b = 0;
141 |     EXPECT_TRUE(hc::modular_multiplication_prereduced_inputs(a, b, modulus) ==
142 |                    static_cast<T>(0));
143 | 
144 |     // The largest modulus the type can hold, and its predecessor.
145 |     T tmax = hc::ut_numeric_limits<T>::max();
146 |     test_modulus(tmax);
147 |     test_modulus(static_cast<T>(tmax - 1));
148 | 
149 |     // The midpoint of the type's range, and its successor.
150 |     T half = static_cast<T>(tmax / 2);
151 |     test_modulus(half);
152 |     test_modulus(static_cast<T>(half + 1));
153 | }
154 | 
155 | 
156 | 
157 | TEST(ModularArithmetic, modular_multiplication) {  // runs the templated suite once per unsigned integer width
158 |     test_modular_multiplication<std::uint8_t>();
159 |     test_modular_multiplication<std::uint16_t>();
160 |     test_modular_multiplication<std::uint32_t>();
161 |     test_modular_multiplication<std::uint64_t>();
162 | #if HURCHALLA_COMPILER_HAS_UINT128_T()  // the 128 bit case is compiled only when this macro evaluates true
163 |     test_modular_multiplication<__uint128_t>();
164 | #endif
165 | }
166 | 
167 | 
168 | } // end unnamed namespace
169 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The Clockwork Modular Arithmetic library
 2 | 
 3 | ![Alt text](images/clockxtrasmall_border2.jpg?raw=true "Clock Gears, photo by Krzysztof Golik, licensed CC BY-SA 4.0")
 4 | 
 5 | Clockwork is a high performance, easy to use Modular Arithmetic library for C++ provided as a "header-only" library, supporting up to 128 bit integer types, and providing extensive support for Montgomery arithmetic.  If you want or need Montgomery arithmetic in this range, or general modular arithmetic functions, Clockwork is almost certainly the fastest and easiest library you could use.  
 6 | 
 7 | The library requires only C++11, and works with all higher versions of the C++ standard.
 8 | 
 9 | ## Design goals
10 | 
11 | Clockwork is designed to be a flexible and bulletproof library with the best performance achievable for modular arithmetic using standard C++ language integer types (e.g. uint32_t or uint64_t) and the language extension types \_\_uint128_t and \_\_int128_t.  Larger than 128 bit types are permissible by [specialization](https://github.com/hurchalla/util/blob/master/include/hurchalla/util/traits/ut_numeric_limits.h); however a library like [GMP](https://gmplib.org/) is likely to be a better choice for such sizes.
12 | 
13 | ## Requirements
14 | 
15 | The Clockwork library requires only compiler support for C++11, which is essentially supported universally at this point.  
16 | 
17 | Compilers that are confirmed to build this library without warnings or errors on Ubuntu Linux (x64) include clang6, clang10, clang18, gcc7, gcc10, gcc13, and Intel compiler 19.  On Windows, Microsoft Visual C++ 2017, 2019, 2022 are all confirmed to build the library without warnings or errors.  On macOS, clang16 and gcc14 are confirmed to build without warnings or errors.  The library is intended for use on all architectures (e.g. x86/64, ARM, RISC-V), but has so far been tested only with x86, x64 (Windows and Ubuntu), and ARM64 (macOS).
18 | 
19 | ## Status
20 | 
21 | Released.  All planned functionality and unit tests are finished and working correctly.
22 | 
23 | ## Author
24 | 
25 | * **Jeffrey Hurchalla**
26 | 
27 | ## License
28 | 
29 | This project is licensed under the MPL 2.0 License - see the [LICENSE.TXT](LICENSE.TXT) file for details.
30 | 
31 | <br/>
32 | 
33 | ## How to use the library
34 | 
35 | ### With CMake
36 | 
37 | If you're using CMake for your project and you wish to add the Clockwork modular arithmetic library to it, then clone this git repository onto your system.  In your project's CMakeLists.txt file, add the following two lines with appropriate changes to their italic portions to match your project and paths ( an easy replacement for *your_binary_dir* is ${CMAKE_CURRENT_BINARY_DIR} ):  
38 | add_subdirectory(*path_of_the_cloned_modular_arithmetic_repository* &nbsp; *your_binary_dir*/modular_arithmetic)  
39 | target_link_libraries(*your_project_target_name* &nbsp; hurchalla_modular_arithmetic)  
40 | 
41 | It may help to see a simple [example project with CMake](examples/example_with_cmake).
42 | 
43 | ### Without CMake
44 | 
45 | If you're not using CMake for your project, you'll need to install Clockwork's modular arithmetic headers and its dependencies to some directory in order to use them.  To do this, first clone this git repository onto your system.  You'll need to have CMake (at least temporarily) on your system, so install CMake if you don't have it.  Then from your shell run the following commands:  
46 | 
47 | >cd *path_of_the_cloned_modular_arithmetic_repository*  
48 | >mkdir tmp  
49 | >cd tmp  
50 | >cmake -S.. -B.  
51 | >cmake --install . --prefix *the_folder_you_want_to_install_to*  
52 | 
53 | If you prefer, for the last command you could instead use CMake's default install location (on Linux this is /usr/local) by omitting the --prefix and subsequent folder.  
54 | This will copy all the files needed for this modular arithmetic library to an "include" subfolder in the installation folder of your choosing.
55 | When compiling your project, you'll of course need to ensure that you have that include subfolder as part of your include path.  
56 | 
57 | It may help to see a simple [example](examples/example_without_cmake).
58 | 
59 | ## The API
60 | 
61 | Clockwork modular arithmetic is a header-only library, and the API is exposed by very short and simple header files (all headers not under any *detail* folder).  There are two main folder groupings: montgomery_arithmetic, and modular_arithmetic (i.e. standard non-montgomery).  A quick summary of the header files and functions is provided below; in all cases T is a template parameter of integral type.  Please view the header files for their documentation.  Probably the single most useful file is MontgomeryForm.h, discussed below.
62 | 
63 | From the modular_arithmetic group, the files *absolute_value_difference.h*, *modular_addition.h*, *modular_subtraction.h*, *modular_multiplication.h*, *modular_multiplicative_inverse.h*, and *modular_pow.h* provide the following functions, using standard (non-Montgomery) modular arithmetic:
64 | 
65 | *hurchalla::absolute_value_difference(T a, T b)*.  Returns the absolute value of (a-b), performed as if a and b are infinite precision signed ints.  
66 | *hurchalla::modular_subtraction_prereduced_inputs(T a, T b, T modulus)*.  Let a conceptual "%%" operator represent a modulo operator that always returns a non-negative remainder. This function returns (a-b) %% modulus, performed as if a and b are infinite precision signed ints.  
67 | *hurchalla::modular_addition_prereduced_inputs(T a, T b, T modulus)*.  Returns (a+b)%modulus, performed as if a and b have infinite precision and thus as if (a+b) is never subject to integer overflow.  
68 | *hurchalla::modular_multiplication_prereduced_inputs(T a, T b, T modulus)*.   Returns (a\*b)%modulus, performed as if a and b have infinite precision.  
69 | *hurchalla::modular_multiplicative_inverse(T a, T modulus)*.  Returns the multiplicative inverse of a (mod modulus) if it exists, and otherwise returns 0.  
70 | *hurchalla::modular_pow(T base, T exponent, T modulus)*.  Returns the modular exponentiation of base to the exponent (mod modulus).  
71 | 
72 | From the montgomery_arithmetic group, the file *MontgomeryForm.h* provides the easy to use (and zero cost abstraction) class *hurchalla::MontgomeryForm*, which has simple member functions for performing operations in the Montgomery domain.  These operations include converting to/from Montgomery domain, add, subtract, multiply, square, [fused-multiply-add/sub](https://jeffhurchalla.com/2022/05/01/the-montgomery-multiply-accumulate), pow, gcd, and more.  For improved performance, if you can guarantee your modulus will be under half or under a quarter of the maximum value of your integer type T, the file *montgomery_form_aliases.h* provides aliases of the class MontgomeryForm which typically run ~5-10% faster.
73 | 
74 | For an easy demonstration of MontgomeryForm, you can see one of the [examples](examples/example_without_cmake).
75 | 
76 | If you prefer not to use the high level interface of MontgomeryForm, and instead wish to directly call low level Montgomery arithmetic functions (such as REDC), the API header files within montgomery_arithmetic/low_level_api provide the essential low level functions.
77 | 
78 | ## Performance Notes
79 | 
80 | If you're interested in experimenting, defining certain macros when compiling might improve performance - see [macros_for_performance.md](macros_for_performance.md).
81 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryWrapper.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2024 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | #ifndef HURCHALLA_MONTGOMERY_ARITHMETIC_ABSTRACT_MONTGOMERY_WRAPPER_H_INCLUDED
  9 | #define HURCHALLA_MONTGOMERY_ARITHMETIC_ABSTRACT_MONTGOMERY_WRAPPER_H_INCLUDED
 10 | 
 11 | 
 12 | //#include "AbstractMontgomeryForm.h"
 13 | #include <memory>
 14 | 
 15 | namespace hurchalla {
 16 | 
 17 | // AMF should be AbstractMontgomeryForm<true> or AbstractMontgomeryForm<false>
 18 | template <class AMF>
 19 | class AbstractMontgomeryWrapper final {  // every member below simply forwards to the AMF object held in pimpl
 20 |     std::unique_ptr<const AMF> pimpl;  // sole owner of the wrapped implementation; only const access is used
 21 | public:
 22 |     using IntegerType = typename AMF::IntegerType;
 23 |     using MontgomeryValue = typename AMF::MontgomeryValue;
 24 |     using CanonicalValue = typename AMF::CanonicalValue;
 25 |     using FusingValue = typename AMF::FusingValue;
 26 | //    using RU = typename AMF::RU;
 27 | 
 28 |     explicit AbstractMontgomeryWrapper(std::unique_ptr<const AMF> pimpl_)  // takes over ownership of pimpl_
 29 |         : pimpl(std::move(pimpl_)) {}
 30 | 
 31 |     IntegerType max_modulus() const { return pimpl->max_modulus(); }
 32 |     IntegerType getModulus() const { return pimpl->getModulus(); }
 33 | 
 34 |     template <class PTAG = LowlatencyTag>  // NOTE(review): the *Tag types are neither declared nor included in this header - the includer must provide them
 35 |     MontgomeryValue convertIn(IntegerType a) const
 36 |         { return pimpl->template convertIn<PTAG>(a); }
 37 | 
 38 |     template <class PTAG = LowlatencyTag>
 39 |     IntegerType convertOut(MontgomeryValue x) const
 40 |         { return pimpl->template convertOut<PTAG>(x); }
 41 | 
 42 |     CanonicalValue getCanonicalValue(MontgomeryValue x) const
 43 |         { return pimpl->getCanonicalValue(x); }
 44 |     FusingValue getFusingValue(MontgomeryValue x) const
 45 |         { return pimpl->getFusingValue(x); }
 46 |     CanonicalValue getUnityValue() const
 47 |         { return pimpl->getUnityValue(); }
 48 |     CanonicalValue getZeroValue() const
 49 |         { return pimpl->getZeroValue(); }
 50 |     CanonicalValue getNegativeOneValue() const
 51 |         { return pimpl->getNegativeOneValue(); }
 52 |     MontgomeryValue add(MontgomeryValue x, MontgomeryValue y) const
 53 |         { return pimpl->add(x, y); }
 54 |     MontgomeryValue add(MontgomeryValue x, CanonicalValue y) const
 55 |         { return pimpl->add(x, y); }
 56 |     MontgomeryValue add(CanonicalValue x, MontgomeryValue y) const
 57 |         { return pimpl->add(x, y); }
 58 |     CanonicalValue add(CanonicalValue x, CanonicalValue y) const
 59 |         { return pimpl->add(x, y); }
 60 | 
 61 |     template <class PTAG = LowuopsTag>  // the subtract overloads default to LowuopsTag; the other tagged members here default to LowlatencyTag
 62 |     MontgomeryValue subtract(MontgomeryValue x, MontgomeryValue y) const
 63 |         { return pimpl->template subtract<PTAG>(x, y); }
 64 |     template <class PTAG = LowuopsTag>
 65 |     MontgomeryValue subtract(MontgomeryValue x, CanonicalValue y) const
 66 |         { return pimpl->template subtract<PTAG>(x, y); }
 67 |     template <class PTAG = LowuopsTag>
 68 |     MontgomeryValue subtract(CanonicalValue x, MontgomeryValue y) const
 69 |         { return pimpl->template subtract<PTAG>(x, y); }
 70 |     template <class PTAG = LowuopsTag>
 71 |     CanonicalValue subtract(CanonicalValue x, CanonicalValue y) const
 72 |         { return pimpl->template subtract<PTAG>(x, y); }
 73 | 
 74 |     MontgomeryValue unorderedSubtract(MontgomeryValue x, MontgomeryValue y) const
 75 |         { return pimpl->unorderedSubtract(x, y); }
 76 |     MontgomeryValue unorderedSubtract(MontgomeryValue x, CanonicalValue y) const
 77 |         { return pimpl->unorderedSubtract(x, y); }
 78 |     MontgomeryValue unorderedSubtract(CanonicalValue x, MontgomeryValue y) const
 79 |         { return pimpl->unorderedSubtract(x, y); }
 80 |     MontgomeryValue negate(MontgomeryValue x) const
 81 |         { return pimpl->negate(x); }
 82 |     CanonicalValue negate(CanonicalValue x) const
 83 |         { return pimpl->negate(x); }
 84 | 
 85 |     MontgomeryValue two_times(MontgomeryValue x) const
 86 |         { return pimpl->two_times(x); }
 87 |     CanonicalValue two_times(CanonicalValue x) const
 88 |         { return pimpl->two_times(x); }
 89 | 
 90 |     MontgomeryValue halve(MontgomeryValue x) const
 91 |         { return pimpl->halve(x); }
 92 |     CanonicalValue halve(CanonicalValue x) const
 93 |         { return pimpl->halve(x); }
 94 | 
 95 |     template <class PTAG = LowlatencyTag>
 96 |     MontgomeryValue multiply(MontgomeryValue x, MontgomeryValue y) const
 97 |         { return pimpl->template multiply<PTAG>(x, y); }
 98 | 
 99 |     template <class PTAG = LowlatencyTag>
100 |     MontgomeryValue multiply(MontgomeryValue x, MontgomeryValue y, bool& resultIsZero) const
101 |         { return pimpl->template multiply<PTAG>(x, y, resultIsZero); }
102 | 
103 |     template <class PTAG = LowlatencyTag>
104 |     MontgomeryValue fmsub(MontgomeryValue x, MontgomeryValue y, CanonicalValue z) const
105 |         { return pimpl->template fmsub<PTAG>(x, y, z); }
106 | 
107 |     template <class PTAG = LowlatencyTag>
108 |     MontgomeryValue fmsub(MontgomeryValue x, MontgomeryValue y, FusingValue z) const
109 |         { return pimpl->template fmsub<PTAG>(x, y, z); }
110 | 
111 |     template <class PTAG = LowlatencyTag>
112 |     MontgomeryValue fmadd(MontgomeryValue x, MontgomeryValue y, CanonicalValue z) const
113 |         { return pimpl->template fmadd<PTAG>(x, y, z); }
114 | 
115 |     template <class PTAG = LowlatencyTag>
116 |     MontgomeryValue fmadd(MontgomeryValue x, MontgomeryValue y, FusingValue z) const
117 |         { return pimpl->template fmadd<PTAG>(x, y, z); }
118 | 
119 |     template <class PTAG = LowlatencyTag>
120 |     MontgomeryValue square(MontgomeryValue x) const
121 |         { return pimpl->template square<PTAG>(x); }
122 | 
123 |     template <class PTAG = LowlatencyTag>
124 |     MontgomeryValue fusedSquareSub(MontgomeryValue x, CanonicalValue cv) const
125 |         { return pimpl->template fusedSquareSub<PTAG>(x, cv); }
126 | 
127 |     template <class PTAG = LowlatencyTag>
128 |     MontgomeryValue fusedSquareAdd(MontgomeryValue x, CanonicalValue cv) const
129 |         { return pimpl->template fusedSquareAdd<PTAG>(x, cv); }
130 | 
131 |     template <class PTAG = LowlatencyTag>
132 |     CanonicalValue inverse(MontgomeryValue x) const
133 |         { return pimpl->template inverse<PTAG>(x); }
134 | 
135 |     MontgomeryValue pow(MontgomeryValue base, IntegerType exponent) const
136 |         { return pimpl->pow(base, exponent); }
137 | 
138 |     MontgomeryValue two_pow(IntegerType exponent) const
139 |         { return pimpl->two_pow(exponent); }
140 | 
141 |     template <std::size_t NUM_BASES>  // NOTE(review): std::array, std::size_t and std::move are used, but this header includes only <memory>
142 |     std::array<MontgomeryValue, NUM_BASES>
143 |     pow(const std::array<MontgomeryValue, NUM_BASES>& bases, IntegerType exponent) const
144 |         { return pimpl->pow(bases, exponent); }
145 | 
146 |     template <class F>
147 |     IntegerType gcd_with_modulus(MontgomeryValue x, const F& gcd_functor) const
148 |         { return pimpl->gcd_with_modulus(x, gcd_functor); }
149 | 
150 |     template <class PTAG = LowlatencyTag>
151 |     IntegerType remainder(IntegerType a) const
152 |         { return pimpl->template remainder<PTAG>(a); }
153 | };
154 | 
155 | 
156 | } // end namespace
157 | 
158 | #endif
159 | 


--------------------------------------------------------------------------------
/test/modular_arithmetic/test_modular_pow.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | 
  9 | // We'll undefine HURCHALLA_DISALLOW_INLINE_ASM_MODMUL here in order to make
 10 | // modular multiplication use an inline asm function version if it is available.
 11 | // This shouldn't be strictly necessary, since there's no reason this macro
 12 | // would be defined at this point, and by default modular multiplication uses
 13 | // inline asm (if available) unless this macro is defined.
 14 | // Internally, the inline asm function will also call the generic template
 15 | // function version of modular multiplication inside a postcondition, in order
 16 | // to make sure that the asm result is correct.  Of course postcondition checks
 17 | // must be enabled for this check to occur - the easiest way to ensure
 18 | // postconditions are enabled is to define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS,
 19 | // which is why we do so here.  This is all strictly for testing purposes.
 20 | #undef HURCHALLA_DISALLOW_INLINE_ASM_MODMUL
 21 | 
 22 | #ifndef HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 23 | #  define HURCHALLA_CLOCKWORK_ENABLE_ASSERTS
 24 | #endif
 25 | 
 26 | 
 27 | #include "hurchalla/modular_arithmetic/modular_pow.h"
 28 | #include "hurchalla/modular_arithmetic/modular_multiplication.h"
 29 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 30 | #include "hurchalla/util/compiler_macros.h"
 31 | #include "gtest/gtest.h"
 32 | #include <cstdint>
 33 | 
 34 | namespace {
 35 | 
 36 | 
 37 | namespace hc = ::hurchalla;
 38 | 
 39 | template <typename T>
 40 | T brute_modular_pow(T base, T power, T modulus)
 41 | {
 42 |     // Reference result: multiply by base (mod modulus) exactly `power` times.
 43 |     T acc = 1;
 44 |     for (T remaining = power; remaining > 0; --remaining)
 45 |         acc = hc::modular_multiplication_prereduced_inputs(acc, base, modulus);
 46 |     return acc;
 47 | }
 48 | 
 49 | 
 50 | template <typename T>
 51 | void test_modulus(T modulus)
 52 | {
 53 |     // Base 0: 0^0 is expected to be 1, and every higher power is 0.
 54 |     T base = 0, power = 0;
 55 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 56 |     power = 1;
 57 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(0));
 58 |     power = 2;
 59 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(0));
 60 | 
 61 |     // Base 1: the powers 0, 1 and 2 all give 1.
 62 |     base = 1;
 63 |     for (power = 0; power < 3; ++power)
 64 |         EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 65 | 
 66 |     // Base modulus-1 is congruent to -1: odd powers give modulus-1, even give 1.
 67 |     base = static_cast<T>(modulus - 1);
 68 |     power = 0;
 69 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 70 |     power = 1;
 71 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == base);
 72 |     power = 2;
 73 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 74 |     power = 3;
 75 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == base);
 76 | 
 77 |     // Repeat the parity check with the largest even and odd exponents.
 78 |     T tmax = hc::ut_numeric_limits<T>::max();
 79 |     power = static_cast<T>(tmax - (tmax % 2));
 80 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 81 |     power = static_cast<T>(power - 1);
 82 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == base);
 83 | 
 84 |     // A base equal to the modulus is congruent to 0.
 85 |     base = modulus;
 86 |     power = 2;
 87 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(0));
 88 |     power = 5;
 89 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(0));
 90 | 
 91 |     // A base of modulus+1 (when representable) is congruent to 1.
 92 |     if (modulus != tmax) {
 93 |         base = static_cast<T>(modulus + 1);
 94 |         power = 2;
 95 |         EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 96 |         power = 5;
 97 |         EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
 98 |     }
 99 | 
100 |     // With tmp a multiple of 4, (tmp/2)^2 == tmp*(tmp/4) is 0 modulo tmp.
101 |     T tmp = static_cast<T>(modulus - (modulus % 4));
102 |     base = static_cast<T>(tmp/2); power = 2;
103 |     EXPECT_TRUE(hc::modular_pow(base, power, tmp) == static_cast<T>(0));
104 | }
105 | 
106 | 
107 | template <typename T>
108 | void test_modular_pow()
109 | {
110 |     // Hand-checked values: 5^12 == 1 and 7^6 == 12 (mod 13); 7^6 == 7 (mod 14).
111 |     T modulus = 13;
112 |     T base = 5, power = 12;
113 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
114 |     base = 7;
115 |     power = 6;
116 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(12));
117 |     modulus = 14;
118 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(7));
119 | 
120 |     // An exponent larger than the modulus: 5^53 == 5 and 6^53 == 2 (mod 13).
121 |     base = 5; power = 53; modulus = 13;
122 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(5));
123 |     base = 6;
124 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(2));
125 | 
126 |     test_modulus(static_cast<T>(13));
127 |     test_modulus(static_cast<T>(14));
128 | 
129 |     // --------- Test using moduli that are likely edge cases --------
130 | 
131 |     // Modulus 2: the result is the base's parity, except that x^0 is 1.
132 |     modulus = 2;
133 |     base = 0; power = 0;
134 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
135 |     base = 0; power = 5;
136 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(0));
137 |     base = 1; power = 0;
138 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
139 |     base = 31; power = 0;
140 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
141 |     base = 1; power = 3;
142 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
143 |     base = 17; power = 3;
144 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(1));
145 |     base = 14; power = 3;
146 |     EXPECT_TRUE(hc::modular_pow(base, power, modulus) == static_cast<T>(0));
147 | 
148 |     // The largest modulus the type can hold, and its predecessor.
149 |     T tmax = hc::ut_numeric_limits<T>::max();
150 |     test_modulus(tmax);
151 |     test_modulus(static_cast<T>(tmax - 1));
152 | 
153 |     // The midpoint of the type's range, and its successor.
154 |     T half = static_cast<T>(tmax / 2);
155 |     test_modulus(half);
156 |     test_modulus(static_cast<T>(half + 1));
157 | }
158 | 
159 | 
160 | 
161 | TEST(ModularArithmetic, modular_pow) {  // runs the templated suite once per unsigned integer width
162 |     test_modular_pow<std::uint8_t>();
163 |     test_modular_pow<std::uint16_t>();
164 |     test_modular_pow<std::uint32_t>();
165 |     test_modular_pow<std::uint64_t>();
166 | #if HURCHALLA_COMPILER_HAS_UINT128_T()  // the 128 bit case is compiled only when this macro evaluates true
167 |     test_modular_pow<__uint128_t>();
168 | #endif
169 | }
170 | 
171 | TEST(ModularArithmetic, modular_pow_large_exponents) {
172 |     // Check modular_pow against the brute force reference for a few
173 |     // (base, exponent) pairs that all share one 32 bit modulus.
174 |     std::uint32_t modulus = 2951486173u;
175 |     std::uint32_t base = 81452;
176 |     std::uint32_t exponent = 113;
177 |     std::uint32_t actual = hc::modular_pow(base, exponent, modulus);
178 |     EXPECT_TRUE(brute_modular_pow(base, exponent, modulus) == actual);
179 | 
180 |     base = 81451;  // the exponent is still 113 here
181 |     actual = hc::modular_pow(base, exponent, modulus);
182 |     EXPECT_TRUE(brute_modular_pow(base, exponent, modulus) == actual);
183 | 
184 |     exponent = 114;
185 |     actual = hc::modular_pow(base, exponent, modulus);
186 |     EXPECT_TRUE(brute_modular_pow(base, exponent, modulus) == actual);
187 | }
188 | 
189 | 
190 | } // end unnamed namespace
191 | 


--------------------------------------------------------------------------------
/test/montgomery_arithmetic/low_level_api/test_get_Rsquared_mod_n.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020-2022 Jeffrey Hurchalla.
  2 | /*
  3 |  * This Source Code Form is subject to the terms of the Mozilla Public
  4 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 |  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  6 |  */
  7 | 
  8 | // lets us get inside the black box of get_Rsquared_mod_n() to ensure that
  9 | // we test the complex compiled possibility rather than the trivial one.
 10 | #define HURCHALLA_TESTING_RSQUARED_MOD_N 1
 11 | 
 12 | #include "hurchalla/montgomery_arithmetic/low_level_api/get_Rsquared_mod_n.h"
 13 | #include "hurchalla/montgomery_arithmetic/low_level_api/inverse_mod_R.h"
 14 | #include "hurchalla/montgomery_arithmetic/low_level_api/get_R_mod_n.h"
 15 | #include "hurchalla/util/traits/safely_promote_unsigned.h"
 16 | #include "hurchalla/util/traits/ut_numeric_limits.h"
 17 | #include "gtest/gtest.h"
 18 | #include <cstdint>
 19 | #include <type_traits>
 20 | 
 21 | namespace {
 22 | 
 23 | 
 24 | using namespace ::hurchalla;
 25 | 
 26 | 
 27 | template <typename T>
 28 | void test_single_R2(T n)
 29 | {
 30 |     static_assert(ut_numeric_limits<T>::is_integer &&
 31 |                   !ut_numeric_limits<T>::is_signed, "");
 32 |     // R/4, where R == 2^(number of value bits in T)
 33 |     constexpr T Rdiv4 = static_cast<T>(1) << (ut_numeric_limits<T>::digits - 2);
 34 | 
 35 |     T rmodn = get_R_mod_n(n);
 36 |     T inv = inverse_mod_R(n);
 37 |     // inv * n must be 1 (mod R); this check also exercises inverse_mod_R.
 38 |     using P = typename safely_promote_unsigned<T>::type;
 39 |     T product = static_cast<T>(static_cast<P>(inv) * static_cast<P>(n));
 40 |     EXPECT_TRUE(product == static_cast<T>(1));
 41 |     T expected = modular_multiplication_prereduced_inputs(rmodn, rmodn, n);
 42 | 
 43 |     if (n < Rdiv4) {  // the <T, true, ...> variants are only tried for n < R/4
 44 |         T latency = get_Rsquared_mod_n<T, true, LowlatencyTag>(n, inv, rmodn);
 45 |         T uops = get_Rsquared_mod_n<T, true, LowuopsTag>(n, inv, rmodn);
 46 |         EXPECT_TRUE(expected == latency);
 47 |         EXPECT_TRUE(expected == uops);
 48 |     }
 49 |     // the <T, false, ...> variants are tried for every n
 50 |     {
 51 |         T latency = get_Rsquared_mod_n<T, false, LowlatencyTag>(n, inv, rmodn);
 52 |         T uops = get_Rsquared_mod_n<T, false, LowuopsTag>(n, inv, rmodn);
 53 |         EXPECT_TRUE(expected == latency);
 54 |         EXPECT_TRUE(expected == uops);
 55 |     }
 56 | }
 57 | 
 58 | 
 59 | template <std::size_t ARRAY_SIZE, typename T>
 60 | void test_single_R2_array(T n)
 61 | {
 62 |     // Checks the std::array overloads of get_Rsquared_mod_n on ARRAY_SIZE
 63 |     // moduli derived from n.
 64 |     static_assert(ut_numeric_limits<T>::is_integer &&
 65 |                   !ut_numeric_limits<T>::is_signed, "");
 66 |     // R/4, where R == 2^(number of value bits in T)
 67 |     constexpr T Rdiv4 = static_cast<T>(1) << (ut_numeric_limits<T>::digits - 2);
 68 | 
 69 |     using P = typename safely_promote_unsigned<T>::type;
 70 |     std::array<T, ARRAY_SIZE> a_n, a_rmn, a_invn;
 71 |     std::array<T, ARRAY_SIZE> expected;
 72 | 
 73 |     // Element k uses the modulus n - 2*k, falling back to 3 whenever
 74 |     // n - 2*k would be less than 3.
 75 |     for (std::size_t k = 0; k < ARRAY_SIZE; ++k) {
 76 |         a_n[k] = (n < 3 + 2*k) ? static_cast<T>(3) : static_cast<T>(n - 2*k);
 77 |         a_rmn[k] = get_R_mod_n(a_n[k]);
 78 |         a_invn[k] = inverse_mod_R(a_n[k]);
 79 |         expected[k] = modular_multiplication_prereduced_inputs(
 80 |                                                     a_rmn[k], a_rmn[k], a_n[k]);
 81 |         // inverse_mod_R is checked here too: inverse * modulus == 1 (mod R)
 82 |         T product = static_cast<T>(
 83 |                             static_cast<P>(a_invn[k]) * static_cast<P>(a_n[k]));
 84 |         EXPECT_TRUE(product == static_cast<T>(1));
 85 |     }
 86 | 
 87 |     // later elements only subtract from n, so (a_n[0] < Rdiv4) covers them all
 88 |     if (a_n[0] < Rdiv4) {
 89 |         auto latency = get_Rsquared_mod_n<T, ARRAY_SIZE, true, LowlatencyTag>
 90 |                                                            (a_n, a_invn, a_rmn);
 91 |         auto uops = get_Rsquared_mod_n<T, ARRAY_SIZE, true, LowuopsTag>
 92 |                                                            (a_n, a_invn, a_rmn);
 93 |         EXPECT_TRUE(expected == latency);
 94 |         EXPECT_TRUE(expected == uops);
 95 |     }
 96 |     // the <..., false, ...> variants are tried for any size a_n[k]
 97 |     {
 98 |         auto latency = get_Rsquared_mod_n<T, ARRAY_SIZE, false, LowlatencyTag>
 99 |                                                            (a_n, a_invn, a_rmn);
100 |         auto uops = get_Rsquared_mod_n<T, ARRAY_SIZE, false, LowuopsTag>
101 |                                                            (a_n, a_invn, a_rmn);
102 |         EXPECT_TRUE(expected == latency);
103 |         EXPECT_TRUE(expected == uops);
104 |     }
105 | }
106 | 
107 | 
108 | 
109 | template <typename T>
110 | void test_R2_exhaustive()
111 | {
112 |     const T tmax = ut_numeric_limits<T>::max();
113 |     EXPECT_TRUE(tmax > 0);
114 |     // n must be odd and > 1 (a get_Rsquared_mod_n precondition): try all such n
115 |     T n = (tmax % 2 != 0) ? tmax : static_cast<T>(tmax - 1);
116 |     while (n > 1) {
117 |         test_single_R2(n);
118 |         test_single_R2_array<3>(n);  // the array size 3 is arbitrary
119 |         n = static_cast<T>(n - 2);
120 |     }
121 | }
122 | 
123 | 
124 | template <typename T>
125 | void test_R2()
126 | {
127 |     // Spot-checks a fixed list of moduli: a few small odd values, three odd
128 |     // values at the top of T's range, and three values around (max/8)*2 + 1.
129 |     const T limit = ut_numeric_limits<T>::max();
130 |     EXPECT_TRUE(limit > 0);
131 |     const bool limit_is_odd = (static_cast<T>((limit/2)*2) != limit);
132 |     const T top_odd = limit_is_odd ? limit : static_cast<T>(limit - 1);
133 |     const T quarter_odd = static_cast<T>((limit/8)*2 + 1);
134 | 
135 |     // get_Rsquared_mod_n's preconditions require input n is odd and > 1.
136 |     // The moduli are tested in the order they are listed here.
137 |     const T moduli[] = {
138 |         static_cast<T>(3),
139 |         static_cast<T>(9),
140 |         static_cast<T>(11),
141 |         static_cast<T>(21),
142 |         top_odd,
143 |         static_cast<T>(top_odd - 2),
144 |         static_cast<T>(top_odd - 6),
145 |         quarter_odd,
146 |         static_cast<T>(quarter_odd + 2),
147 |         static_cast<T>(quarter_odd - 2)
148 |     };
149 |     for (T n : moduli) {
150 |         // each modulus gets the scalar check plus array sizes 1, 2 and 5
151 |         test_single_R2(n);
152 |         test_single_R2_array<1>(n);
153 |         test_single_R2_array<2>(n);
154 |         test_single_R2_array<5>(n);
155 |     }
156 | }
195 | 
196 | 
197 | 
198 | TEST(MontgomeryArithmetic, get_Rsquared_mod_N) {
199 |     test_R2<std::uint8_t>();  // test_R2: fixed list of selected moduli per type
200 |     test_R2<std::uint16_t>();
201 |     test_R2<std::uint32_t>();
202 |     test_R2<std::uint64_t>();
203 | #if HURCHALLA_COMPILER_HAS_UINT128_T()
204 |     test_R2<__uint128_t>();  // compiled only when the guard macro above is true
205 | #endif
206 |     // every odd modulus > 1; presumably limited to 8/16 bit types to bound run time
207 |     test_R2_exhaustive<std::uint8_t>();
208 |     test_R2_exhaustive<std::uint16_t>();
209 | }
210 | 
211 | 
212 | } // end unnamed namespace
213 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/timings_ARM64_M2/64_half_gcc_noasm_array.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | ./testbench_2kary.sh g++ O3 MontgomeryHalf uint64_t 191 8 22 -DTEST_ARRAY
  3 | 
  4 | compilation finished, now executing:
  5 | ---Running Program---
  6 | 
  7 | begin benchmarks - array pow
  8 | (ignore)17513345000201297728
  9 | 
 10 | OVERALL BEST:
 11 | 0.1842  3 00 x 08
 12 | 0.1859  3 01 x 08
 13 | 0.1863  3 01 x 12
 14 | 0.1874  3 01 x 07
 15 | 0.1876  3 01 x 10
 16 | 0.1877  3 00 x 07
 17 | 0.1878  3 00 x 10
 18 | 0.1881  3 00 x 12
 19 | 0.1887  3 01 x 06
 20 | 0.1897  4 00 x 08
 21 | 0.1898  3 00 x 06
 22 | 0.1901  4 01 x 08
 23 | 0.1932  4 01 x 12
 24 | 0.1941  4 00 x 07
 25 | 0.1946  4 01 x 10
 26 | 0.1948  4 00 x 12
 27 | 0.1950  4 01 x 07
 28 | 0.1954  4 00 x 06
 29 | 0.1955  3 00 x 05
 30 | 0.1955  4 00 x 10
 31 | 0.1968  4 01 x 06
 32 | 0.1982  3 01 x 05
 33 | 0.2006  2 01 x 07
 34 | 0.2016  2 00 x 08
 35 | 0.2018  2 00 x 07
 36 | 0.2030  4 00 x 05
 37 | 0.2032  2 01 x 08
 38 | 0.2044  2 00 x 12
 39 | 0.2055  2 00 x 06
 40 | 0.2060  4 01 x 05
 41 | 0.2061  2 00 x 10
 42 | 0.2065  2 01 x 10
 43 | 0.2071  2 01 x 12
 44 | 0.2084  2 01 x 06
 45 | 0.2142  2 00 x 05
 46 | 0.2148  3 00 x 04
 47 | 0.2150  3 01 x 04
 48 | 0.2154  5 00 x 08
 49 | 0.2171  2 01 x 05
 50 | 0.2179  5 01 x 08
 51 | 0.2193  4 00 x 04
 52 | 0.2206  4 01 x 04
 53 | 0.2283  5 00 x 07
 54 | 0.2284  5 00 x 12
 55 | 0.2287  5 00 x 10
 56 | 0.2292  5 01 x 12
 57 | 0.2300  5 01 x 07
 58 | 0.2305  5 01 x 06
 59 | 0.2308  5 01 x 10
 60 | 0.2325  5 00 x 06
 61 | 0.2369  5 00 x 05
 62 | 0.2391  5 01 x 05
 63 | 0.2443  2 01 x 04
 64 | 0.2463  2 00 x 04
 65 | 0.2481  5 00 x 04
 66 | 0.2502  5 01 x 04
 67 | 0.2660  3 00 x 03
 68 | 0.2679  3 01 x 03
 69 | 0.2707  4 00 x 03
 70 | 0.2727  4 01 x 03
 71 | 0.2955  2 01 x 03
 72 | 0.2961  2 00 x 03
 73 | 0.3002  5 00 x 03
 74 | 0.3020  5 01 x 03
 75 | 0.3633  4 01 x 02
 76 | 0.3634  4 00 x 02
 77 | 0.3687  3 00 x 02
 78 | 0.3688  3 01 x 02
 79 | 0.3803  5 01 x 02
 80 | 0.3804  5 00 x 02
 81 | 0.4116  2 01 x 02
 82 | 0.4118  2 00 x 02
 83 | Timings By Test Type:
 84 | 0.0549  3 00 x 08    0.0549  3 00 x 08    0.0372  3 00 x 08    0.0372  3 00 x 08
 85 | 0.0554  3 01 x 08    0.0553  3 01 x 08    0.0376  3 01 x 12    0.0375  3 01 x 08
 86 | 0.0556  4 00 x 08    0.0554  3 01 x 12    0.0376  3 01 x 08    0.0376  3 01 x 12
 87 | 0.0558  3 01 x 12    0.0555  3 01 x 10    0.0377  3 00 x 12    0.0377  3 01 x 10
 88 | 0.0558  4 01 x 08    0.0555  4 00 x 08    0.0377  3 01 x 10    0.0378  3 01 x 07
 89 | 0.0559  3 01 x 07    0.0556  3 00 x 12    0.0379  3 00 x 07    0.0378  3 00 x 12
 90 | 0.0560  3 00 x 07    0.0557  3 00 x 10    0.0379  3 01 x 07    0.0378  3 00 x 07
 91 | 0.0563  3 01 x 06    0.0557  4 01 x 08    0.0379  3 00 x 10    0.0379  3 00 x 10
 92 | 0.0564  3 00 x 10    0.0558  3 01 x 07    0.0381  3 01 x 06    0.0381  3 01 x 06
 93 | 0.0566  3 00 x 06    0.0559  3 00 x 07    0.0383  3 00 x 06    0.0383  3 00 x 06
 94 | 0.0567  3 01 x 10    0.0561  4 01 x 12    0.0392  4 00 x 08    0.0393  3 00 x 05
 95 | 0.0568  4 00 x 07    0.0562  3 01 x 06    0.0392  4 01 x 08    0.0393  4 00 x 08
 96 | 0.0569  3 00 x 12    0.0565  4 01 x 10    0.0393  3 00 x 05    0.0393  4 01 x 08
 97 | 0.0570  4 01 x 07    0.0566  4 00 x 12    0.0398  4 01 x 12    0.0399  3 01 x 05
 98 | 0.0571  4 00 x 06    0.0566  3 00 x 06    0.0399  3 01 x 05    0.0399  4 01 x 12
 99 | 0.0574  4 01 x 12    0.0567  4 00 x 10    0.0400  2 01 x 07    0.0400  2 01 x 07
100 | 0.0575  4 01 x 06    0.0567  4 00 x 07    0.0402  2 00 x 07    0.0401  4 00 x 12
101 | 0.0576  4 01 x 10    0.0571  4 01 x 07    0.0402  2 00 x 08    0.0402  2 00 x 07
102 | 0.0577  4 00 x 10    0.0571  4 00 x 06    0.0402  4 00 x 12    0.0403  2 00 x 08
103 | 0.0578  4 00 x 12    0.0575  4 01 x 06    0.0403  4 01 x 10    0.0403  4 00 x 07
104 | 0.0584  3 00 x 05    0.0585  3 00 x 05    0.0404  2 00 x 12    0.0403  4 01 x 10
105 | 0.0592  3 01 x 05    0.0592  3 01 x 05    0.0404  4 00 x 07    0.0403  2 01 x 08
106 | 0.0596  4 00 x 05    0.0596  4 00 x 05    0.0404  4 00 x 10    0.0404  4 01 x 07
107 | 0.0603  2 01 x 07    0.0603  2 01 x 07    0.0405  4 01 x 07    0.0405  4 00 x 06
108 | 0.0604  4 01 x 05    0.0605  4 01 x 05    0.0405  4 00 x 06    0.0406  2 00 x 12
109 | 0.0605  2 00 x 08    0.0606  2 00 x 08    0.0405  2 01 x 08    0.0407  4 00 x 10
110 | 0.0608  2 00 x 07    0.0607  2 00 x 07    0.0408  2 00 x 10    0.0408  2 01 x 10
111 | 0.0615  2 01 x 08    0.0608  2 01 x 08    0.0408  4 01 x 06    0.0408  4 01 x 06
112 | 0.0616  5 00 x 08    0.0610  2 00 x 12    0.0408  2 01 x 10    0.0409  2 00 x 06
113 | 0.0618  2 00 x 06    0.0617  5 00 x 08    0.0409  2 00 x 06    0.0410  2 01 x 12
114 | 0.0624  5 01 x 08    0.0617  2 01 x 10    0.0411  2 01 x 12    0.0410  2 00 x 10
115 | 0.0624  2 00 x 12    0.0617  2 00 x 10    0.0415  2 01 x 06    0.0415  2 01 x 06
116 | 0.0626  2 00 x 10    0.0618  2 00 x 06    0.0419  4 00 x 05    0.0419  4 00 x 05
117 | 0.0627  2 01 x 06    0.0622  2 01 x 12    0.0426  4 01 x 05    0.0426  4 01 x 05
118 | 0.0629  2 01 x 12    0.0622  5 01 x 08    0.0427  2 00 x 05    0.0427  2 00 x 05
119 | 0.0632  2 01 x 10    0.0627  2 01 x 06    0.0428  3 00 x 04    0.0428  3 00 x 04
120 | 0.0644  2 00 x 05    0.0644  2 00 x 05    0.0430  3 01 x 04    0.0430  3 01 x 04
121 | 0.0645  3 01 x 04    0.0644  5 00 x 10    0.0433  2 01 x 05    0.0433  2 01 x 05
122 | 0.0646  3 00 x 04    0.0645  5 00 x 12    0.0448  4 00 x 04    0.0447  4 00 x 04
123 | 0.0649  4 00 x 04    0.0645  3 01 x 04    0.0451  4 01 x 04    0.0451  4 01 x 04
124 | 0.0650  5 00 x 07    0.0646  3 00 x 04    0.0461  5 00 x 08    0.0461  5 00 x 08
125 | 0.0652  4 01 x 04    0.0646  5 01 x 12    0.0466  5 01 x 08    0.0467  5 01 x 08
126 | 0.0653  2 01 x 05    0.0649  4 00 x 04    0.0484  2 01 x 04    0.0484  2 01 x 04
127 | 0.0655  5 01 x 06    0.0649  5 00 x 07    0.0487  5 00 x 12    0.0487  2 00 x 04
128 | 0.0655  5 01 x 07    0.0652  4 01 x 04    0.0487  2 00 x 04    0.0489  5 00 x 12
129 | 0.0663  5 00 x 10    0.0653  2 01 x 05    0.0490  5 01 x 12    0.0490  5 00 x 10
130 | 0.0663  5 00 x 12    0.0653  5 01 x 07    0.0490  5 00 x 10    0.0491  5 01 x 12
131 | 0.0663  5 00 x 06    0.0654  5 01 x 10    0.0492  5 00 x 07    0.0492  5 00 x 07
132 | 0.0665  5 01 x 12    0.0656  5 01 x 06    0.0493  5 01 x 10    0.0493  5 01 x 10
133 | 0.0669  5 01 x 10    0.0662  5 00 x 06    0.0496  5 01 x 07    0.0496  5 01 x 07
134 | 0.0677  5 00 x 05    0.0677  5 00 x 05    0.0497  5 01 x 06    0.0497  5 01 x 06
135 | 0.0683  5 01 x 05    0.0683  5 01 x 05    0.0500  5 00 x 06    0.0500  5 00 x 06
136 | 0.0719  5 00 x 04    0.0720  5 00 x 04    0.0508  5 00 x 05    0.0507  5 00 x 05
137 | 0.0725  5 01 x 04    0.0725  5 01 x 04    0.0513  5 01 x 05    0.0513  5 01 x 05
138 | 0.0738  2 01 x 04    0.0738  2 01 x 04    0.0521  5 00 x 04    0.0521  5 00 x 04
139 | 0.0744  2 00 x 04    0.0744  2 00 x 04    0.0525  3 00 x 03    0.0525  3 00 x 03
140 | 0.0805  4 00 x 03    0.0804  3 00 x 03    0.0526  5 01 x 04    0.0526  5 01 x 04
141 | 0.0805  3 00 x 03    0.0806  4 00 x 03    0.0531  3 01 x 03    0.0530  3 01 x 03
142 | 0.0809  3 01 x 03    0.0809  3 01 x 03    0.0548  4 00 x 03    0.0548  4 00 x 03
143 | 0.0811  4 01 x 03    0.0811  4 01 x 03    0.0553  4 01 x 03    0.0553  4 01 x 03
144 | 0.0874  5 00 x 03    0.0873  5 00 x 03    0.0582  2 01 x 03    0.0582  2 01 x 03
145 | 0.0878  5 01 x 03    0.0878  5 01 x 03    0.0583  2 00 x 03    0.0583  2 00 x 03
146 | 0.0896  2 01 x 03    0.0895  2 01 x 03    0.0627  5 00 x 03    0.0627  5 00 x 03
147 | 0.0898  2 00 x 03    0.0898  2 00 x 03    0.0632  5 01 x 03    0.0632  5 01 x 03
148 | 0.1098  4 01 x 02    0.1095  4 01 x 02    0.0720  3 00 x 02    0.0720  4 01 x 02
149 | 0.1099  4 00 x 02    0.1095  4 00 x 02    0.0720  4 00 x 02    0.0720  4 00 x 02
150 | 0.1124  3 01 x 02    0.1122  3 00 x 02    0.0720  4 01 x 02    0.0720  3 01 x 02
151 | 0.1125  3 00 x 02    0.1123  3 01 x 02    0.0720  3 01 x 02    0.0721  3 00 x 02
152 | 0.1134  5 00 x 02    0.1130  5 01 x 02    0.0769  5 01 x 02    0.0769  5 01 x 02
153 | 0.1135  5 01 x 02    0.1132  5 00 x 02    0.0769  5 00 x 02    0.0770  5 00 x 02
154 | 0.1260  2 01 x 02    0.1255  2 01 x 02    0.0800  2 00 x 02    0.0800  2 01 x 02
155 | 0.1260  2 00 x 02    0.1256  2 00 x 02    0.0800  2 01 x 02    0.0802  2 00 x 02
156 | ---Benchmark Program Finished---
157 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/timings_ARM64_M2/64_half_clang_noasm_array.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | ./testbench_2kary.sh clang++ O3 MontgomeryHalf uint64_t 191 8 22 -DTEST_ARRAY
  3 | 
  4 | compilation finished, now executing:
  5 | ---Running Program---
  6 | 
  7 | begin benchmarks - array pow
  8 | (ignore)17513345000201297728
  9 | 
 10 | OVERALL BEST:
 11 | 0.1760  3 00 x 08
 12 | 0.1776  3 01 x 08
 13 | 0.1822  4 00 x 08
 14 | 0.1836  4 01 x 08
 15 | 0.1894  3 00 x 12
 16 | 0.1894  3 00 x 10
 17 | 0.1896  2 00 x 08
 18 | 0.1898  3 01 x 10
 19 | 0.1902  2 01 x 08
 20 | 0.1920  4 01 x 12
 21 | 0.1925  4 00 x 10
 22 | 0.1930  4 01 x 10
 23 | 0.1931  3 01 x 12
 24 | 0.1947  3 01 x 07
 25 | 0.1949  4 00 x 12
 26 | 0.1954  3 00 x 07
 27 | 0.1958  4 01 x 07
 28 | 0.1967  4 00 x 07
 29 | 0.1995  3 01 x 06
 30 | 0.1999  4 00 x 06
 31 | 0.2000  4 01 x 06
 32 | 0.2003  3 00 x 06
 33 | 0.2053  3 01 x 05
 34 | 0.2056  3 00 x 05
 35 | 0.2075  4 01 x 05
 36 | 0.2091  2 00 x 12
 37 | 0.2092  4 00 x 05
 38 | 0.2102  2 01 x 10
 39 | 0.2103  5 00 x 08
 40 | 0.2104  2 00 x 10
 41 | 0.2105  5 01 x 08
 42 | 0.2109  2 01 x 12
 43 | 0.2169  2 01 x 07
 44 | 0.2183  2 00 x 07
 45 | 0.2184  5 01 x 12
 46 | 0.2189  5 00 x 10
 47 | 0.2195  5 01 x 10
 48 | 0.2210  5 01 x 07
 49 | 0.2211  5 00 x 12
 50 | 0.2214  3 00 x 04
 51 | 0.2215  5 00 x 07
 52 | 0.2218  3 01 x 04
 53 | 0.2247  5 00 x 06
 54 | 0.2249  5 01 x 06
 55 | 0.2257  2 01 x 06
 56 | 0.2265  4 00 x 04
 57 | 0.2267  4 01 x 04
 58 | 0.2270  2 00 x 06
 59 | 0.2332  5 01 x 05
 60 | 0.2338  5 00 x 05
 61 | 0.2371  2 00 x 05
 62 | 0.2374  2 01 x 05
 63 | 0.2431  2 00 x 04
 64 | 0.2440  2 01 x 04
 65 | 0.2513  5 00 x 04
 66 | 0.2520  5 01 x 04
 67 | 0.2767  4 00 x 03
 68 | 0.2772  4 01 x 03
 69 | 0.2795  3 00 x 03
 70 | 0.2796  3 01 x 03
 71 | 0.2988  5 01 x 03
 72 | 0.2989  5 00 x 03
 73 | 0.3030  2 00 x 03
 74 | 0.3033  2 01 x 03
 75 | 0.3617  4 00 x 02
 76 | 0.3624  4 01 x 02
 77 | 0.3708  3 00 x 02
 78 | 0.3708  3 01 x 02
 79 | 0.3835  5 01 x 02
 80 | 0.3836  5 00 x 02
 81 | 0.4153  2 00 x 02
 82 | 0.4156  2 01 x 02
 83 | Timings By Test Type:
 84 | 0.0523  3 00 x 08    0.0523  3 00 x 08    0.0357  3 00 x 08    0.0357  3 00 x 08
 85 | 0.0528  3 01 x 08    0.0527  3 01 x 08    0.0361  3 01 x 08    0.0361  3 01 x 08
 86 | 0.0532  4 00 x 08    0.0532  4 00 x 08    0.0379  4 00 x 08    0.0379  4 00 x 08
 87 | 0.0536  4 01 x 08    0.0536  4 01 x 08    0.0380  2 00 x 08    0.0380  2 00 x 08
 88 | 0.0560  4 01 x 12    0.0560  4 01 x 12    0.0382  4 01 x 08    0.0381  4 01 x 08
 89 | 0.0562  3 00 x 12    0.0562  3 00 x 12    0.0382  2 01 x 08    0.0381  2 01 x 08
 90 | 0.0563  3 00 x 10    0.0562  3 00 x 10    0.0384  3 00 x 10    0.0384  3 00 x 10
 91 | 0.0563  4 00 x 10    0.0563  4 00 x 10    0.0385  3 00 x 12    0.0385  3 00 x 12
 92 | 0.0564  3 01 x 10    0.0563  3 01 x 10    0.0385  3 01 x 10    0.0385  3 01 x 10
 93 | 0.0564  4 01 x 10    0.0564  4 01 x 10    0.0394  3 01 x 07    0.0394  3 01 x 07
 94 | 0.0568  4 00 x 12    0.0568  4 00 x 12    0.0395  3 01 x 12    0.0395  3 01 x 12
 95 | 0.0568  2 00 x 08    0.0568  2 00 x 08    0.0395  3 00 x 07    0.0395  3 00 x 07
 96 | 0.0569  2 01 x 08    0.0570  2 01 x 08    0.0400  4 00 x 10    0.0400  4 00 x 10
 97 | 0.0571  3 01 x 12    0.0571  3 01 x 12    0.0400  4 01 x 12    0.0400  4 01 x 12
 98 | 0.0574  4 01 x 07    0.0574  4 01 x 07    0.0401  4 01 x 10    0.0401  4 01 x 10
 99 | 0.0577  4 00 x 07    0.0577  4 00 x 07    0.0403  3 01 x 06    0.0403  3 01 x 06
100 | 0.0579  3 01 x 07    0.0579  3 01 x 07    0.0404  3 00 x 06    0.0404  3 00 x 06
101 | 0.0581  3 00 x 07    0.0583  3 00 x 07    0.0405  4 01 x 07    0.0405  4 01 x 07
102 | 0.0587  4 00 x 06    0.0587  4 00 x 06    0.0406  4 00 x 07    0.0407  4 00 x 07
103 | 0.0587  4 01 x 06    0.0587  4 01 x 06    0.0407  4 00 x 12    0.0407  4 00 x 12
104 | 0.0595  3 01 x 06    0.0595  3 01 x 06    0.0412  4 00 x 06    0.0413  4 00 x 06
105 | 0.0597  3 00 x 06    0.0597  3 00 x 06    0.0413  4 01 x 06    0.0413  4 01 x 06
106 | 0.0600  5 00 x 08    0.0600  5 00 x 08    0.0414  3 01 x 05    0.0414  3 01 x 05
107 | 0.0600  5 01 x 08    0.0600  5 01 x 08    0.0415  3 00 x 05    0.0415  3 00 x 05
108 | 0.0610  4 01 x 05    0.0610  4 01 x 05    0.0419  2 00 x 12    0.0419  2 00 x 12
109 | 0.0612  3 01 x 05    0.0612  3 01 x 05    0.0421  2 01 x 10    0.0421  2 01 x 10
110 | 0.0613  3 00 x 05    0.0613  3 00 x 05    0.0422  2 00 x 10    0.0421  2 00 x 10
111 | 0.0615  4 00 x 05    0.0615  4 00 x 05    0.0423  2 01 x 12    0.0423  2 01 x 12
112 | 0.0622  5 01 x 12    0.0622  5 01 x 12    0.0428  4 01 x 05    0.0427  4 01 x 05
113 | 0.0624  5 00 x 10    0.0625  5 00 x 10    0.0431  4 00 x 05    0.0431  4 00 x 05
114 | 0.0626  2 00 x 12    0.0626  2 00 x 12    0.0434  2 01 x 07    0.0434  2 01 x 07
115 | 0.0626  5 01 x 10    0.0627  5 01 x 10    0.0435  2 00 x 07    0.0436  2 00 x 07
116 | 0.0629  5 00 x 12    0.0629  5 00 x 12    0.0443  3 00 x 04    0.0442  3 00 x 04
117 | 0.0630  2 01 x 10    0.0630  2 01 x 10    0.0444  3 01 x 04    0.0444  3 01 x 04
118 | 0.0630  2 00 x 10    0.0630  2 00 x 10    0.0450  2 01 x 06    0.0449  2 01 x 06
119 | 0.0632  2 01 x 12    0.0631  2 01 x 12    0.0452  5 00 x 08    0.0452  5 00 x 08
120 | 0.0632  5 01 x 07    0.0632  5 01 x 07    0.0453  2 00 x 06    0.0452  5 01 x 08
121 | 0.0634  5 00 x 07    0.0634  5 00 x 07    0.0453  5 01 x 08    0.0452  2 00 x 06
122 | 0.0644  5 00 x 06    0.0644  5 00 x 06    0.0463  4 00 x 04    0.0463  4 00 x 04
123 | 0.0645  5 01 x 06    0.0645  5 01 x 06    0.0463  4 01 x 04    0.0463  4 01 x 04
124 | 0.0651  2 01 x 07    0.0651  2 01 x 07    0.0470  5 00 x 10    0.0469  5 00 x 10
125 | 0.0656  2 00 x 07    0.0656  2 00 x 07    0.0470  5 01 x 12    0.0470  5 01 x 12
126 | 0.0665  3 00 x 04    0.0664  3 00 x 04    0.0471  5 01 x 10    0.0471  5 01 x 10
127 | 0.0665  3 01 x 04    0.0666  3 01 x 04    0.0471  2 00 x 05    0.0471  2 00 x 05
128 | 0.0670  4 00 x 04    0.0670  4 00 x 04    0.0473  2 01 x 05    0.0473  2 01 x 05
129 | 0.0670  4 01 x 04    0.0670  4 01 x 04    0.0473  5 01 x 07    0.0473  5 01 x 07
130 | 0.0670  5 01 x 05    0.0671  5 01 x 05    0.0474  5 00 x 07    0.0473  5 00 x 07
131 | 0.0672  5 00 x 05    0.0672  5 00 x 05    0.0476  5 00 x 12    0.0476  5 00 x 12
132 | 0.0679  2 01 x 06    0.0679  2 01 x 06    0.0479  5 00 x 06    0.0480  5 00 x 06
133 | 0.0683  2 00 x 06    0.0683  2 00 x 06    0.0480  5 01 x 06    0.0480  5 01 x 06
134 | 0.0714  2 00 x 05    0.0714  2 00 x 05    0.0481  2 00 x 04    0.0482  2 00 x 04
135 | 0.0714  2 01 x 05    0.0715  2 01 x 05    0.0483  2 01 x 04    0.0483  2 01 x 04
136 | 0.0728  5 00 x 04    0.0728  5 00 x 04    0.0495  5 01 x 05    0.0495  5 01 x 05
137 | 0.0730  5 01 x 04    0.0730  5 01 x 04    0.0497  5 00 x 05    0.0497  5 00 x 05
138 | 0.0734  2 00 x 04    0.0734  2 00 x 04    0.0529  5 00 x 04    0.0529  5 00 x 04
139 | 0.0737  2 01 x 04    0.0737  2 01 x 04    0.0530  5 01 x 04    0.0530  5 01 x 04
140 | 0.0823  4 00 x 03    0.0823  4 00 x 03    0.0552  3 00 x 03    0.0552  3 00 x 03
141 | 0.0825  4 01 x 03    0.0825  4 01 x 03    0.0553  3 01 x 03    0.0552  3 01 x 03
142 | 0.0845  3 01 x 03    0.0845  3 00 x 03    0.0560  4 00 x 03    0.0560  4 00 x 03
143 | 0.0846  3 00 x 03    0.0846  3 01 x 03    0.0561  4 01 x 03    0.0561  4 01 x 03
144 | 0.0874  5 01 x 03    0.0874  5 00 x 03    0.0597  2 00 x 03    0.0597  2 00 x 03
145 | 0.0874  5 00 x 03    0.0874  5 01 x 03    0.0598  2 01 x 03    0.0598  2 01 x 03
146 | 0.0918  2 00 x 03    0.0918  2 00 x 03    0.0620  5 01 x 03    0.0620  5 01 x 03
147 | 0.0918  2 01 x 03    0.0919  2 01 x 03    0.0620  5 00 x 03    0.0620  5 00 x 03
148 | 0.1092  4 00 x 02    0.1094  4 00 x 02    0.0716  4 00 x 02    0.0715  4 00 x 02
149 | 0.1094  4 01 x 02    0.1096  4 01 x 02    0.0718  4 01 x 02    0.0717  4 01 x 02
150 | 0.1128  3 01 x 02    0.1128  3 00 x 02    0.0726  3 01 x 02    0.0725  3 01 x 02
151 | 0.1128  3 00 x 02    0.1129  3 01 x 02    0.0726  3 00 x 02    0.0726  3 00 x 02
152 | 0.1141  5 01 x 02    0.1141  5 01 x 02    0.0777  5 00 x 02    0.0777  5 00 x 02
153 | 0.1141  5 00 x 02    0.1141  5 00 x 02    0.0777  5 01 x 02    0.0777  5 01 x 02
154 | 0.1268  2 01 x 02    0.1268  2 00 x 02    0.0809  2 00 x 02    0.0809  2 00 x 02
155 | 0.1268  2 00 x 02    0.1268  2 01 x 02    0.0810  2 01 x 02    0.0810  2 01 x 02
156 | ---Benchmark Program Finished---
157 | 


--------------------------------------------------------------------------------
/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/timings_ARM64_M2/64_quarter_gcc_noasm_array.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | ./testbench_2kary.sh g++ O3 MontgomeryQuarter uint64_t 191 8 22 -DTEST_ARRAY
  3 | 
  4 | compilation finished, now executing:
  5 | ---Running Program---
  6 | 
  7 | begin benchmarks - array pow
  8 | (ignore)368336231662783440
  9 | 
 10 | OVERALL BEST:
 11 | 0.1804  3 01 x 08
 12 | 0.1813  3 00 x 07
 13 | 0.1813  3 00 x 08
 14 | 0.1815  3 01 x 10
 15 | 0.1821  3 01 x 12
 16 | 0.1824  3 00 x 12
 17 | 0.1831  3 01 x 07
 18 | 0.1846  3 00 x 06
 19 | 0.1850  4 00 x 08
 20 | 0.1850  4 01 x 08
 21 | 0.1854  3 00 x 10
 22 | 0.1893  3 01 x 06
 23 | 0.1896  4 01 x 10
 24 | 0.1900  4 00 x 10
 25 | 0.1910  4 01 x 07
 26 | 0.1920  4 01 x 06
 27 | 0.1921  4 00 x 07
 28 | 0.1922  4 01 x 12
 29 | 0.1924  4 00 x 12
 30 | 0.1928  3 00 x 05
 31 | 0.1954  3 01 x 05
 32 | 0.1964  2 01 x 12
 33 | 0.1972  2 01 x 10
 34 | 0.1974  4 00 x 06
 35 | 0.1976  2 01 x 07
 36 | 0.1979  2 00 x 08
 37 | 0.1979  2 01 x 08
 38 | 0.1993  2 00 x 07
 39 | 0.1995  2 00 x 12
 40 | 0.2010  2 00 x 10
 41 | 0.2015  2 00 x 06
 42 | 0.2016  4 00 x 05
 43 | 0.2019  2 01 x 06
 44 | 0.2024  4 01 x 05
 45 | 0.2075  2 00 x 05
 46 | 0.2091  2 01 x 05
 47 | 0.2132  3 00 x 04
 48 | 0.2159  3 01 x 04
 49 | 0.2167  5 00 x 08
 50 | 0.2169  5 01 x 08
 51 | 0.2180  4 00 x 04
 52 | 0.2199  4 01 x 04
 53 | 0.2236  5 00 x 12
 54 | 0.2247  5 01 x 12
 55 | 0.2252  5 00 x 07
 56 | 0.2254  5 01 x 10
 57 | 0.2263  5 00 x 10
 58 | 0.2293  5 01 x 06
 59 | 0.2301  5 01 x 07
 60 | 0.2331  5 00 x 06
 61 | 0.2357  5 00 x 05
 62 | 0.2372  5 01 x 05
 63 | 0.2416  2 00 x 04
 64 | 0.2437  2 01 x 04
 65 | 0.2482  5 00 x 04
 66 | 0.2503  5 01 x 04
 67 | 0.2641  3 01 x 03
 68 | 0.2652  3 00 x 03
 69 | 0.2657  4 00 x 03
 70 | 0.2663  4 01 x 03
 71 | 0.2874  2 00 x 03
 72 | 0.2892  2 01 x 03
 73 | 0.2961  5 00 x 03
 74 | 0.2966  5 01 x 03
 75 | 0.3581  4 01 x 02
 76 | 0.3583  4 00 x 02
 77 | 0.3640  3 00 x 02
 78 | 0.3645  3 01 x 02
 79 | 0.3743  5 01 x 02
 80 | 0.3754  5 00 x 02
 81 | 0.4051  2 00 x 02
 82 | 0.4056  2 01 x 02
 83 | Timings By Test Type:
 84 | 0.0535  3 01 x 08    0.0536  3 01 x 08    0.0365  3 00 x 08    0.0365  3 00 x 08
 85 | 0.0538  3 00 x 07    0.0537  3 00 x 07    0.0367  3 01 x 08    0.0367  3 01 x 08
 86 | 0.0539  3 01 x 10    0.0538  3 01 x 10    0.0369  3 00 x 07    0.0369  3 00 x 07
 87 | 0.0539  4 01 x 08    0.0539  4 01 x 08    0.0369  3 01 x 10    0.0369  3 01 x 10
 88 | 0.0540  3 01 x 12    0.0540  4 00 x 08    0.0370  3 01 x 12    0.0370  3 01 x 12
 89 | 0.0540  4 00 x 08    0.0541  3 00 x 12    0.0371  3 00 x 12    0.0371  3 00 x 12
 90 | 0.0541  3 00 x 12    0.0541  3 00 x 08    0.0373  3 01 x 07    0.0373  3 01 x 07
 91 | 0.0542  3 00 x 08    0.0541  3 01 x 12    0.0375  3 00 x 06    0.0375  3 00 x 06
 92 | 0.0543  3 01 x 07    0.0543  3 01 x 07    0.0376  3 00 x 10    0.0376  3 00 x 10
 93 | 0.0548  3 00 x 06    0.0548  3 00 x 06    0.0384  4 00 x 08    0.0385  3 01 x 06
 94 | 0.0551  3 00 x 10    0.0551  4 01 x 10    0.0385  3 01 x 06    0.0385  4 00 x 08
 95 | 0.0551  4 01 x 10    0.0551  3 00 x 10    0.0386  4 01 x 08    0.0386  4 01 x 08
 96 | 0.0552  4 00 x 10    0.0553  4 00 x 10    0.0391  3 00 x 05    0.0392  3 00 x 05
 97 | 0.0555  4 01 x 07    0.0556  4 01 x 07    0.0396  2 01 x 12    0.0396  2 01 x 12
 98 | 0.0558  4 01 x 06    0.0558  4 00 x 07    0.0397  4 01 x 10    0.0396  3 01 x 05
 99 | 0.0558  4 00 x 07    0.0559  4 01 x 12    0.0397  3 01 x 05    0.0397  4 01 x 10
100 | 0.0560  4 00 x 12    0.0559  4 01 x 06    0.0398  4 00 x 10    0.0397  4 00 x 10
101 | 0.0561  4 01 x 12    0.0560  4 00 x 12    0.0398  2 01 x 10    0.0398  2 01 x 07
102 | 0.0562  3 01 x 06    0.0561  3 01 x 06    0.0399  2 01 x 07    0.0398  2 01 x 10
103 | 0.0573  3 00 x 05    0.0573  3 00 x 05    0.0399  4 01 x 07    0.0398  2 00 x 08
104 | 0.0576  4 00 x 06    0.0576  4 00 x 06    0.0399  2 01 x 08    0.0399  2 01 x 08
105 | 0.0580  3 01 x 05    0.0581  3 01 x 05    0.0400  2 00 x 08    0.0399  4 01 x 07
106 | 0.0586  2 01 x 12    0.0587  2 01 x 12    0.0401  2 00 x 07    0.0401  4 01 x 12
107 | 0.0588  2 01 x 10    0.0589  2 01 x 10    0.0401  4 01 x 12    0.0402  4 01 x 06
108 | 0.0589  4 00 x 05    0.0589  4 00 x 05    0.0401  4 01 x 06    0.0402  2 00 x 07
109 | 0.0590  2 00 x 08    0.0589  2 01 x 07    0.0402  4 00 x 12    0.0402  4 00 x 12
110 | 0.0590  4 01 x 05    0.0590  2 01 x 08    0.0402  4 00 x 07    0.0402  4 00 x 07
111 | 0.0590  2 01 x 08    0.0590  2 00 x 08    0.0403  2 00 x 12    0.0402  2 00 x 12
112 | 0.0591  2 01 x 07    0.0590  4 01 x 05    0.0404  2 00 x 10    0.0404  2 00 x 10
113 | 0.0595  2 00 x 07    0.0595  2 00 x 12    0.0406  2 00 x 06    0.0405  2 00 x 06
114 | 0.0595  2 00 x 12    0.0595  2 00 x 07    0.0407  2 01 x 06    0.0406  2 01 x 06
115 | 0.0602  2 00 x 10    0.0600  2 00 x 10    0.0411  4 00 x 06    0.0411  4 00 x 06
116 | 0.0602  2 00 x 06    0.0602  2 00 x 06    0.0417  2 00 x 05    0.0417  2 00 x 05
117 | 0.0603  2 01 x 06    0.0603  2 01 x 06    0.0419  4 00 x 05    0.0419  4 00 x 05
118 | 0.0616  5 01 x 08    0.0616  5 01 x 08    0.0420  2 01 x 05    0.0420  2 01 x 05
119 | 0.0617  5 00 x 08    0.0617  5 00 x 08    0.0421  4 01 x 05    0.0421  4 01 x 05
120 | 0.0620  2 00 x 05    0.0621  2 00 x 05    0.0428  3 00 x 04    0.0428  3 00 x 04
121 | 0.0625  2 01 x 05    0.0625  2 01 x 05    0.0435  3 01 x 04    0.0434  3 01 x 04
122 | 0.0635  5 01 x 12    0.0633  5 00 x 12    0.0448  4 00 x 04    0.0448  4 00 x 04
123 | 0.0636  5 00 x 12    0.0636  5 01 x 12    0.0453  4 01 x 04    0.0453  4 01 x 04
124 | 0.0636  5 00 x 07    0.0637  5 00 x 07    0.0466  5 00 x 08    0.0467  5 00 x 08
125 | 0.0638  3 00 x 04    0.0638  5 01 x 10    0.0468  5 01 x 08    0.0468  5 01 x 08
126 | 0.0641  5 01 x 10    0.0638  3 00 x 04    0.0483  5 00 x 12    0.0484  2 00 x 04
127 | 0.0642  4 00 x 04    0.0642  5 00 x 10    0.0484  2 00 x 04    0.0484  5 00 x 12
128 | 0.0643  5 00 x 10    0.0642  4 00 x 04    0.0487  2 01 x 04    0.0486  5 01 x 12
129 | 0.0645  3 01 x 04    0.0645  3 01 x 04    0.0488  5 01 x 10    0.0487  5 01 x 10
130 | 0.0647  4 01 x 04    0.0647  4 01 x 04    0.0488  5 00 x 10    0.0487  2 01 x 04
131 | 0.0649  5 01 x 06    0.0650  5 01 x 06    0.0489  5 00 x 07    0.0489  5 00 x 07
132 | 0.0650  5 01 x 07    0.0650  5 01 x 07    0.0490  5 01 x 12    0.0490  5 00 x 10
133 | 0.0663  5 00 x 06    0.0663  5 00 x 06    0.0496  5 01 x 06    0.0498  5 01 x 06
134 | 0.0671  5 00 x 05    0.0671  5 00 x 05    0.0501  5 01 x 07    0.0500  5 01 x 07
135 | 0.0675  5 01 x 05    0.0675  5 01 x 05    0.0503  5 00 x 06    0.0503  5 00 x 06
136 | 0.0718  5 00 x 04    0.0717  5 00 x 04    0.0508  5 00 x 05    0.0508  5 00 x 05
137 | 0.0722  5 01 x 04    0.0722  5 01 x 04    0.0511  5 01 x 05    0.0511  5 01 x 05
138 | 0.0724  2 00 x 04    0.0725  2 00 x 04    0.0524  5 00 x 04    0.0524  5 00 x 04
139 | 0.0731  2 01 x 04    0.0731  2 01 x 04    0.0525  3 01 x 03    0.0526  3 01 x 03
140 | 0.0786  4 00 x 03    0.0786  4 00 x 03    0.0528  3 00 x 03    0.0528  3 00 x 03
141 | 0.0787  4 01 x 03    0.0788  4 01 x 03    0.0529  5 01 x 04    0.0530  5 01 x 04
142 | 0.0795  3 01 x 03    0.0795  3 01 x 03    0.0542  4 00 x 03    0.0543  4 00 x 03
143 | 0.0798  3 00 x 03    0.0799  3 00 x 03    0.0544  4 01 x 03    0.0544  4 01 x 03
144 | 0.0857  5 00 x 03    0.0858  5 00 x 03    0.0572  2 00 x 03    0.0572  2 00 x 03
145 | 0.0857  5 01 x 03    0.0858  5 01 x 03    0.0576  2 01 x 03    0.0576  2 01 x 03
146 | 0.0865  2 00 x 03    0.0865  2 00 x 03    0.0623  5 00 x 03    0.0623  5 00 x 03
147 | 0.0870  2 01 x 03    0.0869  2 01 x 03    0.0625  5 01 x 03    0.0625  5 01 x 03
148 | 0.1074  4 01 x 02    0.1074  4 01 x 02    0.0716  4 01 x 02    0.0716  4 00 x 02
149 | 0.1075  4 00 x 02    0.1075  4 00 x 02    0.0716  4 00 x 02    0.0717  4 01 x 02
150 | 0.1103  3 00 x 02    0.1102  3 00 x 02    0.0718  3 00 x 02    0.0718  3 00 x 02
151 | 0.1103  3 01 x 02    0.1104  3 01 x 02    0.0719  3 01 x 02    0.0719  3 01 x 02
152 | 0.1108  5 01 x 02    0.1108  5 01 x 02    0.0763  5 01 x 02    0.0764  5 01 x 02
153 | 0.1114  5 00 x 02    0.1110  5 00 x 02    0.0765  5 00 x 02    0.0765  5 00 x 02
154 | 0.1228  2 00 x 02    0.1229  2 01 x 02    0.0797  2 00 x 02    0.0797  2 00 x 02
155 | 0.1232  2 01 x 02    0.1229  2 00 x 02    0.0797  2 01 x 02    0.0797  2 01 x 02
156 | ---Benchmark Program Finished---
157 | 


--------------------------------------------------------------------------------