├── .clang-format
├── .cmake-format.yaml
├── .github
    └── workflows
    │   ├── benchmark.yml
    │   ├── ci.yml
    │   ├── coverage.yml
    │   └── multiarch.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
    ├── Config.cmake.in
    └── ConfigureTarget.cmake
├── docs
    └── README.template.md
├── mpi
    ├── CMakeLists.txt
    ├── mpi-prime.c
    ├── mpi-rsa.c
    ├── mpi-rsa.h
    ├── mpi.c
    └── mpi.h
├── mpn
    ├── CMakeLists.txt
    ├── asm
    │   ├── asmdefs.inc
    │   ├── ia_32e.inc
    │   ├── ia_common.inc
    │   ├── ia_emm.inc
    │   ├── intel64
    │   │   ├── bn_uaddadd_m7as.asm
    │   │   ├── bn_uaddsub_m7as.asm
    │   │   ├── bn_um7.inc
    │   │   ├── bn_umul.inc
    │   │   ├── bn_umul_basic.inc
    │   │   ├── bn_umul_fix.inc
    │   │   ├── bn_umulpp.inc
    │   │   ├── bn_umulpp_basic.inc
    │   │   ├── bn_umulpp_fix.inc
    │   │   ├── bn_umulschool.inc
    │   │   ├── bn_usqr.inc
    │   │   ├── bn_usqr_basic.inc
    │   │   ├── bn_usqrpp.inc
    │   │   ├── bn_usqrpp_basic.inc
    │   │   ├── bn_usqrschool.inc
    │   │   ├── clear_regs.inc
    │   │   ├── cpinitas.asm
    │   │   ├── emulator.inc
    │   │   ├── ia_32e_regs.inc
    │   │   ├── memcpy.inc
    │   │   ├── mont_mul1024_avx2as.asm
    │   │   ├── mont_mul_avx2as.asm
    │   │   ├── mont_sqr1024_avx2as.asm
    │   │   ├── mont_sqr_avx2as.asm
    │   │   ├── mpi_mont_reduction_m7as.asm
    │   │   ├── mpi_uadd_m7as.asm
    │   │   ├── mpi_udiv_u32_m7as.asm
    │   │   ├── mpi_uinc_udec_m7as.asm
    │   │   ├── mpi_umul_acc_m7as.asm
    │   │   ├── mpi_umul_m7as.asm
    │   │   ├── mpi_umul_usqr_redc_srvl9.asm
    │   │   ├── mpi_umul_usqr_redc_srvl9pp.asm
    │   │   ├── mpi_usqr_m7as.asm
    │   │   ├── mpi_usub_m7as.asm
    │   │   ├── mred.inc
    │   │   ├── mred_basic.inc
    │   │   ├── mred_pp.inc
    │   │   ├── mred_pp_basic.inc
    │   │   ├── mulx.inc
    │   │   ├── os.inc
    │   │   ├── reg_sizes.inc
    │   │   ├── variant.inc
    │   │   └── variant_txt_acm.inc
    │   ├── montgomery-avx2.c
    │   ├── montgomery-avx512.c
    │   └── utils.inc
    ├── mpn-asm.c
    ├── mpn-asm.h
    ├── mpn-binary.c
    ├── mpn-binary.h
    ├── mpn-conf.h
    ├── mpn-montgomery.c
    ├── mpn-montgomery.h
    ├── mpn-optimizer.c
    └── mpn-optimizer.h
└── tests
    ├── CMakeLists.txt
    ├── benchmark.cpp
    ├── ini.h
    ├── logger.h
    ├── mpi-compiler.h
    ├── mpn-division.c
    ├── nameof.h
    ├── profiler.h
    ├── tabulate.h
    ├── test.cc
    └── unittest-mpi.cpp


/.clang-format:
--------------------------------------------------------------------------------
  1 | # configured with https://zed0.co.uk/clang-format-configurator
  2 | 
  3 | ---
  4 | Language: Cpp
  5 | AccessModifierOffset: '-2'
  6 | AlignAfterOpenBracket: Align
  7 | AlignConsecutiveMacros: 'true'
  8 | AlignConsecutiveAssignments: 'false'
  9 | AlignConsecutiveDeclarations: 'false'
 10 | AlignEscapedNewlines: Left
 11 | AlignOperands: 'true'
 12 | AlignTrailingComments: 'true'
 13 | AllowAllArgumentsOnNextLine: 'true'
 14 | AllowAllConstructorInitializersOnNextLine: 'true'
 15 | AllowAllParametersOfDeclarationOnNextLine: 'true'
 16 | AllowShortBlocksOnASingleLine: 'false'
 17 | AllowShortCaseLabelsOnASingleLine: 'false'
 18 | AllowShortFunctionsOnASingleLine: Empty
 19 | AllowShortIfStatementsOnASingleLine: WithoutElse
 20 | AllowShortLambdasOnASingleLine: None
 21 | AllowShortLoopsOnASingleLine: 'true'
 22 | AlwaysBreakAfterDefinitionReturnType: None
 23 | AlwaysBreakAfterReturnType: None
 24 | AlwaysBreakBeforeMultilineStrings: 'false'
 25 | AlwaysBreakTemplateDeclarations: 'Yes'
 26 | BinPackArguments: 'true'
 27 | BinPackParameters: 'true'
 28 | BraceWrapping:
 29 |   AfterCaseLabel: 'false'
 30 |   AfterClass: 'false'
 31 |   AfterControlStatement: 'false'
 32 |   AfterEnum: 'false'
 33 |   AfterFunction: 'true'
 34 |   AfterNamespace: 'true'
 35 |   AfterObjCDeclaration: 'false'
 36 |   AfterStruct: 'false'
 37 |   AfterUnion: 'false'
 38 |   AfterExternBlock: 'false'
 39 |   BeforeCatch: 'false'
 40 |   BeforeElse: 'false'
 41 |   IndentBraces: 'false'
 42 |   SplitEmptyFunction: 'true'
 43 |   SplitEmptyRecord: 'true'
 44 |   SplitEmptyNamespace: 'true'
 45 | BreakBeforeBinaryOperators: NonAssignment
 46 | BreakBeforeBraces: Custom
 47 | BreakBeforeTernaryOperators: 'true'
 48 | BreakConstructorInitializers: BeforeColon
 49 | BreakInheritanceList: BeforeColon
 50 | BreakStringLiterals: 'true'
 51 | ColumnLimit: '120'
 52 | CompactNamespaces: 'false'
 53 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
 54 | ConstructorInitializerIndentWidth: '4'
 55 | ContinuationIndentWidth: '4'
 56 | Cpp11BracedListStyle: 'true'
 57 | DerivePointerAlignment: 'false'
 58 | DisableFormat: 'false'
 59 | ExperimentalAutoDetectBinPacking: 'false'
 60 | FixNamespaceComments: 'true'
 61 | ForEachMacros: ['foreach', 'FOREACH', 'RANGES_FOR', 'hlist_for_each_entry_continue', 'hlist_for_each_entry', 'hlist_for_each_entry_from', 'hlist_for_each_entry_safe', 'hlist_for_each_safe', 'list_for_each_entry', 'list_for_each_entry_continue', 'list_for_each_entry_continue_reverse', 'list_for_each_entry_from', 'list_for_each_entry_reverse', 'list_for_each_entry_safe', 'list_for_each_entry_safe_continue', 'list_for_each_entry_safe_from', 'list_for_each_entry_safe_reverse', 'list_for_each_from', 'list_for_each_prev', 'list_for_each_prev_safe', 'list_for_each_safe']
 62 | TypenameMacros: ['STACK_OF', 'LIST']
 63 | IncludeBlocks: Regroup
 64 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 65 | IndentCaseLabels: 'true'
 66 | IndentPPDirectives: None
 67 | IndentWidth: '4'
 68 | IndentWrappedFunctionNames: 'false'
 69 | KeepEmptyLinesAtTheStartOfBlocks: 'false'
 70 | MaxEmptyLinesToKeep: '3'
 71 | NamespaceIndentation: None
 72 | PenaltyBreakAssignment: '2'
 73 | PenaltyBreakBeforeFirstCallParameter: '1'
 74 | PenaltyBreakComment: '300'
 75 | PenaltyBreakFirstLessLess: '120'
 76 | PenaltyBreakString: '1000'
 77 | PenaltyBreakTemplateDeclaration: '10'
 78 | PenaltyExcessCharacter: '1000000'
 79 | PenaltyReturnTypeOnItsOwnLine: '500'
 80 | PointerAlignment: Right
 81 | RawStringFormats:
 82 |   - Language: Cpp
 83 |     Delimiters:
 84 |       - 'cc'
 85 |       - 'CC'
 86 |       - 'cpp'
 87 |       - 'Cpp'
 88 |       - 'CPP'
 89 |       - 'c++'
 90 |       - 'C++'
 91 |     CanonicalDelimiter: ''
 92 |     BasedOnStyle: google
 93 |   - Language: TextProto
 94 |     Delimiters:
 95 |       - 'pb'
 96 |       - 'PB'
 97 |       - 'proto'
 98 |       - 'PROTO'
 99 |     EnclosingFunctions:
100 |       - EqualsProto
101 |       - EquivToProto
102 |       - PARSE_PARTIAL_TEXT_PROTO
103 |       - PARSE_TEST_PROTO
104 |       - PARSE_TEXT_PROTO
105 |       - ParseTextOrDie
106 |       - ParseTextProtoOrDie
107 |     CanonicalDelimiter: ''
108 |     BasedOnStyle: google
109 | ReflowComments: 'true'
110 | SortIncludes: 'false'
111 | SortUsingDeclarations: 'false'
112 | SpaceAfterCStyleCast: 'false'
113 | SpaceAfterLogicalNot: 'false'
114 | SpaceAfterTemplateKeyword: 'true'
115 | SpaceBeforeAssignmentOperators: 'true'
116 | SpaceBeforeCpp11BracedList: 'false'
117 | SpaceBeforeCtorInitializerColon: 'true'
118 | SpaceBeforeInheritanceColon: 'true'
119 | SpaceBeforeParens: ControlStatements
120 | SpaceBeforeRangeBasedForLoopColon: 'true'
121 | SpaceInEmptyParentheses: 'false'
122 | SpacesBeforeTrailingComments: '1'
123 | SpacesInAngles: 'false'
124 | SpacesInCStyleCastParentheses: 'false'
125 | SpacesInContainerLiterals: 'false'
126 | SpacesInParentheses: 'false'
127 | SpacesInSquareBrackets: 'false'
128 | Standard: Auto
129 | StatementMacros: ['__maybe_unused']
130 | TabWidth: '4'
131 | UseTab: Never
132 | ...
133 | 


--------------------------------------------------------------------------------
/.cmake-format.yaml:
--------------------------------------------------------------------------------
  1 | _help_parse: Options affecting listfile parsing
  2 | parse:
  3 |   _help_additional_commands:
  4 |     - Specify structure for custom cmake functions
  5 |   additional_commands:
  6 |     APPEND_TO_LISTS:
  7 |       kwargs:
  8 |         LISTS: "*"
  9 |         VALUES: "*"
 10 |     target_sources:
 11 |       flags:
 12 |         - PUBLIC
 13 |         - PRIVATE
 14 |   _help_vartags:
 15 |     - Specify variable tags.
 16 |   vartags: []
 17 |   _help_proptags:
 18 |     - Specify property tags.
 19 |   proptags: []
 20 | _help_format: Options affecting formatting.
 21 | format:
 22 |   _help_line_width:
 23 |     - How wide to allow formatted cmake files
 24 |   line_width: 80
 25 |   _help_tab_size:
 26 |     - How many spaces to tab for indent
 27 |   tab_size: 2
 28 |   _help_max_subgroups_hwrap:
 29 |     - If an argument group contains more than this many sub-groups
 30 |     - (parg or kwarg groups) then force it to a vertical layout.
 31 |   max_subgroups_hwrap: 6
 32 |   _help_max_pargs_hwrap:
 33 |     - If a positional argument group contains more than this many
 34 |     - arguments, then force it to a vertical layout.
 35 |   max_pargs_hwrap: 8
 36 |   _help_max_rows_cmdline:
 37 |     - If a cmdline positional group consumes more than this many
 38 |     - lines without nesting, then invalidate the layout (and nest)
 39 |   max_rows_cmdline: 6
 40 |   _help_separate_ctrl_name_with_space:
 41 |     - If true, separate flow control names from their parentheses
 42 |     - with a space
 43 |   separate_ctrl_name_with_space: true
 44 |   _help_separate_fn_name_with_space:
 45 |     - If true, separate function names from parentheses with a
 46 |     - space
 47 |   separate_fn_name_with_space: false
 48 |   _help_dangle_parens:
 49 |     - If a statement is wrapped to more than one line, then dangle
 50 |     - the closing parenthesis on its own line.
 51 |   dangle_parens: true
 52 |   _help_dangle_align:
 53 |     - If the trailing parenthesis must be 'dangled' on its own
 54 |     - "line, then align it to this reference: `prefix`: the start"
 55 |     - "of the statement, `prefix-indent`: the start of the"
 56 |     - "statement, plus one indentation level, `child`: align to"
 57 |     - the column of the arguments
 58 |   dangle_align: prefix
 59 |   _help_min_prefix_chars:
 60 |     - If the statement spelling length (including space and
 61 |     - parenthesis) is smaller than this amount, then force reject
 62 |     - nested layouts.
 63 |   min_prefix_chars: 4
 64 |   _help_max_prefix_chars:
 65 |     - If the statement spelling length (including space and
 66 |     - parenthesis) is larger than the tab width by more than this
 67 |     - amount, then force reject un-nested layouts.
 68 |   max_prefix_chars: 10
 69 |   _help_max_lines_hwrap:
 70 |     - If a candidate layout is wrapped horizontally but it exceeds
 71 |     - this many lines, then reject the layout.
 72 |   max_lines_hwrap: 10
 73 |   _help_line_ending:
 74 |     - What style line endings to use in the output.
 75 |   line_ending: unix
 76 |   _help_command_case:
 77 |     - Format command names consistently as 'lower' or 'upper' case
 78 |   command_case: upper
 79 |   _help_keyword_case:
 80 |     - Format keywords consistently as 'lower' or 'upper' case
 81 |   keyword_case: upper
 82 |   _help_always_wrap:
 83 |     - A list of command names which should always be wrapped
 84 |   always_wrap: []
 85 |   _help_enable_sort:
 86 |     - If true, the argument lists which are known to be sortable
 87 |     - will be sorted lexicographically
 88 |   enable_sort: true
 89 |   _help_autosort:
 90 |     - If true, the parsers may infer whether or not an argument
 91 |     - list is sortable (without annotation).
 92 |   autosort: false
 93 |   _help_require_valid_layout:
 94 |     - By default, if cmake-format cannot successfully fit
 95 |     - everything into the desired linewidth it will apply the
 96 |     - last, most aggressive attempt that it made. If this flag is
 97 |     - True, however, cmake-format will print error, exit with non-
 98 |     - zero status code, and write-out nothing
 99 |   require_valid_layout: false
100 |   _help_layout_passes:
101 |     - A dictionary mapping layout nodes to a list of wrap
102 |     - decisions. See the documentation for more information.
103 |   layout_passes: {}
104 | _help_markup: Options affecting comment reflow and formatting.
105 | markup:
106 |   _help_bullet_char:
107 |     - What character to use for bulleted lists
108 |   bullet_char: "*"
109 |   _help_enum_char:
110 |     - What character to use as punctuation after numerals in an
111 |     - enumerated list
112 |   enum_char: .
113 |   _help_first_comment_is_literal:
114 |     - If comment markup is enabled, don't reflow the first comment
115 |     - block in each listfile. Use this to preserve formatting of
116 |     - your copyright/license statements.
117 |   first_comment_is_literal: false
118 |   _help_literal_comment_pattern:
119 |     - If comment markup is enabled, don't reflow any comment block
120 |     - which matches this (regex) pattern. Default is `None`
121 |     - (disabled).
122 |   literal_comment_pattern: null
123 |   _help_fence_pattern:
124 |     - Regular expression to match preformat fences in comments
125 |     - default= ``r'^\s*([`~]{3}[`~]*)(.*)$'``
126 |   fence_pattern: ^\s*([`~]{3}[`~]*)(.*)$
127 |   _help_ruler_pattern:
128 |     - Regular expression to match rulers in comments default=
129 |     - '``r''^\s*[^\w\s]{3}.*[^\w\s]{3}$''``'
130 |   ruler_pattern: ^\s*[^\w\s]{3}.*[^\w\s]{3}$
131 |   _help_explicit_trailing_pattern:
132 |     - If a comment line starts with this pattern then it
133 |     - is explicitly a trailing comment for the preceding
134 |     - argument. Default is '#<'
135 |   explicit_trailing_pattern: "#<"
136 |   _help_hashruler_min_length:
137 |     - If a comment line starts with at least this many consecutive
138 |     - hash characters, then don't lstrip() them off. This allows
139 |     - for lazy hash rulers where the first hash char is not
140 |     - separated by space
141 |   hashruler_min_length: 10
142 |   _help_canonicalize_hashrulers:
143 |     - If true, then insert a space between the first hash char and
144 |     - remaining hash chars in a hash ruler, and normalize its
145 |     - length to fill the column
146 |   canonicalize_hashrulers: true
147 |   _help_enable_markup:
148 |     - enable comment markup parsing and reflow
149 |   enable_markup: false
150 | _help_lint: Options affecting the linter
151 | lint:
152 |   _help_disabled_codes:
153 |     - a list of lint codes to disable
154 |   disabled_codes: []
155 |   _help_function_pattern:
156 |     - regular expression pattern describing valid function names
157 |   function_pattern: "[0-9a-z_]+"
158 |   _help_macro_pattern:
159 |     - regular expression pattern describing valid macro names
160 |   macro_pattern: "[0-9A-Z_]+"
161 |   _help_global_var_pattern:
162 |     - regular expression pattern describing valid names for
163 |     - variables with global (cache) scope
164 |   global_var_pattern: "[A-Z][0-9A-Z_]+"
165 |   _help_internal_var_pattern:
166 |     - regular expression pattern describing valid names for
167 |     - variables with global scope (but internal semantic)
168 |   internal_var_pattern: _[A-Z][0-9A-Z_]+
169 |   _help_local_var_pattern:
170 |     - regular expression pattern describing valid names for
171 |     - variables with local scope
172 |   local_var_pattern: "[a-z][a-z0-9_]+"
173 |   _help_private_var_pattern:
174 |     - regular expression pattern describing valid names for
175 |     - private directory variables
176 |   private_var_pattern: _[0-9a-z_]+
177 |   _help_public_var_pattern:
178 |     - regular expression pattern describing valid names for public
179 |     - directory variables
180 |   public_var_pattern: "[A-Z][0-9A-Z_]+"
181 |   _help_argument_var_pattern:
182 |     - regular expression pattern describing valid names for
183 |     - function/macro arguments and loop variables.
184 |   argument_var_pattern: "[a-z][a-z0-9_]+"
185 |   _help_keyword_pattern:
186 |     - regular expression pattern describing valid names for
187 |     - keywords used in functions or macros
188 |   keyword_pattern: "[A-Z][0-9A-Z_]+"
189 |   _help_max_conditionals_custom_parser:
190 |     - In the heuristic for C0201, how many conditionals to match
191 |     - within a loop in before considering the loop a parser.
192 |   max_conditionals_custom_parser: 2
193 |   _help_min_statement_spacing:
194 |     - Require at least this many newlines between statements
195 |   min_statement_spacing: 1
196 |   _help_max_statement_spacing:
197 |     - Require no more than this many newlines between statements
198 |   max_statement_spacing: 2
199 |   max_returns: 6
200 |   max_branches: 12
201 |   max_arguments: 5
202 |   max_localvars: 15
203 |   max_statements: 50
204 | _help_encode: Options affecting file encoding
205 | encode:
206 |   _help_emit_byteorder_mark:
207 |     - If true, emit the unicode byte-order mark (BOM) at the start
208 |     - of the file
209 |   emit_byteorder_mark: false
210 |   _help_input_encoding:
211 |     - Specify the encoding of the input file. Defaults to utf-8
212 |   input_encoding: utf-8
213 |   _help_output_encoding:
214 |     - Specify the encoding of the output file. Defaults to utf-8.
215 |     - Note that cmake only claims to support utf-8 so be careful
216 |     - when using anything else
217 |   output_encoding: utf-8
218 | _help_misc: Miscellaneous configurations options.
219 | misc:
220 |   _help_per_command:
221 |     - A dictionary containing any per-command configuration
222 |     - overrides. Currently only `command_case` is supported.
223 |   per_command: {}
224 | 


--------------------------------------------------------------------------------
/.github/workflows/benchmark.yml:
--------------------------------------------------------------------------------
 1 | name: benchmark
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: [ubuntu-latest]
 8 | 
 9 |     steps:
10 |       - uses: actions/checkout@v2
11 | 
12 |       - name: Install requirements
13 |         run: |
14 |           sudo apt-get update -q -y
15 |           sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake
16 | 
17 |       - name: Configure
18 |         run: cmake -B ${{github.workspace}}/build -DMPN_NO_ASM=ON -DBUILD_VENDOR=ON
19 | 
20 |       - name: Build
21 |         run: cmake --build ${{github.workspace}}/build
22 | 
23 |       - name: Run Benchmark
24 |         working-directory: ${{github.workspace}}/build
25 |         run: |
26 |           ${{github.workspace}}/build/tests/benchmark | tee ${{github.workspace}}/build/benchmark.txt
27 |           cat ${{github.workspace}}/docs/README.template.md > ${{github.workspace}}/README.md
28 |           echo -e '## Benchmark(libmpi VS openssl)\n' >> ${{github.workspace}}/README.md
29 |           awk '/-----BEGIN MARKDOWN TABLE-----/{ f = 1; next } /-----END MARKDOWN TABLE-----/{ f = 0 } f' benchmark.txt >> ${{github.workspace}}/README.md
30 |           git add ${{github.workspace}}/README.md
31 | 
32 |       - name: Commit files
33 |         run: |
34 |           git config --local user.email "github-actions[bot]@users.noreply.github.com"
35 |           git config --local user.name "github-actions[bot]"
36 |           git commit -m "Update performance data" -a
37 | 
38 |       - name: Push changes
39 |         uses: ad-m/github-push-action@master
40 |         with:
41 |           github_token: ${{ secrets.GITHUB_TOKEN }}
42 |           branch: ${{ github.ref }}
43 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: ${{ matrix.distro }}
 8 | 
 9 |     strategy:
10 |       matrix:
11 |         distro: [ubuntu-latest, macos-latest]
12 | 
13 |     steps:
14 |       - uses: actions/checkout@v2
15 | 
16 |       - name: Install requirements  # installs build deps; on macOS also exports LDFLAGS/CPPFLAGS as step outputs
17 |         id: requirements
18 |         run: |
19 |           case "${{ matrix.distro }}" in
20 |             ubuntu*|jessie|stretch|buster|bullseye)
21 |               sudo apt-get update -q -y
22 |               sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake
23 |               ;;
24 |             macos*)
25 |               brew install nasm googletest openssl@1.1 openssl@3
26 |               brew link openssl --force
27 |               echo "LDFLAGS=-L/usr/local/opt/openssl@1.1/lib" >> "$GITHUB_OUTPUT"
28 |               echo "CPPFLAGS=-I/usr/local/opt/openssl@1.1/include" >> "$GITHUB_OUTPUT"
29 |               ;;
30 |             fedora*)
31 |               sudo dnf -y update
32 |               sudo dnf -y install gcc g++ git nasm gtest openssl cmake
33 |               ;;
34 |             alpine*)
35 |               apk update
36 |               apk add gcc g++ git nasm gtest openssl cmake
37 |               ;;
38 |           esac
39 | 
40 |       - name: Configure
41 |         run: cmake -B ${{github.workspace}}/build -DCMAKE_VERBOSE_MAKEFILE=ON -DMPN_NO_ASM=ON -DCMAKE_CXX_FLAGS=${{ steps.requirements.outputs.CPPFLAGS }} -DCMAKE_EXE_LINKER_FLAGS=${{ steps.requirements.outputs.LDFLAGS }}
42 | 
43 |       - name: Build
44 |         run: cmake --build ${{github.workspace}}/build
45 | 


--------------------------------------------------------------------------------
/.github/workflows/coverage.yml:
--------------------------------------------------------------------------------
 1 | name: coverage
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: [ubuntu-latest]
 8 | 
 9 |     steps:
10 |       - uses: actions/checkout@v2
11 | 
12 |       - name: Install requirements
13 |         run: |
14 |           sudo apt-get update -q -y
15 |           sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake
16 | 
17 |       - name: Configure
18 |         run: cmake -B ${{github.workspace}}/build -DGCOV=ON -DCMAKE_BUILD_TYPE=Debug -DMPN_NO_ASM=ON
19 | 
20 |       - name: Build
21 |         run: cmake --build ${{github.workspace}}/build
22 | 
23 |       - name: Run Test
24 |         working-directory: ${{github.workspace}}/build
25 |         run: |
26 |           make test || true
27 |           ${{github.workspace}}/build/tests/benchmark || true
28 | 
29 |       - name: Upload To CodeCov
30 |         run: bash <(curl -s https://codecov.io/bash)
31 | 


--------------------------------------------------------------------------------
/.github/workflows/multiarch.yml:
--------------------------------------------------------------------------------
 1 | name: multiarch
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   build_job:
 7 |     # The host should always be linux (the ubuntu-18.04 hosted runner image has been retired)
 8 |     runs-on: ubuntu-latest
 9 |     name: Build on ${{ matrix.distro }} ${{ matrix.arch }}
10 | 
11 |     # Run steps on a matrix of arch/distro combinations (two enabled, two commented out)
12 |     strategy:
13 |       matrix:
14 |         include:
15 |           - arch: aarch64
16 |             distro: ubuntu18.04
17 |           # - arch: armv7
18 |           #   distro: ubuntu20.04
19 |           - arch: s390x
20 |             distro: fedora_latest
21 |           # - arch: ppc64le
22 |           #   distro: alpine_latest
23 | 
24 |     steps:
25 |       - uses: actions/checkout@v2.1.0
26 |       - name: Building
27 |         uses: uraimo/run-on-arch-action@v2.1.1
28 |         id: build
29 |         with:
30 |           arch: ${{ matrix.arch }}
31 |           distro: ${{ matrix.distro }}
32 | 
33 |           # Not required, but speeds up builds
34 |           githubToken: ${{ github.token }}
35 | 
36 |           # Mount the github.workspace directory as /workspace in the container
37 |           dockerRunArgs: |
38 |             --volume "${{ github.workspace }}:/workspace"
39 | 
40 |           # Pass some environment variables to the container
41 |           env: |
42 |             workspace: /workspace
43 | 
44 |           # The shell to run commands with in the container
45 |           shell: /bin/bash
46 | 
47 |           # Install some dependencies in the container. This speeds up builds if
48 |           # you are also using githubToken. Any dependencies installed here will
49 |           # be part of the container image that gets cached, so subsequent
50 |           # builds don't have to re-install them. The image layer is cached
51 |           # publicly in your project's package repository, so it is vital that
52 |           # no secrets are present in the container state or logs.
53 |           install: |
54 |             case "${{ matrix.distro }}" in
55 |               ubuntu*|jessie|stretch|buster|bullseye)
56 |                 apt-get update -q -y
57 |                 apt-get install -q -y gcc g++ nasm libgtest-dev openssl cmake
58 |                 ;;
59 |               macos*)
60 |                 brew update
61 |                 brew install nasm googletest openssl
62 |                 brew link openssl --force
63 |                 export LDFLAGS="-L/usr/local/opt/openssl@1.1/lib"
64 |                 export CPPFLAGS="-I/usr/local/opt/openssl@1.1/include"
65 |                 ;;
66 |               fedora*)
67 |                 dnf -y update
68 |                 dnf -y install gcc g++ nasm gtest openssl cmake
69 |                 ;;
70 |               alpine*)
71 |                 apk update
72 |                 apk add gcc g++ nasm gtest openssl cmake
73 |                 ;;
74 |             esac
75 | 
76 |           # Configure and Build
77 |           run: |
78 |             mkdir -p ${workspace}/build && cd ${workspace}/build
79 |             cmake .. && make
80 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | # ignore directories
35 | build/**
36 | .vscode/**
37 | vendor/**
38 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | fail_fast: false
 2 | 
 3 | repos:
 4 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 5 |     rev: v4.2.0
 6 |     hooks:
 7 |       - id: check-added-large-files
 8 |       - id: trailing-whitespace
 9 |         args: [--markdown-linebreak-ext=md]
10 |       - id: check-merge-conflict
11 |       - id: check-json
12 |       - id: check-yaml
13 |         args: [--allow-multiple-document]
14 |       - id: check-case-conflict
15 |       - id: check-symlinks
16 |       - id: end-of-file-fixer
17 |       - id: pretty-format-json
18 |   - repo: https://github.com/doublify/pre-commit-clang-format  # https: GitHub no longer serves unauthenticated git://
19 |     rev: 62302476d0da01515660132d76902359bed0f782
20 |     hooks:
21 |       - id: clang-format
22 |         entry: clang-format
23 |         language: system
24 |         files: \.(c|cc|cxx|cpp|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|proto|vert)$
25 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
  2 | PROJECT("Cryptograph Algorithms Implementation")
  3 | 
  4 | INCLUDE(CMakePackageConfigHelpers)
  5 | INCLUDE(cmake/ConfigureTarget.cmake)
  6 | 
  7 | ADD_COMPILE_OPTIONS(-Wno-deprecated-declarations)
  8 | 
  9 | IF (NOT DEFINED ARCH)
 10 |   # Derive ARCH from CMAKE_SYSTEM_PROCESSOR unless it was passed via -DARCH.
 11 |   IF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
 12 |     SET(ARCH "x86_64")
 13 |   ELSEIF (CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
 14 |     # cmake reports AMD64 on Windows, but we might be building for 32-bit.
 15 |     IF (CMAKE_SIZEOF_VOID_P EQUAL 8)
 16 |       SET(ARCH "x86_64")
 17 |     ELSE ()
 18 |       SET(ARCH "x86")
 19 |     ENDIF ()
 20 |   ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i386|i686)$")
 21 |     SET(ARCH "x86")
 22 |   ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|arm64e)$")
 23 |     SET(ARCH "aarch64")
 24 |   ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
 25 |     SET(ARCH "arm")
 26 |   ELSE ()
 27 |     SET(ARCH "generic")
 28 |     MESSAGE(STATUS "Unknown processor: ${CMAKE_SYSTEM_PROCESSOR}")
 29 |   ENDIF ()
 30 | ENDIF ()
 31 | 
 32 | IF (UNIX)  # Choose the perlasm output flavour and assembler flags per host.
 33 |   IF (${ARCH} STREQUAL "aarch64")
 34 |     IF (APPLE)
 35 |       SET(PERLASM_STYLE ios64)
 36 |     ELSE ()
 37 |       SET(PERLASM_STYLE linux64)
 38 |     ENDIF ()
 39 |   ELSEIF (${ARCH} STREQUAL "arm")
 40 |     IF (APPLE)
 41 |       SET(PERLASM_STYLE ios32)
 42 |     ELSE ()
 43 |       SET(PERLASM_STYLE linux32)
 44 |     ENDIF ()
 45 |   ELSE ()  # Every other ARCH value (x86, x86_64, generic, ...).
 46 |     IF (${ARCH} STREQUAL "x86")
 47 |       SET(PERLASM_FLAGS "-fPIC -DCRYPTO_IA32_SSE2")
 48 |     ENDIF ()
 49 |     IF (APPLE)
 50 |       SET(PERLASM_STYLE macosx)
 51 |     ELSE ()
 52 |       SET(PERLASM_STYLE elf)
 53 |     ENDIF ()
 54 |   ENDIF ()
 55 |   SET(ASM_EXT S)
 56 |   ENABLE_LANGUAGE(ASM)
 57 |   SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,--noexecstack")  # -Wa, forwards the option to the assembler.
 58 | 
 59 |   # Clang's integrated assembler does not support debug symbols.
 60 |   IF (NOT CMAKE_ASM_COMPILER_ID MATCHES "Clang")
 61 |     SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,-g")
 62 |   ENDIF ()
 63 | 
 64 |   # CMake does not add -isysroot and -arch flags to assembly.
 65 |   IF (APPLE)
 66 |     IF (CMAKE_OSX_SYSROOT)
 67 |       SET(CMAKE_ASM_FLAGS
 68 |           "${CMAKE_ASM_FLAGS} -isysroot \"${CMAKE_OSX_SYSROOT}\""
 69 |       )
 70 |     ENDIF ()
 71 |     FOREACH (arch ${CMAKE_OSX_ARCHITECTURES})
 72 |       SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -arch ${arch}")
 73 |     ENDFOREACH ()
 74 |   ENDIF ()
 75 | ELSE ()  # Non-UNIX hosts: NASM-flavoured output.
 76 |   IF (${ARCH} STREQUAL "x86_64")
 77 |     SET(PERLASM_STYLE nasm)
 78 |   ELSE ()
 79 |     SET(PERLASM_STYLE win32n)
 80 |     SET(PERLASM_FLAGS "-DCRYPTO_IA32_SSE2")
 81 |   ENDIF ()
 82 |   SET(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -gcv8")
 83 | 
 84 |   # On Windows, we use the NASM output, specifically built with Yasm.
 85 |   SET(ASM_EXT asm)
 86 | ENDIF ()
 87 | 
 88 | FIND_PACKAGE(Perl REQUIRED)
 89 | MACRO (PERLASM dest src)  # Usage: PERLASM(<dest> <src> [extra perl args...])
 90 |   ADD_CUSTOM_COMMAND(
 91 |     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${dest}  # Generated file lives in the current binary dir.
 92 |     COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}
 93 |     COMMAND CC=${CMAKE_C_COMPILER} ${PERL_EXECUTABLE} ${src} ${PERLASM_STYLE}  # NOTE(review): the CC=... prefix presumably needs a POSIX shell - confirm for non-UNIX generators.
 94 |             ${PERLASM_FLAGS} ${ARGN} ${CMAKE_CURRENT_BINARY_DIR}/${dest}
 95 |     DEPENDS ${src} ${CMAKE_SOURCE_DIR}/perlasm/arm-xlate.pl  # NOTE(review): confirm the perlasm/*.pl scripts exist under CMAKE_SOURCE_DIR.
 96 |             ${CMAKE_SOURCE_DIR}/perlasm/x86_64-xlate.pl
 97 |             ${CMAKE_SOURCE_DIR}/perlasm/x86asm.pl
 98 |             ${CMAKE_SOURCE_DIR}/perlasm/x86gas.pl
 99 |             ${CMAKE_SOURCE_DIR}/perlasm/x86masm.pl
100 |             ${CMAKE_SOURCE_DIR}/perlasm/x86nasm.pl
101 |     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}  # The perl script runs from the directory of the listfile being processed.
102 |   )
103 | ENDMACRO ()
104 | 
105 | IF (BUILD_VENDOR)  # Build nasm and openssl from source tarballs kept under vendor/.
106 |   INCLUDE(ExternalProject)
107 |   IF (NOT EXISTS ${CMAKE_SOURCE_DIR}/vendor)
108 |     FILE(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/vendor)
109 |   ENDIF ()
110 | 
111 |   INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/include)  # BEFORE: build-tree headers/libs are searched first.
112 |   LINK_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/lib)
113 |   SET(ENV{PATH} ${CMAKE_BINARY_DIR}/bin:$ENV{PATH})
114 |   SET(ENV{PKG_CONFIG_PATH} ${CMAKE_BINARY_DIR}/lib/pkgconfig)
115 | 
116 |   # cmake-format: off
117 |   SET(NASM_LOCAL_FILE ${CMAKE_SOURCE_DIR}/vendor/nasm-2.15.05.tar.gz)
118 |   SET(NASM_DOWNLOAD_URL https://www.nasm.us/pub/nasm/releasebuilds/2.15.05/nasm-2.15.05.tar.gz)
119 |   IF (NOT EXISTS ${NASM_LOCAL_FILE})  # Download only when the tarball is not already present.
120 |     FILE(
121 |       DOWNLOAD ${NASM_DOWNLOAD_URL} ${NASM_LOCAL_FILE}
122 |       TIMEOUT 60
123 |       TLS_VERIFY ON
124 |     )
125 |   ENDIF ()
126 |   EXTERNALPROJECT_ADD(
127 |     nasm
128 |     URL ${NASM_LOCAL_FILE}
129 |     CONFIGURE_COMMAND ./configure --prefix=${CMAKE_BINARY_DIR}
130 |     BUILD_COMMAND make -j${CONCURRENCY}  # NOTE(review): CONCURRENCY is not set in the visible part of this file - confirm it is passed in.
131 |     BUILD_IN_SOURCE 1
132 |   )
133 | 
134 |   SET(OPENSSL_LOCAL_FILE ${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i.tar.gz)
135 |   SET(OPENSSL_DOWNLOAD_URL https://www.openssl.org/source/old/1.1.1/openssl-1.1.1i.tar.gz)
136 |   IF (NOT EXISTS ${OPENSSL_LOCAL_FILE})
137 |     FILE(
138 |       DOWNLOAD ${OPENSSL_DOWNLOAD_URL} ${OPENSSL_LOCAL_FILE}
139 |       TIMEOUT 60
140 |       TLS_VERIFY ON
141 |     )
142 |   ENDIF ()
143 |   EXTERNALPROJECT_ADD(
144 |     openssl
145 |     URL ${OPENSSL_LOCAL_FILE}
146 |     CONFIGURE_COMMAND ./config no-shared no-asm -d --prefix=${CMAKE_BINARY_DIR}
147 |     BUILD_COMMAND make depend && make -j${CONCURRENCY}
148 |     INSTALL_COMMAND make install_sw
149 |     BUILD_IN_SOURCE 1
150 |   )
151 |   # cmake-format: on
152 |   LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i/export/lib)  # NOTE(review): openssl is configured with --prefix in the build dir; confirm this export/ path is populated.
153 |   INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i/export/include)
154 | ELSE ()  # Use the system OpenSSL, located relative to the openssl executable.
155 |   FIND_PROGRAM(OPENSSL openssl REQUIRED)  # NOTE(review): REQUIRED for FIND_PROGRAM was added in CMake 3.18; line 1 requests only 3.0.
156 |   GET_FILENAME_COMPONENT(OPENSSL_DIR ${OPENSSL} DIRECTORY CACHE)
157 |   LINK_DIRECTORIES(${OPENSSL_DIR}/../lib)
158 |   INCLUDE_DIRECTORIES(${OPENSSL_DIR}/../include)
159 | ENDIF ()
160 | 
161 | LINK_DIRECTORIES(/usr/local/lib)
162 | INCLUDE_DIRECTORIES(/usr/local/include ${CMAKE_BINARY_DIR}/include)
163 | 
164 | # mpn
165 | ADD_SUBDIRECTORY(mpn)
166 | 
167 | # mpi
168 | ADD_SUBDIRECTORY(mpi)
169 | 
170 | # tests
171 | ENABLE_TESTING()
172 | ADD_SUBDIRECTORY(tests)
173 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # libmpi
 2 | 
 3 | [![license](https://img.shields.io/badge/license-Apache-brightgreen.svg?style=flat)](https://github.com/vxfury/libmpi/blob/master/LICENSE)
 4 | [![CI Status](https://github.com/vxfury/libmpi/workflows/ci/badge.svg)](https://github.com/vxfury/libmpi/actions)
 5 | [![codecov](https://codecov.io/gh/vxfury/libmpi/branch/main/graph/badge.svg?token=5IfLTTEcnF)](https://codecov.io/gh/vxfury/libmpi)
 6 | ![GitHub release (latest by date)](https://img.shields.io/github/v/release/vxfury/libmpi?color=red&label=release)
 7 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/vxfury/libmpi/pulls)
 8 | 
 9 | Multiple Precision Integer and Relevant Algorithms, such as Bignum, RSA, DH, ECDH, ECDSA
10 | ## Benchmark(libmpi VS openssl)
11 | 
12 | | brief | average time<br>(nanoseconds) | instability<br>(coefficient of variation) | rating | 
13 | | :-- | :-: | :-: | :-: |
14 | | from-string(mpi vs openssl) | 2443.7<br>30303.4* | 0.0701562 | <span style="color:#008000;font-weight:bold;text-decoration:blink;">12.4006<br>(Tu es mon meilleur frère...)</span> | 
15 | | to-string(mpi vs openssl) | 1328.88<br>3463.21* | 0.109777 | <span style="color:#008000;font-weight:bold;">2.60612<br>(Tu peux faire mieux, continue)</span> | 
16 | | from-octets(mpi vs openssl) | 273.632<br>702.13* | 0.0870046 | <span style="color:#008000;font-weight:bold;">2.56597<br>(Tu peux faire mieux, continue)</span> | 
17 | | to-octets(mpi vs openssl) | 172.067<br>1475.5* | 0.359989 | <span style="color:#008000;font-weight:bold;text-decoration:blink;">8.57515<br>(C'est super, dessine-toi une tarte)</span> | 
18 | | add(mpi vs openssl) | 51.1222<br>333.814* | 0.164442 | <span style="color:#008000;font-weight:bold;text-decoration:blink;">6.52973<br>(C'est super, dessine-toi une tarte)</span> | 
19 | | add-assign(mpi vs openssl) | 56.7424<br>332.054* | 0.202937 | <span style="color:#008000;font-weight:bold;text-decoration:blink;">5.85196<br>(C'est super, dessine-toi une tarte)</span> | 
20 | | sub(mpi vs openssl) | 61.6028<br>162.647* | 0.207007 | <span style="color:#008000;font-weight:bold;">2.64025<br>(Tu peux faire mieux, continue)</span> | 
21 | | sub-assign(mpi vs openssl) | 58.2224<br>288.852* | 0.155195 | <span style="color:#008000;font-weight:bold;">4.96119<br>(Tu peux faire mieux, continue)</span> | 
22 | | mul(mpi vs openssl) | 2070.41<br>14037.9* | 0.0553581 | <span style="color:#008000;font-weight:bold;text-decoration:blink;">6.78025<br>(C'est super, dessine-toi une tarte)</span> | 
23 | | sqr(mpi vs openssl) | 1329.62<br>8760.12* | 0.168403 | <span style="color:#008000;font-weight:bold;text-decoration:blink;">6.58845<br>(C'est super, dessine-toi une tarte)</span> | 
24 | | MUL2(a * 2 = a + a) | 37.5416 | 0.163214 | <span style="font-style:italic;">N/A</span> | 
25 | | MUL2(a * 2 = a << 1) | 77.5234 | 0.113647 | <span style="font-style:italic;">N/A</span> | 
26 | 


--------------------------------------------------------------------------------
/cmake/Config.cmake.in:
--------------------------------------------------------------------------------
1 | @PACKAGE_INIT@
2 | 
3 | find_package(Threads)
4 | include(${CMAKE_CURRENT_LIST_DIR}/libacoTargets.cmake) # NOTE(review): file is named after "libaco" while this project is libmpi - confirm it matches the exported targets file name
5 | 


--------------------------------------------------------------------------------
/docs/README.template.md:
--------------------------------------------------------------------------------
 1 | # libmpi
 2 | 
 3 | [![license](https://img.shields.io/badge/license-Apache-brightgreen.svg?style=flat)](https://github.com/vxfury/libmpi/blob/master/LICENSE)
 4 | [![CI Status](https://github.com/vxfury/libmpi/workflows/ci/badge.svg)](https://github.com/vxfury/libmpi/actions)
 5 | [![codecov](https://codecov.io/gh/vxfury/libmpi/branch/main/graph/badge.svg?token=5IfLTTEcnF)](https://codecov.io/gh/vxfury/libmpi)
 6 | ![GitHub release (latest by date)](https://img.shields.io/github/v/release/vxfury/libmpi?color=red&label=release)
 7 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/vxfury/libmpi/pulls)
 8 | 
 9 | Multiple Precision Integer and Relevant Algorithms, such as Bignum, RSA, DH, ECDH, ECDSA
10 | 


--------------------------------------------------------------------------------
/mpi/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Multiple Precision Integer and Relevant Algorithms
 2 | 
 3 | CONFIGURE_FILE(mpi.h ${CMAKE_BINARY_DIR}/include/mpi/mpi.h COPYONLY) # copy the header unmodified into the build-tree include dir as mpi/mpi.h
 4 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpi/mpi.h DESTINATION include/mpi)
 5 | ADD_LIBRARY(mpi mpi.c mpi-prime.c) # core sources; mpi-rsa.c is added further down unless MPI_NO_RSA is set
 6 | ConfigureTarget(mpi) # project helper, presumably defined in cmake/ConfigureTarget.cmake (body not visible here)
 7 | TARGET_LINK_LIBRARIES(mpi PUBLIC mpn) # PUBLIC: targets linking mpi also link mpn
 8 | INSTALL(TARGETS mpi ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
 9 | 
10 | # RSA(Rivest–Shamir–Adleman) Algorithm
11 | OPTION(MPI_NO_RSA "build without rsa algorithm" OFF) # default OFF, i.e. RSA is built unless explicitly disabled
12 | IF (NOT MPI_NO_RSA)
13 |   CONFIGURE_FILE(mpi-rsa.h ${CMAKE_BINARY_DIR}/include/mpi/mpi-rsa.h COPYONLY) # copy the header unmodified into the build-tree include dir
14 |   INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpi/mpi-rsa.h
15 |           DESTINATION include/mpi
16 |   )
17 |   TARGET_SOURCES(mpi PRIVATE mpi-rsa.c) # compile the RSA implementation into the existing mpi target
18 | ENDIF ()
19 | 


--------------------------------------------------------------------------------
/mpi/mpi-rsa.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 2021 Ethan.cr.yp.to
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      https://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #ifndef MULTIPLE_PRECISION_RSA_H
17 | #define MULTIPLE_PRECISION_RSA_H
18 | 
19 | #include <mpi/mpi.h>
20 | #include <mpn/mpn-montgomery.h>
21 | 
22 | #if defined(__cplusplus)
23 | extern "C" {
24 | #endif
25 | 
26 | typedef struct {
27 |     unsigned int nbits; /* RSA modulus bitsize */
28 |     unsigned int ebits; /* RSA public exp bitsize */
29 |     unsigned int dbits; /* RSA private exp bitsize */
30 |     unsigned int pbits; /* RSA p-factor bitsize */
31 |     unsigned int qbits; /* RSA q-factor bitsize */
32 | 
33 |     mpn_limb_t *e;    /* public exponent, bitsize(e) = ebits */
34 |     mpn_limb_t *d;    /* private exponent, bitsize(d) = dbits <= nbits */
35 |     mpn_limb_t *dp;   /* the first factor's CRT exponent, d mod (p - 1), bitsize(dp) <= pbits */
36 |     mpn_limb_t *dq;   /* the second factor's CRT exponent, d mod (q - 1), bitsize(dq) <= qbits */
37 |     mpn_limb_t *qinv; /* the (first) CRT coefficient, q^(-1) mod p, bitsize(qinv) <= pbits */
38 | 
39 |     mpn_montgomery_t *montN; /* montgomery context for (N, the modulus, bitsize(n) = nbits) */
40 |     mpn_montgomery_t *montP; /* montgomery context for (P, the first factor) */
41 |     mpn_montgomery_t *montQ; /* montgomery context for (Q, the second factor) */
42 | 
43 |     /* TODO: multiple-primes support */
44 |     unsigned int primes; /* presumably the number of prime factors (cf. the |primes| argument of rsa_new) - TODO confirm */
45 |     struct rsa_factor {
46 |         unsigned int bits; /* bit-size of factor */
47 |         mpn_limb_t *r;     /* factor */
48 |         mpn_limb_t *d;     /* factor's CRT exponent */
49 |         mpn_limb_t *t;     /* factor's CRT coefficient */
50 |     } factors[0]; /* zero-length trailing array (compiler extension); NOTE(review): storage presumably allocated past the struct - confirm in rsa_new */
51 | } rsa_key_t;
52 | 
53 | rsa_key_t *rsa_new(unsigned int ebits, unsigned int nbits, unsigned int primes);
54 | void rsa_free(rsa_key_t *key);
55 | 
56 | int rsa_import(rsa_key_t *key, const mpi_t *n, const mpi_t *e, const mpi_t *d, const mpi_t *dp, const mpi_t *dq,
57 |                const mpi_t *qinv);
58 | rsa_key_t *rsa_generate_key(const mpi_t *pubexp, unsigned int nbits, unsigned int primes,
59 |                             int (*rand_bytes)(void *, unsigned char *, unsigned int), void *rand_state);
60 | 
61 | int rsa_pub_cipher(mpi_t *r, const mpi_t *x, const rsa_key_t *key);
62 | int rsa_prv_cipher(mpi_t *r, const mpi_t *x, const rsa_key_t *key);
63 | int rsa_prv_cipher_crt(mpi_t *r, const mpi_t *x, const rsa_key_t *key);
64 | 
65 | #if defined(__cplusplus)
66 | }
67 | #endif
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/mpi/mpi.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 2021 Ethan.cr.yp.to
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | /**
 17 |  * @brief: multiple precision integer: configurations, macros, and prototypes
 18 |  *
 19 |  * @note:
 20 |  *    1. Assume that all variable representing size will never overflow
 21 |  */
 22 | 
 23 | #ifndef MULTIPLE_PRECISION_H
 24 | #define MULTIPLE_PRECISION_H
 25 | 
 26 | #include <mpn/mpn-optimizer.h>
 27 | #include <mpn/mpn-montgomery.h>
 28 | 
 29 | #if defined(__cplusplus)
 30 | extern "C" {
 31 | #endif
 32 | 
 33 | /**
 34 |  * mpi implementation
 35 |  */
 36 | #define MPI_SIGN_NEGTIVE     1    /* a < 0, negative */
 37 | #define MPI_SIGN_NON_NEGTIVE 0    /* a >= 0, non-negative */
 38 | #define MPI_ATTR_NOTOWNED    0x01 /* TODO: data field not owned by the mpi_t (presumably; original note was truncated) */
 39 | #define MPI_ATTR_DETACHED    0x02 /* TODO: detached data field */
 40 | #define MPI_ATTR_AUTOSIZE    0x04 /* TODO: resize data field automatically */
 41 | 
 42 | typedef struct {
 43 |     unsigned int attr; /**< mpi attributes (presumably a mask of the MPI_ATTR_* flags above) */
 44 |     unsigned int sign; /**< mpi sign: negative or not (see MPI_SIGN_* above) */
 45 |     mpn_size_t size;   /**< mpi size (count of mpn_limb_t) */
 46 |     mpn_size_t room;   /**< mpi max size (count of mpn_limb_t) */
 47 |     mpn_limb_t *data;  /**< mpi data chunk(most significant limb at the largest) */
 48 | } mpi_t;
 49 | #define MPI_ALIGNED_HEAD_LIMBS ((mpn_size_t)((sizeof(mpi_t) + sizeof(mpn_limb_t) - 1) / sizeof(mpn_limb_t)))
 50 | 
 51 | /** High-Level APIs */
 52 | /**
 53 |  * create mpi with expected bits |bits| to reserve
 54 |  *
 55 |  * |bits| == 0, to create empty room
 56 |  *
 57 |  * @performance: Locality of reference and Cacheline alignment
 58 |  *   mpi_t and this->data will be allocated as a continuous memory chunk
 59 |  */
 60 | mpi_t *mpi_create(mpn_size_t bits);
 61 | 
 62 | /**
 63 |  * create mpi(detached) with expected bits |bits| to reserve
 64 |  *
 65 |  * |bits| == 0, to create empty room
 66 |  */
 67 | mpi_t *mpi_create_detached(mpn_size_t bits);
 68 | 
 69 | /**
 70 |  *  duplicate big-number |a|
 71 |  */
 72 | mpi_t *mpi_dup(const mpi_t *a);
 73 | 
 74 | /**
 75 |  * clear and release mpi |v|
 76 |  */
 77 | void mpi_destory(mpi_t *v);
 78 | 
 79 | /**
 80 |  * make mpi with given chunk
 81 |  */
 82 | void mpi_make(mpi_t *r, mpn_limb_t *data, mpn_size_t size);
 83 | 
 84 | /**
 85 |  *  copy big-number |a| to |r|
 86 |  *
 87 |  * @note:
 88 |  *   1. resize |r| to proper size before copy
 89 |  */
 90 | int mpi_copy(mpi_t *r, const mpi_t *a);
 91 | 
 92 | /**
 93 |  * compare mpi |a| and |b|
 94 |  *   0, if |a| = |b|
 95 |  *   1, if |a| > |b|
 96 |  *  -1, if |a| < |b|
 97 |  * otherwise, error code
 98 |  */
 99 | int mpi_cmp(const mpi_t *a, const mpi_t *b);
100 | 
101 | /**
102 |  * get bit size of mpi |a|(constant-time version)
103 |  *
104 |  * @note:
105 |  *   1. 0, if a is NULL
106 |  */
107 | mpn_size_t mpi_bits(const mpi_t *a);
108 | 
109 | /**
110 |  * get byte size of mpi |a|(constant-time version)
111 |  *
112 |  * @note:
113 |  *   1. 0, if a is NULL
114 |  */
115 | mpn_size_t mpi_bytes(const mpi_t *a);
116 | 
117 | /**
118 |  * get max bit size of mpi |a|(constant-time version)
119 |  *
120 |  * @note:
121 |  *   1. 0, if a is NULL
122 |  */
123 | mpn_size_t mpi_max_bits(const mpi_t *a);
124 | 
125 | /**
126 |  * get max byte size of mpi |a|(constant-time version)
127 |  *
128 |  * @note:
129 |  *   1. 0, if a is NULL
130 |  */
131 | mpn_size_t mpi_max_bytes(const mpi_t *a);
132 | 
133 | /**
134 |  * mpi: expand mpi to expected bits |bits|
135 |  *
136 |  * @note:
137 |  *   1. may fail when there is not enough memory or an invalid size is given
138 |  */
139 | mpi_t *mpi_expand(mpi_t *v, mpn_size_t bits);
140 | 
141 | /**
142 |  * resize mpi to expected bits |bits|
143 |  *
144 |  * @note:
145 |  *   1. may fail when there is not enough memory or an invalid size is given
146 |  *
147 |  */
148 | mpi_t *mpi_resize(mpi_t *v, mpn_size_t bits);
149 | 
150 | /**
151 |  * zeroize mpi |v|
152 |  */
153 | int mpi_zeroize(mpi_t *v);
154 | 
155 | /**
156 |  * set mpi |r| to unsigned single-precision integer |v|
157 |  */
158 | int mpi_set_limb(mpi_t *r, mpn_limb_t v);
159 | 
160 | /**
161 |  *  initialize mpi |v| from octets |buff|/|bufflen|
162 |  *
163 |  * @note:
164 |  *   1. if *|v| is NULL, mpi will be created with proper size
165 |  *   2. if *|v| isn't NULL, mpi-number will be resized, and maybe *|v| will be set to a new memory chunk
166 |  */
167 | int mpi_from_octets(mpi_t **v, const unsigned char *buff, mpn_size_t bufflen);
168 | 
169 | /**
170 |  *  convert mpi to big-endian octets
171 |  */
172 | int mpi_to_octets(const mpi_t *a, unsigned char *out, mpn_size_t outsize, mpn_size_t *outlen);
173 | 
174 | /**
175 |  *  initialize mpi |v| from hex-string |a|
176 |  */
177 | int mpi_from_string(mpi_t **v, const char *a);
178 | 
179 | /**
180 |  *  convert mpi to string
181 |  *
182 |  * @note:
183 |  *   1. FREE the return pointer after usage
184 |  */
185 | char *mpi_to_string(const mpi_t *v);
186 | 
187 | /**
188 |  * mpi addition: |r| = |a| + |b|
189 |  *
190 |  * @note:
191 |  *   1. make sure r->room is enough to store the result
192 |  *      minimal advise size: MAX(bit_size(a), bit_size(b)) + 1
193 |  */
194 | int mpi_add(mpi_t *r, const mpi_t *a, const mpi_t *b);
195 | 
196 | /**
197 |  * mpi addition: |r| = |a| + w
198 |  *
199 |  * @note:
200 |  *   1. make sure r->room is enough to store the result
201 |  *      minimal advise size: MAX(bit_size(a), bit_size(w)) + 1
202 |  */
203 | int mpi_add_limb(mpi_t *r, const mpi_t *a, mpn_limb_t w);
204 | 
205 | /**
206 |  * mpi subtraction: |r| = |a| - |b|
207 |  *
208 |  * @note:
209 |  *   1. make sure r->room is enough to store the result
210 |  *      minimal advise size: MAX(bit_size(a), bit_size(b))
211 |  *   2. make sure |a| >= |b| because negative mpi is not supported yet
212 |  */
213 | int mpi_sub(mpi_t *r, const mpi_t *a, const mpi_t *b);
214 | 
215 | /**
216 |  * mpi subtraction: |r| = |a| - w
217 |  *
218 |  * @note:
219 |  *   1. make sure r->room is enough to store the result
220 |  *      minimal advise size: MAX(bit_size(a), bit_size(w))
221 |  */
222 | int mpi_sub_limb(mpi_t *r, const mpi_t *a, mpn_limb_t w);
223 | 
224 | /**
225 |  * mpi multiplication: |r| = |a| * |b|
226 |  *
227 |  * @note:
228 |  *   1. make sure r->room is enough to store the result
229 |  *      minimal advise size: bit_size(a) + bit_size(b) + MPN_LIMB_BITS
230 |  */
231 | int mpi_mul(mpi_t *r, const mpi_t *a, const mpi_t *b);
232 | 
233 | /**
234 |  * mpi multiplication: |r| = |a| * |b|
235 |  *
236 |  * @note:
237 |  *   1. make sure r->room is enough to store the result
238 |  *      minimal advise size: bit_size(a) + bit_size(b)
239 |  */
240 | int mpi_mul_limb(mpi_t *r, const mpi_t *a, mpn_limb_t b);
241 | 
242 | /**
243 |  * mpi square: |r| = |a| ^ 2
244 |  *
245 |  * @note:
246 |  *   1. make sure r->room is enough to store the result
247 |  *      minimal advise size: 2 * bit_size(a)
248 |  */
249 | int mpi_sqr(mpi_t *r, const mpi_t *a);
250 | 
251 | /**
252 |  * mpi division: |q|, |r| = |x| / |y|, |x| = |q| * |y| + |r|(0 <= |r| < |y|)
253 |  *
254 |  * @note:
255 |  *   1. make sure room of |q|, |r| is enough to store the result
256 |  *      minimal advise size: bit_size(r) = bit_size(y)
257 |  */
258 | int mpi_div(mpi_t *q, mpi_t *r, const mpi_t *x, const mpi_t *y);
259 | 
260 | /**
261 |  * mpi division: q, r = a / w
262 |  */
263 | mpn_limb_t mpi_div_limb(mpi_t *a, mpn_limb_t w);
264 | 
265 | /**
266 |  * mpi modular: r = a mod w
267 |  */
268 | mpn_limb_t mpi_mod_limb(const mpi_t *a, mpn_limb_t w);
269 | 
270 | /**
271 |  * greatest common divisor
272 |  */
273 | int mpi_gcd(mpi_t *r, const mpi_t *a, const mpi_t *b, mpn_optimizer_t *optimizer);
274 | 
275 | /**
276 |  * greatest common divisor(constant-time version)
277 |  */
278 | int mpi_gcd_consttime(mpi_t *r, const mpi_t *a, const mpi_t *b, mpn_optimizer_t *optimizer);
279 | 
280 | /**
281 |  * mpi modular: r = a mod m
282 |  */
283 | int mpi_mod(mpi_t *r, const mpi_t *a, const mpi_t *m);
284 | 
285 | /**
286 |  * mpi exponentiation: r = g ^ e
287 |  */
288 | int mpi_exp(mpi_t *r, const mpi_t *g, const mpi_t *e);
289 | 
290 | /**
291 |  * mpi exponentiation(word): r = g ^ e
292 |  */
293 | int mpi_exp_limb(mpi_t *r, const mpi_t *g, mpn_limb_t e);
294 | 
295 | /**
296 |  * get bit
297 |  */
298 | int mpi_get_bit(const mpi_t *a, mpn_size_t n);
299 | 
300 | /**
301 |  * set bit
302 |  */
303 | int mpi_set_bit(const mpi_t *a, mpn_size_t n);
304 | 
305 | /**
306 |  * clr bit
307 |  */
308 | int mpi_clr_bit(const mpi_t *a, mpn_size_t n);
309 | 
310 | /**
311 |  * left-shift: |r| = |a| << n
312 |  */
313 | int mpi_lshift(mpi_t *r, const mpi_t *a, mpn_size_t n);
314 | 
315 | /**
316 |  * right-shift: |r| = |a| >> n
317 |  */
318 | int mpi_rshift(mpi_t *r, const mpi_t *a, mpn_size_t n);
319 | 
320 | /**
321 |  * conditional swap(constant-time version)
322 |  */
323 | int mpi_swap_consttime(unsigned condition, mpi_t *a, mpi_t *b, mpn_size_t n);
324 | 
325 | /**
326 |  * mpi(prime): test if a is a prime
327 |  *
328 |  * @note:
329 |  *   1. return 0 if the number is composite
330 |  *      1 if it is prime with an error probability of less than 0.25^checks
331 |  */
332 | int mpi_is_prime(const mpi_t *a, mpn_size_t checks, unsigned do_trial_division, mpn_optimizer_t *optimizer,
333 |                  int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state);
334 | 
335 | /**
336 |  * mpi(prime): generates a pseudo-random prime number of at least bit length |bits|
337 |  *
338 |  * @note:
339 |  *   1. The returned number is probably prime with a negligible error.
340 |  *   2. If |add| is NULL the returned prime number will have exact bit length |bits| with the top most two
341 |  * bits set.
342 |  *   3. The prime may have to fulfill additional requirements for use in Diffie-Hellman key exchange:
343 |  *      If |add| is not NULL, the prime will fulfill the condition p % |add| == |rem| (p % |add| == 1 if
344 |  * |rem| == NULL) in order to suit a given generator.
345 |  *
346 |  *      If |safe| is true, it will be a safe prime (i.e. a prime p so that (p-1)/2 is also prime).
347 |  *      If |safe| is true, and |rem| == NULL the condition will be p % |add| == 3.
348 |  *      It is recommended that |add| is a multiple of 4.
349 |  */
350 | int mpi_generate_prime(mpi_t *ret, mpn_size_t bits, unsigned safe, const mpi_t *add, const mpi_t *rem,
351 |                        int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state);
352 | 
353 | 
354 | /**
355 |  * mpn optimizer: get mpi with specified room from optimizer
356 |  *
357 |  * @note:
358 |  *   1. size: size of chunk, in unit of 'mpn_limb_t'
359 |  */
360 | mpi_t *mpi_optimizer_get(mpn_optimizer_t *optimizer, mpn_size_t size);
361 | 
362 | /**
363 |  * mpn optimizer: put back mpi of specified room
364 |  */
365 | void mpi_optimizer_put(mpn_optimizer_t *optimizer, mpn_size_t size);
366 | 
367 | 
368 | /**
369 |  * mpn montgomery: initialize montgomery context with modulus
370 |  *
371 |  */
372 | int mpi_montgomery_set_modulus(mpn_montgomery_t *mont, const mpi_t *modulus);
373 | 
374 | /**
375 |  * mpn montgomery: exponentiation
376 |  *
377 |  */
378 | int mpi_montgomery_exp(mpi_t *r, const mpi_t *x, const mpi_t *e, mpn_montgomery_t *mont);
379 | 
380 | /**
381 |  * mpn montgomery: exponentiation(constant-time version)
382 |  *
383 |  */
384 | int mpi_montgomery_exp_consttime(mpi_t *r, const mpi_t *x, const mpi_t *e, mpn_montgomery_t *mont);
385 | 
386 | #if defined(__cplusplus)
387 | }
388 | #endif
389 | 
390 | #endif
391 | 


--------------------------------------------------------------------------------
/mpn/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Multiple-Precision-Natural-Number
 2 | 
 3 | CONFIGURE_FILE(mpn-asm.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-asm.h COPYONLY)
 4 | CONFIGURE_FILE(mpn-conf.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-conf.h COPYONLY)
 5 | CONFIGURE_FILE(
 6 |   mpn-binary.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-binary.h COPYONLY
 7 | )
 8 | CONFIGURE_FILE(
 9 |   mpn-optimizer.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-optimizer.h COPYONLY
10 | )
11 | CONFIGURE_FILE(
12 |   mpn-montgomery.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-montgomery.h COPYONLY
13 | )
14 | 
15 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpn/mpn-conf.h
16 |               ${CMAKE_BINARY_DIR}/include/mpn/mpn-optimizer.h
17 |               ${CMAKE_BINARY_DIR}/include/mpn/mpn-binary.h
18 |               ${CMAKE_BINARY_DIR}/include/mpn/mpn-montgomery.h
19 |         DESTINATION include/mpn
20 | )
21 | 
22 | ADD_LIBRARY(mpn mpn-binary.c mpn-asm.c mpn-optimizer.c mpn-montgomery.c)
23 | ConfigureTarget(mpn)
24 | INSTALL(TARGETS mpn ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
25 | # Hand-written NASM kernels: only built for x86_64 Linux, and only when MPN_NO_ASM is OFF.
26 | OPTION(MPN_NO_ASM "disable asm for mpn" OFF)
27 | IF ((NOT MPN_NO_ASM) AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
28 |   IF (NOT DEFINED ARCH)
29 |     SET(ARCH ${CMAKE_SYSTEM_PROCESSOR})
30 |   ENDIF ()
31 |   IF ("${ARCH}" STREQUAL "x86_64") # quoted: an empty ARCH must not break the IF() expression
32 |     ENABLE_LANGUAGE(ASM_NASM) # enabled only here so other architectures do not require NASM
33 |     SET(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D_L9 -DLINUX32E")
34 |     FILE(GLOB ASM_SOURCE asm/intel64/*.asm)
35 |     TARGET_SOURCES(mpn PRIVATE ${ASM_SOURCE})
36 |     TARGET_INCLUDE_DIRECTORIES(mpn PRIVATE asm asm/intel64)
37 |     TARGET_COMPILE_DEFINITIONS(
38 |       mpn
39 |       PRIVATE -DMPN_UADD_VECTORIZED_ASM
40 |               -DMPN_USUB_VECTORIZED_ASM
41 |               -DMPN_UINC_VECTORIZED_ASM
42 |               -DMPN_UDEC_VECTORIZED_ASM
43 |               -DMPN_UDIV_ASM
44 |               -DMPN_UMUL_ASM
45 |               -DMPN_USQR_ASM
46 |               -DMPN_UMUL_ADD_ASM
47 |               -DMPN_MONT_REDC_ASM
48 |     )
49 |   ENDIF ()
50 | ENDIF ()
51 | # MPN_NO_INLINE_ASM is only forwarded as a compile definition; it is not declared as an OPTION in this file.
52 | IF (MPN_NO_INLINE_ASM)
53 |   TARGET_COMPILE_DEFINITIONS(mpn PRIVATE -DMPN_NO_INLINE_ASM)
54 | ENDIF()
55 | 


--------------------------------------------------------------------------------
/mpn/asm/asmdefs.inc:
--------------------------------------------------------------------------------
  1 | %ifndef __ASMDEFS_INC__
  2 | %define __ASMDEFS_INC__ 1
  3 | 
  4 | %assign __ARCH_PX 0    ; pure C-code ia32
  5 | %assign __ARCH_M5 1    ; Intel(R) Quark(TM) processor - ia32
  6 | %assign __ARCH_W7 8    ; Intel(R) Streaming SIMD Extensions 2 - ia32
  7 | %assign __ARCH_T7 16   ; Intel(R) Streaming SIMD Extensions 3 - ia32
  8 | %assign __ARCH_V8 32   ; Supplemental Streaming SIMD Extensions 3 (SSSE3)
  9 | %assign __ARCH_S8 33   ; SSSE3 + MOVBE instruction - ia32
 10 | %assign __ARCH_P8 64   ; Intel(R) Streaming SIMD Extensions 4.2 - ia32
 11 | %assign __ARCH_G9 128  ; Intel(R) Advanced Vector Extensions - ia32
 12 | %assign __ARCH_H9 256  ; Intel(R) Advanced Vector Extensions 2 - ia32
 13 | %assign __ARCH_I0 512  ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon Phi(TM) processor (formerly Knight Landing) - ia32
 14 | %assign __ARCH_S0 1024 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - ia32
 15 | 
 16 | %assign __ARCH32E_PX __ARCH_PX ; pure C-code x64
 17 | %assign __ARCH32E_M7 32   ; Intel(R) Streaming SIMD Extensions 3 - intel64
 18 | %assign __ARCH32E_U8 64   ; Supplemental Streaming SIMD Extensions 3 (SSSE3) - intel64
 19 | %assign __ARCH32E_N8 65   ; SSSE3 + MOVBE instruction - intel64
 20 | %assign __ARCH32E_Y8 128  ; Intel(R) Streaming SIMD Extensions 4.2 - intel64
 21 | %assign __ARCH32E_E9 256  ; Intel(R) Advanced Vector Extensions - intel64
 22 | %assign __ARCH32E_L9 512  ; Intel(R) Advanced Vector Extensions 2 - intel64
 23 | %assign __ARCH32E_N0 1024 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon Phi(TM) processor (formerly Knight Landing) - intel64
 24 | %assign __ARCH32E_K0 2048 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - intel64
 25 | 
 26 | %assign __ARCH __ARCH_PX
 27 | %assign __ARCH32E __ARCH32E_PX
 28 | ; Select __ARCH / __ARCH32E from the CPU-code symbol (_M5 ... _K0) that is defined; the first matching branch wins.
 29 | %ifdef _M5 ; Intel(R) Quark(TM) processor - ia32
 30 |   %assign __ARCH __ARCH_M5
 31 | %elifdef _W7 ; Intel(R) Streaming SIMD Extensions 2 - ia32
 32 |   %assign __ARCH __ARCH_W7
 33 | %elifdef _T7 ; Intel(R) Streaming SIMD Extensions 3 - ia32
 34 |   %assign __ARCH __ARCH_T7
 35 | %elifdef _V8 ; Supplemental Streaming SIMD Extensions 3 (SSSE3)
 36 |   %assign __ARCH __ARCH_V8
 37 | %elifdef _S8 ; SSSE3 + MOVBE instruction - ia32
 38 |   %assign __ARCH __ARCH_S8
 39 | %elifdef _P8 ; Intel(R) Streaming SIMD Extensions 4.2 - ia32
 40 |   %assign __ARCH __ARCH_P8
 41 | %elifdef _G9 ; Intel(R) Advanced Vector Extensions - ia32
 42 |   %assign ARCH_ALIGN_FACTOR 32 ; provisional: reassigned unconditionally after this chain
 43 |   %assign __ARCH __ARCH_G9
 44 | %elifdef _H9 ; Intel(R) Advanced Vector Extensions 2 - ia32
 45 |   %assign ARCH_ALIGN_FACTOR 32 ; provisional: reassigned unconditionally after this chain
 46 |   %assign __ARCH __ARCH_H9
 47 | %elifdef _S0 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - ia32
 48 |   %assign ARCH_ALIGN_FACTOR 64 ; provisional: reassigned unconditionally after this chain
 49 |   %assign __ARCH __ARCH_S0
 50 | %elifdef _M7 ; Intel(R) Streaming SIMD Extensions 3 - intel64
 51 |   %assign __ARCH __ARCH_PX
 52 |   %assign __ARCH32E __ARCH32E_M7
 53 | %elifdef _U8 ; Supplemental Streaming SIMD Extensions 3 (SSSE3) - intel64
 54 |   %assign __ARCH __ARCH_PX
 55 |   %assign __ARCH32E __ARCH32E_U8
 56 | %elifdef _N8 ; SSSE3 + MOVBE instruction - intel64
 57 |   %assign __ARCH __ARCH_PX
 58 |   %assign __ARCH32E __ARCH32E_N8
 59 | %elifdef _Y8 ; Intel(R) Streaming SIMD Extensions 4.2 - intel64
 60 |   %assign __ARCH __ARCH_PX
 61 |   %assign __ARCH32E __ARCH32E_Y8
 62 | %elifdef _E9 ; Intel(R) Advanced Vector Extensions - intel64
 63 |   %assign ARCH_ALIGN_FACTOR 32 ; provisional: reassigned unconditionally after this chain
 64 |   %assign __ARCH __ARCH_PX
 65 |   %assign __ARCH32E __ARCH32E_E9
 66 | %elifdef _L9 ; Intel(R) Advanced Vector Extensions 2 - intel64
 67 |   %assign ARCH_ALIGN_FACTOR 32 ; provisional: reassigned unconditionally after this chain
 68 |   %assign __ARCH __ARCH_PX
 69 |   %assign __ARCH32E __ARCH32E_L9
 70 | %elifdef _N0 ; Intel(R) Advanced Vector Extensions 512 (formerly Knights Landing) - intel64
 71 |   %assign ARCH_ALIGN_FACTOR 64 ; provisional: reassigned unconditionally after this chain
 72 |   %assign __ARCH __ARCH_PX
 73 |   %assign __ARCH32E __ARCH32E_N0
 74 | %elifdef _K0 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - intel64
 75 |   %assign ARCH_ALIGN_FACTOR 64 ; provisional: reassigned unconditionally after this chain
 76 |   %assign __ARCH __ARCH_PX
 77 |   %assign __ARCH32E __ARCH32E_K0
 78 | %else ; no CPU-code symbol defined: keep the pure-C defaults assigned before the chain
 79 |   %assign __ARCH __ARCH_PX ; pure C-code
 80 | %endif
 81 | ; Final ARCH_ALIGN_FACTOR, derived only from the selected arch codes (every branch below assigns it).
 82 | %if (__ARCH > __ARCH_H9) || (__ARCH32E > __ARCH32E_L9) ; codes above AVX2 (the AVX-512 entries)
 83 |   %assign ARCH_ALIGN_FACTOR 64
 84 | %elif (__ARCH > __ARCH_P8) || (__ARCH32E > __ARCH32E_Y8) ; codes above SSE4.2 (the AVX / AVX2 entries)
 85 |   %assign ARCH_ALIGN_FACTOR 32
 86 | %else ; everything else, including pure C
 87 |   %assign ARCH_ALIGN_FACTOR 16
 88 | %endif
 89 | 
 90 | ; noexec stack
 91 | %ifdef LINUX32
 92 |   %ifndef OSX32
 93 | section .note.GNU-stack noalloc noexec nowrite progbits
 94 |   %endif
 95 | %endif
 96 | 
 97 | ; noexec stack
 98 | %ifdef LINUX32E
 99 |   %ifndef OSXEM64T
100 |     %ifndef _ARCH_KNC
101 | section .note.GNU-stack noalloc noexec nowrite progbits
102 |     %endif
103 |   %endif
104 | %endif
105 | 
106 | ; IPP_BINARY_FORMAT codes: 0=elf32, 1=elf64, 2=macho64, 3=win32, 4=win64 (compared numerically in ia_common.inc).
107 | %ifidn __OUTPUT_FORMAT__, elf32
108 |   %assign IPP_BINARY_FORMAT 0
109 | %elifidn __OUTPUT_FORMAT__, elf64
110 |   %assign IPP_BINARY_FORMAT 1
111 | %elifidn __OUTPUT_FORMAT__, macho64
112 |   %assign IPP_BINARY_FORMAT 2
113 | %elifidn __OUTPUT_FORMAT__, win32
114 |   %assign IPP_BINARY_FORMAT 3
115 | %elifidn __OUTPUT_FORMAT__, win64
116 |   %assign IPP_BINARY_FORMAT 4
117 | %else ; any other output format aborts assembly
118 |   %fatal Unsupported output format: __OUTPUT_FORMAT__. Shall be: elf32, elf64, win32, win64, macho64
119 | %endif
120 | 
121 | %ifdef _MERGED_BLD
122 |   %assign _OWN_MERGED_BLD 1
123 | %endif ; _MERGED_BLD
124 | 
125 | ; data compilation definitions: merged builds shall compile data only as
126 | ; part of one single object build to avoid multiple definition warnings at link time
127 | %ifndef _MERGED_BLD
128 |   %assign __ARCH_DATA 1
129 | %else
130 |   %if (__ARCH == __ARCH_G9) || (__ARCH32E == __ARCH32E_E9)
131 |     %assign __ARCH_DATA 1
132 |   %endif
133 | %endif ; _MERGED_BLD
134 | 
135 | ; Definitions of sizeof(type): sizeof(X) pastes X with "_size", e.g. sizeof(qword) -> qword_size -> 8 (case-insensitive names)
136 | %iassign ZWORD_size 64 ; zmm-word
137 | %iassign YWORD_size 32 ; ymm-word
138 | %iassign OWORD_size 16 ; octo-word
139 | %iassign TWORD_size 10 ; ten-bytes word
140 | %iassign QWORD_size 8  ; quad-word
141 | %iassign DWORD_size 4  ; double-word
142 | %iassign WORD_size 2
143 | %iassign BYTE_size 1
144 | 
145 | %idefine YMMWORD YWORD
146 | %idefine XMMWORD OWORD
147 | %iassign YMMWORD_size YWORD_size
148 | %iassign XMMWORD_size OWORD_size
149 | 
150 | %idefine sizeof(_x_) _x_%+_size
151 | 
152 | %endif
153 | 


--------------------------------------------------------------------------------
/mpn/asm/ia_common.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2014-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | %ifndef __IA_COMMON_INC__
 18 | %define __IA_COMMON_INC__ 1
 19 | 
 20 | ; use multi-byte nop's sequences to align loops and jmp's when threshold is reached
 21 | %use smartalign
 22 | ALIGNMODE p6,16
 23 | 
 24 | ; Declares function, sets visibility and binding and adds __cdecl decoration when needed.
 25 | %macro DECLARE_FUNC 2-3.nolist
 26 |   %xdefine %%func_name %1
 27 |   %xdefine %%visibility %2
 28 |   %xdefine %%binding %3
 29 |   ; %1 = function name, %2 = PUBLIC|PRIVATE (case-insensitive), %3 = optional WEAK|STRONG binding
 30 |   %ifctx _DECLARE_FUNC_CTX_
 31 |     %fatal "DECLARE_FUNC: already in the context, need to call ENDFUNC"
 32 |   %endif
 33 | 
 34 |   ; Accepted visibility values are PUBLIC and PRIVATE
 35 |   %ifnidni %%visibility, PUBLIC
 36 |     %ifnidni %%visibility, PRIVATE
 37 |       %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
 38 |     %endif
 39 |   %endif
 40 | 
 41 |   ; Accepted binding values are WEAK or STRONG (default)
 42 |   %ifnempty %%binding
 43 |     %ifnidni %%binding, WEAK
 44 |       %ifnidni %%binding, STRONG
 45 |         %fatal Function %%func_name binding is not properly defined. Shall be: WEAK or STRONG.
 46 |       %endif
 47 |     %endif
 48 |   %endif
 49 | 
 50 |   ; Function decoration length
 51 |   %assign %%decoration_length 0
 52 | 
 53 |   ; The __cdecl calling convention name decoration (to have interoperability with C).
 54 |   ; Only public functions are decorated
 55 |   %ifidni %%visibility, PUBLIC
 56 |     %if ((IPP_BINARY_FORMAT == 2) || (IPP_BINARY_FORMAT == 3)) ; WIN32 or OSXEM64T
 57 |       %xdefine %%func_name _%[%%func_name]
 58 |       %assign %%decoration_length %%decoration_length+1
 59 |     %endif
 60 |   %endif
 61 | 
 62 |   ; If current macro is called from IPPASM macro, then function might be decorated by CPU-prefix
 63 |   %ifctx _IPPASM_CTX_
 64 |     %assign %%decoration_length %%decoration_length + %$decoration_length ; %$decoration_length belongs to _IPPASM_CTX_
 65 |   %endif
 66 | 
 67 |   %push _DECLARE_FUNC_CTX_
 68 |   ; setup context variables to use in ENDFUNC
 69 |   %xdefine %$func_name_ctx %%func_name
 70 |   %assign %$decoration_length %%decoration_length ; %$decoration_length belongs to _DECLARE_FUNC_CTX_
 71 |   ; Export the symbol; the test is case-insensitive to agree with the validation and decoration checks above
 72 |   %ifidni %%visibility, PUBLIC
 73 |     %if (IPP_BINARY_FORMAT < 2) ; LINUX32 or LINUX32E
 74 |       %ifnempty %%binding
 75 |         global %%func_name:function %%binding (%%func_name%+.LEnd_%+%%func_name - %%func_name)
 76 |       %else
 77 |         global %%func_name:function (%%func_name%+.LEnd_%+%%func_name - %%func_name)
 78 |       %endif
 79 |     %else
 80 |       global %%func_name
 81 |     %endif
 82 |   %endif
 83 |   %%func_name:
 84 | 
 85 |   ; CET enabling (macOS not supported)
 86 |   %if ((IPP_BINARY_FORMAT == 0) || (IPP_BINARY_FORMAT == 3)) ; elf32/win32
 87 |     db 0F3h, 00Fh, 01Eh, 0FBh ; endbr32
 88 |   %elif ((IPP_BINARY_FORMAT == 1) || (IPP_BINARY_FORMAT == 4)) ; elf64/win64
 89 |     db 0F3h, 00Fh, 01Eh, 0FAh ; endbr64
 90 |   %endif
 91 | %endmacro
 92 | 
 93 | ; Calls assembler function declared by DECLARE_FUNC
 94 | ; Default visibility is PRIVATE (affects decoration)
 95 | %macro CALL_FUNC 1-2.nolist PRIVATE
 96 |   %xdefine %%func_name %1
 97 |   %xdefine %%visibility %2
 98 |   ; %1 = function name as passed to DECLARE_FUNC, %2 = PUBLIC|PRIVATE (case-insensitive)
 99 |   ; Accepted visibility values are PUBLIC and PRIVATE
100 |   %ifnidni %%visibility, PUBLIC
101 |     %ifnidni %%visibility, PRIVATE
102 |       %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
103 |     %endif
104 |   %endif
105 | 
106 |   ; __cdecl on WIN32/OSXEM64T requires an underscore prefix decoration.
107 |   ; Only PUBLIC functions are decorated (same rule as in DECLARE_FUNC).
108 |   %ifidni %%visibility, PUBLIC
109 |     %if ((IPP_BINARY_FORMAT == 2) || (IPP_BINARY_FORMAT == 3)) ; WIN32 or OSXEM64T
110 |       %xdefine %%func_name _%1
111 |     %endif
112 |   %endif
113 | 
114 |   call %%func_name
115 | %endmacro
116 | 
117 | ; Declares function decorated by appropriate CPU prefix (for the merged library)
118 | ; Default visibility (if not defined) is PUBLIC.
119 | %macro IPPASM 1-2.nolist PUBLIC
120 |   %xdefine %%func_name %1
121 |   %xdefine %%visibility %2
122 |   ; %1 = undecorated function name, %2 = visibility; no binding argument is forwarded to DECLARE_FUNC
123 |   %ifctx _IPPASM_CTX_
124 |     %fatal "IPPASM: already in the context, need to call ENDFUNC"
125 |   %endif
126 |   %push _IPPASM_CTX_
127 |   ; Temporary nested context: CPU_PREFIX_DECORATE hands its results back through %$ context variables
128 |   %push _CPU_PREFIX_DECORATE_CTX_
129 |   CPU_PREFIX_DECORATE %%func_name
130 |   %xdefine %%func_name %$decorated_func_name
131 |   %assign %$$decoration_length %$decoration_length ; copy the prefix length into the enclosing _IPPASM_CTX_ (read by DECLARE_FUNC)
132 |   %pop _CPU_PREFIX_DECORATE_CTX_
133 | 
134 |   DECLARE_FUNC %%func_name, %%visibility
135 | %endmacro
136 | 
137 | ; Calls assembler function declared by IPPASM
138 | ; Default visibility is PRIVATE (affects decoration)
139 | %macro CALL_IPPASM 1-2.nolist PRIVATE
140 |   %xdefine %%func_name %1
141 |   %xdefine %%visibility %2
142 |   ; %1 = undecorated function name, %2 = PUBLIC|PRIVATE (case-insensitive)
143 |   ; Accepted visibility values are PUBLIC and PRIVATE
144 |   %ifnidni %%visibility, PUBLIC
145 |     %ifnidni %%visibility, PRIVATE
146 |       %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
147 |     %endif
148 |   %endif
149 |   ; Apply the same CPU prefix that IPPASM applied at declaration, using a temporary context for the result
150 |   %push _CPU_PREFIX_DECORATE_CTX_
151 |   CPU_PREFIX_DECORATE %%func_name
152 |   %xdefine %%func_name %$decorated_func_name
153 |   %pop _CPU_PREFIX_DECORATE_CTX_
154 |   ; CALL_FUNC adds the underscore decoration (when applicable) and emits the call
155 |   CALL_FUNC %%func_name,%%visibility
156 | %endmacro
157 | 
158 | ; End function macro - required to be called after IPPASM or DECLARE_FUNC macro invocation.
159 | %macro ENDFUNC 1.nolist
160 |   %xdefine %%func_name %1
161 |   %ifnctx _DECLARE_FUNC_CTX_
162 |     %fatal "Not in the context: _DECLARE_FUNC_CTX_"
163 |   %endif
164 | 
165 |   ; Cross-check of context variable with macro parameter
166 |   %defstr %%func_name_str %%func_name
167 |   %defstr %%func_name_ctx_str %$func_name_ctx
168 |   %substr %%func_name_ctx_str_not_decorated %%func_name_ctx_str %[%$decoration_length+1],-1   ; remove decoration (first X symbols)
169 |   %ifnidn %%func_name_str,%%func_name_ctx_str
170 |     %ifnidn %%func_name_str,%%func_name_ctx_str_not_decorated
171 |       %fatal ENDFUNC: function name [%%func_name] does not match context: [%$func_name_ctx]
172 |     %endif
173 |   %endif
174 | 
175 |   ; Add local label to be able calculate function size
176 |   ; Take function name from the context (real declaration name)
177 | .LEnd_%+%$func_name_ctx:
178 |   %pop _DECLARE_FUNC_CTX_
179 |   ; Also close the IPPASM context when the function was opened through IPPASM
180 |   %ifctx _IPPASM_CTX_
181 |     %pop _IPPASM_CTX_
182 |   %endif
183 | %endmacro
184 | 
185 | %endif
186 | 


--------------------------------------------------------------------------------
/mpn/asm/ia_emm.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2014-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | %include "asmdefs.inc"
 18 | %include "ia_common.inc"
 19 | %include "utils.inc"
 20 | 
 21 | ; Decorates function name with appropriate CPU prefix (for the merged library).
 22 | ; The macro is context-dependent and returns decorated name in the %$decorated_func_name
 23 | ; context variable (and the prefix length in %$decoration_length).
 24 | %macro CPU_PREFIX_DECORATE 1.nolist
 25 |   %ifnctx _CPU_PREFIX_DECORATE_CTX_
 26 |     %fatal "Not in the context: _CPU_PREFIX_DECORATE_CTX_"
 27 |   %endif
 28 | 
 29 |   ; Add CPU-specific prefix for the dispatched library
 30 |   %ifdef _OWN_MERGED_BLD ; assigned in asmdefs.inc when _MERGED_BLD is defined
 31 |     %if (__ARCH == __ARCH_PX)
 32 |       %xdefine %%func_name px_%1
 33 |       %assign %%decoration_length 3 ; two-letter CPU code plus underscore
 34 |     %endif
 35 |     %if (__ARCH == __ARCH_W7)
 36 |       %xdefine %%func_name w7_%1
 37 |       %assign %%decoration_length 3
 38 |     %endif
 39 |     %if (__ARCH == __ARCH_V8)
 40 |       %xdefine %%func_name v8_%1
 41 |       %assign %%decoration_length 3
 42 |     %endif
 43 |     %if (__ARCH == __ARCH_S8)
 44 |       %xdefine %%func_name s8_%1
 45 |       %assign %%decoration_length 3
 46 |     %endif
 47 |     %if (__ARCH == __ARCH_P8)
 48 |       %xdefine %%func_name p8_%1
 49 |       %assign %%decoration_length 3
 50 |     %endif
 51 |     %if (__ARCH == __ARCH_G9)
 52 |       %xdefine %%func_name g9_%1
 53 |       %assign %%decoration_length 3
 54 |     %endif
 55 |     %if (__ARCH == __ARCH_H9)
 56 |       %xdefine %%func_name h9_%1
 57 |       %assign %%decoration_length 3
 58 |     %endif
 59 |   %else ; non-merged build: the name is left undecorated
 60 |     %xdefine %%func_name %1
 61 |     %assign %%decoration_length 0
 62 |   %endif
 63 |   ; %%func_name is still undefined here if a merged build uses an __ARCH code not listed above
 64 |   %ifndef %%func_name
 65 |     %fatal "CPU_PREFIX_DECORATE: unknown decoration for: __ARCH = " __ARCH
 66 |   %endif
 67 |   %xdefine %$decorated_func_name %[%%func_name]
 68 |   %assign %$decoration_length %%decoration_length
 69 | %endmacro
 70 | 
 71 | %define NONVOLATILE_REGS_32_GPR ebp,ebx,esi,edi
 72 | 
 73 | ; Saves non-volatile GPR registers on stack.
 74 | ; Input - list of used registers.
 75 | %macro USES_GPR 1+.nolist
 76 |   %assign LOCAL_FRAME 0
 77 |   %assign GPR_FRAME 0
 78 |   %define GPR_CUR
 79 |   ; BEGIN_INTERSECT / INTERSECT / END_INTERSECT / FOREACH are not defined in this file (presumably utils.inc - confirm)
 80 |   BEGIN_INTERSECT
 81 |   INTERSECT {%1},{%[NONVOLATILE_REGS_32_GPR]}
 82 |   ; List of non-volatile GPR registers in the order they will be pushed on stack
 83 |   %xdefine GPR_CUR %$intersection
 84 |   %assign GPR_FRAME %$cardinality * 4 ; 4 bytes per saved 32-bit register
 85 |   END_INTERSECT
 86 | 
 87 |   ; Push non-volatile GPRs on stack
 88 |   FOREACH GPR_CUR,{push}
 89 | 
 90 |   ; Set up offset of arguments from ESP (the extra 4 presumably skips the return address - confirm)
 91 |   %assign ARG_1 %[GPR_FRAME + 4]
 92 | %endmacro
 93 | 
 94 | ; Restores the non-volatile GPR registers previously saved on the stack by USES_GPR.
 95 | ; The macro shall be called after function processing.
 96 | %macro REST_GPR 0.nolist
 97 |   %ifndef GPR_CUR
 98 |     %fatal "REST_GPR: no GPR_CUR defined"
 99 |   %endif
100 |   ; Pop saved GPRs from the stack (RFOREACH is defined elsewhere; presumably iterates in reverse push order - confirm)
101 |   RFOREACH GPR_CUR,{pop}
102 | %endmacro
103 | 
104 | %macro LD_ADDR 2.nolist
105 |   %xdefine %%reg %1
106 |   %xdefine %%addr %2
107 |   ; Loads the address of %2 into register %1; with IPP_PIC defined it is computed relative to the current code location
108 | %ifdef IPP_PIC
109 |   call  %%LABEL                 ; pushes the address of %%LABEL (the next instruction)
110 | %%LABEL:  pop      %%reg        ; reg = run-time address of %%LABEL
111 |   sub   %%reg, %%LABEL-%%addr   ; reg = reg - (LABEL - addr) = run-time address of addr
112 | %else
113 |   lea   %%reg, [%%addr]         ; non-PIC build: use the address directly
114 | %endif
115 | %endmacro
116 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/bn_usqrschool.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2010-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  Cryptography Primitive.
 20 | ;               BNU squaring support
 21 | ;
 22 | ;
 23 | 
 24 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 25 | ;;
  26 | ;; MULx1 general-case squarer macros
 27 | ;;
 28 | 
  29 | ;; dst = src * B epilogue (srcLen=4*n+3): three final mul-by-B steps; stores rDst[1..4], then advances rDst by one qword
  30 | %macro sMULx1_4N_3_ELOG 8.nolist
  31 |   %xdefine %%rDst %1
  32 |   %xdefine %%rSrc %2
  33 |   %xdefine %%update_idx %3
  34 |   %xdefine %%B %4
  35 |   %xdefine %%T0 %5
  36 |   %xdefine %%T1 %6
  37 |   %xdefine %%T2 %7
  38 |   %xdefine %%T3 %8
  39 |   ; On entry rax holds the multiplicand for the first mul and T0 an incoming value that is added to it; T3 is not referenced here
  40 |    mul   %%B                                      ; rdx:rax = rax * B
  41 |    xor   %%T1, %%T1                               ; T1 = 0
  42 |    add   %%T0, rax                                ; T0 += low half, sets CF
  43 |    mov   qword [%%rDst+sizeof(qword)], %%T0       ; store rDst[1]
  44 |    mov   rax, qword [%%rSrc+sizeof(qword)*2]      ; next multiplicand rSrc[2] (mov keeps CF)
  45 |    adc   %%T1, rdx                                ; T1 = high half + CF
  46 | 
  47 |    mul   %%B                                      ; rdx:rax = rSrc[2] * B
  48 |    xor   %%T2, %%T2                               ; T2 = 0
  49 |    add   %%T1, rax                                ; T1 += low half, sets CF
  50 |    mov   qword [%%rDst+sizeof(qword)*2], %%T1     ; store rDst[2]
  51 |    mov   rax, qword [%%rSrc+sizeof(qword)*3]      ; next multiplicand rSrc[3]
  52 |    adc   %%T2, rdx                                ; T2 = high half + CF
  53 | 
  54 |    mul   %%B                                      ; rdx:rax = rSrc[3] * B
  55 |    %%update_idx
  56 |    add   %%T2, rax                                ; T2 += low half, sets CF (flags from update_idx are overwritten)
  57 |    mov   qword [%%rDst+sizeof(qword)*3], %%T2     ; store rDst[3]
  58 |   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
  59 |    adc   rdx, 0                                   ; fold CF into the top limb
  60 | 
  61 |    mov   qword [%%rDst+sizeof(qword)*4], rdx      ; store rDst[4] (top limb)
  62 |    add   %%rDst, sizeof(qword)                    ; advance rDst by one qword
  63 | %endmacro
 64 | 
  65 | ;; dst = src * B epilogue (srcLen=4*n+1): one final mul-by-B step; stores rDst[3..4], then advances rDst by one qword
  66 | %macro sMULx1_4N_1_ELOG 8.nolist
  67 |   %xdefine %%rDst %1
  68 |   %xdefine %%rSrc %2
  69 |   %xdefine %%update_idx %3
  70 |   %xdefine %%B %4
  71 |   %xdefine %%T0 %5
  72 |   %xdefine %%T1 %6
  73 |   %xdefine %%T2 %7
  74 |   %xdefine %%T3 %8
  75 |   ; On entry rax holds the multiplicand and T0 an incoming value; rSrc, T1, T2 and T3 are not referenced here
  76 |    mul   %%B                                      ; rdx:rax = rax * B
  77 |    %%update_idx
  78 |    add   %%T0, rax                                ; T0 += low half, sets CF (flags from update_idx are overwritten)
  79 |    mov   qword [%%rDst+sizeof(qword)*3], %%T0     ; store rDst[3]
  80 |   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
  81 |    adc   rdx, 0                                   ; fold CF into the top limb
  82 | 
  83 |    mov   qword [%%rDst+sizeof(qword)*4], rdx      ; store rDst[4] (top limb)
  84 |    add   %%rDst, sizeof(qword)                    ; advance rDst by one qword
  85 | %endmacro
 86 | 
 87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 88 | ;;
  89 | ;; MULx2 general-case multiplier macros
 90 | ;;
 91 | 
  92 | ;; dst = src * {B1:B0} epilogue (srcLen=4*n+1): one final mul by B1; stores rDst[3..5]
  93 | %macro sMULx2_4N_1_ELOG 9.nolist
  94 |   %xdefine %%rDst %1
  95 |   %xdefine %%rSrc %2
  96 |   %xdefine %%update_idx %3
  97 |   %xdefine %%B0 %4
  98 |   %xdefine %%B1 %5
  99 |   %xdefine %%T0 %6
 100 |   %xdefine %%T1 %7
 101 |   %xdefine %%T2 %8
 102 |   %xdefine %%T3 %9
 103 |   ; On entry rax holds the multiplicand and T0/T1/T2 hold pending limbs; rSrc, B0 and T3 are not referenced here
 104 |    mul   %%B1                                                       ; {T2:T1} += a[lenA-1]*B1
 105 |   ;add   rDst, sizeof(qword)*2
 106 |    %%update_idx
 107 |    mov   qword [%%rDst+sizeof(qword)*3], %%T0                    ; store rDst[3] (T0 is already final)
 108 |    add   %%T1, rax                                                  ; T1 += low half, sets CF
 109 |   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
 110 |    adc   rdx, %%T2                                                  ; rdx = high half + T2 + CF
 111 | 
 112 |    mov   qword [%%rDst+sizeof(qword)*4], %%T1                    ; store rDst[4]
 113 |    mov   qword [%%rDst+sizeof(qword)*5], rdx                     ; store rDst[5] (top limb)
 114 | %endmacro
115 | 
 116 | ;; dst = src * {B1:B0} epilogue (srcLen=4*n+3): five final mul steps over rSrc[2..3]; stores rDst[1..5]
 117 | %macro sMULx2_4N_3_ELOG 9.nolist
 118 |   %xdefine %%rDst %1
 119 |   %xdefine %%rSrc %2
 120 |   %xdefine %%update_idx %3
 121 |   %xdefine %%B0 %4
 122 |   %xdefine %%B1 %5
 123 |   %xdefine %%T0 %6
 124 |   %xdefine %%T1 %7
 125 |   %xdefine %%T2 %8
 126 |   %xdefine %%T3 %9
 127 |   ; On entry rax holds the multiplicand for the first mul and T0/T1/T2 hold pending limbs; T0..T3 rotate as accumulators
 128 |    mul   %%B1                                                       ; {T2:T1} += a[lenA-3]*B1
 129 |    xor   %%T3, %%T3                                                 ; T3 = 0 (next carry limb)
 130 |    add   %%T1, rax
 131 |    mov   rax, qword [%%rSrc+sizeof(qword)*2]                    ; a[lenA-2]
 132 |    adc   %%T2, rdx
 133 | 
 134 |    mul   %%B0                                                       ; {T3:T2:T1} += a[LenA-2]*B0
 135 |    mov   qword [%%rDst+sizeof(qword)], %%T0                      ; store rDst[1] (mov keeps flags)
 136 |    add   %%T1, rax
 137 |    mov   rax, qword [%%rSrc+sizeof(qword)*2]                    ; a[lenA-2]
 138 |    adc   %%T2, rdx
 139 |    adc   %%T3, 0
 140 | 
 141 |    mul   %%B1                                                       ; {T3:T2} += a[lenA-2]*B1
 142 |    xor   %%T0, %%T0                                                 ; T0 = 0 (reused as next carry limb)
 143 |    add   %%T2, rax
 144 |    mov   rax, qword [%%rSrc+sizeof(qword)*3]                    ; a[lenA-1]
 145 |    adc   %%T3, rdx
 146 | 
 147 |    mul   %%B0                                                       ; {T0:T3:T2} += a[lenA-1]*B0
 148 |    mov   qword [%%rDst+sizeof(qword)*2], %%T1                    ; store rDst[2]
 149 |    add   %%T2, rax
 150 |    mov   rax, qword [%%rSrc+sizeof(qword)*3]                    ; a[lenA-1]
 151 |    adc   %%T3, rdx
 152 |    adc   %%T0, 0
 153 | 
 154 |    mul   %%B1                                                       ; {T0:T3} += a[lenA-1]*B1
 155 |   ;add   rDst, sizeof(qword)*2
 156 |    %%update_idx
 157 |    mov   qword [%%rDst+sizeof(qword)*3], %%T2                    ; store rDst[3]
 158 |    add   %%T3, rax
 159 |   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
 160 |    adc   rdx, %%T0                                                  ; rdx = high half + T0 + CF
 161 | 
 162 |    mov   qword [%%rDst+sizeof(qword)*4], %%T3                    ; store rDst[4]
 163 |    mov   qword [%%rDst+sizeof(qword)*5], rdx                     ; store rDst[5] (top limb)
 164 | %endmacro
165 | 
166 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
167 | ;;
 168 | ;; MLAx2 general-case multiplier macros
169 | ;;
170 | 
 171 | ;; MLAx2 prologue: loads the two multiplier limbs and starts the multiply-accumulate chain
 172 | ;; B0 = rSrc[-2]
 173 | ;; B1 = rSrc[-1]
 174 | ;; inp_vector = rSrc
 175 | ;; out_vector = rDst
 176 | %macro sMLAx2_PLOG 8.nolist
 177 |   %xdefine %%rDst %1
 178 |   %xdefine %%rSrc %2
 179 |   %xdefine %%B0 %3
 180 |   %xdefine %%B1 %4
 181 |   %xdefine %%T0 %5
 182 |   %xdefine %%T1 %6
 183 |   %xdefine %%T2 %7
 184 |   %xdefine %%T3 %8
 185 |   ; On exit: rax = rSrc[0], T0/T1 hold the pending partial sum, T2 = 0; T3 is not referenced here
 186 |    mov      %%B0, qword [%%rSrc-2*sizeof(qword)]   ; B0 = rSrc[-2]
 187 |    mov      %%B1, qword [%%rSrc-sizeof(qword)]     ; B1 = rSrc[-1]
 188 | 
 189 |    mov      rax, %%B1
 190 |    mul      %%B0                                     ; rdx:rax = B0*B1
 191 |    xor      %%T0, %%T0                               ; T0 = 0
 192 | 
 193 |    add      qword [%%rDst-sizeof(qword)], rax        ; rDst[-1] += low half, sets CF
 194 |    mov      rax, qword [%%rSrc]                  ; a[i]
 195 |    adc      %%T0, rdx                                ; T0 = high half + CF
 196 | 
 197 |    mul      %%B0                                     ; B0*a[i]
 198 |    xor      %%T1, %%T1                               ; T1 = 0
 199 |    xor      %%T2, %%T2                               ; T2 = 0
 200 |    add      %%T0, rax                                ; T0 += low half, sets CF
 201 |    mov      rax, qword [%%rSrc]                  ; a[i] reloaded; this macro does not multiply it again
 202 |    adc      %%T1, rdx                                ; T1 = high half + CF
 203 | %endmacro
204 | 
 205 | ;; dst = + src * {B1:B0} epilogue (srcLen=4*n+1): one final mul by B1, accumulated into rDst[3]; stores rDst[3..5]
 206 | %macro sMLAx2_4N_1_ELOG 9.nolist
 207 |   %xdefine %%rDst %1
 208 |   %xdefine %%rSrc %2
 209 |   %xdefine %%update_idx %3
 210 |   %xdefine %%B0 %4
 211 |   %xdefine %%B1 %5
 212 |   %xdefine %%T0 %6
 213 |   %xdefine %%T1 %7
 214 |   %xdefine %%T2 %8
 215 |   %xdefine %%T3 %9
 216 |   ; On entry rax holds the multiplicand and T0/T1/T2 hold pending limbs; rSrc, B0 and T3 are not referenced here
 217 |    mul   %%B1                                                       ; {T2:T1} += a[lenA-1]*B1 + r[lenA-1]
 218 |   ;add   rDst, sizeof(qword)*2
 219 |    %%update_idx
 220 |    add   %%T0, qword [%%rDst+sizeof(qword)*3]                    ; T0 += existing rDst[3], sets CF
 221 |    mov   qword [%%rDst+sizeof(qword)*3], %%T0                    ; store rDst[3] (mov keeps CF)
 222 |    adc   %%T1, rax                                                  ; T1 += low half + CF
 223 |    adc   rdx, %%T2                                                  ; rdx = high half + T2 + CF
 224 |   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
 225 | 
 226 |    mov   qword [%%rDst+sizeof(qword)*4], %%T1                    ; store rDst[4]
 227 |    mov   qword [%%rDst+sizeof(qword)*5], rdx                     ; store rDst[5] (top limb)
 228 | %endmacro
229 | 
230 | ;; dst = + src * {B1:B0} epilogue (srcLen=4*n+3)
231 | %macro sMLAx2_4N_3_ELOG 9.nolist
232 |   %xdefine %%rDst %1
233 |   %xdefine %%rSrc %2
234 |   %xdefine %%update_idx %3
235 |   %xdefine %%B0 %4
236 |   %xdefine %%B1 %5
237 |   %xdefine %%T0 %6
238 |   %xdefine %%T1 %7
239 |   %xdefine %%T2 %8
240 |   %xdefine %%T3 %9
241 |    ; Epilogue for the last three source limbs; rax on entry is multiplied by B1 (presumably a[lenA-3] - TODO confirm). Clobbers rax, rdx, T0..T3.
242 |    mul   %%B1                                                       ; {T2:T1} += a[lenA-3]*B1
243 |    xor   %%T3, %%T3
244 |    add   %%T1, rax
245 |    mov   rax, qword [%%rSrc+sizeof(qword)*2]                    ; a[lenA-2]
246 |    adc   %%T2, rdx
247 | 
248 |    mul   %%B0                                                       ; {T3:T2:T1} += a[LenA-2]*B0 + r[len-3]
249 |    add   %%T0, qword [%%rDst+sizeof(qword)]                     ; T0 += existing destination limb, then written back
250 |    mov   qword [%%rDst+sizeof(qword)], %%T0
251 |    adc   %%T1, rax
252 |    adc   %%T2, rdx
253 |    adc   %%T3, 0
254 |    mov   rax, qword [%%rSrc+sizeof(qword)*2]                    ; a[lenA-2]
255 | 
256 |    mul   %%B1                                                       ; {T3:T2} += a[lenA-2]*B1
257 |    xor   %%T0, %%T0                                             ; T0 was stored above; reuse it as the next top carry word
258 |    add   %%T2, rax
259 |    adc   %%T3, rdx
260 |    mov   rax, qword [%%rSrc+sizeof(qword)*3]                    ; a[lenA-1]
261 | 
262 |    mul   %%B0                                                       ; {T0:T3:T2} += a[lenA-1]*B0 + r[lenA-2]
263 |    add   %%T1, qword [%%rDst+sizeof(qword)*2]
264 |    mov   qword [%%rDst+sizeof(qword)*2], %%T1
265 |    adc   %%T2, rax
266 |    adc   %%T3, rdx
267 |    adc   %%T0, 0
268 |    mov   rax, qword [%%rSrc+sizeof(qword)*3]                    ; a[lenA-1]
269 | 
270 |    mul   %%B1                                                       ; {T0:T3} += a[lenA-1]*B1 + r[lenA-1]
271 |   ;add   rDst, sizeof(qword)*2
272 |    %%update_idx
273 |    add   %%T2, qword [%%rDst+sizeof(qword)*3]
274 |    mov   qword [%%rDst+sizeof(qword)*3], %%T2
275 |    adc   %%T3, rax
276 |    adc   rdx, %%T0                                              ; rdx = high half + T0 + carry (top limb)
277 |   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
278 |    ; the update_idx argument is expanded before the stores at offsets 3..5, so those are relative to rDst as it leaves update_idx
279 |    mov   qword [%%rDst+sizeof(qword)*4], %%T3
280 |    mov   qword [%%rDst+sizeof(qword)*5], rdx
281 | %endmacro
282 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/clear_regs.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | %ifndef _CLEAR_REGS_ASM_
 18 | %define _CLEAR_REGS_ASM_
 19 | 
 20 | %include "os.inc"
 21 | 
 22 | ; clear_gps reg1 [, reg2, ... reg16]
 23 | ; Zeroes every general-purpose register passed (1 to 16 arguments)
 24 | ; by emitting one "xor reg, reg" per argument.
 25 | %macro clear_gps 1-16
 26 | %define %%NUM_REGS %0
 27 | %rep %%NUM_REGS
 28 |         xor %1, %1      ; zero the current argument; the %rotate below makes the next one %1
 29 | %rotate 1
 30 | %endrep
 31 | %endmacro
 32 | 
 33 | ; clear_xmms_sse xmm_a [, xmm_b, ... up to 16 registers]
 34 | ; Zeroes every XMM register passed using the SSE encoding
 35 | ; ("pxor reg, reg"), one instruction per argument.
 36 | %macro clear_xmms_sse 1-16
 37 | %define %%NUM_REGS %0
 38 | %rep %%NUM_REGS
 39 |         pxor    %1, %1  ; zero the current argument; the %rotate below makes the next one %1
 40 | %rotate 1
 41 | %endrep
 42 | %endmacro
 43 | 
 44 | ; clear_xmms_avx xmm_a [, xmm_b, ... up to 16 registers]
 45 | ; Zeroes every XMM register passed using the AVX mnemonic
 46 | ; ("vpxor reg, reg", written in two-operand form), one instruction per argument.
 47 | %macro clear_xmms_avx 1-16
 48 | %define %%NUM_REGS %0
 49 | %rep %%NUM_REGS
 50 |         vpxor   %1, %1  ; zero the current argument; the %rotate below makes the next one %1
 51 | %rotate 1
 52 | %endrep
 53 | %endmacro
 54 | 
 55 | ; clear_ymms ymm_a [, ymm_b, ... up to 16 registers]
 56 | ; Zeroes every YMM register passed ("vpxor reg, reg"), one instruction
 57 | ; per argument. The body is the same as clear_xmms_avx; only the intended operands differ.
 58 | %macro clear_ymms 1-16
 59 | %define %%NUM_REGS %0
 60 | %rep %%NUM_REGS
 61 |         vpxor   %1, %1  ; zero the current argument; the %rotate below makes the next one %1
 62 | %rotate 1
 63 | %endrep
 64 | %endmacro
 65 | 
 66 | ; clear_zmms zmm_a [, zmm_b, ... up to 32 registers]
 67 | ; Zeroes every ZMM register passed ("vpxorq reg, reg"), one instruction
 68 | ; per argument.
 69 | %macro clear_zmms 1-32
 70 | %define %%NUM_REGS %0
 71 | %rep %%NUM_REGS
 72 |         vpxorq  %1, %1  ; zero the current argument; the %rotate below makes the next one %1
 73 | %rotate 1
 74 | %endrep
 75 | %endmacro
 76 | 
 77 | ; clear_scratch_gps_asm (no arguments)
 78 | ; Zeroes the scratch GP registers: rax, rcx, rdx, r8-r11 always,
 79 | ; plus rdi and rsi when LINUX is defined.
 80 | ;
 81 | %macro clear_scratch_gps_asm 0
 82 |         clear_gps rax, rcx, rdx, r8, r9, r10, r11
 83 | %ifdef LINUX
 84 |         clear_gps rdi, rsi
 85 | %endif
 86 | %endmacro
 87 | 
 88 | ; clear_scratch_xmms_sse_asm (no arguments)
 89 | ; Zeroes the scratch XMM registers with SSE pxor: XMM0-XMM15 when LINUX is defined,
 90 | ; XMM0-XMM5 otherwise. Note: overwrites the global preprocessor variable "i".
 91 | %macro clear_scratch_xmms_sse_asm 0
 92 | %ifdef LINUX
 93 | %assign i 0
 94 | %rep 16
 95 |         pxor    xmm %+ i, xmm %+ i      ; "xmm %+ i" pastes the counter onto the register name
 96 | %assign i (i+1)
 97 | %endrep
 98 | ; On Windows, XMM0-XMM5 registers are scratch registers
 99 | %else
100 | %assign i 0
101 | %rep 6
102 |         pxor    xmm %+ i, xmm %+ i
103 | %assign i (i+1)
104 | %endrep
105 | %endif ; LINUX
106 | %endmacro
107 | 
108 | ; clear_scratch_xmms_avx_asm (no arguments)
109 | ; Zeroes the scratch XMM registers on AVX: a single vzeroall when LINUX is defined,
110 | ; vpxor on XMM0-XMM5 otherwise. Note: the non-LINUX path overwrites the global preprocessor variable "i".
111 | %macro clear_scratch_xmms_avx_asm 0
112 | %ifdef LINUX
113 |         vzeroall
114 | ; On Windows, XMM0-XMM5 registers are scratch registers
115 | %else
116 | %assign i 0
117 | %rep 6
118 |         vpxor   xmm %+ i, xmm %+ i
119 | %assign i (i+1)
120 | %endrep
121 | %endif ; LINUX
122 | %endmacro
123 | 
124 | ;
125 | ; This macro clears all scratch YMM registers (no arguments).
126 | ;
127 | ; It should be called before restoring the XMM registers
128 | ; for Windows (XMM6-XMM15). The non-LINUX path overwrites the global preprocessor variable "i".
129 | ;
130 | %macro clear_scratch_ymms_asm 0
131 | ; On Linux, all YMM registers are scratch registers
132 | %ifdef LINUX
133 |         vzeroall
134 | ; On Windows, YMM0-YMM5 registers are scratch registers.
135 | ; The upper 128 bits of YMM6-YMM15 are scratch too, but their
136 | ; lower 128 bits are to be restored after calling this macro,
137 | ; and that restore clears the upper bits as well.
138 | %else
139 | %assign i 0
140 | %rep 6
141 |         vpxor   ymm %+ i, ymm %+ i
142 | %assign i (i+1)
143 | %endrep
144 | %endif ; LINUX
145 | %endmacro
146 | 
147 | ;
148 | ; This macro clears all scratch ZMM registers (no arguments).
149 | ; It overwrites the global preprocessor variable "i".
150 | ; It should be called before restoring the XMM registers
151 | ; for Windows (XMM6-XMM15). YMM registers are used
152 | ; on purpose, since XOR'ing YMM registers is faster
153 | ; than XOR'ing ZMM registers, and the operation also
154 | ; clears the upper 256 bits.
155 | ;
156 | %macro clear_scratch_zmms_asm 0
157 | ; On Linux, all ZMM registers are scratch registers
158 | %ifdef LINUX
159 |         vzeroall
160 |         ;; vzeroall only clears the first 16 ZMM registers
161 | %assign i 16
162 | %rep 16
163 |         vpxorq  ymm %+ i, ymm %+ i      ; registers 16..31 are cleared one by one
164 | %assign i (i+1)
165 | %endrep
166 | ; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers.
167 | ; The upper 384 bits of ZMM6-ZMM15 are scratch too, but their
168 | ; lower 128 bits are to be restored after calling this macro,
169 | ; and that restore clears the upper bits as well.
170 | %else
171 | %assign i 0
172 | %rep 6
173 |         vpxorq  ymm %+ i, ymm %+ i
174 | %assign i (i+1)
175 | %endrep
176 | 
177 | %assign i 16
178 | %rep 16
179 |         vpxorq  ymm %+ i, ymm %+ i
180 | %assign i (i+1)
181 | %endrep
182 | %endif ; LINUX
183 | %endmacro
184 | 
185 | %endif ;; _CLEAR_REGS_ASM
186 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/cpinitas.asm:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2014-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | %include "asmdefs.inc"
 18 | %include "ia_32e.inc"
 19 | 
 20 | %assign LOCAL_ALIGN_FACTOR 32
 21 | 
 22 | %ifdef __ARCH_DATA
 23 | 
 24 | segment .text align=LOCAL_ALIGN_FACTOR
 25 | 
 26 | ;####################################################################
 27 | ;#          void cpGetReg( int* buf, int valueEAX, int valueECX ); #
 28 | ;####################################################################
 29 | 
 30 | %ifdef WIN32E
 31 |   %define buf       rcx
 32 |   %define valueEAX  edx
 33 |   %define valueECX  r8d
 34 | %else
 35 |   %define buf       rdi
 36 |   %define valueEAX  esi
 37 |   %define valueECX  edx
 38 | %endif
 39 | ; cpGetReg: runs CPUID with EAX=valueEAX, ECX=valueECX and stores the resulting EAX, EBX, ECX, EDX into buf[0..3].
 40 | align LOCAL_ALIGN_FACTOR
 41 | DECLARE_FUNC cpGetReg,PUBLIC
 42 |         push rbx                        ; cpuid overwrites ebx; rbx is restored before returning
 43 |         movsxd  r9, valueEAX            ; copy the arguments away first: the argument registers
 44 |         movsxd  r10, valueECX           ; overlap with the ones cpuid reads and writes
 45 |         mov     r11, buf
 46 | 
 47 |         mov     rax, r9                 ; leaf
 48 |         mov     rcx, r10                ; sub-leaf
 49 |         xor     ebx, ebx
 50 |         xor     edx, edx
 51 |         cpuid
 52 |         mov     [r11], eax              ; buf[0] = EAX
 53 |         mov     [r11 + 4], ebx          ; buf[1] = EBX
 54 |         mov     [r11 + 8], ecx          ; buf[2] = ECX
 55 |         mov     [r11 + 12], edx         ; buf[3] = EDX
 56 |         pop rbx
 57 |         ret
 58 | ENDFUNC cpGetReg
 59 | 
 60 | ;###################################################
 61 | 
 62 | ; OSXSAVE support, feature information after cpuid(1), ECX, bit 27 ( XGETBV is enabled by OS )
 63 | %assign XSAVEXGETBV_FLAG   8000000h
 64 | 
 65 | ; Feature information after XGETBV(ECX=0), EAX, bits 2,1 ( XMM state and YMM state are enabled by OS )
 66 | %assign XGETBV_MASK        06h
 67 | ; Feature information after XGETBV(ECX=0), EAX, bits 7,6,5 ( opmask, ZMM_Hi256 and Hi16_ZMM state are enabled by OS )
 68 | %assign XGETBV_AVX512_MASK 0E0h
 69 | ; cp_is_avx_extension: returns eax=1 when CPUID.1:ECX bits 27 and 28 are set and XGETBV(0) EAX has bits 1 and 2 set; otherwise eax=0.
 70 | align LOCAL_ALIGN_FACTOR
 71 | DECLARE_FUNC cp_is_avx_extension,PUBLIC
 72 |          push  rbx                 ; cpuid overwrites ebx
 73 |          mov   eax, 1
 74 |          cpuid
 75 |          xor   eax, eax            ; default result: 0
 76 |          and   ecx, 018000000h     ; keep bits 27 and 28 (OSXSAVE and AVX)
 77 |          cmp   ecx, 018000000h     ; both must be set
 78 |          jne   .not_avx
 79 |          xor   ecx, ecx            ; XGETBV index 0
 80 |          db 00fh,001h,0d0h        ; xgetbv
 81 |          mov   ecx, eax            ; low dword returned by xgetbv
 82 |          xor   eax, eax
 83 |          and   ecx, XGETBV_MASK
 84 |          cmp   ecx, XGETBV_MASK    ; every bit of the mask must be set
 85 |          jne   .not_avx
 86 |          mov   eax, 1
 87 | .not_avx:
 88 |          pop   rbx
 89 |          ret
 90 | ENDFUNC cp_is_avx_extension
 91 | ; cp_is_avx512_extension: returns eax=1 when CPUID.1:ECX bit 27 is set and XGETBV(0) EAX has all XGETBV_AVX512_MASK bits set; otherwise eax=0. No CPUID AVX-512 feature bit is tested here.
 92 | align LOCAL_ALIGN_FACTOR
 93 | DECLARE_FUNC cp_is_avx512_extension,PUBLIC
 94 |          push  rbx                 ; cpuid overwrites ebx
 95 |          mov   eax, 1
 96 |          cpuid
 97 |          xor   eax, eax            ; default result: 0
 98 |          and   ecx, XSAVEXGETBV_FLAG
 99 |          cmp   ecx, XSAVEXGETBV_FLAG
100 |          jne   .not_avx512
101 |          xor   ecx, ecx            ; XGETBV index 0
102 |          db 00fh,001h,0d0h        ; xgetbv
103 |          mov   ecx, eax            ; low dword returned by xgetbv
104 |          xor   eax, eax
105 |          and   ecx, XGETBV_AVX512_MASK
106 |          cmp   ecx, XGETBV_AVX512_MASK  ; every bit of the mask must be set
107 |          jne   .not_avx512
108 |          mov   eax, 1
109 | .not_avx512:
110 |          pop   rbx
111 |          ret
112 | ENDFUNC cp_is_avx512_extension
113 | ; cp_issue_avx512_instruction: executes one hand-encoded AVX-512 instruction and returns eax=0.
114 | align LOCAL_ALIGN_FACTOR
115 | DECLARE_FUNC cp_issue_avx512_instruction,PUBLIC
116 |          db    062h,0f1h,07dh,048h,0efh,0c0h ; vpxord  zmm0, zmm0, zmm0
117 |          xor   eax, eax                      ; return value 0
118 |          ret
119 | ENDFUNC cp_issue_avx512_instruction
120 | 
121 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
122 | ; cp_get_pentium_counter: returns the 64-bit time-stamp counter in rax (rdtsc, no serialization).
123 | align LOCAL_ALIGN_FACTOR
124 | DECLARE_FUNC cp_get_pentium_counter,PUBLIC
125 |          rdtsc                  ; edx:eax = time-stamp counter
126 |          sal    rdx,32          ; move the high half into bits 63..32
127 |          or     rax,rdx         ; rax = full 64-bit value
128 |          ret
129 | ENDFUNC cp_get_pentium_counter
130 | ; cpStartTscp: executes cpuid (leaf 0) and then rdtscp; returns the 64-bit counter in rax. rcx and rdx are clobbered.
131 | align LOCAL_ALIGN_FACTOR
132 | DECLARE_FUNC cpStartTscp,PUBLIC
133 |          push     rbx           ; cpuid overwrites ebx
134 |          xor      rax, rax      ; cpuid leaf 0
135 |          cpuid                  ; issued before the counter is read
136 |          pop      rbx
137 |          rdtscp                 ; edx:eax = time-stamp counter
138 |          sal      rdx,32
139 |          or       rax,rdx       ; rax = full 64-bit value
140 |          ret
141 | ENDFUNC cpStartTscp
142 | ; cpStopTscp: reads the counter with rdtscp first, then executes cpuid (leaf 0); returns the 64-bit counter in rax. rcx and rdx are clobbered.
143 | align LOCAL_ALIGN_FACTOR
144 | DECLARE_FUNC cpStopTscp,PUBLIC
145 |          rdtscp                 ; edx:eax = time-stamp counter
146 |          sal      rdx,32
147 |          or       rax,rdx       ; rax = full 64-bit value
148 |          push     rax           ; keep the result across cpuid, which overwrites eax
149 |          push     rbx           ; cpuid overwrites ebx
150 |          xor      rax, rax      ; cpuid leaf 0
151 |          cpuid                  ; issued after the counter is read
152 |          pop      rbx
153 |          pop      rax           ; result
154 |          ret
155 | ENDFUNC cpStopTscp
156 | ; cpStartTsc: executes cpuid (leaf 0) and then rdtsc; returns the 64-bit counter in rax. rcx and rdx are clobbered.
157 | align LOCAL_ALIGN_FACTOR
158 | DECLARE_FUNC cpStartTsc,PUBLIC
159 |          push     rbx           ; cpuid overwrites ebx
160 |          xor      rax, rax      ; cpuid leaf 0
161 |          cpuid                  ; issued before the counter is read
162 |          pop      rbx
163 |          rdtsc                  ; edx:eax = time-stamp counter
164 |          sal      rdx,32
165 |          or       rax,rdx       ; rax = full 64-bit value
166 |          ret
167 | ENDFUNC cpStartTsc
168 | ; cpStopTsc: reads the counter with rdtsc first, then executes cpuid (leaf 0); returns the 64-bit counter in rax. rcx and rdx are clobbered.
169 | align LOCAL_ALIGN_FACTOR
170 | DECLARE_FUNC cpStopTsc,PUBLIC
171 |          rdtsc                  ; edx:eax = time-stamp counter
172 |          sal      rdx,32
173 |          or       rax,rdx       ; rax = full 64-bit value
174 |          push     rax           ; keep the result across cpuid, which overwrites eax
175 |          push     rbx           ; cpuid overwrites ebx
176 |          xor      rax, rax      ; cpuid leaf 0
177 |          cpuid                  ; issued after the counter is read
178 |          pop      rbx
179 |          pop      rax           ; result
180 |          ret
181 | ENDFUNC cpStopTsc
182 | 
183 | 
184 | ;*****************************************
185 | ; int cpGetCacheSize( int* tableCache );
186 | align LOCAL_ALIGN_FACTOR
187 | %define table rdi
188 | DECLARE_FUNC cpGetCacheSize,PUBLIC
189 | %assign LOCAL_FRAME 16
190 |         USES_GPR rsi, rdi, rbx, rbp
191 |         USES_XMM
192 |         COMP_ABI 1
193 |         ; Collects the CPUID leaf 2 descriptor bytes on the stack, then walks the 8-byte table entries {key dword, value dword}; returns the value of the first entry whose key low byte matches a descriptor, or -1.
194 |         mov     rbp, rsp                ; rbp = write cursor into the local buffer at [rsp]
195 |         xor     esi, esi                ; esi = running count of descriptor bytes
196 | 
197 |         mov     eax, 2
198 |         cpuid
199 |         ; only the case where the low byte of EAX is 1 is handled
200 |         cmp     al, 1
201 |         jne     .GetCacheSize_11
202 |         ; a register with bit 31 set is discarded (zeroed)
203 |         test    eax, 080000000h
204 |         jz      .GetCacheSize_00
205 |         xor     eax, eax
206 | .GetCacheSize_00:
207 |         test    ebx, 080000000h
208 |         jz      .GetCacheSize_01
209 |         xor     ebx, ebx
210 | .GetCacheSize_01:
211 |         test    ecx, 080000000h
212 |         jz      .GetCacheSize_02
213 |         xor     ecx, ecx
214 | .GetCacheSize_02:
215 |         test    edx, 080000000h
216 |         jz      .GetCacheSize_03
217 |         xor     edx, edx
218 |         ; every non-zero register is appended to the buffer, 4 bytes each
219 | .GetCacheSize_03:
220 |         test    eax, eax
221 |         jz      .GetCacheSize_04
222 |         mov     [rbp], eax
223 |         add     rbp, 4
224 |         add     esi, 3                  ; only 3 bytes counted for EAX: its low byte (checked above) is not a descriptor
225 | .GetCacheSize_04:
226 |         test    ebx, ebx
227 |         jz      .GetCacheSize_05
228 |         mov     [rbp], ebx
229 |         add     rbp, 4
230 |         add     esi, 4
231 | .GetCacheSize_05:
232 |         test    ecx, ecx
233 |         jz      .GetCacheSize_06
234 |         mov     [rbp], ecx
235 |         add     rbp, 4
236 |         add     esi, 4
237 | .GetCacheSize_06:
238 |         test    edx, edx
239 |         jz      .GetCacheSize_07
240 |         mov     [rbp], edx
241 |         add     esi, 4
242 |         ; NOTE(review): the scan below reads bytes [rsp+esi]..[rsp+1]; if EAX was discarded above, byte 0 holds a real descriptor that is skipped and byte [rsp+esi] lies past the stored data - confirm.
243 | .GetCacheSize_07:
244 |         test    esi, esi
245 |         jz      .GetCacheSize_11        ; nothing collected -> return -1
246 |         mov     eax, -1                 ; result if the table ends without a match
247 | .GetCacheSize_08:
248 |         xor     edx, edx
249 |         add     edx, [table]            ; edx = key dword of the current entry; ZF set when it is zero
250 |         jz      .ExitGetCacheSize00     ; a zero key terminates the table
251 |         add     table, 8                ; advance to the next 8-byte entry
252 |         mov     ecx, esi
253 | .GetCacheSize_09:
254 |         cmp     dl, BYTE [rsp + rcx]    ; compare the key's low byte with the collected byte at index rcx
255 |         je      .GetCacheSize_10
256 |         dec     ecx
257 |         jnz     .GetCacheSize_09        ; indices esi down to 1
258 |         jmp     .GetCacheSize_08
259 | 
260 | .GetCacheSize_10:
261 |         mov     eax, [table - 4]        ; value dword of the matched entry (table was already advanced)
262 | 
263 | .ExitGetCacheSize00:
264 |         REST_XMM
265 |         REST_GPR
266 |         ret
267 | 
268 | .GetCacheSize_11:
269 |         mov     eax, -1
270 |         jmp     .ExitGetCacheSize00
271 | ENDFUNC cpGetCacheSize
272 | 
273 | ;****************************
274 | 
275 | %endif ; __ARCH_DATA
276 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/emulator.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2009-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  EM64T Cryptography Primitive.
 20 | ;
 21 | ;
 22 | ;
 23 | 
 24 | %ifndef _EMULATOR_INC_
 25 | %define _EMULATOR_INC_
 26 | 
 27 | %macro my_pclmulqdq 3.nolist
 28 |   %xdefine %%xxDst %1
 29 |   %xdefine %%xxSrc %2
 30 |   %xdefine %%xxOp  %3
 31 |   ;; my_pclmulqdq xxDst, xxSrc, xxOp: native pclmulqdq when my_emulator == 0, otherwise a call to emu_pclmulqdq
 32 |   %if (my_emulator == 0)
 33 |     pclmulqdq   %%xxDst, %%xxSrc, %%xxOp
 34 |   %else
 35 | ;;
 36 | ;; Emulation path: flags and the GPRs <rax,rbx,rcx,rdx,rdi,rsi,rbp,r8,..,r15> are saved,
 37 | ;; then a 32-byte scratch area is reserved at rsp:
 38 | ;; +00  => xxDst  (pointer passed in rcx, result read back from here)
 39 | ;; +16  => xxSrc  (pointer passed in rdx); xxOp is passed in r8
 40 | 
 41 |     pushf
 42 |     push     rax
 43 |     push     rbx
 44 |     push     rcx
 45 |     push     rdx
 46 |     push     rdi
 47 |     push     rsi
 48 |     push     rbp
 49 |     push     r8
 50 |     push     r9
 51 |     push     r10
 52 |     push     r11
 53 |     push     r12
 54 |     push     r13
 55 |     push     r14
 56 |     push     r15
 57 | 
 58 |     %assign %%stackSize (sizeof(oword)*2)
 59 |     sub      rsp,%%stackSize
 60 | 
 61 |     movdqu   oword [rsp+00], %%xxDst  ;; save Dst
 62 |     movdqu   oword [rsp+16], %%xxSrc  ;; save Src
 63 | 
 64 |     lea      rcx, [rsp+00]
 65 |     lea      rdx, [rsp+16]
 66 |     mov      r8, %%xxOp
 67 | 
 68 |     sub      rsp, (sizeof(qword)*3)   ;; extra room below the scratch area for the duration of the call
 69 |     call     emu_pclmulqdq
 70 |     add      rsp, (sizeof(qword)*3)
 71 | 
 72 |     movdqu   %%xxDst, oword [rsp+00]  ;; return Dst
 73 |    ;movdqu   xxSrc, oword [rsp+16]  ;; return Src
 74 |     add      rsp, %%stackSize         ;; must be rsp: a 32-bit write to esp would zero the upper half of rsp
 75 | 
 76 |     pop      r15
 77 |     pop      r14
 78 |     pop      r13
 79 |     pop      r12
 80 |     pop      r11
 81 |     pop      r10
 82 |     pop      r9
 83 |     pop      r8
 84 |     pop      rbp
 85 |     pop      rsi
 86 |     pop      rdi
 87 |     pop      rdx
 88 |     pop      rcx
 89 |     pop      rbx
 90 |     pop      rax
 91 |     popf
 92 |   %endif
 93 | %endmacro
 94 | 
 95 | %macro my_aesenc 2.nolist
 96 |   %xdefine %%xxDst %1
 97 |   %xdefine %%xxSrc %2
 98 |   ;; my_aesenc xxDst, xxSrc: native aesenc when my_emulator == 0, otherwise a call to emu_aesenc
 99 |   %if (my_emulator == 0)
100 |     aesenc %%xxDst, %%xxSrc
101 |   %else
102 |     pushf
103 |     push     rax
104 |     push     rbx
105 |     push     rcx
106 |     push     rdx
107 |     push     rdi
108 |     push     rsi
109 |     push     rbp
110 |     push     r8
111 |     push     r9
112 |     push     r10
113 |     push     r11
114 |     push     r12
115 |     push     r13
116 |     push     r14
117 |     push     r15
118 |     ;; flags and GPRs are saved above; a 32-byte scratch area holds Dst (+00) and Src (+16)
119 |     %assign %%stackSize (sizeof(oword)*2)
120 |     sub      rsp,%%stackSize
121 | 
122 |     movdqu   oword [rsp+00], %%xxDst  ;; save Dst
123 |     movdqu   oword [rsp+16], %%xxSrc  ;; save Src
124 | 
125 |     lea      rcx, [rsp+00]            ;; rcx = pointer to Dst copy
126 |     lea      rdx, [rsp+16]            ;; rdx = pointer to Src copy
127 | 
128 |     sub      rsp, (sizeof(qword)*2)   ;; extra room below the scratch area for the duration of the call
129 |     call     emu_aesenc
130 |     add      rsp, (sizeof(qword)*2)
131 | 
132 |     movdqu   %%xxDst, oword [rsp+00]  ;; return Dst
133 |     add      rsp, %%stackSize         ;; must be rsp: a 32-bit write to esp would zero the upper half of rsp
134 | 
135 |     pop      r15
136 |     pop      r14
137 |     pop      r13
138 |     pop      r12
139 |     pop      r11
140 |     pop      r10
141 |     pop      r9
142 |     pop      r8
143 |     pop      rbp
144 |     pop      rsi
145 |     pop      rdi
146 |     pop      rdx
147 |     pop      rcx
148 |     pop      rbx
149 |     pop      rax
150 |     popf
151 |   %endif
152 | %endmacro
153 | 
154 | %macro my_aesenclast 2.nolist
155 |   %xdefine %%xxDst %1
156 |   %xdefine %%xxSrc %2
157 |   ;; my_aesenclast xxDst, xxSrc: native aesenclast when my_emulator == 0, otherwise a call to emu_aesenclast
158 |   %if (my_emulator == 0)
159 |     aesenclast %%xxDst, %%xxSrc
160 |   %else
161 |     pushf
162 |     push     rax
163 |     push     rbx
164 |     push     rcx
165 |     push     rdx
166 |     push     rdi
167 |     push     rsi
168 |     push     rbp
169 |     push     r8
170 |     push     r9
171 |     push     r10
172 |     push     r11
173 |     push     r12
174 |     push     r13
175 |     push     r14
176 |     push     r15
177 |     ;; flags and GPRs are saved above; a 32-byte scratch area holds Dst (+00) and Src (+16)
178 |     %assign %%stackSize (sizeof(oword)*2)
179 |     sub      rsp,%%stackSize
180 | 
181 |     movdqu   oword [rsp+00], %%xxDst  ;; save Dst
182 |     movdqu   oword [rsp+16], %%xxSrc  ;; save Src
183 | 
184 |     lea      rcx, [rsp+00]            ;; rcx = pointer to Dst copy
185 |     lea      rdx, [rsp+16]            ;; rdx = pointer to Src copy
186 | 
187 |     sub      rsp, (sizeof(qword)*2)   ;; extra room below the scratch area for the duration of the call
188 |     call     emu_aesenclast
189 |     add      rsp, (sizeof(qword)*2)
190 | 
191 |     movdqu   %%xxDst, oword [rsp+00]  ;; return Dst
192 |     add      rsp, %%stackSize         ;; must be rsp: a 32-bit write to esp would zero the upper half of rsp
193 | 
194 |     pop      r15
195 |     pop      r14
196 |     pop      r13
197 |     pop      r12
198 |     pop      r11
199 |     pop      r10
200 |     pop      r9
201 |     pop      r8
202 |     pop      rbp
203 |     pop      rsi
204 |     pop      rdi
205 |     pop      rdx
206 |     pop      rcx
207 |     pop      rbx
208 |     pop      rax
209 |     popf
210 |   %endif
211 | %endmacro
212 | 
213 | %macro my_aesdec 2.nolist
214 |   %xdefine %%xxDst %1
215 |   %xdefine %%xxSrc %2
216 |   ;; my_aesdec xxDst, xxSrc: native aesdec when my_emulator == 0, otherwise a call to emu_aesdec
217 |   %if (my_emulator == 0)
218 |     aesdec %%xxDst, %%xxSrc
219 |   %else
220 |     pushf
221 |     push     rax
222 |     push     rbx
223 |     push     rcx
224 |     push     rdx
225 |     push     rdi
226 |     push     rsi
227 |     push     rbp
228 |     push     r8
229 |     push     r9
230 |     push     r10
231 |     push     r11
232 |     push     r12
233 |     push     r13
234 |     push     r14
235 |     push     r15
236 |     ;; flags and GPRs are saved above; a 32-byte scratch area holds Dst (+00) and Src (+16)
237 |     %assign %%stackSize (sizeof(oword)*2)
238 |     sub      rsp,%%stackSize
239 | 
240 |     movdqu   oword [rsp+00], %%xxDst  ;; save Dst
241 |     movdqu   oword [rsp+16], %%xxSrc  ;; save Src
242 | 
243 |     lea      rcx, [rsp+00]            ;; rcx = pointer to Dst copy
244 |     lea      rdx, [rsp+16]            ;; rdx = pointer to Src copy
245 | 
246 |     sub      rsp, (sizeof(qword)*2)   ;; extra room below the scratch area for the duration of the call
247 |     call     emu_aesdec
248 |     add      rsp, (sizeof(qword)*2)
249 | 
250 |     movdqu   %%xxDst, oword [rsp+00]  ;; return Dst
251 |     add      rsp, %%stackSize         ;; must be rsp: a 32-bit write to esp would zero the upper half of rsp
252 | 
253 |     pop      r15
254 |     pop      r14
255 |     pop      r13
256 |     pop      r12
257 |     pop      r11
258 |     pop      r10
259 |     pop      r9
260 |     pop      r8
261 |     pop      rbp
262 |     pop      rsi
263 |     pop      rdi
264 |     pop      rdx
265 |     pop      rcx
266 |     pop      rbx
267 |     pop      rax
268 |     popf
269 |   %endif
270 | %endmacro
271 | 
272 | %macro my_aesdeclast 2.nolist
273 |   %xdefine %%xxDst %1
274 |   %xdefine %%xxSrc %2
275 |   ;; my_aesdeclast xxDst, xxSrc: native aesdeclast when my_emulator == 0, otherwise a call to emu_aesdeclast
276 |   %if (my_emulator == 0)
277 |     aesdeclast %%xxDst, %%xxSrc
278 |   %else
279 |     pushf
280 |     push     rax
281 |     push     rbx
282 |     push     rcx
283 |     push     rdx
284 |     push     rdi
285 |     push     rsi
286 |     push     rbp
287 |     push     r8
288 |     push     r9
289 |     push     r10
290 |     push     r11
291 |     push     r12
292 |     push     r13
293 |     push     r14
294 |     push     r15
295 |     ;; flags and GPRs are saved above; a 32-byte scratch area holds Dst (+00) and Src (+16)
296 |     %assign %%stackSize (sizeof(oword)*2)
297 |     sub      rsp,%%stackSize
298 | 
299 |     movdqu   oword [rsp+00], %%xxDst  ;; save Dst
300 |     movdqu   oword [rsp+16], %%xxSrc  ;; save Src
301 | 
302 |     lea      rcx, [rsp+00]            ;; rcx = pointer to Dst copy
303 |     lea      rdx, [rsp+16]            ;; rdx = pointer to Src copy
304 | 
305 |     sub      rsp, (sizeof(qword)*2)   ;; extra room below the scratch area for the duration of the call
306 |     call     emu_aesdeclast
307 |     add      rsp, (sizeof(qword)*2)
308 | 
309 |     movdqu   %%xxDst, oword [rsp+00]  ;; return Dst
310 |     add      rsp, %%stackSize         ;; must be rsp: a 32-bit write to esp would zero the upper half of rsp
311 | 
312 |     pop      r15
313 |     pop      r14
314 |     pop      r13
315 |     pop      r12
316 |     pop      r11
317 |     pop      r10
318 |     pop      r9
319 |     pop      r8
320 |     pop      rbp
321 |     pop      rsi
322 |     pop      rdi
323 |     pop      rdx
324 |     pop      rcx
325 |     pop      rbx
326 |     pop      rax
327 |     popf
328 |   %endif
329 | %endmacro
330 | ;; The emulation routines called by the my_* macros are only declared when emulation is enabled.
331 | %if (my_emulator != 0)
332 |   extern emu_pclmulqdq
333 |   extern emu_aesenc
334 |   extern emu_aesenclast
335 |   extern emu_aesdec
336 |   extern emu_aesdeclast
337 | %endif
338 | 
339 | %endif
340 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/ia_32e_regs.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2012-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  EM64T Cryptography Primitive.
 20 | ;
 21 | ;
 22 | ;
 23 | 
 24 | ;;
 25 | ;; Aliases that give the general-purpose registers uniform names
 26 | ;;
 27 | 
 28 | %ifndef _IA_32_REGS_INC_
 29 | %define _IA_32_REGS_INC_
 30 | ;; r0..r7 name the legacy registers so they can be written like r8..r15
 31 | %define r0  rax     ;; 64-bits GPRs
 32 | %define r1  rbx
 33 | %define r2  rcx
 34 | %define r3  rdx
 35 | %define r4  rdi
 36 | %define r5  rsi
 37 | %define r6  rbp
 38 | %define r7  rsp
 39 | 
 40 | %define r0d  eax     ;; 32-bits GPRs
 41 | %define r1d  ebx
 42 | %define r2d  ecx
 43 | %define r3d  edx
 44 | %define r4d  edi
 45 | %define r5d  esi
 46 | %define r6d  ebp
 47 | %define r7d  esp
 48 | ;; <name>d suffix form for the legacy names (no alias is defined for rsp)
 49 | %define raxd  eax
 50 | %define rbxd  ebx
 51 | %define rcxd  ecx
 52 | %define rdxd  edx
 53 | %define rdid  edi
 54 | %define rsid  esi
 55 | %define rbpd  ebp
 56 | 
 57 | %define r0w  ax     ;; 16-bits GPRs
 58 | %define r1w  bx
 59 | %define r2w  cx
 60 | %define r3w  dx
 61 | %define r4w  di
 62 | %define r5w  si
 63 | %define r6w  bp
 64 | %define r7w  sp
 65 | ;; <name>w suffix form for the legacy names (no alias is defined for rsp)
 66 | %define raxw  ax
 67 | %define rbxw  bx
 68 | %define rcxw  cx
 69 | %define rdxw  dx
 70 | %define rdiw  di
 71 | %define rsiw  si
 72 | %define rbpw  bp
 73 | 
 74 | %define r0b  al     ;;  8-bits GPRs
 75 | %define r1b  bl
 76 | %define r2b  cl
 77 | %define r3b  dl
 78 | %define r4b  dil
 79 | %define r5b  sil
 80 | %define r6b  bpl
 81 | %define r7b  spl
 82 | ;; <name>b suffix form for the legacy names (no alias is defined for rsp)
 83 | %define raxb  al
 84 | %define rbxb  bl
 85 | %define rcxb  cl
 86 | %define rdxb  dl
 87 | %define rdib  dil
 88 | %define rsib  sil
 89 | %define rbpb  bpl
 90 | ;; explicit low-byte (bl) and high-byte (bh) forms, available for rax..rdx only
 91 | %define raxbl  al
 92 | %define rbxbl  bl
 93 | %define rcxbl  cl
 94 | %define rdxbl  dl
 95 | %define raxbh  ah
 96 | %define rbxbh  bh
 97 | %define rcxbh  ch
 98 | %define rdxbh  dh
 99 | 
100 | ;;
101 | ;; Register Parameters (depend on used OS); rpar1..rpar6 are defined only when WIN32E or LINUX32E is defined
102 | ;;
103 | %ifdef WIN32E
104 |   %define rpar1 rcx
105 |   %define rpar2 rdx
106 |   %define rpar3 r8
107 |   %define rpar4 r9
108 |   %define rpar5 [rsp + ARG_5]
109 |   %define rpar6 [rsp + ARG_6]
110 | %endif
111 | 
112 | %ifdef LINUX32E
113 |   %define rpar1 rdi
114 |   %define rpar2 rsi
115 |   %define rpar3 rdx
116 |   %define rpar4 rcx
117 |   %define rpar5 r8
118 |   %define rpar6 r9
119 | %endif
120 | 
121 | ;; use GPR implementation everywhere possible
122 | %assign GPR_version 1
123 | 
124 | %endif
125 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_uadd_m7as.asm:
--------------------------------------------------------------------------------
  1 | %include "asmdefs.inc"
  2 | %include "ia_32e.inc"
  3 | 
  4 | ;
  5 | ; carry, r[:n] = a[:n] + b[:n]
  6 | ; uint64_t mpn_add_vectorized(uint64_t *r, const uint64_t *a, const uint64_t *b, unsigned int n)
  7 | ;
  8 | 
  9 | segment .text align=ARCH_ALIGN_FACTOR
 10 | 
 11 | align ARCH_ALIGN_FACTOR
 12 | IPPASM mpn_add_vectorized,PUBLIC
 13 | %assign LOCAL_FRAME 0
 14 |         USES_GPR rsi,rdi
 15 |         USES_XMM
 16 |         COMP_ABI 4
 17 | 
 18 | ; rdi = r
 19 | ; rsi = a
 20 | ; rdx = b
 21 | ; rcx = n
 22 | 
 23 |    movsxd   rcx, ecx    ; unsigned length
 24 |    xor      rax, rax
 25 | 
 26 |     cmp     rcx, 2
 27 |     jge     .ADD_GE2
 28 | 
 29 | ;********** lenSrcA == 1 *************************************
 30 |     add     rax, rax
 31 |     mov     r8, qword [rsi]             ; rsi = a
 32 |     adc     r8, qword [rdx]             ; r8  = a+b = s
 33 |     mov     qword [rdi], r8             ; save s
 34 |     sbb     rax, rax                    ;
 35 |     jmp     .FINAL
 36 | 
 37 | ;********** lenSrcA == 1  END ********************************
 38 | 
 39 | .ADD_GE2:
 40 |     jg      .ADD_GT2
 41 | 
 42 | ;********** lenSrcA == 2 *************************************
 43 |     add     rax, rax
 44 |     mov     r8, qword [rsi]             ; r8  = a0
 45 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
 46 |     mov     r9, qword [rsi+8]           ; r9  = a1
 47 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
 48 |     mov     qword [rdi], r8             ; save s0
 49 |     mov     qword [rdi+8], r9           ; save s1
 50 |     sbb     rax, rax                    ; rax = carry
 51 |     jmp     .FINAL
 52 | 
 53 | ;********** lenSrcA == 2 END *********************************
 54 | 
 55 | .ADD_GT2:
 56 |     cmp     rcx, 4
 57 |     jge     .ADD_GE4
 58 | 
 59 | ;********** lenSrcA == 3 *************************************
 60 |     add     rax, rax
 61 |     mov     r8, qword [rsi]             ; r8  = a0
 62 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
 63 |     mov     r9, qword [rsi+8]           ; r9  = a1
 64 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
 65 |     mov     r10, qword [rsi+16]         ; r10 = a2
 66 |     adc     r10, qword [rdx+16]         ; r10 = a2+b2 = s2
 67 |     mov     qword [rdi], r8             ; save s0
 68 |     mov     qword [rdi+8], r9           ; save s1
 69 |     mov     qword [rdi+16], r10         ; save s2
 70 |     sbb     rax, rax                    ; rax = carry
 71 |     jmp     .FINAL
 72 | 
 73 | ;********** lenSrcA == 3 END *********************************
 74 | 
 75 | .ADD_GE4:
 76 |     jg      .ADD_GT4
 77 | 
 78 | ;********** lenSrcA == 4 *************************************
 79 |     add     rax, rax
 80 |     mov     r8, qword [rsi]             ; r8  = a0
 81 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
 82 |     mov     r9, qword [rsi+8]           ; r9  = a1
 83 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
 84 |     mov     r10, qword [rsi+16]         ; r10 = a2
 85 |     adc     r10, qword [rdx+16]         ; r10 = a2+b2 = s2
 86 |     mov     r11, qword [rsi+24]         ; r11 = a3
 87 |     adc     r11, qword [rdx+24]         ; r11 = a3+b3 = s3
 88 |     mov     qword [rdi], r8             ; save s0
 89 |     mov     qword [rdi+8], r9           ; save s1
 90 |     mov     qword [rdi+16], r10         ; save s2
 91 |     mov     qword [rdi+24], r11         ; save s2
 92 |     sbb     rax, rax                    ; rax = carry
 93 |     jmp     .FINAL
 94 | 
 95 | ;********** lenSrcA == 4 END *********************************
 96 | 
 97 | .ADD_GT4:
 98 |     cmp     rcx, 6
 99 |     jge     .ADD_GE6
100 | 
101 | ;********** lenSrcA == 5 *************************************
102 |     add     rax, rax
103 |     mov     r8, qword [rsi]             ; r8  = a0
104 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
105 |     mov     r9, qword [rsi+8]           ; r9  = a1
106 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
107 |     mov     r10, qword [rsi+16]         ; r10 = a2
108 |     adc     r10, qword [rdx+16]         ; r10 = a2+b2 = s2
109 |     mov     r11, qword [rsi+24]         ; r11 = a3
110 |     adc     r11, qword [rdx+24]         ; r11 = a3+b3 = s3
111 |     mov     rcx, qword [rsi+32]         ; rcx = a4
112 |     adc     rcx, qword [rdx+32]         ; rcx = a4+b4 = s4
113 |     mov     qword [rdi], r8             ; save s0
114 |     mov     qword [rdi+8], r9           ; save s1
115 |     mov     qword [rdi+16], r10         ; save s2
116 |     mov     qword [rdi+24], r11         ; save s3
117 |     mov     qword [rdi+32], rcx         ; save s4
118 |     sbb     rax, rax                    ; rax = carry
119 |     jmp     .FINAL
120 | 
121 | ;********** lenSrcA == 5 END *********************************
122 | 
123 | .ADD_GE6:
124 |     jg      .ADD_GT6
125 | 
126 | ;********** lenSrcA == 6 *************************************
127 |     add     rax, rax
128 |     mov     r8, qword [rsi]             ; r8  = a0
129 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
130 |     mov     r9, qword [rsi+8]           ; r9  = a1
131 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
132 |     mov     r10, qword [rsi+16]         ; r10 = a2
133 |     adc     r10, qword [rdx+16]         ; r10 = a2+b2 = s2
134 |     mov     r11, qword [rsi+24]         ; r11 = a3
135 |     adc     r11, qword [rdx+24]         ; r11 = a3+b3 = s3
136 |     mov     rcx, qword [rsi+32]         ; rcx = a4
137 |     adc     rcx, qword [rdx+32]         ; rcx = a4+b4 = s4
138 |     mov     rsi, qword [rsi+40]         ; rsi = a5
139 |     adc     rsi, qword [rdx+40]         ; rsi = a5+b5 = s5
140 |     mov     qword [rdi], r8             ; save s0
141 |     mov     qword [rdi+8], r9           ; save s1
142 |     mov     qword [rdi+16], r10         ; save s2
143 |     mov     qword [rdi+24], r11         ; save s3
144 |     mov     qword [rdi+32], rcx         ; save s4
145 |     mov     qword [rdi+40], rsi         ; save s5
146 |     sbb     rax, rax                    ; rax = carry
147 |     jmp     .FINAL
148 | 
149 | ;********** lenSrcA == 6 END *********************************
150 | 
151 | .ADD_GT6:
152 |     cmp     rcx, 8
153 |     jge     .ADD_GE8
154 | 
155 | .ADD_EQ7:
156 | ;********** lenSrcA == 7 *************************************
157 |     add     rax, rax
158 |     mov     r8, qword [rsi]             ; r8  = a0
159 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
160 |     mov     r9, qword [rsi+8]           ; r9  = a1
161 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
162 |     mov     r10, qword [rsi+16]         ; r10 = a2
163 |     adc     r10, qword [rdx+16]         ; r10 = a2+b2 = s2
164 |     mov     r11, qword [rsi+24]         ; r11 = a3
165 |     adc     r11, qword [rdx+24]         ; r11 = a3+b3 = s3
166 |     mov     rcx, qword [rsi+32]         ; rcx = a4
167 |     adc     rcx, qword [rdx+32]         ; rcx = a4+b4 = s4
168 |     mov     qword [rdi], r8             ; save s0
169 |     mov     r8, qword [rsi+40]          ; r8  = a5
170 |     adc     r8, qword [rdx+40]          ; r8  = a5+b5 = s5
171 |     mov     rsi, qword [rsi+48]         ; rsi = a6
172 |     adc     rsi, qword [rdx+48]         ; rsi = a6+b6 = s6
173 |     mov     qword [rdi+8], r9           ; save s1
174 |     mov     qword [rdi+16], r10         ; save s2
175 |     mov     qword [rdi+24], r11         ; save s3
176 |     mov     qword [rdi+32], rcx         ; save s4
177 |     mov     qword [rdi+40], r8          ; save s5
178 |     mov     qword [rdi+48], rsi         ; save s6
179 |     sbb     rax, rax                    ; rax = carry
180 |     jmp     .FINAL
181 | 
182 | ;********** lenSrcA == 7 END *********************************
183 | 
184 | 
185 | .ADD_GE8:
186 |     jg       .ADD_GT8
187 | 
188 | ;********** lenSrcA == 8 *************************************
189 |     add     rax, rax
190 |     mov     r8, qword [rsi]             ; r8  = a0
191 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = s0
192 |     mov     r9, qword [rsi+8]           ; r9  = a1
193 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = s1
194 |     mov     r10, qword [rsi+16]         ; r10 = a2
195 |     adc     r10, qword [rdx+16]         ; r10 = a2+b2 = s2
196 |     mov     r11, qword [rsi+24]         ; r11 = a3
197 |     adc     r11, qword [rdx+24]         ; r11 = a3+b3 = s3
198 |     mov     rcx, qword [rsi+32]         ; rcx = a4
199 |     adc     rcx, qword [rdx+32]         ; rcx = a4+b4 = s4
200 |     mov     qword [rdi], r8             ; save s0
201 |     mov     r8, qword [rsi+40]          ; r8  = a5
202 |     adc     r8, qword [rdx+40]          ; r8  = a5+b5 = s5
203 |     mov     qword [rdi+8], r9           ; save s1
204 |     mov     r9, qword [rsi+48]          ; r9  = a7
205 |     adc     r9, qword [rdx+48]          ; r9  = a7+b7 = s7
206 |     mov     rsi, qword [rsi+56]         ; rsi = a6
207 |     adc     rsi, qword [rdx+56]         ; rsi = a6+b6 = s6
208 |     mov     qword [rdi+16], r10         ; save s2
209 |     mov     qword [rdi+24], r11         ; save s3
210 |     mov     qword [rdi+32], rcx         ; save s4
211 |     mov     qword [rdi+40], r8          ; save s5
212 |     mov     qword [rdi+48], r9          ; save s6
213 |     mov     qword [rdi+56], rsi         ; save s7
214 |     sbb     rax, rax                    ; rax = carry
215 |     jmp     .FINAL
216 | 
217 | ;********** lenSrcA == 8 END *********************************
218 | 
219 | 
220 | ;********** lenSrcA > 8  *************************************
221 | 
222 | .ADD_GT8:
223 |     mov     r8, rax
224 |     mov     rax, rcx                    ; rax = len
225 |     and     rcx, 3                      ;
226 |     xor     rcx, rax                    ;
227 |     lea     rsi, [rsi+8*rcx]            ;
228 |     lea     rdx, [rdx+8*rcx]            ;
229 |     lea     rdi, [rdi+8*rcx]            ;
230 |     neg     rcx
231 |     add     r8, r8
232 |     jmp     .ADD_GLOOP
233 | 
234 | align ARCH_ALIGN_FACTOR
235 | .ADD_GLOOP:
236 |     mov     r8, qword [rsi+8*rcx]       ; r8  = a0
237 |     mov     r9, qword [rsi+8*rcx+8]     ; r9  = a1
238 |     mov     r10, qword [rsi+8*rcx+16]   ; r10 = a2
239 |     mov     r11, qword [rsi+8*rcx+24]   ; r11 = a3
240 |     adc     r8, qword [rdx+8*rcx]       ; r8  = a0+b0 = r0
241 |     adc     r9, qword [rdx+8*rcx+8]     ; r9  = a1+b1 = r1
242 |     adc     r10, qword [rdx+8*rcx+16]   ; r10 = a2+b2 = r2
243 |     adc     r11, qword [rdx+8*rcx+24]   ; r11 = a3+b3 = r3
244 |     mov     qword [rdi+8*rcx], r8       ;
245 |     mov     qword [rdi+8*rcx+8], r9     ;
246 |     mov     qword [rdi+8*rcx+16], r10   ;
247 |     mov     qword [rdi+8*rcx+24], r11   ;
248 |     lea     rcx, [rcx+4]
249 |     jrcxz   .ADD_LLAST0
250 |     jmp     .ADD_GLOOP
251 | 
252 | .ADD_LLAST0:
253 |     sbb     rcx, rcx
254 |     and     rax, 3
255 |     jz      .FIN0
256 | 
257 | .ADD_LLOOP:
258 |     test    rax, 2
259 |     jz      .ADD_LLAST1
260 | 
261 |     add     rcx, rcx
262 |     mov     r8, qword [rsi]             ; r8  = a0
263 |     mov     r9, qword [rsi+8]           ; r9  = a1
264 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = r0
265 |     adc     r9, qword [rdx+8]           ; r9  = a1+b1 = r1
266 |     mov     qword [rdi], r8             ;
267 |     mov     qword [rdi+8], r9           ;
268 |     sbb     rcx, rcx
269 |     test    rax, 1
270 |     jz      .FIN0
271 | 
272 |     add     rsi, 16
273 |     add     rdx, 16
274 |     add     rdi, 16
275 | 
276 | .ADD_LLAST1:
277 |     add     rcx, rcx
278 |     mov     r8, qword [rsi]             ; r8  = a0
279 |     adc     r8, qword [rdx]             ; r8  = a0+b0 = r0
280 |     mov     qword [rdi], r8             ;
281 |     sbb     rcx, rcx
282 | 
283 | .FIN0:
284 |     mov     rax, rcx
285 | 
286 | ;******************* .FINAL ***********************************************************
287 | 
288 | .FINAL:
289 |     neg   rax
290 |     REST_XMM
291 |     REST_GPR
292 |     ret
293 | ENDFUNC mpn_add_vectorized
294 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_uinc_udec_m7as.asm:
--------------------------------------------------------------------------------
  1 | %include "asmdefs.inc"
  2 | %include "ia_32e.inc"
  3 | 
  4 | %if (__ARCH32E >= __ARCH32E_M7)
  5 | 
  6 | segment .text align=ARCH_ALIGN_FACTOR
  7 | 
  8 | ; Add one limb to a multi-limb number and propagate the carry.
  9 | ; carry, r[:size] = a[:size] + w
 10 | ; uint64_t mpn_inc_vectorized(uint64_t *r, const uint64_t *a, unsigned int size, uint64_t w)
 11 | ; Returns the carry-out (0 or 1). size must be >= 1 (a[0] is read unconditionally); r == a is allowed.
 12 | align ARCH_ALIGN_FACTOR
 13 | IPPASM mpn_inc_vectorized,PUBLIC
 14 | %assign LOCAL_FRAME 0
 15 |         USES_GPR rsi,rdi
 16 |         USES_XMM
 17 |         COMP_ABI 4
 18 | 
 19 | ; rdi = r
 20 | ; rsi = a
 21 | ; rdx = size
 22 | ; rcx = w
 23 | 
 24 |    mov      edx, edx    ; zero-extend: size is unsigned int, so sizes >= 2^31 must not turn negative
 25 | 
 26 |    mov      r8, qword [rsi]     ; r[0] = a[0] + w
 27 |    add      r8, rcx             ; CF = carry out of limb 0
 28 |    mov      qword [rdi], r8
 29 | ; Bias both pointers to one past the last limb so rcx can run from a negative byte offset up to zero (mov/lea leave CF intact).
 30 |    lea      rsi, [rsi+rdx*sizeof(qword)]   ; rsi = &a[size]
 31 |    lea      rdi, [rdi+rdx*sizeof(qword)]   ; rdi = &r[size]
 32 |    lea      rcx, [rdx*sizeof(qword)]       ; rcx = size in bytes
 33 | 
 34 |    sbb      rax, rax                ; save cf as 0/-1 before neg/add overwrite the flags
 35 |    neg      rcx                     ; rcx = negative length (bytes)
 36 |    add      rcx, sizeof(qword)      ; skip limb 0, which is already done
 37 |    jrcxz    .exit                   ; size == 1: rax already holds the carry
 38 |    add      rax, rax                ; restore cf (rax is 0 or -1)
 39 |    jnc      .copy                   ; no carry: the remaining limbs are copied unchanged
 40 | ; Carry propagation: add the carry into successive limbs until it dies out or the number ends.
 41 | align ARCH_ALIGN_FACTOR
 42 | .inc_loop:
 43 |    mov      r8, qword [rsi+rcx]
 44 |    adc      r8, 0                   ; limb + carry-in
 45 |    mov      qword [rdi+rcx], r8
 46 |    lea      rcx, [rcx+sizeof(qword)] ; advance without touching CF
 47 |    jrcxz    .exit_loop              ; last limb processed
 48 |    jnc      .exit_loop              ; carry absorbed: the rest is a plain copy
 49 |    jmp      .inc_loop
 50 | .exit_loop:
 51 |    sbb      rax, rax                ; save cf
 52 | ; Copy the untouched high limbs, unless the operation is in place or nothing is left.
 53 | .copy:
 54 |    cmp      rsi, rdi                ; both pointers were advanced equally, so equal means r == a
 55 |    jz       .exit
 56 |    jrcxz    .exit                   ; no limbs left to copy
 57 | .copy_loop:
 58 |    mov      r8, qword [rsi+rcx]
 59 |    mov      qword [rdi+rcx], r8
 60 |    add      rcx, sizeof(qword)
 61 |    jnz      .copy_loop              ; until the byte offset reaches zero
 62 | 
 63 | .exit:
 64 |    neg      rax                     ; 0/-1 -> 0/1 carry for the caller
 65 |    REST_XMM
 66 |    REST_GPR
 67 |    ret
 68 | ENDFUNC mpn_inc_vectorized
 69 | 
 70 | 
 71 | ; Subtract one limb from a multi-limb number and propagate the borrow.
 72 | ; borrow, r[:size] = a[:size] - w
 73 | ; uint64_t mpn_dec_vectorized(uint64_t *r, const uint64_t *a, unsigned int size, uint64_t w)
 74 | ; Returns the borrow-out (0 or 1). size must be >= 1 (a[0] is read unconditionally); r == a is allowed.
 75 | 
 76 | align ARCH_ALIGN_FACTOR
 77 | IPPASM mpn_dec_vectorized,PUBLIC
 78 | %assign LOCAL_FRAME 0
 79 |         USES_GPR rsi,rdi
 80 |         USES_XMM
 81 |         COMP_ABI 4
 82 | 
 83 | ; rdi = r
 84 | ; rsi = a
 85 | ; rdx = size
 86 | ; rcx = w
 87 | 
 88 |    mov      edx, edx    ; zero-extend: size is unsigned int, so sizes >= 2^31 must not turn negative
 89 | 
 90 |    mov      r8, qword [rsi]     ; r[0] = a[0] - w
 91 |    sub      r8, rcx             ; CF = borrow out of limb 0
 92 |    mov      qword [rdi], r8
 93 | ; Bias both pointers to one past the last limb so rcx can run from a negative byte offset up to zero (mov/lea leave CF intact).
 94 |    lea      rsi, [rsi+rdx*sizeof(qword)]   ; rsi = &a[size]
 95 |    lea      rdi, [rdi+rdx*sizeof(qword)]   ; rdi = &r[size]
 96 |    lea      rcx, [rdx*sizeof(qword)]       ; rcx = size in bytes
 97 | 
 98 |    sbb      rax, rax                ; save cf as 0/-1 before neg/add overwrite the flags
 99 |    neg      rcx                     ; rcx = negative length (bytes)
100 |    add      rcx, sizeof(qword)      ; skip limb 0, which is already done
101 |    jrcxz    .exit                   ; size == 1: rax already holds the borrow
102 |    add      rax, rax                ; restore cf (rax is 0 or -1)
103 |    jnc      .copy                   ; no borrow: the remaining limbs are copied unchanged
104 | ; Borrow propagation: subtract the borrow from successive limbs until it dies out or the number ends.
105 | align ARCH_ALIGN_FACTOR
106 | .dec_loop:
107 |    mov      r8, qword [rsi+rcx]
108 |    sbb      r8, 0                   ; limb - borrow-in
109 |    mov      qword [rdi+rcx], r8
110 |    lea      rcx, [rcx+sizeof(qword)] ; advance without touching CF
111 |    jrcxz    .exit_loop              ; last limb processed
112 |    jnc      .exit_loop              ; borrow absorbed: the rest is a plain copy
113 |    jmp      .dec_loop
114 | .exit_loop:
115 |    sbb      rax, rax                ; save cf
116 | ; Copy the untouched high limbs, unless the operation is in place or nothing is left.
117 | .copy:
118 |    cmp      rsi, rdi                ; both pointers were advanced equally, so equal means r == a
119 |    jz       .exit
120 |    jrcxz    .exit                   ; no limbs left to copy
121 | .copy_loop:
122 |    mov      r8, qword [rsi+rcx]
123 |    mov      qword [rdi+rcx], r8
124 |    add      rcx, sizeof(qword)
125 |    jnz      .copy_loop              ; until the byte offset reaches zero
126 | 
127 | .exit:
128 |    neg      rax                     ; 0/-1 -> 0/1 borrow for the caller
129 |    REST_XMM
130 |    REST_GPR
131 |    ret
132 | ENDFUNC mpn_dec_vectorized
133 | 
134 | %endif
135 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_acc_m7as.asm:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2015-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  Cryptography Primitive.
 20 | ;               Big Number Operations
 21 | ;
 22 | ;     Content:
 23 | ;        cpMulDgt_BNU()
 24 | ;        mpn_mul_acc()
 25 | ;        cpSubMulDgt_BNU()
 26 | ;        mpn_mul_acc()
 27 | ;
 28 | ;
 29 | 
 30 | %include "asmdefs.inc"
 31 | %include "ia_32e.inc"
 32 | %include "ia_32e_regs.inc"
 33 | %include "bn_umulschool.inc"
 34 | 
 35 | %if (__ARCH32E >= __ARCH32E_M7)
 36 | 
 37 | segment .text align=ARCH_ALIGN_FACTOR
 38 | ; mpn_mul_acc: pDst[0..len-1] += pSrcA[0..len-1] * B; returns the carry limb left over above pDst[len-1].
 39 | ; len == 1 has its own path; otherwise a 4-limb unrolled loop runs and one of four tails finishes the last 2..5 limbs.
 40 | ;*************************************************************
 41 | ; uint64_t mpn_mul_acc(uint64_t* pDst,
 42 | ;                  const uint64_t* pSrcA,
 43 | ;                        int    len,
 44 | ;                        uint64_t B )
 45 | ;*************************************************************
 46 | align ARCH_ALIGN_FACTOR
 47 | IPPASM mpn_mul_acc,PUBLIC
 48 | %assign LOCAL_FRAME 0
 49 |         USES_GPR rbx,rsi,rdi,r11,r12
 50 |         USES_XMM
 51 |         COMP_ABI 4
 52 | ; NOTE(review): r12 is listed in USES_GPR above but is not referenced anywhere in this body.
 53 | ; rdi = pDst
 54 | ; rsi = pSrc
 55 | ; rdx = len
 56 | ; rcx = B
 57 | 
 58 | %xdefine B0    rcx   ; b (the code below names rcx directly)
 59 | 
 60 | %xdefine T0    r8    ; temporary
 61 | %xdefine T1    r9
 62 | %xdefine T2    r10
 63 | %xdefine T3    r11
 64 | 
 65 | %xdefine idx   rbx   ; index
 66 | %xdefine rDst  rdi
 67 | %xdefine rSrc  rsi
 68 | 
 69 |    mov   edx, edx       ; unsigned length (zero-extend to 64 bits)
 70 | 
 71 |    mov   rax, qword [rsi]     ; rax = a[0]
 72 |    cmp   rdx, 1
 73 |    jnz   .general_case        ; NOTE(review): len == 0 also lands here and is not handled -- assumes len >= 1
 74 | ; len == 1: single multiply-accumulate.
 75 |    mul   rcx                  ; rdx:rax = a[0]*B
 76 |    add   qword [rdi], rax     ; pDst[0] += low half
 77 |    adc   rdx, 0               ; fold the carry into the high half
 78 |    mov   rax, rdx             ; return the carry limb
 79 |    REST_XMM
 80 |    REST_GPR
 81 |    ret
 82 | 
 83 | .general_case:
 84 |    lea   rSrc, [rSrc+rdx*sizeof(qword)-sizeof(qword)*5]   ; rSrc = &a[len-5]
 85 |    lea   rDst, [rDst+rdx*sizeof(qword)-sizeof(qword)*5]   ; rDst = &pDst[len-5]
 86 |    mov   idx, dword 5
 87 |    sub   idx, rdx       ; negative counter -(len-5); rSrc+idx*8 addresses a[0]
 88 | 
 89 |    mul   rcx            ; {T1:T0} = a[0]*B
 90 |    mov   T0, rax
 91 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)]   ; rax = a[1], operand of the next mul
 92 |    mov   T1, rdx
 93 | 
 94 |    cmp   idx, 0
 95 |    jge   .skip_muladd_loop4   ; len <= 5: no full 4-limb iteration
 96 | ; Main loop: each step adds the pending low limb to memory and chains the next product into the following temporaries.
 97 | align ARCH_ALIGN_FACTOR
 98 | .muladd_loop4:
 99 |    mul   rcx                     ; a[4*i+1]*B
100 |    xor   T2, T2                  ; T2 = 0 (runs before the add, so the carry chain is unaffected)
101 |    add   qword [rDst+idx*sizeof(qword)], T0
102 |    adc   T1, rax
103 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
104 |    adc   T2, rdx
105 | 
106 |    mul   rcx                     ; a[4*i+2]*B
107 |    xor   T3, T3
108 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
109 |    adc    T2, rax
110 |    mov    rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
111 |    adc    T3, rdx
112 | 
113 |    mul   rcx                     ; a[4*i+3]*B
114 |    xor   T0, T0
115 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
116 |    adc   T3, rax
117 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*4]
118 |    adc   T0, rdx
119 | 
120 |    mul   rcx                     ; a[4*i+4]*B
121 |    xor   T1, T1
122 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
123 |    adc   T0, rax
124 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*5]   ; operand for the next iteration or the tail
125 |    adc   T1, rdx
126 | 
127 |    add   idx, 4                  ; CF is set once idx wraps from negative to 0..3
128 |    jnc   .muladd_loop4           ; keep looping while idx is still negative
129 | ; Tail: idx is now 0..3 and {T1:T0} is the pending partial sum for the limb at rDst+idx*8.
130 | .skip_muladd_loop4:
131 |    mul   rcx
132 |    xor   T2, T2
133 |    add   qword [rDst+idx*sizeof(qword)], T0
134 |    adc   T1, rax
135 |    adc   T2, rdx
136 | ; Four-way dispatch on idx with one cmp: ja -> 3, jz -> 2, jp -> 1 (low byte of 1-2 is 0xFF, even parity), fall through -> 0 (0xFE, odd parity).
137 |    cmp   idx, 2
138 |    ja    .fin_mul1x4n_2   ; idx=3
139 |    jz    .fin_mul1x4n_3   ; idx=2
140 |    jp    .fin_mul1x4n_4   ; idx=1
141 |    ;     .fin_mul1x4n_1   ; idx=0
142 | 
143 | .fin_mul1x4n_1:           ; idx=0: four more limbs after the one handled above
144 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
145 |    mul   rcx
146 |    xor   T3, T3
147 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
148 |    adc   T2, rax
149 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
150 |    adc   T3, rdx
151 | 
152 |    mul   rcx
153 |    xor   T0, T0
154 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
155 |    adc   T3, rax
156 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*4]
157 |    adc   T0, rdx
158 | 
159 |    mul   rcx
160 |    xor   T1, T1           ; T1 is not read again after this point
161 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
162 |    adc   T0, rax
163 |    adc   rdx, 0           ; high half of the last product + carry
164 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*4], T0
165 |    adc   rdx, 0
166 |    mov   rax, rdx         ; return the carry limb
167 |    jmp   .exit
168 | 
169 | .fin_mul1x4n_4:           ; idx=1: three more limbs
170 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
171 |    mul   rcx
172 |    xor   T3, T3
173 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
174 |    adc   T2, rax
175 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
176 |    adc   T3, rdx
177 | 
178 |    mul   rcx
179 |    xor   T0, T0           ; T0 is not read again after this point
180 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
181 |    adc   T3, rax
182 |    adc   rdx, 0           ; high half of the last product + carry
183 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
184 |    adc   rdx, 0
185 |    mov   rax, rdx         ; return the carry limb
186 |    jmp   .exit
187 | 
188 | .fin_mul1x4n_3:           ; idx=2: two more limbs
189 |    mov   rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
190 |    mul   rcx
191 |    xor   T3, T3           ; T3 is not read again after this point
192 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
193 |    adc   T2, rax
194 |    adc   rdx, 0           ; high half of the last product + carry
195 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
196 |    adc   rdx, 0
197 |    mov   rax, rdx         ; return the carry limb
198 |    jmp   .exit
199 | 
200 | .fin_mul1x4n_2:           ; idx=3: only the pending limb in T1 is left
201 |    add   qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
202 |    adc   T2, 0
203 |    mov   rax, T2          ; return the carry limb
204 | 
205 | .exit:
206 |     REST_XMM
207 |     REST_GPR
208 |     ret
209 | ENDFUNC mpn_mul_acc
210 | 
211 | %endif
212 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_m7as.asm:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2015-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  Cryptography Primitive.
 20 | ;               Big Number Operations
 21 | ;
 22 | ;     Content:
 23 | ;        mpn_mul()
 24 | ;
 25 | ;
 26 | 
 27 | %include "asmdefs.inc"
 28 | %include "ia_32e.inc"
 29 | %include "bn_umulschool.inc"
 30 | %include "variant.inc"
 31 | 
 32 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_OFF_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_)
 33 | %if (__ARCH32E >= __ARCH32E_M7) && (__ARCH32E < __ARCH32E_L9)
 34 | 
 35 | 
 36 | segment .text align=ARCH_ALIGN_FACTOR
 37 | ; mpn_mul: multi-limb multiplication, pR = pA * pB. Equal sizes of 1..4 limbs (1..8 when __ARCH32E >= __ARCH32E_E9)
 38 | ; use the unrolled MUL_NxN kernels; every other size combination takes the general path (macros presumably from bn_umulschool.inc).
 39 | ;*************************************************************
 40 | ;* uint64_t  mpn_mul(uint64_t* pR,
 41 | ;*                       const uint64_t* pA, int  aSize,
 42 | ;*                       const uint64_t* pB, int  bSize)
 43 | ;* returns pR[aSize+bSize-1], the top limb of the product (the fixed-size paths reload it from pR;
 44 | ;* the general path returns rdx as left by the *_ELOG macros -- presumably the same limb, TODO confirm)
 45 | ;*************************************************************
 46 | align ARCH_ALIGN_FACTOR
 47 | IPPASM mpn_mul,PUBLIC
 48 | %assign LOCAL_FRAME (1*sizeof(qword))
 49 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
 50 |         USES_XMM
 51 |         COMP_ABI 5
 52 | 
 53 | ; rdi = pDst
 54 | ; rsi = pSrcA
 55 | ; edx = lenA
 56 | ; rcx = pSrcB
 57 | ; r8d = lenB
 58 | 
 59 | ;;
 60 | ;; stack structure (LOCAL_FRAME is a single qword):
 61 | ;;counterB = (0)                          -- stale: no counterB slot is defined below
 62 | ;;counterA = (counterB+sizeof(qword))     -- stale: counterA actually sits at offset 0
 63 | %assign counterA  (0)
 64 | 
 65 | ; Size dispatch. NOTE(review): sizes are assumed to be >= 1; zero or negative values are not checked.
 66 |    cmp      edx, r8d
 67 |    jl       .general_case_mul_entry   ; lenA < lenB: swap the operands first
 68 |    jg       .general_case_mul         ; lenA > lenB: general path as is
 69 | %if (__ARCH32E < __ARCH32E_E9)
 70 |    cmp      edx, 4                    ; equal sizes: the largest unrolled kernel is 4x4
 71 | %else
 72 |    cmp      edx, 8                    ; equal sizes: the largest unrolled kernel is 8x8
 73 | %endif
 74 |    jg       .general_case_mul
 75 | 
 76 | %if (__ARCH32E >= __ARCH32E_E9)
 77 |    cmp     edx, 4
 78 |    jg      .more_then_4               ; sizes 5..8
 79 | %endif
 80 | ; Four-way dispatch with one cmp: ja -> 4, jz -> 3, jp -> 2 (low byte of 2-3 is 0xFF, even parity), fall through -> 1 (0xFE, odd parity).
 81 |    cmp      edx, 3
 82 |    ja       .mul_4x4
 83 |    jz       .mul_3x3
 84 |    jp       .mul_2x2
 85 |   ;         mul_1x1
 86 | 
 87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 88 | ;;
 89 | ;; fixed-size multipliers (1-4)
 90 | ;;
 91 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 92 | align ARCH_ALIGN_FACTOR
 93 | .mul_1x1:
 94 |    mov      rax, qword [rsi]
 95 |    mul      qword [rcx]                        ; rdx:rax = a[0]*b[0]
 96 |    mov      qword [rdi], rax
 97 |    mov      qword [rdi+sizeof(qword)], rdx
 98 |    mov      rax, qword [rdi+sizeof(qword)*1]   ; return pR[1] (the value just stored from rdx)
 99 |    REST_XMM
100 |    REST_GPR
101 |    ret
102 | 
103 | align ARCH_ALIGN_FACTOR
104 | .mul_2x2:
105 |    mov      r8, [rcx]                          ; preload b[0..1] into r8,r9
106 |    mov      r9, [rcx+sizeof(qword)*1]
107 |    MUL_NxN  2, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
108 |    mov      rax, qword [rdi+sizeof(qword)*3]   ; return the top limb pR[3]
109 |    REST_XMM
110 |    REST_GPR
111 |    ret
112 | 
113 | align ARCH_ALIGN_FACTOR
114 | .mul_3x3:
115 |    mov      r8, [rcx]                          ; preload b[0..2] into r8..r10
116 |    mov      r9, [rcx+sizeof(qword)*1]
117 |    mov      r10,[rcx+sizeof(qword)*2]
118 |    MUL_NxN  3, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
119 |    mov      rax, qword [rdi+sizeof(qword)*5]   ; return the top limb pR[5]
120 |    REST_XMM
121 |    REST_GPR
122 |    ret
123 | 
124 | align ARCH_ALIGN_FACTOR
125 | .mul_4x4:
126 |    mov      r8, [rcx]                          ; preload b[0..3] into r8..r11
127 |    mov      r9, [rcx+sizeof(qword)*1]
128 |    mov      r10,[rcx+sizeof(qword)*2]
129 |    mov      r11,[rcx+sizeof(qword)*3]
130 |    MUL_NxN  4, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
131 |    mov      rax, qword [rdi+sizeof(qword)*7]   ; return the top limb pR[7]
132 |    REST_XMM
133 |    REST_GPR
134 |    ret
135 | 
136 | %if (__ARCH32E >= __ARCH32E_E9)
137 | .more_then_4:
138 |    cmp      edx, 7                             ; same cmp/ja/jz/jp dispatch as above, for sizes 8/7/6/5
139 |    ja       .mul_8x8
140 |    jz       .mul_7x7
141 |    jp       .mul_6x6
142 |   ;         mul_5x5
143 | 
144 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
145 | ;;
146 | ;; fixed-size multipliers (5-8)
147 | ;;
148 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
149 | align ARCH_ALIGN_FACTOR
150 | .mul_5x5:
151 |    mov      r8, [rcx]                          ; preload b[0..4] into r8..r12
152 |    mov      r9, [rcx+sizeof(qword)*1]
153 |    mov      r10,[rcx+sizeof(qword)*2]
154 |    mov      r11,[rcx+sizeof(qword)*3]
155 |    mov      r12,[rcx+sizeof(qword)*4]
156 |    MUL_NxN  5, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
157 |    mov      rax, qword [rdi+sizeof(qword)*9]   ; return the top limb pR[9]
158 |    REST_XMM
159 |    REST_GPR
160 |    ret
161 | 
162 | align ARCH_ALIGN_FACTOR
163 | .mul_6x6:
164 |    mov      r8, [rcx]                          ; preload b[0..5] into r8..r13
165 |    mov      r9, [rcx+sizeof(qword)*1]
166 |    mov      r10,[rcx+sizeof(qword)*2]
167 |    mov      r11,[rcx+sizeof(qword)*3]
168 |    mov      r12,[rcx+sizeof(qword)*4]
169 |    mov      r13,[rcx+sizeof(qword)*5]
170 |    MUL_NxN  6, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
171 |    mov      rax, qword [rdi+sizeof(qword)*11]  ; return the top limb pR[11]
172 |    REST_XMM
173 |    REST_GPR
174 |    ret
175 | 
176 | align ARCH_ALIGN_FACTOR
177 | .mul_7x7:
178 |    mov      r8, [rcx]                          ; preload b[0..6] into r8..r14
179 |    mov      r9, [rcx+sizeof(qword)*1]
180 |    mov      r10,[rcx+sizeof(qword)*2]
181 |    mov      r11,[rcx+sizeof(qword)*3]
182 |    mov      r12,[rcx+sizeof(qword)*4]
183 |    mov      r13,[rcx+sizeof(qword)*5]
184 |    mov      r14,[rcx+sizeof(qword)*6]
185 |    MUL_NxN  7, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
186 |    mov      rax, qword [rdi+sizeof(qword)*13]  ; return the top limb pR[13]
187 |    REST_XMM
188 |    REST_GPR
189 |    ret
190 | 
191 | align ARCH_ALIGN_FACTOR
192 | .mul_8x8:
193 |    mov      r8, [rcx]                          ; preload b[0..7] into r8..r15
194 |    mov      r9, [rcx+sizeof(qword)*1]
195 |    mov      r10,[rcx+sizeof(qword)*2]
196 |    mov      r11,[rcx+sizeof(qword)*3]
197 |    mov      r12,[rcx+sizeof(qword)*4]
198 |    mov      r13,[rcx+sizeof(qword)*5]
199 |    mov      r14,[rcx+sizeof(qword)*6]
200 |    mov      r15,[rcx+sizeof(qword)*7]
201 |    MUL_NxN  8, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
202 |    mov      rax, qword [rdi+sizeof(qword)*15]  ; return the top limb pR[15]
203 |    REST_XMM
204 |    REST_GPR
205 |    ret
206 | %endif
207 | 
208 | 
209 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
210 | ;;
211 | ;; general case multiplier
212 | ;;
213 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
214 | align ARCH_ALIGN_FACTOR
215 | .general_case_mul_entry:
216 |    ; lenA < lenB: exchange (pA,lenA) with (pB,lenB) via XOR swaps so that A is the longer operand
217 |    xor      rsi, rcx
218 |    xor      edx, r8d
219 |    xor      rcx, rsi
220 |    xor      r8d, edx
221 |    xor      rsi, rcx
222 |    xor      edx, r8d
223 | 
224 | %xdefine B0    r10   ; b[i], b[i+1]
225 | %xdefine B1    r11
226 | 
227 | %xdefine T0    r12   ; temporary
228 | %xdefine T1    r13
229 | %xdefine T2    r14
230 | %xdefine T3    r15
231 | 
232 | %xdefine idx   rbx   ; index
233 | %xdefine rDst  rdi
234 | %xdefine rSrc  rsi
235 | 
236 | align ARCH_ALIGN_FACTOR
237 | .general_case_mul:
238 |    movsxd   rdx, edx    ; expand length
239 |    movsxd   r8,  r8d
240 | 
241 |    lea      rdi, [rdi+rdx*sizeof(qword)-sizeof(qword)*4] ; rdi = &R[lenA-4]
242 |    lea      rsi, [rsi+rdx*sizeof(qword)-sizeof(qword)*4] ; rsi = &A[lenA-4]
243 | 
244 |    mov      idx, dword 4                        ; negative
245 |    sub      idx, rdx                      ; A-counter: idx = 4 - lenA
246 |    mov      qword [rsp+counterA], idx     ; keep the initial A-counter in the local frame (presumably reloaded by the MLA macros -- TODO confirm)
247 | 
248 |    mov      rax, qword [rsi+idx*sizeof(qword)] ; a[0]
249 |    mov      B0, qword [rcx]                    ; b[0]
250 |    test     r8, 1
251 |    jz       .init_even_B                       ; even lenB: start with a two-limb (b[0],b[1]) pass
252 | 
253 | ;********** lenSrcB = 2*n+ 1 (multiply only) *********************
254 | .init_odd_B:
255 |    xor      T0, T0
256 |    cmp      idx, 0
257 |    jge      .skip_mul1                         ; lenA <= 4: skip the MULx1 main loop
258 | 
259 |    MULx1    rdi, rsi, idx, B0, T0, T1, T2, T3
260 | ; Select the tail by idx (0..3 when reached via the jge above; presumably also after MULx1 -- TODO confirm) using the cmp/ja/jz/jp parity dispatch.
261 | .skip_mul1:
262 |    cmp      idx, 2
263 |    ja       .fin_mul1x4n_1   ; idx=3
264 |    jz       .fin_mul1x4n_2   ; idx=2
265 |    jp       .fin_mul1x4n_3   ; idx=1
266 |    ;        fin_mul1x4n_4   ; idx=0
267 | ; After each one-limb tail: step pB past b[0] and bump r8 to even so the "sub r8, 2" in the MLA loop accounts for the limb just consumed.
268 | .fin_mul1x4n_4:
269 |    MULx1_4N_4_ELOG rdi, rsi, B0, T0,T1,T2,T3
270 |    add      rcx, sizeof(qword)
271 |    add      r8, 1
272 |    jmp      .mla2x4n_4
273 | .fin_mul1x4n_3:
274 |    MULx1_4N_3_ELOG rdi, rsi, B0, T0,T1,T2,T3
275 |    add      rcx, sizeof(qword)
276 |    add      r8, 1
277 |    jmp      .mla2x4n_3
278 | .fin_mul1x4n_2:
279 |    MULx1_4N_2_ELOG rdi, rsi, B0, T0,T1,T2,T3
280 |    add      rcx, sizeof(qword)
281 |    add      r8, 1
282 |    jmp      .mla2x4n_2
283 | .fin_mul1x4n_1:
284 |    MULx1_4N_1_ELOG rdi, rsi, B0, T0,T1,T2,T3
285 |    add      rcx, sizeof(qword)
286 |    add      r8, 1
287 |    jmp      .mla2x4n_1
288 | 
289 | 
290 | ;********** lenSrcB = 2*n (multiply only) ************************
291 | .init_even_B:
292 |    mov      rbp, rax                            ; keep a[0]; mul overwrites rax
293 |    mul      B0                                  ; {T2:T1:T0} = a[0]*B0
294 |    mov      B1, qword [rcx+sizeof(qword)]       ; b[1]
295 |    xor      T2, T2
296 |    mov      T0, rax
297 |    mov      rax, rbp                            ; restore a[0]
298 |    mov      T1, rdx
299 | 
300 |    cmp      idx, 0
301 |    jge      .skip_mul_nx2                       ; lenA <= 4: skip the MULx2 main loop
302 | 
303 |    MULx2    rdi, rsi, idx, B0,B1, T0,T1,T2,T3
304 | 
305 | .skip_mul_nx2:
306 |    cmp      idx, 2
307 |    ja       .fin_mul2x4n_1   ; idx=3
308 |    jz       .fin_mul2x4n_2   ; idx=2
309 |    jp       .fin_mul2x4n_3   ; idx=1
310 |    ;        fin_mul2x4n_4   ; idx=0
311 | ; Each tail below is followed by its own MLA loop: take two more limbs of B per pass until r8 reaches zero.
312 | .fin_mul2x4n_4:
313 |    MULx2_4N_4_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
314 |    add      rcx, sizeof(qword)*2                ; pB += 2 limbs
315 | align ARCH_ALIGN_FACTOR
316 | .mla2x4n_4:
317 |    sub      r8, 2                               ; two limbs of B consumed
318 |    jz       .quit
319 |    MLAx2_PLOG  B0,B1, rcx, T0,T1,T2,T3
320 |    cmp      idx, 0
321 |    jz       .skip_mla_x2                        ; idx == 0: bypass the MLAx2 main loop
322 |    MLAx2    rdi, rsi, idx, B0,B1, T0,T1,T2,T3
323 | .skip_mla_x2:
324 |    MLAx2_4N_4_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
325 |    add      rcx, sizeof(qword)*2
326 |    jmp      .mla2x4n_4
327 | 
328 | .fin_mul2x4n_3:
329 |    MULx2_4N_3_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
330 |    add      rcx, sizeof(qword)*2                ; pB += 2 limbs
331 | align ARCH_ALIGN_FACTOR
332 | .mla2x4n_3:
333 |    sub      r8, 2                               ; two limbs of B consumed
334 |    jz       .quit
335 |    MLAx2_PLOG  B0,B1, rcx, T0,T1,T2,T3
336 |    MLAx2    rdi, rsi, idx, B0,B1, T0,T1,T2,T3
337 |    MLAx2_4N_3_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
338 |    add      rcx, sizeof(qword)*2
339 |    jmp      .mla2x4n_3
340 | 
341 | .fin_mul2x4n_2:
342 |    MULx2_4N_2_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
343 |    add      rcx, sizeof(qword)*2                ; pB += 2 limbs
344 | align ARCH_ALIGN_FACTOR
345 | .mla2x4n_2:
346 |    sub      r8, 2                               ; two limbs of B consumed
347 |    jz       .quit
348 |    MLAx2_PLOG  B0,B1, rcx, T0,T1,T2,T3
349 |    MLAx2    rdi, rsi, idx, B0,B1, T0,T1,T2,T3
350 |    MLAx2_4N_2_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
351 |    add      rcx, sizeof(qword)*2
352 |    jmp      .mla2x4n_2
353 | 
354 | .fin_mul2x4n_1:
355 |    MULx2_4N_1_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
356 |    add      rcx, sizeof(qword)*2                ; pB += 2 limbs
357 | align ARCH_ALIGN_FACTOR
358 | .mla2x4n_1:
359 |    sub      r8, 2                               ; two limbs of B consumed
360 |    jz       .quit
361 |    MLAx2_PLOG  B0,B1, rcx, T0,T1,T2,T3
362 |    MLAx2    rdi, rsi, idx, B0,B1, T0,T1,T2,T3
363 |    MLAx2_4N_1_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
364 |    add      rcx, sizeof(qword)*2
365 |    jmp      .mla2x4n_1
366 | 
367 | .quit:
368 |    mov   rax, rdx        ; return value: whatever the last *_ELOG macro left in rdx (presumably the top limb -- TODO confirm)
369 | 
370 |    REST_XMM
371 |    REST_GPR
372 |    ret
373 | ENDFUNC mpn_mul
374 | 
375 | %endif
376 | 
377 | %endif ;; _ADCOX_NI_ENABLING_
378 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_usqr_redc_srvl9.asm:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2015-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  Cryptography Primitive.
 20 | ;               Big Number Multiplicative Operations
 21 | ;
 22 | ;      Content:
 23 | ;         mpn_mul()
 24 | ;         mpn_sqr()
 25 | ;         mpn_montgomery_reduce_bin()
 26 | ;
 27 | ;  Implementation is using mulx and adcx/adox instructions
 28 | ;
 29 | ;
 30 | 
 31 | %include "asmdefs.inc"
 32 | %include "ia_32e.inc"
 33 | %include "variant.inc"
 34 | 
 35 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_ON_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_)
 36 | %if (__ARCH32E >= __ARCH32E_L9)
 37 | 
 38 | %assign _xEMULATION_  1
 39 | 
 40 | segment .text align=ARCH_ALIGN_FACTOR
 41 | 
 42 | 
 43 | %include "bn_umul.inc"
 44 | %include "bn_usqr.inc"
 45 | %include "mred.inc"
 46 | 
 47 | ;*************************************************************
 48 | ;* uint64_t  mpn_mul(uint64_t* pR,
 49 | ;*                       const uint64_t* pA, int  aSize,
 50 | ;*                       const uint64_t* pB, int  bSize)
 51 | ;* Dispatcher: selects a multiplication kernel from the two operand lengths.
 52 | ;*************************************************************
 53 | align ARCH_ALIGN_FACTOR
 54 | IPPASM mpn_mul,PUBLIC
 55 | %assign LOCAL_FRAME 0
 56 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
 57 |         USES_XMM
 58 |         COMP_ABI 5
 59 | 
 60 | ; rdi = pR
 61 | ; rsi = pA
 62 | ; edx = nsA
 63 | ; rcx = pB
 64 | ; r8d = nsB
 65 | 
 66 |    movsxd   rdx, edx    ; expand length
 67 |    movsxd   rbx, r8d    ; rbx = nsB (sign-extended); r8 is zeroed as scratch right below
 68 | 
 69 |    xor      r8, r8      ; clear scratch
 70 |    xor      r9, r9
 71 |    xor      r10, r10
 72 |    xor      r11, r11
 73 |    xor      r12, r12
 74 |    xor      r13, r13
 75 |    xor      r14, r14
 76 |    xor      r15, r15
 77 | 
 78 |    cmp      rdx, rbx
 79 |    jl       .swap_operans      ; nsA < nsB
 80 |    jg       .test_8N_case      ; nsA > nsB: test if nsA=8*N and nsB=8*M
 81 | 
 82 |    cmp      rdx, 16            ; reached only when nsA == nsB
 83 |    jg       .test_8N_case      ; equal lengths above 16 also go through the 8N test
 84 | 
 85 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 86 | ;; short nsA==nsB (1,..,16)
 87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 88 |    cmp      rdx, 4
 89 |    jg       .more_then_4       ; 5,..,16: kernel fetched through GET_EP
 90 | 
 91 |    cmp      edx, 3             ; single compare feeds the three branches below
 92 |    ja       .mul_4_4           ; length above 3 (unsigned compare)
 93 |    jz       .mul_3_3           ; length == 3
 94 |    jp       .mul_2_2           ; PF set for 2 (low byte of 2-3 is 0FFh, even parity), clear for 1 (0FEh)
 95 |   ;         mul_1_1
 96 | 
 97 | .mul_1_1:
 98 |    MUL_NxN  1, rdi, rsi, rcx, rbx,rbp, r8
 99 |    jmp      .quit
100 | .mul_2_2:
101 |    MUL_NxN  2, rdi, rsi, rcx, rbx,rbp, r8,r9
102 |    jmp      .quit
103 | .mul_3_3:
104 |    MUL_NxN  3, rdi, rsi, rcx, rbx,rbp, r8,r9,r10
105 |    jmp      .quit
106 | .mul_4_4:
107 |    MUL_NxN  4, rdi, rsi, rcx, rbx,rbp, r8,r9,r10,r11
108 |    jmp      .quit
109 | 
110 | .more_then_4:
111 |    GET_EP   rax, mul_lxl_basic, rdx, rbp   ; presumably rax = entry of table mul_lxl_basic for length rdx - confirm in GET_EP
112 |    call     rax
113 |    jmp      .quit
114 | 
115 | .swap_operans:
116 |    SWAP     rsi, rcx       ; swap operands
117 |    SWAP     rdx, rbx       ; swap the lengths as well; execution continues into the 8N test
118 | 
119 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
120 | ;; 8*N x 8*M case multiplier
121 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
122 | .test_8N_case:
123 |    mov      rax, rdx
124 |    or       rax, rbx
125 |    and      rax, 7         ; zero only when both lengths are multiples of 8
126 |    jnz      .general_mul
127 | 
128 |    CALL_FUNC     mul_8Nx8M
129 |    jmp      .quit
130 | 
131 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
132 | ;; general case multiplier
133 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
134 | .general_mul:
135 |    CALL_FUNC    mul_NxM
136 |    jmp     .quit
137 | 
138 | .quit:
139 |    REST_XMM
140 |    REST_GPR
141 |    ret
142 | ENDFUNC mpn_mul
143 | 
144 | ;*************************************************************
145 | ;*
146 | ;* uint64_t  mpn_sqr(uint64_t* pR,
147 | ;*                       const uint64_t* pA, int  aSize)
148 | ;* Dispatcher: selects a squaring kernel from the operand length.
149 | ;*************************************************************
150 | align ARCH_ALIGN_FACTOR
151 | IPPASM mpn_sqr,PUBLIC
152 | %assign LOCAL_FRAME 0
153 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
154 |         USES_XMM
155 |         COMP_ABI 3
156 | ; NOTE(review): presumably rdi = pR, rsi = pA, edx = aSize after COMP_ABI - confirm against COMP_ABI
157 |    movsxd   rdx, edx    ; expand length
158 | 
159 |    xor      r8, r8      ; clear scratch
160 |    xor      r9, r9
161 |    xor      r10, r10
162 |    xor      r11, r11
163 |    xor      r12, r12
164 |    xor      r13, r13
165 |    xor      r14, r14
166 |    xor      r15, r15
167 | 
168 |    cmp      rdx, 16
169 |    jg       .test_8N_case      ; signed compare: lengths above 16 leave the short path
170 | 
171 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
172 | ;; short nsA (1,..,16)
173 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174 |    GET_EP   rax, sqr_l_basic, rdx, rbp   ; presumably rax = entry of table sqr_l_basic for length rdx - confirm in GET_EP
175 |    call     rax
176 |    jmp      .quit
177 | 
178 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
179 | ;; 8N case squarer
180 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
181 | .test_8N_case:
182 |    test     rdx, 7             ; low three bits clear <=> length is a multiple of 8
183 |    jnz      .general_sqr
184 | 
185 |    CALL_FUNC     sqr_8N
186 |    jmp      .quit
187 | 
188 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
189 | ;; general case squarer
190 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
191 | .general_sqr:
192 |    CALL_FUNC     sqr_N
193 | ; no jump needed here: execution continues into .quit
194 | .quit:
195 |    REST_XMM
196 |    REST_GPR
197 |    ret
198 | ENDFUNC mpn_sqr
199 | 
200 | ;*************************************************************
201 | ;* Dispatcher: selects a Montgomery reduction kernel from the modulus length.
202 | ;* uint64_t  mpn_montgomery_reduce_bin(uint64_t* pR,
203 | ;*                          uint64_t* pProduct,
204 | ;*                    const uint64_t* pModulus, int  mSize,
205 | ;*                          uint64_t  m)
206 | ;*************************************************************
207 | align ARCH_ALIGN_FACTOR
208 | IPPASM mpn_montgomery_reduce_bin,PUBLIC
209 | %assign LOCAL_FRAME (0)
210 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
211 |         USES_XMM
212 |         COMP_ABI 5
213 | ;pR        (rdi) address of the reduction
214 | ;pProduct  (rsi) address of the temporary product
215 | ;pModulus  (rdx) address of the modulus
216 | ;mSize     (rcx) size    of the modulus
217 | ;m0        (r8)  montgomery helper (m')
218 | 
219 |    mov      r15, rdi    ; store reduction address
220 | 
221 |    ; reload parameters for future convenience:
222 |    mov      rdi, rsi    ; rdi = temporary product buffer
223 |    mov      rsi, rdx    ; rsi = modulus
224 |    movsxd   rdx, ecx    ; rdx = length of modulus
225 | 
226 |    cmp      rdx, 16
227 |    ja       .test_8N_case   ; length of modulus >16
228 | 
229 |    cmp      rdx, 4
230 |    ja       .above4         ; length of modulus 5,..,16
231 | 
232 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
233 | ;; short modulus (1,..,4)
234 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
235 |    cmp      rdx, 3          ; single compare feeds the three branches below
236 |    ja       .red_4          ; length above 3
237 |    jz       .red_3          ; length == 3
238 |    jp       .red_2          ; PF set for 2 (low byte of 2-3 is 0FFh, even parity), clear for 1 (0FEh)
239 |   ;         red_1
240 | ; each .red_N case preloads the low N qwords of the product buffer into r9.. before MRED_FIX
241 | .red_1:
242 |    mov      r9, qword [rdi+sizeof(qword)*0]
243 |    MRED_FIX 1, r15, rdi, rsi, r8, rbp,rbx, r9
244 |    jmp      .quit
245 | 
246 | .red_2:
247 |    mov      r9,  qword [rdi+sizeof(qword)*0]
248 |    mov      r10, qword [rdi+sizeof(qword)*1]
249 |    MRED_FIX 2, r15, rdi, rsi, r8, rbp,rbx, r9,r10
250 |    jmp      .quit
251 | 
252 | .red_3:
253 |    mov      r9,  qword [rdi+sizeof(qword)*0]
254 |    mov      r10, qword [rdi+sizeof(qword)*1]
255 |    mov      r11, qword [rdi+sizeof(qword)*2]
256 |    MRED_FIX 3, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11
257 |    jmp      .quit
258 | 
259 | .red_4:
260 |    mov      r9,  qword [rdi+sizeof(qword)*0]
261 |    mov      r10, qword [rdi+sizeof(qword)*1]
262 |    mov      r11, qword [rdi+sizeof(qword)*2]
263 |    mov      r12, qword [rdi+sizeof(qword)*3]
264 |    MRED_FIX 4, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11,r12
265 |    jmp      .quit
266 | 
267 | 
268 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
269 | ;; short modulus (5,..,16)
270 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
271 | .above4:
272 |    mov      rbp, rdx
273 |    sub      rbp, 4          ; rbp = length - 4, passed to GET_EP below
274 |    GET_EP   rax, mred_short, rbp    ; mred procedure
275 | 
276 |    call     rax
277 |    jmp      .quit
278 | 
279 | 
280 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
281 | ;; 8N case modulus
282 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
283 | .test_8N_case:
284 |    test     rdx, 7          ; low three bits clear <=> length is a multiple of 8
285 |    jnz      .general_case
286 | 
287 |    CALL_FUNC     mred_8N
288 |    jmp      .quit
289 | 
290 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
291 | ;;
292 | ;; general case modulus
293 | ;;
294 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295 | .general_case:
296 |    CALL_FUNC     mred_N
297 | ; no jump needed here: execution continues into .quit
298 | .quit:
299 |    REST_XMM
300 |    REST_GPR
301 |    ret
302 | ENDFUNC mpn_montgomery_reduce_bin
303 | 
304 | %endif
305 | 
306 | %endif ;; _ADCOX_NI_ENABLING_
307 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_usqr_redc_srvl9pp.asm:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2015-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  Cryptography Primitive.
 20 | ;               Big Number Multiplicative Operations
 21 | ;
 22 | ;      Content:
 23 | ;         mpi_umul_bin_adx()
 24 | ;         mpi_usqr_bin_adx()
 25 | ;         mpi_montgomery_reduce_bin_adx()
 26 | ;
 27 | ;  Implementation is using mulx and adcx/adox instructions
 28 | ;
 29 | ;
 30 | 
 31 | %include "asmdefs.inc"
 32 | %include "ia_32e.inc"
 33 | %include "variant.inc"
 34 | 
 35 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_ON_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_)
 36 | %if (__ARCH32E >= __ARCH32E_L9)
 37 | 
 38 | %assign _xEMULATION_ 1
 39 | %assign _ADCX_ADOX_  1
 40 | 
 41 | segment .text align=ARCH_ALIGN_FACTOR
 42 | 
 43 | %include "bn_umulpp.inc"
 44 | %include "bn_usqrpp.inc"
 45 | %include "mred_pp.inc"
 46 | 
 47 | ;*************************************************************
 48 | ;* uint64_t  mpi_umul_bin_adx(uint64_t* pR,
 49 | ;*                       const uint64_t* pA, int  aSize,
 50 | ;*                       const uint64_t* pB, int  bSize)
 51 | ;*************************************************************
 52 | align ARCH_ALIGN_FACTOR
 53 | IPPASM mpi_umul_bin_adx,PUBLIC
 54 | %assign LOCAL_FRAME 0
 55 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
 56 |         USES_XMM
 57 |         COMP_ABI 5
 58 | ; Dispatcher: selects a multiplication kernel from the two operand lengths.
 59 | ; rdi = pR
 60 | ; rsi = pA
 61 | ; edx = nsA
 62 | ; rcx = pB
 63 | ; r8d = nsB
 64 | 
 65 |    movsxd   rdx, edx    ; expand length
 66 |    movsxd   rbx, r8d    ; rbx = nsB (sign-extended); r8 is zeroed as scratch right below
 67 | 
 68 |    xor      r8, r8      ; clear scratch
 69 |    xor      r9, r9
 70 |    xor      r10, r10
 71 |    xor      r11, r11
 72 |    xor      r12, r12
 73 |    xor      r13, r13
 74 |    xor      r14, r14
 75 |    xor      r15, r15
 76 | 
 77 |    cmp      rdx, rbx
 78 |    jl       .swap_operans      ; nsA < nsB
 79 |    jg       .test_8N_case      ; nsA > nsB: test if nsA=8*N and nsB=8*M
 80 | 
 81 |    cmp      rdx, 16            ; reached only when nsA == nsB
 82 |    jg       .test_8N_case      ; equal lengths above 16 also go through the 8N test
 83 | 
 84 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 85 | ;; short nsA==nsB (1,..,16)
 86 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 87 |    cmp      rdx, 4
 88 |    jg       .more_then_4       ; 5,..,16: kernel fetched through GET_EP
 89 | 
 90 |    cmp      edx, 3             ; single compare feeds the three branches below
 91 |    ja       .mul_4_4           ; length above 3 (unsigned compare)
 92 |    jz       .mul_3_3           ; length == 3
 93 |    jp       .mul_2_2           ; PF set for 2 (low byte of 2-3 is 0FFh, even parity), clear for 1 (0FEh)
 94 |   ;         mul_1_1
 95 | 
 96 | .mul_1_1:
 97 |    MUL_NxN  1, rdi, rsi, rcx, rbx, rbp, r8
 98 |    jmp      .quit
 99 | .mul_2_2:
100 |    MUL_NxN  2, rdi, rsi, rcx, rbx, rbp, r8, r9
101 |    jmp      .quit
102 | .mul_3_3:
103 |    MUL_NxN  3, rdi, rsi, rcx, rbx, rbp, r8, r9, r10
104 |    jmp      .quit
105 | .mul_4_4:
106 |    MUL_NxN  4, rdi, rsi, rcx, rbx, rbp, r8, r9, r10, r11
107 |    jmp      .quit
108 | 
109 | .more_then_4:
110 |    GET_EP   rax, mul_lxl_basic, rdx, rbp   ; presumably rax = entry of table mul_lxl_basic for length rdx - confirm in GET_EP
111 |    call     rax
112 |    jmp      .quit
113 | 
114 | .swap_operans:
115 |    SWAP     rsi, rcx       ; swap operands
116 |    SWAP     rdx, rbx       ; swap the lengths as well; execution continues into the 8N test
117 | 
118 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
119 | ;; 8*N x 8*M case multiplier
120 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
121 | .test_8N_case:
122 |    mov      rax, rdx
123 |    or       rax, rbx
124 |    and      rax, 7         ; zero only when both lengths are multiples of 8
125 |    jnz      .general_mul
126 | 
127 |    CALL_FUNC     mul_8Nx8M_adcox
128 |    jmp      .quit
129 | 
130 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
131 | ;; general case multiplier
132 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
133 | .general_mul:
134 |    CALL_FUNC  mul_NxM_adcox
135 |    jmp   .quit
136 | 
137 | .quit:
138 |    REST_XMM
139 |    REST_GPR
140 |    ret
141 | ENDFUNC mpi_umul_bin_adx
142 | 
143 | ;*************************************************************
144 | ;*
145 | ;* uint64_t  mpi_usqr_bin_adx(uint64_t* pR,
146 | ;*                       const uint64_t* pA, int  aSize)
147 | ;* Dispatcher: selects a squaring kernel from the operand length.
148 | ;*************************************************************
149 | align ARCH_ALIGN_FACTOR
150 | IPPASM mpi_usqr_bin_adx,PUBLIC
151 | %assign LOCAL_FRAME 0
152 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
153 |         USES_XMM
154 |         COMP_ABI 3
155 | ; NOTE(review): presumably rdi = pR, rsi = pA, edx = aSize after COMP_ABI - confirm against COMP_ABI
156 |    movsxd   rdx, edx    ; expand length
157 | 
158 |    xor      r8, r8      ; clear scratch
159 |    xor      r9, r9
160 |    xor      r10, r10
161 |    xor      r11, r11
162 |    xor      r12, r12
163 |    xor      r13, r13
164 |    xor      r14, r14
165 |    xor      r15, r15
166 | 
167 |    cmp      rdx, 16
168 |    jg       .test_8N_case      ; signed compare: lengths above 16 leave the short path
169 | 
170 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
171 | ;; short nsA (1,..,16)
172 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
173 |    GET_EP   rax, sqr_l_basic, rdx, rbp   ; presumably rax = entry of table sqr_l_basic for length rdx - confirm in GET_EP
174 |    call     rax
175 |    jmp      .quit
176 | 
177 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
178 | ;; 8N case squarer
179 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
180 | .test_8N_case:
181 |    test     rdx, 7             ; low three bits clear <=> length is a multiple of 8
182 |    jnz      .general_sqr
183 | 
184 |    CALL_FUNC     sqr_8N_adcox
185 |    jmp      .quit
186 | 
187 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
188 | ;; general case squarer
189 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
190 | .general_sqr:
191 |    CALL_FUNC     sqr_N_adcox
192 | ; no jump needed here: execution continues into .quit
193 | .quit:
194 |    REST_XMM
195 |    REST_GPR
196 |    ret
197 | ENDFUNC mpi_usqr_bin_adx
198 | 
199 | ;*************************************************************
200 | ;* Dispatcher: selects a Montgomery reduction kernel from the modulus length.
201 | ;* uint64_t  mpi_montgomery_reduce_bin_adx(uint64_t* pR,
202 | ;*                          uint64_t* pProduct,
203 | ;*                    const uint64_t* pModulus, int  mSize,
204 | ;*                          uint64_t  m)
205 | ;*************************************************************
206 | align ARCH_ALIGN_FACTOR
207 | IPPASM mpi_montgomery_reduce_bin_adx,PUBLIC
208 | %assign LOCAL_FRAME (0)
209 |         USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
210 |         USES_XMM
211 |         COMP_ABI 5
212 | ;pR        (rdi) address of the reduction
213 | ;pProduct  (rsi) address of the temporary product
214 | ;pModulus  (rdx) address of the modulus
215 | ;mSize     (rcx) size    of the modulus
216 | ;m0        (r8)  montgomery helper (m')
217 | 
218 |    mov      r15, rdi    ; store reduction address
219 | 
220 |    ; reload parameters for future convenience:
221 |    mov      rdi, rsi    ; rdi = temporary product buffer
222 |    mov      rsi, rdx    ; rsi = modulus
223 |    movsxd   rdx, ecx    ; rdx = length of modulus
224 | 
225 |    cmp      rdx, 16
226 |    ja       .test_8N_case   ; length of modulus >16
227 | 
228 |    cmp      rdx, 4
229 |    ja       .above4         ; length of modulus 5,..,16
230 | 
231 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
232 | ;; short modulus (1,..,4)
233 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
234 |    cmp      rdx, 3          ; single compare feeds the three branches below
235 |    ja       .red_4          ; length above 3
236 |    jz       .red_3          ; length == 3
237 |    jp       .red_2          ; PF set for 2 (low byte of 2-3 is 0FFh, even parity), clear for 1 (0FEh)
238 |   ;         red_1
239 | ; each .red_N case preloads the low N qwords of the product buffer into r9.. before MRED_FIX
240 | .red_1:
241 |    mov      r9, qword [rdi+sizeof(qword)*0]
242 |    MRED_FIX 1, r15, rdi, rsi, r8, rbp,rbx, r9
243 |    jmp      .quit
244 | 
245 | .red_2:
246 |    mov      r9,  qword [rdi+sizeof(qword)*0]
247 |    mov      r10, qword [rdi+sizeof(qword)*1]
248 |    MRED_FIX 2, r15, rdi, rsi, r8, rbp,rbx, r9,r10
249 |    jmp      .quit
250 | 
251 | .red_3:
252 |    mov      r9,  qword [rdi+sizeof(qword)*0]
253 |    mov      r10, qword [rdi+sizeof(qword)*1]
254 |    mov      r11, qword [rdi+sizeof(qword)*2]
255 |    MRED_FIX 3, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11
256 |    jmp      .quit
257 | 
258 | .red_4:
259 |    mov      r9,  qword [rdi+sizeof(qword)*0]
260 |    mov      r10, qword [rdi+sizeof(qword)*1]
261 |    mov      r11, qword [rdi+sizeof(qword)*2]
262 |    mov      r12, qword [rdi+sizeof(qword)*3]
263 |    MRED_FIX 4, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11,r12
264 |    jmp      .quit
265 | 
266 | 
267 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
268 | ;; short modulus (5,..,16)
269 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
270 | .above4:
271 |    mov      rbp, rdx
272 |    sub      rbp, 4          ; rbp = length - 4, passed to GET_EP below
273 |    GET_EP   rax, mred_short, rbp    ; mred procedure
274 | 
275 |    call     rax
276 |    jmp      .quit
277 | 
278 | 
279 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
280 | ;; 8N case modulus
281 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
282 | .test_8N_case:
283 |    test     rdx, 7          ; low three bits clear <=> length is a multiple of 8
284 |    jnz      .general_case
285 | 
286 |    CALL_FUNC     mred_8N_adcox
287 |    jmp      .quit
288 | 
289 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
290 | ;;
291 | ;; general case modulus
292 | ;;
293 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
294 | .general_case:
295 |    CALL_FUNC     mred_N_adcox
296 | ; no jump needed here: execution continues into .quit
297 | .quit:
298 |    REST_XMM
299 |    REST_GPR
300 |    ret
301 | ENDFUNC mpi_montgomery_reduce_bin_adx
302 | 
303 | %endif
304 | 
305 | %endif ;; _ADCOX_NI_ENABLING_
306 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/mulx.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2013-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;
 19 | ;     Purpose:  EM64T Cryptography Primitive.
 20 | ;               Emulation of Intel(R) instructions MULX, ADCX, ADOX (for debug only)
 21 | ;
 22 | ;
 23 | %ifndef _PCPMULX_INC_
 24 | %assign _PCPMULX_INC_  1
 25 | 
 26 | %ifndef _EMULATION_
 27 | %macro gsmulx 3.nolist
 28 |   ;; %1 = destination for the high half of the product
 29 |   ;; %2 = destination for the low half of the product
 30 |   ;; %3 = explicit source operand (MULX takes rdx as the implicit one)
 31 | 
 32 |    mulx  %1,%2,%3
 33 | %endmacro
 34 | 
 35 | %endif
 36 | 
 37 | %ifdef _EMULATION_
 38 | %macro gsmulx 3.nolist
 39 |   %xdefine %%resH %1
 40 |   %xdefine %%resL %2
 41 |   %xdefine %%src %3
 42 |   ;; Emulates "mulx resH,resL,src" (resH:resL = rdx*src) with MUL; flags are preserved, rax/rdx restored unless they are a destination.
 43 |    pushf                            ;; store flags
 44 |   ;; NOTE(review): src must not be rax (overwritten before it is read) nor rsp-relative memory (rsp has moved by then).
 45 |    sub   rsp, sizeof(qword)*4       ;; 4-qword scratch frame; every access below stays inside it
 46 |    mov   [rsp+sizeof(qword)*3], rax ;; store RAX
 47 |    mov   [rsp+sizeof(qword)*2], rdx ;; store RDX
 48 |    mov   rax,rdx
 49 |    mov   rdx, %%src
 50 | 
 51 |    mul   rdx
 52 | 
 53 |    mov   [rsp+sizeof(qword)*1], rax ;; store Low product
 54 |    mov   [rsp+sizeof(qword)*0], rdx ;; store High product
 55 | 
 56 |    mov   rax, [rsp+sizeof(qword)*3] ;; re-store RAX
 57 |    mov   rdx, [rsp+sizeof(qword)*2] ;; re-store RDX
 58 |    mov   %%resL, [rsp+sizeof(qword)*1];; load Low product
 59 |    mov   %%resH, [rsp+sizeof(qword)*0];; load High product
 60 |    add   rsp, sizeof(qword)*4
 61 | 
 62 |    popf                             ;; re-store flags
 63 | %endmacro
 64 | 
 65 | %endif
 66 | 
 67 | %ifndef _EMULATION_
 68 | %macro gsadcx 2.nolist
 69 |   ;; %1 = destination register (also the first addend)
 70 |   ;; %2 = source operand
 71 | 
 72 |    adcx     %1, %2
 73 | %endmacro
 74 | 
 75 | %endif
 76 | 
 77 | %ifdef _EMULATION_
 78 | %macro gsadcx 2.nolist
 79 |   %xdefine %%rdst %1
 80 |   %xdefine %%src %2
 81 |   ;; Emulates "adcx rdst,src" with ADC: the sum is kept, and of the flags ADC produced only CF is carried over.
 82 |    push  %%rdst      ;; slot for result
 83 |    push  rax         ;; save rax
 84 |    pushfq            ;; flags before adc
 85 |   ;; stack layout now: [rsp] = saved flags, [rsp+8] = saved rax, [rsp+16] = result slot
 86 |    adc   %%rdst, %%src
 87 |    mov   [rsp+2*sizeof(qword)], %%rdst
 88 |   ;; merge the new CF into the saved flags image and reload it
 89 |    pushfq            ;; rax = flags after operation
 90 |    pop   rax
 91 |    and   rax, 1      ;; cf after operation
 92 |    and   qword [rsp], (-2)   ;; clear cf before operation
 93 |    or    [rsp], rax  ;; new psw
 94 |    popfq
 95 | 
 96 |    pop   rax
 97 |    pop   %%rdst
 98 | %endmacro
 99 | 
100 | %endif
101 | 
102 | %ifndef _EMULATION_
103 | %macro gsadox 2.nolist
104 |   ;; %1 = destination register (also the first addend)
105 |   ;; %2 = source operand
106 | 
107 |    adox     %1, %2
108 | %endmacro
109 | 
110 | %endif
111 | 
112 | %ifdef _EMULATION_
113 | %macro gsadox 2.nolist
114 |   %xdefine %%rdst %1
115 |   %xdefine %%src %2
116 |   ;; Emulates "adox rdst,src" with ADC: OF is moved into CF for the add, and the resulting carry is written back as OF.
117 |    push  %%rdst
118 |    push  rax         ;; save rax
119 |   ;; stack layout after the pushfq below: [rsp] = saved flags, [rsp+8] = saved rax, [rsp+16] = result slot
120 |    pushfq            ;; rax = flags before adc
121 |    mov   rax, [rsp]
122 |    and   rax, 800h   ;; of
123 |    xor   [rsp], rax  ;; clear of
124 |   ;; load a temporary flags image whose CF equals the incoming OF
125 |    shr   rax, 11     ;; mov of to cf position
126 |    push  rax         ;; new psw
127 |    popfq
128 |   ;; rax was used as scratch above: reload its original value if it is one of the operands
129 | %ifidni %%src,rax
130 |    mov   rax, [rsp+sizeof(qword)]
131 | %endif
132 | %ifidni %%rdst,rax
133 |    mov   %%rdst, [rsp+2*sizeof(qword)]
134 | %endif
135 | 
136 |    adc   %%rdst, %%src
137 |    mov   [rsp+2*sizeof(qword)], %%rdst
138 |   ;; fold the carry-out into the OF bit of the saved flags image (its CF is left as it was)
139 |    pushfq            ;; rax = flags after operation
140 |    pop   rax
141 |    and   rax, 1      ;; cf after operation
142 | 
143 |    shl   rax, 11     ;; mov cf into of position
144 |    or    [rsp], rax  ;; new psw
145 |    popfq
146 | 
147 |    pop   rax
148 |    pop   %%rdst
149 | %endmacro
150 | 
151 | %endif
152 | 
153 | %endif ;; _PCPMULX_INC_
154 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/os.inc:
--------------------------------------------------------------------------------
 1 | ;===============================================================================
 2 | ; Copyright 2020 Intel Corporation
 3 | ;
 4 | ; Licensed under the Apache License, Version 2.0 (the "License");
 5 | ; you may not use this file except in compliance with the License.
 6 | ; You may obtain a copy of the License at
 7 | ;
 8 | ;     http://www.apache.org/licenses/LICENSE-2.0
 9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 | 
17 | %ifndef OS_ASM_FILE
18 | %define OS_ASM_FILE
19 | ;; Derive WIN_ABI / LINUX from NASM's __OUTPUT_FORMAT__ unless they are already defined.
20 | %ifndef WIN_ABI
21 | %ifidn __OUTPUT_FORMAT__, win64
22 | %define WIN_ABI
23 | %endif
24 | %endif
25 | ;; LINUX is defined for elf64 output here and for macho64 output further down.
26 | %ifndef LINUX
27 | %ifidn __OUTPUT_FORMAT__, elf64
28 | %define LINUX
29 | %endif
30 | %endif
31 | 
32 | ;; code is the same for linux and macos
33 | %ifndef LINUX
34 | %ifidn __OUTPUT_FORMAT__, macho64
35 | %define LINUX
36 | %endif
37 | %endif
38 | 
39 | %endif                          ; OS_ASM_FILE
40 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/reg_sizes.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ; define d and w variants for registers
 18 | 
 19 | %ifndef _REG_SIZES_ASM_
 20 | %define _REG_SIZES_ASM_
 21 | 
 22 | %define	raxd	eax
 23 | %define raxw	ax
 24 | %define raxb	al
 25 | 
 26 | %define	rbxd	ebx
 27 | %define rbxw	bx
 28 | %define rbxb	bl
 29 | 
 30 | %define	rcxd	ecx
 31 | %define rcxw	cx
 32 | %define rcxb	cl
 33 | 
 34 | %define	rdxd	edx
 35 | %define rdxw	dx
 36 | %define rdxb	dl
 37 | 
 38 | %define	rsid	esi
 39 | %define rsiw	si
 40 | %define rsib	sil
 41 | 
 42 | %define	rdid	edi
 43 | %define rdiw	di
 44 | %define rdib	dil
 45 | 
 46 | %define	rbpd	ebp
 47 | %define rbpw	bp
 48 | %define rbpb	bpl
 49 | 
 50 | %define zmm0x xmm0
 51 | %define zmm1x xmm1
 52 | %define zmm2x xmm2
 53 | %define zmm3x xmm3
 54 | %define zmm4x xmm4
 55 | %define zmm5x xmm5
 56 | %define zmm6x xmm6
 57 | %define zmm7x xmm7
 58 | %define zmm8x xmm8
 59 | %define zmm9x xmm9
 60 | %define zmm10x xmm10
 61 | %define zmm11x xmm11
 62 | %define zmm12x xmm12
 63 | %define zmm13x xmm13
 64 | %define zmm14x xmm14
 65 | %define zmm15x xmm15
 66 | %define zmm16x xmm16
 67 | %define zmm17x xmm17
 68 | %define zmm18x xmm18
 69 | %define zmm19x xmm19
 70 | %define zmm20x xmm20
 71 | %define zmm21x xmm21
 72 | %define zmm22x xmm22
 73 | %define zmm23x xmm23
 74 | %define zmm24x xmm24
 75 | %define zmm25x xmm25
 76 | %define zmm26x xmm26
 77 | %define zmm27x xmm27
 78 | %define zmm28x xmm28
 79 | %define zmm29x xmm29
 80 | %define zmm30x xmm30
 81 | %define zmm31x xmm31
 82 | 
 83 | %define ymm0x xmm0
 84 | %define ymm1x xmm1
 85 | %define ymm2x xmm2
 86 | %define ymm3x xmm3
 87 | %define ymm4x xmm4
 88 | %define ymm5x xmm5
 89 | %define ymm6x xmm6
 90 | %define ymm7x xmm7
 91 | %define ymm8x xmm8
 92 | %define ymm9x xmm9
 93 | %define ymm10x xmm10
 94 | %define ymm11x xmm11
 95 | %define ymm12x xmm12
 96 | %define ymm13x xmm13
 97 | %define ymm14x xmm14
 98 | %define ymm15x xmm15
 99 | %define ymm16x xmm16
100 | %define ymm17x xmm17
101 | %define ymm18x xmm18
102 | %define ymm19x xmm19
103 | %define ymm20x xmm20
104 | %define ymm21x xmm21
105 | %define ymm22x xmm22
106 | %define ymm23x xmm23
107 | %define ymm24x xmm24
108 | %define ymm25x xmm25
109 | %define ymm26x xmm26
110 | %define ymm27x xmm27
111 | %define ymm28x xmm28
112 | %define ymm29x xmm29
113 | %define ymm30x xmm30
114 | %define ymm31x xmm31
115 | 
116 | %define xmm0x xmm0
117 | %define xmm1x xmm1
118 | %define xmm2x xmm2
119 | %define xmm3x xmm3
120 | %define xmm4x xmm4
121 | %define xmm5x xmm5
122 | %define xmm6x xmm6
123 | %define xmm7x xmm7
124 | %define xmm8x xmm8
125 | %define xmm9x xmm9
126 | %define xmm10x xmm10
127 | %define xmm11x xmm11
128 | %define xmm12x xmm12
129 | %define xmm13x xmm13
130 | %define xmm14x xmm14
131 | %define xmm15x xmm15
132 | %define xmm16x xmm16
133 | %define xmm17x xmm17
134 | %define xmm18x xmm18
135 | %define xmm19x xmm19
136 | %define xmm20x xmm20
137 | %define xmm21x xmm21
138 | %define xmm22x xmm22
139 | %define xmm23x xmm23
140 | %define xmm24x xmm24
141 | %define xmm25x xmm25
142 | %define xmm26x xmm26
143 | %define xmm27x xmm27
144 | %define xmm28x xmm28
145 | %define xmm29x xmm29
146 | %define xmm30x xmm30
147 | %define xmm31x xmm31
148 | 
149 | %define zmm0y ymm0
150 | %define zmm1y ymm1
151 | %define zmm2y ymm2
152 | %define zmm3y ymm3
153 | %define zmm4y ymm4
154 | %define zmm5y ymm5
155 | %define zmm6y ymm6
156 | %define zmm7y ymm7
157 | %define zmm8y ymm8
158 | %define zmm9y ymm9
159 | %define zmm10y ymm10
160 | %define zmm11y ymm11
161 | %define zmm12y ymm12
162 | %define zmm13y ymm13
163 | %define zmm14y ymm14
164 | %define zmm15y ymm15
165 | %define zmm16y ymm16
166 | %define zmm17y ymm17
167 | %define zmm18y ymm18
168 | %define zmm19y ymm19
169 | %define zmm20y ymm20
170 | %define zmm21y ymm21
171 | %define zmm22y ymm22
172 | %define zmm23y ymm23
173 | %define zmm24y ymm24
174 | %define zmm25y ymm25
175 | %define zmm26y ymm26
176 | %define zmm27y ymm27
177 | %define zmm28y ymm28
178 | %define zmm29y ymm29
179 | %define zmm30y ymm30
180 | %define zmm31y ymm31
181 | 
182 | %define xmm0y ymm0
183 | %define xmm1y ymm1
184 | %define xmm2y ymm2
185 | %define xmm3y ymm3
186 | %define xmm4y ymm4
187 | %define xmm5y ymm5
188 | %define xmm6y ymm6
189 | %define xmm7y ymm7
190 | %define xmm8y ymm8
191 | %define xmm9y ymm9
192 | %define xmm10y ymm10
193 | %define xmm11y ymm11
194 | %define xmm12y ymm12
195 | %define xmm13y ymm13
196 | %define xmm14y ymm14
197 | %define xmm15y ymm15
198 | %define xmm16y ymm16
199 | %define xmm17y ymm17
200 | %define xmm18y ymm18
201 | %define xmm19y ymm19
202 | %define xmm20y ymm20
203 | %define xmm21y ymm21
204 | %define xmm22y ymm22
205 | %define xmm23y ymm23
206 | %define xmm24y ymm24
207 | %define xmm25y ymm25
208 | %define xmm26y ymm26
209 | %define xmm27y ymm27
210 | %define xmm28y ymm28
211 | %define xmm29y ymm29
212 | %define xmm30y ymm30
213 | %define xmm31y ymm31
214 | ; xmmN -> zmmN aliases: ZWORD(xmmN) pastes a 'z' suffix onto the name and resolves through these defines.
215 | %define xmm0z zmm0
216 | %define xmm1z zmm1
217 | %define xmm2z zmm2
218 | %define xmm3z zmm3
219 | %define xmm4z zmm4
220 | %define xmm5z zmm5
221 | %define xmm6z zmm6
222 | %define xmm7z zmm7
223 | %define xmm8z zmm8
224 | %define xmm9z zmm9
225 | %define xmm10z zmm10
226 | %define xmm11z zmm11
227 | %define xmm12z zmm12
228 | %define xmm13z zmm13
229 | %define xmm14z zmm14
230 | %define xmm15z zmm15
231 | %define xmm16z zmm16
232 | %define xmm17z zmm17
233 | %define xmm18z zmm18
234 | %define xmm19z zmm19
235 | %define xmm20z zmm20
236 | %define xmm21z zmm21
237 | %define xmm22z zmm22
238 | %define xmm23z zmm23
239 | %define xmm24z zmm24
240 | %define xmm25z zmm25
241 | %define xmm26z zmm26
242 | %define xmm27z zmm27
243 | %define xmm28z zmm28
244 | %define xmm29z zmm29
245 | %define xmm30z zmm30
246 | %define xmm31z zmm31
247 | ; ymmN -> zmmN aliases: ZWORD(ymmN) pastes a 'z' suffix onto the name and resolves through these defines.
248 | %define ymm0z zmm0
249 | %define ymm1z zmm1
250 | %define ymm2z zmm2
251 | %define ymm3z zmm3
252 | %define ymm4z zmm4
253 | %define ymm5z zmm5
254 | %define ymm6z zmm6
255 | %define ymm7z zmm7
256 | %define ymm8z zmm8
257 | %define ymm9z zmm9
258 | %define ymm10z zmm10
259 | %define ymm11z zmm11
260 | %define ymm12z zmm12
261 | %define ymm13z zmm13
262 | %define ymm14z zmm14
263 | %define ymm15z zmm15
264 | %define ymm16z zmm16
265 | %define ymm17z zmm17
266 | %define ymm18z zmm18
267 | %define ymm19z zmm19
268 | %define ymm20z zmm20
269 | %define ymm21z zmm21
270 | %define ymm22z zmm22
271 | %define ymm23z zmm23
272 | %define ymm24z zmm24
273 | %define ymm25z zmm25
274 | %define ymm26z zmm26
275 | %define ymm27z zmm27
276 | %define ymm28z zmm28
277 | %define ymm29z zmm29
278 | %define ymm30z zmm30
279 | %define ymm31z zmm31
280 | ; Sub-register selectors: each pastes a one-letter size suffix (d/w/b) onto the register token passed in.
281 | %define DWORD(reg) reg %+ d
282 | %define WORD(reg)  reg %+ w
283 | %define BYTE(reg)  reg %+ b
284 | ; Vector-width selectors: the pasted x/y/z name is resolved by the alias %defines of this file (e.g. xmm0y -> ymm0).
285 | %define XWORD(reg) reg %+ x
286 | %define YWORD(reg) reg %+ y
287 | %define ZWORD(reg) reg %+ z
288 | 
289 | %endif ;; _REG_SIZES_ASM_
290 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/variant.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2015-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | ;
 18 | ;               Intel(R) Integrated Performance Primitives
 19 | ;                   Cryptographic Primitives (ippcp)
 20 | ;
 21 | ;      Purpose:
 22 | ;         Define ippCP variant
 23 | ;
 24 | ;  do not change the definitions below!
 25 | ;
 26 | 
 27 | ;;
 28 | ;; modes of a feature (the values assigned to the _*_ENABLING_ / _ENABLE_* / _USE_C_* symbols below)
 29 | ;;
 30 | %assign _FEATURE_OFF_      0   ;; feature is OFF
 31 | %assign _FEATURE_ON_       1   ;; feature is ON
 32 | %assign _FEATURE_TICKTOCK_ 2   ;; detect whether the feature is OFF/ON
 33 | 
 34 | ;; optional variant overrides: each file is included only when its symbol is defined, ahead of the defaults below
 35 | ; %define _XMM7560_ 1   ; uncomment (or define the symbol externally) to include variant_xmm7560.inc
 36 | %ifdef _XMM7560_
 37 | %include "variant_xmm7560.inc"
 38 | %endif
 39 | 
 40 | ; %define _TXT_ACM_ 1   ; uncomment (or define the symbol externally) to include variant_txt_acm.inc
 41 | %ifdef _TXT_ACM_
 42 | %include "variant_txt_acm.inc"
 43 | %endif
 44 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 45 | 
 46 | ;;
 47 | ;; it is possible to force use of the C version of some implementations instead of the ASM one;
 48 | ;; every guard tests the very symbol it assigns, so an externally predefined value is kept
 49 | ;;
 50 | %ifndef _USE_C_MPZ_uadd_
 51 |   %assign _USE_C_MPZ_uadd_ _FEATURE_OFF_
 52 | %endif
 53 | 
 54 | %ifndef _USE_C_MPZ_usub_
 55 |   %assign _USE_C_MPZ_usub_ _FEATURE_OFF_
 56 | %endif
 57 | 
 58 | %ifndef _USE_C_MPZ_uadd_word_
 59 |   %assign _USE_C_MPZ_uadd_word_ _FEATURE_OFF_
 60 | %endif
 61 | 
 62 | %ifndef _USE_C_batch_mul_add_
 63 |   %assign _USE_C_batch_mul_add_ _FEATURE_OFF_
 64 | %endif
 65 | 
 66 | %ifndef _USE_C_batch_mul_
 67 |   %assign _USE_C_batch_mul_ _FEATURE_OFF_
 68 | %endif
 69 | 
 70 | %ifndef _USE_C_cpMulSqr_BNU_vectorized_
 71 |   %assign _USE_C_cpMulSqr_BNU_vectorized_ _FEATURE_OFF_
 72 | %endif
 73 | 
 74 | %ifndef _USE_C_bn_mont_red_words_
 75 |   %assign _USE_C_bn_mont_red_words_ _FEATURE_OFF_
 76 | %endif
 77 | 
 78 | ;;
 79 | ;; set _AES_NI_ENABLING_: __ARCH_AES_NI_ (0 or 1) forces OFF/ON and any other value is an %error;
 80 | ;; if __ARCH_AES_NI_ is not defined: TICKTOCK when __ARCH32E >= __ARCH32E_Y8, otherwise OFF
 81 | %ifdef __ARCH_AES_NI_
 82 |   %if (__ARCH_AES_NI_ == 0)
 83 |     %assign _AES_NI_ENABLING_ _FEATURE_OFF_
 84 |   %elif (__ARCH_AES_NI_ == 1)
 85 |     %assign _AES_NI_ENABLING_ _FEATURE_ON_
 86 |   %else
 87 |     %error <Define __ARCH_AES_NI_=0 or 1 or omit __ARCH_AES_NI_ at all>
 88 |   %endif
 89 | %else
 90 |   %if (__ARCH32E >= __ARCH32E_Y8)
 91 |     %assign _AES_NI_ENABLING_ _FEATURE_TICKTOCK_
 92 |   %else
 93 |     %assign _AES_NI_ENABLING_ _FEATURE_OFF_
 94 |   %endif
 95 | %endif
 96 | 
 97 | ;;
 98 | ;; if there is no outside assignment (for example from an included variant file such as variant_txt_acm.inc),
 99 | ;; set _SHA_NI_ENABLING_ from the target level: TICKTOCK when __ARCH32E >= __ARCH32E_Y8, otherwise OFF
100 | ;;
101 | %ifndef _SHA_NI_ENABLING_
102 |   %if (__ARCH32E >= __ARCH32E_Y8 )
103 |     %assign _SHA_NI_ENABLING_ _FEATURE_TICKTOCK_
104 |   %else
105 |     %assign _SHA_NI_ENABLING_ _FEATURE_OFF_
106 |   %endif
107 | %endif
108 | 
109 | ;;
110 | ;; set _ADCOX_NI_ENABLING_: __ARCH_ADCX_NI_ (0 or 1) forces OFF/ON and any other value is an %error;
111 | ;; if __ARCH_ADCX_NI_ is not defined: TICKTOCK when __ARCH32E >= __ARCH32E_L9, otherwise OFF
112 | %ifdef __ARCH_ADCX_NI_
113 |   %if (__ARCH_ADCX_NI_ == 0)
114 |     %assign _ADCOX_NI_ENABLING_ _FEATURE_OFF_
115 |   %elif (__ARCH_ADCX_NI_ == 1)
116 |     %assign _ADCOX_NI_ENABLING_ _FEATURE_ON_
117 |   %else
118 |     %error  <Define __ARCH_ADCX_NI_=0 or 1 or omit __ARCH_ADCX_NI_ at all>
119 |   %endif
120 | %else
121 |   %if (__ARCH32E >= __ARCH32E_L9)
122 |     %assign _ADCOX_NI_ENABLING_ _FEATURE_TICKTOCK_
123 |   %else
124 |     %assign _ADCOX_NI_ENABLING_ _FEATURE_OFF_
125 |   %endif
126 | %endif
127 | 
128 | 
129 | ;;
130 | ;; select Hash algorithm: defining _DISABLE_ALG_<name>_ switches that algorithm OFF, otherwise it is ON
131 | ;; (the historical misspelling _DISABLE_ALG_SHA521_ is still honoured as a switch for SHA512)
132 | %ifndef _DISABLE_ALG_SHA1_
133 |   %assign _ENABLE_ALG_SHA1_ _FEATURE_ON_  ;; SHA1 on
134 | %else
135 |   %assign _ENABLE_ALG_SHA1_ _FEATURE_OFF_ ;; SHA1 off
136 | %endif
137 | 
138 | %ifndef _DISABLE_ALG_SHA256_
139 |   %assign _ENABLE_ALG_SHA256_ _FEATURE_ON_  ;; SHA256 on
140 | %else
141 |   %assign _ENABLE_ALG_SHA256_ _FEATURE_OFF_ ;; SHA256 off
142 | %endif
143 | %ifdef _DISABLE_ALG_SHA512_
144 |   %assign _ENABLE_ALG_SHA512_ _FEATURE_OFF_ ;; SHA512 off
145 | %elifdef _DISABLE_ALG_SHA521_
146 |   %assign _ENABLE_ALG_SHA512_ _FEATURE_OFF_ ;; SHA512 off (legacy spelling of the switch)
147 | %else
148 |   %assign _ENABLE_ALG_SHA512_ _FEATURE_ON_  ;; SHA512 on
149 | %endif
150 | %ifndef _DISABLE_ALG_MD5_
151 |   %assign _ENABLE_ALG_MD5_ _FEATURE_ON_  ;; MD5 on
152 | %else
153 |   %assign _ENABLE_ALG_MD5_ _FEATURE_OFF_ ;; MD5 off
154 | %endif
155 | 
156 | %ifndef _DISABLE_ALG_SM3_
157 |   %assign _ENABLE_ALG_SM3_ _FEATURE_ON_  ;; SM3 on
158 | %else
159 |   %assign _ENABLE_ALG_SM3_ _FEATURE_OFF_ ;; SM3 off
160 | %endif
161 | 
162 | ;;
163 | ;; BN arithmetic (assigned unconditionally: there is no %ifndef guard, so the value here is always OFF)
164 | ;;
165 | %assign _ENABLE_KARATSUBA_ _FEATURE_OFF_  ;; do not use the Karatsuba method for multiplication
166 | 
167 | ;;
168 | ;; EC specific: implementation kinds, then one default per curve that applies unless the symbol is predefined
169 | ;; (every curve defaults to _ECP_IMPL_MFM_ except _ECP_128_, which defaults to _ECP_IMPL_SPECIFIC_)
170 | %assign _ECP_IMPL_NONE_       0
171 | %assign _ECP_IMPL_ARBIRTRARY_ 1
172 | %assign _ECP_IMPL_SPECIFIC_   2
173 | %assign _ECP_IMPL_MFM_        3
174 | 
175 | %ifndef _ECP_128_
176 |   %assign _ECP_128_ _ECP_IMPL_SPECIFIC_
177 | %endif
178 | 
179 | %ifndef _ECP_192_
180 |   %assign _ECP_192_ _ECP_IMPL_MFM_
181 | %endif
182 | 
183 | %ifndef _ECP_224_
184 |   %assign _ECP_224_ _ECP_IMPL_MFM_
185 | %endif
186 | 
187 | %ifndef _ECP_256_
188 |   %assign _ECP_256_ _ECP_IMPL_MFM_
189 | %endif
190 | 
191 | %ifndef _ECP_384_
192 |   %assign _ECP_384_ _ECP_IMPL_MFM_
193 | %endif
194 | 
195 | %ifndef _ECP_521_
196 |   %assign _ECP_521_ _ECP_IMPL_MFM_
197 | %endif
198 | 
199 | %ifndef _ECP_SM2_
200 |   %assign _ECP_SM2_ _ECP_IMPL_MFM_
201 | %endif
202 | 


--------------------------------------------------------------------------------
/mpn/asm/intel64/variant_txt_acm.inc:
--------------------------------------------------------------------------------
 1 | ;===============================================================================
 2 | ; Copyright 2015-2020 Intel Corporation
 3 | ;
 4 | ; Licensed under the Apache License, Version 2.0 (the "License");
 5 | ; you may not use this file except in compliance with the License.
 6 | ; You may obtain a copy of the License at
 7 | ;
 8 | ;     http://www.apache.org/licenses/LICENSE-2.0
 9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 | 
17 | ;
18 | ;               Intel(R) Integrated Performance Primitives
19 | ;                   Cryptographic Primitives (ippcp)
20 | ;
21 | ;      Purpose:
22 | ;        Update standard ippCP variant
23 | ;
24 | ;  do not change the definitions below!
25 | ;
26 | 
27 | %ifdef _TXT_ACM_
28 | 
29 | ;;
30 | ;; HASH algs outside settings: pre-assigns _SHA_NI_ENABLING_ (the includer must already define _FEATURE_TICKTOCK_)
31 | ;;
32 | %assign _SHA_NI_ENABLING_ _FEATURE_TICKTOCK_
33 | 
34 | ;;
35 | ;; select Hash algorithm (the override below is commented out, i.e. currently inactive)
36 | ;;
37 | ; %assign _ENABLE_ALG_MD5_ _FEATURE_OFF_
38 | 
39 | %endif
40 | 


--------------------------------------------------------------------------------
/mpn/asm/utils.inc:
--------------------------------------------------------------------------------
  1 | ;===============================================================================
  2 | ; Copyright 2015-2020 Intel Corporation
  3 | ;
  4 | ; Licensed under the Apache License, Version 2.0 (the "License");
  5 | ; you may not use this file except in compliance with the License.
  6 | ; You may obtain a copy of the License at
  7 | ;
  8 | ;     http://www.apache.org/licenses/LICENSE-2.0
  9 | ;
 10 | ; Unless required by applicable law or agreed to in writing, software
 11 | ; distributed under the License is distributed on an "AS IS" BASIS,
 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | ; See the License for the specific language governing permissions and
 14 | ; limitations under the License.
 15 | ;===============================================================================
 16 | 
 17 | %ifndef __UTILS_INC__
 18 | %define __UTILS_INC__ 1
 19 | 
 20 | ; Invoke the functor given as the LAST parameter once for every non-empty element of the list given as the preceding parameters.
 21 | ; Elements are visited first-to-last (%rotate 1 per step); empty elements are skipped, so the input list can be empty.
 22 | %macro FOREACH 2-*.nolist
 23 |   %rotate -1
 24 |   %xdefine %%functor %1
 25 |   %rep %0-1
 26 |   %rotate 1
 27 |     %ifnempty %1
 28 |       %%functor %1
 29 |     %endif
 30 |   %endrep
 31 | %endmacro
 32 | 
 33 | ; Invoke the functor given as the LAST parameter once for every non-empty element of the list given as the preceding parameters.
 34 | ; Elements are visited last-to-first (%rotate -1 per step); empty elements are skipped, so the input list can be empty.
 35 | %macro RFOREACH 2-*.nolist
 36 |   %rotate -1
 37 |   %xdefine %%functor %1
 38 |   %rep %0-1
 39 |   %rotate -1
 40 |     %ifnempty %1
 41 |       %%functor %1
 42 |     %endif
 43 |   %endrep
 44 | %endmacro
 45 | 
 46 | ; Shall be called before INTERSECT: pushes the _INTERSECT_CTX_ context and resets %$intersection (empty) and %$cardinality (0).
 47 | %macro BEGIN_INTERSECT 0.nolist
 48 |   %push _INTERSECT_CTX_
 49 |   %xdefine %$intersection
 50 |   %assign %$cardinality 0
 51 | %endmacro
 52 | 
 53 | ; Shall be called after INTERSECT to close the context (pops _INTERSECT_CTX_); read the results before calling it.
 54 | %macro END_INTERSECT 0.nolist
 55 |   %pop _INTERSECT_CTX_
 56 | %endmacro
 57 | 
 58 | ; The macro searches intersection between two lists: the elements of list 1 that ?FIND locates in list 2, in list-1 order.
 59 | ; Input: two comma-separated lists, enclosed in curly braces. Must run inside _INTERSECT_CTX_ (see BEGIN_INTERSECT), else %fatal.
 60 | ; Output:
 61 | ; - Intersection will be located in the %$intersection context macro (can be empty).
 62 | ; - Count of intersection elements list will be stored in the %$cardinality context variable.
 63 | %macro INTERSECT 2.nolist
 64 |   %ifnctx _INTERSECT_CTX_
 65 |     %fatal "Not in the context: _INTERSECT_CTX_"
 66 |   %endif
 67 | 
 68 |   %xdefine %%list1 %1
 69 |   %xdefine %%list2 %2
 70 | 
 71 |   FOREACH %%list1,{?INTERSECT_BODY {%%list2},}
 72 | %endmacro
 73 | 
 74 | ; Helper macro to concatenate two lists with a comma; when one of the lists is empty the result is just the other one.
 75 | ; The result will be stored in the 3rd parameter that must be a macro identifier (anything else is a %fatal error).
 76 | %macro CONCATENATE 3.nolist
 77 |   %ifnid %3
 78 |     %fatal "CONCATENATE: 3rd parameter must be a macro identifier."
 79 |   %endif
 80 |   %define %3 %[%1]
 81 |   %ifnempty %3
 82 |     %ifnempty %2
 83 |       %define %3 %[%3],%[%2]
 84 |     %endif
 85 |   %else
 86 |     %define %3 %[%2]
 87 |   %endif
 88 | %endmacro
 89 | 
 90 | ; Helper macro that searches the specified element in the input list.
 91 | ; Input:
 92 | ; - Last parameter - target element
 93 | ; - First parameters refer to the list where the search is processed (scanned last-to-first, stopping at the first hit).
 94 | ; Output:
 95 | ; - Requires the _FIND_CTX_ context; %$elem_exists is undefined first and defined only on a (case-insensitive, %ifidni) match.
 96 | %macro ?FIND 2-*.nolist
 97 |   %ifnctx _FIND_CTX_
 98 |     %fatal "Not in the context: _FIND_CTX_"
 99 |   %endif
100 |   %rotate -1
101 |   %xdefine %%elem_to_check %1
102 |   %undef %$elem_exists
103 | 
104 |   %rep %0-1
105 |     %rotate -1
106 |     %ifidni %%elem_to_check, %1
107 |       %define %$elem_exists %1
108 |       %exitrep
109 |     %endif
110 |   %endrep
111 | %endmacro
112 | 
113 | ; INTERSECT functor: %1 = list to search, %2 = element; on a hit appends %2 to %$$intersection and bumps %$$cardinality of the outer context.
114 | %macro ?INTERSECT_BODY 2.nolist
115 |   %xdefine %%list %1
116 |   %xdefine %%elem %2
117 | 
118 |   %push _FIND_CTX_
119 |   ?FIND %%list,%%elem
120 |   %ifdef %$elem_exists
121 |     %ifempty %$$intersection
122 |       %define %$$intersection %2
123 |     %else
124 |       %define %$$intersection %[%$$intersection],%%elem
125 |     %endif
126 |     %assign %$$cardinality %$$cardinality + 1
127 |   %endif
128 |   %pop _FIND_CTX_
129 | %endmacro
130 | 
131 | %endif
132 | 


--------------------------------------------------------------------------------
/mpn/mpn-asm.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 2021 Ethan.cr.yp.to
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      https://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #include "mpn-asm.h"
17 | /* __mpi_clz_tab[i] == bit_length(i) + 1 for i = 1..128 (entry 0 is 1); emitted only if COUNT_LEADING_ZEROS_NEED_CLZ_TAB is defined */
18 | #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
19 | // clang-format off
20 | const unsigned char __mpi_clz_tab[129] = {
21 |     1, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
22 |     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
23 |     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
24 |     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
25 |     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
26 |     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
27 |     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
28 |     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
29 |     9,
30 | };
31 | // clang-format on
32 | #endif
33 | 


--------------------------------------------------------------------------------
/mpn/mpn-binary.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 2021 Ethan.cr.yp.to
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | #ifndef MULTIPLE_PRECISION_BINARY_H
 17 | #define MULTIPLE_PRECISION_BINARY_H
 18 | 
 19 | #include <mpn/mpn-asm.h>
 20 | #include <mpn/mpn-optimizer.h>
 21 | 
 22 | #define BITS_PER_BYTE        8                          /* @constant: bits per byte */
 23 | #define BITS_PER_CHAR        4                          /* @constant: bits per character (presumably one hex digit; confirm) */
 24 | #define MPN_MAX_BITS         (UINT_MAX / BITS_PER_BYTE) /* @note: mpn width limitation */
 25 | #define MPN_BITS_TO_BYTES(n) (((n) + BITS_PER_BYTE - 1) / BITS_PER_BYTE) /* bytes needed to hold n bits, rounded up */
 26 | 
 27 | /* swap |a| and |b| (both of type |type|) through a temporary named __t; do/while(0) wrapper, so write a ';' after it */
 28 | #define SWAP(type, a, b) \
 29 |     do {                 \
 30 |         type __t = a;    \
 31 |         (a) = (b);       \
 32 |         (b) = __t;       \
 33 |     } while (0)
 34 | 
 35 | /* copy(increment): dst[i] = src[i] for i = 0 .. to-1 in ascending order; expands to a for-statement whose index is __i */
 36 | #define COPY(dst, src, to) \
 37 |     for (mpn_size_t __i = 0; __i < (to); __i++) { (dst)[__i] = (src)[__i]; }
 38 | 
 39 | /* expand by zeros: dst[i] = 0 for i = from .. to-1; expands to a for-statement whose index is __i */
 40 | #define ZEROIZE(dst, from, to) \
 41 |     for (mpn_size_t __i = (from); __i < (to); __i++) { (dst)[__i] = 0; }
 42 | 
 43 | /* copy src[0 .. srclen-1] to dst, then zero dst[srclen .. dstlen-1]; expands to a plain { } block, not do/while(0) */
 44 | #define ZEXPAND(dst, dstlen, src, srclen)                                 \
 45 |     {                                                                     \
 46 |         mpn_size_t __i;                                                   \
 47 |         for (__i = 0; __i < (srclen); __i++) { (dst)[__i] = (src)[__i]; } \
 48 |         for (; __i < (dstlen); __i++) { (dst)[__i] = 0; }                 \
 49 |     }
 50 | 
 51 | /**
 52 |  * mpn alignment helpers; mpi_aligned_diff: distance from |ptr| up to the next |alignment| boundary (0 when already aligned)
 53 |  */
 54 | MPN_INLINE mpn_size_t mpi_aligned_diff(void *ptr, uintptr_t alignment)
 55 | {
 56 |     return (mpn_size_t)(((uintptr_t)0 - ((uintptr_t)ptr & (alignment - 1))) & (alignment - 1));
 57 | }
 58 | /* round |size| up to a multiple of |alignment| (mask arithmetic; presumably |alignment| is a power of two) */
 59 | MPN_INLINE mpn_size_t mpi_aligned_size(mpn_size_t size, mpn_size_t alignment)
 60 | {
 61 |     return (size + (alignment - 1)) & ~(alignment - 1);
 62 | }
 63 | /* round |ptr| up to the next |alignment| boundary and return it as a limb pointer */
 64 | MPN_INLINE mpn_limb_t *mpi_aligned_pointer(void *ptr, uintptr_t alignment)
 65 | {
 66 |     return (mpn_limb_t *)(~(alignment - 1) & (uintptr_t)((unsigned char *)ptr + alignment - 1));
 67 | }
 68 | 
 69 | /**
 70 |  * basic constant-time operation
 71 |  */
 72 | /* all-ones when the top bit of |a| is set, all-zeros otherwise: two's-complement negation of the isolated top bit */
 73 | MPN_INLINE mpn_limb_t mpn_limb_test_msb_consttime(mpn_limb_t a)
 74 | {
 75 |     return (mpn_limb_t)(~(a >> (sizeof(mpn_limb_t) * BITS_PER_BYTE - 1)) + 1);
 76 | }
 77 | 
 78 | /* all-ones when |a| == 0, all-zeros otherwise */
 79 | MPN_INLINE mpn_limb_t mpn_limb_is_zero_consttime(mpn_limb_t a)
 80 | {
 81 |     const mpn_limb_t low_ones = (mpn_limb_t)(~a & (a - 1)); /* top bit ends up set only for a == 0 */
 82 |     return mpn_limb_test_msb_consttime(low_ones);
 83 | }
 84 | 
 85 | /* select under mask, limb by limb: dst[] = (a[] & mask) ^ (b[] & ~mask), for |len| limbs */
 86 | MPN_INLINE void mpn_masked_copy_consttime(mpn_limb_t *dst, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t len,
 87 |                                           mpn_limb_t mask)
 88 | {
 89 |     const mpn_limb_t keep_b = ~mask;
 90 |     for (mpn_size_t k = 0; k < len; k++) { dst[k] = (b[k] & keep_b) ^ (a[k] & mask); }
 91 | }
 92 | 
 93 | /* conditional swap: exchanges a[] and b[] (|n| limbs) when |cond| is non-zero; leaves both unchanged otherwise */
 94 | MPN_INLINE void mpn_masked_swap_consttime(mpn_limb_t *a, mpn_limb_t *b, mpn_size_t n, unsigned cond)
 95 | {
 96 |     /* all-ones when cond != 0, all-zeros when cond == 0 */
 97 |     const mpn_limb_t mask = (mpn_limb_t)~mpn_limb_is_zero_consttime((mpn_limb_t)cond);
 98 |     for (mpn_size_t k = 0; k < n; k++) {
 99 |         const mpn_limb_t diff = mask & (a[k] ^ b[k]);
100 |         a[k] ^= diff;
101 |         b[k] ^= diff;
102 |     }
103 | }
104 | 
105 | /* conditional move: dst[] = cond ? src[] : dst[]; the mask is 0 - (cond != 0), i.e. all-ones for any non-zero |cond| */
106 | MPN_INLINE void mpn_masked_move_consttime(mpn_limb_t *dst, const mpn_limb_t *src, mpn_size_t len, unsigned cond)
107 | {
108 |     mpn_masked_copy_consttime(dst, src, dst, len, (mpn_limb_t)0 - (mpn_limb_t)(cond != 0));
109 | }
110 | 
111 | #if defined(__cplusplus)
112 | extern "C" {
113 | #endif
114 | 
115 | /**
116 |  * mpn: ALL-ones if buff[::] is zero, otherwise ALL-zeros
117 |  */
118 | mpn_limb_t mpn_is_zero(const mpn_limb_t *buff, mpn_size_t bufflen);
119 | 
120 | /**
121 |  * mpn: ALL-ones if buff[::] is zero, otherwise ALL-zeros(constant-time version)
122 |  */
123 | mpn_limb_t mpn_is_zero_consttime(const mpn_limb_t *buff, mpn_size_t bufflen);
124 | 
125 | /**
126 |  * mpn: get most significant bit
127 |  */
128 | mpn_size_t mpn_bits(const mpn_limb_t *data, mpn_size_t size);
129 | 
130 | /**
131 |  * mpn: get most significant bit(constant-time version)
132 |  */
133 | mpn_size_t mpn_bits_consttime(const mpn_limb_t *data, mpn_size_t size);
134 | 
135 | /**
136 |  * mpn: get most significant limb
137 |  */
138 | mpn_size_t mpn_limbs(const mpn_limb_t *data, mpn_size_t size);
139 | 
140 | /**
141 |  * mpn: get most significant limb(constant-time version)
142 |  */
143 | mpn_size_t mpn_limbs_consttime(const mpn_limb_t *data, mpn_size_t size);
144 | 
145 | /**
146 |  * mpn: unsigned comparison
147 |  *
148 |  * @note:
149 |  *   1. return 1 if a[] > b[]; 0 if a[] = b[]; -1 if a[] < b[]
150 |  */
151 | int mpn_cmp(const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, mpn_size_t bsize);
152 | 
153 | 
154 | /**
155 |  * mpn: left shift
156 |  *
157 |  * @note:
158 |  *   1. required bit_size(r) >= bit_size(a) + nbits
159 |  *   2. the return is number of |mpn_limb_t| of the result |r|
160 |  *   3. r == a is acceptable
161 |  */
162 | mpn_size_t mpn_lshift(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_size_t nbits);
163 | 
164 | /**
165 |  * mpn: right shift
166 |  *
167 |  * @note:
168 |  *   1. required bit_size(r) >= bit_size(a) - nbits
169 |  *   2. the return is number of |mpn_limb_t| of the result |r|
170 |  *   3. r == a is acceptable
171 |  */
172 | mpn_size_t mpn_rshift(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_size_t nbits);
173 | 
174 | /**
175 |  * mpn addition: carry, r = a[:n] + b[:n]
176 |  */
177 | mpn_limb_t mpn_add_vectorized(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t n);
178 | 
179 | /**
180 |  * mpn: carry, r[] = a[] + b[]
181 |  */
182 | mpn_limb_t mpn_add(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b,
183 |                    mpn_size_t bsize);
184 | 
185 | /**
186 |  * mpn: carry, r[:n] = a[:n] + w
187 |  */
188 | mpn_limb_t mpn_inc_vectorized(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t size, mpn_limb_t w);
189 | 
190 | /**
191 |  * mpn: carry, r[] = a[] + w
192 |  */
193 | mpn_limb_t mpn_inc(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w);
194 | 
195 | /**
196 |  * mpn subtraction: borrow, r[:n] = a[:n] - b[:n]
197 |  *
198 |  * @note:
199 |  *   1. make sure r->room is enough to store the result
200 |  *      minimal advise size: MAX(bit_size(a), bit_size(b)) + 1
201 |  */
202 | mpn_limb_t mpn_sub_vectorized(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t n);
203 | 
204 | /**
205 |  * mpn subtraction: size, r[] = a[] - b[]
206 |  */
207 | mpn_size_t mpn_sub(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b,
208 |                    mpn_size_t bsize);
209 | 
210 | /**
211 |  * mpn: borrow, r[:n] = a[:n] - w
212 |  */
213 | mpn_limb_t mpn_dec_vectorized(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w);
214 | 
215 | /**
216 |  * mpn: size, r[] = a[] - w
217 |  */
218 | mpn_size_t mpn_dec(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w);
219 | 
220 | /**
221 |  * mpn multiplication: extension, r[:asize+bsize] = a[:asize] * b[:bsize]
222 |  * @note:
223 |  *   1. (IMPORTANT)make sure size of |r| isn't less than |asize| + |bsize|
224 |  *   2. the return is the highest unit |mpn_limb_t|
225 |  */
226 | mpn_limb_t mpn_mul(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, mpn_size_t bsize);
227 | 
228 | /**
229 |  * mpn multiply-and-add: extension, r[] += a[] * b
230 |  * @note:
231 |  *   1. (IMPORTANT)make sure size of |r| isn't less than |asize|
232 |  *   2. the return is extension of result of multiply-and-add.
233 |  */
234 | mpn_limb_t mpn_mul_acc(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t b);
235 | 
236 | /**
237 |  * mpn square: r[] = a[] ^ 2
238 |  *
239 |  * @note:
240 |  *   1. make sure r->room is enough to store the result
241 |  *      minimal advise size: 2 * bit_size(a)
242 |  */
243 | mpn_limb_t mpn_sqr(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t anum);
244 | 
245 | /**
246 |  * mpn division: xsize, q, x(q = x / y, x = x % y)
247 |  */
248 | mpn_size_t mpn_div(mpn_limb_t *q, mpn_size_t *qsize, mpn_limb_t *x, mpn_size_t xsize, mpn_limb_t *y, mpn_size_t ysize);
249 | 
250 | /**
251 |  * mpn modular: x[] = x[] % y[]
252 |  */
253 | mpn_size_t mpn_mod(mpn_limb_t *x, mpn_size_t xsize, mpn_limb_t *y, mpn_size_t ysize);
254 | 
255 | /**
256 |  * mpn: division(n by 1)
257 |  *
258 |  * @note:
259 |  *   1. required length of q should be not smaller than size
260 |  */
261 | mpn_size_t mpn_div_limb(mpn_limb_t q[], const mpn_limb_t x[], mpn_size_t size, mpn_limb_t *r, mpn_limb_t d);
262 | 
263 | /**
264 |  * mpn: division(n by 2)
265 |  *
266 |  * @note:
267 |  *   1. required length of q should be not smaller than size
268 |  */
269 | mpn_size_t mpn_div_double_limbs(mpn_limb_t q[], mpn_limb_t r[2], const mpn_limb_t n[], mpn_size_t nn,
270 |                                 const mpn_limb_t d[2]);
271 | 
272 | /**
273 |  * @brief: multiplicative inversion
274 |  *
275 |  * @params:
276 |  *   a/asize: source (value) BigNum A whose size is asize
277 |  *   m/msize: source (modulus) BigNum M whose size is msize
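278 |  *   optimizer: caller-supplied mpn_optimizer_t (NOTE(review): the signature has no
279 |  *              invbuf/abuf/mbuf parameters; presumably the buffers of inv, A and M
280 |  *              are taken from the optimizer - confirm against the implementation)
281 |  *   r  : result BigNum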
282 |  */
283 | mpn_size_t mpn_mod_invert(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *m, mpn_size_t msize,
284 |                           mpn_optimizer_t *optimizer);
285 | 
286 | /**
287 |  *  mpn: create mpn from hex string
288 |  */
289 | mpn_size_t mpn_from_string(mpn_limb_t *r, mpn_size_t size, const char *in, mpn_size_t inlen);
290 | 
291 | /**
292 |  *  mpn: convert mpn to hex string
293 |  */
294 | mpn_size_t mpn_to_string(char *out, mpn_size_t outsize, const mpn_limb_t *a, mpn_size_t size);
295 | 
296 | /**
297 |  *  mpn: create mpn from big-endian octets
298 |  */
299 | mpn_size_t mpn_from_octets(mpn_limb_t *r, mpn_size_t size, const unsigned char *in, mpn_size_t inlen);
300 | 
301 | /**
302 |  *  mpn: convert mpn to big-endian octets
303 |  */
304 | mpn_size_t mpn_to_octets(unsigned char *out, mpn_size_t outsize, const mpn_limb_t *a, mpn_size_t size);
305 | 
306 | /**
307 |  * leading zeros counting(constant-time version)
308 |  */
309 | mpn_size_t mpn_limb_nlz_consttime(mpn_limb_t x);
310 | 
311 | /**
312 |  * trailing zeros counting(constant-time version)
313 |  */
314 | mpn_size_t mpn_limb_ntz_consttime(mpn_limb_t x);
315 | 
316 | /**
317 |  * greatest common divisor(mpn_limb_t)
318 |  */
319 | mpn_limb_t mpn_limb_gcd(mpn_limb_t a, mpn_limb_t b);
320 | 
321 | /**
322 |  * mpn: generate in range
323 |  *
324 |  * @note:
325 |  *   1. length of |r| >= hilen
326 |  */
327 | int mpn_random_range(mpn_limb_t *r, mpn_size_t maxtries, const mpn_limb_t *lo, mpn_size_t lolen, const mpn_limb_t *hi,
328 |                      mpn_size_t hilen, int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state);
329 | 
330 | /**
331 |  * test if |a| and |b| are coprime
332 |  */
333 | int mpn_is_coprime(mpn_limb_t *a, mpn_size_t asize, mpn_limb_t *b, mpn_size_t bsize, mpn_optimizer_t *optimizer);
334 | 
335 | #if defined(__cplusplus)
336 | }
337 | #endif
338 | 
339 | #endif
340 | 


--------------------------------------------------------------------------------
/mpn/mpn-montgomery.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 2021 Ethan.cr.yp.to
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | #ifndef MULTIPLE_PRECISION_MONTGOMERY_H
 17 | #define MULTIPLE_PRECISION_MONTGOMERY_H
 18 | 
 19 | #include <mpn/mpn-conf.h>
 20 | 
 21 | #if defined(__cplusplus)
 22 | extern "C" {
 23 | #endif
 24 | 
 25 | typedef struct {
 26 |     mpn_size_t modbits;  /**< size of modulus in bits */
 27 |     mpn_size_t modsize;  /**< size of modulus in limbs (mpn_limb_t units) */
 28 |     mpn_limb_t k0;       /**< low word of (1/modulus) mod R */
 29 |     mpn_limb_t *modulus; /**< modulus (limb array) */
 30 |     mpn_limb_t *montR;   /**< mont_enc(1), i.e. the value 1 in Montgomery form */
 31 |     mpn_limb_t *montRR;  /**< mont_enc(1) ^ 2 */
 32 | 
 33 |     mpn_optimizer_t *optimizer; /**< optimizer for montgomery operation */
 34 | } mpn_montgomery_t;
 35 | 
 36 | /**
 37 |  * mpn montgomery: create montgomery context
 38 |  *
 39 |  */
 40 | mpn_montgomery_t *mpn_montgomery_create(mpn_size_t mbits, mpn_size_t psize);
 41 | 
 42 | /**
 43 |  * mpn montgomery: destroy montgomery context
 44 |  *
 45 |  */
 46 | void mpn_montgomery_destory(mpn_montgomery_t *mont);
 47 | 
 48 | /**
 49 |  * mpn montgomery: initialize montgomery context with modulus
 50 |  *
 51 |  */
 52 | int mpn_montgomery_set_modulus_bin(mpn_montgomery_t *mont, const mpn_limb_t *modulus, mpn_size_t mbits);
 53 | 
 54 | /**
 55 |  * mpn montgomery: montgomery reduction
 56 |  *
 57 |  * @note:
 58 |  *   1. m0: low word of (1 / modulus) mod b
 59 |  *   2. r = T/R mod m
 60 |  */
 61 | void mpn_montgomery_reduce_bin(mpn_limb_t *r, mpn_limb_t *product, const mpn_limb_t *m, mpn_size_t mnum, mpn_limb_t m0);
 62 | 
 63 | /**
 64 |  * mpn montgomery: r[] = to_mont(a[])
 65 |  *
 66 |  * @requirements:
 67 |  *   1. length of r: modsize
 68 |  *   2. length of a: modsize
 69 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t)
 70 |  */
 71 | void mpn_montgomery_encode(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
 72 | 
 73 | /**
 74 |  * mpn montgomery: r[] = from_mont(a)
 75 |  *
 76 |  * @requirements:
 77 |  *   1. length of r: modsize
 78 |  *   2. length of a: modsize
 79 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t)
 80 |  */
 81 | void mpn_montgomery_decode(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
 82 | 
 83 | /**
 84 |  * mpn montgomery: r = (a + b) mod m
 85 |  *
 86 |  * @requirements:
 87 |  *   1. length of r: modsize
 88 |  *   2. length of a: modsize
 89 |  *   3. length of b: modsize
 90 |  *   4. memory size from the pool: modsize * sizeof(mpn_limb_t)
 91 |  */
 92 | void mpn_montgomery_add(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont);
 93 | 
 94 | /**
 95 |  * mpn montgomery: r = (a - b) mod m
 96 |  *
 97 |  * @requirements:
 98 |  *   1. length of r: modsize
 99 |  *   2. length of a: modsize
100 |  *   3. length of b: modsize
101 |  *   4. memory size from the pool: modsize * sizeof(mpn_limb_t)
102 |  */
103 | void mpn_montgomery_sub(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont);
104 | 
105 | /**
106 |  * mpn montgomery: r = -a mod m = (m - a) mod m
107 |  *
108 |  * @requirements:
109 |  *   1. length of r: modsize
110 |  *   2. length of a: modsize
111 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t)
112 |  */
113 | void mpn_montgomery_negative(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
114 | 
115 | /**
116 |  * mpn montgomery: r = (a / 2) mod m
117 |  *
118 |  * @requirements:
119 |  *   1. length of r: modsize
120 |  *   2. length of a: modsize
121 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t)
122 |  */
123 | void mpn_montgomery_halve(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
124 | 
125 | /**
126 |  * mpn montgomery: r = (a * 2) mod m
127 |  *
128 |  * @requirements:
129 |  *   1. length of r: modsize
130 |  *   2. length of a: modsize
131 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t)
132 |  */
133 | void mpn_montgomery_double(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
134 | 
135 | /**
136 |  * mpn montgomery: r = (a * 3) mod m
137 |  *
138 |  * @requirements:
139 |  *   1. length of r: modsize
140 |  *   2. length of a: modsize
141 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t)
142 |  */
143 | void mpn_montgomery_triple(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
144 | 
145 | /**
146 |  * mpn montgomery: r = prod mod m
147 |  *
148 |  * @requirements:
149 |  *   1. length of r: modsize
150 |  *   2. length of prod: modsize
151 |  *   3. memory size from the pool: N/A
152 |  */
153 | void mpn_montgomery_reduce(mpn_limb_t *r, mpn_limb_t *prod, mpn_montgomery_t *mont);
154 | 
155 | /**
156 |  * mpn montgomery: r = (a * b) mod m
157 |  *
158 |  * @requirements:
159 |  *   1. length of r: modsize
160 |  *   2. length of a: modsize
161 |  *   3. length of b: modsize
162 |  *   4. memory size from the pool: modsize * sizeof(mpn_limb_t) * 2
163 |  */
164 | void mpn_montgomery_mul(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont);
165 | 
166 | /**
167 |  * mpn montgomery: r = (a ^ 2) mod m
168 |  *
169 |  * @requirements:
170 |  *   1. length of r: modsize
171 |  *   2. length of a: modsize
172 |  *   3. memory size from the pool: modsize * sizeof(mpn_limb_t) * 2
173 |  */
174 | void mpn_montgomery_square(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
175 | 
176 | /**
177 |  * montgomery factor k0 = -((modulus^-1 mod B) %B)
178 |  */
179 | mpn_limb_t mpn_montgomery_factor(mpn_limb_t m0);
180 | 
181 | /**
182 |  * mpn montgomery: binary exponentiation
183 |  *
184 |  */
185 | mpn_size_t mpn_montgomery_exp(mpn_limb_t *y, const mpn_limb_t *x, mpn_size_t xsize, const mpn_limb_t *e,
186 |                               mpn_size_t ebits, mpn_montgomery_t *mont);
187 | 
188 | /**
189 |  * mpn montgomery: binary exponentiation(consttime)
190 |  *
191 |  */
192 | mpn_size_t mpn_montgomery_exp_consttime(mpn_limb_t *y, const mpn_limb_t *x, mpn_size_t xsize, const mpn_limb_t *e,
193 |                                         mpn_size_t ebits, mpn_montgomery_t *mont);
194 | 
195 | #if defined(__cplusplus)
196 | }
197 | #endif
198 | 
199 | #endif
200 | 


--------------------------------------------------------------------------------
/mpn/mpn-optimizer.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 2021 Ethan.cr.yp.to
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | #include "mpn-binary.h"
 17 | 
 18 | /**
 19 |  * mpn optimizer: create optimizer for mpn operation
 20 |  *
 21 |  * @note:
 22 |  *   1. room: room size of optimizer chunk, in unit of 'mpn_limb_t'
 23 |  */
 24 | mpn_optimizer_t *mpn_optimizer_create(mpn_size_t room)
 25 | {
 26 |     if (room == 0) {
 27 |         /* it's meaningless to create 0-length optimizer */
 28 |         return NULL;
 29 |     }
 30 |     size_t size = sizeof(mpn_optimizer_t) + MPN_LIMB_BYTES + room * sizeof(mpn_limb_t);
 31 |     mpn_optimizer_t *optimizer = (mpn_optimizer_t *)MPI_ALLOCATE(size);
 32 |     if (optimizer != NULL) {
 33 |         optimizer->size = 0;
 34 |         optimizer->next = NULL;
 35 |         optimizer->room = room;
 36 |         optimizer->chunk = mpi_aligned_pointer((unsigned char *)optimizer + sizeof(mpn_optimizer_t), MPN_LIMB_BYTES);
 37 |     }
 38 | 
 39 |     return optimizer;
 40 | }
 41 | 
 42 | /**
 43 |  * mpn optimizer: reset optimizer, mark all as unused
 44 |  */
 45 | void mpn_optimizer_reset(mpn_optimizer_t *optimizer)
 46 | {
 47 |     mpn_optimizer_t *curr = optimizer;
 48 |     while (curr != NULL) {
 49 |         curr->size = 0;
 50 |         curr = curr->next;
 51 |     }
 52 | }
 53 | 
 54 | /**
 55 |  * mpn optimizer: destory optimizer
 56 |  */
 57 | void mpn_optimizer_destory(mpn_optimizer_t *optimizer)
 58 | {
 59 |     mpn_optimizer_t *curr = optimizer, *next;
 60 |     while (curr != NULL) {
 61 |         next = curr->next;
 62 |         MPI_DEALLOCATE(curr); /* cleanse and free mpn_optimizer_t node */
 63 |         curr = next;
 64 |     }
 65 | }
 66 | 
 67 | /**
 68 |  * mpn optimizer: get memory chunk for mpn operation
 69 |  *
 70 |  * @note:
 71 |  *   1. size: size of chunk, in unit of 'mpn_limb_t'
 72 |  */
 73 | mpn_limb_t *mpn_optimizer_get_limbs(mpn_optimizer_t *optimizer, mpn_size_t size)
 74 | {
 75 |     if (optimizer == NULL) {
 76 |         MPI_RAISE_ERROR(-EINVAL);
 77 |         return NULL;
 78 |     }
 79 |     if (size == 0) { return NULL; }
 80 | 
 81 |     mpn_size_t total = 0;
 82 |     mpn_optimizer_t *curr = optimizer, *prev = NULL;
 83 |     while (curr != NULL) {
 84 |         total += curr->size;
 85 |         prev = curr;
 86 |         curr = curr->next;
 87 |     }
 88 | 
 89 |     if (prev->room - prev->size >= size) {
 90 |         curr = prev;
 91 |     } else {
 92 |         mpn_size_t room = size + total / 2; // XXX: optimize growth rule
 93 |         prev->next = curr = mpn_optimizer_create(room);
 94 |     }
 95 | 
 96 |     if (curr != NULL) {
 97 |         mpn_limb_t *p = &curr->chunk[curr->size];
 98 |         curr->size += size;
 99 | 
100 |         return p;
101 |     } else {
102 |         MPI_RAISE_ERROR(-ENOMEM);
103 | 
104 |         return NULL;
105 |     }
106 | }
107 | 
108 | /**
109 |  * mpn optimizer: put back memory chunk
110 |  */
111 | void mpn_optimizer_put_limbs(mpn_optimizer_t *optimizer, mpn_size_t size)
112 | {
113 |     if (optimizer == NULL) { return; }
114 | 
115 |     mpn_optimizer_t *curr = optimizer, *prev = NULL;
116 |     while (curr != NULL) {
117 |         prev = curr;
118 |         curr = curr->next;
119 |     }
120 | 
121 |     if (prev->size >= size) { prev->size -= size; }
122 | }
123 | 


--------------------------------------------------------------------------------
/mpn/mpn-optimizer.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 2021 Ethan.cr.yp.to
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      https://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #ifndef MULTIPLE_PRECISION_OPTIMIZER_H
17 | #define MULTIPLE_PRECISION_OPTIMIZER_H
18 | 
19 | #include <mpn/mpn-conf.h>
20 | 
21 | #if defined(__cplusplus)
22 | extern "C" {
23 | #endif
24 | 
/**
 * One node of the optimizer: a chunk of limbs plus a link to the next node.
 * Nodes form a singly linked list; requests are served from the last node.
 */
typedef struct mpn_optimizer_t {
    mpn_size_t size;              /**< limbs handed out from chunk so far (offset of the next free limb) */
    mpn_size_t room;              /**< capacity of chunk, in unit of 'mpn_limb_t' */
    mpn_limb_t *chunk;            /**< start of the limb storage of this node */
    struct mpn_optimizer_t *next; /**< next optimizer node, NULL for the last node */
} mpn_optimizer_t;
31 | 
32 | /**
33 |  * mpn optimizer: create optimizer for mpn operation
34 |  *
35 |  * @note:
36 |  *   1. room: room size of optimizer chunk, in unit of 'mpn_limb_t'
37 |  */
38 | mpn_optimizer_t *mpn_optimizer_create(mpn_size_t room);
39 | 
40 | /**
41 |  * mpn optimizer: destroy optimizer
42 |  */
43 | void mpn_optimizer_destory(mpn_optimizer_t *opt);
44 | 
45 | /**
46 |  * mpn optimizer: get memory chunk for mpn operation
47 |  *
48 |  * @note:
49 |  *   1. size: size of chunk, in unit of 'mpn_limb_t'
50 |  */
51 | mpn_limb_t *mpn_optimizer_get_limbs(mpn_optimizer_t *opt, mpn_size_t size);
52 | 
53 | /**
54 |  * mpn optimizer: put back memory chunk
55 |  */
56 | void mpn_optimizer_put_limbs(mpn_optimizer_t *optimizer, mpn_size_t size);
57 | 
58 | /**
59 |  * mpn optimizer: reset optimizer, mark all as unused
60 |  */
61 | void mpn_optimizer_reset(mpn_optimizer_t *opt);
62 | 
63 | #if defined(__cplusplus)
64 | }
65 | #endif
66 | 
67 | #endif
68 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
# unit-test: executable linked against the mpi library, OpenSSL's crypto and
# gtest, and registered with CTest under the same name
ADD_EXECUTABLE(unittest-mpi unittest-mpi.cpp)
TARGET_LINK_LIBRARIES(unittest-mpi mpi crypto dl gtest pthread)
ADD_TEST(NAME unittest-mpi COMMAND unittest-mpi
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
# project helper; presumably defined in cmake/ConfigureTarget.cmake
ConfigureTarget(unittest-mpi)

# with BUILD_VENDOR the `openssl` target has to be built before the test binary
IF (BUILD_VENDOR)
  ADD_DEPENDENCIES(unittest-mpi openssl)
ENDIF ()

# benchmark: not registered as a test, built with GNU C++17
ADD_EXECUTABLE(benchmark benchmark.cpp)
TARGET_LINK_LIBRARIES(benchmark mpi crypto pthread dl)
TARGET_COMPILE_OPTIONS(benchmark PRIVATE -std=gnu++17)
ConfigureTarget(benchmark)

# same ordering constraint as for the unit test
IF (BUILD_VENDOR)
  ADD_DEPENDENCIES(benchmark openssl)
ENDIF ()

# both executables are installed into <prefix>/bin
INSTALL(TARGETS benchmark unittest-mpi RUNTIME DESTINATION bin)
24 | 


--------------------------------------------------------------------------------
/tests/test.cc:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 2022 Kiran Nowak(kiran.nowak@gmail.com)
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *      https://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | #include <stdio.h>
 18 | #include <stdint.h>
 19 | #include <assert.h>
 20 | #include <iostream>
 21 | 
 22 | #include "logger.h"
 23 | #include "tabulate.h"
 24 | #include "benchmark.h"
 25 | 
 26 | template <typename T>
 27 | T reverse(T n);
 28 | 
 29 | unsigned char reverse(unsigned char n)
 30 | {
 31 | #ifdef USE_SMALL_LOOKUP_TABLE
 32 |     // clang-format off
 33 |     static const unsigned char lookup[16] = {
 34 |         0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
 35 |         0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
 36 |     };
 37 |     // clang-format on
 38 | 
 39 |     // Detailed breakdown of the math
 40 |     //  + lookup reverse of bottom nibble
 41 |     //  |       + grab bottom nibble
 42 |     //  |       |        + move bottom result into top nibble
 43 |     //  |       |        |     + combine the bottom and top results
 44 |     //  |       |        |     | + lookup reverse of top nibble
 45 |     //  |       |        |     | |       + grab top nibble
 46 |     //  V       V        V     V V       V
 47 |     // (lookup[n&0b1111] << 4) | lookup[n>>4]
 48 | 
 49 |     // Reverse the top and bottom nibble then swap them.
 50 |     return (lookup[n & 0b1111] << 4) | lookup[n >> 4];
 51 | #else
 52 |     // clang-format off
 53 |     static const unsigned char reversed[] = {
 54 |         0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
 55 |         0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
 56 |         0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
 57 |         0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
 58 |         0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
 59 |         0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
 60 |         0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
 61 |         0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
 62 |         0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
 63 |         0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
 64 |         0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
 65 |         0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
 66 |         0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
 67 |         0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
 68 |         0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
 69 |         0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
 70 |         0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
 71 |         0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
 72 |         0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
 73 |         0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
 74 |         0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
 75 |         0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
 76 |         0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
 77 |         0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
 78 |         0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
 79 |         0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
 80 |         0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
 81 |         0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
 82 |         0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
 83 |         0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
 84 |         0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
 85 |         0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
 86 |     };
 87 |     // clang-format on
 88 | 
 89 |     return reversed[n];
 90 | #endif
 91 | }
 92 | 
 93 | unsigned int reverse(unsigned int n)
 94 | {
 95 |     unsigned int m = n;
 96 |     unsigned char *p = (unsigned char *)(&m);
 97 | 
 98 |     if (sizeof(unsigned int) == 4) {
 99 |         return (reverse(p[0]) << 24) | (reverse(p[1]) << 16) | (reverse(p[2]) << 8) | reverse(p[3]);
100 |     } else {
101 |         for (unsigned i = 0; i < sizeof(unsigned int) / 2; i++) {
102 |             unsigned char h = p[i], l = p[sizeof(unsigned int) - 1 - i];
103 |             p[i] = reverse(l);
104 |             p[sizeof(unsigned int) - 1 - i] = reverse(h);
105 |         }
106 | 
107 |         return m;
108 |     }
109 | }
110 | 
/**
 * Reference bit reversal of an unsigned int, one bit per iteration.
 *
 * The loop runs until the remaining input is 0, so it handles only the bits
 * up to the highest set one; the final shift moves the collected bits up by
 * the number of positions that were not visited.
 *
 * @param num value to reverse
 * @return num with all of its bits in reverse order
 */
unsigned int reverse_ref(unsigned int num)
{
    // number of left shifts still owed once the loop is done
    unsigned int pending = sizeof(num) * 8 - 1;
    // starts out as num so that bit 0 is already in place; every higher bit
    // of this seed is pushed out by the shifts that follow
    unsigned int out = num;

    for (num >>= 1; num != 0; num >>= 1) {
        out = (out << 1) | (num & 1);
        --pending;
    }

    return out << pending;
}
127 | 
128 | 
/**
 * Count the leading zero bits of a 32-bit value.
 *
 * The highest set bit is smeared into every lower position, the value is
 * incremented to the next power of two, and a multiplication by a de Bruijn
 * constant turns that power of two into an index into a 32-entry table.
 *
 * @param x value to inspect
 * @return number of zero bits above the highest set bit; 32 when x is 0
 */
static int clz(unsigned int x)
{
    static_assert(sizeof(unsigned int) == 4, "unsigned int must be 32 bits");

    // The table lookup cannot tell 0 apart from a value with bit 31 set: both
    // end up on table entry 0. Answer the zero case up front.
    if (x == 0) { return 32; }

    // clang-format off
    static const char debruijn32[32] = {
         0, 31,  9, 30,  3,  8, 13, 29,
         2,  5,  7, 21, 12, 24, 28, 19,
         1, 10,  4, 14,  6, 22, 25, 20,
        11, 15, 23, 26, 16, 27, 17, 18,
    };
    // clang-format on
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x++; // wraps to 0 when bit 31 was set, which selects entry 0 (no leading zeros)

    return debruijn32[x * 0x076be629 >> 27];
}
150 | 
151 | int merge(unsigned int &merged, unsigned int hi, unsigned int lo)
152 | {
153 |     auto hbits = clz(hi);
154 |     auto lbits = clz(lo);
155 |     merged = lo | reverse(hi);
156 | 
157 |     return static_cast<int>(lbits + hbits) - static_cast<int>(sizeof(unsigned int) * 8);
158 | }
159 | 
/**
 * Sample record with three text fields; logging::to_string<data> in this file
 * renders them as the "Company", "Contact" and "Country" columns.
 */
struct data {
    std::string ma; ///< first column
    std::string mb; ///< second column
    std::string mc; ///< third column

    /// Copies the three strings into ma, mb and mc, in that order.
    data(const std::string &a, const std::string &b, const std::string &c)
        : ma{a}
        , mb{b}
        , mc{c}
    {
    }
};
166 | 
167 | namespace logging
168 | {
169 | template <>
170 | inline std::string to_string<data>(const std::vector<data> &v)
171 | {
172 |     using namespace tabulate;
173 |     Table table("Company", "Contact", "Country");
174 |     table[0].format().align(Align::center);
175 |     for (auto const &item : v) { table.add(item.ma, item.mb, item.mc); }
176 | 
177 |     // Iterate over rows in the table
178 |     size_t index = 0;
179 |     for (auto &row : table) {
180 |         row.format().styles(Style::bold);
181 | 
182 |         // Set blue background color for alternate rows
183 |         if (index > 0 && index % 2 == 0) {
184 |             for (auto &cell : row) { cell.format().background_color(Color::blue); }
185 |         }
186 |         index += 1;
187 |     }
188 | 
189 |     return table.xterm();
190 | }
191 | } // namespace logging
192 | 
/**
 * Demo driver with three independent sections:
 *   1. log a mix of value types through llogi()
 *   2. run merge() over a few hi/lo pairs and print each outcome
 *   3. benchmark reverse() against reverse_ref() and print the ratio
 */
int main()
{
    // 1. logging demo: scalars, a string, vectors (including the `data`
    //    records rendered by logging::to_string<data>) and plain enum values
    {
        int a = 1;
        float b = 2.0;
        std::string c = "three";
        bool d = true;
        std::vector<int> e{1, 3, 5, 7, 9};
        std::vector<data> f{
            data("Alfreds Futterkiste", "Maria Anders", "Germany"),
            data("Centro comercial Moctezuma", "Francisco Chang", "Mexico"),
            data("Ernst Handel", "Roland Mendel", "Austria"),
            data("Island Trading", "Helen Bennett", "UK"),
            data("Laughing Bacchus Winecellars", "Yoshi Tannamuri", "Canada"),
            data("Magazzini Alimentari Riuniti", "Giovanni Rovelli", "Italy"),
        };

        // g holds a single flag, h a combination of two flags
        enum flags {
            FLAG1 = 0x1,
            FLAG2 = 0x2,
            FLAG3 = 0x4,
        } g = FLAG2,
          h = static_cast<flags>(FLAG1 | FLAG3);

        llogi(a, b, c, d, e, f, f[0].mc, g, h);
    }

    // 2. merge demo
    {
        // NOTE(review): designated initializers are C++20 (accepted earlier
        // only as a compiler extension)
        struct {
            unsigned int hi, lo;
        } datas[] = {
            {.hi = 0x01, .lo = 0x1000},
            {.hi = 0x09, .lo = 0x1000},
            {.hi = 0x10, .lo = 0x1000},
            {.hi = 0xF1, .lo = 0x1000},
        };

        std::cout << std::endl;
        for (auto const &data : datas) {
            int rbits;
            unsigned int merged;

            printf("merge(0x%02X, 0x%04X): ", data.hi, data.lo);
            // a negative return value from merge() is reported as failure
            if ((rbits = merge(merged, data.hi, data.lo)) >= 0) {
                printf("0x%08X, remain-bits = %2d\n", merged, rbits);
            } else {
                printf("failed.\n");
            }
        }
    }

    // 3. benchmark
    {
        // presumably BENCHER(prefix, expr, ...) from benchmark.h runs expr with
        // a loop variable __j and defines <prefix>avg / <prefix>stddev, the
        // names printed below - TODO confirm against benchmark.h
        BENCHER(reverse_, DoNotOptimize(reverse(__j)), 20, 20000000);
        BENCHER(reverse_ref_, DoNotOptimize(reverse_ref(__j)), 20, 20000000);

        std::cout << std::endl;
        std::cout << "reverse: avg = " << reverse_avg << ", stddev = " << reverse_stddev << std::endl;
        std::cout << "reverse(ref): avg = " << reverse_ref_avg << ", stddev = " << reverse_ref_stddev << std::endl;
        // ratio of the reference average to the table-based average
        std::cout << "perf-diff: " << reverse_ref_avg / reverse_avg << std::endl;
    }
}
254 | 


--------------------------------------------------------------------------------