├── .clang-format
├── .cmake-format.yaml
├── .github
└── workflows
│ ├── benchmark.yml
│ ├── ci.yml
│ ├── coverage.yml
│ └── multiarch.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
├── Config.cmake.in
└── ConfigureTarget.cmake
├── docs
└── README.template.md
├── mpi
├── CMakeLists.txt
├── mpi-prime.c
├── mpi-rsa.c
├── mpi-rsa.h
├── mpi.c
└── mpi.h
├── mpn
├── CMakeLists.txt
├── asm
│ ├── asmdefs.inc
│ ├── ia_32e.inc
│ ├── ia_common.inc
│ ├── ia_emm.inc
│ ├── intel64
│ │ ├── bn_uaddadd_m7as.asm
│ │ ├── bn_uaddsub_m7as.asm
│ │ ├── bn_um7.inc
│ │ ├── bn_umul.inc
│ │ ├── bn_umul_basic.inc
│ │ ├── bn_umul_fix.inc
│ │ ├── bn_umulpp.inc
│ │ ├── bn_umulpp_basic.inc
│ │ ├── bn_umulpp_fix.inc
│ │ ├── bn_umulschool.inc
│ │ ├── bn_usqr.inc
│ │ ├── bn_usqr_basic.inc
│ │ ├── bn_usqrpp.inc
│ │ ├── bn_usqrpp_basic.inc
│ │ ├── bn_usqrschool.inc
│ │ ├── clear_regs.inc
│ │ ├── cpinitas.asm
│ │ ├── emulator.inc
│ │ ├── ia_32e_regs.inc
│ │ ├── memcpy.inc
│ │ ├── mont_mul1024_avx2as.asm
│ │ ├── mont_mul_avx2as.asm
│ │ ├── mont_sqr1024_avx2as.asm
│ │ ├── mont_sqr_avx2as.asm
│ │ ├── mpi_mont_reduction_m7as.asm
│ │ ├── mpi_uadd_m7as.asm
│ │ ├── mpi_udiv_u32_m7as.asm
│ │ ├── mpi_uinc_udec_m7as.asm
│ │ ├── mpi_umul_acc_m7as.asm
│ │ ├── mpi_umul_m7as.asm
│ │ ├── mpi_umul_usqr_redc_srvl9.asm
│ │ ├── mpi_umul_usqr_redc_srvl9pp.asm
│ │ ├── mpi_usqr_m7as.asm
│ │ ├── mpi_usub_m7as.asm
│ │ ├── mred.inc
│ │ ├── mred_basic.inc
│ │ ├── mred_pp.inc
│ │ ├── mred_pp_basic.inc
│ │ ├── mulx.inc
│ │ ├── os.inc
│ │ ├── reg_sizes.inc
│ │ ├── variant.inc
│ │ └── variant_txt_acm.inc
│ ├── montgomery-avx2.c
│ ├── montgomery-avx512.c
│ └── utils.inc
├── mpn-asm.c
├── mpn-asm.h
├── mpn-binary.c
├── mpn-binary.h
├── mpn-conf.h
├── mpn-montgomery.c
├── mpn-montgomery.h
├── mpn-optimizer.c
└── mpn-optimizer.h
└── tests
├── CMakeLists.txt
├── benchmark.cpp
├── ini.h
├── logger.h
├── mpi-compiler.h
├── mpn-division.c
├── nameof.h
├── profiler.h
├── tabulate.h
├── test.cc
└── unittest-mpi.cpp
/.clang-format:
--------------------------------------------------------------------------------
1 | # configured with https://zed0.co.uk/clang-format-configurator
2 |
3 | ---
4 | Language: Cpp
5 | AccessModifierOffset: '-2'
6 | AlignAfterOpenBracket: Align
7 | AlignConsecutiveMacros: 'true'
8 | AlignConsecutiveAssignments: 'false'
9 | AlignConsecutiveDeclarations: 'false'
10 | AlignEscapedNewlines: Left
11 | AlignOperands: 'true'
12 | AlignTrailingComments: 'true'
13 | AllowAllArgumentsOnNextLine: 'true'
14 | AllowAllConstructorInitializersOnNextLine: 'true'
15 | AllowAllParametersOfDeclarationOnNextLine: 'true'
16 | AllowShortBlocksOnASingleLine: 'false'
17 | AllowShortCaseLabelsOnASingleLine: 'false'
18 | AllowShortFunctionsOnASingleLine: Empty
19 | AllowShortIfStatementsOnASingleLine: WithoutElse
20 | AllowShortLambdasOnASingleLine: None
21 | AllowShortLoopsOnASingleLine: 'true'
22 | AlwaysBreakAfterDefinitionReturnType: None
23 | AlwaysBreakAfterReturnType: None
24 | AlwaysBreakBeforeMultilineStrings: 'false'
25 | AlwaysBreakTemplateDeclarations: 'Yes'
26 | BinPackArguments: 'true'
27 | BinPackParameters: 'true'
28 | BraceWrapping:
29 | AfterCaseLabel: 'false'
30 | AfterClass: 'false'
31 | AfterControlStatement: 'false'
32 | AfterEnum: 'false'
33 | AfterFunction: 'true'
34 | AfterNamespace: 'true'
35 | AfterObjCDeclaration: 'false'
36 | AfterStruct: 'false'
37 | AfterUnion: 'false'
38 | AfterExternBlock: 'false'
39 | BeforeCatch: 'false'
40 | BeforeElse: 'false'
41 | IndentBraces: 'false'
42 | SplitEmptyFunction: 'true'
43 | SplitEmptyRecord: 'true'
44 | SplitEmptyNamespace: 'true'
45 | BreakBeforeBinaryOperators: NonAssignment
46 | BreakBeforeBraces: Custom
47 | BreakBeforeTernaryOperators: 'true'
48 | BreakConstructorInitializers: BeforeColon
49 | BreakInheritanceList: BeforeColon
50 | BreakStringLiterals: 'true'
51 | ColumnLimit: '120'
52 | CompactNamespaces: 'false'
53 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
54 | ConstructorInitializerIndentWidth: '4'
55 | ContinuationIndentWidth: '4'
56 | Cpp11BracedListStyle: 'true'
57 | DerivePointerAlignment: 'false'
58 | DisableFormat: 'false'
59 | ExperimentalAutoDetectBinPacking: 'false'
60 | FixNamespaceComments: 'true'
61 | ForEachMacros: ['foreach', 'FOREACH', 'RANGES_FOR', 'hlist_for_each_entry_continue', 'hlist_for_each_entry', 'hlist_for_each_entry_from', 'hlist_for_each_entry_safe', 'hlist_for_each_safe', 'list_for_each_entry', 'list_for_each_entry_continue', 'list_for_each_entry_continue_reverse', 'list_for_each_entry_from', 'list_for_each_entry_reverse', 'list_for_each_entry_safe', 'list_for_each_entry_safe_continue', 'list_for_each_entry_safe_from', 'list_for_each_entry_safe_reverse', 'list_for_each_from', 'list_for_each_prev', 'list_for_each_prev_safe', 'list_for_each_safe']
62 | TypenameMacros: ['STACK_OF', 'LIST']
63 | IncludeBlocks: Regroup
64 | IncludeIsMainRegex: '([-_](test|unittest))?$'
65 | IndentCaseLabels: 'true'
66 | IndentPPDirectives: None
67 | IndentWidth: '4'
68 | IndentWrappedFunctionNames: 'false'
69 | KeepEmptyLinesAtTheStartOfBlocks: 'false'
70 | MaxEmptyLinesToKeep: '3'
71 | NamespaceIndentation: None
72 | PenaltyBreakAssignment: '2'
73 | PenaltyBreakBeforeFirstCallParameter: '1'
74 | PenaltyBreakComment: '300'
75 | PenaltyBreakFirstLessLess: '120'
76 | PenaltyBreakString: '1000'
77 | PenaltyBreakTemplateDeclaration: '10'
78 | PenaltyExcessCharacter: '1000000'
79 | PenaltyReturnTypeOnItsOwnLine: '500'
80 | PointerAlignment: Right
81 | RawStringFormats:
82 | - Language: Cpp
83 | Delimiters:
84 | - 'cc'
85 | - 'CC'
86 | - 'cpp'
87 | - 'Cpp'
88 | - 'CPP'
89 | - 'c++'
90 | - 'C++'
91 | CanonicalDelimiter: ''
92 | BasedOnStyle: google
93 | - Language: TextProto
94 | Delimiters:
95 | - 'pb'
96 | - 'PB'
97 | - 'proto'
98 | - 'PROTO'
99 | EnclosingFunctions:
100 | - EqualsProto
101 | - EquivToProto
102 | - PARSE_PARTIAL_TEXT_PROTO
103 | - PARSE_TEST_PROTO
104 | - PARSE_TEXT_PROTO
105 | - ParseTextOrDie
106 | - ParseTextProtoOrDie
107 | CanonicalDelimiter: ''
108 | BasedOnStyle: google
109 | ReflowComments: 'true'
110 | SortIncludes: 'false'
111 | SortUsingDeclarations: 'false'
112 | SpaceAfterCStyleCast: 'false'
113 | SpaceAfterLogicalNot: 'false'
114 | SpaceAfterTemplateKeyword: 'true'
115 | SpaceBeforeAssignmentOperators: 'true'
116 | SpaceBeforeCpp11BracedList: 'false'
117 | SpaceBeforeCtorInitializerColon: 'true'
118 | SpaceBeforeInheritanceColon: 'true'
119 | SpaceBeforeParens: ControlStatements
120 | SpaceBeforeRangeBasedForLoopColon: 'true'
121 | SpaceInEmptyParentheses: 'false'
122 | SpacesBeforeTrailingComments: '1'
123 | SpacesInAngles: 'false'
124 | SpacesInCStyleCastParentheses: 'false'
125 | SpacesInContainerLiterals: 'false'
126 | SpacesInParentheses: 'false'
127 | SpacesInSquareBrackets: 'false'
128 | Standard: Auto
129 | StatementMacros: ['__maybe_unused']
130 | TabWidth: '4'
131 | UseTab: Never
132 | ...
133 |
--------------------------------------------------------------------------------
/.cmake-format.yaml:
--------------------------------------------------------------------------------
1 | _help_parse: Options affecting listfile parsing
2 | parse:
3 | _help_additional_commands:
4 | - Specify structure for custom cmake functions
5 | additional_commands:
6 | APPEND_TO_LISTS:
7 | kwargs:
8 | LISTS: "*"
9 | VALUES: "*"
10 | target_sources:
11 | flags:
12 | - PUBLIC
13 | - PRIVATE
14 | _help_vartags:
15 | - Specify variable tags.
16 | vartags: []
17 | _help_proptags:
18 | - Specify property tags.
19 | proptags: []
20 | _help_format: Options affecting formatting.
21 | format:
22 | _help_line_width:
23 | - How wide to allow formatted cmake files
24 | line_width: 80
25 | _help_tab_size:
26 | - How many spaces to tab for indent
27 | tab_size: 2
28 | _help_max_subgroups_hwrap:
29 | - If an argument group contains more than this many sub-groups
30 | - (parg or kwarg groups) then force it to a vertical layout.
31 | max_subgroups_hwrap: 6
32 | _help_max_pargs_hwrap:
33 | - If a positional argument group contains more than this many
34 | - arguments, then force it to a vertical layout.
35 | max_pargs_hwrap: 8
36 | _help_max_rows_cmdline:
37 | - If a cmdline positional group consumes more than this many
38 | - lines without nesting, then invalidate the layout (and nest)
39 | max_rows_cmdline: 6
40 | _help_separate_ctrl_name_with_space:
41 | - If true, separate flow control names from their parentheses
42 | - with a space
43 | separate_ctrl_name_with_space: true
44 | _help_separate_fn_name_with_space:
45 | - If true, separate function names from parentheses with a
46 | - space
47 | separate_fn_name_with_space: false
48 | _help_dangle_parens:
49 | - If a statement is wrapped to more than one line, then dangle
50 | - the closing parenthesis on its own line.
51 | dangle_parens: true
52 | _help_dangle_align:
53 | - If the trailing parenthesis must be 'dangled' on its own
54 | - "line, then align it to this reference: `prefix`: the start"
55 | - "of the statement, `prefix-indent`: the start of the"
56 | - "statement, plus one indentation level, `child`: align to"
57 | - the column of the arguments
58 | dangle_align: prefix
59 | _help_min_prefix_chars:
60 | - If the statement spelling length (including space and
61 | - parenthesis) is smaller than this amount, then force reject
62 | - nested layouts.
63 | min_prefix_chars: 4
64 | _help_max_prefix_chars:
65 | - If the statement spelling length (including space and
66 | - parenthesis) is larger than the tab width by more than this
67 | - amount, then force reject un-nested layouts.
68 | max_prefix_chars: 10
69 | _help_max_lines_hwrap:
70 | - If a candidate layout is wrapped horizontally but it exceeds
71 | - this many lines, then reject the layout.
72 | max_lines_hwrap: 10
73 | _help_line_ending:
74 | - What style line endings to use in the output.
75 | line_ending: unix
76 | _help_command_case:
77 | - Format command names consistently as 'lower' or 'upper' case
78 | command_case: upper
79 | _help_keyword_case:
80 | - Format keywords consistently as 'lower' or 'upper' case
81 | keyword_case: upper
82 | _help_always_wrap:
83 | - A list of command names which should always be wrapped
84 | always_wrap: []
85 | _help_enable_sort:
86 | - If true, the argument lists which are known to be sortable
87 | - will be sorted lexicographically
88 | enable_sort: true
89 | _help_autosort:
90 | - If true, the parsers may infer whether or not an argument
91 | - list is sortable (without annotation).
92 | autosort: false
93 | _help_require_valid_layout:
94 | - By default, if cmake-format cannot successfully fit
95 | - everything into the desired linewidth it will apply the
96 | - last, most aggressive attempt that it made. If this flag is
97 | - True, however, cmake-format will print error, exit with non-
98 | - zero status code, and write-out nothing
99 | require_valid_layout: false
100 | _help_layout_passes:
101 | - A dictionary mapping layout nodes to a list of wrap
102 | - decisions. See the documentation for more information.
103 | layout_passes: {}
104 | _help_markup: Options affecting comment reflow and formatting.
105 | markup:
106 | _help_bullet_char:
107 | - What character to use for bulleted lists
108 | bullet_char: "*"
109 | _help_enum_char:
110 | - What character to use as punctuation after numerals in an
111 | - enumerated list
112 | enum_char: .
113 | _help_first_comment_is_literal:
114 | - If comment markup is enabled, don't reflow the first comment
115 | - block in each listfile. Use this to preserve formatting of
116 | - your copyright/license statements.
117 | first_comment_is_literal: false
118 | _help_literal_comment_pattern:
119 | - If comment markup is enabled, don't reflow any comment block
120 | - which matches this (regex) pattern. Default is `None`
121 | - (disabled).
122 | literal_comment_pattern: null
123 | _help_fence_pattern:
124 | - Regular expression to match preformat fences in comments
125 | - default= ``r'^\s*([`~]{3}[`~]*)(.*)$'``
126 | fence_pattern: ^\s*([`~]{3}[`~]*)(.*)$
127 | _help_ruler_pattern:
128 | - Regular expression to match rulers in comments default=
129 | - '``r''^\s*[^\w\s]{3}.*[^\w\s]{3}$''``'
130 | ruler_pattern: ^\s*[^\w\s]{3}.*[^\w\s]{3}$
131 | _help_explicit_trailing_pattern:
132 | - If a comment line starts with this pattern then it
133 | - is explicitly a trailing comment for the preceding
134 | - argument. Default is '#<'
135 | explicit_trailing_pattern: "#<"
136 | _help_hashruler_min_length:
137 | - If a comment line starts with at least this many consecutive
138 | - hash characters, then don't lstrip() them off. This allows
139 | - for lazy hash rulers where the first hash char is not
140 | - separated by space
141 | hashruler_min_length: 10
142 | _help_canonicalize_hashrulers:
143 | - If true, then insert a space between the first hash char and
144 | - remaining hash chars in a hash ruler, and normalize its
145 | - length to fill the column
146 | canonicalize_hashrulers: true
147 | _help_enable_markup:
148 | - enable comment markup parsing and reflow
149 | enable_markup: false
150 | _help_lint: Options affecting the linter
151 | lint:
152 | _help_disabled_codes:
153 | - a list of lint codes to disable
154 | disabled_codes: []
155 | _help_function_pattern:
156 | - regular expression pattern describing valid function names
157 | function_pattern: "[0-9a-z_]+"
158 | _help_macro_pattern:
159 | - regular expression pattern describing valid macro names
160 | macro_pattern: "[0-9A-Z_]+"
161 | _help_global_var_pattern:
162 | - regular expression pattern describing valid names for
163 | - variables with global (cache) scope
164 | global_var_pattern: "[A-Z][0-9A-Z_]+"
165 | _help_internal_var_pattern:
166 | - regular expression pattern describing valid names for
167 | - variables with global scope (but internal semantic)
168 | internal_var_pattern: _[A-Z][0-9A-Z_]+
169 | _help_local_var_pattern:
170 | - regular expression pattern describing valid names for
171 | - variables with local scope
172 | local_var_pattern: "[a-z][a-z0-9_]+"
173 | _help_private_var_pattern:
174 | - regular expression pattern describing valid names for
175 | - private directory variables
176 | private_var_pattern: _[0-9a-z_]+
177 | _help_public_var_pattern:
178 | - regular expression pattern describing valid names for public
179 | - directory variables
180 | public_var_pattern: "[A-Z][0-9A-Z_]+"
181 | _help_argument_var_pattern:
182 | - regular expression pattern describing valid names for
183 | - function/macro arguments and loop variables.
184 | argument_var_pattern: "[a-z][a-z0-9_]+"
185 | _help_keyword_pattern:
186 | - regular expression pattern describing valid names for
187 | - keywords used in functions or macros
188 | keyword_pattern: "[A-Z][0-9A-Z_]+"
189 | _help_max_conditionals_custom_parser:
190 | - In the heuristic for C0201, how many conditionals to match
191 | - within a loop before considering the loop a parser.
192 | max_conditionals_custom_parser: 2
193 | _help_min_statement_spacing:
194 | - Require at least this many newlines between statements
195 | min_statement_spacing: 1
196 | _help_max_statement_spacing:
197 | - Require no more than this many newlines between statements
198 | max_statement_spacing: 2
199 | max_returns: 6
200 | max_branches: 12
201 | max_arguments: 5
202 | max_localvars: 15
203 | max_statements: 50
204 | _help_encode: Options affecting file encoding
205 | encode:
206 | _help_emit_byteorder_mark:
207 | - If true, emit the unicode byte-order mark (BOM) at the start
208 | - of the file
209 | emit_byteorder_mark: false
210 | _help_input_encoding:
211 | - Specify the encoding of the input file. Defaults to utf-8
212 | input_encoding: utf-8
213 | _help_output_encoding:
214 | - Specify the encoding of the output file. Defaults to utf-8.
215 | - Note that cmake only claims to support utf-8 so be careful
216 | - when using anything else
217 | output_encoding: utf-8
218 | _help_misc: Miscellaneous configurations options.
219 | misc:
220 | _help_per_command:
221 | - A dictionary containing any per-command configuration
222 | - overrides. Currently only `command_case` is supported.
223 | per_command: {}
224 |
--------------------------------------------------------------------------------
/.github/workflows/benchmark.yml:
--------------------------------------------------------------------------------
1 | name: benchmark
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: [ubuntu-latest]
8 |
9 | steps:
10 | - uses: actions/checkout@v2
11 |
12 | - name: Install requirements
13 | run: |
14 | sudo apt-get update -q -y
15 | sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake
16 |
17 | - name: Configure
18 | run: cmake -B ${{github.workspace}}/build -DMPN_NO_ASM=ON -DBUILD_VENDOR=ON
19 |
20 | - name: Build
21 | run: cmake --build ${{github.workspace}}/build
22 |
23 | - name: Run Benchmark
24 | working-directory: ${{github.workspace}}/build
25 | run: |
26 | ${{github.workspace}}/build/tests/benchmark | tee ${{github.workspace}}/build/benchmark.txt
27 | cat ${{github.workspace}}/docs/README.template.md > ${{github.workspace}}/README.md
28 | echo -e '## Benchmark(libmpi VS openssl)\n' >> ${{github.workspace}}/README.md
29 | awk '/-----BEGIN MARKDOWN TABLE-----/{ f = 1; next } /-----END MARKDOWN TABLE-----/{ f = 0 } f' benchmark.txt >> ${{github.workspace}}/README.md
30 | git add ${{github.workspace}}/README.md
31 |
32 | - name: Commit files
33 | run: |
34 | git config --local user.email "github-actions[bot]@users.noreply.github.com"
35 | git config --local user.name "github-actions[bot]"
36 | git commit -m "Update performance data" -a
37 |
38 | - name: Push changes
39 | uses: ad-m/github-push-action@master
40 | with:
41 | github_token: ${{ secrets.GITHUB_TOKEN }}
42 | branch: ${{ github.ref }}
43 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ${{ matrix.distro }}
8 |
9 | strategy:
10 | matrix:
11 | distro: [ubuntu-latest, macos-latest]
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 |
16 | - name: Install requirements
17 | id: requirements
18 | run: |
19 | case "${{ matrix.distro }}" in
20 | ubuntu*|jessie|stretch|buster|bullseye)
21 | sudo apt-get update -q -y
22 | sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake
23 | ;;
24 | macos*)
25 | brew install nasm googletest openssl@1.1 openssl@3
26 | brew link openssl --force
27 | echo ::set-output name=LDFLAGS::"-L/usr/local/opt/openssl@1.1/lib"
28 | echo ::set-output name=CPPFLAGS::"-I/usr/local/opt/openssl@1.1/include"
29 | ;;
30 | fedora*)
31 | sudo dnf -y update
32 | sudo dnf -y install gcc g++ git nasm gtest openssl cmake
33 | ;;
34 | alpine*)
35 | apk update
36 | apk add gcc g++ git nasm gtest openssl cmake
37 | ;;
38 | esac
39 |
40 | - name: Configure
41 | run: cmake -B ${{github.workspace}}/build -DCMAKE_VERBOSE_MAKEFILE=ON -DMPN_NO_ASM=ON -DCMAKE_CXX_FLAGS=${{ steps.requirements.outputs.CPPFLAGS }} -DCMAKE_EXE_LINKER_FLAGS=${{ steps.requirements.outputs.LDFLAGS }}
42 |
43 | - name: Build
44 | run: cmake --build ${{github.workspace}}/build
45 |
--------------------------------------------------------------------------------
/.github/workflows/coverage.yml:
--------------------------------------------------------------------------------
1 | name: coverage
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: [ubuntu-latest]
8 |
9 | steps:
10 | - uses: actions/checkout@v2
11 |
12 | - name: Install requirements
13 | run: |
14 | sudo apt-get update -q -y
15 | sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake
16 |
17 | - name: Configure
18 | run: cmake -B ${{github.workspace}}/build -DGCOV=ON -DCMAKE_BUILD_TYPE=Debug -DMPN_NO_ASM=ON
19 |
20 | - name: Build
21 | run: cmake --build ${{github.workspace}}/build
22 |
23 | - name: Run Test
24 | working-directory: ${{github.workspace}}/build
25 | run: |
26 | make test || true
27 | ${{github.workspace}}/build/tests/benchmark || true
28 |
29 | - name: Upload To CodeCov
30 | run: bash <(curl -s https://codecov.io/bash)
31 |
--------------------------------------------------------------------------------
/.github/workflows/multiarch.yml:
--------------------------------------------------------------------------------
1 | name: multiarch
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build_job:
7 | # The host should always be linux
8 | runs-on: ubuntu-18.04
9 | name: Build on ${{ matrix.distro }} ${{ matrix.arch }}
10 |
11 | # Run steps on a matrix of arch/distro combinations (armv7 and ppc64le are disabled below)
12 | strategy:
13 | matrix:
14 | include:
15 | - arch: aarch64
16 | distro: ubuntu18.04
17 | # - arch: armv7
18 | # distro: ubuntu20.04
19 | - arch: s390x
20 | distro: fedora_latest
21 | # - arch: ppc64le
22 | # distro: alpine_latest
23 |
24 | steps:
25 | - uses: actions/checkout@v2.1.0
26 | - name: Building
27 | uses: uraimo/run-on-arch-action@v2.1.1
28 | id: build
29 | with:
30 | arch: ${{ matrix.arch }}
31 | distro: ${{ matrix.distro }}
32 |
33 | # Not required, but speeds up builds
34 | githubToken: ${{ github.token }}
35 |
36 | # Mount the github.workspace directory as /workspace in the container
37 | dockerRunArgs: |
38 | --volume "${{ github.workspace }}:/workspace"
39 |
40 | # Pass some environment variables to the container
41 | env: |
42 | workspace: /workspace
43 |
44 | # The shell to run commands with in the container
45 | shell: /bin/bash
46 |
47 | # Install some dependencies in the container. This speeds up builds if
48 | # you are also using githubToken. Any dependencies installed here will
49 | # be part of the container image that gets cached, so subsequent
50 | # builds don't have to re-install them. The image layer is cached
51 | # publicly in your project's package repository, so it is vital that
52 | # no secrets are present in the container state or logs.
53 | install: |
54 | case "${{ matrix.distro }}" in
55 | ubuntu*|jessie|stretch|buster|bullseye)
56 | apt-get update -q -y
57 | apt-get install -q -y gcc g++ nasm libgtest-dev openssl cmake
58 | ;;
59 | macos*)
60 | brew update
61 | brew install nasm googletest openssl
62 | brew link openssl --force
63 | export LDFLAGS="-L/usr/local/opt/openssl@1.1/lib"
64 | export CPPFLAGS="-I/usr/local/opt/openssl@1.1/include"
65 | ;;
66 | fedora*)
67 | dnf -y update
68 | dnf -y install gcc g++ nasm gtest openssl cmake
69 | ;;
70 | alpine*)
71 | apk update
72 | apk add gcc g++ nasm gtest openssl cmake
73 | ;;
74 | esac
75 |
76 | # Configure and Build
77 | run: |
78 | mkdir -p ${workspace}/build && cd ${workspace}/build
79 | cmake .. && make
80 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 |
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 |
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 |
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 |
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 |
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 |
34 | # ignore directories
35 | build/**
36 | .vscode/**
37 | vendor/**
38 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | fail_fast: false
2 |
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v4.2.0
6 | hooks:
7 | - id: check-added-large-files
8 | - id: trailing-whitespace
9 | args: [--markdown-linebreak-ext=md]
10 | - id: check-merge-conflict
11 | - id: check-json
12 | - id: check-yaml
13 | args: [--allow-multiple-document]
14 | - id: check-case-conflict
15 | - id: check-symlinks
16 | - id: end-of-file-fixer
17 | - id: pretty-format-json
18 | - repo: https://github.com/doublify/pre-commit-clang-format # https: GitHub no longer serves unauthenticated git://
19 | rev: 62302476d0da01515660132d76902359bed0f782
20 | hooks:
21 | - id: clang-format
22 | entry: clang-format
23 | language: system
24 | files: \.(c|cc|cxx|cpp|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|proto|vert)$
25 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0)
2 | PROJECT("Cryptograph Algorithms Implementation")
3 |
4 | INCLUDE(CMakePackageConfigHelpers)
5 | INCLUDE(cmake/ConfigureTarget.cmake)
6 |
7 | ADD_COMPILE_OPTIONS(-Wno-deprecated-declarations)
8 |
9 | IF (NOT DEFINED ARCH)
10 | # Map CMAKE_SYSTEM_PROCESSOR to a canonical ARCH unless the caller preset one.
11 | IF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|amd64)$")
12 | SET(ARCH "x86_64")
13 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64")
14 | # cmake reports AMD64 on Windows, but we might be building for 32-bit.
15 | IF (CMAKE_SIZEOF_VOID_P EQUAL 8)
16 | SET(ARCH "x86_64")
17 | ELSE ()
18 | SET(ARCH "x86")
19 | ENDIF ()
20 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86|i[3-6]86)$") # x86, i386 .. i686
21 | SET(ARCH "x86")
22 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(aarch64|arm64|arm64e)$")
23 | SET(ARCH "aarch64")
24 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm") # any other name beginning with "arm"
25 | SET(ARCH "arm")
26 | ELSE ()
27 | SET(ARCH "generic")
28 | MESSAGE(STATUS "Unknown processor: ${CMAKE_SYSTEM_PROCESSOR}")
29 | ENDIF ()
30 | ENDIF ()
31 |
32 | IF (UNIX)
33 | IF (${ARCH} STREQUAL "aarch64")
34 | IF (APPLE)
35 | SET(PERLASM_STYLE ios64)
36 | ELSE ()
37 | SET(PERLASM_STYLE linux64)
38 | ENDIF ()
39 | ELSEIF (${ARCH} STREQUAL "arm")
40 | IF (APPLE)
41 | SET(PERLASM_STYLE ios32)
42 | ELSE ()
43 | SET(PERLASM_STYLE linux32)
44 | ENDIF ()
45 | ELSE ()
46 | IF (${ARCH} STREQUAL "x86")
47 | SET(PERLASM_FLAGS "-fPIC -DCRYPTO_IA32_SSE2")
48 | ENDIF ()
49 | IF (APPLE)
50 | SET(PERLASM_STYLE macosx)
51 | ELSE ()
52 | SET(PERLASM_STYLE elf)
53 | ENDIF ()
54 | ENDIF ()
55 | SET(ASM_EXT S)
56 | ENABLE_LANGUAGE(ASM)
57 | SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,--noexecstack")
58 |
59 | # Clang's integrated assembler does not support debug symbols.
60 | IF (NOT CMAKE_ASM_COMPILER_ID MATCHES "Clang")
61 | SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,-g")
62 | ENDIF ()
63 |
64 | # CMake does not add -isysroot and -arch flags to assembly.
65 | IF (APPLE)
66 | IF (CMAKE_OSX_SYSROOT)
67 | SET(CMAKE_ASM_FLAGS
68 | "${CMAKE_ASM_FLAGS} -isysroot \"${CMAKE_OSX_SYSROOT}\""
69 | )
70 | ENDIF ()
71 | FOREACH (arch ${CMAKE_OSX_ARCHITECTURES})
72 | SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -arch ${arch}")
73 | ENDFOREACH ()
74 | ENDIF ()
75 | ELSE ()
76 | IF (${ARCH} STREQUAL "x86_64")
77 | SET(PERLASM_STYLE nasm)
78 | ELSE ()
79 | SET(PERLASM_STYLE win32n)
80 | SET(PERLASM_FLAGS "-DCRYPTO_IA32_SSE2")
81 | ENDIF ()
82 | SET(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -gcv8")
83 |
84 | # On Windows, we use the NASM output, specifically built with Yasm.
85 | SET(ASM_EXT asm)
86 | ENDIF ()
87 |
88 | FIND_PACKAGE(Perl REQUIRED)
89 | MACRO (PERLASM dest src)
90 | ADD_CUSTOM_COMMAND(
91 | OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${dest}
92 | COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}
93 | COMMAND CC=${CMAKE_C_COMPILER} ${PERL_EXECUTABLE} ${src} ${PERLASM_STYLE}
94 | ${PERLASM_FLAGS} ${ARGN} ${CMAKE_CURRENT_BINARY_DIR}/${dest}
95 | DEPENDS ${src} ${CMAKE_SOURCE_DIR}/perlasm/arm-xlate.pl
96 | ${CMAKE_SOURCE_DIR}/perlasm/x86_64-xlate.pl
97 | ${CMAKE_SOURCE_DIR}/perlasm/x86asm.pl
98 | ${CMAKE_SOURCE_DIR}/perlasm/x86gas.pl
99 | ${CMAKE_SOURCE_DIR}/perlasm/x86masm.pl
100 | ${CMAKE_SOURCE_DIR}/perlasm/x86nasm.pl
101 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
102 | )
103 | ENDMACRO ()
104 |
105 | IF (BUILD_VENDOR)
106 | INCLUDE(ExternalProject)
107 | IF (NOT EXISTS ${CMAKE_SOURCE_DIR}/vendor)
108 | FILE(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/vendor)
109 | ENDIF ()
110 |
111 | INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/include)
112 | LINK_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/lib)
113 | SET(ENV{PATH} ${CMAKE_BINARY_DIR}/bin:$ENV{PATH})
114 | SET(ENV{PKG_CONFIG_PATH} ${CMAKE_BINARY_DIR}/lib/pkgconfig)
115 |
116 | # cmake-format: off
117 | SET(NASM_LOCAL_FILE ${CMAKE_SOURCE_DIR}/vendor/nasm-2.15.05.tar.gz)
118 | SET(NASM_DOWNLOAD_URL https://www.nasm.us/pub/nasm/releasebuilds/2.15.05/nasm-2.15.05.tar.gz)
119 | IF (NOT EXISTS ${NASM_LOCAL_FILE})
120 | FILE(
121 | DOWNLOAD ${NASM_DOWNLOAD_URL} ${NASM_LOCAL_FILE}
122 | TIMEOUT 60
123 | TLS_VERIFY ON
124 | )
125 | ENDIF ()
126 | EXTERNALPROJECT_ADD(
127 | nasm
128 | URL ${NASM_LOCAL_FILE}
129 | CONFIGURE_COMMAND ./configure --prefix=${CMAKE_BINARY_DIR}
130 | BUILD_COMMAND make -j${CONCURRENCY}
131 | BUILD_IN_SOURCE 1
132 | )
133 |
134 | SET(OPENSSL_LOCAL_FILE ${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i.tar.gz)
135 | SET(OPENSSL_DOWNLOAD_URL https://www.openssl.org/source/old/1.1.1/openssl-1.1.1i.tar.gz)
136 | IF (NOT EXISTS ${OPENSSL_LOCAL_FILE})
137 | FILE(
138 | DOWNLOAD ${OPENSSL_DOWNLOAD_URL} ${OPENSSL_LOCAL_FILE}
139 | TIMEOUT 60
140 | TLS_VERIFY ON
141 | )
142 | ENDIF ()
143 | EXTERNALPROJECT_ADD(
144 | openssl
145 | URL ${OPENSSL_LOCAL_FILE}
146 | CONFIGURE_COMMAND ./config no-shared no-asm -d --prefix=${CMAKE_BINARY_DIR}
147 | BUILD_COMMAND make depend && make -j${CONCURRENCY}
148 | INSTALL_COMMAND make install_sw
149 | BUILD_IN_SOURCE 1
150 | )
151 | # cmake-format: on
152 | LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i/export/lib)
153 | INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i/export/include)
154 | ELSE ()
155 | FIND_PROGRAM(OPENSSL openssl REQUIRED)
156 | GET_FILENAME_COMPONENT(OPENSSL_DIR ${OPENSSL} DIRECTORY CACHE)
157 | LINK_DIRECTORIES(${OPENSSL_DIR}/../lib)
158 | INCLUDE_DIRECTORIES(${OPENSSL_DIR}/../include)
159 | ENDIF ()
160 |
161 | LINK_DIRECTORIES(/usr/local/lib)
162 | INCLUDE_DIRECTORIES(/usr/local/include ${CMAKE_BINARY_DIR}/include)
163 |
164 | # mpn
165 | ADD_SUBDIRECTORY(mpn)
166 |
167 | # mpi
168 | ADD_SUBDIRECTORY(mpi)
169 |
170 | # tests
171 | ENABLE_TESTING()
172 | ADD_SUBDIRECTORY(tests)
173 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # libmpi
2 |
3 | [](https://github.com/vxfury/libmpi/blob/master/LICENSE)
4 | [](https://github.com/vxfury/libmpi/actions)
5 | [](https://codecov.io/gh/vxfury/libmpi)
6 | 
7 | [](https://github.com/vxfury/libmpi/pulls)
8 |
9 | Multiple Precision Integer and Relevant Algorithms, such as Bignum, RSA, DH, ECDH, ECDSA
10 | ## Benchmark(libmpi VS openssl)
11 |
12 | | brief | average time<br>(nanoseconds) | instability<br>(coefficient of variation) | rating |
13 | | :-- | :-: | :-: | :-: |
14 | | from-string(mpi vs openssl) | 2443.7<br>30303.4* | 0.0701562 | 12.4006<br>(Tu es mon meilleur frère...) |
15 | | to-string(mpi vs openssl) | 1328.88<br>3463.21* | 0.109777 | 2.60612<br>(Tu peux faire mieux, continue) |
16 | | from-octets(mpi vs openssl) | 273.632<br>702.13* | 0.0870046 | 2.56597<br>(Tu peux faire mieux, continue) |
17 | | to-octets(mpi vs openssl) | 172.067<br>1475.5* | 0.359989 | 8.57515<br>(C'est super, dessine-toi une tarte) |
18 | | add(mpi vs openssl) | 51.1222<br>333.814* | 0.164442 | 6.52973<br>(C'est super, dessine-toi une tarte) |
19 | | add-assign(mpi vs openssl) | 56.7424<br>332.054* | 0.202937 | 5.85196<br>(C'est super, dessine-toi une tarte) |
20 | | sub(mpi vs openssl) | 61.6028<br>162.647* | 0.207007 | 2.64025<br>(Tu peux faire mieux, continue) |
21 | | sub-assign(mpi vs openssl) | 58.2224<br>288.852* | 0.155195 | 4.96119<br>(Tu peux faire mieux, continue) |
22 | | mul(mpi vs openssl) | 2070.41<br>14037.9* | 0.0553581 | 6.78025<br>(C'est super, dessine-toi une tarte) |
23 | | sqr(mpi vs openssl) | 1329.62<br>8760.12* | 0.168403 | 6.58845<br>(C'est super, dessine-toi une tarte) |
24 | | MUL2(a * 2 = a + a) | 37.5416 | 0.163214 | N/A |
25 | | MUL2(a * 2 = a << 1) | 77.5234 | 0.113647 | N/A |
26 |
--------------------------------------------------------------------------------
/cmake/Config.cmake.in:
--------------------------------------------------------------------------------
1 | @PACKAGE_INIT@
2 |
3 | find_package(Threads)
4 | include(${CMAKE_CURRENT_LIST_DIR}/libacoTargets.cmake) # NOTE(review): "libaco" does not match this project's name (libmpi) - confirm the exported targets file name
5 |
--------------------------------------------------------------------------------
/docs/README.template.md:
--------------------------------------------------------------------------------
1 | # libmpi
2 |
3 | [](https://github.com/vxfury/libmpi/blob/master/LICENSE)
4 | [](https://github.com/vxfury/libmpi/actions)
5 | [](https://codecov.io/gh/vxfury/libmpi)
6 | 
7 | [](https://github.com/vxfury/libmpi/pulls)
8 |
9 | Multiple Precision Integer and Relevant Algorithms, such as Bignum, RSA, DH, ECDH, ECDSA
10 |
--------------------------------------------------------------------------------
/mpi/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Multiple Precision Integer (mpi) library and related algorithms
2 |
3 | CONFIGURE_FILE(mpi.h ${CMAKE_BINARY_DIR}/include/mpi/mpi.h COPYONLY) # copy the public header into the build tree's include/mpi
4 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpi/mpi.h DESTINATION include/mpi)
5 | ADD_LIBRARY(mpi mpi.c mpi-prime.c)
6 | ConfigureTarget(mpi)
7 | TARGET_LINK_LIBRARIES(mpi PUBLIC mpn) # PUBLIC: targets linking mpi also link mpn
8 | INSTALL(TARGETS mpi ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
9 |
10 | # RSA (Rivest–Shamir–Adleman) algorithm: header and source are added to mpi unless MPI_NO_RSA is ON
11 | OPTION(MPI_NO_RSA "build without rsa algorithm" OFF)
12 | IF (NOT MPI_NO_RSA)
13 | CONFIGURE_FILE(mpi-rsa.h ${CMAKE_BINARY_DIR}/include/mpi/mpi-rsa.h COPYONLY)
14 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpi/mpi-rsa.h
15 | DESTINATION include/mpi
16 | )
17 | TARGET_SOURCES(mpi PRIVATE mpi-rsa.c)
18 | ENDIF ()
19 |
--------------------------------------------------------------------------------
/mpi/mpi-rsa.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef MULTIPLE_PRECISION_RSA_H
17 | #define MULTIPLE_PRECISION_RSA_H
18 |
19 | #include
20 | #include
21 |
22 | #if defined(__cplusplus)
23 | extern "C" {
24 | #endif
25 |
26 | typedef struct { /* RSA key: bit sizes, exponents, CRT parameters and montgomery contexts */
27 | unsigned int nbits; /* bit size of the RSA modulus */
28 | unsigned int ebits; /* bit size of the RSA public exponent */
29 | unsigned int dbits; /* bit size of the RSA private exponent */
30 | unsigned int pbits; /* bit size of the RSA p-factor */
31 | unsigned int qbits; /* bit size of the RSA q-factor */
32 |
33 | mpn_limb_t *e; /* public exponent, bitsize(e) = ebits */
34 | mpn_limb_t *d; /* private exponent, bitsize(d) = dbits <= nbits */
35 | mpn_limb_t *dp; /* the first factor's CRT exponent, d mod (p - 1), bitsize(dp) <= pbits */
36 | mpn_limb_t *dq; /* the second factor's CRT exponent, d mod (q - 1), bitsize(dq) <= qbits */
37 | mpn_limb_t *qinv; /* the (first) CRT coefficient, q^(-1) mod p, bitsize(qinv) <= pbits */
38 |
39 | mpn_montgomery_t *montN; /* montgomery context for (N, the modulus, bitsize(n) = nbits) */
40 | mpn_montgomery_t *montP; /* montgomery context for (P, the first factor) */
41 | mpn_montgomery_t *montQ; /* montgomery context for (Q, the second factor) */
42 |
43 | /* TODO: multiple-primes support */
44 | unsigned int primes; /* presumably the number of prime factors - TODO confirm */
45 | struct rsa_factor {
46 | unsigned int bits; /* bit size of the factor */
47 | mpn_limb_t *r; /* factor */
48 | mpn_limb_t *d; /* factor's CRT exponent */
49 | mpn_limb_t *t; /* factor's CRT coefficient */
50 | } factors[0]; /* zero-length trailing array; presumably holds |primes| entries - TODO confirm */
51 | } rsa_key_t;
52 |
53 | rsa_key_t *rsa_new(unsigned int ebits, unsigned int nbits, unsigned int primes);
54 | void rsa_free(rsa_key_t *key);
55 |
56 | int rsa_import(rsa_key_t *key, const mpi_t *n, const mpi_t *e, const mpi_t *d, const mpi_t *dp, const mpi_t *dq,
57 | const mpi_t *qinv);
58 | rsa_key_t *rsa_generate_key(const mpi_t *pubexp, unsigned int nbits, unsigned int primes,
59 | int (*rand_bytes)(void *, unsigned char *, unsigned int), void *rand_state);
60 |
61 | int rsa_pub_cipher(mpi_t *r, const mpi_t *x, const rsa_key_t *key);
62 | int rsa_prv_cipher(mpi_t *r, const mpi_t *x, const rsa_key_t *key);
63 | int rsa_prv_cipher_crt(mpi_t *r, const mpi_t *x, const rsa_key_t *key);
64 |
65 | #if defined(__cplusplus)
66 | }
67 | #endif
68 |
69 | #endif
70 |
--------------------------------------------------------------------------------
/mpi/mpi.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | /**
17 | * @brief: multiple precision integer: configurations, macros, and prototypes
18 | *
19 | * @note:
20 | * 1. Assumes that all variables representing sizes never overflow
21 | */
22 |
23 | #ifndef MULTIPLE_PRECISION_H
24 | #define MULTIPLE_PRECISION_H
25 |
26 | #include
27 | #include
28 |
29 | #if defined(__cplusplus)
30 | extern "C" {
31 | #endif
32 |
33 | /**
34 | * mpi implementation
35 | */
36 | #define MPI_SIGN_NEGTIVE 1 /* a < 0, negative */
37 | #define MPI_SIGN_NON_NEGTIVE 0 /* a >= 0, non-negative */
38 | #define MPI_ATTR_NOTOWNED 0x01 /* TODO: data field not owned by the mpi (presumed completion of the original note) */
39 | #define MPI_ATTR_DETACHED 0x02 /* TODO: detached data field */
40 | #define MPI_ATTR_AUTOSIZE 0x04 /* TODO: resize data field automatically */
41 |
42 | typedef struct {
43 | unsigned int attr; /**< mpi attributes (presumably a mask of MPI_ATTR_* flags - TODO confirm) */
44 | unsigned int sign; /**< mpi sign: negative or not (see MPI_SIGN_NEGTIVE / MPI_SIGN_NON_NEGTIVE) */
45 | mpn_size_t size; /**< mpi size (count of mpn_limb_t) */
46 | mpn_size_t room; /**< mpi max size (count of mpn_limb_t) */
47 | mpn_limb_t *data; /**< mpi data chunk(most significant limb at the largest) */
48 | } mpi_t;
49 | #define MPI_ALIGNED_HEAD_LIMBS ((mpn_size_t)((sizeof(mpi_t) + sizeof(mpn_limb_t) - 1) / sizeof(mpn_limb_t)))
50 |
51 | /** High-Level APIs */
52 | /**
53 | * create mpi with expected bits |bits| to reserve
54 | *
55 | * |bits| == 0, to create empty room
56 | *
57 | * @performance: Locality of reference and Cacheline alignment
58 | * mpi_t and this->data will be allocated as a continuous memory chunk
59 | */
60 | mpi_t *mpi_create(mpn_size_t bits);
61 |
62 | /**
63 | * create mpi(detached) with expected bits |bits| to reserve
64 | *
65 | * |bits| == 0, to create empty room
66 | */
67 | mpi_t *mpi_create_detached(mpn_size_t bits);
68 |
69 | /**
70 | * duplicate big-number |a|
71 | */
72 | mpi_t *mpi_dup(const mpi_t *a);
73 |
74 | /**
75 | * clear and release mpi |v|
76 | */
77 | void mpi_destory(mpi_t *v);
78 |
79 | /**
80 | * make mpi with given chunk
81 | */
82 | void mpi_make(mpi_t *r, mpn_limb_t *data, mpn_size_t size);
83 |
84 | /**
85 | * copy big-number |a| to |r|
86 | *
87 | * @note:
88 | * 1. resize |r| to proper size before copy
89 | */
90 | int mpi_copy(mpi_t *r, const mpi_t *a);
91 |
92 | /**
93 | * compare mpi |a| and |b|
94 | * 0, if |a| = |b|
95 | * 1, if |a| > |b|
96 | * -1, if |a| < |b|
97 | * otherwise, error code
98 | */
99 | int mpi_cmp(const mpi_t *a, const mpi_t *b);
100 |
101 | /**
102 | * get bit size of mpi |a|(constant-time version)
103 | *
104 | * @note:
105 | * 1. 0, if a is NULL
106 | */
107 | mpn_size_t mpi_bits(const mpi_t *a);
108 |
109 | /**
110 | * get byte size of mpi |a|(constant-time version)
111 | *
112 | * @note:
113 | * 1. 0, if a is NULL
114 | */
115 | mpn_size_t mpi_bytes(const mpi_t *a);
116 |
117 | /**
118 | * get max bit size of mpi |a|(constant-time version)
119 | *
120 | * @note:
121 | * 1. 0, if a is NULL
122 | */
123 | mpn_size_t mpi_max_bits(const mpi_t *a);
124 |
125 | /**
126 | * get max byte size of mpi |a|(constant-time version)
127 | *
128 | * @note:
129 | * 1. 0, if a is NULL
130 | */
131 | mpn_size_t mpi_max_bytes(const mpi_t *a);
132 |
133 | /**
134 | * mpi: expand mpi to expected bits |bits|
135 | *
136 | * @note:
137 | * 1. may fail when there is not enough memory or an invalid size is given
138 | */
139 | mpi_t *mpi_expand(mpi_t *v, mpn_size_t bits);
140 |
141 | /**
142 | * resize mpi to expected bits |bits|
143 | *
144 | * @note:
145 | * 1. may fail when there is not enough memory or an invalid size is given
146 | *
147 | */
148 | mpi_t *mpi_resize(mpi_t *v, mpn_size_t bits);
149 |
150 | /**
151 | * zeroize mpi |v|
152 | */
153 | int mpi_zeroize(mpi_t *v);
154 |
155 | /**
156 | * set mpi |r| to unsigned single-precision integer |v|
157 | */
158 | int mpi_set_limb(mpi_t *r, mpn_limb_t v);
159 |
160 | /**
161 | * initialize mpi |v| from octets |buff|/|bufflen|
162 | *
163 | * @note:
164 | * 1. if *|v| is NULL, mpi will be created with proper size
165 | * 2. if *|v| isn't NULL, mpi-number will be resized, and maybe *|v| will be set to a new memory chunk
166 | */
167 | int mpi_from_octets(mpi_t **v, const unsigned char *buff, mpn_size_t bufflen);
168 |
169 | /**
170 | * convert mpi to big-endian octets
171 | */
172 | int mpi_to_octets(const mpi_t *a, unsigned char *out, mpn_size_t outsize, mpn_size_t *outlen);
173 |
174 | /**
175 | * initialize mpi |v| from hex-string |a|
176 | */
177 | int mpi_from_string(mpi_t **v, const char *a);
178 |
179 | /**
180 | * convert mpi to string
181 | *
182 | * @note:
183 | * 1. FREE the return pointer after usage
184 | */
185 | char *mpi_to_string(const mpi_t *v);
186 |
187 | /**
188 | * mpi addition: |r| = |a| + |b|
189 | *
190 | * @note:
191 | * 1. make sure r->room is enough to store the result
192 | * minimal advise size: MAX(bit_size(a), bit_size(b)) + 1
193 | */
194 | int mpi_add(mpi_t *r, const mpi_t *a, const mpi_t *b);
195 |
196 | /**
197 | * mpi addition: |r| = |a| + w
198 | *
199 | * @note:
200 | * 1. make sure r->room is enough to store the result
201 | * minimal advise size: MAX(bit_size(a), bit_size(w)) + 1
202 | */
203 | int mpi_add_limb(mpi_t *r, const mpi_t *a, mpn_limb_t w);
204 |
205 | /**
206 | * mpi subtraction: |r| = |a| - |b|
207 | *
208 | * @note:
209 | * 1. make sure r->room is enough to store the result
210 | * minimal advise size: MAX(bit_size(a), bit_size(b))
211 | * 2. make sure |a| >= |b|, because negative mpi is not supported yet
212 | */
213 | int mpi_sub(mpi_t *r, const mpi_t *a, const mpi_t *b);
214 |
215 | /**
216 | * mpi subtraction: |r| = |a| - w
217 | *
218 | * @note:
219 | * 1. make sure r->room is enough to store the result
220 | * minimal advise size: MAX(bit_size(a), bit_size(w))
221 | */
222 | int mpi_sub_limb(mpi_t *r, const mpi_t *a, mpn_limb_t w);
223 |
224 | /**
225 | * mpi multiplication: |r| = |a| * |b|
226 | *
227 | * @note:
228 | * 1. make sure r->room is enough to store the result
229 | * minimal advise size: bit_size(a) + bit_size(b) + MPN_LIMB_BITS
230 | */
231 | int mpi_mul(mpi_t *r, const mpi_t *a, const mpi_t *b);
232 |
233 | /**
234 | * mpi multiplication: |r| = |a| * |b|
235 | *
236 | * @note:
237 | * 1. make sure r->room is enough to store the result
238 | * minimal advise size: bit_size(a) + bit_size(b)
239 | */
240 | int mpi_mul_limb(mpi_t *r, const mpi_t *a, mpn_limb_t b);
241 |
242 | /**
243 | * mpi square: |r| = |a| ^ 2
244 | *
245 | * @note:
246 | * 1. make sure r->room is enough to store the result
247 | * minimal advise size: 2 * bit_size(a)
248 | */
249 | int mpi_sqr(mpi_t *r, const mpi_t *a);
250 |
251 | /**
252 | * mpi division: |q|, |r| = |x| / |y|, |x| = |q| * |y| + |r|(0 <= |r| < |y|)
253 | *
254 | * @note:
255 | * 1. make sure room of |q|, |r| is enough to store the result
256 | * minimal advise size: bit_size(r) = bit_size(y)
257 | */
258 | int mpi_div(mpi_t *q, mpi_t *r, const mpi_t *x, const mpi_t *y);
259 |
260 | /**
261 | * mpi division: q, r = a / w
262 | */
263 | mpn_limb_t mpi_div_limb(mpi_t *a, mpn_limb_t w);
264 |
265 | /**
266 | * mpi modular (limb): r = a mod w
267 | */
268 | mpn_limb_t mpi_mod_limb(const mpi_t *a, mpn_limb_t w);
269 |
270 | /**
271 | * greatest common divisor
272 | */
273 | int mpi_gcd(mpi_t *r, const mpi_t *a, const mpi_t *b, mpn_optimizer_t *optimizer);
274 |
275 | /**
276 | * greatest common divisor(constant-time version)
277 | */
278 | int mpi_gcd_consttime(mpi_t *r, const mpi_t *a, const mpi_t *b, mpn_optimizer_t *optimizer);
279 |
280 | /**
281 | * mpi modular: r = a mod m
282 | */
283 | int mpi_mod(mpi_t *r, const mpi_t *a, const mpi_t *m);
284 |
285 | /**
286 | * mpi exponentiation: r = g ^ e
287 | */
288 | int mpi_exp(mpi_t *r, const mpi_t *g, const mpi_t *e);
289 |
290 | /**
291 | * mpi exponentiation(word): r = g ^ e
292 | */
293 | int mpi_exp_limb(mpi_t *r, const mpi_t *g, mpn_limb_t e);
294 |
295 | /**
296 | * get bit
297 | */
298 | int mpi_get_bit(const mpi_t *a, mpn_size_t n);
299 |
300 | /**
301 | * set bit
302 | */
303 | int mpi_set_bit(const mpi_t *a, mpn_size_t n);
304 |
305 | /**
306 | * clr bit
307 | */
308 | int mpi_clr_bit(const mpi_t *a, mpn_size_t n);
309 |
310 | /**
311 | * left-shift: |r| = |a| << n
312 | */
313 | int mpi_lshift(mpi_t *r, const mpi_t *a, mpn_size_t n);
314 |
315 | /**
316 | * right-shift: |r| = |a| >> n
317 | */
318 | int mpi_rshift(mpi_t *r, const mpi_t *a, mpn_size_t n);
319 |
320 | /**
321 | * conditional swap(constant-time version)
322 | */
323 | int mpi_swap_consttime(unsigned condition, mpi_t *a, mpi_t *b, mpn_size_t n);
324 |
325 | /**
326 | * mpi(prime): test if a is a prime
327 | *
328 | * @note:
329 | * 1. return 0 if the number is composite
330 | * 1 if it is prime with an error probability of less than 0.25^checks
331 | */
332 | int mpi_is_prime(const mpi_t *a, mpn_size_t checks, unsigned do_trial_division, mpn_optimizer_t *optimizer,
333 | int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state);
334 |
335 | /**
336 | * mpi(prime): generates a pseudo-random prime number of at least bit length |bits|
337 | *
338 | * @note:
339 | * 1. The returned number is probably prime with a negligible error.
340 | * 2. If |add| is NULL the returned prime number will have exact bit length |bits| with the top most two
341 | * bits set.
342 | * 3. The prime may have to fulfill additional requirements for use in Diffie-Hellman key exchange:
343 | * If |add| is not NULL, the prime will fulfill the condition p % |add| == |rem| (p % |add| == 1 if
344 | * |rem| == NULL) in order to suit a given generator.
345 | *
346 | * If |safe| is true, it will be a safe prime (i.e. a prime p so that (p-1)/2 is also prime).
347 | * If |safe| is true, and |rem| == NULL the condition will be p % |add| == 3.
348 | * It is recommended that |add| is a multiple of 4.
349 | */
350 | int mpi_generate_prime(mpi_t *ret, mpn_size_t bits, unsigned safe, const mpi_t *add, const mpi_t *rem,
351 | int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state);
352 |
353 |
354 | /**
355 | * mpn optimizer: get mpi with specified room from optimizer
356 | *
357 | * @note:
358 | * 1. size: size of chunk, in unit of 'mpn_limb_t'
359 | */
360 | mpi_t *mpi_optimizer_get(mpn_optimizer_t *optimizer, mpn_size_t size);
361 |
362 | /**
363 | * mpn optimizer: put back mpi of specified room
364 | */
365 | void mpi_optimizer_put(mpn_optimizer_t *optimizer, mpn_size_t size);
366 |
367 |
368 | /**
369 | * mpn montgomery: initialize montgomery context with modulus
370 | *
371 | */
372 | int mpi_montgomery_set_modulus(mpn_montgomery_t *mont, const mpi_t *modulus);
373 |
374 | /**
375 | * mpn montgomery: exponentiation
376 | *
377 | */
378 | int mpi_montgomery_exp(mpi_t *r, const mpi_t *x, const mpi_t *e, mpn_montgomery_t *mont);
379 |
380 | /**
381 | * mpn montgomery: exponentiation(constant-time version)
382 | *
383 | */
384 | int mpi_montgomery_exp_consttime(mpi_t *r, const mpi_t *x, const mpi_t *e, mpn_montgomery_t *mont);
385 |
386 | #if defined(__cplusplus)
387 | }
388 | #endif
389 |
390 | #endif
391 |
--------------------------------------------------------------------------------
/mpn/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Multiple-Precision-Natural-Number (mpn) library
2 |
3 | CONFIGURE_FILE(mpn-asm.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-asm.h COPYONLY) # copy each header into the build tree's include/mpn
4 | CONFIGURE_FILE(mpn-conf.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-conf.h COPYONLY)
5 | CONFIGURE_FILE(
6 | mpn-binary.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-binary.h COPYONLY
7 | )
8 | CONFIGURE_FILE(
9 | mpn-optimizer.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-optimizer.h COPYONLY
10 | )
11 | CONFIGURE_FILE(
12 | mpn-montgomery.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-montgomery.h COPYONLY
13 | )
14 |
15 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpn/mpn-conf.h # NOTE(review): mpn-asm.h is copied above but is not in this install list - confirm that is intentional
16 | ${CMAKE_BINARY_DIR}/include/mpn/mpn-optimizer.h
17 | ${CMAKE_BINARY_DIR}/include/mpn/mpn-binary.h
18 | ${CMAKE_BINARY_DIR}/include/mpn/mpn-montgomery.h
19 | DESTINATION include/mpn
20 | )
21 |
22 | ADD_LIBRARY(mpn mpn-binary.c mpn-asm.c mpn-optimizer.c mpn-montgomery.c)
23 | ConfigureTarget(mpn)
24 | INSTALL(TARGETS mpn ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
25 |
26 | OPTION(MPN_NO_ASM "disable asm for mpn" OFF)
27 | IF ((NOT MPN_NO_ASM) AND (CMAKE_SYSTEM_NAME STREQUAL "Linux")) # NASM sources are only considered on Linux
28 | ENABLE_LANGUAGE(ASM_NASM)
29 | IF (NOT DEFINED ARCH)
30 | SET(ARCH ${CMAKE_SYSTEM_PROCESSOR}) # default to the target processor unless ARCH was already set
31 | ENDIF ()
32 | SET(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D_L9 -DLINUX32E")
33 | IF ("${ARCH}" STREQUAL "x86_64") # quoted: an empty ARCH compares as "" instead of producing a malformed IF
34 | FILE(GLOB ASM_SOURCE asm/intel64/*.asm)
35 | TARGET_SOURCES(mpn PRIVATE ${ASM_SOURCE})
36 | TARGET_INCLUDE_DIRECTORIES(mpn PRIVATE asm asm/intel64)
37 | TARGET_COMPILE_DEFINITIONS(
38 | mpn
39 | PRIVATE -DMPN_UADD_VECTORIZED_ASM
40 | -DMPN_USUB_VECTORIZED_ASM
41 | -DMPN_UINC_VECTORIZED_ASM
42 | -DMPN_UDEC_VECTORIZED_ASM
43 | -DMPN_UDIV_ASM
44 | -DMPN_UMUL_ASM
45 | -DMPN_USQR_ASM
46 | -DMPN_UMUL_ADD_ASM
47 | -DMPN_MONT_REDC_ASM
48 | )
49 | ENDIF ()
50 | ENDIF ()
51 |
52 | IF (MPN_NO_INLINE_ASM) # NOTE(review): no OPTION() in this file declares MPN_NO_INLINE_ASM; it has to be defined elsewhere (parent CMakeLists or the command line)
53 | TARGET_COMPILE_DEFINITIONS(mpn PRIVATE -DMPN_NO_INLINE_ASM)
54 | ENDIF()
55 |
--------------------------------------------------------------------------------
/mpn/asm/asmdefs.inc:
--------------------------------------------------------------------------------
1 | %ifndef __ASMDEFS_INC__
2 | %define __ASMDEFS_INC__ 1
3 |
4 | %assign __ARCH_PX 0 ; pure C-code ia32
5 | %assign __ARCH_M5 1 ; Intel(R) Quark(TM) processor - ia32
6 | %assign __ARCH_W7 8 ; Intel(R) Streaming SIMD Extensions 2 - ia32
7 | %assign __ARCH_T7 16 ; Intel(R) Streaming SIMD Extensions 3 - ia32
8 | %assign __ARCH_V8 32 ; Supplemental Streaming SIMD Extensions 3 (SSSE3)
9 | %assign __ARCH_S8 33 ; SSSE3 + MOVBE instruction - ia32
10 | %assign __ARCH_P8 64 ; Intel(R) Streaming SIMD Extensions 4.2 - ia32
11 | %assign __ARCH_G9 128 ; Intel(R) Advanced Vector Extensions - ia32
12 | %assign __ARCH_H9 256 ; Intel(R) Advanced Vector Extensions 2 - ia32
13 | %assign __ARCH_I0 512 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon Phi(TM) processor (formerly Knights Landing) - ia32
14 | %assign __ARCH_S0 1024 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - ia32
15 |
16 | %assign __ARCH32E_PX __ARCH_PX ; pure C-code x64
17 | %assign __ARCH32E_M7 32 ; Intel(R) Streaming SIMD Extensions 3 - intel64
18 | %assign __ARCH32E_U8 64 ; Supplemental Streaming SIMD Extensions 3 (SSSE3) - intel64
19 | %assign __ARCH32E_N8 65 ; SSSE3 + MOVBE instruction - intel64
20 | %assign __ARCH32E_Y8 128 ; Intel(R) Streaming SIMD Extensions 4.2 - intel64
21 | %assign __ARCH32E_E9 256 ; Intel(R) Advanced Vector Extensions - intel64
22 | %assign __ARCH32E_L9 512 ; Intel(R) Advanced Vector Extensions 2 - intel64
23 | %assign __ARCH32E_N0 1024 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon Phi(TM) processor (formerly Knights Landing) - intel64
24 | %assign __ARCH32E_K0 2048 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - intel64
25 |
26 | %assign __ARCH __ARCH_PX ; default ia32 level; replaced when one of the _<CPU> symbols is defined
27 | %assign __ARCH32E __ARCH32E_PX ; default intel64 level; replaced when one of the _<CPU> symbols is defined
28 |
29 | %ifdef _M5 ; Intel(R) Quark(TM) processor - ia32
30 | %assign __ARCH __ARCH_M5
31 | %elifdef _W7 ; Intel(R) Streaming SIMD Extensions 2 - ia32
32 | %assign __ARCH __ARCH_W7
33 | %elifdef _T7 ; Intel(R) Streaming SIMD Extensions 3 - ia32
34 | %assign __ARCH __ARCH_T7
35 | %elifdef _V8 ; Supplemental Streaming SIMD Extensions 3 (SSSE3)
36 | %assign __ARCH __ARCH_V8
37 | %elifdef _S8 ; SSSE3 + MOVBE instruction - ia32
38 | %assign __ARCH __ARCH_S8
39 | %elifdef _P8 ; Intel(R) Streaming SIMD Extensions 4.2 - ia32
40 | %assign __ARCH __ARCH_P8
41 | %elifdef _G9 ; Intel(R) Advanced Vector Extensions - ia32
42 | %assign ARCH_ALIGN_FACTOR 32 ; same value is re-derived from __ARCH/__ARCH32E after this chain
43 | %assign __ARCH __ARCH_G9
44 | %elifdef _H9 ; Intel(R) Advanced Vector Extensions 2 - ia32
45 | %assign ARCH_ALIGN_FACTOR 32
46 | %assign __ARCH __ARCH_H9
47 | %elifdef _S0 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - ia32
48 | %assign ARCH_ALIGN_FACTOR 64
49 | %assign __ARCH __ARCH_S0
50 | %elifdef _M7 ; Intel(R) Streaming SIMD Extensions 3 - intel64
51 | %assign __ARCH __ARCH_PX ; intel64 targets keep the ia32 level at PX and set __ARCH32E instead
52 | %assign __ARCH32E __ARCH32E_M7
53 | %elifdef _U8 ; Supplemental Streaming SIMD Extensions 3 (SSSE3) - intel64
54 | %assign __ARCH __ARCH_PX
55 | %assign __ARCH32E __ARCH32E_U8
56 | %elifdef _N8 ; SSSE3 + MOVBE instruction - intel64
57 | %assign __ARCH __ARCH_PX
58 | %assign __ARCH32E __ARCH32E_N8
59 | %elifdef _Y8 ; Intel(R) Streaming SIMD Extensions 4.2 - intel64
60 | %assign __ARCH __ARCH_PX
61 | %assign __ARCH32E __ARCH32E_Y8
62 | %elifdef _E9 ; Intel(R) Advanced Vector Extensions - intel64
63 | %assign ARCH_ALIGN_FACTOR 32
64 | %assign __ARCH __ARCH_PX
65 | %assign __ARCH32E __ARCH32E_E9
66 | %elifdef _L9 ; Intel(R) Advanced Vector Extensions 2 - intel64
67 | %assign ARCH_ALIGN_FACTOR 32
68 | %assign __ARCH __ARCH_PX
69 | %assign __ARCH32E __ARCH32E_L9
70 | %elifdef _N0 ; Intel(R) Advanced Vector Extensions 512 (formerly Knights Landing) - intel64
71 | %assign ARCH_ALIGN_FACTOR 64
72 | %assign __ARCH __ARCH_PX
73 | %assign __ARCH32E __ARCH32E_N0
74 | %elifdef _K0 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - intel64
75 | %assign ARCH_ALIGN_FACTOR 64
76 | %assign __ARCH __ARCH_PX
77 | %assign __ARCH32E __ARCH32E_K0
78 | %else
79 | %assign __ARCH __ARCH_PX ; pure C-code
80 | %endif
81 |
82 | %if (__ARCH > __ARCH_H9) || (__ARCH32E > __ARCH32E_L9) ; levels above AVX2 (the AVX-512 ids): 64-byte alignment
83 | %assign ARCH_ALIGN_FACTOR 64
84 | %elif (__ARCH > __ARCH_P8) || (__ARCH32E > __ARCH32E_Y8) ; levels above SSE4.2 (the AVX/AVX2 ids): 32-byte alignment
85 | %assign ARCH_ALIGN_FACTOR 32
86 | %else ; everything else, including the pure C-code default: 16-byte alignment
87 | %assign ARCH_ALIGN_FACTOR 16
88 | %endif
89 |
90 | ; noexec stack: emit an empty .note.GNU-stack section when LINUX32 is defined and OSX32 is not
91 | %ifdef LINUX32
92 | %ifndef OSX32
93 | section .note.GNU-stack noalloc noexec nowrite progbits
94 | %endif
95 | %endif
96 |
97 | ; noexec stack: same for LINUX32E, unless OSXEM64T or _ARCH_KNC is defined
98 | %ifdef LINUX32E
99 | %ifndef OSXEM64T
100 | %ifndef _ARCH_KNC
101 | section .note.GNU-stack noalloc noexec nowrite progbits
102 | %endif
103 | %endif
104 | %endif
105 |
106 | ; IPP_BINARY_FORMAT encodes the NASM output format: 0=elf32, 1=elf64, 2=macho64, 3=win32, 4=win64
107 | %ifidn __OUTPUT_FORMAT__, elf32
108 | %assign IPP_BINARY_FORMAT 0
109 | %elifidn __OUTPUT_FORMAT__, elf64
110 | %assign IPP_BINARY_FORMAT 1
111 | %elifidn __OUTPUT_FORMAT__, macho64
112 | %assign IPP_BINARY_FORMAT 2
113 | %elifidn __OUTPUT_FORMAT__, win32
114 | %assign IPP_BINARY_FORMAT 3
115 | %elifidn __OUTPUT_FORMAT__, win64
116 | %assign IPP_BINARY_FORMAT 4
117 | %else
118 | %fatal Unsupported output format: __OUTPUT_FORMAT__. Shall be: elf32, elf64, win32, win64, macho64
119 | %endif
120 |
121 | %ifdef _MERGED_BLD ; mirror the external _MERGED_BLD switch into _OWN_MERGED_BLD
122 | %assign _OWN_MERGED_BLD 1
123 | %endif ; _MERGED_BLD
124 |
125 | ; data compilation definitions: merged builds shall compile data only as
126 | ; part of one single object build to avoid multiple definition warnings at link time
127 | %ifndef _MERGED_BLD
128 | %assign __ARCH_DATA 1
129 | %else
130 | %if (__ARCH == __ARCH_G9) || (__ARCH32E == __ARCH32E_E9) ; in merged builds only the G9/E9 object defines __ARCH_DATA
131 | %assign __ARCH_DATA 1
132 | %endif
133 | %endif ; _MERGED_BLD
134 |
135 | ; Definitions of sizeof(type), in bytes; %iassign makes the names case-insensitive
136 | %iassign ZWORD_size 64 ; zmm-word
137 | %iassign YWORD_size 32 ; ymm-word
138 | %iassign OWORD_size 16 ; octo-word
139 | %iassign TWORD_size 10 ; ten-bytes word
140 | %iassign QWORD_size 8 ; quad-word
141 | %iassign DWORD_size 4 ; double-word
142 | %iassign WORD_size 2 ; word
143 | %iassign BYTE_size 1 ; byte
144 |
145 | %idefine YMMWORD YWORD
146 | %idefine XMMWORD OWORD
147 | %iassign YMMWORD_size YWORD_size
148 | %iassign XMMWORD_size OWORD_size
149 |
150 | %idefine sizeof(_x_) _x_%+_size
151 |
152 | %endif
153 |
--------------------------------------------------------------------------------
/mpn/asm/ia_common.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2014-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | %ifndef __IA_COMMON_INC__
18 | %define __IA_COMMON_INC__ 1
19 |
20 | ; Load NASM's smartalign package so ALIGN pads with multi-byte NOP sequences (and jumps over the padding once the threshold is reached); p6 mode, threshold 16
21 | %use smartalign
22 | ALIGNMODE p6,16
23 |
24 | ; DECLARE_FUNC name, visibility [, binding]: emits the function label (and a global declaration for PUBLIC functions), adds the __cdecl underscore decoration on macho64/win32, emits an ENDBR marker on elf/win formats and opens _DECLARE_FUNC_CTX_, which ENDFUNC must close.
25 | %macro DECLARE_FUNC 2-3.nolist
26 | %xdefine %%func_name %1
27 | %xdefine %%visibility %2
28 | %xdefine %%binding %3
29 |
30 | %ifctx _DECLARE_FUNC_CTX_
31 | %fatal "DECLARE_FUNC: already in the context, need to call ENDFUNC"
32 | %endif
33 |
34 | ; Accepted visibility values are PUBLIC and PRIVATE (compared case-insensitively)
35 | %ifnidni %%visibility, PUBLIC
36 | %ifnidni %%visibility, PRIVATE
37 | %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
38 | %endif
39 | %endif
40 |
41 | ; Accepted binding values are WEAK or STRONG (default)
42 | %ifnempty %%binding
43 | %ifnidni %%binding, WEAK
44 | %ifnidni %%binding, STRONG
45 | %fatal Function %%func_name binding is not properly defined. Shall be: WEAK or STRONG.
46 | %endif
47 | %endif
48 | %endif
49 |
50 | ; Function decoration length (number of characters prepended to the name)
51 | %assign %%decoration_length 0
52 |
53 | ; The __cdecl calling convention name decoration (to have interoperability with C).
54 | ; Only public functions are decorated
55 | %ifidni %%visibility, PUBLIC
56 | %if ((IPP_BINARY_FORMAT == 2) || (IPP_BINARY_FORMAT == 3)) ; macho64 or win32
57 | %xdefine %%func_name _%[%%func_name]
58 | %assign %%decoration_length %%decoration_length+1
59 | %endif
60 | %endif
61 |
62 | ; If current macro is called from IPPASM macro, then function might be decorated by CPU-prefix
63 | %ifctx _IPPASM_CTX_
64 | %assign %%decoration_length %%decoration_length + %$decoration_length ; %$decoration_length belongs to _IPPASM_CTX_
65 | %endif
66 |
67 | %push _DECLARE_FUNC_CTX_
68 | ; setup context variables to use in ENDFUNC
69 | %xdefine %$func_name_ctx %%func_name
70 | %assign %$decoration_length %%decoration_length ; %$decoration_length belongs to _DECLARE_FUNC_CTX_
71 |
72 | %ifidni %%visibility, PUBLIC ; case-insensitive, consistent with the validation and decoration checks
73 | %if (IPP_BINARY_FORMAT < 2) ; elf32 or elf64: export as a sized function symbol; the size runs up to the .LEnd_ label placed by ENDFUNC
74 | %ifnempty %%binding
75 | global %%func_name:function %%binding (%%func_name%+.LEnd_%+%%func_name - %%func_name)
76 | %else
77 | global %%func_name:function (%%func_name%+.LEnd_%+%%func_name - %%func_name)
78 | %endif
79 | %else
80 | global %%func_name
81 | %endif
82 | %endif
83 | %%func_name:
84 |
85 | ; CET enabling (macOS not supported)
86 | %if ((IPP_BINARY_FORMAT == 0) || (IPP_BINARY_FORMAT == 3)) ; elf32/win32
87 | db 0F3h, 00Fh, 01Eh, 0FBh ; endbr32
88 | %elif ((IPP_BINARY_FORMAT == 1) || (IPP_BINARY_FORMAT == 4)) ; elf64/win64
89 | db 0F3h, 00Fh, 01Eh, 0FAh ; endbr64
90 | %endif
91 | %endmacro
92 |
93 | ; CALL_FUNC name [, visibility]: emits a call to a function declared by DECLARE_FUNC, applying the same underscore decoration rule.
94 | ; Default visibility is PRIVATE (affects decoration)
95 | %macro CALL_FUNC 1-2.nolist PRIVATE
96 | %xdefine %%func_name %1
97 | %xdefine %%visibility %2
98 |
99 | ; Accepted visibility values are PUBLIC and PRIVATE (compared case-insensitively)
100 | %ifnidni %%visibility, PUBLIC
101 | %ifnidni %%visibility, PRIVATE
102 | %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
103 | %endif
104 | %endif
105 |
106 | ; __cdecl on macho64/win32 requires an underscore prefix decoration.
107 | ; Only PUBLIC functions are decorated.
108 | %ifidni %%visibility, PUBLIC
109 | %if ((IPP_BINARY_FORMAT == 2) || (IPP_BINARY_FORMAT == 3)) ; macho64 or win32
110 | %xdefine %%func_name _%1
111 | %endif
112 | %endif
113 |
114 | call %%func_name
115 | %endmacro
116 |
117 | ; IPPASM name [, visibility]: declares a function whose name is first passed through CPU_PREFIX_DECORATE (for the merged library), then handed to DECLARE_FUNC.
118 | ; Default visibility (if not defined) is PUBLIC.
119 | %macro IPPASM 1-2.nolist PUBLIC
120 | %xdefine %%func_name %1
121 | %xdefine %%visibility %2
122 |
123 | %ifctx _IPPASM_CTX_
124 | %fatal "IPPASM: already in the context, need to call ENDFUNC"
125 | %endif
126 | %push _IPPASM_CTX_ ; stays open until ENDFUNC pops it
127 |
128 | %push _CPU_PREFIX_DECORATE_CTX_ ; temporary context in which CPU_PREFIX_DECORATE returns its results
129 | CPU_PREFIX_DECORATE %%func_name
130 | %xdefine %%func_name %$decorated_func_name
131 | %assign %$$decoration_length %$decoration_length ; copy the prefix length one context down, into _IPPASM_CTX_
132 | %pop _CPU_PREFIX_DECORATE_CTX_
133 |
134 | DECLARE_FUNC %%func_name, %%visibility
135 | %endmacro
136 |
137 | ; CALL_IPPASM name [, visibility]: calls a function declared by IPPASM; the name gets the same CPU-prefix decoration before CALL_FUNC is used.
138 | ; Default visibility is PRIVATE (affects decoration)
139 | %macro CALL_IPPASM 1-2.nolist PRIVATE
140 | %xdefine %%func_name %1
141 | %xdefine %%visibility %2
142 |
143 | ; Accepted visibility values are PUBLIC and PRIVATE (compared case-insensitively)
144 | %ifnidni %%visibility, PUBLIC
145 | %ifnidni %%visibility, PRIVATE
146 | %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
147 | %endif
148 | %endif
149 |
150 | %push _CPU_PREFIX_DECORATE_CTX_ ; temporary context in which CPU_PREFIX_DECORATE returns its result
151 | CPU_PREFIX_DECORATE %%func_name
152 | %xdefine %%func_name %$decorated_func_name
153 | %pop _CPU_PREFIX_DECORATE_CTX_
154 |
155 | CALL_FUNC %%func_name,%%visibility
156 | %endmacro
157 |
158 | ; ENDFUNC name: closes a function opened by IPPASM or DECLARE_FUNC. It checks the name against the open context, places the .LEnd_ size label and pops the context(s).
159 | %macro ENDFUNC 1.nolist
160 | %xdefine %%func_name %1
161 | %ifnctx _DECLARE_FUNC_CTX_
162 | %fatal "Not in the context: _DECLARE_FUNC_CTX_"
163 | %endif
164 |
165 | ; Cross-check of context variable with macro parameter: the name may be given with or without its decoration
166 | %defstr %%func_name_str %%func_name
167 | %defstr %%func_name_ctx_str %$func_name_ctx
168 | %substr %%func_name_ctx_str_not_decorated %%func_name_ctx_str %[%$decoration_length+1],-1 ; remove decoration (first X symbols)
169 | %ifnidn %%func_name_str,%%func_name_ctx_str
170 | %ifnidn %%func_name_str,%%func_name_ctx_str_not_decorated
171 | %fatal ENDFUNC: function name [%%func_name] does not match context: [%$func_name_ctx]
172 | %endif
173 | %endif
174 |
175 | ; Add local label to be able to calculate function size
176 | ; Take function name from the context (real declaration name)
177 | .LEnd_%+%$func_name_ctx:
178 | %pop _DECLARE_FUNC_CTX_
179 |
180 | %ifctx _IPPASM_CTX_ ; the function was opened through IPPASM: close its context as well
181 | %pop _IPPASM_CTX_
182 | %endif
183 | %endmacro
184 |
185 | %endif
186 |
--------------------------------------------------------------------------------
/mpn/asm/ia_emm.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2014-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | %include "asmdefs.inc"
18 | %include "ia_common.inc"
19 | %include "utils.inc"
20 |
21 | ; Decorates function name with appropriate CPU prefix (for the merged library).
22 | ; The macro is context-dependent and returns decorated name in the %$decorated_func_name
23 | ; context variable and the prefix length in %$decoration_length; the caller must have pushed _CPU_PREFIX_DECORATE_CTX_.
24 | %macro CPU_PREFIX_DECORATE 1.nolist
25 | %ifnctx _CPU_PREFIX_DECORATE_CTX_
26 | %fatal "Not in the context: _CPU_PREFIX_DECORATE_CTX_"
27 | %endif
28 |
29 | ; Add CPU-specific prefix for the dispatched library (two letters plus underscore, hence length 3)
30 | %ifdef _OWN_MERGED_BLD
31 | %if (__ARCH == __ARCH_PX)
32 | %xdefine %%func_name px_%1
33 | %assign %%decoration_length 3
34 | %endif
35 | %if (__ARCH == __ARCH_W7)
36 | %xdefine %%func_name w7_%1
37 | %assign %%decoration_length 3
38 | %endif
39 | %if (__ARCH == __ARCH_V8)
40 | %xdefine %%func_name v8_%1
41 | %assign %%decoration_length 3
42 | %endif
43 | %if (__ARCH == __ARCH_S8)
44 | %xdefine %%func_name s8_%1
45 | %assign %%decoration_length 3
46 | %endif
47 | %if (__ARCH == __ARCH_P8)
48 | %xdefine %%func_name p8_%1
49 | %assign %%decoration_length 3
50 | %endif
51 | %if (__ARCH == __ARCH_G9)
52 | %xdefine %%func_name g9_%1
53 | %assign %%decoration_length 3
54 | %endif
55 | %if (__ARCH == __ARCH_H9)
56 | %xdefine %%func_name h9_%1
57 | %assign %%decoration_length 3
58 | %endif
59 | %else ; not a merged build: the name is used as is
60 | %xdefine %%func_name %1
61 | %assign %%decoration_length 0
62 | %endif
63 |
64 | %ifndef %%func_name ; no branch above matched __ARCH (only PX, W7, V8, S8, P8, G9 and H9 are handled)
65 | %fatal "CPU_PREFIX_DECORATE: unknown decoration for: __ARCH = " __ARCH
66 | %endif
67 | %xdefine %$decorated_func_name %[%%func_name]
68 | %assign %$decoration_length %%decoration_length
69 | %endmacro
70 |
71 | %define NONVOLATILE_REGS_32_GPR ebp,ebx,esi,edi
72 |
73 | ; Saves non-volatile GPR registers on stack.
74 | ; Input - list of used registers. Sets LOCAL_FRAME (0), GPR_FRAME (bytes pushed), GPR_CUR (registers pushed) and ARG_1; all are global preprocessor symbols.
75 | %macro USES_GPR 1+.nolist
76 | %assign LOCAL_FRAME 0
77 | %assign GPR_FRAME 0
78 | %define GPR_CUR
79 |
80 | BEGIN_INTERSECT
81 | INTERSECT {%1},{%[NONVOLATILE_REGS_32_GPR]}
82 | ; List of non-volatile GPR registers in the order they will be pushed on stack
83 | %xdefine GPR_CUR %$intersection
84 | %assign GPR_FRAME %$cardinality * 4
85 | END_INTERSECT
86 |
87 | ; Push non-volatile GPRs on stack
88 | FOREACH GPR_CUR,{push}
89 |
90 | ; Set up offset of arguments from ESP (saved registers plus 4 bytes, presumably the return address - confirm)
91 | %assign ARG_1 %[GPR_FRAME + 4]
92 | %endmacro
93 |
94 | ; Restores the non-volatile GPR registers previously saved on the stack by USES_GPR.
95 | ; The macro shall be called after function processing; it is a fatal error when GPR_CUR is not defined.
96 | %macro REST_GPR 0.nolist
97 | %ifndef GPR_CUR
98 | %fatal "REST_GPR: no GPR_CUR defined"
99 | %endif
100 | ; Pop saved GPRs from the stack (RFOREACH: presumably walks GPR_CUR in reverse push order - confirm in utils.inc)
101 | RFOREACH GPR_CUR,{pop}
102 | %endmacro
103 |
104 | %macro LD_ADDR 2.nolist ; LD_ADDR reg, addr: loads the address of addr into reg
105 | %xdefine %%reg %1
106 | %xdefine %%addr %2
107 |
108 | %ifdef IPP_PIC ; position-independent variant: no absolute address is encoded
109 | call %%LABEL ; pushes the address of the local label as the return address
110 | %%LABEL: pop %%reg ; reg = run-time address of the local label
111 | sub %%reg, %%LABEL-%%addr ; subtract the assemble-time distance from addr to the label, leaving reg = addr
112 | %else
113 | lea %%reg, [%%addr]
114 | %endif
115 | %endmacro
116 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/bn_usqrschool.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2010-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; BNU squaring support
21 | ;
22 | ;
23 |
24 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
25 | ;;
26 | ;; MULx1 general-case squarer macros
27 | ;;
28 |
29 | ;; dst = src * B epilogue (srcLen=4*n+3). Args: rDst, rSrc, update_idx (instruction text expanded verbatim), B, T0..T3 (T3 is not used here). NOTE(review): the leading mul/add imply rax holds a source limb and T0 the pending carry limb on entry - confirm against callers.
30 | %macro sMULx1_4N_3_ELOG 8.nolist
31 | %xdefine %%rDst %1
32 | %xdefine %%rSrc %2
33 | %xdefine %%update_idx %3
34 | %xdefine %%B %4
35 | %xdefine %%T0 %5
36 | %xdefine %%T1 %6
37 | %xdefine %%T2 %7
38 | %xdefine %%T3 %8
39 |
40 | mul %%B ; rdx:rax = rax * B
41 | xor %%T1, %%T1
42 | add %%T0, rax ; add low half to the pending limb
43 | mov qword [%%rDst+sizeof(qword)], %%T0 ; mov leaves CF untouched for the adc below
44 | mov rax, qword [%%rSrc+sizeof(qword)*2] ; next source limb
45 | adc %%T1, rdx ; T1 = high half + carry
46 |
47 | mul %%B ; rdx:rax = rax * B
48 | xor %%T2, %%T2
49 | add %%T1, rax
50 | mov qword [%%rDst+sizeof(qword)*2], %%T1
51 | mov rax, qword [%%rSrc+sizeof(qword)*3] ; last source limb
52 | adc %%T2, rdx ; T2 = high half + carry
53 |
54 | mul %%B ; rdx:rax = rax * B
55 | %%update_idx
56 | add %%T2, rax
57 | mov qword [%%rDst+sizeof(qword)*3], %%T2
58 | ;mov rax, qword [rSrc+idx*sizeof(qword)]
59 | adc rdx, 0 ; fold the final carry into the top limb
60 |
61 | mov qword [%%rDst+sizeof(qword)*4], rdx ; store the top limb
62 | add %%rDst, sizeof(qword) ; advance the destination pointer by one limb
63 | %endmacro
64 |
65 | ;; dst = src * B epilogue (srcLen=4*n+1). Same argument list as the 4*n+3 variant; only rDst, update_idx, B and T0 are used. NOTE(review): rax is expected to hold the last source limb and T0 the pending carry limb on entry - confirm against callers.
66 | %macro sMULx1_4N_1_ELOG 8.nolist
67 | %xdefine %%rDst %1
68 | %xdefine %%rSrc %2
69 | %xdefine %%update_idx %3
70 | %xdefine %%B %4
71 | %xdefine %%T0 %5
72 | %xdefine %%T1 %6
73 | %xdefine %%T2 %7
74 | %xdefine %%T3 %8
75 |
76 | mul %%B ; rdx:rax = rax * B
77 | %%update_idx
78 | add %%T0, rax ; add low half to the pending limb
79 | mov qword [%%rDst+sizeof(qword)*3], %%T0 ; mov leaves CF untouched for the adc below
80 | ;mov rax, qword [rSrc+idx*sizeof(qword)]
81 | adc rdx, 0 ; fold the carry into the top limb
82 |
83 | mov qword [%%rDst+sizeof(qword)*4], rdx ; store the top limb
84 | add %%rDst, sizeof(qword) ; advance the destination pointer by one limb
85 | %endmacro
86 |
87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
88 | ;;
89 | ;; MULx2 general-case multiplier macros
90 | ;;
91 |
92 | ;; dst = src * {B1:B0} epilogue (srcLen=4*n+1). Args: rDst, rSrc, update_idx (instruction text expanded verbatim), B0, B1, T0..T3; rSrc, B0 and T3 are not used here. NOTE(review): rax is expected to hold a[lenA-1] and T0/T1/T2 the pending limbs on entry - confirm against callers.
93 | %macro sMULx2_4N_1_ELOG 9.nolist
94 | %xdefine %%rDst %1
95 | %xdefine %%rSrc %2
96 | %xdefine %%update_idx %3
97 | %xdefine %%B0 %4
98 | %xdefine %%B1 %5
99 | %xdefine %%T0 %6
100 | %xdefine %%T1 %7
101 | %xdefine %%T2 %8
102 | %xdefine %%T3 %9
103 |
104 | mul %%B1 ; {T2:T1} += a[lenA-1]*B1
105 | ;add rDst, sizeof(qword)*2
106 | %%update_idx
107 | mov qword [%%rDst+sizeof(qword)*3], %%T0 ; T0 is final: store it
108 | add %%T1, rax ; low half of the product
109 | ;mov rax, qword [rSrc+idx*sizeof(qword)]
110 | adc rdx, %%T2 ; high half + T2 + carry
111 |
112 | mov qword [%%rDst+sizeof(qword)*4], %%T1
113 | mov qword [%%rDst+sizeof(qword)*5], rdx ; top limb
114 | %endmacro
115 |
116 | ;; dst = src * {B1:B0} epilogue (srcLen=4*n+3). Args: rDst, rSrc, update_idx (instruction text expanded verbatim), B0, B1, T0..T3. NOTE(review): rax is expected to hold a[lenA-3] and T0/T1/T2 the pending limbs on entry - confirm against callers.
117 | %macro sMULx2_4N_3_ELOG 9.nolist
118 | %xdefine %%rDst %1
119 | %xdefine %%rSrc %2
120 | %xdefine %%update_idx %3
121 | %xdefine %%B0 %4
122 | %xdefine %%B1 %5
123 | %xdefine %%T0 %6
124 | %xdefine %%T1 %7
125 | %xdefine %%T2 %8
126 | %xdefine %%T3 %9
127 |
128 | mul %%B1 ; {T2:T1} += a[lenA-3]*B1
129 | xor %%T3, %%T3
130 | add %%T1, rax
131 | mov rax, qword [%%rSrc+sizeof(qword)*2] ; a[lenA-2]
132 | adc %%T2, rdx
133 |
134 | mul %%B0 ; {T3:T2:T1} += a[LenA-2]*B0
135 | mov qword [%%rDst+sizeof(qword)], %%T0 ; T0 is final: store it
136 | add %%T1, rax
137 | mov rax, qword [%%rSrc+sizeof(qword)*2] ; a[lenA-2]
138 | adc %%T2, rdx
139 | adc %%T3, 0
140 |
141 | mul %%B1 ; {T3:T2} += a[lenA-2]*B1
142 | xor %%T0, %%T0 ; T0 is reused as the next carry limb
143 | add %%T2, rax
144 | mov rax, qword [%%rSrc+sizeof(qword)*3] ; a[lenA-1]
145 | adc %%T3, rdx
146 |
147 | mul %%B0 ; {T0:T3:T2} += a[lenA-1]*B0
148 | mov qword [%%rDst+sizeof(qword)*2], %%T1 ; T1 is final: store it
149 | add %%T2, rax
150 | mov rax, qword [%%rSrc+sizeof(qword)*3] ; a[lenA-1]
151 | adc %%T3, rdx
152 | adc %%T0, 0
153 |
154 | mul %%B1 ; {T0:T3} += a[lenA-1]*B1
155 | ;add rDst, sizeof(qword)*2
156 | %%update_idx
157 | mov qword [%%rDst+sizeof(qword)*3], %%T2 ; T2 is final: store it
158 | add %%T3, rax
159 | ;mov rax, qword [rSrc+idx*sizeof(qword)]
160 | adc rdx, %%T0 ; high half + T0 + carry
161 |
162 | mov qword [%%rDst+sizeof(qword)*4], %%T3
163 | mov qword [%%rDst+sizeof(qword)*5], rdx ; top limb
164 | %endmacro
165 |
166 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
167 | ;;
168 | ;; MLAx2 general-case multiplier macros
169 | ;;
170 |
171 | ;; Prologue of the multiply-accumulate-by-two-limbs pass. Args: rDst, rSrc, B0, B1, T0..T3 (T3 is not used here).
172 | ;; B0 = rSrc[-2]
173 | ;; B1 = rSrc[-1]
174 | ;; inp_vector = rSrc
175 | ;; out_vector = rDst
176 | %macro sMLAx2_PLOG 8.nolist
177 | %xdefine %%rDst %1
178 | %xdefine %%rSrc %2
179 | %xdefine %%B0 %3
180 | %xdefine %%B1 %4
181 | %xdefine %%T0 %5
182 | %xdefine %%T1 %6
183 | %xdefine %%T2 %7
184 | %xdefine %%T3 %8
185 |
186 | mov %%B0, qword [%%rSrc-2*sizeof(qword)] ; preload a[-2]
187 | mov %%B1, qword [%%rSrc-sizeof(qword)] ; and a[i-1]
188 |
189 | mov rax, %%B1
190 | mul %%B0 ; a[-2]*a[i-1]
191 | xor %%T0, %%T0
192 |
193 | add qword [%%rDst-sizeof(qword)], rax ; accumulate the low half into rDst[-1]
194 | mov rax, qword [%%rSrc] ; a[i]
195 | adc %%T0, rdx ; T0 = high half + carry
196 |
197 | mul %%B0 ; B0*a[i]
198 | xor %%T1, %%T1
199 | xor %%T2, %%T2
200 | add %%T0, rax
201 | mov rax, qword [%%rSrc] ; a[i], reloaded so rax is ready for the next multiply
202 | adc %%T1, rdx ; leaves {T1:T0} pending, T2 = 0
203 | %endmacro
204 |
205 | ;; dst += src * {B1:B0} epilogue (srcLen=4*n+1). Args: rDst, rSrc, update_idx (instruction text expanded verbatim), B0, B1, T0..T3; rSrc, B0 and T3 are not used here. NOTE(review): rax is expected to hold a[lenA-1] and T0/T1/T2 the pending limbs on entry - confirm against callers.
206 | %macro sMLAx2_4N_1_ELOG 9.nolist
207 | %xdefine %%rDst %1
208 | %xdefine %%rSrc %2
209 | %xdefine %%update_idx %3
210 | %xdefine %%B0 %4
211 | %xdefine %%B1 %5
212 | %xdefine %%T0 %6
213 | %xdefine %%T1 %7
214 | %xdefine %%T2 %8
215 | %xdefine %%T3 %9
216 |
217 | mul %%B1 ; {T2:T1} += a[lenA-1]*B1 + r[lenA-1]
218 | ;add rDst, sizeof(qword)*2
219 | %%update_idx
220 | add %%T0, qword [%%rDst+sizeof(qword)*3] ; accumulate the existing destination limb
221 | mov qword [%%rDst+sizeof(qword)*3], %%T0 ; mov leaves CF untouched for the adc chain below
222 | adc %%T1, rax
223 | adc rdx, %%T2
224 | ;mov rax, qword [rSrc+idx*sizeof(qword)]
225 |
226 | mov qword [%%rDst+sizeof(qword)*4], %%T1
227 | mov qword [%%rDst+sizeof(qword)*5], rdx ; top limb
228 | %endmacro
229 |
230 | ;; dst += src * {B1:B0} epilogue (srcLen=4*n+3). Args: rDst, rSrc, update_idx (instruction text expanded verbatim), B0, B1, T0..T3. NOTE(review): rax is expected to hold a[lenA-3] and T0/T1/T2 the pending limbs on entry - confirm against callers.
231 | %macro sMLAx2_4N_3_ELOG 9.nolist
232 | %xdefine %%rDst %1
233 | %xdefine %%rSrc %2
234 | %xdefine %%update_idx %3
235 | %xdefine %%B0 %4
236 | %xdefine %%B1 %5
237 | %xdefine %%T0 %6
238 | %xdefine %%T1 %7
239 | %xdefine %%T2 %8
240 | %xdefine %%T3 %9
241 |
242 | mul %%B1 ; {T2:T1} += a[lenA-3]*B1
243 | xor %%T3, %%T3
244 | add %%T1, rax
245 | mov rax, qword [%%rSrc+sizeof(qword)*2] ; a[lenA-2]
246 | adc %%T2, rdx
247 |
248 | mul %%B0 ; {T3:T2:T1} += a[LenA-2]*B0 + r[len-3]
249 | add %%T0, qword [%%rDst+sizeof(qword)] ; accumulate the existing destination limb
250 | mov qword [%%rDst+sizeof(qword)], %%T0 ; mov leaves CF untouched for the adc chain below
251 | adc %%T1, rax
252 | adc %%T2, rdx
253 | adc %%T3, 0
254 | mov rax, qword [%%rSrc+sizeof(qword)*2] ; a[lenA-2]
255 |
256 | mul %%B1 ; {T3:T2} += a[lenA-2]*B1
257 | xor %%T0, %%T0 ; T0 is reused as the next carry limb
258 | add %%T2, rax
259 | adc %%T3, rdx
260 | mov rax, qword [%%rSrc+sizeof(qword)*3] ; a[lenA-1]
261 |
262 | mul %%B0 ; {T0:T3:T2} += a[lenA-1]*B0 + r[lenA-2]
263 | add %%T1, qword [%%rDst+sizeof(qword)*2] ; accumulate the existing destination limb
264 | mov qword [%%rDst+sizeof(qword)*2], %%T1
265 | adc %%T2, rax
266 | adc %%T3, rdx
267 | adc %%T0, 0
268 | mov rax, qword [%%rSrc+sizeof(qword)*3] ; a[lenA-1]
269 |
270 | mul %%B1 ; {T0:T3} += a[lenA-1]*B1 + r[lenA-1]
271 | ;add rDst, sizeof(qword)*2
272 | %%update_idx
273 | add %%T2, qword [%%rDst+sizeof(qword)*3] ; accumulate the existing destination limb
274 | mov qword [%%rDst+sizeof(qword)*3], %%T2
275 | adc %%T3, rax
276 | adc rdx, %%T0
277 | ;mov rax, qword [rSrc+idx*sizeof(qword)]
278 |
279 | mov qword [%%rDst+sizeof(qword)*4], %%T3
280 | mov qword [%%rDst+sizeof(qword)*5], rdx ; top limb
281 | %endmacro
282 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/clear_regs.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | %ifndef _CLEAR_REGS_ASM_
18 | %define _CLEAR_REGS_ASM_
19 |
20 | %include "os.inc"
21 |
22 | ;
23 | ; Zeroes every general-purpose register given in the argument list (1 to 16 registers)
24 | ;
25 | %macro clear_gps 1-16
26 | ; one pass per supplied register; the rotate brings the next one into first position
27 | %rep %0
28 | xor %1, %1
29 | %rotate 1
30 | %endrep
31 | %endmacro
32 |
33 | ;
34 | ; Zeroes every XMM register given in the argument list using the SSE pxor form (1 to 16 registers)
35 | ;
36 | %macro clear_xmms_sse 1-16
37 | ; one pass per supplied register; the rotate brings the next one into first position
38 | %rep %0
39 | pxor %1, %1
40 | %rotate 1
41 | %endrep
42 | %endmacro
43 |
44 | ;
45 | ; Zeroes every XMM register given in the argument list using the AVX vpxor form (1 to 16 registers)
46 | ;
47 | %macro clear_xmms_avx 1-16
48 | ; one pass per supplied register; the rotate brings the next one into first position
49 | %rep %0
50 | vpxor %1, %1
51 | %rotate 1
52 | %endrep
53 | %endmacro
54 |
55 | ;
56 | ; Zeroes every YMM register given in the argument list (1 to 16 registers)
57 | ;
58 | %macro clear_ymms 1-16
59 | ; one pass per supplied register; the rotate brings the next one into first position
60 | %rep %0
61 | vpxor %1, %1
62 | %rotate 1
63 | %endrep
64 | %endmacro
65 |
66 | ;
67 | ; Zeroes every ZMM register given in the argument list (1 to 32 registers)
68 | ;
69 | %macro clear_zmms 1-32
70 | ; one pass per supplied register; the rotate brings the next one into first position
71 | %rep %0
72 | vpxorq %1, %1
73 | %rotate 1
74 | %endrep
75 | %endmacro
76 |
77 | ;
78 | ; This macro clears all scratch GP registers
79 | ; for Windows or Linux: rax, rcx, rdx, r8-r11 always; rdi and rsi only when LINUX is defined
80 | ;
81 | %macro clear_scratch_gps_asm 0
82 | clear_gps rax, rcx, rdx, r8, r9, r10, r11
83 | %ifdef LINUX
84 | clear_gps rdi, rsi
85 | %endif
86 | %endmacro
87 |
88 | ;
89 | ; This macro clears all scratch XMM registers on SSE (XMM0-XMM15 when LINUX is defined, XMM0-XMM5 otherwise). It overwrites the global preprocessor variable i.
90 | ;
91 | %macro clear_scratch_xmms_sse_asm 0
92 | %ifdef LINUX
93 | %assign i 0
94 | %rep 16
95 | pxor xmm %+ i, xmm %+ i
96 | %assign i (i+1)
97 | %endrep
98 | ; On Windows, XMM0-XMM5 registers are scratch registers
99 | %else
100 | %assign i 0
101 | %rep 6
102 | pxor xmm %+ i, xmm %+ i
103 | %assign i (i+1)
104 | %endrep
105 | %endif ; LINUX
106 | %endmacro
107 |
108 | ;
109 | ; This macro clears all scratch XMM registers on AVX (a single vzeroall when LINUX is defined, XMM0-XMM5 otherwise). The non-LINUX path overwrites the global preprocessor variable i.
110 | ;
111 | %macro clear_scratch_xmms_avx_asm 0
112 | %ifdef LINUX
113 | vzeroall
114 | ; On Windows, XMM0-XMM5 registers are scratch registers
115 | %else
116 | %assign i 0
117 | %rep 6
118 | vpxor xmm %+ i, xmm %+ i
119 | %assign i (i+1)
120 | %endrep
121 | %endif ; LINUX
122 | %endmacro
123 |
124 | ;
125 | ; This macro clears all scratch YMM registers
126 | ;
127 | ; It should be called before restoring the XMM registers
128 | ; for Windows (XMM6-XMM15). The non-LINUX path overwrites the global preprocessor variable i.
129 | ;
130 | %macro clear_scratch_ymms_asm 0
131 | ; On Linux, all YMM registers are scratch registers
132 | %ifdef LINUX
133 | vzeroall
134 | ; On Windows, YMM0-YMM5 registers are scratch registers.
135 | ; YMM6-YMM15 upper 128 bits are scratch registers too, but
136 | ; the lower 128 bits are to be restored after calling this macro,
137 | ; which clears the upper bits too.
138 | %else
139 | %assign i 0
140 | %rep 6
141 | vpxor ymm %+ i, ymm %+ i
142 | %assign i (i+1)
143 | %endrep
144 | %endif ; LINUX
145 | %endmacro
146 |
147 | ;
148 | ; This macro clears all scratch ZMM registers
149 | ;
150 | ; It should be called before restoring the XMM registers
151 | ; for Windows (XMM6-XMM15). YMM registers are used
152 | ; on purpose, since XOR'ing YMM registers is faster
153 | ; than XOR'ing ZMM registers, and the operation clears
154 | ; also the upper 256 bits. It overwrites the global preprocessor variable i.
155 | ;
156 | %macro clear_scratch_zmms_asm 0
157 | ; On Linux, all ZMM registers are scratch registers
158 | %ifdef LINUX
159 | vzeroall
160 | ;; vzeroall only clears the first 16 ZMM registers, so registers 16-31 are cleared one by one
161 | %assign i 16
162 | %rep 16
163 | vpxorq ymm %+ i, ymm %+ i
164 | %assign i (i+1)
165 | %endrep
166 | ; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers.
167 | ; ZMM6-ZMM15 upper 384 bits are scratch registers too, but
168 | ; the lower 128 bits are to be restored after calling this macro,
169 | ; which clears the upper bits too.
170 | %else
171 | %assign i 0
172 | %rep 6
173 | vpxorq ymm %+ i, ymm %+ i
174 | %assign i (i+1)
175 | %endrep
176 |
177 | %assign i 16
178 | %rep 16
179 | vpxorq ymm %+ i, ymm %+ i
180 | %assign i (i+1)
181 | %endrep
182 | %endif ; LINUX
183 | %endmacro
184 |
185 | %endif ;; _CLEAR_REGS_ASM_
186 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/cpinitas.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2014-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | %include "asmdefs.inc"
18 | %include "ia_32e.inc"
19 |
20 | %assign LOCAL_ALIGN_FACTOR 32
21 |
22 | %ifdef __ARCH_DATA
23 |
24 | segment .text align=LOCAL_ALIGN_FACTOR
25 |
26 | ;####################################################################
27 | ;# void cpGetReg( int* buf, int valueEAX, int valueECX ); #
28 | ;####################################################################
29 |
30 | %ifdef WIN32E ; argument registers of cpGetReg(buf, valueEAX, valueECX) when WIN32E is defined
31 | %define buf rcx
32 | %define valueEAX edx
33 | %define valueECX r8d
34 | %else ; argument registers for every other target
35 | %define buf rdi
36 | %define valueEAX esi
37 | %define valueECX edx
38 | %endif
39 |
40 | align LOCAL_ALIGN_FACTOR ; cpGetReg: runs cpuid with eax=valueEAX, ecx=valueECX and stores eax, ebx, ecx, edx to buf[0..3]
41 | DECLARE_FUNC cpGetReg,PUBLIC
42 | push rbx ; cpuid overwrites rbx, so it is saved and restored here
43 | movsxd r9, valueEAX ; copy the arguments away first: rcx and edx are both argument and cpuid registers
44 | movsxd r10, valueECX
45 | mov r11, buf
46 |
47 | mov rax, r9 ; cpuid leaf
48 | mov rcx, r10 ; cpuid sub-leaf
49 | xor ebx, ebx
50 | xor edx, edx
51 | cpuid
52 | mov [r11], eax
53 | mov [r11 + 4], ebx
54 | mov [r11 + 8], ecx
55 | mov [r11 + 12], edx
56 | pop rbx
57 | ret
58 | ENDFUNC cpGetReg
59 |
60 | ;###################################################
61 |
62 | ; OSXSAVE support, feature information after cpuid(1), ECX, bit 27 ( XGETBV is enabled by OS )
63 | %assign XSAVEXGETBV_FLAG 8000000h
64 |
65 | ; Feature information after XGETBV(ECX=0), EAX, bits 2,1 ( XMM state and YMM state are enabled by OS )
66 | %assign XGETBV_MASK 06h
67 | ; Feature information after XGETBV(ECX=0), EAX, bits 7,6,5 (presumably the three AVX-512 state components - confirm against the Intel SDM)
68 | %assign XGETBV_AVX512_MASK 0E0h
69 |
70 | align LOCAL_ALIGN_FACTOR ; cp_is_avx_extension: returns eax = 1 when both checks below pass, otherwise eax = 0
71 | DECLARE_FUNC cp_is_avx_extension,PUBLIC
72 | push rbx ; cpuid overwrites rbx, so it is saved and restored here
73 | mov eax, 1
74 | cpuid ; leaf 1: feature flags in ecx
75 | xor eax, eax ; default result: not supported
76 | and ecx, 018000000h ; keep bits 27 and 28 (bit 27 is XSAVEXGETBV_FLAG; bit 28 is presumably the AVX feature flag - confirm against the Intel SDM)
77 | cmp ecx, 018000000h ; both must be set
78 | jne .not_avx
79 | xor ecx, ecx ; XGETBV with ecx = 0
80 | db 00fh,001h,0d0h ; xgetbv
81 | mov ecx, eax
82 | xor eax, eax
83 | and ecx, XGETBV_MASK ; XMM and YMM state must both be enabled by the OS
84 | cmp ecx, XGETBV_MASK
85 | jne .not_avx
86 | mov eax, 1
87 | .not_avx:
88 | pop rbx
89 | ret
90 | ENDFUNC cp_is_avx_extension
91 |
92 | align LOCAL_ALIGN_FACTOR ; cp_is_avx512_extension: returns eax = 1 when both checks below pass, otherwise eax = 0
93 | DECLARE_FUNC cp_is_avx512_extension,PUBLIC
94 | push rbx ; cpuid overwrites rbx, so it is saved and restored here
95 | mov eax, 1
96 | cpuid ; leaf 1: feature flags in ecx
97 | xor eax, eax ; default result: not supported
98 | and ecx, XSAVEXGETBV_FLAG ; XGETBV must be enabled by the OS
99 | cmp ecx, XSAVEXGETBV_FLAG
100 | jne .not_avx512
101 | xor ecx, ecx ; XGETBV with ecx = 0
102 | db 00fh,001h,0d0h ; xgetbv
103 | mov ecx, eax
104 | xor eax, eax
105 | and ecx, XGETBV_AVX512_MASK ; all three bits of the mask must be set
106 | cmp ecx, XGETBV_AVX512_MASK
107 | jne .not_avx512
108 | mov eax, 1
109 | .not_avx512:
110 | pop rbx
111 | ret
112 | ENDFUNC cp_is_avx512_extension
113 | ; cp_issue_avx512_instruction: executes one hand-encoded AVX-512 instruction and returns eax = 0.
114 | align LOCAL_ALIGN_FACTOR
115 | DECLARE_FUNC cp_issue_avx512_instruction,PUBLIC
116 | db 062h,0f1h,07dh,048h,0efh,0c0h ; vpxord zmm0, zmm0, zmm0
117 | xor eax, eax ; return value 0
118 | ret
119 | ENDFUNC cp_issue_avx512_instruction
120 |
121 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
122 | ; cp_get_pentium_counter: returns the rdtsc result as one 64-bit value in rax.
123 | align LOCAL_ALIGN_FACTOR
124 | DECLARE_FUNC cp_get_pentium_counter,PUBLIC
125 | rdtsc
126 | sal rdx,32 ; move the high half (edx) into bits 63..32
127 | or rax,rdx ; rax = (edx << 32) | eax
128 | ret
129 | ENDFUNC cp_get_pentium_counter
130 | ; cpStartTscp: executes cpuid (eax = 0), then returns the rdtscp counter as a 64-bit value in rax.
131 | align LOCAL_ALIGN_FACTOR
132 | DECLARE_FUNC cpStartTscp,PUBLIC
133 | push rbx ; cpuid overwrites rbx
134 | xor rax, rax
135 | cpuid ; presumably used as a serializing barrier before the measurement - confirm
136 | pop rbx
137 | rdtscp
138 | sal rdx,32 ; move the high half (edx) into bits 63..32
139 | or rax,rdx ; rax = (edx << 32) | eax
140 | ret
141 | ENDFUNC cpStartTscp
142 | ; cpStopTscp: reads the rdtscp counter first, then executes cpuid (eax = 0); returns the counter in rax.
143 | align LOCAL_ALIGN_FACTOR
144 | DECLARE_FUNC cpStopTscp,PUBLIC
145 | rdtscp
146 | sal rdx,32 ; move the high half (edx) into bits 63..32
147 | or rax,rdx ; rax = (edx << 32) | eax
148 | push rax ; keep the counter value while cpuid overwrites rax
149 | push rbx ; cpuid overwrites rbx
150 | xor rax, rax
151 | cpuid ; presumably used as a serializing barrier after the measurement - confirm
152 | pop rbx
153 | pop rax ; rax = counter value read above
154 | ret
155 | ENDFUNC cpStopTscp
156 | ; cpStartTsc: executes cpuid (eax = 0), then returns the rdtsc counter as a 64-bit value in rax.
157 | align LOCAL_ALIGN_FACTOR
158 | DECLARE_FUNC cpStartTsc,PUBLIC
159 | push rbx ; cpuid overwrites rbx
160 | xor rax, rax
161 | cpuid ; presumably used as a serializing barrier before the measurement - confirm
162 | pop rbx
163 | rdtsc
164 | sal rdx,32 ; move the high half (edx) into bits 63..32
165 | or rax,rdx ; rax = (edx << 32) | eax
166 | ret
167 | ENDFUNC cpStartTsc
168 | ; cpStopTsc: reads the rdtsc counter first, then executes cpuid (eax = 0); returns the counter in rax.
169 | align LOCAL_ALIGN_FACTOR
170 | DECLARE_FUNC cpStopTsc,PUBLIC
171 | rdtsc
172 | sal rdx,32 ; move the high half (edx) into bits 63..32
173 | or rax,rdx ; rax = (edx << 32) | eax
174 | push rax ; keep the counter value while cpuid overwrites rax
175 | push rbx ; cpuid overwrites rbx
176 | xor rax, rax
177 | cpuid ; presumably used as a serializing barrier after the measurement - confirm
178 | pop rbx
179 | pop rax ; rax = counter value read above
180 | ret
181 | ENDFUNC cpStopTsc
182 |
183 |
184 | ;*****************************************
185 | ; int cpGetCacheSize( int* tableCache );
186 | align LOCAL_ALIGN_FACTOR
187 | %define table rdi
188 | DECLARE_FUNC cpGetCacheSize,PUBLIC
189 | %assign LOCAL_FRAME 16
190 | USES_GPR rsi, rdi, rbx, rbp
191 | USES_XMM
192 | COMP_ABI 1
193 | ; Stores the cpuid(2) register bytes in a scratch area, then walks tableCache (8-byte entries: dword key, dword value; a zero key ends the table) and returns the value of the first entry whose key low byte matches a stored byte, or -1.
194 | mov rbp, rsp ; rbp = write cursor into the scratch area read back below through [rsp + rcx]
195 | xor esi, esi ; esi = number of stored bytes that will be scanned
196 |
197 | mov eax, 2
198 | cpuid
199 |
200 | cmp al, 1 ; anything other than al == 1 is reported as -1
201 | jne .GetCacheSize_11
202 | ; a register whose bit 31 is set is zeroed, i.e. ignored
203 | test eax, 080000000h
204 | jz .GetCacheSize_00
205 | xor eax, eax
206 | .GetCacheSize_00:
207 | test ebx, 080000000h
208 | jz .GetCacheSize_01
209 | xor ebx, ebx
210 | .GetCacheSize_01:
211 | test ecx, 080000000h
212 | jz .GetCacheSize_02
213 | xor ecx, ecx
214 | .GetCacheSize_02:
215 | test edx, 080000000h
216 | jz .GetCacheSize_03
217 | xor edx, edx
218 | ; every non-zero register is appended to the scratch area
219 | .GetCacheSize_03:
220 | test eax, eax
221 | jz .GetCacheSize_04
222 | mov [rbp], eax
223 | add rbp, 4
224 | add esi, 3 ; only 3 bytes counted: the scan below never reads offset 0, which holds al here
225 | .GetCacheSize_04:
226 | test ebx, ebx
227 | jz .GetCacheSize_05
228 | mov [rbp], ebx
229 | add rbp, 4
230 | add esi, 4
231 | .GetCacheSize_05:
232 | test ecx, ecx
233 | jz .GetCacheSize_06
234 | mov [rbp], ecx
235 | add rbp, 4
236 | add esi, 4
237 | .GetCacheSize_06:
238 | test edx, edx
239 | jz .GetCacheSize_07
240 | mov [rbp], edx
241 | add esi, 4
242 | ; NOTE(review): if eax was skipped, offset 0 holds a byte of another register and the scan range [rsp+1 .. rsp+esi] is shifted by one - confirm whether that case can occur
243 | .GetCacheSize_07:
244 | test esi, esi ; nothing stored -> -1
245 | jz .GetCacheSize_11
246 | mov eax, -1 ; result if the table ends without a match
247 | .GetCacheSize_08:
248 | xor edx, edx
249 | add edx, [table] ; edx = key of the current entry; ZF set when it is zero
250 | jz .ExitGetCacheSize00 ; zero key: end of table
251 | add table, 8 ; step to the next entry; the current value is now at [table - 4]
252 | mov ecx, esi
253 | .GetCacheSize_09:
254 | cmp dl, BYTE [rsp + rcx] ; compare the key low byte with stored bytes esi..1
255 | je .GetCacheSize_10
256 | dec ecx
257 | jnz .GetCacheSize_09
258 | jmp .GetCacheSize_08
259 |
260 | .GetCacheSize_10:
261 | mov eax, [table - 4] ; matching entry: return its value dword
262 |
263 | .ExitGetCacheSize00:
264 | REST_XMM
265 | REST_GPR
266 | ret
267 |
268 | .GetCacheSize_11:
269 | mov eax, -1
270 | jmp .ExitGetCacheSize00
271 | ENDFUNC cpGetCacheSize
272 |
273 | ;****************************
274 |
275 | %endif ; __ARCH_DATA
276 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/emulator.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2009-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: EM64T Cryptography Primitive.
20 | ;
21 | ;
22 | ;
23 |
24 | %ifndef _EMULATOR_INC_
25 | %define _EMULATOR_INC_
26 | ; my_pclmulqdq dst, src, op: emits pclmulqdq, or a call to emu_pclmulqdq when my_emulator is non-zero.
27 | %macro my_pclmulqdq 3.nolist
28 | %xdefine %%xxDst %1
29 | %xdefine %%xxSrc %2
30 | %xdefine %%xxOp %3
31 |
32 | %if (my_emulator == 0)
33 | pclmulqdq %%xxDst, %%xxSrc, %%xxOp
34 | %else
35 | ;;
36 | ;; rsp
37 | ;; registers
38 | ;; +00 => xxDst
39 | ;; +16 => xxSrc
40 | ;; the flags and all general-purpose registers are saved around the emulator call
41 | pushf
42 | push rax
43 | push rbx
44 | push rcx
45 | push rdx
46 | push rdi
47 | push rsi
48 | push rbp
49 | push r8
50 | push r9
51 | push r10
52 | push r11
53 | push r12
54 | push r13
55 | push r14
56 | push r15
57 |
58 | %assign %%stackSize (sizeof(oword)*2)
59 | sub rsp,%%stackSize
60 |
61 | movdqu oword [rsp+00], %%xxDst ;; save Dst
62 | movdqu oword [rsp+16], %%xxSrc ;; save Src
63 | ;; arguments go in rcx, rdx, r8 (NOTE(review): Windows x64 register order - confirm for other ABIs)
64 | lea rcx, [rsp+00] ;; pointer to the Dst copy
65 | lea rdx, [rsp+16] ;; pointer to the Src copy
66 | mov r8, %%xxOp ;; the third macro operand
67 |
68 | sub rsp, (sizeof(qword)*3)
69 | call emu_pclmulqdq
70 | add rsp, (sizeof(qword)*3)
71 |
72 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
73 | ;movdqu xxSrc, oword [rsp+16] ;; return Src
74 | add rsp, %%stackSize ;; must be the 64-bit register: writing esp zero-extends and clears the upper half of rsp
75 |
76 | pop r15
77 | pop r14
78 | pop r13
79 | pop r12
80 | pop r11
81 | pop r10
82 | pop r9
83 | pop r8
84 | pop rbp
85 | pop rsi
86 | pop rdi
87 | pop rdx
88 | pop rcx
89 | pop rbx
90 | pop rax
91 | popf
92 | %endif
93 | %endmacro
94 | ; my_aesenc dst, src: emits aesenc, or a call to emu_aesenc when my_emulator is non-zero.
95 | %macro my_aesenc 2.nolist
96 | %xdefine %%xxDst %1
97 | %xdefine %%xxSrc %2
98 |
99 | %if (my_emulator == 0)
100 | aesenc %%xxDst, %%xxSrc
101 | %else
102 | pushf ;; the flags and all general-purpose registers are saved around the emulator call
103 | push rax
104 | push rbx
105 | push rcx
106 | push rdx
107 | push rdi
108 | push rsi
109 | push rbp
110 | push r8
111 | push r9
112 | push r10
113 | push r11
114 | push r12
115 | push r13
116 | push r14
117 | push r15
118 |
119 | %assign %%stackSize (sizeof(oword)*2)
120 | sub rsp,%%stackSize
121 |
122 | movdqu oword [rsp+00], %%xxDst ;; save Dst
123 | movdqu oword [rsp+16], %%xxSrc ;; save Src
124 | ;; arguments go in rcx, rdx (NOTE(review): Windows x64 register order - confirm for other ABIs)
125 | lea rcx, [rsp+00] ;; pointer to the Dst copy
126 | lea rdx, [rsp+16] ;; pointer to the Src copy
127 |
128 | sub rsp, (sizeof(qword)*2)
129 | call emu_aesenc
130 | add rsp, (sizeof(qword)*2)
131 |
132 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
133 | add rsp, %%stackSize ;; must be the 64-bit register: writing esp zero-extends and clears the upper half of rsp
134 |
135 | pop r15
136 | pop r14
137 | pop r13
138 | pop r12
139 | pop r11
140 | pop r10
141 | pop r9
142 | pop r8
143 | pop rbp
144 | pop rsi
145 | pop rdi
146 | pop rdx
147 | pop rcx
148 | pop rbx
149 | pop rax
150 | popf
151 | %endif
152 | %endmacro
153 | ; my_aesenclast dst, src: emits aesenclast, or a call to emu_aesenclast when my_emulator is non-zero.
154 | %macro my_aesenclast 2.nolist
155 | %xdefine %%xxDst %1
156 | %xdefine %%xxSrc %2
157 |
158 | %if (my_emulator == 0)
159 | aesenclast %%xxDst, %%xxSrc
160 | %else
161 | pushf ;; the flags and all general-purpose registers are saved around the emulator call
162 | push rax
163 | push rbx
164 | push rcx
165 | push rdx
166 | push rdi
167 | push rsi
168 | push rbp
169 | push r8
170 | push r9
171 | push r10
172 | push r11
173 | push r12
174 | push r13
175 | push r14
176 | push r15
177 |
178 | %assign %%stackSize (sizeof(oword)*2)
179 | sub rsp,%%stackSize
180 |
181 | movdqu oword [rsp+00], %%xxDst ;; save Dst
182 | movdqu oword [rsp+16], %%xxSrc ;; save Src
183 | ;; arguments go in rcx, rdx (NOTE(review): Windows x64 register order - confirm for other ABIs)
184 | lea rcx, [rsp+00] ;; pointer to the Dst copy
185 | lea rdx, [rsp+16] ;; pointer to the Src copy
186 |
187 | sub rsp, (sizeof(qword)*2)
188 | call emu_aesenclast
189 | add rsp, (sizeof(qword)*2)
190 |
191 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
192 | add rsp, %%stackSize ;; must be the 64-bit register: writing esp zero-extends and clears the upper half of rsp
193 |
194 | pop r15
195 | pop r14
196 | pop r13
197 | pop r12
198 | pop r11
199 | pop r10
200 | pop r9
201 | pop r8
202 | pop rbp
203 | pop rsi
204 | pop rdi
205 | pop rdx
206 | pop rcx
207 | pop rbx
208 | pop rax
209 | popf
210 | %endif
211 | %endmacro
212 | ; my_aesdec dst, src: emits aesdec, or a call to emu_aesdec when my_emulator is non-zero.
213 | %macro my_aesdec 2.nolist
214 | %xdefine %%xxDst %1
215 | %xdefine %%xxSrc %2
216 |
217 | %if (my_emulator == 0)
218 | aesdec %%xxDst, %%xxSrc
219 | %else
220 | pushf ;; the flags and all general-purpose registers are saved around the emulator call
221 | push rax
222 | push rbx
223 | push rcx
224 | push rdx
225 | push rdi
226 | push rsi
227 | push rbp
228 | push r8
229 | push r9
230 | push r10
231 | push r11
232 | push r12
233 | push r13
234 | push r14
235 | push r15
236 |
237 | %assign %%stackSize (sizeof(oword)*2)
238 | sub rsp,%%stackSize
239 |
240 | movdqu oword [rsp+00], %%xxDst ;; save Dst
241 | movdqu oword [rsp+16], %%xxSrc ;; save Src
242 | ;; arguments go in rcx, rdx (NOTE(review): Windows x64 register order - confirm for other ABIs)
243 | lea rcx, [rsp+00] ;; pointer to the Dst copy
244 | lea rdx, [rsp+16] ;; pointer to the Src copy
245 |
246 | sub rsp, (sizeof(qword)*2)
247 | call emu_aesdec
248 | add rsp, (sizeof(qword)*2)
249 |
250 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
251 | add rsp, %%stackSize ;; must be the 64-bit register: writing esp zero-extends and clears the upper half of rsp
252 |
253 | pop r15
254 | pop r14
255 | pop r13
256 | pop r12
257 | pop r11
258 | pop r10
259 | pop r9
260 | pop r8
261 | pop rbp
262 | pop rsi
263 | pop rdi
264 | pop rdx
265 | pop rcx
266 | pop rbx
267 | pop rax
268 | popf
269 | %endif
270 | %endmacro
271 | ; my_aesdeclast dst, src: emits aesdeclast, or a call to emu_aesdeclast when my_emulator is non-zero.
272 | %macro my_aesdeclast 2.nolist
273 | %xdefine %%xxDst %1
274 | %xdefine %%xxSrc %2
275 |
276 | %if (my_emulator == 0)
277 | aesdeclast %%xxDst, %%xxSrc ;; native path must be the decrypt-last instruction, matching emu_aesdeclast below
278 | %else
279 | pushf ;; the flags and all general-purpose registers are saved around the emulator call
280 | push rax
281 | push rbx
282 | push rcx
283 | push rdx
284 | push rdi
285 | push rsi
286 | push rbp
287 | push r8
288 | push r9
289 | push r10
290 | push r11
291 | push r12
292 | push r13
293 | push r14
294 | push r15
295 |
296 | %assign %%stackSize (sizeof(oword)*2)
297 | sub rsp,%%stackSize
298 |
299 | movdqu oword [rsp+00], %%xxDst ;; save Dst
300 | movdqu oword [rsp+16], %%xxSrc ;; save Src
301 | ;; arguments go in rcx, rdx (NOTE(review): Windows x64 register order - confirm for other ABIs)
302 | lea rcx, [rsp+00] ;; pointer to the Dst copy
303 | lea rdx, [rsp+16] ;; pointer to the Src copy
304 |
305 | sub rsp, (sizeof(qword)*2)
306 | call emu_aesdeclast
307 | add rsp, (sizeof(qword)*2)
308 |
309 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
310 | add rsp, %%stackSize ;; must be the 64-bit register: writing esp zero-extends and clears the upper half of rsp
311 |
312 | pop r15
313 | pop r14
314 | pop r13
315 | pop r12
316 | pop r11
317 | pop r10
318 | pop r9
319 | pop r8
320 | pop rbp
321 | pop rsi
322 | pop rdi
323 | pop rdx
324 | pop rcx
325 | pop rbx
326 | pop rax
327 | popf
328 | %endif
329 | %endmacro
330 | ; External emulation routines called by the my_* macros; declared only when my_emulator is non-zero.
331 | %if (my_emulator != 0)
332 | extern emu_pclmulqdq
333 | extern emu_aesenc
334 | extern emu_aesenclast
335 | extern emu_aesdec
336 | extern emu_aesdeclast
337 | %endif
338 |
339 | %endif
340 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/ia_32e_regs.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2012-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: EM64T Cryptography Primitive.
20 | ;
21 | ;
22 | ;
23 |
24 | ;;
25 | ;; Register aliases that give the general-purpose registers uniform names
26 | ;;
27 |
28 | %ifndef _IA_32_REGS_INC_
29 | %define _IA_32_REGS_INC_
30 |
31 | %define r0 rax ;; 64-bits GPRs
32 | %define r1 rbx
33 | %define r2 rcx
34 | %define r3 rdx
35 | %define r4 rdi
36 | %define r5 rsi
37 | %define r6 rbp
38 | %define r7 rsp
39 |
40 | %define r0d eax ;; 32-bits GPRs
41 | %define r1d ebx
42 | %define r2d ecx
43 | %define r3d edx
44 | %define r4d edi
45 | %define r5d esi
46 | %define r6d ebp
47 | %define r7d esp
48 |
49 | %define raxd eax ;; 32-bit halves addressed by the 64-bit register name plus a "d" suffix
50 | %define rbxd ebx
51 | %define rcxd ecx
52 | %define rdxd edx
53 | %define rdid edi
54 | %define rsid esi
55 | %define rbpd ebp
56 |
57 | %define r0w ax ;; 16-bits GPRs
58 | %define r1w bx
59 | %define r2w cx
60 | %define r3w dx
61 | %define r4w di
62 | %define r5w si
63 | %define r6w bp
64 | %define r7w sp
65 |
66 | %define raxw ax ;; 16-bit parts addressed by the 64-bit register name plus a "w" suffix
67 | %define rbxw bx
68 | %define rcxw cx
69 | %define rdxw dx
70 | %define rdiw di
71 | %define rsiw si
72 | %define rbpw bp
73 |
74 | %define r0b al ;; 8-bits GPRs
75 | %define r1b bl
76 | %define r2b cl
77 | %define r3b dl
78 | %define r4b dil
79 | %define r5b sil
80 | %define r6b bpl
81 | %define r7b spl
82 |
83 | %define raxb al ;; low bytes addressed by the 64-bit register name plus a "b" suffix
84 | %define rbxb bl
85 | %define rcxb cl
86 | %define rdxb dl
87 | %define rdib dil
88 | %define rsib sil
89 | %define rbpb bpl
90 |
91 | %define raxbl al ;; "bl" suffix = low byte, "bh" suffix = second byte (ah/bh/ch/dh)
92 | %define rbxbl bl
93 | %define rcxbl cl
94 | %define rdxbl dl
95 | %define raxbh ah
96 | %define rbxbh bh
97 | %define rcxbh ch
98 | %define rdxbh dh
99 |
100 | ;;
101 | ;; Register Parameters (depend on used OS)
102 | ;;
103 | %ifdef WIN32E
104 | %define rpar1 rcx
105 | %define rpar2 rdx
106 | %define rpar3 r8
107 | %define rpar4 r9
108 | %define rpar5 [rsp + ARG_5] ;; 5th and 6th parameters are read from the stack; ARG_5/ARG_6 are defined outside this file
109 | %define rpar6 [rsp + ARG_6]
110 | %endif
111 |
112 | %ifdef LINUX32E
113 | %define rpar1 rdi
114 | %define rpar2 rsi
115 | %define rpar3 rdx
116 | %define rpar4 rcx
117 | %define rpar5 r8
118 | %define rpar6 r9
119 | %endif
120 | ;; NOTE(review): rpar1..rpar6 stay undefined when neither WIN32E nor LINUX32E is defined
121 | ;; use GPR implementation everywhere possible
122 | %assign GPR_version 1
123 |
124 | %endif
125 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_uadd_m7as.asm:
--------------------------------------------------------------------------------
1 | %include "asmdefs.inc"
2 | %include "ia_32e.inc"
3 |
4 | ;
5 | ; carry, r[:n] = a[:n] + b[:n]
6 | ; uint64_t mpn_add_vectorized(uint64_t *r, const uint64_t *a, const uint64_t *b, unsigned int n)
7 | ; Returns the carry out of the top limb (0 or 1) in rax.
8 | ; Lengths up to 8 are unrolled; longer inputs use a 4-limb loop plus a 1..3 limb tail. Any n below 2 takes the one-limb path.
9 | segment .text align=ARCH_ALIGN_FACTOR
10 |
11 | align ARCH_ALIGN_FACTOR
12 | IPPASM mpn_add_vectorized,PUBLIC
13 | %assign LOCAL_FRAME 0
14 | USES_GPR rsi,rdi
15 | USES_XMM
16 | COMP_ABI 4
17 |
18 | ; rdi = r
19 | ; rsi = a
20 | ; rdx = b
21 | ; rcx = n
22 |
23 | movsxd rcx, ecx ; sign-extend the 32-bit length (NOTE(review): the prototype declares n unsigned)
24 | xor rax, rax ; rax = 0
25 |
26 | cmp rcx, 2
27 | jge .ADD_GE2
28 |
29 | ;********** lenSrcA == 1 *************************************
30 | add rax, rax ; rax is 0 here, so this only clears CF before the adc chain
31 | mov r8, qword [rsi] ; rsi = a
32 | adc r8, qword [rdx] ; r8 = a+b = s
33 | mov qword [rdi], r8 ; save s
34 | sbb rax, rax ; rax = -CF (0 or -1); negated at .FINAL
35 | jmp .FINAL
36 |
37 | ;********** lenSrcA == 1 END ********************************
38 |
39 | .ADD_GE2:
40 | jg .ADD_GT2
41 |
42 | ;********** lenSrcA == 2 *************************************
43 | add rax, rax ; clear CF
44 | mov r8, qword [rsi] ; r8 = a0
45 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
46 | mov r9, qword [rsi+8] ; r9 = a1
47 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
48 | mov qword [rdi], r8 ; save s0
49 | mov qword [rdi+8], r9 ; save s1
50 | sbb rax, rax ; rax = carry
51 | jmp .FINAL
52 |
53 | ;********** lenSrcA == 2 END *********************************
54 |
55 | .ADD_GT2:
56 | cmp rcx, 4
57 | jge .ADD_GE4
58 |
59 | ;********** lenSrcA == 3 *************************************
60 | add rax, rax ; clear CF
61 | mov r8, qword [rsi] ; r8 = a0
62 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
63 | mov r9, qword [rsi+8] ; r9 = a1
64 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
65 | mov r10, qword [rsi+16] ; r10 = a2
66 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
67 | mov qword [rdi], r8 ; save s0
68 | mov qword [rdi+8], r9 ; save s1
69 | mov qword [rdi+16], r10 ; save s2
70 | sbb rax, rax ; rax = carry
71 | jmp .FINAL
72 |
73 | ;********** lenSrcA == 3 END *********************************
74 |
75 | .ADD_GE4:
76 | jg .ADD_GT4
77 |
78 | ;********** lenSrcA == 4 *************************************
79 | add rax, rax ; clear CF
80 | mov r8, qword [rsi] ; r8 = a0
81 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
82 | mov r9, qword [rsi+8] ; r9 = a1
83 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
84 | mov r10, qword [rsi+16] ; r10 = a2
85 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
86 | mov r11, qword [rsi+24] ; r11 = a3
87 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
88 | mov qword [rdi], r8 ; save s0
89 | mov qword [rdi+8], r9 ; save s1
90 | mov qword [rdi+16], r10 ; save s2
91 | mov qword [rdi+24], r11 ; save s3
92 | sbb rax, rax ; rax = carry
93 | jmp .FINAL
94 |
95 | ;********** lenSrcA == 4 END *********************************
96 |
97 | .ADD_GT4:
98 | cmp rcx, 6
99 | jge .ADD_GE6
100 |
101 | ;********** lenSrcA == 5 *************************************
102 | add rax, rax ; clear CF
103 | mov r8, qword [rsi] ; r8 = a0
104 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
105 | mov r9, qword [rsi+8] ; r9 = a1
106 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
107 | mov r10, qword [rsi+16] ; r10 = a2
108 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
109 | mov r11, qword [rsi+24] ; r11 = a3
110 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
111 | mov rcx, qword [rsi+32] ; rcx = a4
112 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
113 | mov qword [rdi], r8 ; save s0
114 | mov qword [rdi+8], r9 ; save s1
115 | mov qword [rdi+16], r10 ; save s2
116 | mov qword [rdi+24], r11 ; save s3
117 | mov qword [rdi+32], rcx ; save s4
118 | sbb rax, rax ; rax = carry
119 | jmp .FINAL
120 |
121 | ;********** lenSrcA == 5 END *********************************
122 |
123 | .ADD_GE6:
124 | jg .ADD_GT6
125 |
126 | ;********** lenSrcA == 6 *************************************
127 | add rax, rax ; clear CF
128 | mov r8, qword [rsi] ; r8 = a0
129 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
130 | mov r9, qword [rsi+8] ; r9 = a1
131 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
132 | mov r10, qword [rsi+16] ; r10 = a2
133 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
134 | mov r11, qword [rsi+24] ; r11 = a3
135 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
136 | mov rcx, qword [rsi+32] ; rcx = a4
137 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
138 | mov rsi, qword [rsi+40] ; rsi = a5
139 | adc rsi, qword [rdx+40] ; rsi = a5+b5 = s5
140 | mov qword [rdi], r8 ; save s0
141 | mov qword [rdi+8], r9 ; save s1
142 | mov qword [rdi+16], r10 ; save s2
143 | mov qword [rdi+24], r11 ; save s3
144 | mov qword [rdi+32], rcx ; save s4
145 | mov qword [rdi+40], rsi ; save s5
146 | sbb rax, rax ; rax = carry
147 | jmp .FINAL
148 |
149 | ;********** lenSrcA == 6 END *********************************
150 |
151 | .ADD_GT6:
152 | cmp rcx, 8
153 | jge .ADD_GE8
154 |
155 | .ADD_EQ7:
156 | ;********** lenSrcA == 7 *************************************
157 | add rax, rax ; clear CF
158 | mov r8, qword [rsi] ; r8 = a0
159 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
160 | mov r9, qword [rsi+8] ; r9 = a1
161 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
162 | mov r10, qword [rsi+16] ; r10 = a2
163 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
164 | mov r11, qword [rsi+24] ; r11 = a3
165 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
166 | mov rcx, qword [rsi+32] ; rcx = a4
167 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
168 | mov qword [rdi], r8 ; save s0
169 | mov r8, qword [rsi+40] ; r8 = a5
170 | adc r8, qword [rdx+40] ; r8 = a5+b5 = s5
171 | mov rsi, qword [rsi+48] ; rsi = a6
172 | adc rsi, qword [rdx+48] ; rsi = a6+b6 = s6
173 | mov qword [rdi+8], r9 ; save s1
174 | mov qword [rdi+16], r10 ; save s2
175 | mov qword [rdi+24], r11 ; save s3
176 | mov qword [rdi+32], rcx ; save s4
177 | mov qword [rdi+40], r8 ; save s5
178 | mov qword [rdi+48], rsi ; save s6
179 | sbb rax, rax ; rax = carry
180 | jmp .FINAL
181 |
182 | ;********** lenSrcA == 7 END *********************************
183 |
184 |
185 | .ADD_GE8:
186 | jg .ADD_GT8
187 |
188 | ;********** lenSrcA == 8 *************************************
189 | add rax, rax ; clear CF
190 | mov r8, qword [rsi] ; r8 = a0
191 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
192 | mov r9, qword [rsi+8] ; r9 = a1
193 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
194 | mov r10, qword [rsi+16] ; r10 = a2
195 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
196 | mov r11, qword [rsi+24] ; r11 = a3
197 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
198 | mov rcx, qword [rsi+32] ; rcx = a4
199 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
200 | mov qword [rdi], r8 ; save s0
201 | mov r8, qword [rsi+40] ; r8 = a5
202 | adc r8, qword [rdx+40] ; r8 = a5+b5 = s5
203 | mov qword [rdi+8], r9 ; save s1
204 | mov r9, qword [rsi+48] ; r9 = a6
205 | adc r9, qword [rdx+48] ; r9 = a6+b6 = s6
206 | mov rsi, qword [rsi+56] ; rsi = a7
207 | adc rsi, qword [rdx+56] ; rsi = a7+b7 = s7
208 | mov qword [rdi+16], r10 ; save s2
209 | mov qword [rdi+24], r11 ; save s3
210 | mov qword [rdi+32], rcx ; save s4
211 | mov qword [rdi+40], r8 ; save s5
212 | mov qword [rdi+48], r9 ; save s6
213 | mov qword [rdi+56], rsi ; save s7
214 | sbb rax, rax ; rax = carry
215 | jmp .FINAL
216 |
217 | ;********** lenSrcA == 8 END *********************************
218 |
219 |
220 | ;********** lenSrcA > 8 *************************************
221 |
222 | .ADD_GT8:
223 | mov r8, rax ; r8 = 0 (rax was zeroed on entry)
224 | mov rax, rcx ; rax = len
225 | and rcx, 3 ; rcx = len mod 4
226 | xor rcx, rax ; rcx = len with its two low bits cleared (limbs handled by the loop)
227 | lea rsi, [rsi+8*rcx] ; move all three pointers past the loop portion
228 | lea rdx, [rdx+8*rcx] ;
229 | lea rdi, [rdi+8*rcx] ;
230 | neg rcx ; negative limb index that counts up to 0
231 | add r8, r8 ; 0 + 0: clears CF for the first adc
232 | jmp .ADD_GLOOP
233 |
234 | align ARCH_ALIGN_FACTOR
235 | .ADD_GLOOP:
236 | mov r8, qword [rsi+8*rcx] ; r8 = a0
237 | mov r9, qword [rsi+8*rcx+8] ; r9 = a1
238 | mov r10, qword [rsi+8*rcx+16] ; r10 = a2
239 | mov r11, qword [rsi+8*rcx+24] ; r11 = a3
240 | adc r8, qword [rdx+8*rcx] ; r8 = a0+b0 = r0
241 | adc r9, qword [rdx+8*rcx+8] ; r9 = a1+b1 = r1
242 | adc r10, qword [rdx+8*rcx+16] ; r10 = a2+b2 = r2
243 | adc r11, qword [rdx+8*rcx+24] ; r11 = a3+b3 = r3
244 | mov qword [rdi+8*rcx], r8 ;
245 | mov qword [rdi+8*rcx+8], r9 ;
246 | mov qword [rdi+8*rcx+16], r10 ;
247 | mov qword [rdi+8*rcx+24], r11 ;
248 | lea rcx, [rcx+4] ; advance the index with lea so CF from the adc chain survives
249 | jrcxz .ADD_LLAST0 ; exit test on rcx that does not touch the flags
250 | jmp .ADD_GLOOP
251 |
252 | .ADD_LLAST0:
253 | sbb rcx, rcx ; rcx = -CF: carry out of the loop as a 0/-1 mask
254 | and rax, 3 ; rax = number of tail limbs (0..3)
255 | jz .FIN0
256 |
257 | .ADD_LLOOP:
258 | test rax, 2 ; two-limb step needed?
259 | jz .ADD_LLAST1
260 |
261 | add rcx, rcx ; turn the saved mask back into CF
262 | mov r8, qword [rsi] ; r8 = a0
263 | mov r9, qword [rsi+8] ; r9 = a1
264 | adc r8, qword [rdx] ; r8 = a0+b0 = r0
265 | adc r9, qword [rdx+8] ; r9 = a1+b1 = r1
266 | mov qword [rdi], r8 ;
267 | mov qword [rdi+8], r9 ;
268 | sbb rcx, rcx ; save the carry again as a mask
269 | test rax, 1 ; one more limb left?
270 | jz .FIN0
271 |
272 | add rsi, 16
273 | add rdx, 16
274 | add rdi, 16
275 |
276 | .ADD_LLAST1:
277 | add rcx, rcx ; turn the saved mask back into CF
278 | mov r8, qword [rsi] ; r8 = a0
279 | adc r8, qword [rdx] ; r8 = a0+b0 = r0
280 | mov qword [rdi], r8 ;
281 | sbb rcx, rcx ; final carry as a mask
282 |
283 | .FIN0:
284 | mov rax, rcx ; rax = carry mask (0 or -1)
285 |
286 | ;******************* .FINAL ***********************************************************
287 |
288 | .FINAL:
289 | neg rax ; mask (0 / -1) -> carry value (0 / 1)
290 | REST_XMM
291 | REST_GPR
292 | ret
293 | ENDFUNC mpn_add_vectorized
294 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_uinc_udec_m7as.asm:
--------------------------------------------------------------------------------
1 | %include "asmdefs.inc"
2 | %include "ia_32e.inc"
3 |
4 | %if (__ARCH32E >= __ARCH32E_M7)
5 |
6 | segment .text align=ARCH_ALIGN_FACTOR
7 |
8 | ;
9 | ; carry, r[:size] = a[:size] + w
10 | ; uint64_t mpn_inc_vectorized(uint64_t *r, const uint64_t *a, unsigned int size, uint64_t w)
11 | ; Returns the final carry (0 or 1) in rax. The first limb is always processed, so size is presumably required to be >= 1.
12 | align ARCH_ALIGN_FACTOR
13 | IPPASM mpn_inc_vectorized,PUBLIC
14 | %assign LOCAL_FRAME 0
15 | USES_GPR rsi,rdi
16 | USES_XMM
17 | COMP_ABI 4
18 |
19 | ; rdi = r
20 | ; rsi = a
21 | ; rdx = size
22 | ; rcx = w
23 |
24 | movsxd rdx, edx ; length, sign-extended (NOTE(review): the prototype declares size unsigned)
25 |
26 | mov r8, qword [rsi] ; r[0] = a[0] + w
27 | add r8, rcx
28 | mov qword [rdi], r8
29 | ; the lea instructions below leave CF from the add untouched
30 | lea rsi, [rsi+rdx*sizeof(qword)] ; rsi = end of a
31 | lea rdi, [rdi+rdx*sizeof(qword)] ; rdi = end of r
32 | lea rcx, [rdx*sizeof(qword)] ; rcx = length in bytes
33 |
34 | sbb rax, rax ; save cf
35 | neg rcx ; rcx = negative length (bytes)
36 | add rcx, sizeof(qword) ; skip limb 0, which is already done
37 | jrcxz .exit ; size == 1: nothing left
38 | add rax, rax ; restore cf
39 | jnc .copy ; no carry to propagate: the remaining limbs are copied unchanged
40 |
41 | align ARCH_ALIGN_FACTOR
42 | .inc_loop:
43 | mov r8, qword [rsi+rcx]
44 | adc r8, 0 ; add the incoming carry
45 | mov qword [rdi+rcx], r8
46 | lea rcx, [rcx+sizeof(qword)] ; advance without touching CF
47 | jrcxz .exit_loop ; all limbs done
48 | jnc .exit_loop ; carry absorbed: stop propagating
49 | jmp .inc_loop
50 | .exit_loop:
51 | sbb rax, rax ; save cf
52 |
53 | .copy:
54 | cmp rsi, rdi ; in-place call (r == a): nothing to copy
55 | jz .exit
56 | jrcxz .exit
57 | .copy_loop:
58 | mov r8, qword [rsi+rcx]
59 | mov qword [rdi+rcx], r8
60 | add rcx, sizeof(qword)
61 | jnz .copy_loop
62 |
63 | .exit:
64 | neg rax ; mask (0 / -1) -> carry value (0 / 1)
65 | REST_XMM
66 | REST_GPR
67 | ret
68 | ENDFUNC mpn_inc_vectorized
69 |
70 |
71 | ;
72 | ; borrow, r[:size] = a[:size] - w
73 | ; uint64_t mpn_dec_vectorized(uint64_t *r, const uint64_t *a, unsigned int size, uint64_t w)
74 | ; Returns the final borrow (0 or 1) in rax. The first limb is always processed, so size is presumably required to be >= 1.
75 |
76 | align ARCH_ALIGN_FACTOR
77 | IPPASM mpn_dec_vectorized,PUBLIC
78 | %assign LOCAL_FRAME 0
79 | USES_GPR rsi,rdi
80 | USES_XMM
81 | COMP_ABI 4
82 |
83 | ; rdi = r
84 | ; rsi = a
85 | ; rdx = size
86 | ; rcx = w
87 |
88 | movsxd rdx, edx ; length, sign-extended (NOTE(review): the prototype declares size unsigned)
89 |
90 | mov r8, qword [rsi] ; r[0] = a[0] - w
91 | sub r8, rcx
92 | mov qword [rdi], r8
93 | ; the lea instructions below leave CF from the sub untouched
94 | lea rsi, [rsi+rdx*sizeof(qword)] ; rsi = end of a
95 | lea rdi, [rdi+rdx*sizeof(qword)] ; rdi = end of r
96 | lea rcx, [rdx*sizeof(qword)] ; rcx = length in bytes
97 |
98 | sbb rax, rax ; save cf
99 | neg rcx ; rcx = negative length (bytes)
100 | add rcx, sizeof(qword) ; skip limb 0, which is already done
101 | jrcxz .exit ; size == 1: nothing left
102 | add rax, rax ; restore cf
103 | jnc .copy ; no borrow to propagate: the remaining limbs are copied unchanged
104 |
105 | align ARCH_ALIGN_FACTOR
106 | .inc_loop:
107 | mov r8, qword [rsi+rcx]
108 | sbb r8, 0 ; subtract the incoming borrow
109 | mov qword [rdi+rcx], r8
110 | lea rcx, [rcx+sizeof(qword)] ; advance without touching CF
111 | jrcxz .exit_loop ; all limbs done
112 | jnc .exit_loop ; borrow absorbed: stop propagating
113 | jmp .inc_loop
114 | .exit_loop:
115 | sbb rax, rax ; save cf
116 |
117 | .copy:
118 | cmp rsi, rdi ; in-place call (r == a): nothing to copy
119 | jz .exit
120 | jrcxz .exit
121 | .copy_loop:
122 | mov r8, qword [rsi+rcx]
123 | mov qword [rdi+rcx], r8
124 | add rcx, sizeof(qword)
125 | jnz .copy_loop
126 |
127 | .exit:
128 | neg rax ; mask (0 / -1) -> borrow value (0 / 1)
129 | REST_XMM
130 | REST_GPR
131 | ret
132 | ENDFUNC mpn_dec_vectorized
133 |
134 | %endif
135 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_acc_m7as.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; Big Number Operations
21 | ;
22 | ; Content:
23 | ; mpn_mul_acc() - multiply a vector by one digit and
24 | ; add the product into the destination
25 | ; (cpMulDgt_BNU() and cpSubMulDgt_BNU() are not
26 | ; defined in this file)
27 | ;
28 | ;
29 |
30 | %include "asmdefs.inc"
31 | %include "ia_32e.inc"
32 | %include "ia_32e_regs.inc"
33 | %include "bn_umulschool.inc"
34 |
35 | %if (__ARCH32E >= __ARCH32E_M7)
36 |
37 | segment .text align=ARCH_ALIGN_FACTOR
38 |
39 |
40 | ;*************************************************************
41 | ; uint64_t mpn_mul_acc(uint64_t* pDst,
42 | ; const uint64_t* pSrcA,
43 | ; int len,
44 | ; uint64_t B )
45 | ;*************************************************************
46 | align ARCH_ALIGN_FACTOR ; pDst[0..len-1] += pSrcA[0..len-1] * B; the word carried out of the top limb is returned in rax
47 | IPPASM mpn_mul_acc,PUBLIC
48 | %assign LOCAL_FRAME 0
49 | USES_GPR rbx,rsi,rdi,r11,r12
50 | USES_XMM
51 | COMP_ABI 4
52 |
53 | ; rdi = pDst
54 | ; rsi = pSrc
55 | ; rdx = len
56 | ; rcx = B
57 |
58 | %xdefine B0 rcx ; b
59 |
60 | %xdefine T0 r8 ; temporary
61 | %xdefine T1 r9
62 | %xdefine T2 r10
63 | %xdefine T3 r11
64 |
65 | %xdefine idx rbx ; index
66 | %xdefine rDst rdi
67 | %xdefine rSrc rsi
68 |
69 | mov edx, edx ; unsigned length
70 |
71 | mov rax, qword [rsi] ; rax = a[0]
72 | cmp rdx, 1
73 | jnz .general_case
74 | ; len == 1: single multiply-accumulate
75 | mul rcx ; rdx:rax = a[0]*B
76 | add qword [rdi], rax ; pDst[0] += low half
77 | adc rdx, 0 ; fold the carry into the high half
78 | mov rax, rdx ; return value
79 | REST_XMM
80 | REST_GPR
81 | ret
82 |
83 | .general_case:
84 | lea rSrc, [rSrc+rdx*sizeof(qword)-sizeof(qword)*5] ; bias both pointers so that [ptr + idx*8] with idx = 5-len addresses element 0
85 | lea rDst, [rDst+rdx*sizeof(qword)-sizeof(qword)*5]
86 | mov idx, dword 5
87 | sub idx, rdx ; negative counter -(len-5)
88 |
89 | mul rcx ; {T1:T0} = a[0]*B
90 | mov T0, rax
91 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)] ; rax = a[1]
92 | mov T1, rdx
93 |
94 | cmp idx, 0
95 | jge .skip_muladd_loop4 ; len <= 5: no full 4-limb iteration
96 |
97 | align ARCH_ALIGN_FACTOR
98 | .muladd_loop4:
99 | mul rcx ; a[4*i+1]*B
100 | xor T2, T2
101 | add qword [rDst+idx*sizeof(qword)], T0
102 | adc T1, rax
103 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
104 | adc T2, rdx
105 |
106 | mul rcx ; a[4*i+2]*B
107 | xor T3, T3
108 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
109 | adc T2, rax
110 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
111 | adc T3, rdx
112 |
113 | mul rcx ; a[4*i+3]*B
114 | xor T0, T0
115 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
116 | adc T3, rax
117 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3+sizeof(qword)] ; (see next line) 
118 | adc T0, rdx
119 |
120 | mul rcx ; a[4*i+4]*B
121 | xor T1, T1
122 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
123 | adc T0, rax
124 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*5]
125 | adc T1, rdx
126 |
127 | add idx, 4
128 | jnc .muladd_loop4 ; idx is negative while limbs remain; the add carries once it reaches 0..3
129 |
130 | .skip_muladd_loop4:
131 | mul rcx ; multiply the limb preloaded in rax
132 | xor T2, T2
133 | add qword [rDst+idx*sizeof(qword)], T0
134 | adc T1, rax
135 | adc T2, rdx
136 | ; idx (0..3) selects the tail; for idx below 2 the parity flag of the low byte of idx-2 (0FFh vs 0FEh) tells 1 from 0
137 | cmp idx, 2
138 | ja .fin_mul1x4n_2 ; idx=3
139 | jz .fin_mul1x4n_3 ; idx=2
140 | jp .fin_mul1x4n_4 ; idx=1
141 | ; .fin_mul1x4n_1 ; idx=0
142 |
143 | .fin_mul1x4n_1:
144 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
145 | mul rcx
146 | xor T3, T3
147 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
148 | adc T2, rax
149 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
150 | adc T3, rdx
151 |
152 | mul rcx
153 | xor T0, T0
154 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
155 | adc T3, rax
156 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*4]
157 | adc T0, rdx
158 |
159 | mul rcx
160 | xor T1, T1
161 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
162 | adc T0, rax
163 | adc rdx, 0
164 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*4], T0
165 | adc rdx, 0
166 | mov rax, rdx ; return the word carried out of the top limb
167 | jmp .exit
168 |
169 | .fin_mul1x4n_4:
170 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
171 | mul rcx
172 | xor T3, T3
173 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
174 | adc T2, rax
175 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
176 | adc T3, rdx
177 |
178 | mul rcx
179 | xor T0, T0
180 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
181 | adc T3, rax
182 | adc rdx, 0
183 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
184 | adc rdx, 0
185 | mov rax, rdx ; return the word carried out of the top limb
186 | jmp .exit
187 |
188 | .fin_mul1x4n_3:
189 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
190 | mul rcx
191 | xor T3, T3
192 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
193 | adc T2, rax
194 | adc rdx, 0
195 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
196 | adc rdx, 0
197 | mov rax, rdx ; return the word carried out of the top limb
198 | jmp .exit
199 |
200 | .fin_mul1x4n_2:
201 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
202 | adc T2, 0
203 | mov rax, T2 ; return the word carried out of the top limb
204 |
205 | .exit:
206 | REST_XMM
207 | REST_GPR
208 | ret
210 |
211 | %endif
212 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_m7as.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; Big Number Operations
21 | ;
22 | ; Content:
23 | ; mpn_mul()
24 | ;
25 | ;
26 |
27 | %include "asmdefs.inc"
28 | %include "ia_32e.inc"
29 | %include "bn_umulschool.inc"
30 | %include "variant.inc"
31 |
32 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_OFF_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_)
33 | %if (__ARCH32E >= __ARCH32E_M7) && (__ARCH32E < __ARCH32E_L9)
34 |
35 |
36 | segment .text align=ARCH_ALIGN_FACTOR
37 |
38 |
39 | ;*************************************************************
40 | ;* uint64_t mpn_mul(uint64_t* pR,
41 | ;* const uint64_t* pA, int aSize,
42 | ;* const uint64_t* pB, int bSize)
43 | ;* returns pR[aSize+bSize-1] (the most significant limb of the product)
44 | ;*
45 | ;*************************************************************
46 | align ARCH_ALIGN_FACTOR
47 | IPPASM mpn_mul,PUBLIC
48 | %assign LOCAL_FRAME (1*sizeof(qword))
49 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
50 | USES_XMM
51 | COMP_ABI 5
52 | ; Multi-limb unsigned multiply: equal short lengths use fixed-size multipliers, everything else the generic MULx/MLAx path.
53 | ; rdi = pDst
54 | ; rsi = pSrcA
55 | ; edx = lenA
56 | ; rcx = pSrcB
57 | ; r8d = lenB
58 |
59 | ;;
60 | ;; stack structure:
61 | ;;counterB = (0)
62 | ;;counterA = (counterB+sizeof(qword))
63 | %assign counterA (0) ; offset actually in effect: idx is saved at [rsp+counterA] below
64 |
65 |
66 | cmp edx, r8d ; compare lenA with lenB (signed)
67 | jl .general_case_mul_entry ; lenA < lenB: swap operands, then take the generic path
68 | jg .general_case_mul ; lenA > lenB: generic path
69 | %if (__ARCH32E < __ARCH32E_E9)
70 | cmp edx, 4
71 | %else
72 | cmp edx, 8
73 | %endif
74 | jg .general_case_mul ; equal lengths above the fixed-size limit (4 or 8)
75 |
76 | %if (__ARCH32E >= __ARCH32E_E9)
77 | cmp edx, 4
78 | jg .more_then_4 ; equal lengths 5..8 are dispatched separately
79 | %endif
80 |
81 | cmp edx, 3 ; one compare drives a 4-way dispatch (assumes 1 <= edx <= 4 here - TODO confirm callers never pass 0)
82 | ja .mul_4x4 ; edx > 3 (unsigned)
83 | jz .mul_3x3 ; edx == 3
84 | jp .mul_2x2 ; edx == 2: low byte of edx-3 is 0xFF, even parity, PF=1
85 | ; mul_1x1 (edx == 1: low byte of edx-3 is 0xFE, odd parity, falls through)
86 |
87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
88 | ;;
89 | ;; fixed-size multipliers (1-4)
90 | ;;
91 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
92 | align ARCH_ALIGN_FACTOR
93 | .mul_1x1:
94 | mov rax, qword [rsi] ; a[0]
95 | mul qword [rcx] ; rdx:rax = a[0]*b[0]
96 | mov qword [rdi], rax ; pDst[0] = low half
97 | mov qword [rdi+sizeof(qword)], rdx ; pDst[1] = high half
98 | mov rax, qword [rdi+sizeof(qword)*1] ; return the top limb pDst[1]
99 | REST_XMM
100 | REST_GPR
101 | ret
102 |
103 | align ARCH_ALIGN_FACTOR
104 | .mul_2x2:
105 | mov r8, [rcx] ; preload b[0..1] into r8,r9 before MUL_NxN (macro defined outside this file)
106 | mov r9, [rcx+sizeof(qword)*1]
107 | MUL_NxN 2, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
108 | mov rax, qword [rdi+sizeof(qword)*3] ; return the top limb pDst[3]
109 | REST_XMM
110 | REST_GPR
111 | ret
112 |
113 | align ARCH_ALIGN_FACTOR
114 | .mul_3x3:
115 | mov r8, [rcx] ; preload b[0..2]
116 | mov r9, [rcx+sizeof(qword)*1]
117 | mov r10,[rcx+sizeof(qword)*2]
118 | MUL_NxN 3, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
119 | mov rax, qword [rdi+sizeof(qword)*5] ; return the top limb pDst[5]
120 | REST_XMM
121 | REST_GPR
122 | ret
123 |
124 | align ARCH_ALIGN_FACTOR
125 | .mul_4x4:
126 | mov r8, [rcx] ; preload b[0..3]
127 | mov r9, [rcx+sizeof(qword)*1]
128 | mov r10,[rcx+sizeof(qword)*2]
129 | mov r11,[rcx+sizeof(qword)*3]
130 | MUL_NxN 4, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
131 | mov rax, qword [rdi+sizeof(qword)*7] ; return the top limb pDst[7]
132 | REST_XMM
133 | REST_GPR
134 | ret
135 |
136 | %if (__ARCH32E >= __ARCH32E_E9)
137 | .more_then_4:
138 | cmp edx, 7 ; same single-compare dispatch as above, for lengths 5..8
139 | ja .mul_8x8 ; edx == 8
140 | jz .mul_7x7 ; edx == 7
141 | jp .mul_6x6 ; edx == 6: low byte of edx-7 is 0xFF, PF=1
142 | ; mul_5x5 (edx == 5: low byte 0xFE, PF=0, falls through)
143 |
144 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
145 | ;;
146 | ;; fixed-size multipliers (5-8)
147 | ;;
148 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
149 | align ARCH_ALIGN_FACTOR
150 | .mul_5x5:
151 | mov r8, [rcx] ; preload b[0..4]
152 | mov r9, [rcx+sizeof(qword)*1]
153 | mov r10,[rcx+sizeof(qword)*2]
154 | mov r11,[rcx+sizeof(qword)*3]
155 | mov r12,[rcx+sizeof(qword)*4]
156 | MUL_NxN 5, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
157 | mov rax, qword [rdi+sizeof(qword)*9] ; return the top limb pDst[9]
158 | REST_XMM
159 | REST_GPR
160 | ret
161 |
162 | align ARCH_ALIGN_FACTOR
163 | .mul_6x6:
164 | mov r8, [rcx] ; preload b[0..5]
165 | mov r9, [rcx+sizeof(qword)*1]
166 | mov r10,[rcx+sizeof(qword)*2]
167 | mov r11,[rcx+sizeof(qword)*3]
168 | mov r12,[rcx+sizeof(qword)*4]
169 | mov r13,[rcx+sizeof(qword)*5]
170 | MUL_NxN 6, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
171 | mov rax, qword [rdi+sizeof(qword)*11] ; return the top limb pDst[11]
172 | REST_XMM
173 | REST_GPR
174 | ret
175 |
176 | align ARCH_ALIGN_FACTOR
177 | .mul_7x7:
178 | mov r8, [rcx] ; preload b[0..6]
179 | mov r9, [rcx+sizeof(qword)*1]
180 | mov r10,[rcx+sizeof(qword)*2]
181 | mov r11,[rcx+sizeof(qword)*3]
182 | mov r12,[rcx+sizeof(qword)*4]
183 | mov r13,[rcx+sizeof(qword)*5]
184 | mov r14,[rcx+sizeof(qword)*6]
185 | MUL_NxN 7, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
186 | mov rax, qword [rdi+sizeof(qword)*13] ; return the top limb pDst[13]
187 | REST_XMM
188 | REST_GPR
189 | ret
190 |
191 | align ARCH_ALIGN_FACTOR
192 | .mul_8x8:
193 | mov r8, [rcx] ; preload b[0..7]
194 | mov r9, [rcx+sizeof(qword)*1]
195 | mov r10,[rcx+sizeof(qword)*2]
196 | mov r11,[rcx+sizeof(qword)*3]
197 | mov r12,[rcx+sizeof(qword)*4]
198 | mov r13,[rcx+sizeof(qword)*5]
199 | mov r14,[rcx+sizeof(qword)*6]
200 | mov r15,[rcx+sizeof(qword)*7]
201 | MUL_NxN 8, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8
202 | mov rax, qword [rdi+sizeof(qword)*15] ; return the top limb pDst[15]
203 | REST_XMM
204 | REST_GPR
205 | ret
206 | %endif
207 |
208 |
209 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
210 | ;;
211 | ;; general case multiplier
212 | ;;
213 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
214 | align ARCH_ALIGN_FACTOR
215 | .general_case_mul_entry:
216 | ; lenA < lenB: exchange pointers and lengths (XOR swap) so that A becomes the longer operand
217 | xor rsi, rcx
218 | xor edx, r8d
219 | xor rcx, rsi
220 | xor r8d, edx
221 | xor rsi, rcx
222 | xor edx, r8d
223 |
224 | %xdefine B0 r10 ; b[i], b[i+1]
225 | %xdefine B1 r11
226 |
227 | %xdefine T0 r12 ; temporary
228 | %xdefine T1 r13
229 | %xdefine T2 r14
230 | %xdefine T3 r15
231 |
232 | %xdefine idx rbx ; index
233 | %xdefine rDst rdi
234 | %xdefine rSrc rsi
235 |
236 | align ARCH_ALIGN_FACTOR
237 | .general_case_mul:
238 | movsxd rdx, edx ; expand length
239 | movsxd r8, r8d
240 |
241 | lea rdi, [rdi+rdx*sizeof(qword)-sizeof(qword)*4] ; rdi = &R[lenA-4]
242 | lea rsi, [rsi+rdx*sizeof(qword)-sizeof(qword)*4] ; rsi = &A[lenA-4]
243 |
244 | mov idx, dword 4 ; idx = 4 - lenA, so [rsi+idx*8] addresses a[0]
245 | sub idx, rdx ; A-counter
246 | mov qword [rsp+counterA], idx ; keep the initial A-counter in the local frame
247 |
248 | mov rax, qword [rsi+idx*sizeof(qword)] ; a[0]
249 | mov B0, qword [rcx] ; b[0]
250 | test r8, 1 ; is lenB odd?
251 | jz .init_even_B
252 |
253 | ;********** lenSrcB = 2*n+ 1 (multiply only) *********************
254 | .init_odd_B:
255 | xor T0, T0
256 | cmp idx, 0
257 | jge .skip_mul1 ; idx >= 0: skip the MULx1 pass and go straight to the tail
258 |
259 | MULx1 rdi, rsi, idx, B0, T0, T1, T2, T3
260 |
261 | .skip_mul1:
262 | cmp idx, 2 ; single-compare dispatch on idx in 0..3 (PF distinguishes 1 from 0)
263 | ja .fin_mul1x4n_1 ; idx=3
264 | jz .fin_mul1x4n_2 ; idx=2
265 | jp .fin_mul1x4n_3 ; idx=1
266 | ; fin_mul1x4n_4 ; idx=0
267 |
268 | .fin_mul1x4n_4:
269 | MULx1_4N_4_ELOG rdi, rsi, B0, T0,T1,T2,T3
270 | add rcx, sizeof(qword) ; advance pB past the single limb just used
271 | add r8, 1 ; odd lenB: +1 so the "sub r8, 2" at .mla2x4n_4 leaves the remaining (even) count
272 | jmp .mla2x4n_4
273 | .fin_mul1x4n_3:
274 | MULx1_4N_3_ELOG rdi, rsi, B0, T0,T1,T2,T3
275 | add rcx, sizeof(qword) ; advance pB past the single limb just used
276 | add r8, 1 ; same count adjustment as in .fin_mul1x4n_4
277 | jmp .mla2x4n_3
278 | .fin_mul1x4n_2:
279 | MULx1_4N_2_ELOG rdi, rsi, B0, T0,T1,T2,T3
280 | add rcx, sizeof(qword) ; advance pB past the single limb just used
281 | add r8, 1 ; same count adjustment as in .fin_mul1x4n_4
282 | jmp .mla2x4n_2
283 | .fin_mul1x4n_1:
284 | MULx1_4N_1_ELOG rdi, rsi, B0, T0,T1,T2,T3
285 | add rcx, sizeof(qword) ; advance pB past the single limb just used
286 | add r8, 1 ; same count adjustment as in .fin_mul1x4n_4
287 | jmp .mla2x4n_1
288 |
289 |
290 | ;********** lenSrcB = 2*n (multiply only) ************************
291 | .init_even_B:
292 | mov rbp, rax ; keep a[0], mul overwrites rax
293 | mul B0 ; {T2:T1:T0} = a[0]*B0
294 | mov B1, qword [rcx+sizeof(qword)] ; b[1]
295 | xor T2, T2
296 | mov T0, rax
297 | mov rax, rbp ; restore a[0]
298 | mov T1, rdx
299 |
300 | cmp idx, 0
301 | jge .skip_mul_nx2 ; idx >= 0: skip the MULx2 pass and go straight to the tail
302 |
303 | MULx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3
304 |
305 | .skip_mul_nx2:
306 | cmp idx, 2 ; single-compare dispatch on idx in 0..3 (PF distinguishes 1 from 0)
307 | ja .fin_mul2x4n_1 ; idx=3
308 | jz .fin_mul2x4n_2 ; idx=2
309 | jp .fin_mul2x4n_3 ; idx=1
310 | ; fin_mul2x4n_4 ; idx=0
311 |
312 | .fin_mul2x4n_4:
313 | MULx2_4N_4_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
314 | add rcx, sizeof(qword)*2 ; advance pB past the two limbs just used
315 | align ARCH_ALIGN_FACTOR
316 | .mla2x4n_4:
317 | sub r8, 2 ; two more limbs of B accounted for
318 | jz .quit ; B exhausted
319 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3
320 | cmp idx, 0
321 | jz .skip_mla_x2 ; idx == 0: no MLAx2 body pass, only the epilogue
322 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3
323 | .skip_mla_x2:
324 | MLAx2_4N_4_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
325 | add rcx, sizeof(qword)*2 ; next pair of B limbs
326 | jmp .mla2x4n_4
327 |
328 | .fin_mul2x4n_3:
329 | MULx2_4N_3_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
330 | add rcx, sizeof(qword)*2 ; advance pB past the two limbs just used
331 | align ARCH_ALIGN_FACTOR
332 | .mla2x4n_3:
333 | sub r8, 2 ; two more limbs of B accounted for
334 | jz .quit ; B exhausted
335 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3
336 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3
337 | MLAx2_4N_3_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
338 | add rcx, sizeof(qword)*2 ; next pair of B limbs
339 | jmp .mla2x4n_3
340 |
341 | .fin_mul2x4n_2:
342 | MULx2_4N_2_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
343 | add rcx, sizeof(qword)*2 ; advance pB past the two limbs just used
344 | align ARCH_ALIGN_FACTOR
345 | .mla2x4n_2:
346 | sub r8, 2 ; two more limbs of B accounted for
347 | jz .quit ; B exhausted
348 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3
349 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3
350 | MLAx2_4N_2_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
351 | add rcx, sizeof(qword)*2 ; next pair of B limbs
352 | jmp .mla2x4n_2
353 |
354 | .fin_mul2x4n_1:
355 | MULx2_4N_1_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
356 | add rcx, sizeof(qword)*2 ; advance pB past the two limbs just used
357 | align ARCH_ALIGN_FACTOR
358 | .mla2x4n_1:
359 | sub r8, 2 ; two more limbs of B accounted for
360 | jz .quit ; B exhausted
361 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3
362 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3
363 | MLAx2_4N_1_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3
364 | add rcx, sizeof(qword)*2 ; next pair of B limbs
365 | jmp .mla2x4n_1
366 |
367 | .quit:
368 | mov rax, rdx ; return whatever the last *_ELOG macro left in rdx - presumably the top limb, TODO confirm
369 |
370 | REST_XMM
371 | REST_GPR
372 | ret
373 | ENDFUNC mpn_mul
374 |
375 | %endif
376 |
377 | %endif ;; _ADCOX_NI_ENABLING_
378 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_usqr_redc_srvl9.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; Big Number Multiplicative Operations
21 | ;
22 | ; Content:
23 | ; mpn_mul()
24 | ; mpn_sqr()
25 | ; mpn_montgomery_reduce_bin()
26 | ;
27 | ; Implementation is using mulx and adcx/adox instructions
28 | ;
29 | ;
30 |
31 | %include "asmdefs.inc"
32 | %include "ia_32e.inc"
33 | %include "variant.inc"
34 |
35 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_ON_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_)
36 | %if (__ARCH32E >= __ARCH32E_L9)
37 |
38 | %assign _xEMULATION_ 1
39 |
40 | segment .text align=ARCH_ALIGN_FACTOR
41 |
42 |
43 | %include "bn_umul.inc"
44 | %include "bn_usqr.inc"
45 | %include "mred.inc"
46 |
47 | ;*************************************************************
48 | ;* uint64_t mpn_mul(uint64_t* pR,
49 | ;* const uint64_t* pA, int aSize,
50 | ;* const uint64_t* pB, int bSize)
51 | ;*
52 | ;*************************************************************
53 | align ARCH_ALIGN_FACTOR
54 | IPPASM mpn_mul,PUBLIC
55 | %assign LOCAL_FRAME 0
56 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
57 | USES_XMM
58 | COMP_ABI 5
59 | ; Dispatcher: picks a multiplier by operand lengths (equal short sizes, 8N x 8M, or the general case).
60 | ; rdi = pR
61 | ; rsi = pA
62 | ; edx = nsA
63 | ; rcx = pB
64 | ; r8d = nsB
65 |
66 | movsxd rdx, edx ; expand length
67 | movsxd rbx, r8d ; nsB now lives in rbx; r8 is cleared below
68 |
69 | xor r8, r8 ; clear scratch
70 | xor r9, r9
71 | xor r10, r10
72 | xor r11, r11
73 | xor r12, r12
74 | xor r13, r13
75 | xor r14, r14
76 | xor r15, r15
77 |
78 | cmp rdx, rbx
79 | jl .swap_operans ; nsA < nsB
80 | jg .test_8N_case ; nsA > nsB: check whether nsA=8*N and nsB=8*M
81 |
82 | cmp rdx, 16
83 | jg .test_8N_case ; equal lengths above 16 also go through the 8N test
84 |
85 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
86 | ;; short nsA==nsB (1,..,16)
87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
88 | cmp rdx, 4
89 | jg .more_then_4 ; 5..16: entry point looked up via GET_EP
90 |
91 | cmp edx, 3 ; one compare drives a 4-way dispatch (assumes 1 <= edx <= 4 here - TODO confirm)
92 | ja .mul_4_4 ; edx == 4
93 | jz .mul_3_3 ; edx == 3
94 | jp .mul_2_2 ; edx == 2: low byte of edx-3 is 0xFF, PF=1
95 | ; mul_1_1 (edx == 1: low byte 0xFE, PF=0, falls through)
96 |
97 | .mul_1_1:
98 | MUL_NxN 1, rdi, rsi, rcx, rbx,rbp, r8
99 | jmp .quit
100 | .mul_2_2:
101 | MUL_NxN 2, rdi, rsi, rcx, rbx,rbp, r8,r9
102 | jmp .quit
103 | .mul_3_3:
104 | MUL_NxN 3, rdi, rsi, rcx, rbx,rbp, r8,r9,r10
105 | jmp .quit
106 | .mul_4_4:
107 | MUL_NxN 4, rdi, rsi, rcx, rbx,rbp, r8,r9,r10,r11
108 | jmp .quit
109 |
110 | .more_then_4:
111 | GET_EP rax, mul_lxl_basic, rdx, rbp ; presumably rax = mul_lxl_basic entry selected by rdx (GET_EP is defined outside this file)
112 | call rax
113 | jmp .quit
114 |
115 | .swap_operans:
116 | SWAP rsi, rcx ; swap operands
117 | SWAP rdx, rbx ; and their lengths, then fall through to the 8N test
118 |
119 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
120 | ;; 8*N x 8*M case multiplier
121 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
122 | .test_8N_case:
123 | mov rax, rdx
124 | or rax, rbx ; combine both lengths ...
125 | and rax, 7 ; ... low 3 bits are zero only if both are multiples of 8
126 | jnz .general_mul
127 |
128 | CALL_FUNC mul_8Nx8M
129 | jmp .quit
130 |
131 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
132 | ;; general case multiplier
133 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
134 | .general_mul:
135 | CALL_FUNC mul_NxM
136 | jmp .quit
137 |
138 | .quit:
139 | REST_XMM
140 | REST_GPR
141 | ret
142 | ENDFUNC mpn_mul
143 |
144 | ;*************************************************************
145 | ;*
146 | ;* uint64_t mpn_sqr(uint64_t* pR,
147 | ;* const uint64_t* pA, int aSize)
148 | ;*
149 | ;*************************************************************
150 | align ARCH_ALIGN_FACTOR
151 | IPPASM mpn_sqr,PUBLIC
152 | %assign LOCAL_FRAME 0
153 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
154 | USES_XMM
155 | COMP_ABI 3
156 | ; Dispatcher: picks a squarer by length in edx (presumably rdi = pR, rsi = pA as in mpn_mul - confirm against COMP_ABI).
157 | movsxd rdx, edx ; expand length
158 |
159 | xor r8, r8 ; clear scratch
160 | xor r9, r9
161 | xor r10, r10
162 | xor r11, r11
163 | xor r12, r12
164 | xor r13, r13
165 | xor r14, r14
166 | xor r15, r15
167 |
168 | cmp rdx, 16
169 | jg .test_8N_case ; lengths above 16: 8N or general squarer
170 |
171 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
172 | ;; short nsA (1,..,16)
173 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
174 | GET_EP rax, sqr_l_basic, rdx, rbp ; presumably rax = sqr_l_basic entry selected by rdx (GET_EP is defined outside this file)
175 | call rax
176 | jmp .quit
177 |
178 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
179 | ;; 8N case squarer
180 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
181 | .test_8N_case:
182 | test rdx, 7 ; low 3 bits zero: length is a multiple of 8
183 | jnz .general_sqr
184 |
185 | CALL_FUNC sqr_8N
186 | jmp .quit
187 |
188 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
189 | ;; general case squarer
190 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
191 | .general_sqr:
192 | CALL_FUNC sqr_N
193 |
194 | .quit:
195 | REST_XMM
196 | REST_GPR
197 | ret
198 | ENDFUNC mpn_sqr
199 |
200 | ;*************************************************************
201 | ;*
202 | ;* uint64_t mpn_montgomery_reduce_bin(uint64_t* pR,
203 | ;* uint64_t* pProduct,
204 | ;* const uint64_t* pModulus, int mSize,
205 | ;* uint64_t m)
206 | ;*************************************************************
207 | align ARCH_ALIGN_FACTOR
208 | IPPASM mpn_montgomery_reduce_bin,PUBLIC
209 | %assign LOCAL_FRAME (0)
210 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
211 | USES_XMM
212 | COMP_ABI 5
213 | ;pR (rdi) address of the reduction
214 | ;pProduct (rsi) address of the temporary product
215 | ;pModulus (rdx) address of the modulus
216 | ;mSize (rcx) size of the modulus
217 | ;m0 (r8) montgomery helper (m')
218 | ; Dispatcher: picks a Montgomery reduction routine by modulus length.
219 | mov r15, rdi ; store reduction address
220 |
221 | ; reload parameters for future convenience:
222 | mov rdi, rsi ; rdi = temporary product buffer
223 | mov rsi, rdx ; rsi = modulus
224 | movsxd rdx, ecx ; rdx = length of modulus
225 |
226 | cmp rdx, 16
227 | ja .test_8N_case ; length of modulus >16
228 |
229 | cmp rdx, 4
230 | ja .above4 ; length of modulus 5,..,16
231 |
232 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
233 | ;; short modulus (1,..,4)
234 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
235 | cmp rdx, 3 ; one compare drives a 4-way dispatch (assumes 1 <= rdx <= 4 here - TODO confirm)
236 | ja .red_4 ; rdx == 4
237 | jz .red_3 ; rdx == 3
238 | jp .red_2 ; rdx == 2: low byte of rdx-3 is 0xFF, PF=1
239 | ; red_1 (rdx == 1: low byte 0xFE, PF=0, falls through)
240 |
241 | .red_1:
242 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limb 0
243 | MRED_FIX 1, r15, rdi, rsi, r8, rbp,rbx, r9
244 | jmp .quit
245 |
246 | .red_2:
247 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limbs 0..1
248 | mov r10, qword [rdi+sizeof(qword)*1]
249 | MRED_FIX 2, r15, rdi, rsi, r8, rbp,rbx, r9,r10
250 | jmp .quit
251 |
252 | .red_3:
253 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limbs 0..2
254 | mov r10, qword [rdi+sizeof(qword)*1]
255 | mov r11, qword [rdi+sizeof(qword)*2]
256 | MRED_FIX 3, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11
257 | jmp .quit
258 |
259 | .red_4:
260 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limbs 0..3
261 | mov r10, qword [rdi+sizeof(qword)*1]
262 | mov r11, qword [rdi+sizeof(qword)*2]
263 | mov r12, qword [rdi+sizeof(qword)*3]
264 | MRED_FIX 4, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11,r12
265 | jmp .quit
266 |
267 |
268 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
269 | ;; short modulus (5,..,16)
270 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
271 | .above4:
272 | mov rbp, rdx
273 | sub rbp, 4 ; rbp = length - 4, the selector handed to GET_EP
274 | GET_EP rax, mred_short, rbp ; mred procedure
275 |
276 | call rax
277 | jmp .quit
278 |
279 |
280 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
281 | ;; 8N case reduction (modulus length > 16)
282 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
283 | .test_8N_case:
284 | test rdx, 7 ; low 3 bits zero: length is a multiple of 8
285 | jnz .general_case
286 |
287 | CALL_FUNC mred_8N
288 | jmp .quit
289 |
290 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
291 | ;;
292 | ;; general case modulus
293 | ;;
294 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295 | .general_case:
296 | CALL_FUNC mred_N
297 |
298 | .quit:
299 | REST_XMM
300 | REST_GPR
301 | ret
302 | ENDFUNC mpn_montgomery_reduce_bin
303 |
304 | %endif
305 |
306 | %endif ;; _ADCOX_NI_ENABLING_
307 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_usqr_redc_srvl9pp.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; Big Number Multiplicative Operations
21 | ;
22 | ; Content:
23 | ; mpi_umul_bin_adx()
24 | ; mpi_usqr_bin_adx()
25 | ; mpi_montgomery_reduce_bin_adx()
26 | ;
27 | ; Implementation is using mulx and adcx/adox instructions
28 | ;
29 | ;
30 |
31 | %include "asmdefs.inc"
32 | %include "ia_32e.inc"
33 | %include "variant.inc"
34 |
35 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_ON_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_)
36 | %if (__ARCH32E >= __ARCH32E_L9)
37 |
38 | %assign _xEMULATION_ 1
39 | %assign _ADCX_ADOX_ 1
40 |
41 | segment .text align=ARCH_ALIGN_FACTOR
42 |
43 | %include "bn_umulpp.inc"
44 | %include "bn_usqrpp.inc"
45 | %include "mred_pp.inc"
46 |
47 | ;*************************************************************
48 | ;* uint64_t mpi_umul_bin_adx(uint64_t* pR,
49 | ;* const uint64_t* pA, int aSize,
50 | ;* const uint64_t* pB, int bSize)
51 | ;*************************************************************
52 | align ARCH_ALIGN_FACTOR
53 | IPPASM mpi_umul_bin_adx,PUBLIC
54 | %assign LOCAL_FRAME 0
55 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
56 | USES_XMM
57 | COMP_ABI 5
58 | ; Dispatcher: picks a multiplier by operand lengths (equal short sizes, 8N x 8M, or the general case).
59 | ; rdi = pR
60 | ; rsi = pA
61 | ; edx = nsA
62 | ; rcx = pB
63 | ; r8d = nsB
64 |
65 | movsxd rdx, edx ; expand length
66 | movsxd rbx, r8d ; nsB now lives in rbx; r8 is cleared below
67 |
68 | xor r8, r8 ; clear scratch
69 | xor r9, r9
70 | xor r10, r10
71 | xor r11, r11
72 | xor r12, r12
73 | xor r13, r13
74 | xor r14, r14
75 | xor r15, r15
76 |
77 | cmp rdx, rbx
78 | jl .swap_operans ; nsA < nsB
79 | jg .test_8N_case ; nsA > nsB: check whether nsA=8*N and nsB=8*M
80 |
81 | cmp rdx, 16
82 | jg .test_8N_case ; equal lengths above 16 also go through the 8N test
83 |
84 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
85 | ;; short nsA==nsB (1,..,16)
86 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
87 | cmp rdx, 4
88 | jg .more_then_4 ; 5..16: entry point looked up via GET_EP
89 |
90 | cmp edx, 3 ; one compare drives a 4-way dispatch (assumes 1 <= edx <= 4 here - TODO confirm)
91 | ja .mul_4_4 ; edx == 4
92 | jz .mul_3_3 ; edx == 3
93 | jp .mul_2_2 ; edx == 2: low byte of edx-3 is 0xFF, PF=1
94 | ; mul_1_1 (edx == 1: low byte 0xFE, PF=0, falls through)
95 |
96 | .mul_1_1:
97 | MUL_NxN 1, rdi, rsi, rcx, rbx, rbp, r8
98 | jmp .quit
99 | .mul_2_2:
100 | MUL_NxN 2, rdi, rsi, rcx, rbx, rbp, r8, r9
101 | jmp .quit
102 | .mul_3_3:
103 | MUL_NxN 3, rdi, rsi, rcx, rbx, rbp, r8, r9, r10
104 | jmp .quit
105 | .mul_4_4:
106 | MUL_NxN 4, rdi, rsi, rcx, rbx, rbp, r8, r9, r10, r11
107 | jmp .quit
108 |
109 | .more_then_4:
110 | GET_EP rax, mul_lxl_basic, rdx, rbp ; presumably rax = mul_lxl_basic entry selected by rdx (GET_EP is defined outside this file)
111 | call rax
112 | jmp .quit
113 |
114 | .swap_operans:
115 | SWAP rsi, rcx ; swap operands
116 | SWAP rdx, rbx ; and their lengths, then fall through to the 8N test
117 |
118 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
119 | ;; 8*N x 8*M case multiplier
120 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
121 | .test_8N_case:
122 | mov rax, rdx
123 | or rax, rbx ; combine both lengths ...
124 | and rax, 7 ; ... low 3 bits are zero only if both are multiples of 8
125 | jnz .general_mul
126 |
127 | CALL_FUNC mul_8Nx8M_adcox
128 | jmp .quit
129 |
130 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
131 | ;; general case multiplier
132 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
133 | .general_mul:
134 | CALL_FUNC mul_NxM_adcox
135 | jmp .quit
136 |
137 | .quit:
138 | REST_XMM
139 | REST_GPR
140 | ret
141 | ENDFUNC mpi_umul_bin_adx
142 |
143 | ;*************************************************************
144 | ;*
145 | ;* uint64_t mpi_usqr_bin_adx(uint64_t* pR,
146 | ;* const uint64_t* pA, int aSize)
147 | ;*
148 | ;*************************************************************
149 | align ARCH_ALIGN_FACTOR
150 | IPPASM mpi_usqr_bin_adx,PUBLIC
151 | %assign LOCAL_FRAME 0
152 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
153 | USES_XMM
154 | COMP_ABI 3
155 | ; Dispatcher: picks a squarer by length in edx (presumably rdi = pR, rsi = pA as in mpi_umul_bin_adx - confirm against COMP_ABI).
156 | movsxd rdx, edx ; expand length
157 |
158 | xor r8, r8 ; clear scratch
159 | xor r9, r9
160 | xor r10, r10
161 | xor r11, r11
162 | xor r12, r12
163 | xor r13, r13
164 | xor r14, r14
165 | xor r15, r15
166 |
167 | cmp rdx, 16
168 | jg .test_8N_case ; lengths above 16: 8N or general squarer
169 |
170 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
171 | ;; short nsA (1,..,16)
172 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
173 | GET_EP rax, sqr_l_basic, rdx, rbp ; presumably rax = sqr_l_basic entry selected by rdx (GET_EP is defined outside this file)
174 | call rax
175 | jmp .quit
176 |
177 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
178 | ;; 8N case squarer
179 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
180 | .test_8N_case:
181 | test rdx, 7 ; low 3 bits zero: length is a multiple of 8
182 | jnz .general_sqr
183 |
184 | CALL_FUNC sqr_8N_adcox
185 | jmp .quit
186 |
187 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
188 | ;; general case squarer
189 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
190 | .general_sqr:
191 | CALL_FUNC sqr_N_adcox
192 |
193 | .quit:
194 | REST_XMM
195 | REST_GPR
196 | ret
197 | ENDFUNC mpi_usqr_bin_adx
198 |
199 | ;*************************************************************
200 | ;*
201 | ;* uint64_t mpi_montgomery_reduce_bin_adx(uint64_t* pR,
202 | ;* uint64_t* pProduct,
203 | ;* const uint64_t* pModulus, int mSize,
204 | ;* uint64_t m)
205 | ;*************************************************************
206 | align ARCH_ALIGN_FACTOR
207 | IPPASM mpi_montgomery_reduce_bin_adx,PUBLIC
208 | %assign LOCAL_FRAME (0)
209 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15
210 | USES_XMM
211 | COMP_ABI 5
212 | ;pR (rdi) address of the reduction
213 | ;pProduct (rsi) address of the temporary product
214 | ;pModulus (rdx) address of the modulus
215 | ;mSize (rcx) size of the modulus
216 | ;m0 (r8) montgomery helper (m')
217 | ; Dispatcher: picks a Montgomery reduction routine by modulus length.
218 | mov r15, rdi ; store reduction address
219 |
220 | ; reload parameters for future convenience:
221 | mov rdi, rsi ; rdi = temporary product buffer
222 | mov rsi, rdx ; rsi = modulus
223 | movsxd rdx, ecx ; rdx = length of modulus
224 |
225 | cmp rdx, 16
226 | ja .test_8N_case ; length of modulus >16
227 |
228 | cmp rdx, 4
229 | ja .above4 ; length of modulus 5,..,16
230 |
231 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
232 | ;; short modulus (1,..,4)
233 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
234 | cmp rdx, 3 ; one compare drives a 4-way dispatch (assumes 1 <= rdx <= 4 here - TODO confirm)
235 | ja .red_4 ; rdx == 4
236 | jz .red_3 ; rdx == 3
237 | jp .red_2 ; rdx == 2: low byte of rdx-3 is 0xFF, PF=1
238 | ; red_1 (rdx == 1: low byte 0xFE, PF=0, falls through)
239 |
240 | .red_1:
241 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limb 0
242 | MRED_FIX 1, r15, rdi, rsi, r8, rbp,rbx, r9
243 | jmp .quit
244 |
245 | .red_2:
246 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limbs 0..1
247 | mov r10, qword [rdi+sizeof(qword)*1]
248 | MRED_FIX 2, r15, rdi, rsi, r8, rbp,rbx, r9,r10
249 | jmp .quit
250 |
251 | .red_3:
252 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limbs 0..2
253 | mov r10, qword [rdi+sizeof(qword)*1]
254 | mov r11, qword [rdi+sizeof(qword)*2]
255 | MRED_FIX 3, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11
256 | jmp .quit
257 |
258 | .red_4:
259 | mov r9, qword [rdi+sizeof(qword)*0] ; preload product limbs 0..3
260 | mov r10, qword [rdi+sizeof(qword)*1]
261 | mov r11, qword [rdi+sizeof(qword)*2]
262 | mov r12, qword [rdi+sizeof(qword)*3]
263 | MRED_FIX 4, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11,r12
264 | jmp .quit
265 |
266 |
267 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
268 | ;; short modulus (5,..,16)
269 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
270 | .above4:
271 | mov rbp, rdx
272 | sub rbp, 4 ; rbp = length - 4, the selector handed to GET_EP
273 | GET_EP rax, mred_short, rbp ; mred procedure
274 |
275 | call rax
276 | jmp .quit
277 |
278 |
279 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
280 | ;; 8N case reduction (modulus length > 16)
281 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
282 | .test_8N_case:
283 | test rdx, 7 ; low 3 bits zero: length is a multiple of 8
284 | jnz .general_case
285 |
286 | CALL_FUNC mred_8N_adcox
287 | jmp .quit
288 |
289 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
290 | ;;
291 | ;; general case modulus
292 | ;;
293 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
294 | .general_case:
295 | CALL_FUNC mred_N_adcox
296 |
297 | .quit:
298 | REST_XMM
299 | REST_GPR
300 | ret
301 | ENDFUNC mpi_montgomery_reduce_bin_adx
302 |
303 | %endif
304 |
305 | %endif ;; _ADCOX_NI_ENABLING_
306 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mulx.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2013-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: EM64T Cryptography Primitive.
20 | ; Emulation of Intel(R) instructions MULX, ADCX, ADOX (for debug only)
21 | ;
22 | ;
23 | %ifndef _PCPMULX_INC_
24 | %assign _PCPMULX_INC_ 1
25 |
26 | %ifndef _EMULATION_
27 | %macro gsmulx 3.nolist
28 | ;; Native form: forwards its operands straight to the hardware MULX instruction.
29 | ;; %1 = destination of the high half of the product
30 | ;; %2 = destination of the low half of the product
31 | ;; %3 = explicit source operand
32 | mulx %1,%2,%3
33 | %endmacro
34 |
35 | %endif
36 |
37 | %ifdef _EMULATION_
38 | %macro gsmulx 3.nolist
39 | %xdefine %%resH %1
40 | %xdefine %%resL %2
41 | %xdefine %%src %3
42 | ;; Emulated MULX (debug only): %%resH:%%resL = rdx * %%src, flags and rax/rdx preserved unless they are the destinations.
43 | pushf ;; store flags
44 | ;; The four scratch slots are addressed at [rsp+0..24], inside the area reserved here, never below rsp.
45 | sub rsp, sizeof(qword)*4
46 | mov [rsp+sizeof(qword)*3], rax ;; store RAX
47 | mov [rsp+sizeof(qword)*2], rdx ;; store RDX
48 | mov rax,rdx ;; implicit MULX factor (rdx) becomes the MUL accumulator
49 | mov rdx, %%src
50 |
51 | mul rdx ;; rdx:rax = old rdx * %%src
52 |
53 | mov [rsp+sizeof(qword)*1], rax ;; store Low product
54 | mov [rsp+sizeof(qword)*0], rdx ;; store Hig product
55 |
56 | mov rax, [rsp+sizeof(qword)*3] ;; re-store RAX
57 | mov rdx, [rsp+sizeof(qword)*2] ;; re-store RDX
58 | mov %%resL, [rsp+sizeof(qword)*1];; load Low product
59 | mov %%resH, [rsp+sizeof(qword)*0];; load Hig product
60 | add rsp, sizeof(qword)*4
61 |
62 | popf ;; re-store flags
63 | %endmacro
64 |
65 | %endif
66 |
67 | %ifndef _EMULATION_
68 | %macro gsadcx 2.nolist
69 | ;; Native form: forwards its operands straight to the hardware ADCX instruction.
70 | ;; %1 = destination register, %2 = source operand
71 |
72 | adcx %1, %2
73 | %endmacro
74 |
75 | %endif
76 |
77 | %ifdef _EMULATION_
78 | %macro gsadcx 2.nolist
79 | %xdefine %%rdst %1
80 | %xdefine %%src %2
81 | ;; Emulated ADCX (debug only): %%rdst += %%src + CF; of the saved flags only the CF bit is replaced.
82 | push %%rdst ;; slot for result
83 | push rax ;; save rax
84 | pushfq ;; flags before adc
85 | ;; stack now: [rsp] = flags image, [rsp+8] = saved rax, [rsp+16] = result slot
86 | adc %%rdst, %%src
87 | mov [rsp+2*sizeof(qword)], %%rdst ;; result goes into the slot that is popped last
88 |
89 | pushfq ;; push the flags produced by the adc ...
90 | pop rax ;; ... and fetch them into rax
91 | and rax, 1 ;; cf after operation
92 | and qword [rsp], (-2) ;; clear cf before operation
93 | or [rsp], rax ;; new psw
94 | popfq
95 |
96 | pop rax
97 | pop %%rdst
98 | %endmacro
99 |
100 | %endif
101 |
102 | %ifndef _EMULATION_
103 | %macro gsadox 2.nolist
104 | ;; Native form: forwards its operands straight to the hardware ADOX instruction.
105 | ;; %1 = destination register, %2 = source operand
106 |
107 | adox %1, %2
108 | %endmacro
109 |
110 | %endif
111 |
112 | %ifdef _EMULATION_
113 | %macro gsadox 2.nolist
114 | %xdefine %%rdst %1
115 | %xdefine %%src %2
116 | ;; Emulated ADOX (debug only): %%rdst += %%src + OF; the carry-out is written back into the OF bit of the saved flags.
117 | push %%rdst
118 | push rax ;; save rax
119 | ;; stack after the pushfq below: [rsp] = flags image, [rsp+8] = saved rax, [rsp+16] = saved %%rdst
120 | pushfq ;; rax = flags before adc
121 | mov rax, [rsp]
122 | and rax, 800h ;; of
123 | xor [rsp], rax ;; clear of
124 |
125 | shr rax, 11 ;; mov of to cf position
126 | push rax ;; new psw
127 | popfq
128 | ;; rax was used as scratch above: reload the original value if it is one of the operands
129 | %ifidni %%src,rax
130 | mov rax, [rsp+sizeof(qword)]
131 | %endif
132 | %ifidni %%rdst,rax
133 | mov %%rdst, [rsp+2*sizeof(qword)]
134 | %endif
135 |
136 | adc %%rdst, %%src
137 | mov [rsp+2*sizeof(qword)], %%rdst ;; result goes into the slot that is popped last
138 |
139 | pushfq ;; push the flags produced by the adc ...
140 | pop rax ;; ... and fetch them into rax
141 | and rax, 1 ;; cf after operation
142 |
143 | shl rax, 11 ;; mov cf into of position
144 | or [rsp], rax ;; new psw
145 | popfq
146 |
147 | pop rax
148 | pop %%rdst
149 | %endmacro
150 |
151 | %endif
152 |
153 | %endif ;; _PCPMULX_INC_
154 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/os.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | %ifndef OS_ASM_FILE
18 | %define OS_ASM_FILE
19 |
20 | ;; win64 output selects the Windows ABI, unless WIN_ABI was already defined
21 | %ifidn __OUTPUT_FORMAT__, win64
22 | %ifndef WIN_ABI
23 | %define WIN_ABI
24 | %endif
25 | %endif
26 |
27 | ;; elf64 and macho64 share the same code, so both are tagged LINUX
28 | %ifidn __OUTPUT_FORMAT__, elf64
29 | %ifndef LINUX
30 | %define LINUX
31 | %endif
32 | %endif
33 |
34 | %ifidn __OUTPUT_FORMAT__, macho64
35 | %ifndef LINUX
36 | %define LINUX
37 | %endif
38 | %endif
39 | %endif ; OS_ASM_FILE
40 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/reg_sizes.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ; define b/w/d sub-register aliases for general-purpose registers and x/y/z width aliases for SIMD registers
18 |
19 | %ifndef _REG_SIZES_ASM_
20 | %define _REG_SIZES_ASM_
21 |
22 | %define raxd eax ; <reg>d / <reg>w / <reg>b: the 32-, 16- and 8-bit low part of the 64-bit register <reg>
23 | %define raxw ax
24 | %define raxb al
25 |
26 | %define rbxd ebx
27 | %define rbxw bx
28 | %define rbxb bl
29 |
30 | %define rcxd ecx
31 | %define rcxw cx
32 | %define rcxb cl
33 |
34 | %define rdxd edx
35 | %define rdxw dx
36 | %define rdxb dl
37 |
38 | %define rsid esi
39 | %define rsiw si
40 | %define rsib sil
41 |
42 | %define rdid edi
43 | %define rdiw di
44 | %define rdib dil
45 |
46 | %define rbpd ebp
47 | %define rbpw bp
48 | %define rbpb bpl
49 |
50 | %define zmm0x xmm0 ; zmmNx -> xmmN: the xmm name with the same register index
51 | %define zmm1x xmm1
52 | %define zmm2x xmm2
53 | %define zmm3x xmm3
54 | %define zmm4x xmm4
55 | %define zmm5x xmm5
56 | %define zmm6x xmm6
57 | %define zmm7x xmm7
58 | %define zmm8x xmm8
59 | %define zmm9x xmm9
60 | %define zmm10x xmm10
61 | %define zmm11x xmm11
62 | %define zmm12x xmm12
63 | %define zmm13x xmm13
64 | %define zmm14x xmm14
65 | %define zmm15x xmm15
66 | %define zmm16x xmm16
67 | %define zmm17x xmm17
68 | %define zmm18x xmm18
69 | %define zmm19x xmm19
70 | %define zmm20x xmm20
71 | %define zmm21x xmm21
72 | %define zmm22x xmm22
73 | %define zmm23x xmm23
74 | %define zmm24x xmm24
75 | %define zmm25x xmm25
76 | %define zmm26x xmm26
77 | %define zmm27x xmm27
78 | %define zmm28x xmm28
79 | %define zmm29x xmm29
80 | %define zmm30x xmm30
81 | %define zmm31x xmm31
82 |
83 | %define ymm0x xmm0 ; ymmNx -> xmmN: the xmm name with the same register index
84 | %define ymm1x xmm1
85 | %define ymm2x xmm2
86 | %define ymm3x xmm3
87 | %define ymm4x xmm4
88 | %define ymm5x xmm5
89 | %define ymm6x xmm6
90 | %define ymm7x xmm7
91 | %define ymm8x xmm8
92 | %define ymm9x xmm9
93 | %define ymm10x xmm10
94 | %define ymm11x xmm11
95 | %define ymm12x xmm12
96 | %define ymm13x xmm13
97 | %define ymm14x xmm14
98 | %define ymm15x xmm15
99 | %define ymm16x xmm16
100 | %define ymm17x xmm17
101 | %define ymm18x xmm18
102 | %define ymm19x xmm19
103 | %define ymm20x xmm20
104 | %define ymm21x xmm21
105 | %define ymm22x xmm22
106 | %define ymm23x xmm23
107 | %define ymm24x xmm24
108 | %define ymm25x xmm25
109 | %define ymm26x xmm26
110 | %define ymm27x xmm27
111 | %define ymm28x xmm28
112 | %define ymm29x xmm29
113 | %define ymm30x xmm30
114 | %define ymm31x xmm31
115 |
116 | %define xmm0x xmm0 ; xmmNx -> xmmN: identity mapping, so the x suffix also works on an xmm name
117 | %define xmm1x xmm1
118 | %define xmm2x xmm2
119 | %define xmm3x xmm3
120 | %define xmm4x xmm4
121 | %define xmm5x xmm5
122 | %define xmm6x xmm6
123 | %define xmm7x xmm7
124 | %define xmm8x xmm8
125 | %define xmm9x xmm9
126 | %define xmm10x xmm10
127 | %define xmm11x xmm11
128 | %define xmm12x xmm12
129 | %define xmm13x xmm13
130 | %define xmm14x xmm14
131 | %define xmm15x xmm15
132 | %define xmm16x xmm16
133 | %define xmm17x xmm17
134 | %define xmm18x xmm18
135 | %define xmm19x xmm19
136 | %define xmm20x xmm20
137 | %define xmm21x xmm21
138 | %define xmm22x xmm22
139 | %define xmm23x xmm23
140 | %define xmm24x xmm24
141 | %define xmm25x xmm25
142 | %define xmm26x xmm26
143 | %define xmm27x xmm27
144 | %define xmm28x xmm28
145 | %define xmm29x xmm29
146 | %define xmm30x xmm30
147 | %define xmm31x xmm31
148 |
149 | %define zmm0y ymm0 ; zmmNy -> ymmN: the ymm name with the same register index
150 | %define zmm1y ymm1
151 | %define zmm2y ymm2
152 | %define zmm3y ymm3
153 | %define zmm4y ymm4
154 | %define zmm5y ymm5
155 | %define zmm6y ymm6
156 | %define zmm7y ymm7
157 | %define zmm8y ymm8
158 | %define zmm9y ymm9
159 | %define zmm10y ymm10
160 | %define zmm11y ymm11
161 | %define zmm12y ymm12
162 | %define zmm13y ymm13
163 | %define zmm14y ymm14
164 | %define zmm15y ymm15
165 | %define zmm16y ymm16
166 | %define zmm17y ymm17
167 | %define zmm18y ymm18
168 | %define zmm19y ymm19
169 | %define zmm20y ymm20
170 | %define zmm21y ymm21
171 | %define zmm22y ymm22
172 | %define zmm23y ymm23
173 | %define zmm24y ymm24
174 | %define zmm25y ymm25
175 | %define zmm26y ymm26
176 | %define zmm27y ymm27
177 | %define zmm28y ymm28
178 | %define zmm29y ymm29
179 | %define zmm30y ymm30
180 | %define zmm31y ymm31
181 |
182 | %define xmm0y ymm0 ; xmmNy -> ymmN: the ymm name with the same register index
183 | %define xmm1y ymm1
184 | %define xmm2y ymm2
185 | %define xmm3y ymm3
186 | %define xmm4y ymm4
187 | %define xmm5y ymm5
188 | %define xmm6y ymm6
189 | %define xmm7y ymm7
190 | %define xmm8y ymm8
191 | %define xmm9y ymm9
192 | %define xmm10y ymm10
193 | %define xmm11y ymm11
194 | %define xmm12y ymm12
195 | %define xmm13y ymm13
196 | %define xmm14y ymm14
197 | %define xmm15y ymm15
198 | %define xmm16y ymm16
199 | %define xmm17y ymm17
200 | %define xmm18y ymm18
201 | %define xmm19y ymm19
202 | %define xmm20y ymm20
203 | %define xmm21y ymm21
204 | %define xmm22y ymm22
205 | %define xmm23y ymm23
206 | %define xmm24y ymm24
207 | %define xmm25y ymm25
208 | %define xmm26y ymm26
209 | %define xmm27y ymm27
210 | %define xmm28y ymm28
211 | %define xmm29y ymm29
212 | %define xmm30y ymm30
213 | %define xmm31y ymm31
214 |
215 | %define xmm0z zmm0 ; xmmNz -> zmmN: the zmm name with the same register index
216 | %define xmm1z zmm1
217 | %define xmm2z zmm2
218 | %define xmm3z zmm3
219 | %define xmm4z zmm4
220 | %define xmm5z zmm5
221 | %define xmm6z zmm6
222 | %define xmm7z zmm7
223 | %define xmm8z zmm8
224 | %define xmm9z zmm9
225 | %define xmm10z zmm10
226 | %define xmm11z zmm11
227 | %define xmm12z zmm12
228 | %define xmm13z zmm13
229 | %define xmm14z zmm14
230 | %define xmm15z zmm15
231 | %define xmm16z zmm16
232 | %define xmm17z zmm17
233 | %define xmm18z zmm18
234 | %define xmm19z zmm19
235 | %define xmm20z zmm20
236 | %define xmm21z zmm21
237 | %define xmm22z zmm22
238 | %define xmm23z zmm23
239 | %define xmm24z zmm24
240 | %define xmm25z zmm25
241 | %define xmm26z zmm26
242 | %define xmm27z zmm27
243 | %define xmm28z zmm28
244 | %define xmm29z zmm29
245 | %define xmm30z zmm30
246 | %define xmm31z zmm31
247 |
248 | %define ymm0z zmm0 ; ymmNz -> zmmN: the zmm name with the same register index
249 | %define ymm1z zmm1
250 | %define ymm2z zmm2
251 | %define ymm3z zmm3
252 | %define ymm4z zmm4
253 | %define ymm5z zmm5
254 | %define ymm6z zmm6
255 | %define ymm7z zmm7
256 | %define ymm8z zmm8
257 | %define ymm9z zmm9
258 | %define ymm10z zmm10
259 | %define ymm11z zmm11
260 | %define ymm12z zmm12
261 | %define ymm13z zmm13
262 | %define ymm14z zmm14
263 | %define ymm15z zmm15
264 | %define ymm16z zmm16
265 | %define ymm17z zmm17
266 | %define ymm18z zmm18
267 | %define ymm19z zmm19
268 | %define ymm20z zmm20
269 | %define ymm21z zmm21
270 | %define ymm22z zmm22
271 | %define ymm23z zmm23
272 | %define ymm24z zmm24
273 | %define ymm25z zmm25
274 | %define ymm26z zmm26
275 | %define ymm27z zmm27
276 | %define ymm28z zmm28
277 | %define ymm29z zmm29
278 | %define ymm30z zmm30
279 | %define ymm31z zmm31
280 |
281 | %define DWORD(reg) reg %+ d ; paste a width suffix onto a register name, e.g. DWORD(rax) -> raxd -> eax
282 | %define WORD(reg) reg %+ w
283 | %define BYTE(reg) reg %+ b
284 | ; NOTE(review): this file defines no ymmNy or zmmNz aliases, so YWORD(ymmN) / ZWORD(zmmN) paste to names that are not defined here
285 | %define XWORD(reg) reg %+ x
286 | %define YWORD(reg) reg %+ y
287 | %define ZWORD(reg) reg %+ z
288 |
289 | %endif ;; _REG_SIZES_ASM_
290 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/variant.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ; Intel(R) Integrated Performance Primitives
19 | ; Cryptographic Primitives (ippcp)
20 | ;
21 | ; Purpose:
22 | ; Define ippCP variant
23 | ;
24 | ; do not change the definitions below!
25 | ;
26 |
27 | ;;
28 | ;; modes a feature switch can be set to
29 | ;;
30 | %assign _FEATURE_OFF_ 0 ;; feature is OFF
31 | %assign _FEATURE_ON_ 1 ;; feature is ON
32 | %assign _FEATURE_TICKTOCK_ 2 ;; detect whether the feature is OFF/ON
33 |
34 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
35 | ; %define _XMM7560_ 1
36 | %ifdef _XMM7560_ ; NOTE(review): variant_xmm7560.inc is not among the files listed next to this one -- confirm it exists before defining _XMM7560_
37 | %include "variant_xmm7560.inc"
38 | %endif
39 |
40 | ; %define _TXT_ACM_ 1
41 | %ifdef _TXT_ACM_ ; the _TXT_ACM_ overrides are pulled in before the defaults below are assigned
42 | %include "variant_txt_acm.inc"
43 | %endif
44 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
45 |
46 | ;;
47 | ;; it is possible to force the use of the C version of some implementations
48 | ;; instead of the ASM one
49 | ;;
50 | %ifndef _USE_C_MPZ_uadd_ ; each switch keeps an outside definition and otherwise defaults to _FEATURE_OFF_
51 | %assign _USE_C_MPZ_uadd_ _FEATURE_OFF_
52 | %endif
53 |
54 | %ifndef _USE_C_MPZ_usub_
55 | %assign _USE_C_MPZ_usub_ _FEATURE_OFF_
56 | %endif
57 |
58 | %ifndef _USE_C_MPZ_uadd_word_
59 | %assign _USE_C_MPZ_uadd_word_ _FEATURE_OFF_
60 | %endif
61 |
62 | %ifndef _USE_C_batch_mul_add_
63 | %assign _USE_C_batch_mul_add_ _FEATURE_OFF_
64 | %endif
65 |
66 | %ifndef _USE_C_batch_mul_
67 | %assign _USE_C_batch_mul_ _FEATURE_OFF_
68 | %endif
69 |
70 | %ifndef _USE_C_cpMulSqr_BNU_vectorized_ ; the guard tests the very symbol assigned below, so an outside setting of it is no longer overwritten
71 | %assign _USE_C_cpMulSqr_BNU_vectorized_ _FEATURE_OFF_
72 | %endif
73 |
74 | %ifndef _USE_C_bn_mont_red_words_ ; keeps an outside definition, otherwise defaults to _FEATURE_OFF_
75 | %assign _USE_C_bn_mont_red_words_ _FEATURE_OFF_
76 | %endif
77 |
78 | ;;
79 | ;; set _AES_NI_ENABLING_
80 | ;; (an explicit __ARCH_AES_NI_ of 0 or 1 decides; otherwise it follows the __ARCH32E level)
81 | %ifdef __ARCH_AES_NI_
82 | %if (__ARCH_AES_NI_ == 0)
83 | %assign _AES_NI_ENABLING_ _FEATURE_OFF_
84 | %elif (__ARCH_AES_NI_ == 1)
85 | %assign _AES_NI_ENABLING_ _FEATURE_ON_
86 | %else
87 | %error "__ARCH_AES_NI_ must be 0 or 1"
88 | %endif
89 | %else
90 | %if (__ARCH32E >= __ARCH32E_Y8)
91 | %assign _AES_NI_ENABLING_ _FEATURE_TICKTOCK_
92 | %else
93 | %assign _AES_NI_ENABLING_ _FEATURE_OFF_
94 | %endif
95 | %endif
96 |
97 | ;;
98 | ;; if there is no outside assignment
99 | ;; set _SHA_NI_ENABLING_ based on CPU specification
100 | ;;
101 | %ifndef _SHA_NI_ENABLING_ ; an earlier assignment (e.g. from variant_txt_acm.inc when _TXT_ACM_ is defined) is kept
102 | %if (__ARCH32E >= __ARCH32E_Y8 )
103 | %assign _SHA_NI_ENABLING_ _FEATURE_TICKTOCK_
104 | %else
105 | %assign _SHA_NI_ENABLING_ _FEATURE_OFF_
106 | %endif
107 | %endif
108 |
109 | ;;
110 | ;; set _ADCOX_NI_ENABLING_
111 | ;; (an explicit __ARCH_ADCX_NI_ of 0 or 1 decides; otherwise it follows the __ARCH32E level)
112 | %ifdef __ARCH_ADCX_NI_
113 | %if (__ARCH_ADCX_NI_ == 0)
114 | %assign _ADCOX_NI_ENABLING_ _FEATURE_OFF_
115 | %elif (__ARCH_ADCX_NI_ == 1)
116 | %assign _ADCOX_NI_ENABLING_ _FEATURE_ON_
117 | %else
118 | %error "__ARCH_ADCX_NI_ must be 0 or 1"
119 | %endif
120 | %else
121 | %if (__ARCH32E >= __ARCH32E_L9)
122 | %assign _ADCOX_NI_ENABLING_ _FEATURE_TICKTOCK_
123 | %else
124 | %assign _ADCOX_NI_ENABLING_ _FEATURE_OFF_
125 | %endif
126 | %endif
127 |
128 |
129 | ;;
130 | ;; select Hash algorithm
131 | ;;
132 | %ifndef _DISABLE_ALG_SHA1_ ; defining _DISABLE_ALG_SHA1_ turns SHA1 off
133 | %assign _ENABLE_ALG_SHA1_ _FEATURE_ON_ ;; SHA1 on
134 | %else
135 | %assign _ENABLE_ALG_SHA1_ _FEATURE_OFF_ ;; SHA1 off
136 | %endif
137 |
138 | %ifndef _DISABLE_ALG_SHA256_ ; defining _DISABLE_ALG_SHA256_ turns SHA256 off
139 | %assign _ENABLE_ALG_SHA256_ _FEATURE_ON_ ;; SHA256 on
140 | %else
141 | %assign _ENABLE_ALG_SHA256_ _FEATURE_OFF_ ;; SHA256 off
142 | %endif
143 |
144 | %ifndef _DISABLE_ALG_SHA512_ ; guard name matches the _ENABLE_ALG_SHA512_ switch it controls
145 | %assign _ENABLE_ALG_SHA512_ _FEATURE_ON_ ;; SHA512 on
146 | %else
147 | %assign _ENABLE_ALG_SHA512_ _FEATURE_OFF_ ;; SHA512 off
148 | %endif
149 |
150 | %ifndef _DISABLE_ALG_MD5_ ; defining _DISABLE_ALG_MD5_ turns MD5 off
151 | %assign _ENABLE_ALG_MD5_ _FEATURE_ON_ ;; MD5 on
152 | %else
153 | %assign _ENABLE_ALG_MD5_ _FEATURE_OFF_ ;; MD5 off
154 | %endif
155 |
156 | %ifndef _DISABLE_ALG_SM3_ ; defining _DISABLE_ALG_SM3_ turns SM3 off
157 | %assign _ENABLE_ALG_SM3_ _FEATURE_ON_ ;; SM3 on
158 | %else
159 | %assign _ENABLE_ALG_SM3_ _FEATURE_OFF_ ;; SM3 off
160 | %endif
161 |
162 | ;;
163 | ;; BN arithmetic
164 | ;;
165 | %assign _ENABLE_KARATSUBA_ _FEATURE_OFF_ ;; do not use the Karatsuba method for multiplication
166 |
167 | ;;
168 | ;; EC specific
169 | ;;
170 | %assign _ECP_IMPL_NONE_ 0 ;; the four values a per-curve _ECP_xxx_ switch below can take
171 | %assign _ECP_IMPL_ARBIRTRARY_ 1 ;; NOTE(review): spelled ARBIRTRARY; left unchanged because other files may reference this name
172 | %assign _ECP_IMPL_SPECIFIC_ 2
173 | %assign _ECP_IMPL_MFM_ 3
174 |
175 | %ifndef _ECP_128_ ; every curve keeps an outside setting and otherwise gets the default assigned here
176 | %assign _ECP_128_ _ECP_IMPL_SPECIFIC_
177 | %endif
178 |
179 | %ifndef _ECP_192_
180 | %assign _ECP_192_ _ECP_IMPL_MFM_
181 | %endif
182 |
183 | %ifndef _ECP_224_
184 | %assign _ECP_224_ _ECP_IMPL_MFM_
185 | %endif
186 |
187 | %ifndef _ECP_256_
188 | %assign _ECP_256_ _ECP_IMPL_MFM_
189 | %endif
190 |
191 | %ifndef _ECP_384_
192 | %assign _ECP_384_ _ECP_IMPL_MFM_
193 | %endif
194 |
195 | %ifndef _ECP_521_
196 | %assign _ECP_521_ _ECP_IMPL_MFM_
197 | %endif
198 |
199 | %ifndef _ECP_SM2_
200 | %assign _ECP_SM2_ _ECP_IMPL_MFM_
201 | %endif
202 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/variant_txt_acm.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ; Intel(R) Integrated Performance Primitives
19 | ; Cryptographic Primitives (ippcp)
20 | ;
21 | ; Purpose:
22 | ; Update standard ippCP variant
23 | ;
24 | ; do not change the definitions below!
25 | ;
26 |
27 | %ifdef _TXT_ACM_ ; everything below applies only when _TXT_ACM_ is defined
28 |
29 | ;;
30 | ;; HASH algs outside settings
31 | ;;
32 | %assign _SHA_NI_ENABLING_ _FEATURE_TICKTOCK_ ; needs _FEATURE_TICKTOCK_ to be defined already by the including file
33 |
34 | ;;
35 | ;; select Hash algorithm
36 | ;;
37 | ; %assign _ENABLE_ALG_MD5_ _FEATURE_OFF_
38 |
39 | %endif
40 |
--------------------------------------------------------------------------------
/mpn/asm/utils.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | %ifndef __UTILS_INC__
18 | %define __UTILS_INC__ 1
19 |
20 | ; Invoke the functor given as the last parameter once for every non-empty element of the list given as the preceding parameters.
21 | ; The list is processed in direct (first-to-last) order. Note: an input list can be empty.
22 | %macro FOREACH 2-*.nolist
23 | %rotate -1 ; bring the last parameter (the functor) to %1
24 | %xdefine %%functor %1
25 | %rep %0-1 ; one pass per list parameter
26 | %rotate 1 ; first pass: back to the first list element, then one step forward per pass
27 | %ifnempty %1
28 | %%functor %1
29 | %endif
30 | %endrep
31 | %endmacro
32 |
33 | ; Invoke the functor given as the last parameter once for every non-empty element of the list given as the preceding parameters.
34 | ; The list is processed in reverse (last-to-first) order. Note: an input list can be empty.
35 | %macro RFOREACH 2-*.nolist
36 | %rotate -1 ; bring the last parameter (the functor) to %1
37 | %xdefine %%functor %1
38 | %rep %0-1 ; one pass per list parameter
39 | %rotate -1 ; first pass: the last list element, then one step backward per pass
40 | %ifnempty %1
41 | %%functor %1
42 | %endif
43 | %endrep
44 | %endmacro
45 |
46 | ; Shall be called before the INTERSECT macro: opens the _INTERSECT_CTX_ context and resets the result variables.
47 | %macro BEGIN_INTERSECT 0.nolist
48 | %push _INTERSECT_CTX_
49 | %xdefine %$intersection ; result list, starts out empty
50 | %assign %$cardinality 0 ; number of elements collected so far
51 | %endmacro
52 |
53 | ; Shall be called after the INTERSECT macro to close the context opened by BEGIN_INTERSECT.
54 | %macro END_INTERSECT 0.nolist
55 | %pop _INTERSECT_CTX_ ; %$intersection and %$cardinality belong to this context, so read them before this point
56 | %endmacro
57 |
58 | ; The macro computes the intersection of two lists. Must be used between BEGIN_INTERSECT and END_INTERSECT.
59 | ; Input: two comma-separated lists, enclosed in curly braces.
60 | ; Output:
61 | ; - Intersection will be located in the %$intersection context macro (can be empty).
62 | ; - Count of intersection elements list will be stored in the %$cardinality context variable.
63 | %macro INTERSECT 2.nolist
64 | %ifnctx _INTERSECT_CTX_
65 | %fatal "Not in the context: _INTERSECT_CTX_"
66 | %endif
67 |
68 | %xdefine %%list1 %1
69 | %xdefine %%list2 %2
70 |
71 | FOREACH %%list1,{?INTERSECT_BODY {%%list2},}
72 | %endmacro
73 |
74 | ; Helper macro to concatenate two lists; an empty list adds neither an element nor a separating comma.
75 | ; The result will be stored in the 3rd parameter that must be a macro identifier.
76 | %macro CONCATENATE 3.nolist
77 | %ifnid %3
78 | %fatal "CONCATENATE: 3rd parameter must be a macro identifier."
79 | %endif
80 | %define %3 %[%1]
81 | %ifnempty %3
82 | %ifnempty %2
83 | %define %3 %[%3],%[%2]
84 | %endif
85 | %else
86 | %define %3 %[%2]
87 | %endif
88 | %endmacro
89 |
90 | ; Helper macro that searches for the specified element in the input list (the comparison ignores case).
91 | ; Input:
92 | ; - Last parameter - target element
93 | ; - First parameters - the list to search
94 | ; Output:
95 | ; - The macro is context dependent: %$elem_exists is undefined first and defined only when the element is found.
96 | %macro ?FIND 2-*.nolist
97 | %ifnctx _FIND_CTX_
98 | %fatal "Not in the context: _FIND_CTX_"
99 | %endif
100 | %rotate -1 ; bring the last parameter (the target element) to %1
101 | %xdefine %%elem_to_check %1
102 | %undef %$elem_exists
103 |
104 | %rep %0-1 ; one pass per list parameter
105 | %rotate -1 ; walks the list from its last element to its first
106 | %ifidni %%elem_to_check, %1
107 | %define %$elem_exists %1
108 | %exitrep
109 | %endif
110 | %endrep
111 | %endmacro
112 |
113 | ; Functor for INTERSECT: when element %2 occurs in list %1, append it to the enclosing context's intersection and bump its cardinality.
114 | %macro ?INTERSECT_BODY 2.nolist
115 | %xdefine %%list %1
116 | %xdefine %%elem %2
117 |
118 | %push _FIND_CTX_ ; ?FIND reports through this temporary context
119 | ?FIND %%list,%%elem
120 | %ifdef %$elem_exists
121 | %ifempty %$$intersection
122 | %define %$$intersection %2
123 | %else
124 | %define %$$intersection %[%$$intersection],%%elem
125 | %endif
126 | %assign %$$cardinality %$$cardinality + 1
127 | %endif
128 | %pop _FIND_CTX_
129 | %endmacro
130 |
131 | %endif
132 |
--------------------------------------------------------------------------------
/mpn/mpn-asm.c:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include "mpn-asm.h"
17 |
18 | #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
19 | // clang-format off
20 | const unsigned char __mpi_clz_tab[129] = { /* __mpi_clz_tab[i] == (bit length of i) + 1, for i = 0 .. 128 */
21 | 1, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
22 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
23 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
24 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
25 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
26 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
27 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
28 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
29 | 9,
30 | };
31 | // clang-format on
32 | #endif
33 |
--------------------------------------------------------------------------------
/mpn/mpn-binary.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef MULTIPLE_PRECISION_BINARY_H
17 | #define MULTIPLE_PRECISION_BINARY_H
18 |
19 | #include
20 | #include
21 |
22 | #define BITS_PER_BYTE 8 /* @constant: bits per byte */
23 | #define BITS_PER_CHAR 4 /* @constant: bits per character (presumably one hex digit of the string form -- TODO confirm) */
24 | #define MPN_MAX_BITS (UINT_MAX / BITS_PER_BYTE) /* @note: mpn width limitation */
25 | #define MPN_BITS_TO_BYTES(n) (((n) + BITS_PER_BYTE - 1) / BITS_PER_BYTE) /* bytes needed to hold |n| bits, rounded up */
26 |
27 | /* swap two lvalues |a| and |b| of type |type| through a temporary */
28 | #define SWAP(type, a, b) \
29 | do { \
30 | type __t = a; \
31 | (a) = (b); \
32 | (b) = __t; \
33 | } while (0)
34 |
35 | /* copy(increment): dst[i] = src[i] for i = 0 .. to-1, in ascending index order */
36 | #define COPY(dst, src, to) \
37 | for (mpn_size_t __i = 0; __i < (to); __i++) { (dst)[__i] = (src)[__i]; }
38 |
39 | /* expand by zeros: dst[i] = 0 for i = from .. to-1 */
40 | #define ZEROIZE(dst, from, to) \
41 | for (mpn_size_t __i = (from); __i < (to); __i++) { (dst)[__i] = 0; }
42 |
43 | /* copy src[0 .. srclen-1] into dst, then zero-fill dst[srclen .. dstlen-1]; expands to a bare { } block */
44 | #define ZEXPAND(dst, dstlen, src, srclen) \
45 | { \
46 | mpn_size_t __i; \
47 | for (__i = 0; __i < (srclen); __i++) { (dst)[__i] = (src)[__i]; } \
48 | for (; __i < (dstlen); __i++) { (dst)[__i] = 0; } \
49 | }
50 |
51 | /**
52 | * mpn alignment: byte distance from |ptr| up to the next |alignment| boundary, 0 if already aligned (assumes a power-of-two |alignment|)
53 | */
54 | MPN_INLINE mpn_size_t mpi_aligned_diff(void *ptr, uintptr_t alignment)
55 | {
56 | return (mpn_size_t)(((uintptr_t)0 - ((uintptr_t)ptr & (alignment - 1))) & (alignment - 1));
57 | }
58 |
59 | MPN_INLINE mpn_size_t mpi_aligned_size(mpn_size_t size, mpn_size_t alignment)
60 | {
61 | return (size + (alignment - 1)) & ~(alignment - 1); /* round |size| up to a multiple of |alignment| (assumes a power of two) */
62 | }
63 |
64 | MPN_INLINE mpn_limb_t *mpi_aligned_pointer(void *ptr, uintptr_t alignment)
65 | {
66 | return (mpn_limb_t *)((uintptr_t)((unsigned char *)ptr + alignment - 1) & ~(alignment - 1)); /* round the address up to |alignment| (assumes a power of two) */
67 | }
68 |
69 | /**
70 | * basic constant-time operation
71 | */
72 | /* spread the top bit of |a| over the whole limb: all-ones if MSB(a) == 1; otherwise, all-zeros */
73 | MPN_INLINE mpn_limb_t mpn_limb_test_msb_consttime(mpn_limb_t a)
74 | {
75 | return ~((a >> (sizeof(mpn_limb_t) * BITS_PER_BYTE - 1)) - (mpn_limb_t)1);
76 | }
77 |
78 | /* all-ones when |a| equals zero, all-zeros otherwise: ~a & (a - 1) has its top bit set only for a == 0 */
79 | MPN_INLINE mpn_limb_t mpn_limb_is_zero_consttime(mpn_limb_t a)
80 | {
81 | const mpn_limb_t top = (mpn_limb_t)(~a & (a - 1)) >> (sizeof(mpn_limb_t) * BITS_PER_BYTE - 1);
82 | return ~(top - (mpn_limb_t)1);
83 | }
84 |
85 | /* copy under mask: every bit of dst[i] comes from a[i] where |mask| is 1 and from b[i] where |mask| is 0 */
86 | MPN_INLINE void mpn_masked_copy_consttime(mpn_limb_t *dst, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t len,
87 | mpn_limb_t mask)
88 | {
89 | const mpn_limb_t inverse = ~mask;
90 | for (mpn_size_t k = 0; k < len; k++) { dst[k] = (b[k] & inverse) | (a[k] & mask); }
91 | }
92 |
93 | /* conditional swap: exchange a[0..n) with b[0..n) when |cond| is non-zero; leave both untouched otherwise */
94 | MPN_INLINE void mpn_masked_swap_consttime(mpn_limb_t *a, mpn_limb_t *b, mpn_size_t n, unsigned cond)
95 | {
96 | const mpn_limb_t c = cond;
97 | const mpn_limb_t pick = ((~c & (c - 1)) >> (sizeof(mpn_limb_t) * BITS_PER_BYTE - 1)) - 1; /* 0 for c == 0, all-ones otherwise */
98 | for (mpn_size_t k = 0; k < n; k++) {
99 | const mpn_limb_t diff = (a[k] ^ b[k]) & pick;
100 | a[k] ^= diff;
101 | b[k] ^= diff;
102 | }
103 | }
104 |
105 | /* conditional move: dst[] = cond ? src[] : dst[]; the mask handed on is all-ones for any non-zero |cond|, else all-zeros */
106 | MPN_INLINE void mpn_masked_move_consttime(mpn_limb_t *dst, const mpn_limb_t *src, mpn_size_t len, unsigned cond)
107 | {
108 | mpn_masked_copy_consttime(dst, src, dst, len, (mpn_limb_t)0 - (mpn_limb_t)(cond != 0)); /* parenthesised: '-' binds tighter than '!=' */
109 | }
110 |
111 | #if defined(__cplusplus)
112 | extern "C" {
113 | #endif
114 |
115 | /**
116 | * mpn: ALL-ones if buff[::] is zero, otherwise ALL-zeros
117 | */
118 | mpn_limb_t mpn_is_zero(const mpn_limb_t *buff, mpn_size_t bufflen);
119 |
120 | /**
121 | * mpn: ALL-ones if buff[::] is zero, otherwise ALL-zeros(constant-time version)
122 | */
123 | mpn_limb_t mpn_is_zero_consttime(const mpn_limb_t *buff, mpn_size_t bufflen);
124 |
125 | /**
126 | * mpn: get most significant bit
127 | */
128 | mpn_size_t mpn_bits(const mpn_limb_t *data, mpn_size_t size);
129 |
130 | /**
131 | * mpn: get most significant bit(constant-time version)
132 | */
133 | mpn_size_t mpn_bits_consttime(const mpn_limb_t *data, mpn_size_t size);
134 |
135 | /**
136 | * mpn: get most significant limb
137 | */
138 | mpn_size_t mpn_limbs(const mpn_limb_t *data, mpn_size_t size);
139 |
140 | /**
141 | * mpn: get most significant limb(constant-time version)
142 | */
143 | mpn_size_t mpn_limbs_consttime(const mpn_limb_t *data, mpn_size_t size);
144 |
145 | /**
146 | * mpn: unsigned comparison
147 | *
148 | * @note:
149 | * 1. return 1 if a[] > b[]; 0 if a[] = b[]; -1 if a[] < b[]
150 | */
151 | int mpn_cmp(const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, mpn_size_t bsize);
152 |
153 |
154 | /**
155 | * mpn: left shift
156 | *
157 | * @note:
158 | * 1. required bit_size(r) >= bit_size(a) + nbits
159 | * 2. the return is number of |mpn_limb_t| of the result |r|
160 | * 3. r == a is acceptable
161 | */
162 | mpn_size_t mpn_lshift(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_size_t nbits);
163 |
164 | /**
165 | * mpn: right shift
166 | *
167 | * @note:
168 | * 1. required bit_size(r) >= bit_size(a) - nbits
169 | * 2. the return is number of |mpn_limb_t| of the result |r|
170 | * 3. r == a is acceptable
171 | */
172 | mpn_size_t mpn_rshift(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_size_t nbits);
173 |
174 | /**
175 | * mpn addition: carry, r = a[:n] + b[:n]
176 | */
177 | mpn_limb_t mpn_add_vectorized(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t n);
178 |
179 | /**
180 | * mpn: carry, r[] = a[] + b[]
181 | */
182 | mpn_limb_t mpn_add(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b,
183 | mpn_size_t bsize);
184 |
185 | /**
186 | * mpn: carry, r[:n] = a[:n] + w
187 | */
188 | mpn_limb_t mpn_inc_vectorized(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t size, mpn_limb_t w);
189 |
190 | /**
191 | * mpn: carry, r[] = a[] + w
192 | */
193 | mpn_limb_t mpn_inc(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w);
194 |
195 | /**
196 | * mpn subtraction: borrow, r[:n] = a[:n] - b[:n]
197 | *
198 | * @note:
199 | * 1. make sure |r| has enough room to store the result
200 | * minimum advised size: MAX(bit_size(a), bit_size(b)) + 1
201 | */
202 | mpn_limb_t mpn_sub_vectorized(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t n);
203 |
204 | /**
205 | * mpn subtraction: size, r[] = a[] - b[]
206 | */
207 | mpn_size_t mpn_sub(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b,
208 | mpn_size_t bsize);
209 |
210 | /**
211 | * mpn: borrow, r[:n] = a[:n] - w
212 | */
213 | mpn_limb_t mpn_dec_vectorized(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w);
214 |
215 | /**
216 | * mpn: size, r[] = a[] - w
217 | */
218 | mpn_size_t mpn_dec(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w);
219 |
220 | /**
221 | * mpn multiplication: extension, r[:asize+bsize] = a[:asize] * b[:bsize]
222 | * @note:
223 | * 1. (IMPORTANT)make sure size of |r| isn't less than |asize| + |bsize|
224 | * 2. the return is the highest unit |mpn_limb_t|
225 | */
226 | mpn_limb_t mpn_mul(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, mpn_size_t bsize);
227 |
228 | /**
229 | * mpn multiply-and-add: extension, r[] += a[] * b
230 | * @note:
231 | * 1. (IMPORTANT)make sure size of |r| isn't less than |asize|
232 | * 2. the return is extension of result of multiply-and-add.
233 | */
234 | mpn_limb_t mpn_mul_acc(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t b);
235 |
236 | /**
237 | * mpn square: r[] = a[] ^ 2
238 | *
239 | * @note:
240 | * 1. make sure |r| has enough room to store the result
241 | * minimum advised size: 2 * bit_size(a)
242 | */
243 | mpn_limb_t mpn_sqr(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t anum);
244 |
245 | /**
246 | * mpn division: xsize, q, x(q = x / y, x = x % y)
247 | */
248 | mpn_size_t mpn_div(mpn_limb_t *q, mpn_size_t *qsize, mpn_limb_t *x, mpn_size_t xsize, mpn_limb_t *y, mpn_size_t ysize);
249 |
250 | /**
251 | * mpn modulo: x[] = x[] % y[]
252 | */
253 | mpn_size_t mpn_mod(mpn_limb_t *x, mpn_size_t xsize, mpn_limb_t *y, mpn_size_t ysize);
254 |
255 | /**
256 | * mpn: division(n by 1)
257 | *
258 | * @note:
259 | * 1. required length of q should be not smaller than size
260 | */
261 | mpn_size_t mpn_div_limb(mpn_limb_t q[], const mpn_limb_t x[], mpn_size_t size, mpn_limb_t *r, mpn_limb_t d);
262 |
263 | /**
264 | * mpn: division(n by 2)
265 | *
266 | * @note:
267 | * 1. required length of q should be not smaller than size
268 | */
269 | mpn_size_t mpn_div_double_limbs(mpn_limb_t q[], mpn_limb_t r[2], const mpn_limb_t n[], mpn_size_t nn,
270 | const mpn_limb_t d[2]);
271 |
272 | /**
273 | * @brief: multiplicative inversion
274 | *
275 | * @params:
276 | * a/asize: source (value) BigNum A whose size is asize
277 | * m/msize: source (modulus) BigNum M whose size is msize
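278 | * optimizer: caller-supplied mpn_optimizer_t
279 | * (NOTE(review): invbuf/abuf/mbuf, listed here before, are not parameters of this
280 | * function; presumably the scratch space now comes from |optimizer| -- confirm)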
281 | * r : result BigNum
282 | */
283 | mpn_size_t mpn_mod_invert(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *m, mpn_size_t msize,
284 | mpn_optimizer_t *optimizer);
285 |
286 | /**
287 | * mpn: create mpn from hex string
288 | */
289 | mpn_size_t mpn_from_string(mpn_limb_t *r, mpn_size_t size, const char *in, mpn_size_t inlen);
290 |
291 | /**
292 | * mpn: convert mpn to hex string
293 | */
294 | mpn_size_t mpn_to_string(char *out, mpn_size_t outsize, const mpn_limb_t *a, mpn_size_t size);
295 |
296 | /**
297 | * mpn: create mpn from big-endian octets
298 | */
299 | mpn_size_t mpn_from_octets(mpn_limb_t *r, mpn_size_t size, const unsigned char *in, mpn_size_t inlen);
300 |
301 | /**
302 | * mpn: convert mpn to big-endian octets
303 | */
304 | mpn_size_t mpn_to_octets(unsigned char *out, mpn_size_t outsize, const mpn_limb_t *a, mpn_size_t size);
305 |
306 | /**
307 | * leading zeros counting(constant-time version)
308 | */
309 | mpn_size_t mpn_limb_nlz_consttime(mpn_limb_t x);
310 |
311 | /**
312 | * trailing zeros counting(constant-time version)
313 | */
314 | mpn_size_t mpn_limb_ntz_consttime(mpn_limb_t x);
315 |
316 | /**
317 | * greatest common divisor(mpn_limb_t)
318 | */
319 | mpn_limb_t mpn_limb_gcd(mpn_limb_t a, mpn_limb_t b);
320 |
321 | /**
322 | * mpn: generate in range
323 | *
324 | * @note:
325 | * 1. length of |r| >= hilen
326 | */
327 | int mpn_random_range(mpn_limb_t *r, mpn_size_t maxtries, const mpn_limb_t *lo, mpn_size_t lolen, const mpn_limb_t *hi,
328 | mpn_size_t hilen, int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state);
329 |
330 | /**
331 | * test if |a| and |b| are coprime
332 | */
333 | int mpn_is_coprime(mpn_limb_t *a, mpn_size_t asize, mpn_limb_t *b, mpn_size_t bsize, mpn_optimizer_t *optimizer);
334 |
335 | #if defined(__cplusplus)
336 | }
337 | #endif
338 |
339 | #endif
340 |
--------------------------------------------------------------------------------
/mpn/mpn-montgomery.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef MULTIPLE_PRECISION_MONTGOMERY_H
17 | #define MULTIPLE_PRECISION_MONTGOMERY_H
18 |
19 | #include
20 |
21 | #if defined(__cplusplus)
22 | extern "C" {
23 | #endif
24 |
25 | typedef struct {
26 | mpn_size_t modbits; /**< size of modulus in bit */
27 | mpn_size_t modsize; /**< size of modulus in mpn_limb_t */
28 | mpn_limb_t k0; /**< low word of (1/modulus) mod R */
29 | mpn_limb_t *modulus; /**< modulus */
30 | mpn_limb_t *montR; /**< mont_enc(1) */
31 | mpn_limb_t *montRR; /**< mont_enc(1) ^ 2 */
32 |
33 | mpn_optimizer_t *optimizer; /**< optimizer for montgomery operation */
34 | } mpn_montgomery_t;
35 |
36 | /**
37 | * mpn montgomery: create montgomery context
38 | *
39 | */
40 | mpn_montgomery_t *mpn_montgomery_create(mpn_size_t mbits, mpn_size_t psize);
41 |
42 | /**
43 | * mpn montgomery: destory montgomery context
44 | *
45 | */
46 | void mpn_montgomery_destory(mpn_montgomery_t *mont);
47 |
48 | /**
49 | * mpn montgomery: intialize montgomery context with modulus
50 | *
51 | */
52 | int mpn_montgomery_set_modulus_bin(mpn_montgomery_t *mont, const mpn_limb_t *modulus, mpn_size_t mbits);
53 |
54 | /**
55 | * mpn montgomery: montgomery reduction
56 | *
57 | * @note:
58 | * 1. m0: low word of (1 / modulus) mod b
59 | * 2. r = T/R mod m
60 | */
61 | void mpn_montgomery_reduce_bin(mpn_limb_t *r, mpn_limb_t *product, const mpn_limb_t *m, mpn_size_t mnum, mpn_limb_t m0);
62 |
63 | /**
64 | * mpn montgomery: r[] = to_mont(a[])
65 | *
66 | * @requirements:
67 | * 1. length of r: modsize
68 | * 2. length of a: modsize
69 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t)
70 | */
71 | void mpn_montgomery_encode(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
72 |
73 | /**
74 | * mpn montgomery: r[] = from_mont(a)
75 | *
76 | * @requirements:
77 | * 1. length of r: modsize
78 | * 2. length of a: modsize
79 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t)
80 | */
81 | void mpn_montgomery_decode(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
82 |
83 | /**
84 | * mpn montgomery: r = (a + b) mod m
85 | *
86 | * @requirements:
87 | * 1. length of r: modsize
88 | * 2. length of a: modsize
89 | * 3. length of b: modsize
90 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t)
91 | */
92 | void mpn_montgomery_add(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont);
93 |
94 | /**
95 | * mpn montgomery: r = (a - b) mod m
96 | *
97 | * @requirements:
98 | * 1. length of r: modsize
99 | * 2. length of a: modsize
100 | * 3. length of b: modsize
101 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t)
102 | */
103 | void mpn_montgomery_sub(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont);
104 |
105 | /**
106 | * mpn montgomery: r = -a mod m = (m - a) mod m
107 | *
108 | * @requirements:
109 | * 1. length of r: modsize
110 | * 2. length of a: modsize
111 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t)
112 | */
113 | void mpn_montgomery_negative(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
114 |
115 | /**
116 | * mpn montgomery: r = (a / 2) mod m
117 | *
118 | * @requirements:
119 | * 1. length of r: modsize
120 | * 2. length of a: modsize
121 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t)
122 | */
123 | void mpn_montgomery_halve(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
124 |
125 | /**
126 | * mpn montgomery: r = (a * 2) mod m
127 | *
128 | * @requirements:
129 | * 1. length of r: modsize
130 | * 2. length of a: modsize
131 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t)
132 | */
133 | void mpn_montgomery_double(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
134 |
135 | /**
136 | * mpn montgomery: r = (a * 3) mod m
137 | *
138 | * @requirements:
139 | * 1. length of r: modsize
140 | * 2. length of a: modsize
141 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t)
142 | */
143 | void mpn_montgomery_triple(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
144 |
145 | /**
146 | * mpn montgomery: r = prod mod m
147 | *
148 | * @requirements:
149 | * 1. length of r: modsize
150 | * 2. length of prod: modsize
151 | * 3. memory size from the pool: N/A
152 | */
153 | void mpn_montgomery_reduce(mpn_limb_t *r, mpn_limb_t *prod, mpn_montgomery_t *mont);
154 |
155 | /**
156 | * mpn montgomery: r = (a * b) mod m
157 | *
158 | * @requirements:
159 | * 1. length of r: modsize
160 | * 2. length of a: modsize
161 | * 3. length of b: modsize
162 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t) * 2
163 | */
164 | void mpn_montgomery_mul(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont);
165 |
166 | /**
167 | * mpn montgomery: r = (a ^ 2) mod m
168 | *
169 | * @requirements:
170 | * 1. length of r: modsize
171 | * 2. length of a: modsize
172 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t) * 2
173 | */
174 | void mpn_montgomery_square(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont);
175 |
176 | /**
177 | * montgomery factor k0 = -((modulus^-1 mod B) %B)
178 | */
179 | mpn_limb_t mpn_montgomery_factor(mpn_limb_t m0);
180 |
181 | /**
182 | * mpn montgomery: binary exponentiation
183 | *
184 | */
185 | mpn_size_t mpn_montgomery_exp(mpn_limb_t *y, const mpn_limb_t *x, mpn_size_t xsize, const mpn_limb_t *e,
186 | mpn_size_t ebits, mpn_montgomery_t *mont);
187 |
188 | /**
189 | * mpn montgomery: binary exponentiation(consttime)
190 | *
191 | */
192 | mpn_size_t mpn_montgomery_exp_consttime(mpn_limb_t *y, const mpn_limb_t *x, mpn_size_t xsize, const mpn_limb_t *e,
193 | mpn_size_t ebits, mpn_montgomery_t *mont);
194 |
195 | #if defined(__cplusplus)
196 | }
197 | #endif
198 |
199 | #endif
200 |
--------------------------------------------------------------------------------
/mpn/mpn-optimizer.c:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include "mpn-binary.h"
17 |
18 | /**
19 | * mpn optimizer: create optimizer for mpn operation
20 | *
21 | * @note:
22 | * 1. room: room size of optimizer chunk, in unit of 'mpn_limb_t'
23 | */
24 | mpn_optimizer_t *mpn_optimizer_create(mpn_size_t room)
25 | {
26 | if (room == 0) {
27 | /* it's meaningless to create 0-length optimizer */
28 | return NULL;
29 | }
30 | size_t size = sizeof(mpn_optimizer_t) + MPN_LIMB_BYTES + room * sizeof(mpn_limb_t);
31 | mpn_optimizer_t *optimizer = (mpn_optimizer_t *)MPI_ALLOCATE(size);
32 | if (optimizer != NULL) {
33 | optimizer->size = 0;
34 | optimizer->next = NULL;
35 | optimizer->room = room;
36 | optimizer->chunk = mpi_aligned_pointer((unsigned char *)optimizer + sizeof(mpn_optimizer_t), MPN_LIMB_BYTES);
37 | }
38 |
39 | return optimizer;
40 | }
41 |
42 | /**
43 | * mpn optimizer: reset optimizer, mark all as unused
44 | */
45 | void mpn_optimizer_reset(mpn_optimizer_t *optimizer)
46 | {
47 | mpn_optimizer_t *curr = optimizer;
48 | while (curr != NULL) {
49 | curr->size = 0;
50 | curr = curr->next;
51 | }
52 | }
53 |
54 | /**
55 | * mpn optimizer: destory optimizer
56 | */
57 | void mpn_optimizer_destory(mpn_optimizer_t *optimizer)
58 | {
59 | mpn_optimizer_t *curr = optimizer, *next;
60 | while (curr != NULL) {
61 | next = curr->next;
62 | MPI_DEALLOCATE(curr); /* cleanse and free mpn_optimizer_t node */
63 | curr = next;
64 | }
65 | }
66 |
67 | /**
68 | * mpn optimizer: get memory chunk for mpn operation
69 | *
70 | * @note:
71 | * 1. size: size of chunk, in unit of 'mpn_limb_t'
72 | */
73 | mpn_limb_t *mpn_optimizer_get_limbs(mpn_optimizer_t *optimizer, mpn_size_t size)
74 | {
75 | if (optimizer == NULL) {
76 | MPI_RAISE_ERROR(-EINVAL);
77 | return NULL;
78 | }
79 | if (size == 0) { return NULL; }
80 |
81 | mpn_size_t total = 0;
82 | mpn_optimizer_t *curr = optimizer, *prev = NULL;
83 | while (curr != NULL) {
84 | total += curr->size;
85 | prev = curr;
86 | curr = curr->next;
87 | }
88 |
89 | if (prev->room - prev->size >= size) {
90 | curr = prev;
91 | } else {
92 | mpn_size_t room = size + total / 2; // XXX: optimize growth rule
93 | prev->next = curr = mpn_optimizer_create(room);
94 | }
95 |
96 | if (curr != NULL) {
97 | mpn_limb_t *p = &curr->chunk[curr->size];
98 | curr->size += size;
99 |
100 | return p;
101 | } else {
102 | MPI_RAISE_ERROR(-ENOMEM);
103 |
104 | return NULL;
105 | }
106 | }
107 |
108 | /**
109 | * mpn optimizer: put back memory chunk
110 | */
111 | void mpn_optimizer_put_limbs(mpn_optimizer_t *optimizer, mpn_size_t size)
112 | {
113 | if (optimizer == NULL) { return; }
114 |
115 | mpn_optimizer_t *curr = optimizer, *prev = NULL;
116 | while (curr != NULL) {
117 | prev = curr;
118 | curr = curr->next;
119 | }
120 |
121 | if (prev->size >= size) { prev->size -= size; }
122 | }
123 |
--------------------------------------------------------------------------------
/mpn/mpn-optimizer.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Ethan.cr.yp.to
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef MULTIPLE_PRECISION_OPTIMIZER_H
17 | #define MULTIPLE_PRECISION_OPTIMIZER_H
18 |
19 | #include
20 |
21 | #if defined(__cplusplus)
22 | extern "C" {
23 | #endif
24 |
25 | typedef struct mpn_optimizer_t {
26 | mpn_size_t size; /**< offset of used chunk */
27 | mpn_size_t room; /**< max size of chunk */
28 | mpn_limb_t *chunk; /**< mpn chunk */
29 | struct mpn_optimizer_t *next; /**< next optimizer node */
30 | } mpn_optimizer_t;
31 |
32 | /**
33 | * mpn optimizer: create optimizer for mpn operation
34 | *
35 | * @note:
36 | * 1. room: room size of optimizer chunk, in unit of 'mpn_limb_t'
37 | */
38 | mpn_optimizer_t *mpn_optimizer_create(mpn_size_t room);
39 |
40 | /**
41 | * mpn optimizer: destroy optimizer
42 | */
43 | void mpn_optimizer_destory(mpn_optimizer_t *opt);
44 |
45 | /**
46 | * mpn optimizer: get memory chunk for mpn operation
47 | *
48 | * @note:
49 | * 1. size: size of chunk, in unit of 'mpn_limb_t'
50 | */
51 | mpn_limb_t *mpn_optimizer_get_limbs(mpn_optimizer_t *opt, mpn_size_t size);
52 |
53 | /**
54 | * mpn optimizer: put back memory chunk
55 | */
56 | void mpn_optimizer_put_limbs(mpn_optimizer_t *optimizer, mpn_size_t size);
57 |
58 | /**
59 | * mpn optimizer: reset optimizer, mark all as unused
60 | */
61 | void mpn_optimizer_reset(mpn_optimizer_t *opt);
62 |
63 | #if defined(__cplusplus)
64 | }
65 | #endif
66 |
67 | #endif
68 |
--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
# unit tests: executable linked against gtest and registered with CTest
add_executable(unittest-mpi unittest-mpi.cpp)
target_link_libraries(unittest-mpi mpi crypto dl gtest pthread)
add_test(
    NAME unittest-mpi
    COMMAND unittest-mpi
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
ConfigureTarget(unittest-mpi)

# with BUILD_VENDOR the `openssl` target is built before the unit tests
if(BUILD_VENDOR)
    add_dependencies(unittest-mpi openssl)
endif()

# benchmark: built as gnu++17
add_executable(benchmark benchmark.cpp)
target_link_libraries(benchmark mpi crypto pthread dl)
target_compile_options(benchmark PRIVATE -std=gnu++17)
ConfigureTarget(benchmark)

# with BUILD_VENDOR the `openssl` target is built before the benchmark
if(BUILD_VENDOR)
    add_dependencies(benchmark openssl)
endif()

# both executables are installed into the `bin` directory
install(TARGETS benchmark unittest-mpi RUNTIME DESTINATION bin)
--------------------------------------------------------------------------------
/tests/test.cc:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2022 Kiran Nowak(kiran.nowak@gmail.com)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include
18 | #include
19 | #include
20 | #include
21 |
22 | #include "logger.h"
23 | #include "tabulate.h"
24 | #include "benchmark.h"
25 |
/**
 * Generic bit-reversal entry point (declaration only).
 *
 * No definition of this template is visible in this file; the calls made here resolve to the
 * non-template overloads for `unsigned char` and `unsigned int`.
 *
 * @tparam T  type of the value to reverse
 * @param  n  value whose bit order is to be mirrored
 * @return the mirrored value, same type as the argument
 */
template <typename T>
T reverse(T n);
28 |
/**
 * Mirror the bit order of one byte (bit 0 <-> bit 7, bit 1 <-> bit 6, ...).
 *
 * Purely table driven: a 256-entry table indexed by the byte itself by default, or a 16-entry
 * nibble table when USE_SMALL_LOOKUP_TABLE is defined.
 *
 * @param n  byte to mirror
 * @return the byte with its bits in reverse order
 */
unsigned char reverse(unsigned char n)
{
#ifndef USE_SMALL_LOOKUP_TABLE
    // Row r holds the results for inputs 16 * r .. 16 * r + 15.
    // clang-format off
    static const unsigned char mirrored_byte[] = {
        0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
        0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
        0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
        0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
        0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
        0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
        0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
        0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
        0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
        0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
        0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
        0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
        0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
        0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
        0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
        0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
    };
    // clang-format on

    return mirrored_byte[n];
#else
    // clang-format off
    static const unsigned char mirrored_nibble[16] = {
        0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
        0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
    };
    // clang-format on

    // Mirror each nibble on its own, then let the two halves trade places:
    // the mirrored low nibble becomes the high one and vice versa.
    const unsigned char low_nibble = n & 0x0f;
    const unsigned char high_nibble = n >> 4;

    return (mirrored_nibble[low_nibble] << 4) | mirrored_nibble[high_nibble];
#endif
}
92 |
/**
 * Mirror the bit order of an unsigned int (bit 0 <-> most significant bit, ...).
 *
 * The bytes are peeled off with shifts, least significant first, mirrored through the
 * `unsigned char` overload and pushed into the result from the low end, so the byte that starts
 * lowest ends up highest. Working on the value instead of on its bytes in memory keeps the result
 * independent of the host byte order and of sizeof(unsigned int); like the byte table, it assumes
 * 8-bit bytes.
 *
 * @param n  value to mirror
 * @return |n| with its bits in reverse order
 */
unsigned int reverse(unsigned int n)
{
    unsigned int mirrored = 0;

    for (unsigned i = 0; i < sizeof(unsigned int); i++) {
        // Widen to unsigned int before shifting: a mirrored byte >= 0x80 must never be shifted
        // into the sign bit of a promoted (signed) int.
        const unsigned int byte = reverse(static_cast<unsigned char>(n & 0xffu));

        mirrored = (mirrored << 8) | byte;
        n >>= 8;
    }

    return mirrored;
}
110 |
/**
 * Reference bit reversal: one bit per iteration, no lookup table.
 *
 * The result starts out as the input (which contributes its bit 0); every further bit of the
 * input is shifted in from the right. The loop stops as soon as only zero bits are left, and
 * the shifts that were skipped are applied in one go at the end.
 *
 * @param num  value to mirror
 * @return |num| with its bits in reverse order
 */
unsigned int reverse_ref(unsigned int num)
{
    unsigned int pending_shifts = sizeof(num) * 8 - 1;
    unsigned int mirrored = num;

    for (num >>= 1; num != 0; num >>= 1) {
        mirrored = (mirrored << 1) | (num & 1);
        --pending_shifts;
    }

    return mirrored << pending_shifts;
}
127 |
128 |
/**
 * Count the leading zero bits of a 32-bit value, using a de Bruijn table lookup.
 *
 * @param x  value to inspect
 * @return number of zero bits above the highest set bit, in the range [0, 32];
 *         32 when |x| is 0
 */
static int clz(unsigned int x)
{
    static_assert(sizeof(unsigned int) == 4, "unsigned int must be 32 bits");

    // clang-format off
    static const char debruijn32[32] = {
        0, 31,  9, 30,  3,  8, 13, 29,
        2,  5,  7, 21, 12, 24, 28, 19,
        1, 10,  4, 14,  6, 22, 25, 20,
        11, 15, 23, 26, 16, 27, 17, 18,
    };
    // clang-format on

    // The lookup below cannot tell 0 apart from a value that has bit 31 set (both end up on
    // table index 0, i.e. "no leading zeros"), so the all-zero input is answered up front.
    if (x == 0) { return 32; }

    // Copy the highest set bit into every position below it ...
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    // ... so that adding one leaves a single bit just above it (wrapping to 0 for bit 31).
    x++;

    // Multiplying by the constant and keeping the top five bits gives the table index.
    return debruijn32[x * 0x076be629 >> 27];
}
150 |
151 | int merge(unsigned int &merged, unsigned int hi, unsigned int lo)
152 | {
153 | auto hbits = clz(hi);
154 | auto lbits = clz(lo);
155 | merged = lo | reverse(hi);
156 |
157 | return static_cast(lbits + hbits) - static_cast(sizeof(unsigned int) * 8);
158 | }
159 |
/**
 * One record made of three text fields; the constructor copies its arguments into them.
 */
struct data {
    std::string ma; // first field, copy of |a|
    std::string mb; // second field, copy of |b|
    std::string mc; // third field, copy of |c|

    data(const std::string &a, const std::string &b, const std::string &c)
        : ma(a)
        , mb(b)
        , mc(c)
    {
    }
};
166 |
167 | namespace logging
168 | {
169 | template <>
170 | inline std::string to_string(const std::vector &v)
171 | {
172 | using namespace tabulate;
173 | Table table("Company", "Contact", "Country");
174 | table[0].format().align(Align::center);
175 | for (auto const &item : v) { table.add(item.ma, item.mb, item.mc); }
176 |
177 | // Iterate over rows in the table
178 | size_t index = 0;
179 | for (auto &row : table) {
180 | row.format().styles(Style::bold);
181 |
182 | // Set blue background color for alternate rows
183 | if (index > 0 && index % 2 == 0) {
184 | for (auto &cell : row) { cell.format().background_color(Color::blue); }
185 | }
186 | index += 1;
187 | }
188 |
189 | return table.xterm();
190 | }
191 | } // namespace logging
192 |
193 | int main()
194 | {
195 | {
196 | int a = 1;
197 | float b = 2.0;
198 | std::string c = "three";
199 | bool d = true;
200 | std::vector e{1, 3, 5, 7, 9};
201 | std::vector f{
202 | data("Alfreds Futterkiste", "Maria Anders", "Germany"),
203 | data("Centro comercial Moctezuma", "Francisco Chang", "Mexico"),
204 | data("Ernst Handel", "Roland Mendel", "Austria"),
205 | data("Island Trading", "Helen Bennett", "UK"),
206 | data("Laughing Bacchus Winecellars", "Yoshi Tannamuri", "Canada"),
207 | data("Magazzini Alimentari Riuniti", "Giovanni Rovelli", "Italy"),
208 | };
209 |
210 | enum flags {
211 | FLAG1 = 0x1,
212 | FLAG2 = 0x2,
213 | FLAG3 = 0x4,
214 | } g = FLAG2,
215 | h = static_cast(FLAG1 | FLAG3);
216 |
217 | llogi(a, b, c, d, e, f, f[0].mc, g, h);
218 | }
219 |
220 | {
221 | struct {
222 | unsigned int hi, lo;
223 | } datas[] = {
224 | {.hi = 0x01, .lo = 0x1000},
225 | {.hi = 0x09, .lo = 0x1000},
226 | {.hi = 0x10, .lo = 0x1000},
227 | {.hi = 0xF1, .lo = 0x1000},
228 | };
229 |
230 | std::cout << std::endl;
231 | for (auto const &data : datas) {
232 | int rbits;
233 | unsigned int merged;
234 |
235 | printf("merge(0x%02X, 0x%04X): ", data.hi, data.lo);
236 | if ((rbits = merge(merged, data.hi, data.lo)) >= 0) {
237 | printf("0x%08X, remain-bits = %2d\n", merged, rbits);
238 | } else {
239 | printf("failed.\n");
240 | }
241 | }
242 | }
243 |
244 | {
245 | BENCHER(reverse_, DoNotOptimize(reverse(__j)), 20, 20000000);
246 | BENCHER(reverse_ref_, DoNotOptimize(reverse_ref(__j)), 20, 20000000);
247 |
248 | std::cout << std::endl;
249 | std::cout << "reverse: avg = " << reverse_avg << ", stddev = " << reverse_stddev << std::endl;
250 | std::cout << "reverse(ref): avg = " << reverse_ref_avg << ", stddev = " << reverse_ref_stddev << std::endl;
251 | std::cout << "perf-diff: " << reverse_ref_avg / reverse_avg << std::endl;
252 | }
253 | }
254 |
--------------------------------------------------------------------------------