├── .clang-format ├── .cmake-format.yaml ├── .github └── workflows │ ├── benchmark.yml │ ├── ci.yml │ ├── coverage.yml │ └── multiarch.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake ├── Config.cmake.in └── ConfigureTarget.cmake ├── docs └── README.template.md ├── mpi ├── CMakeLists.txt ├── mpi-prime.c ├── mpi-rsa.c ├── mpi-rsa.h ├── mpi.c └── mpi.h ├── mpn ├── CMakeLists.txt ├── asm │ ├── asmdefs.inc │ ├── ia_32e.inc │ ├── ia_common.inc │ ├── ia_emm.inc │ ├── intel64 │ │ ├── bn_uaddadd_m7as.asm │ │ ├── bn_uaddsub_m7as.asm │ │ ├── bn_um7.inc │ │ ├── bn_umul.inc │ │ ├── bn_umul_basic.inc │ │ ├── bn_umul_fix.inc │ │ ├── bn_umulpp.inc │ │ ├── bn_umulpp_basic.inc │ │ ├── bn_umulpp_fix.inc │ │ ├── bn_umulschool.inc │ │ ├── bn_usqr.inc │ │ ├── bn_usqr_basic.inc │ │ ├── bn_usqrpp.inc │ │ ├── bn_usqrpp_basic.inc │ │ ├── bn_usqrschool.inc │ │ ├── clear_regs.inc │ │ ├── cpinitas.asm │ │ ├── emulator.inc │ │ ├── ia_32e_regs.inc │ │ ├── memcpy.inc │ │ ├── mont_mul1024_avx2as.asm │ │ ├── mont_mul_avx2as.asm │ │ ├── mont_sqr1024_avx2as.asm │ │ ├── mont_sqr_avx2as.asm │ │ ├── mpi_mont_reduction_m7as.asm │ │ ├── mpi_uadd_m7as.asm │ │ ├── mpi_udiv_u32_m7as.asm │ │ ├── mpi_uinc_udec_m7as.asm │ │ ├── mpi_umul_acc_m7as.asm │ │ ├── mpi_umul_m7as.asm │ │ ├── mpi_umul_usqr_redc_srvl9.asm │ │ ├── mpi_umul_usqr_redc_srvl9pp.asm │ │ ├── mpi_usqr_m7as.asm │ │ ├── mpi_usub_m7as.asm │ │ ├── mred.inc │ │ ├── mred_basic.inc │ │ ├── mred_pp.inc │ │ ├── mred_pp_basic.inc │ │ ├── mulx.inc │ │ ├── os.inc │ │ ├── reg_sizes.inc │ │ ├── variant.inc │ │ └── variant_txt_acm.inc │ ├── montgomery-avx2.c │ ├── montgomery-avx512.c │ └── utils.inc ├── mpn-asm.c ├── mpn-asm.h ├── mpn-binary.c ├── mpn-binary.h ├── mpn-conf.h ├── mpn-montgomery.c ├── mpn-montgomery.h ├── mpn-optimizer.c └── mpn-optimizer.h └── tests ├── CMakeLists.txt ├── benchmark.cpp ├── ini.h ├── logger.h ├── mpi-compiler.h ├── mpn-division.c ├── nameof.h ├── profiler.h ├── tabulate.h ├── 
test.cc └── unittest-mpi.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | # configured with https://zed0.co.uk/clang-format-configurator 2 | 3 | --- 4 | Language: Cpp 5 | AccessModifierOffset: '-2' 6 | AlignAfterOpenBracket: Align 7 | AlignConsecutiveMacros: 'true' 8 | AlignConsecutiveAssignments: 'false' 9 | AlignConsecutiveDeclarations: 'false' 10 | AlignEscapedNewlines: Left 11 | AlignOperands: 'true' 12 | AlignTrailingComments: 'true' 13 | AllowAllArgumentsOnNextLine: 'true' 14 | AllowAllConstructorInitializersOnNextLine: 'true' 15 | AllowAllParametersOfDeclarationOnNextLine: 'true' 16 | AllowShortBlocksOnASingleLine: 'false' 17 | AllowShortCaseLabelsOnASingleLine: 'false' 18 | AllowShortFunctionsOnASingleLine: Empty 19 | AllowShortIfStatementsOnASingleLine: WithoutElse 20 | AllowShortLambdasOnASingleLine: None 21 | AllowShortLoopsOnASingleLine: 'true' 22 | AlwaysBreakAfterDefinitionReturnType: None 23 | AlwaysBreakAfterReturnType: None 24 | AlwaysBreakBeforeMultilineStrings: 'false' 25 | AlwaysBreakTemplateDeclarations: 'Yes' 26 | BinPackArguments: 'true' 27 | BinPackParameters: 'true' 28 | BraceWrapping: 29 | AfterCaseLabel: 'false' 30 | AfterClass: 'false' 31 | AfterControlStatement: 'false' 32 | AfterEnum: 'false' 33 | AfterFunction: 'true' 34 | AfterNamespace: 'true' 35 | AfterObjCDeclaration: 'false' 36 | AfterStruct: 'false' 37 | AfterUnion: 'false' 38 | AfterExternBlock: 'false' 39 | BeforeCatch: 'false' 40 | BeforeElse: 'false' 41 | IndentBraces: 'false' 42 | SplitEmptyFunction: 'true' 43 | SplitEmptyRecord: 'true' 44 | SplitEmptyNamespace: 'true' 45 | BreakBeforeBinaryOperators: NonAssignment 46 | BreakBeforeBraces: Custom 47 | BreakBeforeTernaryOperators: 'true' 48 | BreakConstructorInitializers: BeforeColon 49 | BreakInheritanceList: BeforeColon 50 | BreakStringLiterals: 'true' 51 | ColumnLimit: '120' 52 | CompactNamespaces: 'false' 53 | ConstructorInitializerAllOnOneLineOrOnePerLine: 'true' 54 
| ConstructorInitializerIndentWidth: '4' 55 | ContinuationIndentWidth: '4' 56 | Cpp11BracedListStyle: 'true' 57 | DerivePointerAlignment: 'false' 58 | DisableFormat: 'false' 59 | ExperimentalAutoDetectBinPacking: 'false' 60 | FixNamespaceComments: 'true' 61 | ForEachMacros: ['foreach', 'FOREACH', 'RANGES_FOR', 'hlist_for_each_entry_continue', 'hlist_for_each_entry', 'hlist_for_each_entry_from', 'hlist_for_each_entry_safe', 'hlist_for_each_safe', 'list_for_each_entry', 'list_for_each_entry_continue', 'list_for_each_entry_continue_reverse', 'list_for_each_entry_from', 'list_for_each_entry_reverse', 'list_for_each_entry_safe', 'list_for_each_entry_safe_continue', 'list_for_each_entry_safe_from', 'list_for_each_entry_safe_reverse', 'list_for_each_from', 'list_for_each_prev', 'list_for_each_prev_safe', 'list_for_each_safe'] 62 | TypenameMacros: ['STACK_OF', 'LIST'] 63 | IncludeBlocks: Regroup 64 | IncludeIsMainRegex: '([-_](test|unittest))?$' 65 | IndentCaseLabels: 'true' 66 | IndentPPDirectives: None 67 | IndentWidth: '4' 68 | IndentWrappedFunctionNames: 'false' 69 | KeepEmptyLinesAtTheStartOfBlocks: 'false' 70 | MaxEmptyLinesToKeep: '3' 71 | NamespaceIndentation: None 72 | PenaltyBreakAssignment: '2' 73 | PenaltyBreakBeforeFirstCallParameter: '1' 74 | PenaltyBreakComment: '300' 75 | PenaltyBreakFirstLessLess: '120' 76 | PenaltyBreakString: '1000' 77 | PenaltyBreakTemplateDeclaration: '10' 78 | PenaltyExcessCharacter: '1000000' 79 | PenaltyReturnTypeOnItsOwnLine: '500' 80 | PointerAlignment: Right 81 | RawStringFormats: 82 | - Language: Cpp 83 | Delimiters: 84 | - 'cc' 85 | - 'CC' 86 | - 'cpp' 87 | - 'Cpp' 88 | - 'CPP' 89 | - 'c++' 90 | - 'C++' 91 | CanonicalDelimiter: '' 92 | BasedOnStyle: google 93 | - Language: TextProto 94 | Delimiters: 95 | - 'pb' 96 | - 'PB' 97 | - 'proto' 98 | - 'PROTO' 99 | EnclosingFunctions: 100 | - EqualsProto 101 | - EquivToProto 102 | - PARSE_PARTIAL_TEXT_PROTO 103 | - PARSE_TEST_PROTO 104 | - PARSE_TEXT_PROTO 105 | - ParseTextOrDie 106 | 
- ParseTextProtoOrDie 107 | CanonicalDelimiter: '' 108 | BasedOnStyle: google 109 | ReflowComments: 'true' 110 | SortIncludes: 'false' 111 | SortUsingDeclarations: 'false' 112 | SpaceAfterCStyleCast: 'false' 113 | SpaceAfterLogicalNot: 'false' 114 | SpaceAfterTemplateKeyword: 'true' 115 | SpaceBeforeAssignmentOperators: 'true' 116 | SpaceBeforeCpp11BracedList: 'false' 117 | SpaceBeforeCtorInitializerColon: 'true' 118 | SpaceBeforeInheritanceColon: 'true' 119 | SpaceBeforeParens: ControlStatements 120 | SpaceBeforeRangeBasedForLoopColon: 'true' 121 | SpaceInEmptyParentheses: 'false' 122 | SpacesBeforeTrailingComments: '1' 123 | SpacesInAngles: 'false' 124 | SpacesInCStyleCastParentheses: 'false' 125 | SpacesInContainerLiterals: 'false' 126 | SpacesInParentheses: 'false' 127 | SpacesInSquareBrackets: 'false' 128 | Standard: Auto 129 | StatementMacros: ['__maybe_unused'] 130 | TabWidth: '4' 131 | UseTab: Never 132 | ... 133 | -------------------------------------------------------------------------------- /.cmake-format.yaml: -------------------------------------------------------------------------------- 1 | _help_parse: Options affecting listfile parsing 2 | parse: 3 | _help_additional_commands: 4 | - Specify structure for custom cmake functions 5 | additional_commands: 6 | APPEND_TO_LISTS: 7 | kwargs: 8 | LISTS: "*" 9 | VALUES: "*" 10 | target_sources: 11 | flags: 12 | - PUBLIC 13 | - PRIVATE 14 | _help_vartags: 15 | - Specify variable tags. 16 | vartags: [] 17 | _help_proptags: 18 | - Specify property tags. 19 | proptags: [] 20 | _help_format: Options affecting formatting. 21 | format: 22 | _help_line_width: 23 | - How wide to allow formatted cmake files 24 | line_width: 80 25 | _help_tab_size: 26 | - How many spaces to tab for indent 27 | tab_size: 2 28 | _help_max_subgroups_hwrap: 29 | - If an argument group contains more than this many sub-groups 30 | - (parg or kwarg groups) then force it to a vertical layout. 
31 | max_subgroups_hwrap: 6 32 | _help_max_pargs_hwrap: 33 | - If a positional argument group contains more than this many 34 | - arguments, then force it to a vertical layout. 35 | max_pargs_hwrap: 8 36 | _help_max_rows_cmdline: 37 | - If a cmdline positional group consumes more than this many 38 | - lines without nesting, then invalidate the layout (and nest) 39 | max_rows_cmdline: 6 40 | _help_separate_ctrl_name_with_space: 41 | - If true, separate flow control names from their parentheses 42 | - with a space 43 | separate_ctrl_name_with_space: true 44 | _help_separate_fn_name_with_space: 45 | - If true, separate function names from parentheses with a 46 | - space 47 | separate_fn_name_with_space: false 48 | _help_dangle_parens: 49 | - If a statement is wrapped to more than one line, then dangle 50 | - the closing parenthesis on its own line. 51 | dangle_parens: true 52 | _help_dangle_align: 53 | - If the trailing parenthesis must be 'dangled' on its own 54 | - "line, then align it to this reference: `prefix`: the start" 55 | - "of the statement, `prefix-indent`: the start of the" 56 | - "statement, plus one indentation level, `child`: align to" 57 | - the column of the arguments 58 | dangle_align: prefix 59 | _help_min_prefix_chars: 60 | - If the statement spelling length (including space and 61 | - parenthesis) is smaller than this amount, then force reject 62 | - nested layouts. 63 | min_prefix_chars: 4 64 | _help_max_prefix_chars: 65 | - If the statement spelling length (including space and 66 | - parenthesis) is larger than the tab width by more than this 67 | - amount, then force reject un-nested layouts. 68 | max_prefix_chars: 10 69 | _help_max_lines_hwrap: 70 | - If a candidate layout is wrapped horizontally but it exceeds 71 | - this many lines, then reject the layout. 72 | max_lines_hwrap: 10 73 | _help_line_ending: 74 | - What style line endings to use in the output. 
75 | line_ending: unix 76 | _help_command_case: 77 | - Format command names consistently as 'lower' or 'upper' case 78 | command_case: upper 79 | _help_keyword_case: 80 | - Format keywords consistently as 'lower' or 'upper' case 81 | keyword_case: upper 82 | _help_always_wrap: 83 | - A list of command names which should always be wrapped 84 | always_wrap: [] 85 | _help_enable_sort: 86 | - If true, the argument lists which are known to be sortable 87 | - will be sorted lexicographically 88 | enable_sort: true 89 | _help_autosort: 90 | - If true, the parsers may infer whether or not an argument 91 | - list is sortable (without annotation). 92 | autosort: false 93 | _help_require_valid_layout: 94 | - By default, if cmake-format cannot successfully fit 95 | - everything into the desired linewidth it will apply the 96 | - last, most aggressive attempt that it made. If this flag is 97 | - True, however, cmake-format will print error, exit with non- 98 | - zero status code, and write-out nothing 99 | require_valid_layout: false 100 | _help_layout_passes: 101 | - A dictionary mapping layout nodes to a list of wrap 102 | - decisions. See the documentation for more information. 103 | layout_passes: {} 104 | _help_markup: Options affecting comment reflow and formatting. 105 | markup: 106 | _help_bullet_char: 107 | - What character to use for bulleted lists 108 | bullet_char: "*" 109 | _help_enum_char: 110 | - What character to use as punctuation after numerals in an 111 | - enumerated list 112 | enum_char: . 113 | _help_first_comment_is_literal: 114 | - If comment markup is enabled, don't reflow the first comment 115 | - block in each listfile. Use this to preserve formatting of 116 | - your copyright/license statements. 117 | first_comment_is_literal: false 118 | _help_literal_comment_pattern: 119 | - If comment markup is enabled, don't reflow any comment block 120 | - which matches this (regex) pattern. Default is `None` 121 | - (disabled). 
122 | literal_comment_pattern: null 123 | _help_fence_pattern: 124 | - Regular expression to match preformat fences in comments 125 | - default= ``r'^\s*([`~]{3}[`~]*)(.*)$'`` 126 | fence_pattern: ^\s*([`~]{3}[`~]*)(.*)$ 127 | _help_ruler_pattern: 128 | - Regular expression to match rulers in comments default= 129 | - '``r''^\s*[^\w\s]{3}.*[^\w\s]{3}$''``' 130 | ruler_pattern: ^\s*[^\w\s]{3}.*[^\w\s]{3}$ 131 | _help_explicit_trailing_pattern: 132 | - If a comment line starts with this pattern then it 133 | - is explicitly a trailing comment for the preceding 134 | - argument. Default is '#<' 135 | explicit_trailing_pattern: "#<" 136 | _help_hashruler_min_length: 137 | - If a comment line starts with at least this many consecutive 138 | - hash characters, then don't lstrip() them off. This allows 139 | - for lazy hash rulers where the first hash char is not 140 | - separated by space 141 | hashruler_min_length: 10 142 | _help_canonicalize_hashrulers: 143 | - If true, then insert a space between the first hash char and 144 | - remaining hash chars in a hash ruler, and normalize its 145 | - length to fill the column 146 | canonicalize_hashrulers: true 147 | _help_enable_markup: 148 | - enable comment markup parsing and reflow 149 | enable_markup: false 150 | _help_lint: Options affecting the linter 151 | lint: 152 | _help_disabled_codes: 153 | - a list of lint codes to disable 154 | disabled_codes: [] 155 | _help_function_pattern: 156 | - regular expression pattern describing valid function names 157 | function_pattern: "[0-9a-z_]+" 158 | _help_macro_pattern: 159 | - regular expression pattern describing valid macro names 160 | macro_pattern: "[0-9A-Z_]+" 161 | _help_global_var_pattern: 162 | - regular expression pattern describing valid names for 163 | - variables with global (cache) scope 164 | global_var_pattern: "[A-Z][0-9A-Z_]+" 165 | _help_internal_var_pattern: 166 | - regular expression pattern describing valid names for 167 | - variables with global 
scope (but internal semantic) 168 | internal_var_pattern: _[A-Z][0-9A-Z_]+ 169 | _help_local_var_pattern: 170 | - regular expression pattern describing valid names for 171 | - variables with local scope 172 | local_var_pattern: "[a-z][a-z0-9_]+" 173 | _help_private_var_pattern: 174 | - regular expression pattern describing valid names for 175 | - private directory variables 176 | private_var_pattern: _[0-9a-z_]+ 177 | _help_public_var_pattern: 178 | - regular expression pattern describing valid names for public 179 | - directory variables 180 | public_var_pattern: "[A-Z][0-9A-Z_]+" 181 | _help_argument_var_pattern: 182 | - regular expression pattern describing valid names for 183 | - function/macro arguments and loop variables. 184 | argument_var_pattern: "[a-z][a-z0-9_]+" 185 | _help_keyword_pattern: 186 | - regular expression pattern describing valid names for 187 | - keywords used in functions or macros 188 | keyword_pattern: "[A-Z][0-9A-Z_]+" 189 | _help_max_conditionals_custom_parser: 190 | - In the heuristic for C0201, how many conditionals to match 191 | - within a loop in before considering the loop a parser. 192 | max_conditionals_custom_parser: 2 193 | _help_min_statement_spacing: 194 | - Require at least this many newlines between statements 195 | min_statement_spacing: 1 196 | _help_max_statement_spacing: 197 | - Require no more than this many newlines between statements 198 | max_statement_spacing: 2 199 | max_returns: 6 200 | max_branches: 12 201 | max_arguments: 5 202 | max_localvars: 15 203 | max_statements: 50 204 | _help_encode: Options affecting file encoding 205 | encode: 206 | _help_emit_byteorder_mark: 207 | - If true, emit the unicode byte-order mark (BOM) at the start 208 | - of the file 209 | emit_byteorder_mark: false 210 | _help_input_encoding: 211 | - Specify the encoding of the input file. Defaults to utf-8 212 | input_encoding: utf-8 213 | _help_output_encoding: 214 | - Specify the encoding of the output file. Defaults to utf-8. 
215 | - Note that cmake only claims to support utf-8 so be careful 216 | - when using anything else 217 | output_encoding: utf-8 218 | _help_misc: Miscellaneous configurations options. 219 | misc: 220 | _help_per_command: 221 | - A dictionary containing any per-command configuration 222 | - overrides. Currently only `command_case` is supported. 223 | per_command: {} 224 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: benchmark 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: [ubuntu-latest] 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Install requirements 13 | run: | 14 | sudo apt-get update -q -y 15 | sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake 16 | 17 | - name: Configure 18 | run: cmake -B ${{github.workspace}}/build -DMPN_NO_ASM=ON -DBUILD_VENDOR=ON 19 | 20 | - name: Build 21 | run: cmake --build ${{github.workspace}}/build 22 | 23 | - name: Run Benchmark 24 | working-directory: ${{github.workspace}}/build 25 | run: | 26 | ${{github.workspace}}/build/tests/benchmark | tee ${{github.workspace}}/build/benchmark.txt 27 | cat ${{github.workspace}}/docs/README.template.md > ${{github.workspace}}/README.md 28 | echo -e '## Benchmark(libmpi VS openssl)\n' >> ${{github.workspace}}/README.md 29 | awk '/-----BEGIN MARKDOWN TABLE-----/{ f = 1; next } /-----END MARKDOWN TABLE-----/{ f = 0 } f' benchmark.txt >> ${{github.workspace}}/README.md 30 | git add ${{github.workspace}}/README.md 31 | 32 | - name: Commit files 33 | run: | 34 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 35 | git config --local user.name "github-actions[bot]" 36 | git commit -m "Update performance data" -a 37 | 38 | - name: Push changes 39 | uses: ad-m/github-push-action@master 40 | with: 41 | github_token: ${{ secrets.GITHUB_TOKEN }} 42 | 
branch: ${{ github.ref }} 43 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ${{ matrix.distro }} 8 | 9 | strategy: 10 | matrix: 11 | distro: [ubuntu-latest, macos-latest] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Install requirements 17 | id: requirements 18 | run: | 19 | case "${{ matrix.distro }}" in 20 | ubuntu*|jessie|stretch|buster|bullseye) 21 | sudo apt-get update -q -y 22 | sudo apt-get install -q -y gcc g++ git nasm libgtest-dev openssl cmake 23 | ;; 24 | macos*) 25 | brew install nasm googletest openssl@1.1 openssl@3 26 | brew link openssl --force 27 | echo ::set-output name=LDFLAGS::"-L/usr/local/opt/openssl@1.1/lib" 28 | echo ::set-output name=CPPFLAGS::"-I/usr/local/opt/openssl@1.1/include" 29 | ;; 30 | fedora*) 31 | sudo dnf -y update 32 | sudo dnf -y install gcc g++ git nasm gtest openssl cmake 33 | ;; 34 | alpine*) 35 | apk update 36 | apk add gcc g++ git nasm gtest openssl cmake 37 | ;; 38 | esac 39 | 40 | - name: Configure 41 | run: cmake -B ${{github.workspace}}/build -DCMAKE_VERBOSE_MAKEFILE=ON -DMPN_NO_ASM=ON -DCMAKE_CXX_FLAGS=${{ steps.requirements.outputs.CPPFLAGS }} -DCMAKE_EXE_LINKER_FLAGS=${{ steps.requirements.outputs.LDFLAGS }} 42 | 43 | - name: Build 44 | run: cmake --build ${{github.workspace}}/build 45 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: coverage 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: [ubuntu-latest] 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Install requirements 13 | run: | 14 | sudo apt-get update -q -y 15 | sudo apt-get install -q -y gcc g++ git nasm 
libgtest-dev openssl cmake 16 | 17 | - name: Configure 18 | run: cmake -B ${{github.workspace}}/build -DGCOV=ON -DCMAKE_BUILD_TYPE=Debug -DMPN_NO_ASM=ON 19 | 20 | - name: Build 21 | run: cmake --build ${{github.workspace}}/build 22 | 23 | - name: Run Test 24 | working-directory: ${{github.workspace}}/build 25 | run: | 26 | make test || true 27 | ${{github.workspace}}/build/tests/benchmark || true 28 | 29 | - name: Upload To CodeCov 30 | run: bash <(curl -s https://codecov.io/bash) 31 | -------------------------------------------------------------------------------- /.github/workflows/multiarch.yml: -------------------------------------------------------------------------------- 1 | name: multiarch 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build_job: 7 | # The host should always be linux 8 | runs-on: ubuntu-18.04 9 | name: Build on ${{ matrix.distro }} ${{ matrix.arch }} 10 | 11 | # Run steps on a matrix of 3 arch/distro combinations 12 | strategy: 13 | matrix: 14 | include: 15 | - arch: aarch64 16 | distro: ubuntu18.04 17 | # - arch: armv7 18 | # distro: ubuntu20.04 19 | - arch: s390x 20 | distro: fedora_latest 21 | # - arch: ppc64le 22 | # distro: alpine_latest 23 | 24 | steps: 25 | - uses: actions/checkout@v2.1.0 26 | - name: Building 27 | uses: uraimo/run-on-arch-action@v2.1.1 28 | id: build 29 | with: 30 | arch: ${{ matrix.arch }} 31 | distro: ${{ matrix.distro }} 32 | 33 | # Not required, but speeds up builds 34 | githubToken: ${{ github.token }} 35 | 36 | # Mount the github.workspace directory as /workspace in the container 37 | dockerRunArgs: | 38 | --volume "${{ github.workspace }}:/workspace" 39 | 40 | # Pass some environment variables to the container 41 | env: | 42 | workspace: /workspace 43 | 44 | # The shell to run commands with in the container 45 | shell: /bin/bash 46 | 47 | # Install some dependencies in the container. This speeds up builds if 48 | # you are also using githubToken. 
Any dependencies installed here will 49 | # be part of the container image that gets cached, so subsequent 50 | # builds don't have to re-install them. The image layer is cached 51 | # publicly in your project's package repository, so it is vital that 52 | # no secrets are present in the container state or logs. 53 | install: | 54 | case "${{ matrix.distro }}" in 55 | ubuntu*|jessie|stretch|buster|bullseye) 56 | apt-get update -q -y 57 | apt-get install -q -y gcc g++ nasm libgtest-dev openssl cmake 58 | ;; 59 | macos*) 60 | brew update 61 | brew install nasm googletest openssl 62 | brew link openssl --force 63 | export LDFLAGS="-L/usr/local/opt/openssl@1.1/lib" 64 | export CPPFLAGS="-I/usr/local/opt/openssl@1.1/include" 65 | ;; 66 | fedora*) 67 | dnf -y update 68 | dnf -y install gcc g++ nasm gtest openssl cmake 69 | ;; 70 | alpine*) 71 | apk update 72 | apk add gcc g++ nasm gtest openssl cmake 73 | ;; 74 | esac 75 | 76 | # Configure and Build 77 | run: | 78 | mkdir -p ${workspace}/build && cd ${workspace}/build 79 | cmake .. 
&& make 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # ignore directories 35 | build/** 36 | .vscode/** 37 | vendor/** 38 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | fail_fast: false 2 | # Hook repos are fetched over https; the clang-format hook runs the locally installed binary (language: system). 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.2.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: trailing-whitespace 9 | args: [--markdown-linebreak-ext=md] 10 | - id: check-merge-conflict 11 | - id: check-json 12 | - id: check-yaml 13 | args: [--allow-multiple-documents] 14 | - id: check-case-conflict 15 | - id: check-symlinks 16 | - id: end-of-file-fixer 17 | - id: pretty-format-json 18 | - repo: https://github.com/doublify/pre-commit-clang-format 19 | rev: 62302476d0da01515660132d76902359bed0f782 20 | hooks: 21 | - id: clang-format 22 | entry: clang-format 23 | language: system 24 | files: \.(c|cc|cxx|cpp|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|proto|vert)$ 25 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.0) 2 | PROJECT("Cryptograph Algorithms Implementation") 3 | 4 | INCLUDE(CMakePackageConfigHelpers) 5 | INCLUDE(cmake/ConfigureTarget.cmake) 6 | 7 | 
ADD_COMPILE_OPTIONS(-Wno-deprecated-declarations) 8 | # Derive ARCH from CMAKE_SYSTEM_PROCESSOR unless the caller already defined it. 9 | IF (NOT DEFINED ARCH) 10 | # MATCHES "^(os|ios|android|linux|win32)$ 11 | IF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|amd64)$") 12 | SET(ARCH "x86_64") 13 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") 14 | # cmake reports AMD64 on Windows, but we might be building for 32-bit. 15 | IF (CMAKE_SIZEOF_VOID_P EQUAL 8) 16 | SET(ARCH "x86_64") 17 | ELSE () 18 | SET(ARCH "x86") 19 | ENDIF () 20 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86|i386|i686)$") 21 | SET(ARCH "x86") 22 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(aarch64|arm64|arm64e)$") 23 | SET(ARCH "aarch64") 24 | ELSEIF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^arm*") 25 | SET(ARCH "arm") 26 | ELSE () 27 | SET(ARCH "generic") 28 | MESSAGE(STATUS "Unknown processor: ${CMAKE_SYSTEM_PROCESSOR}") 29 | ENDIF () 30 | ENDIF () 31 | # Choose the perlasm output style (PERLASM_STYLE/PERLASM_FLAGS) and assembler flags per platform. 32 | IF (UNIX) 33 | IF (${ARCH} STREQUAL "aarch64") 34 | IF (APPLE) 35 | SET(PERLASM_STYLE ios64) 36 | ELSE () 37 | SET(PERLASM_STYLE linux64) 38 | ENDIF () 39 | ELSEIF (${ARCH} STREQUAL "arm") 40 | IF (APPLE) 41 | SET(PERLASM_STYLE ios32) 42 | ELSE () 43 | SET(PERLASM_STYLE linux32) 44 | ENDIF () 45 | ELSE () 46 | IF (${ARCH} STREQUAL "x86") 47 | SET(PERLASM_FLAGS "-fPIC -DCRYPTO_IA32_SSE2") 48 | ENDIF () 49 | IF (APPLE) 50 | SET(PERLASM_STYLE macosx) 51 | ELSE () 52 | SET(PERLASM_STYLE elf) 53 | ENDIF () 54 | ENDIF () 55 | SET(ASM_EXT S) 56 | ENABLE_LANGUAGE(ASM) 57 | SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,--noexecstack") 58 | 59 | # Clang's integrated assembler does not support debug symbols. 60 | IF (NOT CMAKE_ASM_COMPILER_ID MATCHES "Clang") 61 | SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -Wa,-g") 62 | ENDIF () 63 | 64 | # CMake does not add -isysroot and -arch flags to assembly. 
65 | IF (APPLE) 66 | IF (CMAKE_OSX_SYSROOT) 67 | SET(CMAKE_ASM_FLAGS 68 | "${CMAKE_ASM_FLAGS} -isysroot \"${CMAKE_OSX_SYSROOT}\"" 69 | ) 70 | ENDIF () 71 | FOREACH (arch ${CMAKE_OSX_ARCHITECTURES}) 72 | SET(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -arch ${arch}") 73 | ENDFOREACH () 74 | ENDIF () 75 | ELSE () 76 | IF (${ARCH} STREQUAL "x86_64") 77 | SET(PERLASM_STYLE nasm) 78 | ELSE () 79 | SET(PERLASM_STYLE win32n) 80 | SET(PERLASM_FLAGS "-DCRYPTO_IA32_SSE2") 81 | ENDIF () 82 | SET(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -gcv8") 83 | 84 | # On Windows, we use the NASM output, specifically built with Yasm. 85 | SET(ASM_EXT asm) 86 | ENDIF () 87 | # PERLASM(dest src [extra args...]): run the perl script `src` with PERLASM_STYLE/PERLASM_FLAGS to generate `dest` in the current binary dir. 88 | FIND_PACKAGE(Perl REQUIRED) 89 | MACRO (PERLASM dest src) 90 | ADD_CUSTOM_COMMAND( 91 | OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${dest} 92 | COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR} 93 | COMMAND CC=${CMAKE_C_COMPILER} ${PERL_EXECUTABLE} ${src} ${PERLASM_STYLE} 94 | ${PERLASM_FLAGS} ${ARGN} ${CMAKE_CURRENT_BINARY_DIR}/${dest} 95 | DEPENDS ${src} ${CMAKE_SOURCE_DIR}/perlasm/arm-xlate.pl 96 | ${CMAKE_SOURCE_DIR}/perlasm/x86_64-xlate.pl 97 | ${CMAKE_SOURCE_DIR}/perlasm/x86asm.pl 98 | ${CMAKE_SOURCE_DIR}/perlasm/x86gas.pl 99 | ${CMAKE_SOURCE_DIR}/perlasm/x86masm.pl 100 | ${CMAKE_SOURCE_DIR}/perlasm/x86nasm.pl 101 | WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} 102 | ) 103 | ENDMACRO () 104 | # With BUILD_VENDOR set, NASM and OpenSSL tarballs are downloaded into vendor/ (if missing) and built as external projects. 105 | IF (BUILD_VENDOR) 106 | INCLUDE(ExternalProject) 107 | IF (NOT EXISTS ${CMAKE_SOURCE_DIR}/vendor) 108 | FILE(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/vendor) 109 | ENDIF () 110 | # Both external projects use --prefix=${CMAKE_BINARY_DIR}, so search the build tree's include/lib/bin first. 111 | INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/include) 112 | LINK_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/lib) 113 | SET(ENV{PATH} ${CMAKE_BINARY_DIR}/bin:$ENV{PATH}) 114 | SET(ENV{PKG_CONFIG_PATH} ${CMAKE_BINARY_DIR}/lib/pkgconfig) 115 | 116 | # cmake-format: off 117 | SET(NASM_LOCAL_FILE ${CMAKE_SOURCE_DIR}/vendor/nasm-2.15.05.tar.gz) 118 | SET(NASM_DOWNLOAD_URL 
https://www.nasm.us/pub/nasm/releasebuilds/2.15.05/nasm-2.15.05.tar.gz) 119 | IF (NOT EXISTS ${NASM_LOCAL_FILE}) 120 | FILE( 121 | DOWNLOAD ${NASM_DOWNLOAD_URL} ${NASM_LOCAL_FILE} 122 | TIMEOUT 60 123 | TLS_VERIFY ON 124 | ) 125 | ENDIF () 126 | EXTERNALPROJECT_ADD( 127 | nasm 128 | URL ${NASM_LOCAL_FILE} 129 | CONFIGURE_COMMAND ./configure --prefix=${CMAKE_BINARY_DIR} 130 | BUILD_COMMAND make -j${CONCURRENCY} 131 | BUILD_IN_SOURCE 1 132 | ) 133 | # OpenSSL 1.1.1i is configured with `no-shared no-asm -d`; only `install_sw` is run. NOTE(review): CONCURRENCY is not set in this file - confirm it is supplied by the caller. 134 | SET(OPENSSL_LOCAL_FILE ${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i.tar.gz) 135 | SET(OPENSSL_DOWNLOAD_URL https://www.openssl.org/source/old/1.1.1/openssl-1.1.1i.tar.gz) 136 | IF (NOT EXISTS ${OPENSSL_LOCAL_FILE}) 137 | FILE( 138 | DOWNLOAD ${OPENSSL_DOWNLOAD_URL} ${OPENSSL_LOCAL_FILE} 139 | TIMEOUT 60 140 | TLS_VERIFY ON 141 | ) 142 | ENDIF () 143 | EXTERNALPROJECT_ADD( 144 | openssl 145 | URL ${OPENSSL_LOCAL_FILE} 146 | CONFIGURE_COMMAND ./config no-shared no-asm -d --prefix=${CMAKE_BINARY_DIR} 147 | BUILD_COMMAND make depend && make -j${CONCURRENCY} 148 | INSTALL_COMMAND make install_sw 149 | BUILD_IN_SOURCE 1 150 | ) 151 | # cmake-format: on 152 | LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i/export/lib) 153 | INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/vendor/openssl-1.1.1i/export/include) 154 | ELSE () 155 | FIND_PROGRAM(OPENSSL openssl REQUIRED) 156 | GET_FILENAME_COMPONENT(OPENSSL_DIR ${OPENSSL} DIRECTORY CACHE) 157 | LINK_DIRECTORIES(${OPENSSL_DIR}/../lib) 158 | INCLUDE_DIRECTORIES(${OPENSSL_DIR}/../include) 159 | ENDIF () 160 | 161 | LINK_DIRECTORIES(/usr/local/lib) 162 | INCLUDE_DIRECTORIES(/usr/local/include ${CMAKE_BINARY_DIR}/include) 163 | 164 | # mpn 165 | ADD_SUBDIRECTORY(mpn) 166 | 167 | # mpi 168 | ADD_SUBDIRECTORY(mpi) 169 | 170 | # tests 171 | ENABLE_TESTING() 172 | ADD_SUBDIRECTORY(tests) 173 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | 
Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # libmpi 2 | 3 | [![license](https://img.shields.io/badge/license-Apache-brightgreen.svg?style=flat)](https://github.com/vxfury/libmpi/blob/master/LICENSE) 4 | [![CI Status](https://github.com/vxfury/libmpi/workflows/ci/badge.svg)](https://github.com/vxfury/libmpi/actions) 5 | [![codecov](https://codecov.io/gh/vxfury/libmpi/branch/main/graph/badge.svg?token=5IfLTTEcnF)](https://codecov.io/gh/vxfury/libmpi) 6 | ![GitHub release (latest by date)](https://img.shields.io/github/v/release/vxfury/libmpi?color=red&label=release) 7 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/vxfury/libmpi/pulls) 8 | 9 | Multiple Precision Integer and Relevant Algorithms, such as Bignum, RSA, DH, ECDH, ECDSA 10 | ## Benchmark(libmpi VS openssl) 11 | 12 | | brief | average time
(nanoseconds) | instability
(coefficient of variation) | rating | 13 | | :-- | :-: | :-: | :-: | 14 | | from-string(mpi vs openssl) | 2443.7
30303.4* | 0.0701562 | 12.4006
(Tu es mon meilleur frère...)
| 15 | | to-string(mpi vs openssl) | 1328.88
3463.21* | 0.109777 | 2.60612
(Tu peux faire mieux, continue)
| 16 | | from-octets(mpi vs openssl) | 273.632
702.13* | 0.0870046 | 2.56597
(Tu peux faire mieux, continue)
| 17 | | to-octets(mpi vs openssl) | 172.067
1475.5* | 0.359989 | 8.57515
(C'est super, dessine-toi une tarte)
| 18 | | add(mpi vs openssl) | 51.1222
333.814* | 0.164442 | 6.52973
(C'est super, dessine-toi une tarte)
| 19 | | add-assign(mpi vs openssl) | 56.7424
332.054* | 0.202937 | 5.85196
(C'est super, dessine-toi une tarte)
| 20 | | sub(mpi vs openssl) | 61.6028
162.647* | 0.207007 | 2.64025
(Tu peux faire mieux, continue)
| 21 | | sub-assign(mpi vs openssl) | 58.2224
288.852* | 0.155195 | 4.96119
(Tu peux faire mieux, continue)
| 22 | | mul(mpi vs openssl) | 2070.41
14037.9* | 0.0553581 | 6.78025
(C'est super, dessine-toi une tarte)
| 23 | | sqr(mpi vs openssl) | 1329.62
8760.12* | 0.168403 | 6.58845
(C'est super, dessine-toi une tarte)
| 24 | | MUL2(a * 2 = a + a) | 37.5416 | 0.163214 | N/A | 25 | | MUL2(a * 2 = a << 1) | 77.5234 | 0.113647 | N/A | 26 | -------------------------------------------------------------------------------- /cmake/Config.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | find_package(Threads) 4 | include(${CMAKE_CURRENT_LIST_DIR}/libacoTargets.cmake) 5 | -------------------------------------------------------------------------------- /docs/README.template.md: -------------------------------------------------------------------------------- 1 | # libmpi 2 | 3 | [![license](https://img.shields.io/badge/license-Apache-brightgreen.svg?style=flat)](https://github.com/vxfury/libmpi/blob/master/LICENSE) 4 | [![CI Status](https://github.com/vxfury/libmpi/workflows/ci/badge.svg)](https://github.com/vxfury/libmpi/actions) 5 | [![codecov](https://codecov.io/gh/vxfury/libmpi/branch/main/graph/badge.svg?token=5IfLTTEcnF)](https://codecov.io/gh/vxfury/libmpi) 6 | ![GitHub release (latest by date)](https://img.shields.io/github/v/release/vxfury/libmpi?color=red&label=release) 7 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/vxfury/libmpi/pulls) 8 | 9 | Multiple Precision Integer and Relevant Algorithms, such as Bignum, RSA, DH, ECDH, ECDSA 10 | -------------------------------------------------------------------------------- /mpi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Multiple Precision Integer and Relevant Algorithms 2 | 3 | CONFIGURE_FILE(mpi.h ${CMAKE_BINARY_DIR}/include/mpi/mpi.h COPYONLY) 4 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpi/mpi.h DESTINATION include/mpi) 5 | ADD_LIBRARY(mpi mpi.c mpi-prime.c) 6 | ConfigureTarget(mpi) 7 | TARGET_LINK_LIBRARIES(mpi PUBLIC mpn) 8 | INSTALL(TARGETS mpi ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) 9 | 10 | # RSA(Rivest–Shamir–Adleman) Algorithm 11 | 
OPTION(MPI_NO_RSA "build without rsa algorithm" OFF) 12 | IF (NOT MPI_NO_RSA) 13 | CONFIGURE_FILE(mpi-rsa.h ${CMAKE_BINARY_DIR}/include/mpi/mpi-rsa.h COPYONLY) 14 | INSTALL(FILES ${CMAKE_BINARY_DIR}/include/mpi/mpi-rsa.h 15 | DESTINATION include/mpi 16 | ) 17 | TARGET_SOURCES(mpi PRIVATE mpi-rsa.c) 18 | ENDIF () 19 | -------------------------------------------------------------------------------- /mpi/mpi-rsa.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #ifndef MULTIPLE_PRECISION_RSA_H 17 | #define MULTIPLE_PRECISION_RSA_H 18 | 19 | #include 20 | #include 21 | 22 | #if defined(__cplusplus) 23 | extern "C" { 24 | #endif 25 | 26 | typedef struct { 27 | unsigned int nbits; /* RSA modulus bitsize */ 28 | unsigned int ebits; /* RSA public exp bitsize */ 29 | unsigned int dbits; /* RSA private exp bitsize */ 30 | unsigned int pbits; /* RSA p-factor bitsize */ 31 | unsigned int qbits; /* RSA q-factor bitsize */ 32 | 33 | mpn_limb_t *e; /* public exponent, bitsize(e) = ebits */ 34 | mpn_limb_t *d; /* private exponent, bitsize(d) = dbits <= nbits */ 35 | mpn_limb_t *dp; /* the first factor's CRT exponent, d mod (p - 1), bitsize(dp) <= pbits */ 36 | mpn_limb_t *dq; /* the second factor's CRT exponent, d mod (q - 1), bitsize(dq) <= qbits */ 37 | mpn_limb_t *qinv; /* the (first) CRT coefficient, q^(-1) mod p, bitsize(qinv) <= pbits */ 38 | 39 | mpn_montgomery_t *montN; /* montgomery context for (N, the modulus, bitsize(n) = nbits) */ 40 | mpn_montgomery_t *montP; /* montgomery context for (P, the first factor) */ 41 | mpn_montgomery_t *montQ; /* montgomery context for (Q, the second factor) */ 42 | 43 | /* TODO: multiple-primes support */ 44 | unsigned int primes; 45 | struct rsa_factor { 46 | unsigned int bits; /* bit-size of factor */ 47 | mpn_limb_t *r; /* factor */ 48 | mpn_limb_t *d; /* factor's CRT exponent */ 49 | mpn_limb_t *t; /* factor's CRT coefficient */ 50 | } factors[0]; 51 | } rsa_key_t; 52 | 53 | rsa_key_t *rsa_new(unsigned int ebits, unsigned int nbits, unsigned int primes); 54 | void rsa_free(rsa_key_t *key); 55 | 56 | int rsa_import(rsa_key_t *key, const mpi_t *n, const mpi_t *e, const mpi_t *d, const mpi_t *dp, const mpi_t *dq, 57 | const mpi_t *qinv); 58 | rsa_key_t *rsa_generate_key(const mpi_t *pubexp, unsigned int nbits, unsigned int primes, 59 | int (*rand_bytes)(void *, unsigned char *, unsigned int), void *rand_state); 60 | 61 | int rsa_pub_cipher(mpi_t *r, const mpi_t *x, const 
rsa_key_t *key); 62 | int rsa_prv_cipher(mpi_t *r, const mpi_t *x, const rsa_key_t *key); 63 | int rsa_prv_cipher_crt(mpi_t *r, const mpi_t *x, const rsa_key_t *key); 64 | 65 | #if defined(__cplusplus) 66 | } 67 | #endif 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /mpi/mpi.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | /** 17 | * @brief: multiple precision integer: configurations, macros, and prototypes 18 | * 19 | * @note: 20 | * 1. 
Assume that all variables representing size will never overflow 21 | */ 22 | 23 | #ifndef MULTIPLE_PRECISION_H 24 | #define MULTIPLE_PRECISION_H 25 | 26 | #include 27 | #include 28 | 29 | #if defined(__cplusplus) 30 | extern "C" { 31 | #endif 32 | 33 | /** 34 | * mpi implementation 35 | */ 36 | #define MPI_SIGN_NEGTIVE 1 /* a < 0, negative */ 37 | #define MPI_SIGN_NON_NEGTIVE 0 /* a >= 0, non-negative */ 38 | #define MPI_ATTR_NOTOWNED 0x01 /* TODO: data field not owned by */ 39 | #define MPI_ATTR_DETACHED 0x02 /* TODO: detached data field */ 40 | #define MPI_ATTR_AUTOSIZE 0x04 /* TODO: resize data field automatically */ 41 | 42 | typedef struct { 43 | unsigned int attr; /**< mpi attributes */ 44 | unsigned int sign; /**< mpi sign: negative or not */ 45 | mpn_size_t size; /**< mpi size (count of mpn_limb_t) */ 46 | mpn_size_t room; /**< mpi max size (count of mpn_limb_t) */ 47 | mpn_limb_t *data; /**< mpi data chunk(most significant limb at the largest) */ 48 | } mpi_t; 49 | #define MPI_ALIGNED_HEAD_LIMBS ((mpn_size_t)((sizeof(mpi_t) + sizeof(mpn_limb_t) - 1) / sizeof(mpn_limb_t))) 50 | 51 | /** High-Level APIs */ 52 | /** 53 | * create mpi with expected bits |bits| to reserve 54 | * 55 | * |bits| == 0, to create empty room 56 | * 57 | * @performance: Locality of reference and Cacheline alignment 58 | * mpi_t and this->data will be allocated as a contiguous memory chunk 59 | */ 60 | mpi_t *mpi_create(mpn_size_t bits); 61 | 62 | /** 63 | * create mpi(detached) with expected bits |bits| to reserve 64 | * 65 | * |bits| == 0, to create empty room 66 | */ 67 | mpi_t *mpi_create_detached(mpn_size_t bits); 68 | 69 | /** 70 | * duplicate big-number |a| 71 | */ 72 | mpi_t *mpi_dup(const mpi_t *a); 73 | 74 | /** 75 | * clear and release mpi |v| 76 | */ 77 | void mpi_destory(mpi_t *v); 78 | 79 | /** 80 | * make mpi with given chunk 81 | */ 82 | void mpi_make(mpi_t *r, mpn_limb_t *data, mpn_size_t size); 83 | 84 | /** 85 | * copy big-number |a| to |r| 86 | * 87 | * @note: 88 | * 1. 
resize |r| to proper size before copy 89 | */ 90 | int mpi_copy(mpi_t *r, const mpi_t *a); 91 | 92 | /** 93 | * compare mpi |a| and |b| 94 | * 0, if |a| = |b| 95 | * 1, if |a| > |b| 96 | * -1, if |a| < |b| 97 | * otherwise, error code 98 | */ 99 | int mpi_cmp(const mpi_t *a, const mpi_t *b); 100 | 101 | /** 102 | * get bit size of mpi |a|(constant-time version) 103 | * 104 | * @note: 105 | * 1. 0, if a is NULL 106 | */ 107 | mpn_size_t mpi_bits(const mpi_t *a); 108 | 109 | /** 110 | * get byte size of mpi |a|(constant-time version) 111 | * 112 | * @note: 113 | * 1. 0, if a is NULL 114 | */ 115 | mpn_size_t mpi_bytes(const mpi_t *a); 116 | 117 | /** 118 | * get max bit size of mpi |a|(constant-time version) 119 | * 120 | * @note: 121 | * 1. 0, if a is NULL 122 | */ 123 | mpn_size_t mpi_max_bits(const mpi_t *a); 124 | 125 | /** 126 | * get max byte size of mpi |a|(constant-time version) 127 | * 128 | * @note: 129 | * 1. 0, if a is NULL 130 | */ 131 | mpn_size_t mpi_max_bytes(const mpi_t *a); 132 | 133 | /** 134 | * mpi: expand mpi to expected bits |bits| 135 | * 136 | * @note: 137 | * 1. may fail if there is not enough memory or an invalid size is given 138 | */ 139 | mpi_t *mpi_expand(mpi_t *v, mpn_size_t bits); 140 | 141 | /** 142 | * resize mpi to expected bits |bits| 143 | * 144 | * @note: 145 | * 1. may fail if there is not enough memory or an invalid size is given 146 | * 147 | */ 148 | mpi_t *mpi_resize(mpi_t *v, mpn_size_t bits); 149 | 150 | /** 151 | * zeroize mpi |v| 152 | */ 153 | int mpi_zeroize(mpi_t *v); 154 | 155 | /** 156 | * set mpi |r| to unsigned single-precision integer |v| 157 | */ 158 | int mpi_set_limb(mpi_t *r, mpn_limb_t v); 159 | 160 | /** 161 | * initialize mpi |v| from octets |buff|/|bufflen| 162 | * 163 | * @note: 164 | * 1. if *|v| is NULL, mpi will be created with proper size 165 | * 2. 
if *|v| isn't NULL, mpi-number will be resized, and maybe *|v| will be set to a new memory chunk 166 | */ 167 | int mpi_from_octets(mpi_t **v, const unsigned char *buff, mpn_size_t bufflen); 168 | 169 | /** 170 | * convert mpi to big-endian octets 171 | */ 172 | int mpi_to_octets(const mpi_t *a, unsigned char *out, mpn_size_t outsize, mpn_size_t *outlen); 173 | 174 | /** 175 | * initialize mpi |v| from hex-string |a| 176 | */ 177 | int mpi_from_string(mpi_t **v, const char *a); 178 | 179 | /** 180 | * convert mpi to string 181 | * 182 | * @note: 183 | * 1. FREE the return pointer after usage 184 | */ 185 | char *mpi_to_string(const mpi_t *v); 186 | 187 | /** 188 | * mpi addition: |r| = |a| + |b| 189 | * 190 | * @note: 191 | * 1. make sure r->room is enough to store the result 192 | * minimal advise size: MAX(bit_size(a), bit_size(b)) + 1 193 | */ 194 | int mpi_add(mpi_t *r, const mpi_t *a, const mpi_t *b); 195 | 196 | /** 197 | * mpi addition: |r| = |a| + w 198 | * 199 | * @note: 200 | * 1. make sure r->room is enough to store the result 201 | * minimal advise size: MAX(bit_size(a), bit_size(w)) + 1 202 | */ 203 | int mpi_add_limb(mpi_t *r, const mpi_t *a, mpn_limb_t w); 204 | 205 | /** 206 | * mpi subtraction: |r| = |a| - |b| 207 | * 208 | * @note: 209 | * 1. make sure r->room is enough to store the result 210 | * minimal advise size: MAX(bit_size(a), bit_size(b)) 211 | * 2. make sure |a| >= |b| 'cause negative mpi not supported till now 212 | */ 213 | int mpi_sub(mpi_t *r, const mpi_t *a, const mpi_t *b); 214 | 215 | /** 216 | * mpi subtraction: |r| = |a| - w 217 | * 218 | * @note: 219 | * 1. make sure r->room is enough to store the result 220 | * minimal advise size: MAX(bit_size(a), bit_size(w)) 221 | */ 222 | int mpi_sub_limb(mpi_t *r, const mpi_t *a, mpn_limb_t w); 223 | 224 | /** 225 | * mpi multiplication: |r| = |a| * |b| 226 | * 227 | * @note: 228 | * 1. 
make sure r->room is enough to store the result 229 | * minimal advise size: bit_size(a) + bit_size(b) + MPN_LIMB_BITS 230 | */ 231 | int mpi_mul(mpi_t *r, const mpi_t *a, const mpi_t *b); 232 | 233 | /** 234 | * mpi multiplication: |r| = |a| * |b| 235 | * 236 | * @note: 237 | * 1. make sure r->room is enough to store the result 238 | * minimal advise size: bit_size(a) + bit_size(b) 239 | */ 240 | int mpi_mul_limb(mpi_t *r, const mpi_t *a, mpn_limb_t b); 241 | 242 | /** 243 | * mpi square: |r| = |a| ^ 2 244 | * 245 | * @note: 246 | * 1. make sure r->room is enough to store the result 247 | * minimal advise size: 2 * bit_size(a) 248 | */ 249 | int mpi_sqr(mpi_t *r, const mpi_t *a); 250 | 251 | /** 252 | * mpi division: |q|, |r| = |x| / |y|, |x| = |q| * |y| + |r|(0 <= |r| < |y|) 253 | * 254 | * @note: 255 | * 1. make sure room of |q|, |r| is enough to store the result 256 | * minimal advise size: bit_size(r) = bit_size(y) 257 | */ 258 | int mpi_div(mpi_t *q, mpi_t *r, const mpi_t *x, const mpi_t *y); 259 | 260 | /** 261 | * mpi division: q, r = a / w 262 | */ 263 | mpn_limb_t mpi_div_limb(mpi_t *a, mpn_limb_t w); 264 | 265 | /** 266 | * mpi modular: r = a mod w 267 | */ 268 | mpn_limb_t mpi_mod_limb(const mpi_t *a, mpn_limb_t w); 269 | 270 | /** 271 | * greatest common divisor 272 | */ 273 | int mpi_gcd(mpi_t *r, const mpi_t *a, const mpi_t *b, mpn_optimizer_t *optimizer); 274 | 275 | /** 276 | * greatest common divisor(constant-time version) 277 | */ 278 | int mpi_gcd_consttime(mpi_t *r, const mpi_t *a, const mpi_t *b, mpn_optimizer_t *optimizer); 279 | 280 | /** 281 | * mpi modular: r = a mod m 282 | */ 283 | int mpi_mod(mpi_t *r, const mpi_t *a, const mpi_t *m); 284 | 285 | /** 286 | * mpi exponentiation: r = g ^ e 287 | */ 288 | int mpi_exp(mpi_t *r, const mpi_t *g, const mpi_t *e); 289 | 290 | /** 291 | * mpi exponentiation(word): r = g ^ e 292 | */ 293 | int mpi_exp_limb(mpi_t *r, const mpi_t *g, mpn_limb_t e); 294 | 295 | /** 296 | * get bit 297 | */ 298 | int 
mpi_get_bit(const mpi_t *a, mpn_size_t n); 299 | 300 | /** 301 | * set bit 302 | */ 303 | int mpi_set_bit(const mpi_t *a, mpn_size_t n); 304 | 305 | /** 306 | * clear bit 307 | */ 308 | int mpi_clr_bit(const mpi_t *a, mpn_size_t n); 309 | 310 | /** 311 | * left-shift: |r| = |a| << n 312 | */ 313 | int mpi_lshift(mpi_t *r, const mpi_t *a, mpn_size_t n); 314 | 315 | /** 316 | * right-shift: |r| = |a| >> n 317 | */ 318 | int mpi_rshift(mpi_t *r, const mpi_t *a, mpn_size_t n); 319 | 320 | /** 321 | * conditional swap(constant-time version) 322 | */ 323 | int mpi_swap_consttime(unsigned condition, mpi_t *a, mpi_t *b, mpn_size_t n); 324 | 325 | /** 326 | * mpi(prime): test if a is a prime 327 | * 328 | * @note: 329 | * 1. return 0 if the number is composite 330 | * 1 if it is prime with an error probability of less than 0.25^checks 331 | */ 332 | int mpi_is_prime(const mpi_t *a, mpn_size_t checks, unsigned do_trial_division, mpn_optimizer_t *optimizer, 333 | int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state); 334 | 335 | /** 336 | * mpi(prime): generates a pseudo-random prime number of at least bit length |bits| 337 | * 338 | * @note: 339 | * 1. The returned number is probably prime with a negligible error. 340 | * 2. If |add| is NULL the returned prime number will have exact bit length |bits| with the top most two 341 | * bits set. 342 | * 3. The prime may have to fulfill additional requirements for use in Diffie-Hellman key exchange: 343 | * If |add| is not NULL, the prime will fulfill the condition p % |add| == |rem| (p % |add| == 1 if 344 | * |rem| == NULL) in order to suit a given generator. 345 | * 346 | * If |safe| is true, it will be a safe prime (i.e. a prime p so that (p-1)/2 is also prime). 347 | * If |safe| is true, and |rem| == NULL the condition will be p % |add| == 3. 348 | * It is recommended that |add| is a multiple of 4. 
349 | */ 350 | int mpi_generate_prime(mpi_t *ret, mpn_size_t bits, unsigned safe, const mpi_t *add, const mpi_t *rem, 351 | int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state); 352 | 353 | 354 | /** 355 | * mpn optimizer: get mpi with specified room from optimizer 356 | * 357 | * @note: 358 | * 1. size: size of chunk, in unit of 'mpn_limb_t' 359 | */ 360 | mpi_t *mpi_optimizer_get(mpn_optimizer_t *optimizer, mpn_size_t size); 361 | 362 | /** 363 | * mpn optimizer: put back mpi of specified room 364 | */ 365 | void mpi_optimizer_put(mpn_optimizer_t *optimizer, mpn_size_t size); 366 | 367 | 368 | /** 369 | * mpn montgomery: initialize montgomery context with modulus 370 | * 371 | */ 372 | int mpi_montgomery_set_modulus(mpn_montgomery_t *mont, const mpi_t *modulus); 373 | 374 | /** 375 | * mpn montgomery: exponentiation 376 | * 377 | */ 378 | int mpi_montgomery_exp(mpi_t *r, const mpi_t *x, const mpi_t *e, mpn_montgomery_t *mont); 379 | 380 | /** 381 | * mpn montgomery: exponentiation(constant-time version) 382 | * 383 | */ 384 | int mpi_montgomery_exp_consttime(mpi_t *r, const mpi_t *x, const mpi_t *e, mpn_montgomery_t *mont); 385 | 386 | #if defined(__cplusplus) 387 | } 388 | #endif 389 | 390 | #endif 391 | -------------------------------------------------------------------------------- /mpn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Multiple-Precision-Natural-Number 2 | 3 | CONFIGURE_FILE(mpn-asm.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-asm.h COPYONLY) 4 | CONFIGURE_FILE(mpn-conf.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-conf.h COPYONLY) 5 | CONFIGURE_FILE( 6 | mpn-binary.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-binary.h COPYONLY 7 | ) 8 | CONFIGURE_FILE( 9 | mpn-optimizer.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-optimizer.h COPYONLY 10 | ) 11 | CONFIGURE_FILE( 12 | mpn-montgomery.h ${CMAKE_BINARY_DIR}/include/mpn/mpn-montgomery.h COPYONLY 13 | ) 14 | 15 | INSTALL(FILES 
${CMAKE_BINARY_DIR}/include/mpn/mpn-conf.h
${CMAKE_BINARY_DIR}/include/mpn/mpn-optimizer.h
${CMAKE_BINARY_DIR}/include/mpn/mpn-binary.h
${CMAKE_BINARY_DIR}/include/mpn/mpn-montgomery.h
    DESTINATION include/mpn
)

ADD_LIBRARY(mpn mpn-binary.c mpn-asm.c mpn-optimizer.c mpn-montgomery.c)
ConfigureTarget(mpn)
INSTALL(TARGETS mpn ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)

# Hand-written NASM kernels are only wired up for Linux on x86_64.
# ARCH may be passed on the command line; it defaults to the host processor.
OPTION(MPN_NO_ASM "disable asm for mpn" OFF)
IF ((NOT MPN_NO_ASM) AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
    IF (NOT DEFINED ARCH)
        SET(ARCH ${CMAKE_SYSTEM_PROCESSOR})
    ENDIF ()
    # Compare the variable by name: an empty ARCH then evaluates to false
    # instead of producing a malformed IF() expression.
    IF (ARCH STREQUAL "x86_64")
        # Only require the NASM toolchain when assembly sources are really
        # going to be built, so other Linux architectures configure without it.
        ENABLE_LANGUAGE(ASM_NASM)
        SET(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D_L9 -DLINUX32E")
        FILE(GLOB ASM_SOURCE asm/intel64/*.asm)
        TARGET_SOURCES(mpn PRIVATE ${ASM_SOURCE})
        TARGET_INCLUDE_DIRECTORIES(mpn PRIVATE asm asm/intel64)
        TARGET_COMPILE_DEFINITIONS(
            mpn
            PRIVATE -DMPN_UADD_VECTORIZED_ASM
                    -DMPN_USUB_VECTORIZED_ASM
                    -DMPN_UINC_VECTORIZED_ASM
                    -DMPN_UDEC_VECTORIZED_ASM
                    -DMPN_UDIV_ASM
                    -DMPN_UMUL_ASM
                    -DMPN_USQR_ASM
                    -DMPN_UMUL_ADD_ASM
                    -DMPN_MONT_REDC_ASM
        )
    ENDIF ()
ENDIF ()

IF (MPN_NO_INLINE_ASM)
    TARGET_COMPILE_DEFINITIONS(mpn PRIVATE -DMPN_NO_INLINE_ASM)
ENDIF()

--------------------------------------------------------------------------------
/mpn/asm/asmdefs.inc:
--------------------------------------------------------------------------------
; Architecture selection and common size definitions shared by all NASM sources.
; The target is chosen by defining exactly one of the _XX symbols on the NASM
; command line (for example -D_L9); the result is exposed through __ARCH,
; __ARCH32E and ARCH_ALIGN_FACTOR.
%ifndef __ASMDEFS_INC__
%define __ASMDEFS_INC__ 1

%assign __ARCH_PX     0    ; pure C-code ia32
%assign __ARCH_M5     1    ; Intel(R) Quark(TM) processor - ia32
%assign __ARCH_W7     8    ; Intel(R) Streaming SIMD Extensions 2 - ia32
%assign __ARCH_T7     16   ; Intel(R) Streaming SIMD Extensions 3 - ia32
%assign __ARCH_V8     32   ; Supplemental Streaming SIMD Extensions 3 (SSSE3)
%assign __ARCH_S8     33   ; SSSE3 + MOVBE instruction - ia32
%assign __ARCH_P8     64   ; Intel(R) Streaming SIMD Extensions 4.2 - ia32
%assign __ARCH_G9     128  ; Intel(R) Advanced Vector Extensions - ia32
%assign __ARCH_H9     256  ; Intel(R) Advanced Vector Extensions 2 - ia32
%assign __ARCH_I0     512  ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon Phi(TM) processor (formerly Knights Landing) - ia32
%assign __ARCH_S0     1024 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - ia32

%assign __ARCH32E_PX  __ARCH_PX ; pure C-code x64
%assign __ARCH32E_M7  32   ; Intel(R) Streaming SIMD Extensions 3 - intel64
%assign __ARCH32E_U8  64   ; Supplemental Streaming SIMD Extensions 3 (SSSE3) - intel64
%assign __ARCH32E_N8  65   ; SSSE3 + MOVBE instruction - intel64
%assign __ARCH32E_Y8  128  ; Intel(R) Streaming SIMD Extensions 4.2 - intel64
%assign __ARCH32E_E9  256  ; Intel(R) Advanced Vector Extensions - intel64
%assign __ARCH32E_L9  512  ; Intel(R) Advanced Vector Extensions 2 - intel64
%assign __ARCH32E_N0  1024 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon Phi(TM) processor (formerly Knights Landing) - intel64
%assign __ARCH32E_K0  2048 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - intel64

; Defaults: pure C-code unless one of the selectors below is defined.
%assign __ARCH __ARCH_PX
%assign __ARCH32E __ARCH32E_PX

%ifdef _M5 ; Intel(R) Quark(TM) processor - ia32
  %assign __ARCH __ARCH_M5
%elifdef _W7 ; Intel(R) Streaming SIMD Extensions 2 - ia32
  %assign __ARCH __ARCH_W7
%elifdef _T7 ; Intel(R) Streaming SIMD Extensions 3 - ia32
  %assign __ARCH __ARCH_T7
%elifdef _V8 ; Supplemental Streaming SIMD Extensions 3 (SSSE3)
  %assign __ARCH __ARCH_V8
%elifdef _S8 ; SSSE3 + MOVBE instruction - ia32
  %assign __ARCH __ARCH_S8
%elifdef _P8 ; Intel(R) Streaming SIMD Extensions 4.2 - ia32
  %assign __ARCH __ARCH_P8
%elifdef _G9 ; Intel(R) Advanced Vector Extensions - ia32
  %assign ARCH_ALIGN_FACTOR 32
  %assign __ARCH __ARCH_G9
%elifdef _H9 ; Intel(R) Advanced Vector Extensions 2 - ia32
  %assign ARCH_ALIGN_FACTOR 32
  %assign __ARCH __ARCH_H9
%elifdef _S0 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - ia32
  %assign ARCH_ALIGN_FACTOR 64
  %assign __ARCH __ARCH_S0
%elifdef _M7 ; Intel(R) Streaming SIMD Extensions 3 - intel64
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_M7
%elifdef _U8 ; Supplemental Streaming SIMD Extensions 3 (SSSE3) - intel64
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_U8
%elifdef _N8 ; SSSE3 + MOVBE instruction - intel64
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_N8
%elifdef _Y8 ; Intel(R) Streaming SIMD Extensions 4.2 - intel64
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_Y8
%elifdef _E9 ; Intel(R) Advanced Vector Extensions - intel64
  %assign ARCH_ALIGN_FACTOR 32
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_E9
%elifdef _L9 ; Intel(R) Advanced Vector Extensions 2 - intel64
  %assign ARCH_ALIGN_FACTOR 32
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_L9
%elifdef _N0 ; Intel(R) Advanced Vector Extensions 512 (formerly Knights Landing) - intel64
  %assign ARCH_ALIGN_FACTOR 64
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_N0
%elifdef _K0 ; Intel(R) Advanced Vector Extensions 512 - Intel(R) Xeon(R) processor (formerly Skylake) - intel64
  %assign ARCH_ALIGN_FACTOR 64
  %assign __ARCH __ARCH_PX
  %assign __ARCH32E __ARCH32E_K0
%else
  %assign __ARCH __ARCH_PX ; pure C-code
%endif

; Final alignment factor; this unconditionally overrides any value assigned in
; the selector chain above.
%if (__ARCH > __ARCH_H9) || (__ARCH32E > __ARCH32E_L9)
  %assign ARCH_ALIGN_FACTOR 64
%elif (__ARCH > __ARCH_P8) || (__ARCH32E > __ARCH32E_Y8)
  %assign ARCH_ALIGN_FACTOR 32
%else
  %assign ARCH_ALIGN_FACTOR 16
%endif

; noexec stack
%ifdef LINUX32
  %ifndef OSX32
section .note.GNU-stack noalloc noexec nowrite progbits
  %endif
%endif

; noexec stack
%ifdef LINUX32E
  %ifndef OSXEM64T
    %ifndef _ARCH_KNC
section .note.GNU-stack noalloc noexec nowrite progbits
    %endif
  %endif
%endif


; Numeric code of the object format NASM is producing.
%ifidn __OUTPUT_FORMAT__, elf32
  %assign IPP_BINARY_FORMAT 0
%elifidn __OUTPUT_FORMAT__, elf64
  %assign IPP_BINARY_FORMAT 1
%elifidn __OUTPUT_FORMAT__, macho64
  %assign IPP_BINARY_FORMAT 2
%elifidn __OUTPUT_FORMAT__, win32
  %assign IPP_BINARY_FORMAT 3
%elifidn __OUTPUT_FORMAT__, win64
  %assign IPP_BINARY_FORMAT 4
%else
  %fatal Unsupported output format: __OUTPUT_FORMAT__. Shall be: elf32, elf64, win32, win64, macho64
%endif

%ifdef _MERGED_BLD
  %assign _OWN_MERGED_BLD 1
%endif ; _MERGED_BLD

; data compilation definitions: merged builds shall compile data only as
; part of one single object build to avoid multiple definition warnings at link time
%ifndef _MERGED_BLD
  %assign __ARCH_DATA 1
%else
  %if (__ARCH == __ARCH_G9) || (__ARCH32E == __ARCH32E_E9)
    %assign __ARCH_DATA 1
  %endif
%endif ; _MERGED_BLD

; Definitions of sizeof(type)
%iassign ZWORD_size 64 ; zmm-word
%iassign YWORD_size 32 ; ymm-word
%iassign OWORD_size 16 ; octo-word
%iassign TWORD_size 10 ; ten-bytes word
%iassign QWORD_size 8  ; quad-word
%iassign DWORD_size 4  ; double-word
%iassign WORD_size  2
%iassign BYTE_size  1

%idefine YMMWORD YWORD
%idefine XMMWORD OWORD
%iassign YMMWORD_size YWORD_size
%iassign XMMWORD_size OWORD_size

; sizeof(qword) expands to QWORD_size, and likewise for the other types above.
%idefine sizeof(_x_) _x_%+_size

%endif

--------------------------------------------------------------------------------
/mpn/asm/ia_common.inc:
--------------------------------------------------------------------------------
;===============================================================================
; Copyright 2014-2020 Intel Corporation
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;===============================================================================

%ifndef __IA_COMMON_INC__
%define __IA_COMMON_INC__ 1

; use multi-byte nop's sequences to align loops and jmp's when threshold is reached
%use smartalign
ALIGNMODE p6,16

; Declares function, sets visibility and binding and adds __cdecl decoration when needed.
;   %1 - function name
;   %2 - visibility: PUBLIC or PRIVATE (compared case-insensitively)
;   %3 - optional binding: WEAK or STRONG (compared case-insensitively)
; Pushes the _DECLARE_FUNC_CTX_ context, which ENDFUNC later pops; calling the
; macro again before ENDFUNC is a fatal error.
%macro DECLARE_FUNC 2-3.nolist
  %xdefine %%func_name %1
  %xdefine %%visibility %2
  %xdefine %%binding %3

  %ifctx _DECLARE_FUNC_CTX_
    %fatal "DECLARE_FUNC: already in the context, need to call ENDFUNC"
  %endif

  ; Accepted visibility values are PUBLIC and PRIVATE
  %ifnidni %%visibility, PUBLIC
    %ifnidni %%visibility, PRIVATE
      %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
    %endif
  %endif

  ; Accepted binding values are WEAK or STRONG (default)
  %ifnempty %%binding
    %ifnidni %%binding, WEAK
      %ifnidni %%binding, STRONG
        ; %fatal directive (a "%%fatal" token would only be a macro-local label)
        %fatal Function %%func_name binding is not properly defined. Shall be: WEAK or STRONG.
      %endif
    %endif
  %endif

  ; Function decoration length
  %assign %%decoration_length 0

  ; The __cdecl calling convention name decoration (to have interoperability with C).
  ; Only public functions are decorated
  %ifidni %%visibility, PUBLIC
    %if ((IPP_BINARY_FORMAT == 2) || (IPP_BINARY_FORMAT == 3)) ; WIN32 or OSXEM64T
      %xdefine %%func_name _%[%%func_name]
      %assign %%decoration_length %%decoration_length+1
    %endif
  %endif

  ; If current macro is called from IPPASM macro, then function might be decorated by CPU-prefix
  %ifctx _IPPASM_CTX_
    %assign %%decoration_length %%decoration_length + %$decoration_length ; %$decoration_length belongs to _IPPASM_CTX_
  %endif

  %push _DECLARE_FUNC_CTX_
  ; setup context variables to use in ENDFUNC
  %xdefine %$func_name_ctx %%func_name
  %assign %$decoration_length %%decoration_length ; %$decoration_length belongs to _DECLARE_FUNC_CTX_

  ; Export the symbol. The comparison is case-insensitive so that it agrees
  ; with the visibility validation and the decoration step above.
  %ifidni %%visibility, PUBLIC
    %if (IPP_BINARY_FORMAT < 2) ; LINUX32 or LINUX32E
      ; ELF: also record the symbol type and its size (distance to the .LEnd_ label emitted by ENDFUNC)
      %ifnempty %%binding
        global %%func_name:function %%binding (%%func_name%+.LEnd_%+%%func_name - %%func_name)
      %else
        global %%func_name:function (%%func_name%+.LEnd_%+%%func_name - %%func_name)
      %endif
    %else
      global %%func_name
    %endif
  %endif
  %%func_name:

  ; CET enabling (macOS not supported)
  %if ((IPP_BINARY_FORMAT == 0) || (IPP_BINARY_FORMAT == 3)) ; elf32/win32
    db 0F3h, 00Fh, 01Eh, 0FBh ; endbr32
  %elif ((IPP_BINARY_FORMAT == 1) || (IPP_BINARY_FORMAT == 4)) ; elf64/win64
    db 0F3h, 00Fh, 01Eh, 0FAh ; endbr64
  %endif
%endmacro

; Calls assembler function declared by DECLARE_FUNC
; Default visibility is PRIVATE (affects decoration)
;   %1 - function name
;   %2 - optional visibility: PUBLIC or PRIVATE
%macro CALL_FUNC 1-2.nolist PRIVATE
  %xdefine %%func_name %1
  %xdefine %%visibility %2

  ; Accepted visibility values are PUBLIC and PRIVATE
  %ifnidni %%visibility, PUBLIC
    %ifnidni %%visibility, PRIVATE
      %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
    %endif
  %endif

  ; __cdecl on WIN32/OSXEM64T obligates to have underscore prefix decoration.
  ; Only PUBLIC functions are decorated.
  %ifidni %%visibility, PUBLIC
    %if ((IPP_BINARY_FORMAT == 2) || (IPP_BINARY_FORMAT == 3)) ; WIN32 or OSXEM64T
      %xdefine %%func_name _%1
    %endif
  %endif

  call %%func_name
%endmacro

; Declares function decorated by appropriate CPU prefix (for the merged library)
; Default visibility (if not defined) is PUBLIC.
;   %1 - function name
;   %2 - optional visibility: PUBLIC or PRIVATE
; Pushes the _IPPASM_CTX_ context, which ENDFUNC pops together with the
; context pushed by DECLARE_FUNC.
%macro IPPASM 1-2.nolist PUBLIC
  %xdefine %%func_name %1
  %xdefine %%visibility %2

  %ifctx _IPPASM_CTX_
    %fatal "IPPASM: already in the context, need to call ENDFUNC"
  %endif
  %push _IPPASM_CTX_

  %push _CPU_PREFIX_DECORATE_CTX_
    CPU_PREFIX_DECORATE %%func_name
    %xdefine %%func_name %$decorated_func_name
    ; %$$ addresses the enclosing _IPPASM_CTX_ context
    %assign %$$decoration_length %$decoration_length
  %pop _CPU_PREFIX_DECORATE_CTX_

  DECLARE_FUNC %%func_name, %%visibility
%endmacro

; Calls assembler function declared by IPPASM
; Default visibility is PRIVATE (affects decoration)
;   %1 - function name (without CPU prefix)
;   %2 - optional visibility: PUBLIC or PRIVATE
%macro CALL_IPPASM 1-2.nolist PRIVATE
  %xdefine %%func_name %1
  %xdefine %%visibility %2

  ; Accepted visibility values are PUBLIC and PRIVATE
  %ifnidni %%visibility, PUBLIC
    %ifnidni %%visibility, PRIVATE
      %fatal Function %%func_name visibility is not properly defined. Shall be: PRIVATE or PUBLIC.
    %endif
  %endif

  %push _CPU_PREFIX_DECORATE_CTX_
    CPU_PREFIX_DECORATE %%func_name
    %xdefine %%func_name %$decorated_func_name
  %pop _CPU_PREFIX_DECORATE_CTX_

  CALL_FUNC %%func_name,%%visibility
%endmacro

; End function macro - required to be called after IPPASM or DECLARE_FUNC macro invocation.
;   %1 - function name; must equal the declared name, with or without decoration
%macro ENDFUNC 1.nolist
  %xdefine %%func_name %1
  %ifnctx _DECLARE_FUNC_CTX_
    %fatal "Not in the context: _DECLARE_FUNC_CTX_"
  %endif

  ; Cross-check of context variable with macro parameter
  %defstr %%func_name_str %%func_name
  %defstr %%func_name_ctx_str %$func_name_ctx
  %substr %%func_name_ctx_str_not_decorated %%func_name_ctx_str %[%$decoration_length+1],-1 ; remove decoration (first X symbols)
  %ifnidn %%func_name_str,%%func_name_ctx_str
    %ifnidn %%func_name_str,%%func_name_ctx_str_not_decorated
      %fatal ENDFUNC: function name [%%func_name] does not match context: [%$func_name_ctx]
    %endif
  %endif

  ; Add local label to be able calculate function size
  ; Take function name from the context (real declaration name)
  .LEnd_%+%$func_name_ctx:
  %pop _DECLARE_FUNC_CTX_

  %ifctx _IPPASM_CTX_
    %pop _IPPASM_CTX_
  %endif
%endmacro

%endif

--------------------------------------------------------------------------------
/mpn/asm/ia_emm.inc:
--------------------------------------------------------------------------------
;===============================================================================
; Copyright 2014-2020 Intel Corporation
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;===============================================================================

%include "asmdefs.inc"
%include "ia_common.inc"
%include "utils.inc"

; Decorates function name with appropriate CPU prefix (for the merged library).
; The macro is context-dependent and returns decorated name in the %$decorated_func_name
; context variable.
;   %1 - undecorated function name
; Also sets %$decoration_length to the number of prefix characters added
; (3 in a merged build, 0 otherwise). Must be invoked inside the
; _CPU_PREFIX_DECORATE_CTX_ context.
%macro CPU_PREFIX_DECORATE 1.nolist
  %ifnctx _CPU_PREFIX_DECORATE_CTX_
    %fatal "Not in the context: _CPU_PREFIX_DECORATE_CTX_"
  %endif

  ; Add CPU-specific prefix for the dispatched library
  %ifdef _OWN_MERGED_BLD
    %if (__ARCH == __ARCH_PX)
      %xdefine %%func_name px_%1
      %assign %%decoration_length 3
    %endif
    %if (__ARCH == __ARCH_W7)
      %xdefine %%func_name w7_%1
      %assign %%decoration_length 3
    %endif
    %if (__ARCH == __ARCH_V8)
      %xdefine %%func_name v8_%1
      %assign %%decoration_length 3
    %endif
    %if (__ARCH == __ARCH_S8)
      %xdefine %%func_name s8_%1
      %assign %%decoration_length 3
    %endif
    %if (__ARCH == __ARCH_P8)
      %xdefine %%func_name p8_%1
      %assign %%decoration_length 3
    %endif
    %if (__ARCH == __ARCH_G9)
      %xdefine %%func_name g9_%1
      %assign %%decoration_length 3
    %endif
    %if (__ARCH == __ARCH_H9)
      %xdefine %%func_name h9_%1
      %assign %%decoration_length 3
    %endif
  %else
    %xdefine %%func_name %1
    %assign %%decoration_length 0
  %endif

  ; None of the branches above matched the current __ARCH
  %ifndef %%func_name
    %fatal "CPU_PREFIX_DECORATE: unknown decoration for: __ARCH = " __ARCH
  %endif
  %xdefine %$decorated_func_name %[%%func_name]
  %assign %$decoration_length %%decoration_length
%endmacro

%define NONVOLATILE_REGS_32_GPR ebp,ebx,esi,edi

; Saves non-volatile GPR registers on stack.
; Input - list of used registers.
; Resets LOCAL_FRAME to 0 and defines GPR_CUR, GPR_FRAME and ARG_1.
; BEGIN_INTERSECT/INTERSECT/END_INTERSECT and FOREACH are not defined in this
; file (presumably they come from utils.inc - confirm).
%macro USES_GPR 1+.nolist
  %assign LOCAL_FRAME 0
  %assign GPR_FRAME 0
  %define GPR_CUR

  BEGIN_INTERSECT
    INTERSECT {%1},{%[NONVOLATILE_REGS_32_GPR]}
    ; List of non-volatile GPR registers in the order they will be pushed on stack
    %xdefine GPR_CUR %$intersection
    ; 4 bytes per saved 32-bit register
    %assign GPR_FRAME %$cardinality * 4
  END_INTERSECT

  ; Push non-volatile GPRs on stack
  FOREACH GPR_CUR,{push}

  ; Set up offset of arguments from ESP
  %assign ARG_1 %[GPR_FRAME + 4]
%endmacro

; Restore preliminary saved by USES_GPR non-volatile GPR registers from the stack.
; The macro shall be called after function processing.
%macro REST_GPR 0.nolist
  %ifndef GPR_CUR
    %fatal "REST_GPR: no GPR_CUR defined"
  %endif
  ; Pop saved GPRs from the stack
  RFOREACH GPR_CUR,{pop}
%endmacro

; Loads the address of a symbol into a register.
;   %1 - destination register
;   %2 - symbol whose address is loaded
; With IPP_PIC defined the address is derived from the return address obtained
; by a call/pop pair; otherwise a plain lea is emitted.
%macro LD_ADDR 2.nolist
  %xdefine %%reg %1
  %xdefine %%addr %2

  %ifdef IPP_PIC
    call     %%LABEL
%%LABEL:  pop      %%reg
    sub      %%reg, %%LABEL-%%addr
  %else
    lea      %%reg, [%%addr]
  %endif
%endmacro

--------------------------------------------------------------------------------
/mpn/asm/intel64/bn_usqrschool.inc:
--------------------------------------------------------------------------------
;===============================================================================
; Copyright 2010-2020 Intel Corporation
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;===============================================================================

;
;
;     Purpose:  Cryptography Primitive.
;               BNU squaring support
;
;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; MULx1 general-case squarer macros
;;
;; Common conventions of the epilogue macros below:
;;   rDst/rSrc   - registers addressing the result and source vectors
;;   update_idx  - instruction text emitted verbatim after the last mul
;;   B, B0, B1   - multiplier registers
;;   T0..T3      - temporaries; some macros accept but do not use all of them
;; Every macro starts with a mul, so rax must already hold the first
;; multiplicand on entry; rax and rdx are overwritten.

;; dst = src * B epilogue (srcLen=4*n+3)
;; Stores four qwords at rDst+8 .. rDst+32 and then advances rDst by one qword.
;; T3 is not used.
%macro sMULx1_4N_3_ELOG 8.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%update_idx %3
  %xdefine %%B %4
  %xdefine %%T0 %5
  %xdefine %%T1 %6
  %xdefine %%T2 %7
  %xdefine %%T3 %8

   mul   %%B
   xor   %%T1, %%T1
   add   %%T0, rax
   mov   qword [%%rDst+sizeof(qword)], %%T0
   mov   rax, qword [%%rSrc+sizeof(qword)*2]
   adc   %%T1, rdx

   mul   %%B
   xor   %%T2, %%T2
   add   %%T1, rax
   mov   qword [%%rDst+sizeof(qword)*2], %%T1
   mov   rax, qword [%%rSrc+sizeof(qword)*3]
   adc   %%T2, rdx

   mul   %%B
   %%update_idx
   add   %%T2, rax
   mov   qword [%%rDst+sizeof(qword)*3], %%T2
   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
   adc   rdx, 0

   mov   qword [%%rDst+sizeof(qword)*4], rdx
   add   %%rDst, sizeof(qword)
%endmacro

;; dst = src * B epilogue (srcLen=4*n+1)
;; Stores two qwords at rDst+24 and rDst+32 and then advances rDst by one qword.
;; T1, T2 and T3 are not used.
%macro sMULx1_4N_1_ELOG 8.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%update_idx %3
  %xdefine %%B %4
  %xdefine %%T0 %5
  %xdefine %%T1 %6
  %xdefine %%T2 %7
  %xdefine %%T3 %8

   mul   %%B
   %%update_idx
   add   %%T0, rax
   mov   qword [%%rDst+sizeof(qword)*3], %%T0
   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
   adc   rdx, 0

   mov   qword [%%rDst+sizeof(qword)*4], rdx
   add   %%rDst, sizeof(qword)
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; MULx2 general-case multiplier macros
;;

;; dst = src * {B1:B0} epilogue (srcLen=4*n+1)
;; Stores three qwords at rDst+24 .. rDst+40; rDst itself is not advanced.
;; B0 and T3 are not used.
%macro sMULx2_4N_1_ELOG 9.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%update_idx %3
  %xdefine %%B0 %4
  %xdefine %%B1 %5
  %xdefine %%T0 %6
  %xdefine %%T1 %7
  %xdefine %%T2 %8
  %xdefine %%T3 %9

   mul   %%B1                                   ; {T2:T1} += a[lenA-1]*B1
   ;add   rDst, sizeof(qword)*2
   %%update_idx
   mov   qword [%%rDst+sizeof(qword)*3], %%T0
   add   %%T1, rax
   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
   adc   rdx, %%T2

   mov   qword [%%rDst+sizeof(qword)*4], %%T1
   mov   qword [%%rDst+sizeof(qword)*5], rdx
%endmacro

;; dst = src * {B1:B0} epilogue (srcLen=4*n+3)
;; Stores five qwords at rDst+8 .. rDst+40; rDst itself is not advanced.
%macro sMULx2_4N_3_ELOG 9.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%update_idx %3
  %xdefine %%B0 %4
  %xdefine %%B1 %5
  %xdefine %%T0 %6
  %xdefine %%T1 %7
  %xdefine %%T2 %8
  %xdefine %%T3 %9

   mul   %%B1                                   ; {T2:T1} += a[lenA-3]*B1
   xor   %%T3, %%T3
   add   %%T1, rax
   mov   rax, qword [%%rSrc+sizeof(qword)*2]    ; a[lenA-2]
   adc   %%T2, rdx

   mul   %%B0                                   ; {T3:T2:T1} += a[LenA-2]*B0
   mov   qword [%%rDst+sizeof(qword)], %%T0
   add   %%T1, rax
   mov   rax, qword [%%rSrc+sizeof(qword)*2]    ; a[lenA-2]
   adc   %%T2, rdx
   adc   %%T3, 0

   mul   %%B1                                   ; {T3:T2} += a[lenA-2]*B1
   xor   %%T0, %%T0
   add   %%T2, rax
   mov   rax, qword [%%rSrc+sizeof(qword)*3]    ; a[lenA-1]
   adc   %%T3, rdx

   mul   %%B0                                   ; {T0:T3:T2} += a[lenA-1]*B0
   mov   qword [%%rDst+sizeof(qword)*2], %%T1
   add   %%T2, rax
   mov   rax, qword [%%rSrc+sizeof(qword)*3]    ; a[lenA-1]
   adc   %%T3, rdx
   adc   %%T0, 0

   mul   %%B1                                   ; {T0:T3} += a[lenA-1]*B1
   ;add   rDst, sizeof(qword)*2
   %%update_idx
   mov   qword [%%rDst+sizeof(qword)*3], %%T2
   add   %%T3, rax
   ;mov   rax, qword [rSrc+idx*sizeof(qword)]
   adc   rdx, %%T0

   mov   qword [%%rDst+sizeof(qword)*4], %%T3
   mov   qword [%%rDst+sizeof(qword)*5], rdx
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; MLAx2 general-case multiplier macros
;;

;;
;; B0 = rSrc[-2]
;; B1 = rSrc[-1]
;; inp_vector = rSrc
;; out_vector = rDst
;;
;; Prologue: loads B0/B1 from the two qwords preceding rSrc, adds the low half
;; of B0*B1 into the qword at rDst-8 and leaves the running sum in {T1:T0}.
;; T2 is zeroed; T3 is not used. On exit rax holds the qword at [rSrc].
%macro sMLAx2_PLOG 8.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%B0 %3
  %xdefine %%B1 %4
  %xdefine %%T0 %5
  %xdefine %%T1 %6
  %xdefine %%T2 %7
  %xdefine %%T3 %8

   mov   %%B0, qword [%%rSrc-2*sizeof(qword)]   ; preload a[-2]
   mov   %%B1, qword [%%rSrc-sizeof(qword)]     ; and a[i-1]

   mov   rax, %%B1
   mul   %%B0                                   ; a[-2]*a[i-1]
   xor   %%T0, %%T0

   add   qword [%%rDst-sizeof(qword)], rax
   mov   rax, qword [%%rSrc]                    ; a[i]
   adc   %%T0, rdx

   mul   %%B0                                   ; B0*a[i]
   xor   %%T1, %%T1
   xor   %%T2, %%T2
   add   %%T0, rax
   mov   rax, qword [%%rSrc]                    ; a[i]
   adc   %%T1, rdx
%endmacro

;; dst = + src * {B1:B0} epilogue (srcLen=4*n+1)
;; Accumulates into the qword at rDst+24 and stores the carries at rDst+32 and
;; rDst+40. B0 and T3 are not used.
%macro sMLAx2_4N_1_ELOG 9.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%update_idx %3
  %xdefine %%B0 %4
  %xdefine %%B1 %5
  %xdefine %%T0 %6
  %xdefine %%T1 %7
  %xdefine %%T2 %8
  %xdefine %%T3 %9

   mul   %%B1                                   ; {T2:T1} += a[lenA-1]*B1 + r[lenA-1]
   ;add   rDst, sizeof(qword)*2
   %%update_idx
   add   %%T0, qword [%%rDst+sizeof(qword)*3]
   mov   qword [%%rDst+sizeof(qword)*3], %%T0
   adc   %%T1, rax
   adc   rdx, %%T2
   ;mov   rax, qword [rSrc+idx*sizeof(qword)]

   mov   qword [%%rDst+sizeof(qword)*4], %%T1
   mov   qword [%%rDst+sizeof(qword)*5], rdx
%endmacro

;; dst = + src * {B1:B0} epilogue (srcLen=4*n+3)
;; Accumulates into the qwords at rDst+8 .. rDst+24 and stores the carries at
;; rDst+32 and rDst+40.
%macro sMLAx2_4N_3_ELOG 9.nolist
  %xdefine %%rDst %1
  %xdefine %%rSrc %2
  %xdefine %%update_idx %3
  %xdefine %%B0 %4
  %xdefine %%B1 %5
  %xdefine %%T0 %6
  %xdefine %%T1 %7
  %xdefine %%T2 %8
  %xdefine %%T3 %9

   mul   %%B1                                   ; {T2:T1} += a[lenA-3]*B1
   xor   %%T3, %%T3
   add   %%T1, rax
   mov   rax, qword [%%rSrc+sizeof(qword)*2]    ; a[lenA-2]
   adc   %%T2, rdx

   mul   %%B0                                   ; {T3:T2:T1} += a[LenA-2]*B0 + r[len-3]
   add   %%T0, qword [%%rDst+sizeof(qword)]
   mov   qword [%%rDst+sizeof(qword)], %%T0
   adc   %%T1, rax
   adc   %%T2, rdx
   adc   %%T3, 0
   mov   rax, qword [%%rSrc+sizeof(qword)*2]    ; a[lenA-2]

   mul   %%B1                                   ; {T3:T2} += a[lenA-2]*B1
   xor   %%T0, %%T0
   add   %%T2, rax
   adc   %%T3, rdx
   mov   rax, qword [%%rSrc+sizeof(qword)*3]    ; a[lenA-1]

   mul   %%B0                                   ; {T0:T3:T2} += a[lenA-1]*B0 + r[lenA-2]
   add   %%T1, qword [%%rDst+sizeof(qword)*2]
   mov   qword [%%rDst+sizeof(qword)*2], %%T1
   adc   %%T2, rax
   adc   %%T3, rdx
   adc   %%T0, 0
   mov   rax, qword [%%rSrc+sizeof(qword)*3]    ; a[lenA-1]

   mul   %%B1                                   ; {T0:T3} += a[lenA-1]*B1 + r[lenA-1]
   ;add   rDst, sizeof(qword)*2
   %%update_idx
   add   %%T2, qword [%%rDst+sizeof(qword)*3]
   mov   qword [%%rDst+sizeof(qword)*3], %%T2
   adc   %%T3, rax
   adc   rdx, %%T0
   ;mov   rax, qword [rSrc+idx*sizeof(qword)]

   mov   qword [%%rDst+sizeof(qword)*4], %%T3
   mov   qword [%%rDst+sizeof(qword)*5], rdx
%endmacro

--------------------------------------------------------------------------------
/mpn/asm/intel64/clear_regs.inc:
--------------------------------------------------------------------------------
;===============================================================================
; Copyright 2020
Intel Corporation
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;===============================================================================

%ifndef _CLEAR_REGS_ASM_
%define _CLEAR_REGS_ASM_

%include "os.inc"

; The *_asm macros below switch on the LINUX symbol, which is not defined in
; this file (presumably os.inc provides it - confirm).

;
; This macro clears any GP registers passed
; (1 to 16 register operands; each one is XOR'ed with itself)
;
%macro clear_gps 1-16
%define %%NUM_REGS %0
%rep %%NUM_REGS
        xor     %1, %1
%rotate 1
%endrep
%endmacro

;
; This macro clears any XMM registers passed on SSE
;
%macro clear_xmms_sse 1-16
%define %%NUM_REGS %0
%rep %%NUM_REGS
        pxor    %1, %1
%rotate 1
%endrep
%endmacro

;
; This macro clears any XMM registers passed on AVX
;
%macro clear_xmms_avx 1-16
%define %%NUM_REGS %0
%rep %%NUM_REGS
        vpxor   %1, %1
%rotate 1
%endrep
%endmacro

;
; This macro clears any YMM registers passed
;
%macro clear_ymms 1-16
%define %%NUM_REGS %0
%rep %%NUM_REGS
        vpxor   %1, %1
%rotate 1
%endrep
%endmacro

;
; This macro clears any ZMM registers passed
; (up to 32 register operands)
;
%macro clear_zmms 1-32
%define %%NUM_REGS %0
%rep %%NUM_REGS
        vpxorq  %1, %1
%rotate 1
%endrep
%endmacro

;
; This macro clears all scratch GP registers
; for Windows or Linux
;
%macro clear_scratch_gps_asm 0
        clear_gps rax, rcx, rdx, r8, r9, r10, r11
%ifdef LINUX
        ; rdi and rsi are additionally cleared on Linux only
        clear_gps rdi, rsi
%endif
%endmacro

;
; This macro clears all scratch XMM registers on SSE
;
%macro clear_scratch_xmms_sse_asm 0
%ifdef LINUX
%assign i 0
%rep 16
        pxor    xmm %+ i, xmm %+ i
%assign i (i+1)
%endrep
; On Windows, XMM0-XMM5 registers are scratch registers
%else
%assign i 0
%rep 6
        pxor    xmm %+ i, xmm %+ i
%assign i (i+1)
%endrep
%endif ; LINUX
%endmacro

;
; This macro clears all scratch XMM registers on AVX
;
%macro clear_scratch_xmms_avx_asm 0
%ifdef LINUX
        vzeroall
; On Windows, XMM0-XMM5 registers are scratch registers
%else
%assign i 0
%rep 6
        vpxor   xmm %+ i, xmm %+ i
%assign i (i+1)
%endrep
%endif ; LINUX
%endmacro

;
; This macro clears all scratch YMM registers
;
; It should be called before restoring the XMM registers
; for Windows (XMM6-XMM15)
;
%macro clear_scratch_ymms_asm 0
; On Linux, all YMM registers are scratch registers
%ifdef LINUX
        vzeroall
; On Windows, YMM0-YMM5 registers are scratch registers.
; YMM6-YMM15 upper 128 bits are scratch registers too, but
; the lower 128 bits are to be restored after calling this function,
; which clears the upper bits too.
%else
%assign i 0
%rep 6
        vpxor   ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep
%endif ; LINUX
%endmacro

;
; This macro clears all scratch ZMM registers
;
; It should be called before restoring the XMM registers
; for Windows (XMM6-XMM15). YMM registers are used
; on purpose, since XOR'ing YMM registers is faster
; than XOR'ing ZMM registers, and the operation clears
; also the upper 256 bits
;
%macro clear_scratch_zmms_asm 0
; On Linux, all ZMM registers are scratch registers
%ifdef LINUX
        vzeroall
        ;; vzeroall only clears the first 16 ZMM registers
%assign i 16
%rep 16
        vpxorq  ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep
; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers.
; ZMM6-ZMM15 upper 384 bits are scratch registers too, but
; the lower 128 bits are to be restored after calling this function,
; which clears the upper bits too.
%else
%assign i 0
%rep 6
        vpxorq  ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep

%assign i 16
%rep 16
        vpxorq  ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep
%endif ; LINUX
%endmacro

%endif ;; _CLEAR_REGS_ASM

--------------------------------------------------------------------------------
/mpn/asm/intel64/cpinitas.asm:
--------------------------------------------------------------------------------
;===============================================================================
; Copyright 2014-2020 Intel Corporation
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;===============================================================================

%include "asmdefs.inc"
%include "ia_32e.inc"

%assign LOCAL_ALIGN_FACTOR 32

; The whole file is assembled only when __ARCH_DATA is defined (see asmdefs.inc).
%ifdef __ARCH_DATA

segment .text align=LOCAL_ALIGN_FACTOR

;####################################################################
;#          void cpGetReg( int* buf, int valueEAX, int valueECX ); #
;####################################################################
; Executes CPUID with EAX=valueEAX and ECX=valueECX and stores the resulting
; EAX, EBX, ECX, EDX into buf[0..3].

%ifdef WIN32E
  %define buf       rcx
  %define valueEAX  edx
  %define valueECX  r8d
%else
  %define buf       rdi
  %define valueEAX  esi
  %define valueECX  edx
%endif

align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cpGetReg,PUBLIC
        push rbx                    ; cpuid overwrites rbx, which is restored before return
        movsxd  r9, valueEAX
        movsxd  r10, valueECX
        mov     r11, buf            ; keep the buffer pointer out of the registers cpuid writes

        mov     rax, r9
        mov     rcx, r10
        xor     ebx, ebx
        xor     edx, edx
        cpuid
        mov     [r11], eax
        mov     [r11 + 4], ebx
        mov     [r11 + 8], ecx
        mov     [r11 + 12], edx
        pop rbx
        ret
ENDFUNC cpGetReg

;###################################################

; OSXSAVE support, feature information after cpuid(1), ECX, bit 27 ( XGETBV is enabled by OS )
%assign XSAVEXGETBV_FLAG   8000000h

; Feature information after XGETBV(ECX=0), EAX, bits 2,1 ( XMM state and YMM state are enabled by OS )
%assign XGETBV_MASK        06h

%assign XGETBV_AVX512_MASK 0E0h

; Returns 1 in eax when CPUID(1).ECX has both bits of mask 018000000h set and
; XGETBV(0).EAX has all XGETBV_MASK bits set; returns 0 otherwise.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cp_is_avx_extension,PUBLIC
         push     rbx
         mov      eax, 1
         cpuid
         xor      eax, eax
         and      ecx, 018000000h
         cmp      ecx, 018000000h
         jne      .not_avx
         xor      ecx, ecx
         db 00fh,001h,0d0h        ; xgetbv
         mov      ecx, eax
         xor      eax, eax
         and      ecx, XGETBV_MASK
         cmp      ecx, XGETBV_MASK
         jne      .not_avx
         mov      eax, 1
.not_avx:
         pop      rbx
         ret
ENDFUNC cp_is_avx_extension

; Returns 1 in eax when CPUID(1).ECX has XSAVEXGETBV_FLAG set and
; XGETBV(0).EAX has all XGETBV_AVX512_MASK bits set; returns 0 otherwise.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cp_is_avx512_extension,PUBLIC
         push     rbx
         mov      eax, 1
         cpuid
         xor      eax, eax
         and      ecx, XSAVEXGETBV_FLAG
         cmp      ecx, XSAVEXGETBV_FLAG
         jne      .not_avx512
         xor      ecx, ecx
         db 00fh,001h,0d0h        ; xgetbv
         mov      ecx, eax
         xor      eax, eax
         and      ecx, XGETBV_AVX512_MASK
         cmp      ecx, XGETBV_AVX512_MASK
         jne      .not_avx512
         mov      eax, 1
.not_avx512:
         pop      rbx
         ret
ENDFUNC cp_is_avx512_extension

; Executes one hand-encoded AVX-512 instruction and returns 0 in eax.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cp_issue_avx512_instruction,PUBLIC
         db 062h,0f1h,07dh,048h,0efh,0c0h ; vpxord  zmm0, zmm0, zmm0
         xor      eax, eax
         ret
ENDFUNC cp_issue_avx512_instruction

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Returns the RDTSC value as a single 64-bit number in rax.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cp_get_pentium_counter,PUBLIC
         rdtsc
         sal    rdx,32
         or     rax,rdx
         ret
ENDFUNC cp_get_pentium_counter

; Executes cpuid(0), then returns the RDTSCP value in rax.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cpStartTscp,PUBLIC
         push     rbx
         xor      rax, rax
         cpuid
         pop      rbx
         rdtscp
         sal      rdx,32
         or       rax,rdx
         ret
ENDFUNC cpStartTscp

; Reads RDTSCP first, then executes cpuid(0); returns the counter in rax.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cpStopTscp,PUBLIC
         rdtscp
         sal      rdx,32
         or       rax,rdx
         push     rax               ; keep the counter across cpuid
         push     rbx
         xor      rax, rax
         cpuid
         pop      rbx
         pop      rax
         ret
ENDFUNC cpStopTscp

; Executes cpuid(0), then returns the RDTSC value in rax.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cpStartTsc,PUBLIC
         push     rbx
         xor      rax, rax
         cpuid
         pop      rbx
         rdtsc
         sal      rdx,32
         or       rax,rdx
         ret
ENDFUNC cpStartTsc

; Reads RDTSC first, then executes cpuid(0); returns the counter in rax.
align LOCAL_ALIGN_FACTOR
DECLARE_FUNC cpStopTsc,PUBLIC
         rdtsc
         sal      rdx,32
         or       rax,rdx
         push     rax               ; keep the counter across cpuid
         push     rbx
         xor      rax, rax
         cpuid
         pop      rbx
         pop      rax
         ret
ENDFUNC cpStopTsc


;*****************************************
; int cpGetCacheSize( int* tableCache );
;
; Runs CPUID(2), copies the non-zero result registers whose bit 31 is clear
; into a 16-byte local buffer, then walks tableCache in 8-byte entries
; (first dword compared by its low byte, second dword returned) until an entry
; whose first dword is 0. Returns the second dword of the first entry whose
; byte occurs in the buffer, or -1 when nothing matches or CPUID(2) does not
; report al == 1.
; USES_GPR/USES_XMM/COMP_ABI/REST_XMM/REST_GPR as used here are presumably the
; ia_32e.inc versions - confirm.
; NOTE(review): the byte scan covers [rsp+1 .. rsp+esi], which lines up with
; the buffer only when eax is the first register stored - confirm that eax
; cannot be filtered out here.
align LOCAL_ALIGN_FACTOR
%define table rdi
DECLARE_FUNC cpGetCacheSize,PUBLIC
%assign LOCAL_FRAME 16
        USES_GPR rsi, rdi, rbx, rbp
        USES_XMM
        COMP_ABI 1

        mov     rbp, rsp            ; rbp = write cursor into the local buffer
        xor     esi, esi            ; esi = highest buffer byte index to scan

        mov     eax, 2
        cpuid

        cmp     al, 1
        jne     .GetCacheSize_11

        ; zero every register that has bit 31 set
        test    eax, 080000000h
        jz      .GetCacheSize_00
        xor     eax, eax
.GetCacheSize_00:
        test    ebx, 080000000h
        jz      .GetCacheSize_01
        xor     ebx, ebx
.GetCacheSize_01:
        test    ecx, 080000000h
        jz      .GetCacheSize_02
        xor     ecx, ecx
.GetCacheSize_02:
        test    edx, 080000000h
        jz      .GetCacheSize_03
        xor     edx, edx

        ; store the remaining non-zero registers back to back
.GetCacheSize_03:
        test    eax, eax
        jz      .GetCacheSize_04
        mov     [rbp], eax
        add     rbp, 4
        add     esi, 3              ; only 3 here: the byte at offset 0 (al) is never scanned
.GetCacheSize_04:
        test    ebx, ebx
        jz      .GetCacheSize_05
        mov     [rbp], ebx
        add     rbp, 4
        add     esi, 4
.GetCacheSize_05:
        test    ecx, ecx
        jz      .GetCacheSize_06
        mov     [rbp], ecx
        add     rbp, 4
        add     esi, 4
.GetCacheSize_06:
        test    edx, edx
        jz      .GetCacheSize_07
        mov     [rbp], edx
        add     esi, 4

.GetCacheSize_07:
        test    esi, esi
        jz      .GetCacheSize_11
        mov     eax, -1             ; result when the table ends without a match
.GetCacheSize_08:
        xor     edx, edx
        add     edx, [table]        ; load the entry key and set ZF for the terminator test
        jz      .ExitGetCacheSize00
        add     table, 8
        mov     ecx, esi
.GetCacheSize_09:
        cmp     dl, BYTE [rsp + rcx]
        je      .GetCacheSize_10
        dec     ecx
        jnz     .GetCacheSize_09
        jmp     .GetCacheSize_08

.GetCacheSize_10:
        mov     eax, [table - 4]    ; second dword of the matched entry

.ExitGetCacheSize00:
        REST_XMM
        REST_GPR
        ret

.GetCacheSize_11:
        mov     eax, -1
        jmp     .ExitGetCacheSize00
ENDFUNC cpGetCacheSize

;****************************

%endif ; __ARCH_DATA

--------------------------------------------------------------------------------
/mpn/asm/intel64/emulator.inc:
--------------------------------------------------------------------------------
;===============================================================================
2 | ; Copyright 2009-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: EM64T Cryptography Primitive.
20 | ;
21 | ; Wrapper macros for PCLMULQDQ / AES-NI. With my_emulator == 0 each macro emits the native
22 | ; instruction; otherwise it calls the matching external emu_* routine (declared at the end of this file).
23 |
24 | %ifndef _EMULATOR_INC_
25 | %define _EMULATOR_INC_
26 | ;; my_pclmulqdq dst, src, imm: native pclmulqdq, or call emu_pclmulqdq(&dst_copy, &src_copy, imm) with flags and GPRs preserved.
27 | %macro my_pclmulqdq 3.nolist
28 | %xdefine %%xxDst %1
29 | %xdefine %%xxSrc %2
30 | %xdefine %%xxOp %3
31 |
32 | %if (my_emulator == 0)
33 | pclmulqdq %%xxDst, %%xxSrc, %%xxOp
34 | %else
35 | ;;
36 | ;; rsp
37 | ;; registers
38 | ;; +00 => xxDst
39 | ;; +16 => xxSrc
40 |
41 | pushf
42 | push rax
43 | push rbx
44 | push rcx
45 | push rdx
46 | push rdi
47 | push rsi
48 | push rbp
49 | push r8
50 | push r9
51 | push r10
52 | push r11
53 | push r12
54 | push r13
55 | push r14
56 | push r15
57 |
58 | %assign %%stackSize (sizeof(oword)*2)
59 | sub rsp,%%stackSize
60 |
61 | movdqu oword [rsp+00], %%xxDst ;; save Dst
62 | movdqu oword [rsp+16], %%xxSrc ;; save Src
63 |
64 | lea rcx, [rsp+00]
65 | lea rdx, [rsp+16]
66 | mov r8, %%xxOp
67 |
68 | sub rsp, (sizeof(qword)*3)
69 | call emu_pclmulqdq
70 | add rsp, (sizeof(qword)*3)
71 |
72 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
73 | ;movdqu xxSrc, oword [rsp+16] ;; return Src
74 | add rsp, %%stackSize ;; must be rsp: a 32-bit "add esp" zero-extends and wipes the upper half of rsp
75 |
76 | pop r15
77 | pop r14
78 | pop r13
79 | pop r12
80 | pop r11
81 | pop r10
82 | pop r9
83 | pop r8
84 | pop rbp
85 | pop rsi
86 | pop rdi
87 | pop rdx
88 | pop rcx
89 | pop rbx
90 | pop rax
91 | popf
92 | %endif
93 | %endmacro
94 | ;; my_aesenc dst, src: native aesenc, or call emu_aesenc(&dst_copy, &src_copy) with flags and GPRs preserved.
95 | %macro my_aesenc 2.nolist
96 | %xdefine %%xxDst %1
97 | %xdefine %%xxSrc %2
98 |
99 | %if (my_emulator == 0)
100 | aesenc %%xxDst, %%xxSrc
101 | %else
102 | pushf
103 | push rax
104 | push rbx
105 | push rcx
106 | push rdx
107 | push rdi
108 | push rsi
109 | push rbp
110 | push r8
111 | push r9
112 | push r10
113 | push r11
114 | push r12
115 | push r13
116 | push r14
117 | push r15
118 |
119 | %assign %%stackSize (sizeof(oword)*2)
120 | sub rsp,%%stackSize
121 |
122 | movdqu oword [rsp+00], %%xxDst ;; save Dst
123 | movdqu oword [rsp+16], %%xxSrc ;; save Src
124 |
125 | lea rcx, [rsp+00]
126 | lea rdx, [rsp+16]
127 |
128 | sub rsp, (sizeof(qword)*2)
129 | call emu_aesenc
130 | add rsp, (sizeof(qword)*2)
131 |
132 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
133 | add rsp, %%stackSize ;; 64-bit add: "add esp" would truncate the stack pointer
134 |
135 | pop r15
136 | pop r14
137 | pop r13
138 | pop r12
139 | pop r11
140 | pop r10
141 | pop r9
142 | pop r8
143 | pop rbp
144 | pop rsi
145 | pop rdi
146 | pop rdx
147 | pop rcx
148 | pop rbx
149 | pop rax
150 | popf
151 | %endif
152 | %endmacro
153 | ;; my_aesenclast dst, src: native aesenclast, or call emu_aesenclast(&dst_copy, &src_copy) with flags and GPRs preserved.
154 | %macro my_aesenclast 2.nolist
155 | %xdefine %%xxDst %1
156 | %xdefine %%xxSrc %2
157 |
158 | %if (my_emulator == 0)
159 | aesenclast %%xxDst, %%xxSrc
160 | %else
161 | pushf
162 | push rax
163 | push rbx
164 | push rcx
165 | push rdx
166 | push rdi
167 | push rsi
168 | push rbp
169 | push r8
170 | push r9
171 | push r10
172 | push r11
173 | push r12
174 | push r13
175 | push r14
176 | push r15
177 |
178 | %assign %%stackSize (sizeof(oword)*2)
179 | sub rsp,%%stackSize
180 |
181 | movdqu oword [rsp+00], %%xxDst ;; save Dst
182 | movdqu oword [rsp+16], %%xxSrc ;; save Src
183 |
184 | lea rcx, [rsp+00]
185 | lea rdx, [rsp+16]
186 |
187 | sub rsp, (sizeof(qword)*2)
188 | call emu_aesenclast
189 | add rsp, (sizeof(qword)*2)
190 |
191 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
192 | add rsp, %%stackSize ;; 64-bit add: "add esp" would truncate the stack pointer
193 |
194 | pop r15
195 | pop r14
196 | pop r13
197 | pop r12
198 | pop r11
199 | pop r10
200 | pop r9
201 | pop r8
202 | pop rbp
203 | pop rsi
204 | pop rdi
205 | pop rdx
206 | pop rcx
207 | pop rbx
208 | pop rax
209 | popf
210 | %endif
211 | %endmacro
212 | ;; my_aesdec dst, src: native aesdec, or call emu_aesdec(&dst_copy, &src_copy) with flags and GPRs preserved.
213 | %macro my_aesdec 2.nolist
214 | %xdefine %%xxDst %1
215 | %xdefine %%xxSrc %2
216 |
217 | %if (my_emulator == 0)
218 | aesdec %%xxDst, %%xxSrc
219 | %else
220 | pushf
221 | push rax
222 | push rbx
223 | push rcx
224 | push rdx
225 | push rdi
226 | push rsi
227 | push rbp
228 | push r8
229 | push r9
230 | push r10
231 | push r11
232 | push r12
233 | push r13
234 | push r14
235 | push r15
236 |
237 | %assign %%stackSize (sizeof(oword)*2)
238 | sub rsp,%%stackSize
239 |
240 | movdqu oword [rsp+00], %%xxDst ;; save Dst
241 | movdqu oword [rsp+16], %%xxSrc ;; save Src
242 |
243 | lea rcx, [rsp+00]
244 | lea rdx, [rsp+16]
245 |
246 | sub rsp, (sizeof(qword)*2)
247 | call emu_aesdec
248 | add rsp, (sizeof(qword)*2)
249 |
250 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
251 | add rsp, %%stackSize ;; 64-bit add: "add esp" would truncate the stack pointer
252 |
253 | pop r15
254 | pop r14
255 | pop r13
256 | pop r12
257 | pop r11
258 | pop r10
259 | pop r9
260 | pop r8
261 | pop rbp
262 | pop rsi
263 | pop rdi
264 | pop rdx
265 | pop rcx
266 | pop rbx
267 | pop rax
268 | popf
269 | %endif
270 | %endmacro
271 | ;; my_aesdeclast dst, src: native aesdeclast, or call emu_aesdeclast(&dst_copy, &src_copy) with flags and GPRs preserved.
272 | %macro my_aesdeclast 2.nolist
273 | %xdefine %%xxDst %1
274 | %xdefine %%xxSrc %2
275 |
276 | %if (my_emulator == 0)
277 | aesdeclast %%xxDst, %%xxSrc ;; was aesenclast: the native path must match the emu_aesdeclast path below
278 | %else
279 | pushf
280 | push rax
281 | push rbx
282 | push rcx
283 | push rdx
284 | push rdi
285 | push rsi
286 | push rbp
287 | push r8
288 | push r9
289 | push r10
290 | push r11
291 | push r12
292 | push r13
293 | push r14
294 | push r15
295 |
296 | %assign %%stackSize (sizeof(oword)*2)
297 | sub rsp,%%stackSize
298 |
299 | movdqu oword [rsp+00], %%xxDst ;; save Dst
300 | movdqu oword [rsp+16], %%xxSrc ;; save Src
301 |
302 | lea rcx, [rsp+00]
303 | lea rdx, [rsp+16]
304 |
305 | sub rsp, (sizeof(qword)*2)
306 | call emu_aesdeclast
307 | add rsp, (sizeof(qword)*2)
308 |
309 | movdqu %%xxDst, oword [rsp+00] ;; return Dst
310 | add rsp, %%stackSize ;; 64-bit add: "add esp" would truncate the stack pointer
311 |
312 | pop r15
313 | pop r14
314 | pop r13
315 | pop r12
316 | pop r11
317 | pop r10
318 | pop r9
319 | pop r8
320 | pop rbp
321 | pop rsi
322 | pop rdi
323 | pop rdx
324 | pop rcx
325 | pop rbx
326 | pop rax
327 | popf
328 | %endif
329 | %endmacro
330 |
331 | %if (my_emulator != 0)
332 | extern emu_pclmulqdq
333 | extern emu_aesenc
334 | extern emu_aesenclast
335 | extern emu_aesdec
336 | extern emu_aesdeclast
337 | %endif
338 |
339 | %endif
340 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/ia_32e_regs.inc:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2012-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: EM64T Cryptography Primitive.
20 | ;
21 | ;
22 | ;
23 |
24 | ;;
25 | ;; Uniform aliases for the general-purpose registers: numbered names (r0..r7) and size-suffixed names (<reg>d/w/b)
26 | ;;
27 |
28 | %ifndef _IA_32_REGS_INC_
29 | %define _IA_32_REGS_INC_
30 |
31 | %define r0 rax ;; 64-bits GPRs
32 | %define r1 rbx
33 | %define r2 rcx
34 | %define r3 rdx
35 | %define r4 rdi
36 | %define r5 rsi
37 | %define r6 rbp
38 | %define r7 rsp
39 |
40 | %define r0d eax ;; 32-bits GPRs
41 | %define r1d ebx
42 | %define r2d ecx
43 | %define r3d edx
44 | %define r4d edi
45 | %define r5d esi
46 | %define r6d ebp
47 | %define r7d esp
48 | ;; 32-bit views spelled as <64-bit name>d
49 | %define raxd eax
50 | %define rbxd ebx
51 | %define rcxd ecx
52 | %define rdxd edx
53 | %define rdid edi
54 | %define rsid esi
55 | %define rbpd ebp
56 |
57 | %define r0w ax ;; 16-bits GPRs
58 | %define r1w bx
59 | %define r2w cx
60 | %define r3w dx
61 | %define r4w di
62 | %define r5w si
63 | %define r6w bp
64 | %define r7w sp
65 | ;; 16-bit views spelled as <64-bit name>w
66 | %define raxw ax
67 | %define rbxw bx
68 | %define rcxw cx
69 | %define rdxw dx
70 | %define rdiw di
71 | %define rsiw si
72 | %define rbpw bp
73 |
74 | %define r0b al ;; 8-bits GPRs
75 | %define r1b bl
76 | %define r2b cl
77 | %define r3b dl
78 | %define r4b dil
79 | %define r5b sil
80 | %define r6b bpl
81 | %define r7b spl
82 | ;; low-byte views spelled as <64-bit name>b
83 | %define raxb al
84 | %define rbxb bl
85 | %define rcxb cl
86 | %define rdxb dl
87 | %define rdib dil
88 | %define rsib sil
89 | %define rbpb bpl
90 | ;; explicit low-byte (bl suffix) and high-byte (bh suffix) views of rax/rbx/rcx/rdx
91 | %define raxbl al
92 | %define rbxbl bl
93 | %define rcxbl cl
94 | %define rdxbl dl
95 | %define raxbh ah
96 | %define rbxbh bh
97 | %define rcxbh ch
98 | %define rdxbh dh
99 |
100 | ;;
101 | ;; Parameter locations rpar1..rpar6, selected by the WIN32E / LINUX32E build macro
102 | ;; (under WIN32E the 5th and 6th are memory operands; ARG_5 / ARG_6 are defined elsewhere)
103 | %ifdef WIN32E
104 | %define rpar1 rcx
105 | %define rpar2 rdx
106 | %define rpar3 r8
107 | %define rpar4 r9
108 | %define rpar5 [rsp + ARG_5]
109 | %define rpar6 [rsp + ARG_6]
110 | %endif
111 | ;; under LINUX32E all six parameters are registers
112 | %ifdef LINUX32E
113 | %define rpar1 rdi
114 | %define rpar2 rsi
115 | %define rpar3 rdx
116 | %define rpar4 rcx
117 | %define rpar5 r8
118 | %define rpar6 r9
119 | %endif
120 |
121 | ;; use GPR
implementation everywhere possible
122 | %assign GPR_version 1
123 |
124 | %endif
125 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_uadd_m7as.asm:
--------------------------------------------------------------------------------
1 | %include "asmdefs.inc"
2 | %include "ia_32e.inc"
3 |
4 | ;
5 | ; carry, r[:n] = a[:n] + b[:n]
6 | ; uint64_t mpn_add_vectorized(uint64_t *r, const uint64_t *a, const uint64_t *b, unsigned int n)
7 | ;
8 | ; n = 1..8 use straight-line adc chains; n > 8 uses a 4-limb loop followed by a 1..3 limb tail.
9 | segment .text align=ARCH_ALIGN_FACTOR
10 | ; CF is captured with "sbb reg, reg" (0 or -1) and re-armed with "add reg, reg"; .FINAL negates it so 0 or 1 is returned.
11 | align ARCH_ALIGN_FACTOR
12 | IPPASM mpn_add_vectorized,PUBLIC
13 | %assign LOCAL_FRAME 0
14 | USES_GPR rsi,rdi
15 | USES_XMM
16 | COMP_ABI 4
17 |
18 | ; rdi = r
19 | ; rsi = a
20 | ; rdx = b
21 | ; rcx = n
22 |
23 | mov ecx, ecx ; zero-extend the unsigned length (movsxd turned n >= 2^31 into a negative count)
24 | xor rax, rax
25 |
26 | cmp rcx, 2
27 | jge .ADD_GE2
28 |
29 | ;********** lenSrcA == 1 *************************************
30 | add rax, rax ; rax == 0 here, so this only clears CF
31 | mov r8, qword [rsi] ; r8 = a
32 | adc r8, qword [rdx] ; r8 = a+b = s
33 | mov qword [rdi], r8 ; save s
34 | sbb rax, rax ; rax = -carry
35 | jmp .FINAL
36 |
37 | ;********** lenSrcA == 1 END ********************************
38 |
39 | .ADD_GE2:
40 | jg .ADD_GT2
41 |
42 | ;********** lenSrcA == 2 *************************************
43 | add rax, rax
44 | mov r8, qword [rsi] ; r8 = a0
45 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
46 | mov r9, qword [rsi+8] ; r9 = a1
47 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
48 | mov qword [rdi], r8 ; save s0
49 | mov qword [rdi+8], r9 ; save s1
50 | sbb rax, rax ; rax = carry
51 | jmp .FINAL
52 |
53 | ;********** lenSrcA == 2 END *********************************
54 |
55 | .ADD_GT2:
56 | cmp rcx, 4
57 | jge .ADD_GE4
58 |
59 | ;********** lenSrcA == 3 *************************************
60 | add rax, rax
61 | mov r8, qword [rsi] ; r8 = a0
62 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
63 | mov r9, qword [rsi+8] ; r9 = a1
64 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
65 | mov r10, qword [rsi+16] ; r10 = a2
66 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
67 | mov qword [rdi], r8 ; save s0
68 | mov qword [rdi+8], r9 ; save s1
69 | mov qword [rdi+16], r10 ; save s2
70 | sbb rax, rax ; rax = carry
71 | jmp .FINAL
72 |
73 | ;********** lenSrcA == 3 END *********************************
74 |
75 | .ADD_GE4:
76 | jg .ADD_GT4
77 |
78 | ;********** lenSrcA == 4 *************************************
79 | add rax, rax
80 | mov r8, qword [rsi] ; r8 = a0
81 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
82 | mov r9, qword [rsi+8] ; r9 = a1
83 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
84 | mov r10, qword [rsi+16] ; r10 = a2
85 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
86 | mov r11, qword [rsi+24] ; r11 = a3
87 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
88 | mov qword [rdi], r8 ; save s0
89 | mov qword [rdi+8], r9 ; save s1
90 | mov qword [rdi+16], r10 ; save s2
91 | mov qword [rdi+24], r11 ; save s3
92 | sbb rax, rax ; rax = carry
93 | jmp .FINAL
94 |
95 | ;********** lenSrcA == 4 END *********************************
96 |
97 | .ADD_GT4:
98 | cmp rcx, 6
99 | jge .ADD_GE6
100 |
101 | ;********** lenSrcA == 5 *************************************
102 | add rax, rax
103 | mov r8, qword [rsi] ; r8 = a0
104 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
105 | mov r9, qword [rsi+8] ; r9 = a1
106 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
107 | mov r10, qword [rsi+16] ; r10 = a2
108 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
109 | mov r11, qword [rsi+24] ; r11 = a3
110 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
111 | mov rcx, qword [rsi+32] ; rcx = a4
112 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
113 | mov qword [rdi], r8 ; save s0
114 | mov qword [rdi+8], r9 ; save s1
115 | mov qword [rdi+16], r10 ; save s2
116 | mov qword [rdi+24], r11 ; save s3
117 | mov qword [rdi+32], rcx ; save s4
118 | sbb rax, rax ; rax = carry
119 | jmp .FINAL
120 |
121 | ;********** lenSrcA == 5 END *********************************
122 |
123 | .ADD_GE6:
124 | jg .ADD_GT6
125 |
126 | ;********** lenSrcA == 6 *************************************
127 | add rax, rax
128 | mov r8, qword [rsi] ; r8 = a0
129 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
130 | mov r9, qword [rsi+8] ; r9 = a1
131 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
132 | mov r10, qword [rsi+16] ; r10 = a2
133 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
134 | mov r11, qword [rsi+24] ; r11 = a3
135 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
136 | mov rcx, qword [rsi+32] ; rcx = a4
137 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
138 | mov rsi, qword [rsi+40] ; rsi = a5
139 | adc rsi, qword [rdx+40] ; rsi = a5+b5 = s5
140 | mov qword [rdi], r8 ; save s0
141 | mov qword [rdi+8], r9 ; save s1
142 | mov qword [rdi+16], r10 ; save s2
143 | mov qword [rdi+24], r11 ; save s3
144 | mov qword [rdi+32], rcx ; save s4
145 | mov qword [rdi+40], rsi ; save s5
146 | sbb rax, rax ; rax = carry
147 | jmp .FINAL
148 |
149 | ;********** lenSrcA == 6 END *********************************
150 |
151 | .ADD_GT6:
152 | cmp rcx, 8
153 | jge .ADD_GE8
154 |
155 | .ADD_EQ7:
156 | ;********** lenSrcA == 7 *************************************
157 | add rax, rax
158 | mov r8, qword [rsi] ; r8 = a0
159 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
160 | mov r9, qword [rsi+8] ; r9 = a1
161 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
162 | mov r10, qword [rsi+16] ; r10 = a2
163 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
164 | mov r11, qword [rsi+24] ; r11 = a3
165 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
166 | mov rcx, qword [rsi+32] ; rcx = a4
167 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
168 | mov qword [rdi], r8 ; save s0
169 | mov r8, qword [rsi+40] ; r8 = a5
170 | adc r8, qword [rdx+40] ; r8 = a5+b5 = s5
171 | mov rsi, qword [rsi+48] ; rsi = a6
172 | adc rsi, qword [rdx+48] ; rsi = a6+b6 = s6
173 | mov qword [rdi+8], r9 ; save s1
174 | mov qword [rdi+16], r10 ; save s2
175 | mov qword [rdi+24], r11 ; save s3
176 | mov qword [rdi+32], rcx ; save s4
177 | mov qword [rdi+40], r8 ; save s5
178 | mov qword [rdi+48], rsi ; save s6
179 | sbb rax, rax ; rax = carry
180 | jmp .FINAL
181 |
182 | ;********** lenSrcA == 7 END *********************************
183 |
184 |
185 | .ADD_GE8:
186 | jg .ADD_GT8
187 |
188 | ;********** lenSrcA == 8 *************************************
189 | add rax, rax
190 | mov r8, qword [rsi] ; r8 = a0
191 | adc r8, qword [rdx] ; r8 = a0+b0 = s0
192 | mov r9, qword [rsi+8] ; r9 = a1
193 | adc r9, qword [rdx+8] ; r9 = a1+b1 = s1
194 | mov r10, qword [rsi+16] ; r10 = a2
195 | adc r10, qword [rdx+16] ; r10 = a2+b2 = s2
196 | mov r11, qword [rsi+24] ; r11 = a3
197 | adc r11, qword [rdx+24] ; r11 = a3+b3 = s3
198 | mov rcx, qword [rsi+32] ; rcx = a4
199 | adc rcx, qword [rdx+32] ; rcx = a4+b4 = s4
200 | mov qword [rdi], r8 ; save s0
201 | mov r8, qword [rsi+40] ; r8 = a5
202 | adc r8, qword [rdx+40] ; r8 = a5+b5 = s5
203 | mov qword [rdi+8], r9 ; save s1
204 | mov r9, qword [rsi+48] ; r9 = a6
205 | adc r9, qword [rdx+48] ; r9 = a6+b6 = s6
206 | mov rsi, qword [rsi+56] ; rsi = a7
207 | adc rsi, qword [rdx+56] ; rsi = a7+b7 = s7
208 | mov qword [rdi+16], r10 ; save s2
209 | mov qword [rdi+24], r11 ; save s3
210 | mov qword [rdi+32], rcx ; save s4
211 | mov qword [rdi+40], r8 ; save s5
212 | mov qword [rdi+48], r9 ; save s6
213 | mov qword [rdi+56], rsi ; save s7
214 | sbb rax, rax ; rax = carry
215 | jmp .FINAL
216 |
217 | ;********** lenSrcA == 8 END *********************************
218 |
219 |
220 | ;********** lenSrcA > 8 *************************************
221 |
222 | .ADD_GT8:
223 | mov r8, rax ; r8 = 0 (rax is still zero here)
224 | mov rax, rcx ; rax = len
225 | and rcx, 3 ;
226 | xor rcx, rax ; rcx = len with the low two bits cleared
227 | lea rsi, [rsi+8*rcx] ; point a, b and r just past the 4-limb-aligned part
228 | lea rdx, [rdx+8*rcx] ;
229 | lea rdi, [rdi+8*rcx] ;
230 | neg rcx ; negative limb index, counts up to zero
231 | add r8, r8 ; 0 + 0: clears CF before the first adc
232 | jmp .ADD_GLOOP
233 |
234 | align ARCH_ALIGN_FACTOR
235 | .ADD_GLOOP:
236 | mov r8, qword [rsi+8*rcx] ; r8 = a0
237 | mov r9, qword [rsi+8*rcx+8] ; r9 = a1
238 | mov r10, qword [rsi+8*rcx+16] ; r10 = a2
239 | mov r11, qword [rsi+8*rcx+24] ; r11 = a3
240 | adc r8, qword [rdx+8*rcx] ; r8 = a0+b0 = r0
241 | adc r9, qword [rdx+8*rcx+8] ; r9 = a1+b1 = r1
242 | adc r10, qword [rdx+8*rcx+16] ; r10 = a2+b2 = r2
243 | adc r11, qword [rdx+8*rcx+24] ; r11 = a3+b3 = r3
244 | mov qword [rdi+8*rcx], r8 ;
245 | mov qword [rdi+8*rcx+8], r9 ;
246 | mov qword [rdi+8*rcx+16], r10 ;
247 | mov qword [rdi+8*rcx+24], r11 ;
248 | lea rcx, [rcx+4] ; lea and jrcxz leave CF intact for the next iteration
249 | jrcxz .ADD_LLAST0
250 | jmp .ADD_GLOOP
251 |
252 | .ADD_LLAST0:
253 | sbb rcx, rcx ; rcx = -carry
254 | and rax, 3 ; rax = number of tail limbs (0..3)
255 | jz .FIN0
256 |
257 | .ADD_LLOOP:
258 | test rax, 2
259 | jz .ADD_LLAST1
260 |
261 | add rcx, rcx ; restore CF from the saved carry
262 | mov r8, qword [rsi] ; r8 = a0
263 | mov r9, qword [rsi+8] ; r9 = a1
264 | adc r8, qword [rdx] ; r8 = a0+b0 = r0
265 | adc r9, qword [rdx+8] ; r9 = a1+b1 = r1
266 | mov qword [rdi], r8 ;
267 | mov qword [rdi+8], r9 ;
268 | sbb rcx, rcx
269 | test rax, 1
270 | jz .FIN0
271 |
272 | add rsi, 16
273 | add rdx, 16
274 | add rdi, 16
275 |
276 | .ADD_LLAST1:
277 | add rcx, rcx ; restore CF from the saved carry
278 | mov r8, qword [rsi] ; r8 = a0
279 | adc r8, qword [rdx] ; r8 = a0+b0 = r0
280 | mov qword [rdi], r8 ;
281 | sbb rcx, rcx
282 |
283 | .FIN0:
284 | mov rax, rcx
285 |
286 | ;******************* .FINAL ***********************************************************
287 |
288 | .FINAL:
289 | neg rax ; -carry -> 0 or 1
290 | REST_XMM
291 | REST_GPR
292 | ret
293 | ENDFUNC mpn_add_vectorized
294 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_uinc_udec_m7as.asm:
--------------------------------------------------------------------------------
1 | %include "asmdefs.inc"
2 | %include "ia_32e.inc"
3 |
4 | %if (__ARCH32E >= __ARCH32E_M7)
5 |
6 | segment .text align=ARCH_ALIGN_FACTOR
7 |
8 | ;
9 | ; carry, r[:size] = a[:size] + w
10 | ; uint64_t mpn_inc_vectorized(uint64_t *r, const uint64_t *a, unsigned int size, uint64_t w)
11 | ;
12 | align ARCH_ALIGN_FACTOR
13 | IPPASM mpn_inc_vectorized,PUBLIC
14 | %assign LOCAL_FRAME 0
15 | USES_GPR rsi,rdi
16 | USES_XMM
17 | COMP_ABI 4
18 |
19 | ; rdi = r
20 | ; rsi = a
21 | ; rdx = size
22 | ; rcx = w
23 |
24 | mov edx, edx ; zero-extend the unsigned length (movsxd would sign-extend size >= 2^31)
25 |
26 | mov r8, qword [rsi] ; r[0] = a[0] + w
27 | add r8, rcx
28 | mov qword [rdi], r8
29 |
30 | lea rsi, [rsi+rdx*sizeof(qword)] ; point a and r past the end; rcx indexes backwards from there
31 | lea rdi, [rdi+rdx*sizeof(qword)]
32 | lea rcx, [rdx*sizeof(qword)]
33 |
34 | sbb rax, rax ; save cf
35 | neg rcx ; rcx = negative length (bytes)
36 | add rcx, sizeof(qword) ; skip limb 0, already written above
37 | jrcxz .exit
38 | add rax, rax ; restore cf
39 | jnc .copy ; no carry out of limb 0: the remaining limbs are only copied
40 |
41 | align ARCH_ALIGN_FACTOR
42 | .inc_loop:
43 | mov r8, qword [rsi+rcx]
44 | adc r8, 0
45 | mov qword [rdi+rcx], r8
46 | lea rcx, [rcx+sizeof(qword)]
47 | jrcxz .exit_loop
48 | jnc .exit_loop ; carry absorbed: stop propagating
49 | jmp .inc_loop
50 | .exit_loop:
51 | sbb rax, rax ; save cf
52 |
53 | .copy:
54 | cmp rsi, rdi ; in-place operation: nothing left to copy
55 | jz .exit
56 | jrcxz .exit
57 | .copy_loop:
58 | mov r8, qword [rsi+rcx]
59 | mov qword [rdi+rcx], r8
60 | add rcx, sizeof(qword)
61 | jnz .copy_loop
62 |
63 | .exit:
64 | neg rax ; -carry -> 0 or 1
65 | REST_XMM
66 | REST_GPR
67 | ret
68 | ENDFUNC mpn_inc_vectorized
69 |
70 |
71 | ;
72 | ; borrow, r[:size] = a[:size] - w
73 | ; uint64_t mpn_dec_vectorized(uint64_t *r, const uint64_t *a, unsigned int size, uint64_t w)
74 | ;
75 | ; Subtracts w from limb 0, propagates the borrow while it persists, then copies any untouched limbs when r != a.
76 | align ARCH_ALIGN_FACTOR
77 | IPPASM mpn_dec_vectorized,PUBLIC
78 | %assign LOCAL_FRAME 0
79 | USES_GPR rsi,rdi
80 | USES_XMM
81 | COMP_ABI 4
82 |
83 | ; rdi = r
84 | ; rsi = a
85 | ; rdx = size
86 | ; rcx = w
87 |
88 | mov edx, edx ; zero-extend the unsigned length (movsxd would sign-extend size >= 2^31)
89 |
90 | mov r8, qword [rsi] ; r[0] = a[0] - w
91 | sub r8, rcx
92 | mov qword [rdi], r8
93 |
94 | lea rsi, [rsi+rdx*sizeof(qword)] ; point a and r past the end; rcx indexes backwards from there
95 | lea rdi, [rdi+rdx*sizeof(qword)]
96 | lea rcx, [rdx*sizeof(qword)]
97 |
98 | sbb rax, rax ; save cf
99 | neg rcx ; rcx = negative length (bytes)
100 | add rcx, sizeof(qword) ; skip limb 0, already written above
101 | jrcxz .exit
102 | add rax, rax ; restore cf
103 | jnc .copy ; no borrow out of limb 0: the remaining limbs are only copied
104 |
105 | align ARCH_ALIGN_FACTOR
106 | .inc_loop:
107 | mov r8, qword [rsi+rcx]
108 | sbb r8, 0
109 | mov qword [rdi+rcx], r8
110 | lea rcx, [rcx+sizeof(qword)]
111 | jrcxz .exit_loop
112 | jnc .exit_loop ; borrow absorbed: stop propagating
113 | jmp .inc_loop
114 | .exit_loop:
115 | sbb rax, rax ; save cf
116 |
117 | .copy:
118 | cmp rsi, rdi ; in-place operation: nothing left to copy
119 | jz .exit
120 | jrcxz .exit
121 | .copy_loop:
122 | mov r8, qword [rsi+rcx]
123 | mov qword [rdi+rcx], r8
124 | add rcx, sizeof(qword)
125 | jnz .copy_loop
126 |
127 | .exit:
128 | neg rax ; -borrow -> 0 or 1
129 | REST_XMM
130 | REST_GPR
131 | ret
132 | ENDFUNC mpn_dec_vectorized
133 |
134 | %endif
135 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_acc_m7as.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; Big Number Operations
21 | ;
22 | ; Content:
23 | ; mpn_mul_acc()
24 | ;
25 | ;
26 | ;
27 | ;
28 | ;
29 |
30 | %include "asmdefs.inc"
31 | %include "ia_32e.inc"
32 | %include "ia_32e_regs.inc"
33 | %include "bn_umulschool.inc"
34 |
35 | %if (__ARCH32E >= __ARCH32E_M7)
36 |
37 | segment .text align=ARCH_ALIGN_FACTOR
38 | ; mpn_mul_acc: pDst[0..len-1] += pSrcA[0..len-1] * B; the carry-out limb is returned in rax.
39 | ; NOTE(review): a[0] is read before len is examined, so callers presumably guarantee len >= 1 - confirm.
40 | ;*************************************************************
41 | ; uint64_t mpn_mul_acc(uint64_t* pDst,
42 | ; const uint64_t* pSrcA,
43 | ; int len,
44 | ; uint64_t B )
45 | ;*************************************************************
46 | align ARCH_ALIGN_FACTOR
47 | IPPASM mpn_mul_acc,PUBLIC
48 | %assign LOCAL_FRAME 0
49 | USES_GPR rbx,rsi,rdi,r11,r12
50 | USES_XMM
51 | COMP_ABI 4
52 |
53 | ; rdi = pDst
54 | ; rsi = pSrc
55 | ; rdx = len
56 | ; rcx = B
57 |
58 | %xdefine B0 rcx ; b
59 |
60 | %xdefine T0 r8 ; temporary
61 | %xdefine T1 r9
62 | %xdefine T2 r10
63 | %xdefine T3 r11
64 |
65 | %xdefine idx rbx ; index
66 | %xdefine rDst rdi
67 | %xdefine rSrc rsi
68 |
69 | mov edx, edx ; unsigned length
70 |
71 | mov rax, qword [rsi] ; rax = a[0]
72 | cmp rdx, 1
73 | jnz .general_case
74 | ; len == 1: one multiply, one accumulate, return the high half plus carry
75 | mul rcx
76 | add qword [rdi], rax
77 | adc rdx, 0
78 | mov rax, rdx
79 | REST_XMM
80 | REST_GPR
81 | ret
82 |
83 | .general_case:
84 | lea rSrc, [rSrc+rdx*sizeof(qword)-sizeof(qword)*5] ; bias both pointers so that [ptr + idx*8] addresses limb 0 when idx = 5 - len
85 | lea rDst, [rDst+rdx*sizeof(qword)-sizeof(qword)*5]
86 | mov idx, dword 5
87 | sub idx, rdx ; negative counter -(len-5)
88 |
89 | mul rcx ; {T1:T0} = a[0]*B
90 | mov T0, rax
91 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)] ; preload a[1] for the next multiply
92 | mov T1, rdx
93 |
94 | cmp idx, 0 ; len <= 5: skip the 4-limb loop
95 | jge .skip_muladd_loop4
96 |
97 | align ARCH_ALIGN_FACTOR
98 | .muladd_loop4:
99 | mul rcx ; a[4*i+1]*B
100 | xor T2, T2
101 | add qword [rDst+idx*sizeof(qword)], T0
102 | adc T1, rax
103 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
104 | adc T2, rdx
105 |
106 | mul rcx ; a[4*i+2]*B
107 | xor T3, T3
108 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
109 | adc T2, rax
110 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
111 | adc T3, rdx
112 |
113 | mul rcx ; a[4*i+3]*B
114 | xor T0, T0
115 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
116 | adc T3, rax
117 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*4]
118 | adc T0, rdx
119 |
120 | mul rcx ; a[4*i+4]*B
121 | xor T1, T1
122 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
123 | adc T0, rax
124 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*5]
125 | adc T1, rdx
126 |
127 | add idx, 4 ; CF is set when the negative index wraps to 0..3, which ends the loop
128 | jnc .muladd_loop4
129 |
130 | .skip_muladd_loop4:
131 | mul rcx
132 | xor T2, T2
133 | add qword [rDst+idx*sizeof(qword)], T0
134 | adc T1, rax
135 | adc T2, rdx
136 | ; idx is 0..3 here. After "cmp idx, 2": ja -> 3, jz -> 2, jp -> 1 (low byte of idx-2 is 0FFh, even parity); 0 gives 0FEh (odd parity) and falls through
137 | cmp idx, 2
138 | ja .fin_mul1x4n_2 ; idx=3
139 | jz .fin_mul1x4n_3 ; idx=2
140 | jp .fin_mul1x4n_4 ; idx=1
141 | ; .fin_mul1x4n_1 ; idx=0
142 | ; idx = 0: three more multiplies remain
143 | .fin_mul1x4n_1:
144 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
145 | mul rcx
146 | xor T3, T3
147 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
148 | adc T2, rax
149 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
150 | adc T3, rdx
151 |
152 | mul rcx
153 | xor T0, T0
154 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
155 | adc T3, rax
156 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*4]
157 | adc T0, rdx
158 |
159 | mul rcx
160 | xor T1, T1
161 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
162 | adc T0, rax
163 | adc rdx, 0
164 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*4], T0
165 | adc rdx, 0
166 | mov rax, rdx ; return the carry-out limb
167 | jmp .exit
168 | ; idx = 1: two more multiplies remain
169 | .fin_mul1x4n_4:
170 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
171 | mul rcx
172 | xor T3, T3
173 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
174 | adc T2, rax
175 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*3]
176 | adc T3, rdx
177 |
178 | mul rcx
179 | xor T0, T0
180 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
181 | adc T3, rax
182 | adc rdx, 0
183 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*3], T3
184 | adc rdx, 0
185 | mov rax, rdx ; return the carry-out limb
186 | jmp .exit
187 | ; idx = 2: one more multiply remains
188 | .fin_mul1x4n_3:
189 | mov rax, qword [rSrc+idx*sizeof(qword)+sizeof(qword)*2]
190 | mul rcx
191 | xor T3, T3
192 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
193 | adc T2, rax
194 | adc rdx, 0
195 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)*2], T2
196 | adc rdx, 0
197 | mov rax, rdx ; return the carry-out limb
198 | jmp .exit
199 | ; idx = 3: no multiplies remain, only the pending T1 is accumulated
200 | .fin_mul1x4n_2:
201 | add qword [rDst+idx*sizeof(qword)+sizeof(qword)], T1
202 | adc T2, 0
203 | mov rax, T2 ; return the carry-out limb
204 |
205 | .exit:
206 | REST_XMM
207 | REST_GPR
208 | ret
209 | ENDFUNC mpn_mul_acc
210 |
211 | %endif
212 |
--------------------------------------------------------------------------------
/mpn/asm/intel64/mpi_umul_m7as.asm:
--------------------------------------------------------------------------------
1 | ;===============================================================================
2 | ; Copyright 2015-2020 Intel Corporation
3 | ;
4 | ; Licensed under the Apache License, Version 2.0 (the "License");
5 | ; you may not use this file except in compliance with the License.
6 | ; You may obtain a copy of the License at
7 | ;
8 | ; http://www.apache.org/licenses/LICENSE-2.0
9 | ;
10 | ; Unless required by applicable law or agreed to in writing, software
11 | ; distributed under the License is distributed on an "AS IS" BASIS,
12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | ; See the License for the specific language governing permissions and
14 | ; limitations under the License.
15 | ;===============================================================================
16 |
17 | ;
18 | ;
19 | ; Purpose: Cryptography Primitive.
20 | ; Big Number Operations 21 | ; 22 | ; Content: 23 | ; mpn_mul() 24 | ; 25 | ; 26 | 27 | %include "asmdefs.inc" 28 | %include "ia_32e.inc" 29 | %include "bn_umulschool.inc" 30 | %include "variant.inc" 31 | 32 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_OFF_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_) 33 | %if (__ARCH32E >= __ARCH32E_M7) && (__ARCH32E < __ARCH32E_L9) 34 | 35 | 36 | segment .text align=ARCH_ALIGN_FACTOR 37 | 38 | 39 | ;************************************************************* 40 | ;* uint64_t mpn_mul(uint64_t* pR, 41 | ;* const uint64_t* pA, int aSize, 42 | ;* const uint64_t* pB, int bSize) 43 | ;* returns the top limb of the product (pR[aSize+bSize-1] on the fixed-size paths) 44 | ;* 45 | ;************************************************************* 46 | align ARCH_ALIGN_FACTOR 47 | IPPASM mpn_mul,PUBLIC 48 | %assign LOCAL_FRAME (1*sizeof(qword)) 49 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 50 | USES_XMM 51 | COMP_ABI 5 52 | 53 | ; rdi = pDst 54 | ; rsi = pSrcA 55 | ; edx = lenA 56 | ; rcx = pSrcB 57 | ; r8d = lenB 58 | 59 | ;; 60 | ;; stack structure (LOCAL_FRAME is a single qword): 61 | ;; counterA = (0) 62 | ;; (the only local slot; see the %assign below) 63 | %assign counterA (0) 64 | 65 | 66 | cmp edx, r8d 67 | jl .general_case_mul_entry 68 | jg .general_case_mul 69 | %if (__ARCH32E < __ARCH32E_E9) 70 | cmp edx, 4 71 | %else 72 | cmp edx, 8 73 | %endif 74 | jg .general_case_mul 75 | 76 | %if (__ARCH32E >= __ARCH32E_E9) 77 | cmp edx, 4 78 | jg .more_then_4 79 | %endif 80 | 81 | cmp edx, 3 82 | ja .mul_4x4 83 | jz .mul_3x3 84 | jp .mul_2x2 ; edx=2: low byte of (edx-3) is 0FFh -> PF=1; edx=1: 0FEh -> PF=0, falls through 85 | ; mul_1x1 86 | 87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 88 | ;; 89 | ;; fixed-size multipliers (1-4) 90 | ;; 91 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 92 | align ARCH_ALIGN_FACTOR 93 | .mul_1x1: 94 | mov rax, qword [rsi] 95 | mul qword [rcx] 96 | mov qword [rdi], rax 97 | mov qword [rdi+sizeof(qword)], rdx 98 | mov rax, qword [rdi+sizeof(qword)*1] 99 | REST_XMM 100 | REST_GPR 101 | ret 102 | 103 | align ARCH_ALIGN_FACTOR 104 | .mul_2x2: 105 | mov r8, [rcx] 106 | mov r9, 
[rcx+sizeof(qword)*1] 107 | MUL_NxN 2, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 108 | mov rax, qword [rdi+sizeof(qword)*3] 109 | REST_XMM 110 | REST_GPR 111 | ret 112 | 113 | align ARCH_ALIGN_FACTOR 114 | .mul_3x3: 115 | mov r8, [rcx] 116 | mov r9, [rcx+sizeof(qword)*1] 117 | mov r10,[rcx+sizeof(qword)*2] 118 | MUL_NxN 3, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 119 | mov rax, qword [rdi+sizeof(qword)*5] 120 | REST_XMM 121 | REST_GPR 122 | ret 123 | 124 | align ARCH_ALIGN_FACTOR 125 | .mul_4x4: 126 | mov r8, [rcx] 127 | mov r9, [rcx+sizeof(qword)*1] 128 | mov r10,[rcx+sizeof(qword)*2] 129 | mov r11,[rcx+sizeof(qword)*3] 130 | MUL_NxN 4, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 131 | mov rax, qword [rdi+sizeof(qword)*7] 132 | REST_XMM 133 | REST_GPR 134 | ret 135 | 136 | %if (__ARCH32E >= __ARCH32E_E9) 137 | .more_then_4: 138 | cmp edx, 7 139 | ja .mul_8x8 140 | jz .mul_7x7 141 | jp .mul_6x6 142 | ; mul_5x5 143 | 144 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 145 | ;; 146 | ;; fixed-size multipliers (5-8) 147 | ;; 148 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 149 | align ARCH_ALIGN_FACTOR 150 | .mul_5x5: 151 | mov r8, [rcx] 152 | mov r9, [rcx+sizeof(qword)*1] 153 | mov r10,[rcx+sizeof(qword)*2] 154 | mov r11,[rcx+sizeof(qword)*3] 155 | mov r12,[rcx+sizeof(qword)*4] 156 | MUL_NxN 5, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 157 | mov rax, qword [rdi+sizeof(qword)*9] 158 | REST_XMM 159 | REST_GPR 160 | ret 161 | 162 | align ARCH_ALIGN_FACTOR 163 | .mul_6x6: 164 | mov r8, [rcx] 165 | mov r9, [rcx+sizeof(qword)*1] 166 | mov r10,[rcx+sizeof(qword)*2] 167 | mov r11,[rcx+sizeof(qword)*3] 168 | mov r12,[rcx+sizeof(qword)*4] 169 | mov r13,[rcx+sizeof(qword)*5] 170 | MUL_NxN 6, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 171 | mov rax, qword [rdi+sizeof(qword)*11] 172 | REST_XMM 173 | REST_GPR 174 | ret 175 | 176 | align ARCH_ALIGN_FACTOR 177 | .mul_7x7: 178 | mov r8, [rcx] 
179 | mov r9, [rcx+sizeof(qword)*1] 180 | mov r10,[rcx+sizeof(qword)*2] 181 | mov r11,[rcx+sizeof(qword)*3] 182 | mov r12,[rcx+sizeof(qword)*4] 183 | mov r13,[rcx+sizeof(qword)*5] 184 | mov r14,[rcx+sizeof(qword)*6] 185 | MUL_NxN 7, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 186 | mov rax, qword [rdi+sizeof(qword)*13] 187 | REST_XMM 188 | REST_GPR 189 | ret 190 | 191 | align ARCH_ALIGN_FACTOR 192 | .mul_8x8: 193 | mov r8, [rcx] 194 | mov r9, [rcx+sizeof(qword)*1] 195 | mov r10,[rcx+sizeof(qword)*2] 196 | mov r11,[rcx+sizeof(qword)*3] 197 | mov r12,[rcx+sizeof(qword)*4] 198 | mov r13,[rcx+sizeof(qword)*5] 199 | mov r14,[rcx+sizeof(qword)*6] 200 | mov r15,[rcx+sizeof(qword)*7] 201 | MUL_NxN 8, rdi, rsi, rcx, rbx, rbp, r15, r14, r13, r12, r11, r10, r9, r8 202 | mov rax, qword [rdi+sizeof(qword)*15] 203 | REST_XMM 204 | REST_GPR 205 | ret 206 | %endif 207 | 208 | 209 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 210 | ;; 211 | ;; general case multiplier 212 | ;; 213 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 214 | align ARCH_ALIGN_FACTOR 215 | .general_case_mul_entry: 216 | ; lenA < lenB: exchange the operands (xor-swap of the two pointers and the two lengths) 217 | xor rsi, rcx 218 | xor edx, r8d 219 | xor rcx, rsi 220 | xor r8d, edx 221 | xor rsi, rcx 222 | xor edx, r8d 223 | 224 | %xdefine B0 r10 ; b[i], b[i+1] 225 | %xdefine B1 r11 226 | 227 | %xdefine T0 r12 ; temporary 228 | %xdefine T1 r13 229 | %xdefine T2 r14 230 | %xdefine T3 r15 231 | 232 | %xdefine idx rbx ; index 233 | %xdefine rDst rdi 234 | %xdefine rSrc rsi 235 | 236 | align ARCH_ALIGN_FACTOR 237 | .general_case_mul: 238 | movsxd rdx, edx ; expand length 239 | movsxd r8, r8d 240 | 241 | lea rdi, [rdi+rdx*sizeof(qword)-sizeof(qword)*4] ; rdi = &R[lenA-4] 242 | lea rsi, [rsi+rdx*sizeof(qword)-sizeof(qword)*4] ; rsi = &A[lenA-4] 243 | 244 | mov idx, dword 4 ; negative 245 | sub idx, rdx ; A-counter 246 | mov qword [rsp+counterA], idx 247 | 248 | mov rax, qword [rsi+idx*sizeof(qword)] ; a[0] 249 | mov B0, qword [rcx] ; b[0] 
250 | test r8, 1 251 | jz .init_even_B 252 | 253 | ;********** lenSrcB = 2*n+ 1 (multiply only) ********************* 254 | .init_odd_B: 255 | xor T0, T0 256 | cmp idx, 0 257 | jge .skip_mul1 258 | 259 | MULx1 rdi, rsi, idx, B0, T0, T1, T2, T3 260 | 261 | .skip_mul1: 262 | cmp idx, 2 263 | ja .fin_mul1x4n_1 ; idx=3 264 | jz .fin_mul1x4n_2 ; idx=2 265 | jp .fin_mul1x4n_3 ; idx=1 266 | ; fin_mul1x4n_4 ; idx=0 267 | 268 | .fin_mul1x4n_4: 269 | MULx1_4N_4_ELOG rdi, rsi, B0, T0,T1,T2,T3 270 | add rcx, sizeof(qword) 271 | add r8, 1 272 | jmp .mla2x4n_4 273 | .fin_mul1x4n_3: 274 | MULx1_4N_3_ELOG rdi, rsi, B0, T0,T1,T2,T3 275 | add rcx, sizeof(qword) 276 | add r8, 1 277 | jmp .mla2x4n_3 278 | .fin_mul1x4n_2: 279 | MULx1_4N_2_ELOG rdi, rsi, B0, T0,T1,T2,T3 280 | add rcx, sizeof(qword) 281 | add r8, 1 282 | jmp .mla2x4n_2 283 | .fin_mul1x4n_1: 284 | MULx1_4N_1_ELOG rdi, rsi, B0, T0,T1,T2,T3 285 | add rcx, sizeof(qword) 286 | add r8, 1 287 | jmp .mla2x4n_1 288 | 289 | 290 | ;********** lenSrcB = 2*n (multiply only) ************************ 291 | .init_even_B: 292 | mov rbp, rax 293 | mul B0 ; {T2:T1:T0} = a[0]*B0 294 | mov B1, qword [rcx+sizeof(qword)] 295 | xor T2, T2 296 | mov T0, rax 297 | mov rax, rbp ; restore a[0] 298 | mov T1, rdx 299 | 300 | cmp idx, 0 301 | jge .skip_mul_nx2 302 | 303 | MULx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3 304 | 305 | .skip_mul_nx2: 306 | cmp idx, 2 307 | ja .fin_mul2x4n_1 ; idx=3 308 | jz .fin_mul2x4n_2 ; idx=2 309 | jp .fin_mul2x4n_3 ; idx=1 310 | ; fin_mul2x4n_4 ; idx=0 311 | 312 | .fin_mul2x4n_4: 313 | MULx2_4N_4_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 314 | add rcx, sizeof(qword)*2 315 | align ARCH_ALIGN_FACTOR 316 | .mla2x4n_4: 317 | sub r8, 2 318 | jz .quit 319 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3 320 | cmp idx, 0 321 | jz .skip_mla_x2 322 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3 323 | .skip_mla_x2: 324 | MLAx2_4N_4_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 325 | add rcx, sizeof(qword)*2 326 | jmp .mla2x4n_4 327 | 328 | .fin_mul2x4n_3: 329 | 
MULx2_4N_3_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 330 | add rcx, sizeof(qword)*2 331 | align ARCH_ALIGN_FACTOR 332 | .mla2x4n_3: 333 | sub r8, 2 334 | jz .quit 335 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3 336 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3 337 | MLAx2_4N_3_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 338 | add rcx, sizeof(qword)*2 339 | jmp .mla2x4n_3 340 | 341 | .fin_mul2x4n_2: 342 | MULx2_4N_2_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 343 | add rcx, sizeof(qword)*2 344 | align ARCH_ALIGN_FACTOR 345 | .mla2x4n_2: 346 | sub r8, 2 347 | jz .quit 348 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3 349 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3 350 | MLAx2_4N_2_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 351 | add rcx, sizeof(qword)*2 352 | jmp .mla2x4n_2 353 | 354 | .fin_mul2x4n_1: 355 | MULx2_4N_1_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 356 | add rcx, sizeof(qword)*2 357 | align ARCH_ALIGN_FACTOR 358 | .mla2x4n_1: 359 | sub r8, 2 360 | jz .quit 361 | MLAx2_PLOG B0,B1, rcx, T0,T1,T2,T3 362 | MLAx2 rdi, rsi, idx, B0,B1, T0,T1,T2,T3 363 | MLAx2_4N_1_ELOG rdi, rsi, B0,B1, T0,T1,T2,T3 364 | add rcx, sizeof(qword)*2 365 | jmp .mla2x4n_1 366 | 367 | .quit: 368 | mov rax, rdx 369 | 370 | REST_XMM 371 | REST_GPR 372 | ret 373 | ENDFUNC mpn_mul 374 | 375 | %endif 376 | 377 | %endif ;; _ADCOX_NI_ENABLING_ 378 | -------------------------------------------------------------------------------- /mpn/asm/intel64/mpi_umul_usqr_redc_srvl9.asm: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2015-2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 
6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 15 | ;=============================================================================== 16 | 17 | ; 18 | ; 19 | ; Purpose: Cryptography Primitive. 20 | ; Big Number Multiplicative Operations 21 | ; 22 | ; Content: 23 | ; mpn_mul() 24 | ; mpn_sqr() 25 | ; mpn_montgomery_reduce_bin() 26 | ; 27 | ; Implementation is using mulx and adcx/adox instructions 28 | ; 29 | ; 30 | 31 | %include "asmdefs.inc" 32 | %include "ia_32e.inc" 33 | %include "variant.inc" 34 | 35 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_ON_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_) 36 | %if (__ARCH32E >= __ARCH32E_L9) 37 | 38 | %assign _xEMULATION_ 1 39 | 40 | segment .text align=ARCH_ALIGN_FACTOR 41 | 42 | 43 | %include "bn_umul.inc" 44 | %include "bn_usqr.inc" 45 | %include "mred.inc" 46 | 47 | ;************************************************************* 48 | ;* uint64_t mpn_mul(uint64_t* pR, 49 | ;* const uint64_t* pA, int aSize, 50 | ;* const uint64_t* pB, int bSize) 51 | ;* 52 | ;************************************************************* 53 | align ARCH_ALIGN_FACTOR 54 | IPPASM mpn_mul,PUBLIC 55 | %assign LOCAL_FRAME 0 56 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 57 | USES_XMM 58 | COMP_ABI 5 59 | 60 | ; rdi = pR 61 | ; rsi = pA 62 | ; edx = nsA 63 | ; rcx = pB 64 | ; r8d = nsB 65 | 66 | movsxd rdx, edx ; expand length 67 | movsxd rbx, r8d 68 | 69 | xor r8, r8 ; clear scratch 70 | xor r9, r9 71 | xor r10, r10 72 | xor r11, r11 73 | xor r12, r12 74 | xor r13, r13 75 | xor r14, r14 76 | xor r15, r15 77 | 78 | cmp rdx, rbx 79 | jl .swap_operans ; nsA < 
nsB 80 | jg .test_8N_case ; test if nsA=8*N and nsB=8*M 81 | 82 | cmp rdx, 16 83 | jg .test_8N_case 84 | 85 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 86 | ;; short nsA==nsB (1,..,16) 87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 88 | cmp rdx, 4 89 | jg .more_then_4 90 | 91 | cmp edx, 3 92 | ja .mul_4_4 93 | jz .mul_3_3 94 | jp .mul_2_2 95 | ; mul_1_1 96 | 97 | .mul_1_1: 98 | MUL_NxN 1, rdi, rsi, rcx, rbx,rbp, r8 99 | jmp .quit 100 | .mul_2_2: 101 | MUL_NxN 2, rdi, rsi, rcx, rbx,rbp, r8,r9 102 | jmp .quit 103 | .mul_3_3: 104 | MUL_NxN 3, rdi, rsi, rcx, rbx,rbp, r8,r9,r10 105 | jmp .quit 106 | .mul_4_4: 107 | MUL_NxN 4, rdi, rsi, rcx, rbx,rbp, r8,r9,r10,r11 108 | jmp .quit 109 | 110 | .more_then_4: 111 | GET_EP rax, mul_lxl_basic, rdx, rbp 112 | call rax 113 | jmp .quit 114 | 115 | .swap_operans: 116 | SWAP rsi, rcx ; swap operands 117 | SWAP rdx, rbx 118 | 119 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 120 | ;; 8*N x 8*M case multiplier 121 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 122 | .test_8N_case: 123 | mov rax, rdx 124 | or rax, rbx 125 | and rax, 7 126 | jnz .general_mul 127 | 128 | CALL_FUNC mul_8Nx8M 129 | jmp .quit 130 | 131 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 132 | ;; general case multiplier 133 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 134 | .general_mul: 135 | CALL_FUNC mul_NxM 136 | jmp .quit 137 | 138 | .quit: 139 | REST_XMM 140 | REST_GPR 141 | ret 142 | ENDFUNC mpn_mul 143 | 144 | ;************************************************************* 145 | ;* 146 | ;* uint64_t mpn_sqr(uint64_t* pR, 147 | ;* const uint64_t* pA, int aSize) 148 | ;* 149 | ;************************************************************* 150 | align ARCH_ALIGN_FACTOR 151 | IPPASM mpn_sqr,PUBLIC 152 | %assign LOCAL_FRAME 0 153 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 154 | USES_XMM 155 | COMP_ABI 3 156 | 157 | movsxd rdx, edx ; expand length 158 | 159 | xor r8, r8 ; clear scratch 160 | xor r9, r9 161 | xor r10, r10 162 | xor r11, r11 163 | xor r12, r12 164 | xor r13, r13 
165 | xor r14, r14 166 | xor r15, r15 167 | 168 | cmp rdx, 16 169 | jg .test_8N_case 170 | 171 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 172 | ;; short nsA (1,..,16) 173 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 174 | GET_EP rax, sqr_l_basic, rdx, rbp 175 | call rax 176 | jmp .quit 177 | 178 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 179 | ;; 8N case squarer 180 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 181 | .test_8N_case: 182 | test rdx, 7 183 | jnz .general_sqr 184 | 185 | CALL_FUNC sqr_8N 186 | jmp .quit 187 | 188 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 189 | ;; general case squarer 190 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 191 | .general_sqr: 192 | CALL_FUNC sqr_N 193 | 194 | .quit: 195 | REST_XMM 196 | REST_GPR 197 | ret 198 | ENDFUNC mpn_sqr 199 | 200 | ;************************************************************* 201 | ;* 202 | ;* uint64_t mpn_montgomery_reduce_bin(uint64_t* pR, 203 | ;* uint64_t* pProduct, 204 | ;* const uint64_t* pModulus, int mSize, 205 | ;* uint64_t m) 206 | ;************************************************************* 207 | align ARCH_ALIGN_FACTOR 208 | IPPASM mpn_montgomery_reduce_bin,PUBLIC 209 | %assign LOCAL_FRAME (0) 210 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 211 | USES_XMM 212 | COMP_ABI 5 213 | ;pR (rdi) address of the reduction 214 | ;pProduct (rsi) address of the temporary product 215 | ;pModulus (rdx) address of the modulus 216 | ;mSize (rcx) size of the modulus 217 | ;m0 (r8) montgomery helper (m') 218 | 219 | mov r15, rdi ; store reduction address 220 | 221 | ; reload parameters for future convenience: 222 | mov rdi, rsi ; rdi = temporary product buffer 223 | mov rsi, rdx ; rsi = modulus 224 | movsxd rdx, ecx ; rdx = length of modulus 225 | 226 | cmp rdx, 16 227 | ja .test_8N_case ; length of modulus >16 228 | 229 | cmp rdx, 4 230 | ja .above4 ; length of modulus 5,..,16 231 | 232 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 233 | ;; short modulus (1,..,4) 234 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 235 | 
cmp rdx, 3 236 | ja .red_4 237 | jz .red_3 238 | jp .red_2 239 | ; red_1 240 | 241 | .red_1: 242 | mov r9, qword [rdi+sizeof(qword)*0] 243 | MRED_FIX 1, r15, rdi, rsi, r8, rbp,rbx, r9 244 | jmp .quit 245 | 246 | .red_2: 247 | mov r9, qword [rdi+sizeof(qword)*0] 248 | mov r10, qword [rdi+sizeof(qword)*1] 249 | MRED_FIX 2, r15, rdi, rsi, r8, rbp,rbx, r9,r10 250 | jmp .quit 251 | 252 | .red_3: 253 | mov r9, qword [rdi+sizeof(qword)*0] 254 | mov r10, qword [rdi+sizeof(qword)*1] 255 | mov r11, qword [rdi+sizeof(qword)*2] 256 | MRED_FIX 3, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11 257 | jmp .quit 258 | 259 | .red_4: 260 | mov r9, qword [rdi+sizeof(qword)*0] 261 | mov r10, qword [rdi+sizeof(qword)*1] 262 | mov r11, qword [rdi+sizeof(qword)*2] 263 | mov r12, qword [rdi+sizeof(qword)*3] 264 | MRED_FIX 4, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11,r12 265 | jmp .quit 266 | 267 | 268 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 269 | ;; short modulus (5,..,16) 270 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 271 | .above4: 272 | mov rbp, rdx 273 | sub rbp, 4 274 | GET_EP rax, mred_short, rbp ; mred procedure 275 | 276 | call rax 277 | jmp .quit 278 | 279 | 280 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 281 | ;; 8N case modulus 282 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 283 | .test_8N_case: 284 | test rdx, 7 285 | jnz .general_case 286 | 287 | CALL_FUNC mred_8N 288 | jmp .quit 289 | 290 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 291 | ;; 292 | ;; general case modulus 293 | ;; 294 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 295 | .general_case: 296 | CALL_FUNC mred_N 297 | 298 | .quit: 299 | REST_XMM 300 | REST_GPR 301 | ret 302 | ENDFUNC mpn_montgomery_reduce_bin 303 | 304 | %endif 305 | 306 | %endif ;; _ADCOX_NI_ENABLING_ 307 | -------------------------------------------------------------------------------- /mpn/asm/intel64/mpi_umul_usqr_redc_srvl9pp.asm: -------------------------------------------------------------------------------- 1 | 
;=============================================================================== 2 | ; Copyright 2015-2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 15 | ;=============================================================================== 16 | 17 | ; 18 | ; 19 | ; Purpose: Cryptography Primitive. 20 | ; Big Number Multiplicative Operations 21 | ; 22 | ; Content: 23 | ; mpi_umul_bin_adx() 24 | ; mpi_usqr_bin_adx() 25 | ; mpi_montgomery_reduce_bin_adx() 26 | ; 27 | ; Implementation is using mulx and adcx/adox instructions 28 | ; 29 | ; 30 | 31 | %include "asmdefs.inc" 32 | %include "ia_32e.inc" 33 | %include "variant.inc" 34 | 35 | %if (_ADCOX_NI_ENABLING_ == _FEATURE_ON_) || (_ADCOX_NI_ENABLING_ == _FEATURE_TICKTOCK_) 36 | %if (__ARCH32E >= __ARCH32E_L9) 37 | 38 | %assign _xEMULATION_ 1 39 | %assign _ADCX_ADOX_ 1 40 | 41 | segment .text align=ARCH_ALIGN_FACTOR 42 | 43 | %include "bn_umulpp.inc" 44 | %include "bn_usqrpp.inc" 45 | %include "mred_pp.inc" 46 | 47 | ;************************************************************* 48 | ;* uint64_t mpi_umul_bin_adx(uint64_t* pR, 49 | ;* const uint64_t* pA, int aSize, 50 | ;* const uint64_t* pB, int bSize) 51 | ;************************************************************* 52 | align ARCH_ALIGN_FACTOR 53 | IPPASM mpi_umul_bin_adx,PUBLIC 54 | %assign LOCAL_FRAME 0 55 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 56 | USES_XMM 57 | COMP_ABI 5 58 | 59 | ; rdi = pR 60 | ; rsi 
= pA 61 | ; edx = nsA 62 | ; rcx = pB 63 | ; r8d = nsB 64 | 65 | movsxd rdx, edx ; expand length 66 | movsxd rbx, r8d 67 | 68 | xor r8, r8 ; clear scratch 69 | xor r9, r9 70 | xor r10, r10 71 | xor r11, r11 72 | xor r12, r12 73 | xor r13, r13 74 | xor r14, r14 75 | xor r15, r15 76 | 77 | cmp rdx, rbx 78 | jl .swap_operans ; nsA < nsB 79 | jg .test_8N_case ; test if nsA=8*N and nsB=8*M 80 | 81 | cmp rdx, 16 82 | jg .test_8N_case 83 | 84 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 85 | ;; short nsA==nsB (1,..,16) 86 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 87 | cmp rdx, 4 88 | jg .more_then_4 89 | 90 | cmp edx, 3 91 | ja .mul_4_4 92 | jz .mul_3_3 93 | jp .mul_2_2 94 | ; mul_1_1 95 | 96 | .mul_1_1: 97 | MUL_NxN 1, rdi, rsi, rcx, rbx, rbp, r8 98 | jmp .quit 99 | .mul_2_2: 100 | MUL_NxN 2, rdi, rsi, rcx, rbx, rbp, r8, r9 101 | jmp .quit 102 | .mul_3_3: 103 | MUL_NxN 3, rdi, rsi, rcx, rbx, rbp, r8, r9, r10 104 | jmp .quit 105 | .mul_4_4: 106 | MUL_NxN 4, rdi, rsi, rcx, rbx, rbp, r8, r9, r10, r11 107 | jmp .quit 108 | 109 | .more_then_4: 110 | GET_EP rax, mul_lxl_basic, rdx, rbp 111 | call rax 112 | jmp .quit 113 | 114 | .swap_operans: 115 | SWAP rsi, rcx ; swap operands 116 | SWAP rdx, rbx 117 | 118 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 119 | ;; 8*N x 8*M case multiplier 120 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 121 | .test_8N_case: 122 | mov rax, rdx 123 | or rax, rbx 124 | and rax, 7 125 | jnz .general_mul 126 | 127 | CALL_FUNC mul_8Nx8M_adcox 128 | jmp .quit 129 | 130 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 131 | ;; general case multiplier 132 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 133 | .general_mul: 134 | CALL_FUNC mul_NxM_adcox 135 | jmp .quit 136 | 137 | .quit: 138 | REST_XMM 139 | REST_GPR 140 | ret 141 | ENDFUNC mpi_umul_bin_adx 142 | 143 | ;************************************************************* 144 | ;* 145 | ;* uint64_t mpi_usqr_bin_adx(uint64_t* pR, 146 | ;* const uint64_t* pA, int aSize) 147 | ;* 148 | 
;************************************************************* 149 | align ARCH_ALIGN_FACTOR 150 | IPPASM mpi_usqr_bin_adx,PUBLIC 151 | %assign LOCAL_FRAME 0 152 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 153 | USES_XMM 154 | COMP_ABI 3 155 | 156 | movsxd rdx, edx ; expand length 157 | 158 | xor r8, r8 ; clear scratch 159 | xor r9, r9 160 | xor r10, r10 161 | xor r11, r11 162 | xor r12, r12 163 | xor r13, r13 164 | xor r14, r14 165 | xor r15, r15 166 | 167 | cmp rdx, 16 168 | jg .test_8N_case 169 | 170 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 171 | ;; short nsA (1,..,16) 172 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 173 | GET_EP rax, sqr_l_basic, rdx, rbp 174 | call rax 175 | jmp .quit 176 | 177 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 178 | ;; 8N case squarer 179 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 180 | .test_8N_case: 181 | test rdx, 7 182 | jnz .general_sqr 183 | 184 | CALL_FUNC sqr_8N_adcox 185 | jmp .quit 186 | 187 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 188 | ;; general case squarer 189 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 190 | .general_sqr: 191 | CALL_FUNC sqr_N_adcox 192 | 193 | .quit: 194 | REST_XMM 195 | REST_GPR 196 | ret 197 | ENDFUNC mpi_usqr_bin_adx 198 | 199 | ;************************************************************* 200 | ;* 201 | ;* uint64_t mpi_montgomery_reduce_bin_adx(uint64_t* pR, 202 | ;* uint64_t* pProduct, 203 | ;* const uint64_t* pModulus, int mSize, 204 | ;* uint64_t m) 205 | ;************************************************************* 206 | align ARCH_ALIGN_FACTOR 207 | IPPASM mpi_montgomery_reduce_bin_adx,PUBLIC 208 | %assign LOCAL_FRAME (0) 209 | USES_GPR rbx,rbp,rsi,rdi,r12,r13,r14,r15 210 | USES_XMM 211 | COMP_ABI 5 212 | ;pR (rdi) address of the reduction 213 | ;pProduct (rsi) address of the temporary product 214 | ;pModulus (rdx) address of the modulus 215 | ;mSize (rcx) size of the modulus 216 | ;m0 (r8) montgomery helper (m') 217 | 218 | mov r15, rdi ; store reduction address 219 | 220 | ; reload 
parameters for future convenience: 221 | mov rdi, rsi ; rdi = temporary product buffer 222 | mov rsi, rdx ; rsi = modulus 223 | movsxd rdx, ecx ; rdx = length of modulus 224 | 225 | cmp rdx, 16 226 | ja .test_8N_case ; length of modulus >16 227 | 228 | cmp rdx, 4 229 | ja .above4 ; length of modulus 5,..,16 230 | 231 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 232 | ;; short modulus (1,..,4) 233 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 234 | cmp rdx, 3 235 | ja .red_4 236 | jz .red_3 237 | jp .red_2 238 | ; red_1 239 | 240 | .red_1: 241 | mov r9, qword [rdi+sizeof(qword)*0] 242 | MRED_FIX 1, r15, rdi, rsi, r8, rbp,rbx, r9 243 | jmp .quit 244 | 245 | .red_2: 246 | mov r9, qword [rdi+sizeof(qword)*0] 247 | mov r10, qword [rdi+sizeof(qword)*1] 248 | MRED_FIX 2, r15, rdi, rsi, r8, rbp,rbx, r9,r10 249 | jmp .quit 250 | 251 | .red_3: 252 | mov r9, qword [rdi+sizeof(qword)*0] 253 | mov r10, qword [rdi+sizeof(qword)*1] 254 | mov r11, qword [rdi+sizeof(qword)*2] 255 | MRED_FIX 3, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11 256 | jmp .quit 257 | 258 | .red_4: 259 | mov r9, qword [rdi+sizeof(qword)*0] 260 | mov r10, qword [rdi+sizeof(qword)*1] 261 | mov r11, qword [rdi+sizeof(qword)*2] 262 | mov r12, qword [rdi+sizeof(qword)*3] 263 | MRED_FIX 4, r15, rdi, rsi, r8, rbp,rbx, r9,r10,r11,r12 264 | jmp .quit 265 | 266 | 267 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 268 | ;; short modulus (5,..,16) 269 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 270 | .above4: 271 | mov rbp, rdx 272 | sub rbp, 4 273 | GET_EP rax, mred_short, rbp ; mred procedure 274 | 275 | call rax 276 | jmp .quit 277 | 278 | 279 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 280 | ;; 8N case modulus 281 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 282 | .test_8N_case: 283 | test rdx, 7 284 | jnz .general_case 285 | 286 | CALL_FUNC mred_8N_adcox 287 | jmp .quit 288 | 289 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 290 | ;; 291 | ;; general case modulus 292 | ;; 293 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 294 | 
.general_case: 295 | CALL_FUNC mred_N_adcox 296 | 297 | .quit: 298 | REST_XMM 299 | REST_GPR 300 | ret 301 | ENDFUNC mpi_montgomery_reduce_bin_adx 302 | 303 | %endif 304 | 305 | %endif ;; _ADCOX_NI_ENABLING_ 306 | -------------------------------------------------------------------------------- /mpn/asm/intel64/mulx.inc: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2013-2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 15 | ;=============================================================================== 16 | 17 | ; 18 | ; 19 | ; Purpose: EM64T Cryptography Primitive. 
20 | ; Emulation of Intel(R) instructions MULX, ADCX, ADOX (for debug only) 21 | ; 22 | ; 23 | %ifndef _PCPMULX_INC_ 24 | %assign _PCPMULX_INC_ 1 25 | 26 | %ifndef _EMULATION_ 27 | %macro gsmulx 3.nolist 28 | %xdefine %%resH %1 29 | %xdefine %%resL %2 30 | %xdefine %%src %3 31 | 32 | mulx %%resH,%%resL,%%src 33 | %endmacro 34 | 35 | %endif 36 | 37 | %ifdef _EMULATION_ 38 | %macro gsmulx 3.nolist 39 | %xdefine %%resH %1 40 | %xdefine %%resL %2 41 | %xdefine %%src %3 42 | 43 | pushf ;; store flags (MULX itself leaves flags untouched) 44 | 45 | sub rsp, sizeof(qword)*4 ;; 4-qword scratch frame: [rsp+0 .. rsp+31] 46 | mov [rsp+sizeof(qword)*3], rax ;; store RAX (inside the frame, not below rsp) 47 | mov [rsp+sizeof(qword)*2], rdx ;; store RDX 48 | mov rax,rdx 49 | mov rdx, %%src 50 | 51 | mul rdx 52 | 53 | mov [rsp+sizeof(qword)*1], rax ;; store Low product 54 | mov [rsp+sizeof(qword)*0], rdx ;; store High product 55 | 56 | mov rax, [rsp+sizeof(qword)*3] ;; re-store RAX 57 | mov rdx, [rsp+sizeof(qword)*2] ;; re-store RDX 58 | mov %%resL, [rsp+sizeof(qword)*1];; load Low product 59 | mov %%resH, [rsp+sizeof(qword)*0];; load High product 60 | add rsp, sizeof(qword)*4 61 | 62 | popf ;; re-store flags 63 | %endmacro 64 | 65 | %endif 66 | 67 | %ifndef _EMULATION_ 68 | %macro gsadcx 2.nolist 69 | %xdefine %%rdst %1 70 | %xdefine %%rsrc %2 71 | 72 | adcx %%rdst, %%rsrc 73 | %endmacro 74 | 75 | %endif 76 | 77 | %ifdef _EMULATION_ 78 | %macro gsadcx 2.nolist 79 | %xdefine %%rdst %1 80 | %xdefine %%src %2 81 | 82 | push %%rdst ;; slot for result 83 | push rax ;; save rax 84 | pushfq ;; flags before adc 85 | 86 | adc %%rdst, %%src 87 | mov [rsp+2*sizeof(qword)], %%rdst 88 | 89 | pushfq ;; rax = flags after operation 90 | pop rax 91 | and rax, 1 ;; cf after operation 92 | and qword [rsp], (-2) ;; clear cf before operation 93 | or [rsp], rax ;; new psw 94 | popfq 95 | 96 | pop rax 97 | pop %%rdst 98 | %endmacro 99 | 100 | %endif 101 | 102 | %ifndef _EMULATION_ 103 | %macro gsadox 2.nolist 104 | %xdefine %%rdst %1 105 | %xdefine %%rsrc %2 106 | 107 | adox %%rdst, %%rsrc 108 | %endmacro 109 | 110 
| %endif 111 | 112 | %ifdef _EMULATION_ 113 | %macro gsadox 2.nolist 114 | %xdefine %%rdst %1 115 | %xdefine %%src %2 116 | 117 | push %%rdst ;; slot for result 118 | push rax ;; save rax 119 | 120 | pushfq ;; rax = flags before adc 121 | mov rax, [rsp] 122 | and rax, 800h ;; of 123 | xor [rsp], rax ;; clear of 124 | 125 | shr rax, 11 ;; mov of to cf position 126 | push rax ;; new psw 127 | popfq 128 | 129 | %ifidni %%src,rax 130 | mov rax, [rsp+sizeof(qword)] ;; src is rax: reload its saved value (rax was used as scratch above) 131 | %endif 132 | %ifidni %%rdst,rax 133 | mov %%rdst, [rsp+2*sizeof(qword)] ;; dst is rax: reload its saved value 134 | %endif 135 | 136 | adc %%rdst, %%src 137 | mov [rsp+2*sizeof(qword)], %%rdst 138 | 139 | pushfq ;; rax = flags after operation 140 | pop rax 141 | and rax, 1 ;; cf after operation 142 | 143 | shl rax, 11 ;; mov cf into of position 144 | or [rsp], rax ;; new psw 145 | popfq 146 | 147 | pop rax 148 | pop %%rdst 149 | %endmacro 150 | 151 | %endif 152 | 153 | %endif ;; _PCPMULX_INC_ 154 | -------------------------------------------------------------------------------- /mpn/asm/intel64/os.inc: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 
15 | ;=============================================================================== 16 | 17 | %ifndef OS_ASM_FILE 18 | %define OS_ASM_FILE 19 | 20 | %ifndef WIN_ABI 21 | %ifidn __OUTPUT_FORMAT__, win64 22 | %define WIN_ABI 23 | %endif 24 | %endif 25 | 26 | %ifndef LINUX 27 | %ifidn __OUTPUT_FORMAT__, elf64 28 | %define LINUX 29 | %endif 30 | %endif 31 | 32 | ;; code is the same for linux and macos 33 | %ifndef LINUX 34 | %ifidn __OUTPUT_FORMAT__, macho64 35 | %define LINUX 36 | %endif 37 | %endif 38 | 39 | %endif ; OS_ASM_FILE 40 | -------------------------------------------------------------------------------- /mpn/asm/intel64/reg_sizes.inc: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 
15 | ;=============================================================================== 16 | 17 | ; define d and w variants for registers 18 | 19 | %ifndef _REG_SIZES_ASM_ 20 | %define _REG_SIZES_ASM_ 21 | 22 | %define raxd eax 23 | %define raxw ax 24 | %define raxb al 25 | 26 | %define rbxd ebx 27 | %define rbxw bx 28 | %define rbxb bl 29 | 30 | %define rcxd ecx 31 | %define rcxw cx 32 | %define rcxb cl 33 | 34 | %define rdxd edx 35 | %define rdxw dx 36 | %define rdxb dl 37 | 38 | %define rsid esi 39 | %define rsiw si 40 | %define rsib sil 41 | 42 | %define rdid edi 43 | %define rdiw di 44 | %define rdib dil 45 | 46 | %define rbpd ebp 47 | %define rbpw bp 48 | %define rbpb bpl 49 | 50 | %define zmm0x xmm0 51 | %define zmm1x xmm1 52 | %define zmm2x xmm2 53 | %define zmm3x xmm3 54 | %define zmm4x xmm4 55 | %define zmm5x xmm5 56 | %define zmm6x xmm6 57 | %define zmm7x xmm7 58 | %define zmm8x xmm8 59 | %define zmm9x xmm9 60 | %define zmm10x xmm10 61 | %define zmm11x xmm11 62 | %define zmm12x xmm12 63 | %define zmm13x xmm13 64 | %define zmm14x xmm14 65 | %define zmm15x xmm15 66 | %define zmm16x xmm16 67 | %define zmm17x xmm17 68 | %define zmm18x xmm18 69 | %define zmm19x xmm19 70 | %define zmm20x xmm20 71 | %define zmm21x xmm21 72 | %define zmm22x xmm22 73 | %define zmm23x xmm23 74 | %define zmm24x xmm24 75 | %define zmm25x xmm25 76 | %define zmm26x xmm26 77 | %define zmm27x xmm27 78 | %define zmm28x xmm28 79 | %define zmm29x xmm29 80 | %define zmm30x xmm30 81 | %define zmm31x xmm31 82 | 83 | %define ymm0x xmm0 84 | %define ymm1x xmm1 85 | %define ymm2x xmm2 86 | %define ymm3x xmm3 87 | %define ymm4x xmm4 88 | %define ymm5x xmm5 89 | %define ymm6x xmm6 90 | %define ymm7x xmm7 91 | %define ymm8x xmm8 92 | %define ymm9x xmm9 93 | %define ymm10x xmm10 94 | %define ymm11x xmm11 95 | %define ymm12x xmm12 96 | %define ymm13x xmm13 97 | %define ymm14x xmm14 98 | %define ymm15x xmm15 99 | %define ymm16x xmm16 100 | %define ymm17x xmm17 101 | %define ymm18x xmm18 102 | %define 
ymm19x xmm19 103 | %define ymm20x xmm20 104 | %define ymm21x xmm21 105 | %define ymm22x xmm22 106 | %define ymm23x xmm23 107 | %define ymm24x xmm24 108 | %define ymm25x xmm25 109 | %define ymm26x xmm26 110 | %define ymm27x xmm27 111 | %define ymm28x xmm28 112 | %define ymm29x xmm29 113 | %define ymm30x xmm30 114 | %define ymm31x xmm31 115 | 116 | %define xmm0x xmm0 117 | %define xmm1x xmm1 118 | %define xmm2x xmm2 119 | %define xmm3x xmm3 120 | %define xmm4x xmm4 121 | %define xmm5x xmm5 122 | %define xmm6x xmm6 123 | %define xmm7x xmm7 124 | %define xmm8x xmm8 125 | %define xmm9x xmm9 126 | %define xmm10x xmm10 127 | %define xmm11x xmm11 128 | %define xmm12x xmm12 129 | %define xmm13x xmm13 130 | %define xmm14x xmm14 131 | %define xmm15x xmm15 132 | %define xmm16x xmm16 133 | %define xmm17x xmm17 134 | %define xmm18x xmm18 135 | %define xmm19x xmm19 136 | %define xmm20x xmm20 137 | %define xmm21x xmm21 138 | %define xmm22x xmm22 139 | %define xmm23x xmm23 140 | %define xmm24x xmm24 141 | %define xmm25x xmm25 142 | %define xmm26x xmm26 143 | %define xmm27x xmm27 144 | %define xmm28x xmm28 145 | %define xmm29x xmm29 146 | %define xmm30x xmm30 147 | %define xmm31x xmm31 148 | 149 | %define zmm0y ymm0 150 | %define zmm1y ymm1 151 | %define zmm2y ymm2 152 | %define zmm3y ymm3 153 | %define zmm4y ymm4 154 | %define zmm5y ymm5 155 | %define zmm6y ymm6 156 | %define zmm7y ymm7 157 | %define zmm8y ymm8 158 | %define zmm9y ymm9 159 | %define zmm10y ymm10 160 | %define zmm11y ymm11 161 | %define zmm12y ymm12 162 | %define zmm13y ymm13 163 | %define zmm14y ymm14 164 | %define zmm15y ymm15 165 | %define zmm16y ymm16 166 | %define zmm17y ymm17 167 | %define zmm18y ymm18 168 | %define zmm19y ymm19 169 | %define zmm20y ymm20 170 | %define zmm21y ymm21 171 | %define zmm22y ymm22 172 | %define zmm23y ymm23 173 | %define zmm24y ymm24 174 | %define zmm25y ymm25 175 | %define zmm26y ymm26 176 | %define zmm27y ymm27 177 | %define zmm28y ymm28 178 | %define zmm29y ymm29 179 | %define 
zmm30y ymm30 180 | %define zmm31y ymm31 181 | 182 | %define xmm0y ymm0 183 | %define xmm1y ymm1 184 | %define xmm2y ymm2 185 | %define xmm3y ymm3 186 | %define xmm4y ymm4 187 | %define xmm5y ymm5 188 | %define xmm6y ymm6 189 | %define xmm7y ymm7 190 | %define xmm8y ymm8 191 | %define xmm9y ymm9 192 | %define xmm10y ymm10 193 | %define xmm11y ymm11 194 | %define xmm12y ymm12 195 | %define xmm13y ymm13 196 | %define xmm14y ymm14 197 | %define xmm15y ymm15 198 | %define xmm16y ymm16 199 | %define xmm17y ymm17 200 | %define xmm18y ymm18 201 | %define xmm19y ymm19 202 | %define xmm20y ymm20 203 | %define xmm21y ymm21 204 | %define xmm22y ymm22 205 | %define xmm23y ymm23 206 | %define xmm24y ymm24 207 | %define xmm25y ymm25 208 | %define xmm26y ymm26 209 | %define xmm27y ymm27 210 | %define xmm28y ymm28 211 | %define xmm29y ymm29 212 | %define xmm30y ymm30 213 | %define xmm31y ymm31 214 | 215 | %define xmm0z zmm0 216 | %define xmm1z zmm1 217 | %define xmm2z zmm2 218 | %define xmm3z zmm3 219 | %define xmm4z zmm4 220 | %define xmm5z zmm5 221 | %define xmm6z zmm6 222 | %define xmm7z zmm7 223 | %define xmm8z zmm8 224 | %define xmm9z zmm9 225 | %define xmm10z zmm10 226 | %define xmm11z zmm11 227 | %define xmm12z zmm12 228 | %define xmm13z zmm13 229 | %define xmm14z zmm14 230 | %define xmm15z zmm15 231 | %define xmm16z zmm16 232 | %define xmm17z zmm17 233 | %define xmm18z zmm18 234 | %define xmm19z zmm19 235 | %define xmm20z zmm20 236 | %define xmm21z zmm21 237 | %define xmm22z zmm22 238 | %define xmm23z zmm23 239 | %define xmm24z zmm24 240 | %define xmm25z zmm25 241 | %define xmm26z zmm26 242 | %define xmm27z zmm27 243 | %define xmm28z zmm28 244 | %define xmm29z zmm29 245 | %define xmm30z zmm30 246 | %define xmm31z zmm31 247 | 248 | %define ymm0z zmm0 249 | %define ymm1z zmm1 250 | %define ymm2z zmm2 251 | %define ymm3z zmm3 252 | %define ymm4z zmm4 253 | %define ymm5z zmm5 254 | %define ymm6z zmm6 255 | %define ymm7z zmm7 256 | %define ymm8z zmm8 257 | %define ymm9z zmm9 258 
| %define ymm10z zmm10 259 | %define ymm11z zmm11 260 | %define ymm12z zmm12 261 | %define ymm13z zmm13 262 | %define ymm14z zmm14 263 | %define ymm15z zmm15 264 | %define ymm16z zmm16 265 | %define ymm17z zmm17 266 | %define ymm18z zmm18 267 | %define ymm19z zmm19 268 | %define ymm20z zmm20 269 | %define ymm21z zmm21 270 | %define ymm22z zmm22 271 | %define ymm23z zmm23 272 | %define ymm24z zmm24 273 | %define ymm25z zmm25 274 | %define ymm26z zmm26 275 | %define ymm27z zmm27 276 | %define ymm28z zmm28 277 | %define ymm29z zmm29 278 | %define ymm30z zmm30 279 | %define ymm31z zmm31 280 | 281 | %define DWORD(reg) reg %+ d 282 | %define WORD(reg) reg %+ w 283 | %define BYTE(reg) reg %+ b 284 | 285 | %define XWORD(reg) reg %+ x 286 | %define YWORD(reg) reg %+ y 287 | %define ZWORD(reg) reg %+ z 288 | 289 | %endif ;; _REG_SIZES_ASM_ 290 | -------------------------------------------------------------------------------- /mpn/asm/intel64/variant.inc: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2015-2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 
15 | ;=============================================================================== 16 | 17 | ; 18 | ; Intel(R) Integrated Performance Primitives 19 | ; Cryptographic Primitives (ippcp) 20 | ; 21 | ; Purpose: 22 | ; Define ippCP variant 23 | ; 24 | ; do not change the definitions below! 25 | ; 26 | 27 | ;; 28 | ;; modes of the feature 29 | ;; 30 | %assign _FEATURE_OFF_ 0 ;; feature is OFF 31 | %assign _FEATURE_ON_ 1 ;; feature is ON 32 | %assign _FEATURE_TICKTOCK_ 2 ;; detect whether feature is OFF/ON 33 | 34 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 35 | ; %define _XMM7560_ 1 36 | %ifdef _XMM7560_ 37 | %include "variant_xmm7560.inc" 38 | %endif 39 | 40 | ; %define _TXT_ACM_ 1 41 | %ifdef _TXT_ACM_ 42 | %include "variant_txt_acm.inc" 43 | %endif 44 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 45 | 46 | ;; 47 | ;; it is possible to force use of the C version of some implementations 48 | ;; instead of the ASM one 49 | ;; 50 | %ifndef _USE_C_MPZ_uadd_ 51 | %assign _USE_C_MPZ_uadd_ _FEATURE_OFF_ 52 | %endif 53 | 54 | %ifndef _USE_C_MPZ_usub_ 55 | %assign _USE_C_MPZ_usub_ _FEATURE_OFF_ 56 | %endif 57 | 58 | %ifndef _USE_C_MPZ_uadd_word_ 59 | %assign _USE_C_MPZ_uadd_word_ _FEATURE_OFF_ 60 | %endif 61 | 62 | %ifndef _USE_C_batch_mul_add_ 63 | %assign _USE_C_batch_mul_add_ _FEATURE_OFF_ 64 | %endif 65 | 66 | %ifndef _USE_C_batch_mul_ 67 | %assign _USE_C_batch_mul_ _FEATURE_OFF_ 68 | %endif 69 | 70 | %ifndef _USE_C_cpMulSqr_BNU_vectorized_ 71 | %assign _USE_C_cpMulSqr_BNU_vectorized_ _FEATURE_OFF_ ;; guard above must test this same symbol, otherwise an outside setting is overwritten 72 | %endif 73 | 74 | %ifndef _USE_C_bn_mont_red_words_ 75 | %assign _USE_C_bn_mont_red_words_ _FEATURE_OFF_ 76 | %endif 77 | 78 | ;; 79 | ;; set _AES_NI_ENABLING_ 80 | ;; 81 | %ifdef __ARCH_AES_NI_ 82 | %if (__ARCH_AES_NI_ == 0) 83 | %assign _AES_NI_ENABLING_ _FEATURE_OFF_ 84 | %elif (__ARCH_AES_NI_ == 1) 85 | %assign _AES_NI_ENABLING_ _FEATURE_ON_ 86 | %else 87 | %error 88 | %endif 89 | %else 90 | %if (__ARCH32E >= __ARCH32E_Y8) 91 | %assign _AES_NI_ENABLING_ 
_FEATURE_TICKTOCK_ 92 | %else 93 | %assign _AES_NI_ENABLING_ _FEATURE_OFF_ 94 | %endif 95 | %endif 96 | 97 | ;; 98 | ;; if there is no outside assignment 99 | ;; set _SHA_NI_ENABLING_ based on CPU specification 100 | ;; 101 | %ifndef _SHA_NI_ENABLING_ 102 | %if (__ARCH32E >= __ARCH32E_Y8 ) 103 | %assign _SHA_NI_ENABLING_ _FEATURE_TICKTOCK_ 104 | %else 105 | %assign _SHA_NI_ENABLING_ _FEATURE_OFF_ 106 | %endif 107 | %endif 108 | 109 | ;; 110 | ;; set _ADCOX_NI_ENABLING_ 111 | ;; 112 | %ifdef __ARCH_ADCX_NI_ 113 | %if (__ARCH_ADCX_NI_ == 0) 114 | %assign _ADCOX_NI_ENABLING_ _FEATURE_OFF_ 115 | %elif (__ARCH_ADCX_NI_ == 1) 116 | %assign _ADCOX_NI_ENABLING_ _FEATURE_ON_ 117 | %else 118 | %error 119 | %endif 120 | %else 121 | %if (__ARCH32E >= __ARCH32E_L9) 122 | %assign _ADCOX_NI_ENABLING_ _FEATURE_TICKTOCK_ 123 | %else 124 | %assign _ADCOX_NI_ENABLING_ _FEATURE_OFF_ 125 | %endif 126 | %endif 127 | 128 | 129 | ;; 130 | ;; select Hash algorithm (each one is ON unless the matching _DISABLE_ALG_*_ symbol is defined) 131 | ;; 132 | %ifndef _DISABLE_ALG_SHA1_ 133 | %assign _ENABLE_ALG_SHA1_ _FEATURE_ON_ ;; SHA1 on 134 | %else 135 | %assign _ENABLE_ALG_SHA1_ _FEATURE_OFF_ ;; SHA1 off 136 | %endif 137 | 138 | %ifndef _DISABLE_ALG_SHA256_ 139 | %assign _ENABLE_ALG_SHA256_ _FEATURE_ON_ ;; SHA256 on 140 | %else 141 | %assign _ENABLE_ALG_SHA256_ _FEATURE_OFF_ ;; SHA256 off 142 | %endif 143 | 144 | %ifndef _DISABLE_ALG_SHA512_ 145 | %assign _ENABLE_ALG_SHA512_ _FEATURE_ON_ ;; SHA512 on 146 | %else 147 | %assign _ENABLE_ALG_SHA512_ _FEATURE_OFF_ ;; SHA512 off 148 | %endif 149 | 150 | %ifndef _DISABLE_ALG_MD5_ 151 | %assign _ENABLE_ALG_MD5_ _FEATURE_ON_ ;; MD5 on 152 | %else 153 | %assign _ENABLE_ALG_MD5_ _FEATURE_OFF_ ;; MD5 off 154 | %endif 155 | 156 | %ifndef _DISABLE_ALG_SM3_ 157 | %assign _ENABLE_ALG_SM3_ _FEATURE_ON_ ;; SM3 on 158 | %else 159 | %assign _ENABLE_ALG_SM3_ _FEATURE_OFF_ ;; SM3 off 160 | %endif 161 | 162 | ;; 163 | ;; BN arithmetic 164 | ;; 165 | %assign _ENABLE_KARATSUBA_ _FEATURE_OFF_ ;; do not use Karatsuba method for multiplication 
166 | 167 | ;; 168 | ;; EC specific 169 | ;; 170 | %assign _ECP_IMPL_NONE_ 0 171 | %assign _ECP_IMPL_ARBIRTRARY_ 1 172 | %assign _ECP_IMPL_SPECIFIC_ 2 173 | %assign _ECP_IMPL_MFM_ 3 174 | 175 | %ifndef _ECP_128_ 176 | %assign _ECP_128_ _ECP_IMPL_SPECIFIC_ 177 | %endif 178 | 179 | %ifndef _ECP_192_ 180 | %assign _ECP_192_ _ECP_IMPL_MFM_ 181 | %endif 182 | 183 | %ifndef _ECP_224_ 184 | %assign _ECP_224_ _ECP_IMPL_MFM_ 185 | %endif 186 | 187 | %ifndef _ECP_256_ 188 | %assign _ECP_256_ _ECP_IMPL_MFM_ 189 | %endif 190 | 191 | %ifndef _ECP_384_ 192 | %assign _ECP_384_ _ECP_IMPL_MFM_ 193 | %endif 194 | 195 | %ifndef _ECP_521_ 196 | %assign _ECP_521_ _ECP_IMPL_MFM_ 197 | %endif 198 | 199 | %ifndef _ECP_SM2_ 200 | %assign _ECP_SM2_ _ECP_IMPL_MFM_ 201 | %endif 202 | -------------------------------------------------------------------------------- /mpn/asm/intel64/variant_txt_acm.inc: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2015-2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 15 | ;=============================================================================== 16 | 17 | ; 18 | ; Intel(R) Integrated Performance Primitives 19 | ; Cryptographic Primitives (ippcp) 20 | ; 21 | ; Purpose: 22 | ; Update standard ippCP variant 23 | ; 24 | ; do not changes in definitions below! 
25 | ; 26 | 27 | %ifdef _TXT_ACM_ 28 | 29 | ;; 30 | ;; HASH algs outside settings 31 | ;; 32 | %assign _SHA_NI_ENABLING_ _FEATURE_TICKTOCK_ 33 | 34 | ;; 35 | ;; select Hash algorithm 36 | ;; 37 | ; %assign _ENABLE_ALG_MD5_ _FEATURE_OFF_ 38 | 39 | %endif 40 | -------------------------------------------------------------------------------- /mpn/asm/utils.inc: -------------------------------------------------------------------------------- 1 | ;=============================================================================== 2 | ; Copyright 2015-2020 Intel Corporation 3 | ; 4 | ; Licensed under the Apache License, Version 2.0 (the "License"); 5 | ; you may not use this file except in compliance with the License. 6 | ; You may obtain a copy of the License at 7 | ; 8 | ; http://www.apache.org/licenses/LICENSE-2.0 9 | ; 10 | ; Unless required by applicable law or agreed to in writing, software 11 | ; distributed under the License is distributed on an "AS IS" BASIS, 12 | ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ; See the License for the specific language governing permissions and 14 | ; limitations under the License. 15 | ;=============================================================================== 16 | 17 | %ifndef __UTILS_INC__ 18 | %define __UTILS_INC__ 1 19 | 20 | ; Apply a functor provided as a last parameter to each element of the list, provided as sequence of first parameters. 21 | ; A list is processed in direct order. Note: an input list can be empty. 22 | %macro FOREACH 2-*.nolist 23 | %rotate -1 24 | %xdefine %%functor %1 25 | %rep %0-1 26 | %rotate 1 27 | %ifnempty %1 28 | %%functor %1 29 | %endif 30 | %endrep 31 | %endmacro 32 | 33 | ; Apply a functor provided as a last parameter to each element of the list, provided as sequence of first parameters. 34 | ; A list is processed in reverse order. Note: an input list can be empty. 
35 | %macro RFOREACH 2-*.nolist 36 | %rotate -1 37 | %xdefine %%functor %1 38 | %rep %0-1 39 | %rotate -1 40 | %ifnempty %1 41 | %%functor %1 42 | %endif 43 | %endrep 44 | %endmacro 45 | 46 | ; Shall be called before INTERSECT macro to open corresponding context. 47 | %macro BEGIN_INTERSECT 0.nolist 48 | %push _INTERSECT_CTX_ 49 | %xdefine %$intersection 50 | %assign %$cardinality 0 51 | %endmacro 52 | 53 | ; Shall be called after INTERSECT macro to close corresponding context. 54 | %macro END_INTERSECT 0.nolist 55 | %pop _INTERSECT_CTX_ 56 | %endmacro 57 | 58 | ; The macro searches intersection between two lists. 59 | ; Input: two comma-separated lists, enclosed in curly braces. 60 | ; Output: 61 | ; - Intersection will be located in the %$intersection context macro (can be empty). 62 | ; - Count of intersection elements will be stored in the %$cardinality context variable. 63 | %macro INTERSECT 2.nolist 64 | %ifnctx _INTERSECT_CTX_ 65 | %fatal "Not in the context: _INTERSECT_CTX_" 66 | %endif 67 | 68 | %xdefine %%list1 %1 69 | %xdefine %%list2 %2 70 | 71 | FOREACH %%list1,{?INTERSECT_BODY {%%list2},} 72 | %endmacro 73 | 74 | ; Helper macro to concatenate two lists. 75 | ; The result will be stored in the 3rd parameter that must be a macro identifier. 76 | %macro CONCATENATE 3.nolist 77 | %ifnid %3 78 | %fatal "CONCATENATE: 3rd parameter must be a macro identifier." 79 | %endif 80 | %define %3 %[%1] 81 | %ifnempty %3 82 | %ifnempty %2 83 | %define %3 %[%3],%[%2] 84 | %endif 85 | %else 86 | %define %3 %[%2] 87 | %endif 88 | %endmacro 89 | 90 | ; Helper macro that searches the specified element in the input list. 91 | ; Input: 92 | ; - Last parameter - target element 93 | ; - First parameters refer to the list where the search is processed. 94 | ; Output: 95 | ; - The macro is context dependent: when the element is found, the context macro %$elem_exists will be defined. 
96 | %macro ?FIND 2-*.nolist 97 | %ifnctx _FIND_CTX_ 98 | %fatal "Not in the context: _FIND_CTX_" 99 | %endif 100 | %rotate -1 101 | %xdefine %%elem_to_check %1 102 | %undef %$elem_exists 103 | 104 | %rep %0-1 105 | %rotate -1 106 | %ifidni %%elem_to_check, %1 107 | %define %$elem_exists %1 108 | %exitrep 109 | %endif 110 | %endrep 111 | %endmacro 112 | 113 | ; Macro that finds and collects intersection elements. To be used as INTERSECT macro functor. 114 | %macro ?INTERSECT_BODY 2.nolist 115 | %xdefine %%list %1 116 | %xdefine %%elem %2 117 | 118 | %push _FIND_CTX_ 119 | ?FIND %%list,%%elem 120 | %ifdef %$elem_exists 121 | %ifempty %$$intersection 122 | %define %$$intersection %2 123 | %else 124 | %define %$$intersection %[%$$intersection],%%elem 125 | %endif 126 | %assign %$$cardinality %$$cardinality + 1 127 | %endif 128 | %pop _FIND_CTX_ 129 | %endmacro 130 | 131 | %endif 132 | -------------------------------------------------------------------------------- /mpn/mpn-asm.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #include "mpn-asm.h" 17 | 18 | #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 19 | // clang-format off 20 | const unsigned char __mpi_clz_tab[129] = { 21 | 1, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 22 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 23 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 24 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 25 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 26 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 27 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 28 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 29 | 9, 30 | }; 31 | // clang-format on 32 | #endif 33 | -------------------------------------------------------------------------------- /mpn/mpn-binary.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #ifndef MULTIPLE_PRECISION_BINARY_H 17 | #define MULTIPLE_PRECISION_BINARY_H 18 | 19 | #include 20 | #include 21 | 22 | #define BITS_PER_BYTE 8 /* @constant: bits per byte */ 23 | #define BITS_PER_CHAR 4 /* @constant: bits per character */ 24 | #define MPN_MAX_BITS (UINT_MAX / BITS_PER_BYTE) /* @note: mpn width limitation */ 25 | #define MPN_BITS_TO_BYTES(n) (((n) + BITS_PER_BYTE - 1) / BITS_PER_BYTE) 26 | 27 | /* swap variable */ 28 | #define SWAP(type, a, b) \ 29 | do { \ 30 | type __t = a; \ 31 | (a) = (b); \ 32 | (b) = __t; \ 33 | } while (0) 34 | 35 | /* copy(increment) */ 36 | #define COPY(dst, src, to) \ 37 | for (mpn_size_t __i = 0; __i < (to); __i++) { (dst)[__i] = (src)[__i]; } 38 | 39 | /* expand by zeros */ 40 | #define ZEROIZE(dst, from, to) \ 41 | for (mpn_size_t __i = (from); __i < (to); __i++) { (dst)[__i] = 0; } 42 | 43 | /* copy and expand the left by zeros */ 44 | #define ZEXPAND(dst, dstlen, src, srclen) \ 45 | { \ 46 | mpn_size_t __i; \ 47 | for (__i = 0; __i < (srclen); __i++) { (dst)[__i] = (src)[__i]; } \ 48 | for (; __i < (dstlen); __i++) { (dst)[__i] = 0; } \ 49 | } 50 | 51 | /** 52 | * mpn alignment 53 | */ 54 | MPN_INLINE mpn_size_t mpi_aligned_diff(void *ptr, uintptr_t alignment) 55 | { 56 | return (mpn_size_t)((~(((uintptr_t)ptr) & (alignment - 1)) + 1) & (alignment - 1)); 57 | } 58 | 59 | MPN_INLINE mpn_size_t mpi_aligned_size(mpn_size_t size, mpn_size_t alignment) 60 | { 61 | return (size + (alignment - 1)) & (-alignment); 62 | } 63 | 64 | MPN_INLINE mpn_limb_t *mpi_aligned_pointer(void *ptr, uintptr_t alignment) 65 | { 66 | return (mpn_limb_t *)((uintptr_t)((unsigned char *)ptr + alignment - 1) & (-alignment)); 67 | } 68 | 69 | /** 70 | * basic constant-time operation 71 | */ 72 | /* return all-ones if MSB(a) == 1; otherwise, all-zeros */ 73 | MPN_INLINE mpn_limb_t mpn_limb_test_msb_consttime(mpn_limb_t a) 74 | { 75 | return (mpn_limb_t)0 - (a >> (sizeof(a) * BITS_PER_BYTE - 1)); 76 | } 77 | 78 | /* return all-ones if 
|a| equals zero; otherwise, all-zeros */ 79 | MPN_INLINE mpn_limb_t mpn_limb_is_zero_consttime(mpn_limb_t a) 80 | { 81 | mpn_limb_t t = ~a & (a - 1); 82 | return (mpn_limb_t)0 - (t >> (sizeof(t) * BITS_PER_BYTE - 1)); 83 | } 84 | 85 | /* copy under mask: dst[] = (a[] & mask) ^ (b[] & ~mask) */ 86 | MPN_INLINE void mpn_masked_copy_consttime(mpn_limb_t *dst, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t len, 87 | mpn_limb_t mask) 88 | { 89 | mpn_limb_t rmask = ~mask; 90 | for (mpn_size_t i = 0; i < len; i++) { dst[i] = (a[i] & mask) ^ (b[i] & rmask); } 91 | } 92 | 93 | /* conditional swap: a[], b[] = b[], a[] if cond; otherwise not changed */ 94 | MPN_INLINE void mpn_masked_swap_consttime(mpn_limb_t *a, mpn_limb_t *b, mpn_size_t n, unsigned cond) 95 | { 96 | mpn_limb_t mask = cond; 97 | mask = ((~mask & ((mask - 1))) >> (sizeof(mpn_limb_t) * BITS_PER_BYTE - 1)) - 1; 98 | for (mpn_size_t i = 0; i < n; i++) { 99 | mpn_limb_t t = (a[i] ^ b[i]) & mask; 100 | a[i] ^= t; 101 | b[i] ^= t; 102 | } 103 | } 104 | 105 | /* conditional move: dst[] = cond ? 
src[] : dst[] */ 106 | MPN_INLINE void mpn_masked_move_consttime(mpn_limb_t *dst, const mpn_limb_t *src, mpn_size_t len, unsigned cond) 107 | { 108 | mpn_masked_copy_consttime(dst, src, dst, len, ~mpn_limb_is_zero_consttime((mpn_limb_t)cond)); /* mask must be all-ones when cond != 0 and all-zeros otherwise; the former '0 - cond != 0' parsed as '(0 - cond) != 0' and produced 0/1 */ 109 | } 110 | 111 | #if defined(__cplusplus) 112 | extern "C" { 113 | #endif 114 | 115 | /** 116 | * mpn: ALL-ones if buff[::] is zero, otherwise ALL-zeros 117 | */ 118 | mpn_limb_t mpn_is_zero(const mpn_limb_t *buff, mpn_size_t bufflen); 119 | 120 | /** 121 | * mpn: ALL-ones if buff[::] is zero, otherwise ALL-zeros(constant-time version) 122 | */ 123 | mpn_limb_t mpn_is_zero_consttime(const mpn_limb_t *buff, mpn_size_t bufflen); 124 | 125 | /** 126 | * mpn: get most significant bit 127 | */ 128 | mpn_size_t mpn_bits(const mpn_limb_t *data, mpn_size_t size); 129 | 130 | /** 131 | * mpn: get most significant bit(constant-time version) 132 | */ 133 | mpn_size_t mpn_bits_consttime(const mpn_limb_t *data, mpn_size_t size); 134 | 135 | /** 136 | * mpn: get most significant limb 137 | */ 138 | mpn_size_t mpn_limbs(const mpn_limb_t *data, mpn_size_t size); 139 | 140 | /** 141 | * mpn: get most significant limb(constant-time version) 142 | */ 143 | mpn_size_t mpn_limbs_consttime(const mpn_limb_t *data, mpn_size_t size); 144 | 145 | /** 146 | * mpn: unsigned comparison 147 | * 148 | * @note: 149 | * 1. return 1 if a[] > b[]; 0 if a[] = b[]; -1 if a[] < b[] 150 | */ 151 | int mpn_cmp(const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, mpn_size_t bsize); 152 | 153 | 154 | /** 155 | * mpn: left shift 156 | * 157 | * @note: 158 | * 1. required bit_size(r) >= bit_size(a) + nbits 159 | * 2. the return is number of |mpn_limb_t| of the result |r| 160 | * 3. r == a is acceptable 161 | */ 162 | mpn_size_t mpn_lshift(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_size_t nbits); 163 | 164 | /** 165 | * mpn: right shift 166 | * 167 | * @note: 168 | * 1. required bit_size(r) >= bit_size(a) - nbits 169 | * 2. 
the return is number of |mpn_limb_t| of the result |r| 170 | * 3. r == a is acceptable 171 | */ 172 | mpn_size_t mpn_rshift(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_size_t nbits); 173 | 174 | /** 175 | * mpn addition: carry, r = a[:n] + b[:n] 176 | */ 177 | mpn_limb_t mpn_add_vectorized(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t n); 178 | 179 | /** 180 | * mpn: carry, r[] = a[] + b[] 181 | */ 182 | mpn_limb_t mpn_add(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, 183 | mpn_size_t bsize); 184 | 185 | /** 186 | * mpn: carry, r[:n] = a[:n] + w 187 | */ 188 | mpn_limb_t mpn_inc_vectorized(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t size, mpn_limb_t w); 189 | 190 | /** 191 | * mpn: carry, r[] = a[] + w 192 | */ 193 | mpn_limb_t mpn_inc(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w); 194 | 195 | /** 196 | * mpn subtraction: borrow, r[:n] = a[:n] - b[:n] 197 | * 198 | * @note: 199 | * 1. make sure r->room is enough to store the result 200 | * minimal advise size: MAX(bit_size(a), bit_size(b)) + 1 201 | */ 202 | mpn_limb_t mpn_sub_vectorized(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_size_t n); 203 | 204 | /** 205 | * mpn subtraction: size, r[] = a[] - b[] 206 | */ 207 | mpn_size_t mpn_sub(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, 208 | mpn_size_t bsize); 209 | 210 | /** 211 | * mpn: borrow, r[:n] = a[:n] - w 212 | */ 213 | mpn_limb_t mpn_dec_vectorized(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w); 214 | 215 | /** 216 | * mpn: size, r[] = a[] - w 217 | */ 218 | mpn_size_t mpn_dec(mpn_limb_t *r, mpn_size_t rroom, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t w); 219 | 220 | /** 221 | * mpn multiplication: extension, r[:asize+bsize] = a[:asize] * b[:bsize] 222 | * @note: 223 | * 1. 
(IMPORTANT)make sure size of |r| isn't less than |asize| + |bsize| 224 | * 2. the return is the highest unit |mpn_limb_t| 225 | */ 226 | mpn_limb_t mpn_mul(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *b, mpn_size_t bsize); 227 | 228 | /** 229 | * mpn multiply-and-add: extension, r[] += a[] * b 230 | * @note: 231 | * 1. (IMPORTANT)make sure size of |r| isn't less than |asize| 232 | * 2. the return is extension of result of multiply-and-add. 233 | */ 234 | mpn_limb_t mpn_mul_acc(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, mpn_limb_t b); 235 | 236 | /** 237 | * mpn square: r[] = a[] ^ 2 238 | * 239 | * @note: 240 | * 1. make sure r->room is enough to store the result 241 | * minimal advise size: 2 * bit_size(a) 242 | */ 243 | mpn_limb_t mpn_sqr(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t anum); 244 | 245 | /** 246 | * mpn division: xsize, q, x(q = x / y, x = x % y) 247 | */ 248 | mpn_size_t mpn_div(mpn_limb_t *q, mpn_size_t *qsize, mpn_limb_t *x, mpn_size_t xsize, mpn_limb_t *y, mpn_size_t ysize); 249 | 250 | /** 251 | * mpn modular: x[] = x[] % y[] 252 | */ 253 | mpn_size_t mpn_mod(mpn_limb_t *x, mpn_size_t xsize, mpn_limb_t *y, mpn_size_t ysize); 254 | 255 | /** 256 | * mpn: division(n by 1) 257 | * 258 | * @note: 259 | * 1. required length of q should be not smaller than size 260 | */ 261 | mpn_size_t mpn_div_limb(mpn_limb_t q[], const mpn_limb_t x[], mpn_size_t size, mpn_limb_t *r, mpn_limb_t d); 262 | 263 | /** 264 | * mpn: division(n by 2) 265 | * 266 | * @note: 267 | * 1. 
required length of q should be not smaller than size 268 | */ 269 | mpn_size_t mpn_div_double_limbs(mpn_limb_t q[], mpn_limb_t r[2], const mpn_limb_t n[], mpn_size_t nn, 270 | const mpn_limb_t d[2]); 271 | 272 | /** 273 | * @brief: multiplicative inversion 274 | * 275 | * @params: 276 | * a/asize: source (value) BigNum A whose size is asize 277 | * m/msize: source (modulus) BigNum M whose size is msize 278 | * invbuf: buffer of inv 279 | * abuf : buffer of A 280 | * mbuf : buffer of M 281 | * r : result BigNum 282 | */ 283 | mpn_size_t mpn_mod_invert(mpn_limb_t *r, const mpn_limb_t *a, mpn_size_t asize, const mpn_limb_t *m, mpn_size_t msize, 284 | mpn_optimizer_t *optimizer); 285 | 286 | /** 287 | * mpn: create mpn from hex string 288 | */ 289 | mpn_size_t mpn_from_string(mpn_limb_t *r, mpn_size_t size, const char *in, mpn_size_t inlen); 290 | 291 | /** 292 | * mpn: convert mpn to hex string 293 | */ 294 | mpn_size_t mpn_to_string(char *out, mpn_size_t outsize, const mpn_limb_t *a, mpn_size_t size); 295 | 296 | /** 297 | * mpn: create mpn from big-endian octets 298 | */ 299 | mpn_size_t mpn_from_octets(mpn_limb_t *r, mpn_size_t size, const unsigned char *in, mpn_size_t inlen); 300 | 301 | /** 302 | * mpn: convert mpn to big-endian octets 303 | */ 304 | mpn_size_t mpn_to_octets(unsigned char *out, mpn_size_t outsize, const mpn_limb_t *a, mpn_size_t size); 305 | 306 | /** 307 | * leading zeros counting(constant-time version) 308 | */ 309 | mpn_size_t mpn_limb_nlz_consttime(mpn_limb_t x); 310 | 311 | /** 312 | * trailing zeros counting(constant-time version) 313 | */ 314 | mpn_size_t mpn_limb_ntz_consttime(mpn_limb_t x); 315 | 316 | /** 317 | * greatest common divisor(mpn_limb_t) 318 | */ 319 | mpn_limb_t mpn_limb_gcd(mpn_limb_t a, mpn_limb_t b); 320 | 321 | /** 322 | * mpn: generate in range 323 | * 324 | * @note: 325 | * 1. 
length of |r| >= hilen 326 | */ 327 | int mpn_random_range(mpn_limb_t *r, mpn_size_t maxtries, const mpn_limb_t *lo, mpn_size_t lolen, const mpn_limb_t *hi, 328 | mpn_size_t hilen, int (*rand_bytes)(void *, unsigned char *, mpn_size_t), void *rand_state); 329 | 330 | /** 331 | * test if |a| and |b| are coprime 332 | */ 333 | int mpn_is_coprime(mpn_limb_t *a, mpn_size_t asize, mpn_limb_t *b, mpn_size_t bsize, mpn_optimizer_t *optimizer); 334 | 335 | #if defined(__cplusplus) 336 | } 337 | #endif 338 | 339 | #endif 340 | -------------------------------------------------------------------------------- /mpn/mpn-montgomery.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #ifndef MULTIPLE_PRECISION_MONTGOMERY_H 17 | #define MULTIPLE_PRECISION_MONTGOMERY_H 18 | 19 | #include 20 | 21 | #if defined(__cplusplus) 22 | extern "C" { 23 | #endif 24 | 25 | typedef struct { 26 | mpn_size_t modbits; /**< size of modulus in bits */ 27 | mpn_size_t modsize; /**< size of modulus in mpn_limb_t */ 28 | mpn_limb_t k0; /**< low word of (1/modulus) mod R */ 29 | mpn_limb_t *modulus; /**< modulus */ 30 | mpn_limb_t *montR; /**< mont_enc(1) */ 31 | mpn_limb_t *montRR; /**< mont_enc(1) ^ 2 */ 32 | 33 | mpn_optimizer_t *optimizer; /**< optimizer for montgomery operation */ 34 | } mpn_montgomery_t; 35 | 36 | /** 37 | * mpn montgomery: create montgomery context 38 | * 39 | */ 40 | mpn_montgomery_t *mpn_montgomery_create(mpn_size_t mbits, mpn_size_t psize); 41 | 42 | /** 43 | * mpn montgomery: destroy montgomery context 44 | * 45 | */ 46 | void mpn_montgomery_destory(mpn_montgomery_t *mont); 47 | 48 | /** 49 | * mpn montgomery: initialize montgomery context with modulus 50 | * 51 | */ 52 | int mpn_montgomery_set_modulus_bin(mpn_montgomery_t *mont, const mpn_limb_t *modulus, mpn_size_t mbits); 53 | 54 | /** 55 | * mpn montgomery: montgomery reduction 56 | * 57 | * @note: 58 | * 1. m0: low word of (1 / modulus) mod b 59 | * 2. r = T/R mod m 60 | */ 61 | void mpn_montgomery_reduce_bin(mpn_limb_t *r, mpn_limb_t *product, const mpn_limb_t *m, mpn_size_t mnum, mpn_limb_t m0); 62 | 63 | /** 64 | * mpn montgomery: r[] = to_mont(a[]) 65 | * 66 | * @requirements: 67 | * 1. length of r: modsize 68 | * 2. length of a: modsize 69 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t) 70 | */ 71 | void mpn_montgomery_encode(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 72 | 73 | /** 74 | * mpn montgomery: r[] = from_mont(a) 75 | * 76 | * @requirements: 77 | * 1. length of r: modsize 78 | * 2. length of a: modsize 79 | * 3. 
memory size from the pool: modsize * sizeof(mpn_limb_t) 80 | */ 81 | void mpn_montgomery_decode(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 82 | 83 | /** 84 | * mpn montgomery: r = (a + b) mod m 85 | * 86 | * @requirements: 87 | * 1. length of r: modsize 88 | * 2. length of a: modsize 89 | * 3. length of b: modsize 90 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t) 91 | */ 92 | void mpn_montgomery_add(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont); 93 | 94 | /** 95 | * mpn montgomery: r = (a - b) mod m 96 | * 97 | * @requirements: 98 | * 1. length of r: modsize 99 | * 2. length of a: modsize 100 | * 3. length of b: modsize 101 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t) 102 | */ 103 | void mpn_montgomery_sub(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont); 104 | 105 | /** 106 | * mpn montgomery: r = -a mod m = (m - a) mod m 107 | * 108 | * @requirements: 109 | * 1. length of r: modsize 110 | * 2. length of a: modsize 111 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t) 112 | */ 113 | void mpn_montgomery_negative(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 114 | 115 | /** 116 | * mpn montgomery: r = (a / 2) mod m 117 | * 118 | * @requirements: 119 | * 1. length of r: modsize 120 | * 2. length of a: modsize 121 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t) 122 | */ 123 | void mpn_montgomery_halve(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 124 | 125 | /** 126 | * mpn montgomery: r = (a * 2) mod m 127 | * 128 | * @requirements: 129 | * 1. length of r: modsize 130 | * 2. length of a: modsize 131 | * 3. memory size from the pool: modsize * sizeof(mpn_limb_t) 132 | */ 133 | void mpn_montgomery_double(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 134 | 135 | /** 136 | * mpn montgomery: r = (a * 3) mod m 137 | * 138 | * @requirements: 139 | * 1. length of r: modsize 140 | * 2. 
length of a: modsize 141 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t) 142 | */ 143 | void mpn_montgomery_triple(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 144 | 145 | /** 146 | * mpn montgomery: r = prod mod m 147 | * 148 | * @requirements: 149 | * 1. length of r: modsize 150 | * 2. length of prod: modsize (NOTE(review): a full double-width product would be 2 * modsize limbs -- confirm against the implementation) 151 | * 3. memory size from the pool: N/A 152 | */ 153 | void mpn_montgomery_reduce(mpn_limb_t *r, mpn_limb_t *prod, mpn_montgomery_t *mont); 154 | 155 | /** 156 | * mpn montgomery: r = (a * b) mod m 157 | * 158 | * @requirements: 159 | * 1. length of r: modsize 160 | * 2. length of a: modsize 161 | * 3. length of b: modsize 162 | * 4. memory size from the pool: modsize * sizeof(mpn_limb_t) * 2 163 | */ 164 | void mpn_montgomery_mul(mpn_limb_t *r, const mpn_limb_t *a, const mpn_limb_t *b, mpn_montgomery_t *mont); 165 | 166 | /** 167 | * mpn montgomery: r = (a ^ 2) mod m 168 | * 169 | * @requirements: 170 | * 1. length of r: modsize 171 | * 2. length of a: modsize 172 | * 4. 
memory size from the pool: modsize * sizeof(mpn_limb_t) * 2 173 | */ 174 | void mpn_montgomery_square(mpn_limb_t *r, const mpn_limb_t *a, mpn_montgomery_t *mont); 175 | 176 | /** 177 | * montgomery factor k0 = -((modulus^-1 mod B) %B), computed from m0 (presumably the least-significant limb of the modulus, with B the limb base -- TODO confirm) 178 | */ 179 | mpn_limb_t mpn_montgomery_factor(mpn_limb_t m0); 180 | 181 | /** 182 | * mpn montgomery: binary exponentiation (presumably y = x^e mod modulus, with e holding |ebits| bits; the meaning of the returned mpn_size_t is not visible in this header -- TODO confirm) 183 | * 184 | */ 185 | mpn_size_t mpn_montgomery_exp(mpn_limb_t *y, const mpn_limb_t *x, mpn_size_t xsize, const mpn_limb_t *e, 186 | mpn_size_t ebits, mpn_montgomery_t *mont); 187 | 188 | /** 189 | * mpn montgomery: binary exponentiation, constant-time variant (same signature as mpn_montgomery_exp) 190 | * 191 | */ 192 | mpn_size_t mpn_montgomery_exp_consttime(mpn_limb_t *y, const mpn_limb_t *x, mpn_size_t xsize, const mpn_limb_t *e, 193 | mpn_size_t ebits, mpn_montgomery_t *mont); 194 | 195 | #if defined(__cplusplus) 196 | } 197 | #endif 198 | 199 | #endif 200 | -------------------------------------------------------------------------------- /mpn/mpn-optimizer.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "mpn-binary.h" 17 | 18 | /** 19 | * mpn optimizer: create optimizer for mpn operation 20 | * 21 | * @note: 22 | * 1. 
room: room size of optimizer chunk, in unit of 'mpn_limb_t' 23 | */ 24 | mpn_optimizer_t *mpn_optimizer_create(mpn_size_t room) 25 | { 26 | if (room == 0) { 27 | /* it's meaningless to create 0-length optimizer */ 28 | return NULL; 29 | } 30 | size_t size = sizeof(mpn_optimizer_t) + MPN_LIMB_BYTES + room * sizeof(mpn_limb_t); 31 | mpn_optimizer_t *optimizer = (mpn_optimizer_t *)MPI_ALLOCATE(size); 32 | if (optimizer != NULL) { 33 | optimizer->size = 0; 34 | optimizer->next = NULL; 35 | optimizer->room = room; 36 | optimizer->chunk = mpi_aligned_pointer((unsigned char *)optimizer + sizeof(mpn_optimizer_t), MPN_LIMB_BYTES); 37 | } 38 | 39 | return optimizer; 40 | } 41 | 42 | /** 43 | * mpn optimizer: reset optimizer, mark all as unused 44 | */ 45 | void mpn_optimizer_reset(mpn_optimizer_t *optimizer) 46 | { 47 | mpn_optimizer_t *curr = optimizer; 48 | while (curr != NULL) { 49 | curr->size = 0; 50 | curr = curr->next; 51 | } 52 | } 53 | 54 | /** 55 | * mpn optimizer: destory optimizer 56 | */ 57 | void mpn_optimizer_destory(mpn_optimizer_t *optimizer) 58 | { 59 | mpn_optimizer_t *curr = optimizer, *next; 60 | while (curr != NULL) { 61 | next = curr->next; 62 | MPI_DEALLOCATE(curr); /* cleanse and free mpn_optimizer_t node */ 63 | curr = next; 64 | } 65 | } 66 | 67 | /** 68 | * mpn optimizer: get memory chunk for mpn operation 69 | * 70 | * @note: 71 | * 1. 
size: size of chunk, in unit of 'mpn_limb_t' 72 | */ 73 | mpn_limb_t *mpn_optimizer_get_limbs(mpn_optimizer_t *optimizer, mpn_size_t size) 74 | { 75 | if (optimizer == NULL) { 76 | MPI_RAISE_ERROR(-EINVAL); 77 | return NULL; 78 | } 79 | if (size == 0) { return NULL; } 80 | 81 | mpn_size_t total = 0; 82 | mpn_optimizer_t *curr = optimizer, *prev = NULL; 83 | while (curr != NULL) { 84 | total += curr->size; 85 | prev = curr; 86 | curr = curr->next; 87 | } 88 | 89 | if (prev->room - prev->size >= size) { 90 | curr = prev; 91 | } else { 92 | mpn_size_t room = size + total / 2; // XXX: optimize growth rule 93 | prev->next = curr = mpn_optimizer_create(room); 94 | } 95 | 96 | if (curr != NULL) { 97 | mpn_limb_t *p = &curr->chunk[curr->size]; 98 | curr->size += size; 99 | 100 | return p; 101 | } else { 102 | MPI_RAISE_ERROR(-ENOMEM); 103 | 104 | return NULL; 105 | } 106 | } 107 | 108 | /** 109 | * mpn optimizer: put back memory chunk 110 | */ 111 | void mpn_optimizer_put_limbs(mpn_optimizer_t *optimizer, mpn_size_t size) 112 | { 113 | if (optimizer == NULL) { return; } 114 | 115 | mpn_optimizer_t *curr = optimizer, *prev = NULL; 116 | while (curr != NULL) { 117 | prev = curr; 118 | curr = curr->next; 119 | } 120 | 121 | if (prev->size >= size) { prev->size -= size; } 122 | } 123 | -------------------------------------------------------------------------------- /mpn/mpn-optimizer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Ethan.cr.yp.to 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef MULTIPLE_PRECISION_OPTIMIZER_H 17 | #define MULTIPLE_PRECISION_OPTIMIZER_H 18 | 19 | #include 20 | 21 | #if defined(__cplusplus) 22 | extern "C" { 23 | #endif 24 | 25 | typedef struct mpn_optimizer_t { 26 | mpn_size_t size; /**< limbs already handed out from chunk (offset of the next free limb) */ 27 | mpn_size_t room; /**< capacity of chunk, in limbs */ 28 | mpn_limb_t *chunk; /**< limb storage of this node */ 29 | struct mpn_optimizer_t *next; /**< next optimizer node */ 30 | } mpn_optimizer_t; 31 | 32 | /** 33 | * mpn optimizer: create optimizer for mpn operation; returns NULL when room is 0 or the allocation fails 34 | * 35 | * @note: 36 | * 1. room: room size of optimizer chunk, in unit of 'mpn_limb_t' 37 | */ 38 | mpn_optimizer_t *mpn_optimizer_create(mpn_size_t room); 39 | 40 | /** 41 | * mpn optimizer: destroy optimizer, releasing every node of the chain (note: the symbol itself is spelled 'destory') 42 | */ 43 | void mpn_optimizer_destory(mpn_optimizer_t *opt); 44 | 45 | /** 46 | * mpn optimizer: get memory chunk for mpn operation 47 | * 48 | * @note: 49 | * 1. 
size: size of chunk, in unit of 'mpn_limb_t' 50 | */ 51 | mpn_limb_t *mpn_optimizer_get_limbs(mpn_optimizer_t *opt, mpn_size_t size); 52 | 53 | /** 54 | * mpn optimizer: put back memory chunk, i.e. give |size| limbs (in unit of 'mpn_limb_t') back to the optimizer 55 | */ 56 | void mpn_optimizer_put_limbs(mpn_optimizer_t *optimizer, mpn_size_t size); 57 | 58 | /** 59 | * mpn optimizer: reset optimizer, mark all as unused (the nodes themselves are kept, nothing is freed) 60 | */ 61 | void mpn_optimizer_reset(mpn_optimizer_t *opt); 62 | 63 | #if defined(__cplusplus) 64 | } 65 | #endif 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # unit-test 2 | ADD_EXECUTABLE(unittest-mpi unittest-mpi.cpp) 3 | TARGET_LINK_LIBRARIES(unittest-mpi mpi crypto dl gtest pthread) 4 | ADD_TEST(NAME unittest-mpi COMMAND unittest-mpi 5 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 6 | ) 7 | ConfigureTarget(unittest-mpi) 8 | 9 | IF (BUILD_VENDOR) 10 | ADD_DEPENDENCIES(unittest-mpi openssl) 11 | ENDIF () 12 | 13 | # benchmark 14 | ADD_EXECUTABLE(benchmark benchmark.cpp) 15 | TARGET_LINK_LIBRARIES(benchmark mpi crypto pthread dl) 16 | TARGET_COMPILE_OPTIONS(benchmark PRIVATE -std=gnu++17) 17 | ConfigureTarget(benchmark) 18 | 19 | IF (BUILD_VENDOR) 20 | ADD_DEPENDENCIES(benchmark openssl) 21 | ENDIF () 22 | 23 | INSTALL(TARGETS benchmark unittest-mpi RUNTIME DESTINATION bin) 24 | -------------------------------------------------------------------------------- /tests/test.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2022 Kiran Nowak(kiran.nowak@gmail.com) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "logger.h" 23 | #include "tabulate.h" 24 | #include "benchmark.h" 25 | 26 | template 27 | T reverse(T n); 28 | 29 | unsigned char reverse(unsigned char n) 30 | { 31 | #ifdef USE_SMALL_LOOKUP_TABLE 32 | // clang-format off 33 | static const unsigned char lookup[16] = { 34 | 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, 35 | 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf, 36 | }; 37 | // clang-format on 38 | 39 | // Detailed breakdown of the math 40 | // + lookup reverse of bottom nibble 41 | // | + grab bottom nibble 42 | // | | + move bottom result into top nibble 43 | // | | | + combine the bottom and top results 44 | // | | | | + lookup reverse of top nibble 45 | // | | | | | + grab top nibble 46 | // V V V V V V 47 | // (lookup[n&0b1111] << 4) | lookup[n>>4] 48 | 49 | // Reverse the top and bottom nibble then swap them. 
50 | return (lookup[n & 0b1111] << 4) | lookup[n >> 4]; 51 | #else 52 | // clang-format off 53 | static const unsigned char reversed[] = { 54 | 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 55 | 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 56 | 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 57 | 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, 58 | 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 59 | 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 60 | 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 61 | 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, 62 | 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 63 | 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 64 | 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 65 | 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, 66 | 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 67 | 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 68 | 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 69 | 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, 70 | 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 71 | 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, 72 | 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 73 | 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, 74 | 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 75 | 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, 76 | 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 77 | 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, 78 | 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 79 | 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 80 | 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 81 | 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, 82 | 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 83 | 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 84 | 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 85 | 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, 86 | }; 87 | // clang-format on 88 | 89 | return reversed[n]; 90 | #endif 91 | } 92 | 93 | unsigned int reverse(unsigned int n) 94 | { 95 | unsigned int m = n; 96 | 
unsigned char *p = (unsigned char *)(&m); 97 | 98 | if (sizeof(unsigned int) == 4) { 99 | return (reverse(p[0]) << 24) | (reverse(p[1]) << 16) | (reverse(p[2]) << 8) | reverse(p[3]); 100 | } else { 101 | for (unsigned i = 0; i < sizeof(unsigned int) / 2; i++) { 102 | unsigned char h = p[i], l = p[sizeof(unsigned int) - 1 - i]; 103 | p[i] = reverse(l); 104 | p[sizeof(unsigned int) - 1 - i] = reverse(h); 105 | } 106 | 107 | return m; 108 | } 109 | } 110 | 111 | unsigned int reverse_ref(unsigned int num) 112 | { 113 | unsigned int count = sizeof(num) * 8 - 1; 114 | unsigned int reverse_num = num; 115 | 116 | num >>= 1; 117 | while (num) { 118 | reverse_num <<= 1; 119 | reverse_num |= num & 1; 120 | num >>= 1; 121 | count--; 122 | } 123 | reverse_num <<= count; 124 | 125 | return reverse_num; 126 | } 127 | 128 | 129 | static int clz(unsigned int x) 130 | { 131 | static_assert(sizeof(unsigned int) == 4, "unsigned int must be 32 bits"); 132 | 133 | // clang-format off 134 | static const char debruijn32[32] = { 135 | 0, 31, 9, 30, 3, 8, 13, 29, 136 | 2, 5, 7, 21, 12, 24, 28, 19, 137 | 1, 10, 4, 14, 6, 22, 25, 20, 138 | 11, 15, 23, 26, 16, 27, 17, 18, 139 | }; 140 | // clang-format on 141 | x |= x >> 1; 142 | x |= x >> 2; 143 | x |= x >> 4; 144 | x |= x >> 8; 145 | x |= x >> 16; 146 | x++; 147 | 148 | return debruijn32[x * 0x076be629 >> 27]; 149 | } 150 | 151 | int merge(unsigned int &merged, unsigned int hi, unsigned int lo) 152 | { 153 | auto hbits = clz(hi); 154 | auto lbits = clz(lo); 155 | merged = lo | reverse(hi); 156 | 157 | return static_cast(lbits + hbits) - static_cast(sizeof(unsigned int) * 8); 158 | } 159 | 160 | struct data { 161 | std::string ma; 162 | std::string mb; 163 | std::string mc; 164 | data(const std::string &a, const std::string &b, const std::string &c) : ma(a), mb(b), mc(c) {} 165 | }; 166 | 167 | namespace logging 168 | { 169 | template <> 170 | inline std::string to_string(const std::vector &v) 171 | { 172 | using namespace tabulate; 173 | 
Table table("Company", "Contact", "Country"); 174 | table[0].format().align(Align::center); 175 | for (auto const &item : v) { table.add(item.ma, item.mb, item.mc); } 176 | 177 | // Iterate over rows in the table 178 | size_t index = 0; 179 | for (auto &row : table) { 180 | row.format().styles(Style::bold); 181 | 182 | // Set blue background color for alternate rows 183 | if (index > 0 && index % 2 == 0) { 184 | for (auto &cell : row) { cell.format().background_color(Color::blue); } 185 | } 186 | index += 1; 187 | } 188 | 189 | return table.xterm(); 190 | } 191 | } // namespace logging 192 | 193 | int main() 194 | { 195 | { 196 | int a = 1; 197 | float b = 2.0; 198 | std::string c = "three"; 199 | bool d = true; 200 | std::vector e{1, 3, 5, 7, 9}; 201 | std::vector f{ 202 | data("Alfreds Futterkiste", "Maria Anders", "Germany"), 203 | data("Centro comercial Moctezuma", "Francisco Chang", "Mexico"), 204 | data("Ernst Handel", "Roland Mendel", "Austria"), 205 | data("Island Trading", "Helen Bennett", "UK"), 206 | data("Laughing Bacchus Winecellars", "Yoshi Tannamuri", "Canada"), 207 | data("Magazzini Alimentari Riuniti", "Giovanni Rovelli", "Italy"), 208 | }; 209 | 210 | enum flags { 211 | FLAG1 = 0x1, 212 | FLAG2 = 0x2, 213 | FLAG3 = 0x4, 214 | } g = FLAG2, 215 | h = static_cast(FLAG1 | FLAG3); 216 | 217 | llogi(a, b, c, d, e, f, f[0].mc, g, h); 218 | } 219 | 220 | { 221 | struct { 222 | unsigned int hi, lo; 223 | } datas[] = { 224 | {.hi = 0x01, .lo = 0x1000}, 225 | {.hi = 0x09, .lo = 0x1000}, 226 | {.hi = 0x10, .lo = 0x1000}, 227 | {.hi = 0xF1, .lo = 0x1000}, 228 | }; 229 | 230 | std::cout << std::endl; 231 | for (auto const &data : datas) { 232 | int rbits; 233 | unsigned int merged; 234 | 235 | printf("merge(0x%02X, 0x%04X): ", data.hi, data.lo); 236 | if ((rbits = merge(merged, data.hi, data.lo)) >= 0) { 237 | printf("0x%08X, remain-bits = %2d\n", merged, rbits); 238 | } else { 239 | printf("failed.\n"); 240 | } 241 | } 242 | } 243 | 244 | { 245 | 
BENCHER(reverse_, DoNotOptimize(reverse(__j)), 20, 20000000); 246 | BENCHER(reverse_ref_, DoNotOptimize(reverse_ref(__j)), 20, 20000000); 247 | 248 | std::cout << std::endl; 249 | std::cout << "reverse: avg = " << reverse_avg << ", stddev = " << reverse_stddev << std::endl; 250 | std::cout << "reverse(ref): avg = " << reverse_ref_avg << ", stddev = " << reverse_ref_stddev << std::endl; 251 | std::cout << "perf-diff: " << reverse_ref_avg / reverse_avg << std::endl; 252 | } 253 | } 254 | --------------------------------------------------------------------------------