├── README.md
├── vecarith.vcxproj.filters
├── vecarith.sln
├── x64_bench
├── Makefile
├── pmod.h
├── x64_arith.h
├── monty_arith.h
├── bigarith.h
├── util.h
├── util.c
├── pmod.c
├── x64_arith.c
├── main.c
└── bigarith.c
├── common.c
├── Makefile
├── vecarith.vcxproj
├── vecarith.h
└── main.c
/README.md:
--------------------------------------------------------------------------------
1 | # avx512_modexp
2 | A test library for computing modular exponentiation in parallel using AVX-512 vector arithmetic
3 |
4 | Verified to work with gcc 7.3.0, gcc 11.1.0 and icc 18.0.3.
5 |
6 | build with (required)
7 | make SKYLAKEX=1
8 |
9 | optionally add this to build line to change the length of test numbers (N needs to be a multiple of 128)
10 | MAXBITS=N
11 |
12 | optionally add this to build line to change the compiler to gcc-7.3.0 from the default icc
13 | COMPILER=gcc730
14 |
15 | optionally add this to build line to change the compiler to gcc-11.1.0 from the default icc
16 | COMPILER=gcc11
17 |
18 | optionally add this to build line to use the double precision FMA arithmetic instead of 32-bit integer arithmetic.
19 | If this is specified then MAXBITS must be a multiple of 208
20 | BASE51=1
21 |
22 |
23 | Run the executable with 2 arguments: number of threads and whether or not to verify all of the results using GMP.
24 | For example, to run with 4 threads and skip verification:
25 | ./avx512_modexp 4 0
26 |
--------------------------------------------------------------------------------
/vecarith.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Header Files
20 |
21 |
22 |
23 |
24 | Source Files
25 |
26 |
27 | Source Files
28 |
29 |
30 | Source Files
31 |
32 |
33 | Source Files
34 |
35 |
36 |
--------------------------------------------------------------------------------
/vecarith.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.271
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vecarith", "vecarith.vcxproj", "{F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x64.ActiveCfg = Debug|x64
17 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x64.Build.0 = Debug|x64
18 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x86.ActiveCfg = Debug|Win32
19 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x86.Build.0 = Debug|Win32
20 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x64.ActiveCfg = Release|x64
21 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x64.Build.0 = Release|x64
22 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x86.ActiveCfg = Release|Win32
23 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {DE0A4EE7-27A6-4F85-AB25-7EAB760DF485}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/x64_bench/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2014, Ben Buhrow
3 | # All rights reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are met:
7 | #
8 | # 1. Redistributions of source code must retain the above copyright notice, this
9 | # list of conditions and the following disclaimer.
10 | # 2. Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | #
14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | #
25 | # The views and conclusions contained in the software and documentation are those
26 | # of the authors and should not be interpreted as representing official policies,
27 | # either expressed or implied, of the FreeBSD Project.
28 | #
29 | #
30 |
31 |
32 | #--------------------------- EXAMPLE BUILDS -------------------------
33 | # make MAXBITS=256
34 | # make MAXBITS=512
35 |
36 |
37 |
38 | #--------------------------- flags -------------------------
39 | CC = gcc
40 | WARN_FLAGS = -Wall #-W -Wconversion
41 | OPT_FLAGS = -O3
42 | INC = -I. -I../gmp-6.1.2/include/
43 | LIBS = -L../gmp-6.1.2/lib/
44 | BINNAME = pmod_bench
45 |
46 | #--------------------------- make options -------------------------
47 |
48 |
49 | ifeq ($(COMPILER),gcc73)
50 | CC = gcc-7.3.0
51 | endif
52 |
53 | ifdef MAXBITS
54 | CFLAGS += -DMAXBITS=$(MAXBITS)
55 | endif
56 |
57 | ifdef VERBOSE
58 | CFLAGS += -DVERBOSE=$(VERBOSE)
59 | endif
60 |
61 | CFLAGS += -g $(OPT_FLAGS) $(WARN_FLAGS) $(INC)
62 | LIBS += -lm -lgmp
63 |
64 | #--------------------------- file lists -------------------------
65 | SRCS = \
66 | bigarith.c \
67 | x64_arith.c \
68 | monty_arith.c \
69 | pmod.c \
70 | main.c \
71 | util.c
72 |
73 | OBJS = $(SRCS:.c=.o)
74 |
75 | #---------------------------Header file lists -------------------------
76 | HEAD = \
77 | monty_arith.h \
78 | bigarith.h \
79 | x64_arith.h \
80 | pmod.h \
81 | util.h
82 |
83 | #---------------------------Make Targets -------------------------
84 |
85 | all: $(OBJS)
86 | rm -f libpmod.a
87 | ar r libpmod.a $(OBJS)
88 | ranlib libpmod.a
89 | $(CC) $(CFLAGS) $(OBJS) -o $(BINNAME) libpmod.a $(LIBS)
90 |
91 |
92 | clean:
93 | rm -f $(OBJS)
94 |
95 | #---------------------------Build Rules -------------------------
96 |
97 |
# Compile each object from its C source and rebuild it whenever any project
# header changes. OBJ_EXT is never defined in this Makefile, so the previous
# target "%$(OBJ_EXT)" collapsed to a match-anything rule that make does not
# apply to .o files; spell the suffix out to match OBJS = $(SRCS:.c=.o).
%.o: %.c $(HEAD)
	$(CC) $(CFLAGS) -c -o $@ $<
100 |
101 |
--------------------------------------------------------------------------------
/x64_bench/pmod.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021 by The Mayo Clinic, though its Special Purpose
2 | // Processor Development Group (SPPDG). All Rights Reserved Worldwide.
3 | // Licensed under the Apache License, Version 2.0 (the "License"); you may
4 | // not use this file except in compliance with the License. You may obtain
5 | // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
6 | // Unless required by applicable law or agreed to in writing, software
7 | // distributed under the License is distributed on an "AS IS" BASIS,
8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
9 | // including conditions of title, non-infringement, merchantability,
10 | // or fitness for a particular purpose
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 | // This file is a snapshot of a work in progress, originated by Mayo
14 | // Clinic SPPDG.
15 |
16 | /*
17 | Copyright (c) 2021, Ben Buhrow
18 | All rights reserved.
19 |
20 | Redistribution and use in source and binary forms, with or without
21 | modification, are permitted provided that the following conditions are met:
22 |
23 | 1. Redistributions of source code must retain the above copyright notice, this
24 | list of conditions and the following disclaimer.
25 | 2. Redistributions in binary form must reproduce the above copyright notice,
26 | this list of conditions and the following disclaimer in the documentation
27 | and/or other materials provided with the distribution.
28 |
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
30 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
32 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
33 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
35 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 |
40 | The views and conclusions contained in the software and documentation are those
41 | of the authors and should not be interpreted as representing official policies,
42 | either expressed or implied, of the FreeBSD Project.
43 | */
44 |
45 | #ifndef _PMOD_H
46 | #define _PMOD_H
47 |
48 | // Modular exponentiation relies on a big-integer math library and libraries
49 | // that perform modular arithmetic. We define routines that use a
50 | // homegrown bigint library.
51 | #include "bigarith.h"
52 | #include "monty_arith.h"
53 |
54 | typedef struct
55 | {
56 | bignum **libpmod_gwin;
57 | } pmod_t;
58 |
59 | #define MAX_WINSIZE 8
60 |
61 | int get_winsize(void);
62 | int get_bitwin(bignum *b, int bitloc, int winsize, int winmask);
63 | void lr_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s);
64 | void lrwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s);
65 | void lroddwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s);
66 |
67 | void pmodlib_init(pmod_t *pmod_state);
68 | void pmodlib_free(pmod_t *pmod_state);
69 |
70 |
71 |
72 | #endif
--------------------------------------------------------------------------------
/x64_bench/x64_arith.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 |
30 | #ifndef _X64_ARITH_H
31 | #define _X64_ARITH_H
32 |
33 | // this file declares special routines for low-level x64 arithmetic
34 | // used as subroutines for modular multiplication.
35 | #include <stdint.h>
36 |
37 |
38 |
39 |
40 | void spAdd(uint64_t u, uint64_t v, uint64_t *sum, uint64_t *carry);
41 | void spAdd3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sum, uint64_t *carry);
42 | void spSub3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sub, uint64_t *borrow);
43 | void spSub(uint64_t u, uint64_t v, uint64_t *sub, uint64_t *borrow);
44 | uint64_t spDivide(uint64_t *q, uint64_t *r, uint64_t u[2], uint64_t v);
45 | void spMultiply(uint64_t u, uint64_t v, uint64_t *product, uint64_t *carry);
46 | uint64_t spDiv(uint64_t *q, uint64_t *r, uint64_t u1, uint64_t u0, uint64_t v);
47 | uint64_t spMod(uint64_t u1, uint64_t u0, uint64_t v);
48 | void spMul(uint64_t u, uint64_t v, uint64_t *product, uint64_t *carry);
49 | void spMulAdd1(uint64_t u, uint64_t v, uint64_t w,
50 | uint64_t *product, uint64_t *carry);
51 | void spMulAdd2(uint64_t u, uint64_t v, uint64_t w,
52 | uint64_t c, uint64_t *product, uint64_t *carry);
53 | void spMulAdd2x(uint64_t u, uint64_t v, uint64_t w,
54 | uint64_t c, uint64_t *product, uint64_t *carry);
55 | void spMulAddc(uint64_t u, uint64_t v, uint64_t * w);
56 | void spSqrMulAcc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w);
57 | void mpSub(uint64_t * u, uint64_t * n, uint64_t * w, int sz);
58 | void mpSub1(uint64_t * u, uint64_t n, uint64_t * w, int sz);
59 | void mpAdd1(uint64_t * u, uint64_t n, uint64_t * w, int sz);
60 | void spMul2Acc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w);
61 | void spMulAddcr(uint64_t u, uint64_t v, uint64_t * w);
62 | void spMulDblAdd_1(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout);
63 | void spMulDblAdd_2(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout);
64 | void spMulDblAdd_3(uint64_t u, uint64_t v, uint64_t * w);
65 | void mpAdd1b(uint64_t * u, uint64_t n, uint64_t * w, int sz);
66 |
67 |
68 | #endif
69 |
--------------------------------------------------------------------------------
/x64_bench/monty_arith.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 |
30 | #ifndef _MONTY_H
31 | #define _MONTY_H
32 |
33 | #include "bigarith.h"
34 |
35 | /* montgomery arithmetic operations */
36 | typedef struct
37 | {
38 | bignum *r;
39 | bignum *n;
40 | bignum *np;
41 | bignum *nhat;
42 | bignum *vnhat;
43 | bignum *rhat;
44 | bignum *rmask;
45 | bignum *one;
46 | bignum *mtmp1;
47 | bignum *mtmp2;
48 | bignum *mtmp3;
49 | bignum *mtmp4;
50 | base_t rho;
51 | } monty;
52 |
53 | // montgomery arithmetic setup and conversion
54 | void to_monty(monty *mdata, bignum * x);
55 | monty * monty_alloc();
56 | void monty_init(monty * in, bignum * n, int verbose);
57 | void monty_free(monty *mdata);
58 |
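59 | // pointers to the current mul/sqr scanning technique. NOTE(review): these are defined here without `extern`, so every file including this header gets its own tentative definition; that fails to link under -fno-common (the GCC 10+ default).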
60 | void(*mul_ptr)(monty *, bignum *, bignum *, bignum *, bignum *);
61 | void(*sqr_ptr)(monty *, bignum *, bignum *, bignum *);
62 |
63 | // montgomery multipliers for various scanning techniques
64 | void mulmod_sos(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s);
65 | void mulmod_cios(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s);
66 | void mulmod_fios(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s);
67 | void mulmod_fips(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s);
68 | void mulmod_bps(monty* mdata, bignum* u, bignum* v, bignum* w, bignum* s);
69 |
70 | // montgomery squaring that just calls mulmod for various scanning techniques
71 | void sqrmod_sos_mul(monty *mdata, bignum * u, bignum * w, bignum * s);
72 | void sqrmod_fips_mul(monty *mdata, bignum * u, bignum * w, bignum * s);
73 | void sqrmod_fios_mul(monty *mdata, bignum * u, bignum * w, bignum * s);
74 | void sqrmod_cios_mul(monty *mdata, bignum * u, bignum * w, bignum * s);
75 | void sqrmod_bps_mul(monty* mdata, bignum* u, bignum* w, bignum* s);
76 |
77 | // specialized montgomery squaring for various scanning techniques
78 | void sqrmod_sos(monty *mdata, bignum * u, bignum * w, bignum * s);
79 | void sqrmod_cios(monty* mdata, bignum* u, bignum* w, bignum* s);
80 | void sqrmod_fios(monty *mdata, bignum * u, bignum * w, bignum * s);
81 | void sqrmod_fips(monty* mdata, bignum* u, bignum* w, bignum* s);
82 | void sqrmod_bps(monty* mdata, bignum* u, bignum* w, bignum* s);
83 |
84 | #endif // _MONTY_H
85 |
--------------------------------------------------------------------------------
/x64_bench/bigarith.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 |
30 | #ifndef _BIG_ARITH_H
31 | #define _BIG_ARITH_H
32 |
33 | #include
34 | #include
35 | #include "util.h"
36 | #include "x64_arith.h"
37 |
38 | #define DIGITBITS 64
39 |
40 | #ifdef _MSC_VER
41 | #define base_t unsigned long long
42 | #define base_signed_t long long
43 | #define MAXDIGIT 0xffffffffffffffff
44 | #define HIBITMASK 0x8000000000000000
45 | #define MAX_DEC_WORD 0x8AC7230489E80000ULL
46 | #define DEC_DIGIT_PER_WORD 19
47 | #define HEX_DIGIT_PER_WORD 16
48 | #define HALFMASK 0xffffffff
49 | #define HALFBITS 32
50 | #else
51 | #if DIGITBITS == 64
52 |
53 | #define base_t uint64_t
54 | #define base_signed_t int64_t
55 | #define HALFBITS 32
56 | #define HALFMASK 0xffffffff
57 | #define MAXDIGIT 0xffffffffffffffff
58 | #define HIBITMASK 0x8000000000000000
59 | #define VECLEN 8
60 | #define MAX_DEC_WORD 0x8AC7230489E80000ULL
61 | #define DEC_DIGIT_PER_WORD 19
62 | #define HEX_DIGIT_PER_WORD 16
63 |
64 | #else
65 |
66 | #define base_t uint32_t
67 | #define base_signed_t int32_t
68 | #define HALFBITS 16
69 | #define HALFMASK 0xffff
70 | #define MAXDIGIT 0xffffffff
71 | #define HIBITMASK 0x80000000
72 | #define VECLEN 16
73 | #define MAX_DEC_WORD 0x3b9aca00
74 | #define DEC_DIGIT_PER_WORD 9
75 | #define HEX_DIGIT_PER_WORD 8
76 |
77 | #endif
78 | #endif
79 |
80 | // supported:
81 | // any N divisible by 32 for MAXBITS < 512
82 | // any N divisible by 128 MAXBITS >= 128
83 | #ifndef MAXBITS
84 | #define MAXBITS 512
85 | #endif
86 |
87 | #define NWORDS (MAXBITS / DIGITBITS)
88 |
89 | typedef struct
90 | {
91 | base_t *data;
92 | int size;
93 | } bignum;
94 |
95 |
96 | /* basic arithmetic operations: fixed allocation, variable sized, non-signed */
97 | int zBits(bignum * n);
98 | base_t spBits(base_t n);
99 | void zSet1(bignum *dest, base_t value);
100 | void zCopy(bignum * src, bignum * dest);
101 | void zAdd(bignum * u, bignum * v, bignum * w);
102 | void zShortAdd(bignum * u, base_t v, bignum * w);
103 | int zSub(bignum * u, bignum * v, bignum * w);
104 | void zShortSub(bignum * u, base_t v, bignum * w);
105 | int zCompare(bignum * u, bignum * v);
106 | int zCompare1(bignum * u, base_t v);
107 | base_t zShortDiv(bignum * u, base_t v, bignum * q);
108 | void zDiv(bignum * u, bignum * v, bignum * q, bignum * r);
109 | int shortCompare(base_t p[2], base_t t[2]);
110 | int shortSubtract(base_t u[2], base_t v[2], base_t w[2]);
111 | void zMul(bignum * u, bignum * v, bignum * w);
112 | void zMult(bignum * u, bignum * v, bignum * w, bignum *tmp);
113 | void zModMul(bignum * u, bignum * v, bignum * n, bignum * w);
114 | void zShortMul(bignum * u, base_t v, bignum * w);
115 | void zSqr(bignum * x, bignum * w);
116 | void zShiftLeft(bignum * a, bignum * b, int x);
117 | void zShiftLeft_1(bignum * a, bignum * b);
118 | void zShiftRight(bignum * a, bignum * b, int x);
119 | void zShiftRight_1(bignum * a, bignum * b);
120 | void spAdd(base_t u, base_t v, base_t *sum, base_t *carry);
121 | void spAdd3(base_t u, base_t v, base_t w, base_t *sum, base_t *carry);
122 | void spSub3(base_t u, base_t v, base_t w, base_t *sub, base_t *borrow);
123 | void spSub(base_t u, base_t v, base_t *sub, base_t *borrow);
124 | base_t spDivide(base_t *q, base_t *r, base_t u[2], base_t v);
125 | void spMultiply(base_t u, base_t v, base_t *product, base_t *carry);
126 | void spMulAdd(base_t u, base_t v, base_t w, base_t t, base_t *lower, base_t *carry);
127 | void spMulMod(base_t u, base_t v, base_t m, base_t *w);
128 | void sp2big(base_t src, bignum * dest);
129 | void zClear(bignum * n);
130 | void zClearFull(bignum * n);
131 | void zClamp(bignum * n);
132 | void zPrint(bignum *n);
133 | int ndigits_1(base_t n);
134 | bignum * zInit(void);
135 | void zFree(bignum *n);
136 | void xGCD(bignum *a, bignum *b, bignum *x, bignum *y, bignum *g);
137 | int zBinGCD(bignum *u, bignum *v, bignum *w);
138 | int zLEGCD(bignum *u, bignum *v, bignum *w);
139 | base_t spGCD(base_t x, base_t y);
140 | void zModMuls(bignum * u, bignum * v, bignum * n, bignum * w, bignum *s1, bignum *s2);
141 | void zModExp(bignum *d, bignum *b, bignum *e, bignum *m);
142 |
143 | void str2hexz(char in[], bignum * u);
144 | void zDec2Hex(bignum * u, bignum * v);
145 | char *z2decstr(bignum * n);
146 | void zHex2Dec(bignum * u, bignum * v);
147 |
148 | #endif // _BIG_ARITH_H
149 |
150 |
--------------------------------------------------------------------------------
/common.c:
--------------------------------------------------------------------------------
1 | #include "vecarith.h"
2 |
3 | void(*vecmulmod_ptr)(bignum*, bignum*, bignum*, bignum*, bignum*, monty*);
4 | void(*vecsqrmod_ptr)(bignum*, bignum*, bignum*, bignum*, monty*);
5 | int(*montsetup_ptr)(bignum*, bignum*, bignum*, base_t*);
6 | void(*vecmodexp_ptr)(bignum*, bignum*, bignum*, bignum*, bignum*, bignum*, monty* m);
7 |
8 | int get_winsize(void)
9 | {
10 | // the window size is based on minimizing the total number of multiplications
11 | // in the windowed exponentiation. experiments show that this is best;
12 | // the growing size of the table doesn't change the calculus, at least
13 | // on the KNL.
14 | int size;
15 | int muls;
16 | int minmuls = 99999999;
17 | int minsize = 4;
18 |
19 | for (size = 2; size <= 8; size++)
20 | {
21 | muls = (NWORDS * DIGITBITS / size) + (1 << size);
22 | if (muls < minmuls)
23 | {
24 | minmuls = muls;
25 | minsize = size;
26 | }
27 | }
28 |
29 | return minsize;
30 | }
31 |
32 | int get_bitwin(bignum *e, int bitloc, int winsize, int lane, int winmask)
33 | {
34 | int bstr;
35 | int bitstart = (bitloc - winsize + 1);
36 | int word = bitloc / DIGITBITS;
37 | int word2 = bitstart / DIGITBITS;
38 |
39 | bitstart = bitstart % DIGITBITS;
40 |
41 | if (word == word2)
42 | {
43 | bstr = (e->data[lane + word * VECLEN] >> bitstart) & winmask;
44 | }
45 | else
46 | {
47 | int upperbits = (bitloc % DIGITBITS) + 1;
48 |
49 | bstr = (e->data[lane + word2 * VECLEN] >> bitstart);
50 | bstr |= ((e->data[lane + word * VECLEN]) << (winsize - upperbits));
51 | bstr &= winmask;
52 | }
53 |
54 | return bstr;
55 | }
56 |
57 |
58 | bignum * vecInit(void)
59 | {
60 | int i;
61 | size_t sz = VECLEN * (2 * NWORDS + 4);
62 | bignum *n;
63 | n = (bignum *)malloc(sizeof(bignum));
64 |
65 | n->data = (base_t *)xmalloc_align(sz * sizeof(base_t));
66 | if (n->data == NULL)
67 | {
68 | printf("could not allocate memory\n");
69 | exit(2);
70 | }
71 |
72 | for (i = 0; i < sz; i++)
73 | {
74 | n->data[i] = 0;
75 | }
76 | n->size = 1;
77 |
78 | return n;
79 | }
80 |
81 | void vecCopy(bignum * src, bignum * dest)
82 | {
83 | //physically copy the digits of u into the digits of v
84 | int su = VECLEN * (2 * NWORDS + 1);
85 |
86 | memcpy(dest->data, src->data, su * sizeof(base_t));
87 | dest->size = src->size; // = NWORDS;
88 | return;
89 | }
90 |
91 | void vecCopyn(bignum * src, bignum * dest, int size)
92 | {
93 | //physically copy the digits of u into the digits of v
94 | int su = VECLEN * size;
95 |
96 | memcpy(dest->data, src->data, su * sizeof(base_t));
97 | dest->size = size;
98 | return;
99 | }
100 |
101 | void vecClear(bignum *n)
102 | {
103 | memset(n->data, 0, VECLEN*(2 * NWORDS + 1) * sizeof(base_t));
104 | return;
105 | }
106 |
107 | void vecFree(bignum *n)
108 | {
109 | align_free(n->data);
110 | free(n);
111 | }
112 |
113 | void copy_vec_lane(bignum *src, bignum *dest, int num, int size)
114 | {
115 | int j;
116 |
117 | for (j = 0; j < size; j++)
118 | {
119 | dest->data[num + j * VECLEN] = src->data[num + j * VECLEN];
120 | }
121 |
122 | return;
123 | }
124 |
125 | void monty_init_vec(monty *mdata, bignum * n, int verbose)
126 | {
127 | int j;
128 | // for a input modulus n, initialize constants for
129 | // montogomery representation
130 | // this assumes that n is relatively prime to 2, i.e. is odd.
131 | // In this version we assume the input monty structure has
132 | // already been allocated and we just perform the calculations.
133 |
134 | if (verbose)
135 | printf("initializing montgomery representation\n");
136 |
137 | memset(mdata->n->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t));
138 | memset(mdata->r->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t));
139 | memset(mdata->rhat->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t));
140 | memset(mdata->one->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t));
141 | memset(mdata->vrho, 0, VECLEN * sizeof(base_t));
142 |
143 | vecCopy(n, mdata->n);
144 | montsetup_ptr(mdata->n, mdata->r, mdata->rhat, mdata->vrho);
145 |
146 | for (j = 0; j < VECLEN; j++)
147 | {
148 | mdata->one->data[j] = 1;
149 | }
150 |
151 | vecmulmod_ptr(mdata->one, mdata->rhat, mdata->one, n, mdata->mtmp1, mdata); // monty rep
152 | vecCopyn(mdata->one, mdata->g[0], NWORDS);
153 |
154 | return;
155 |
156 | }
157 |
158 | monty* monty_alloc(void)
159 | {
160 | int i;
161 | monty *mdata = (monty *)malloc(sizeof(monty));
162 |
163 | mdata->r = vecInit();
164 | mdata->n = vecInit();
165 | mdata->nhat = vecInit();
166 | mdata->vnhat = vecInit();
167 | mdata->rhat = vecInit();
168 | mdata->rmask = vecInit();
169 | mdata->one = vecInit();
170 | mdata->mtmp1 = vecInit();
171 | mdata->mtmp2 = vecInit();
172 | mdata->mtmp3 = vecInit();
173 | mdata->mtmp4 = vecInit();
174 |
175 | mdata->g = (bignum **)malloc((1 << MAX_WINSIZE) * sizeof(bignum *));
176 | mdata->g[0] = vecInit();
177 |
178 | for (i = 1; i < (1 << MAX_WINSIZE); i++)
179 | {
180 | mdata->g[i] = vecInit();
181 | }
182 |
183 | mdata->vrho = (base_t *)xmalloc_align(VECLEN * sizeof(base_t));
184 |
185 | return mdata;
186 | }
187 |
188 | void monty_free(monty *mdata)
189 | {
190 | int i;
191 |
192 | vecFree(mdata->mtmp1);
193 | vecFree(mdata->mtmp2);
194 | vecFree(mdata->mtmp3);
195 | vecFree(mdata->mtmp4);
196 | vecFree(mdata->rhat);
197 | vecFree(mdata->one);
198 | vecFree(mdata->n);
199 | vecFree(mdata->nhat);
200 | vecFree(mdata->r);
201 | align_free(mdata->vrho);
202 |
203 | for (i = 0; i < (1 << MAX_WINSIZE); i++)
204 | {
205 | vecFree(mdata->g[i]);
206 | }
207 | free(mdata->g);
208 |
209 | return;
210 | }
211 |
212 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2014, Ben Buhrow
3 | # All rights reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are met:
7 | #
8 | # 1. Redistributions of source code must retain the above copyright notice, this
9 | # list of conditions and the following disclaimer.
10 | # 2. Redistributions in binary form must reproduce the above copyright notice,
11 | # this list of conditions and the following disclaimer in the documentation
12 | # and/or other materials provided with the distribution.
13 | #
14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | #
25 | # The views and conclusions contained in the software and documentation are those
26 | # of the authors and should not be interpreted as representing official policies,
27 | # either expressed or implied, of the FreeBSD Project.
28 | #
29 | #
30 | # Copyright (c) 2018 by The Mayo Clinic, though its Special Purpose
31 | # Processor Development Group (SPPDG). All Rights Reserved Worldwide.
32 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
33 | # not use this file except in compliance with the License. You may obtain
34 | # a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
35 | # Unless required by applicable law or agreed to in writing, software
36 | # distributed under the License is distributed on an "AS IS" BASIS,
37 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
38 | # including conditions of title, non-infringement, merchantability,
39 | # or fitness for a particular purpose
40 | # See the License for the specific language governing permissions and
41 | # limitations under the License.
42 | # This file is a snapshot of a work in progress, originated by Mayo
43 | # Clinic SPPDG.
44 |
45 |
46 |
47 |
48 | #--------------------------- flags -------------------------
49 | CC = icc
50 | #CFLAGS = -g -march=core2 -mtune=core2
51 | #CFLAGS = -static
52 | #CFLAGS = -S -fsource-asm
53 | WARN_FLAGS = -Wall #-W -Wconversion
54 | OPT_FLAGS = -O2
55 | INC = -I.
56 | LIBS =
57 | BINNAME = avx512_modexp
58 | CFLAGS += -I../gmp_install/gmp-6.2.0/include/
59 | CFLAGS += -L../gmp_install/gmp-6.2.0/lib/
60 | CFLAGS += -g -gdwarf-4
61 |
62 | #--------------------------- make options -------------------------
63 |
64 |
65 | ifeq ($(COMPILER),mingw)
66 | # NOTE: Using -fcall-used instead of -ffixed is much better and still works.
67 | # -fcall-used simply prevents the named registers from being saved/restored while
68 | # -ffixed prevents them from being used at all. The code benefits a lot from being
69 | # able to use all 32 zmm registers.
70 | CC = gcc
71 | BINNAME = avx512_modexp_mingw
72 | CFLAGS += -fopenmp
73 | CFLAGS += -fcall-used-xmm16 -fcall-used-xmm17 -fcall-used-xmm18 -fcall-used-xmm19
74 | CFLAGS += -fcall-used-xmm20 -fcall-used-xmm21 -fcall-used-xmm22 -fcall-used-xmm23
75 | CFLAGS += -fcall-used-xmm24 -fcall-used-xmm25 -fcall-used-xmm26 -fcall-used-xmm27
76 | CFLAGS += -fcall-used-xmm28 -fcall-used-xmm29 -fcall-used-xmm30 -fcall-used-xmm31
77 | else ifeq ($(COMPILER),gcc730)
78 | CC = gcc-7.3.0
79 | CFLAGS += -fopenmp
80 | else ifeq ($(COMPILER),gcc11)
81 | CC = gcc-11.1.0
82 | CFLAGS += -fopenmp
83 | else
84 | CFLAGS += -qopenmp
85 | endif
86 |
87 | ifdef MAXBITS
88 | CFLAGS += -DMAXBITS=$(MAXBITS)
89 | endif
90 |
91 | ifdef BASE52
92 | CFLAGS += -DBASE52
93 | endif
94 |
# Target architecture selection.
#   KNL=1       build for Knights Landing (binary gets a _knl suffix)
#   SKYLAKEX=1  build for Skylake-X (only consulted when KNL is not 1)
#   neither     only -mavx is added to the optimization flags
ifeq ($(KNL),1)
  # -xMIC-AVX512 is an icc option.  Decide on the compiler that will really
  # be invoked (CC) rather than on COMPILER, whose accepted values
  # (mingw, gcc730, gcc11) never equal "gcc".
  ifeq ($(CC),icc)
    CFLAGS += -xMIC-AVX512 -DTARGET_KNL
  else
    CFLAGS += -march=knl -DTARGET_KNL
  endif
  OBJ_EXT = .o
  BINNAME := ${BINNAME:%=%_knl}
else
  OBJ_EXT = .o
  ifeq ($(SKYLAKEX),1)
    # the same flags are used for every supported compiler
    CFLAGS += -DSKYLAKEX
    CFLAGS += -march=skylake-avx512 -DTARGET_KNL
  else
    OPT_FLAGS += -mavx
  endif
endif
116 |
117 |
118 | ifeq ($(CC),icc)
119 | CFLAGS += -qmkl
120 | endif
121 |
122 |
123 | ifeq ($(PROFILE),1)
124 | CFLAGS += -pg
125 | BINNAME := ${BINNAME:%=%_prof}
126 | endif
127 |
128 |
129 | CFLAGS += -g $(OPT_FLAGS) $(WARN_FLAGS) $(INC)
130 |
131 | ifeq ($(STATIC),1)
132 | CFLAGS += -static-intel -static
133 | LIBS += -L/usr/lib/x86_64-redhat-linux6E/lib64/ -lm
134 | else
135 | LIBS += -lm -lgmp
136 | endif
137 |
138 |
139 | #--------------------------- file lists -------------------------
140 | SRCS = \
141 | common.c \
142 | vecarith52.c \
143 | vecarith.c \
144 | main.c
145 |
146 | OBJS = $(SRCS:.c=$(OBJ_EXT))
147 |
148 |
149 |
150 | #---------------------------Header file lists -------------------------
151 | HEAD = \
152 | vecarith.h
153 |
154 | #---------------------------Make Targets -------------------------
155 |
# Neither target names a file that the recipes create.
.PHONY: all clean

# Build the objects, wrap them in libvecarith.a and link the executable.
all: $(OBJS)
	rm -f libvecarith.a
	ar r libvecarith.a $(OBJS)
	ranlib libvecarith.a
	$(CC) $(CFLAGS) $(OBJS) -o $(BINNAME) libvecarith.a $(LIBS)


# Remove everything "all" produces: objects, the archive and the binary.
clean:
	rm -f $(OBJS) libvecarith.a $(BINNAME)
165 |
166 | #---------------------------Build Rules -------------------------
167 |
168 |
169 | %$(OBJ_EXT): %.c $(HEAD)
170 | $(CC) $(CFLAGS) -c -o $@ $<
171 |
172 |
--------------------------------------------------------------------------------
/vecarith.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 | 15.0
23 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}
24 | vecarith
25 | 10.0
26 |
27 |
28 |
29 | Application
30 | true
31 | v143
32 | MultiByte
33 |
34 |
35 | Application
36 | false
37 | v143
38 | true
39 | MultiByte
40 |
41 |
42 | Application
43 | true
44 | v143
45 | MultiByte
46 |
47 |
48 | Application
49 | false
50 | v143
51 | true
52 | MultiByte
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 | Level3
76 | Disabled
77 | true
78 | true
79 |
80 |
81 |
82 |
83 | Level3
84 | Disabled
85 | true
86 | true
87 |
88 |
89 |
90 |
91 | Level3
92 | MaxSpeed
93 | true
94 | true
95 | true
96 | true
97 |
98 |
99 | true
100 | true
101 |
102 |
103 |
104 |
105 | Level3
106 | MaxSpeed
107 | true
108 | true
109 | true
110 | true
111 | Y:\projects\mpir-3.0.0;%(AdditionalIncludeDirectories)
112 |
113 |
114 | true
115 | true
116 | Y:\projects\mpir-3.0.0\lib\x64\Release\bkup_mpir_gc;%(AdditionalLibraryDirectories)
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/x64_bench/util.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 |
30 | #ifndef _UTIL_H
31 | #define _UTIL_H
32 |
33 | // ============================================================================
34 | // some standard headers
35 | // ============================================================================
36 | #include
37 | #include
38 | #include
39 | #include // for uint32_t, etc.
40 | #include
41 | #include
42 | #include
43 | #if defined(WIN32)
44 | #define WIN32_LEAN_AND_MEAN
45 | #include
46 | #include
47 | #include
48 | #include
49 | #endif
50 |
51 | #ifndef _MSC_VER
52 | #include //for gettimeofday using gcc
53 | #include
54 | #endif
55 |
56 |
57 | // ============================================================================
58 | // useful definitions
59 | // ============================================================================
60 | #define MIN(a,b) ((a) < (b)? (a) : (b))
61 | #define MAX(a,b) ((a) > (b)? (a) : (b))
62 | #define SIGN(a) ((a) < 0 ? -1 : 1)
63 |
64 | #define INV_2_POW_48 3.5527136788005009293556213378906e-15
65 | #define INV_2_POW_52 2.2204460492503130808472633361816e-16
66 | #define INV_2_POW_64 5.4210108624275221700372640043497e-20
67 | #define INV_2_POW_26 1.490116119384765625e-8
68 | #define INV_2_POW_32 2.3283064365386962890625e-10
69 | #define PI 3.1415926535897932384626433832795
70 | #define HBAR 6.58211928000e-7 // (ev * ns)
71 | #define INV_HBAR 1.519267514702347e+06 // (eV * ns)^-1
72 | #define INV_HBAR_RESIDUE 5.873436907300274 // 1/hbar mod 2*pi
73 |
#define INLINE __inline
// LOWER/UPPER split a word using HALFMASK / HALFBITS, which must be
// provided by whoever uses these macros.
#define LOWER(x) ((x) & HALFMASK)
#define UPPER(x) ((x) >> HALFBITS)
// strto_uint64 is intentionally not defined here: the compiler-specific
// definition further down in this header (_strtoui64 for MSVC, strtoull
// otherwise) is the only one, which avoids a conflicting redefinition
// under MSVC.
#define DEC 10
#define HEX 16
#define DEFINED 1
// NOTDEF must stay undefined so that "#ifdef NOTDEF" sections are skipped.
#ifdef NOTDEF
#undef NOTDEF
#endif
84 |
85 | // portable 64-bit formatting and aligned memory
86 | #if defined(_MSC_VER) || defined(__MINGW32__)
87 | #define PRId64 "I64d"
88 | #define PRIu64 "I64u"
89 | #define PRIx64 "I64x"
90 |
91 | #define align_free _aligned_free
92 | #define ALIGNED_MEM __declspec(align(64))
93 |
94 | #elif defined(__x86_64__)
95 |
96 | #define align_free free
97 | #if defined (__INTEL_COMPILER)
98 | #define ALIGNED_MEM __declspec(align(64))
99 | #else
100 | #define ALIGNED_MEM __attribute__((aligned(64)))
101 | #endif
102 |
103 | #define PRId64 "ld"
104 | #define PRIu64 "lu"
105 | #define PRIx64 "lx"
106 | #define BSCu "lu"
107 | #define BSCx "lx"
108 | #define BSCu0 "019lu" // base string conversion with leading zeros
109 | #define BSCx0 "019lx" // base string conversion with leading zeros
110 | #elif defined(__i386__)
111 |
112 | #define align_free free
113 | #if defined (__INTEL_COMPILER)
114 | #define ALIGNED_MEM __declspec(align(64))
115 | #else
116 | #define ALIGNED_MEM __attribute__((aligned(64)))
117 | #endif
118 |
119 | #define PRId64 "lld"
120 | #define PRIu64 "llu"
121 | #define PRIx64 "llx"
122 | #define BSCu "u"
123 | #define BSCx "x"
124 | #define BSCu0 "09u"
125 | #define BSCx0 "09x"
126 | #endif
127 |
128 | #ifdef _MSC_VER
129 | #define strto_uint64 _strtoui64
130 | #else
131 | #define strto_uint64 strtoull
132 | #endif
133 |
134 |
135 | // ============================================================================
136 | // memory allocation
137 | // ============================================================================
/*
 * Allocate len bytes on a 64-byte boundary.
 *
 * Prints a message and calls exit(-1) when the allocation fails, so the
 * returned pointer is never NULL.  Release the memory with align_free().
 *
 * NOTE: the last fallback (unknown compiler) still uses plain malloc and
 * therefore only provides malloc's default alignment.
 */
static __inline void * xmalloc_align(size_t len)
{
    void *ptr = NULL;

#if defined (_MSC_VER) || defined(__MINGW32__)
    ptr = _aligned_malloc(len, 64);
#elif defined (__APPLE__)
    // macOS has no memalign and plain malloc does not promise 64-byte
    // alignment; posix_memalign does, and its result is released with free
    if (posix_memalign(&ptr, 64, len) != 0)
        ptr = NULL;
#define align_free free
#elif defined (__GNUC__)
    ptr = memalign(64, len);
#define align_free free
#else
    ptr = malloc(len);
#endif

    if (ptr == NULL) {
        printf("failed to allocate %u aligned bytes\n", (uint32_t)len);
        exit(-1);
    }

    return ptr;
}
158 |
/*
 * malloc wrapper that never returns NULL: on failure it prints a message
 * and terminates the process with exit(-1).
 */
static __inline void * xmalloc(size_t len) {
    void *block = malloc(len);

    if (block != NULL) {
        return block;
    }

    printf("failed to allocate %u bytes\n", (uint32_t)len);
    exit(-1);
}
167 |
/*
 * calloc wrapper that never returns NULL: on failure it prints a message
 * and terminates the process with exit(-1).
 */
static __inline void * xcalloc(size_t num, size_t len) {
    void *block = calloc(num, len);

    if (block != NULL) {
        return block;
    }

    printf("failed to calloc %u bytes\n", (uint32_t)(num * len));
    exit(-1);
}
176 |
/*
 * realloc wrapper that never returns NULL: when realloc reports failure it
 * prints a message and terminates the process with exit(-1).
 */
static __inline void * xrealloc(void *iptr, size_t len) {
    void *block = realloc(iptr, len);

    if (block != NULL) {
        return block;
    }

    printf("failed to reallocate %u bytes\n", (uint32_t)len);
    exit(-1);
}
185 |
186 | // ============================================================================
187 | // randomness
188 | // ============================================================================
189 | typedef struct
190 | {
191 | uint32_t hi;
192 | uint32_t low;
193 | } rand_t;
194 |
195 | uint32_t spRand(uint64_t *state, uint32_t lower, uint32_t upper);
196 | uint64_t spRand64(uint64_t *state);
197 | uint64_t spRand64_range(uint64_t *state, uint64_t lower, uint64_t upper);
198 | void get_random_seeds(rand_t *r);
199 |
// NOTE(review): these are variable definitions inside a header, so every
// translation unit that includes util.h gets its own (tentative) definition.
// Toolchains that default to -fno-common reject that at link time once the
// header is included from more than one .c file; consider "extern" here plus
// a single definition in util.c.
rand_t g_rand;
uint64_t LCGSTATE;
202 |
203 | // ============================================================================
204 | // hashing
205 | // ============================================================================
206 | uint64_t hash64(uint64_t in);
207 |
208 |
209 | // ============================================================================
210 | // sorting (qsort)
211 | // ============================================================================
212 |
/*
 * qsort comparator for uint32_t elements, ascending order.
 * Returns 1, 0 or -1 when *x is greater than, equal to or less than *y.
 */
static int qcomp_uint32(const void *x, const void *y)
{
    const uint32_t lhs = *(const uint32_t *)x;
    const uint32_t rhs = *(const uint32_t *)y;

    // (lhs > rhs) and (lhs < rhs) are each 0 or 1, so the difference is
    // exactly -1, 0 or 1
    return (lhs > rhs) - (lhs < rhs);
}
225 |
226 |
227 | // ============================================================================
228 | // precision time
229 | // ============================================================================
230 | uint64_t read_clock(void);
231 | uint64_t measure_processor_speed(int millisec);
232 |
233 | #ifdef _MSC_VER
234 | #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
235 |
236 | struct timeval
237 | {
238 | long tv_sec;
239 | long tv_usec;
240 | };
241 |
242 | struct timezone
243 | {
244 | int tz_minuteswest; /* minutes W of Greenwich */
245 | int tz_dsttime; /* type of dst correction */
246 | };
247 | #endif
248 |
249 | double my_difftime(struct timeval *, struct timeval *);
250 | #if defined (_MSC_VER)
251 | int gettimeofday(struct timeval *tv, struct timezone *tz);
252 |
// usleep replacement for MSVC builds: the microsecond count is divided down
// to whole milliseconds (fractions are dropped) and handed to Sleep().
static void usleep(uint32_t usec)
{
    const uint32_t millis = usec / 1000;

    Sleep(millis);
}
257 | #endif
258 |
259 | #endif
260 |
--------------------------------------------------------------------------------
/x64_bench/util.c:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 |
30 | #include "util.h"
31 |
32 |
33 | // ============================================================================
34 | // precision time
35 | // ============================================================================
36 | #ifdef _MSC_VER
37 |
38 | /* Core aware timing on Windows, courtesy of Brian Gladman */
39 |
40 | #if defined( _WIN64 )
41 |
42 | #define current_processor_number GetCurrentProcessorNumber
43 |
44 | #else
45 |
// 32-bit MSVC only: identify the current processor with CPUID.
// Executes CPUID leaf 1 and keeps bits 31:24 of EBX (documented by Intel as
// the initial APIC ID).  The value is left in EAX by the assembly; there is
// deliberately no C return statement.
unsigned long current_processor_number(void)
{
    __asm
    {
        mov eax,1
        cpuid
        shr ebx,24
        mov eax, ebx
    }
}
56 |
57 | #endif
58 |
59 | int lock_thread_to_core(void)
60 | { DWORD_PTR afp, afs;
61 |
62 | if(GetProcessAffinityMask(GetCurrentProcess(), &afp, &afs))
63 | {
64 | afp &= (DWORD_PTR)(1 << current_processor_number());
65 | if(SetThreadAffinityMask(GetCurrentThread(), afp))
66 | return EXIT_SUCCESS;
67 | }
68 | return EXIT_FAILURE;
69 | }
70 |
71 | int unlock_thread_from_core(void)
72 | { DWORD_PTR afp, afs;
73 |
74 | if(GetProcessAffinityMask(GetCurrentProcess(), &afp, &afs))
75 | {
76 | if(SetThreadAffinityMask(GetCurrentThread(), afp))
77 | return EXIT_SUCCESS;
78 | }
79 | return EXIT_FAILURE;
80 | }
81 |
82 | double cycles_per_second = 0.0;
83 | double ticks_per_second = 0.0;
84 | double cycles_per_tick = 0.0;
85 |
86 | uint64_t measure_processor_speed(int millisec)
87 | { unsigned long long cycles;
88 |
89 | lock_thread_to_core();
90 | cycles = __rdtsc();
91 | Sleep(millisec);
92 | cycles = __rdtsc() - cycles;
93 | unlock_thread_from_core();
94 | cycles_per_second = 10.0 * (double)cycles;
95 |
96 | if(ticks_per_second == 0.0)
97 | { LARGE_INTEGER ll;
98 | QueryPerformanceFrequency(&ll);
99 | ticks_per_second = (double)ll.QuadPart;
100 | cycles_per_tick = cycles_per_second / ticks_per_second;
101 | }
102 | return cycles;
103 | }
104 |
105 | double get_tsc_time(void)
106 | {
107 | if(cycles_per_second == 0.0)
108 | measure_processor_speed(100);
109 | return __rdtsc() / cycles_per_second;
110 | }
111 |
112 | double get_pfc_time(void)
113 | { LARGE_INTEGER ll;
114 |
115 | if(ticks_per_second == 0.0)
116 | measure_processor_speed(100);
117 | QueryPerformanceCounter(&ll);
118 | return ll.QuadPart / ticks_per_second;
119 | }
120 |
121 | #else
122 |
123 | double cycles_per_second = 0.0;
124 |
125 | uint64_t measure_processor_speed(int millisec)
126 | {
127 | uint64_t cycles;
128 | struct timeval start, stop;
129 | double t_time;
130 |
131 | gettimeofday(&start,NULL);
132 |
133 | cycles = read_clock();
134 | do
135 | {
136 | gettimeofday (&stop, NULL);
137 | t_time = my_difftime (&start, &stop);
138 | }
139 | while (t_time*1000 < millisec);
140 | cycles = read_clock() - cycles;
141 |
142 | return cycles; /* return cycles per second */
143 | }
144 |
145 | #endif
146 |
147 |
// Read a high-resolution counter.
//   gcc-compatible compilers on x86 / x86-64: the raw time-stamp counter.
//   MSVC: the performance counter scaled by cycles_per_tick.
//   anything else: gettimeofday scaled by cycles_per_second (which yields 0
//   for as long as cycles_per_second has not been set).
uint64_t read_clock(void)
{
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__) || defined(GCC_ASM64X))
    uint32_t lo, hi;

    // rdtsc returns the counter in EDX:EAX; volatile keeps the compiler
    // from merging or hoisting successive reads
    __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo));
    return ((uint64_t)hi << 32) | lo;

#elif defined(_MSC_VER)
    LARGE_INTEGER ll;
    QueryPerformanceCounter(&ll);
    return (uint64_t)(ll.QuadPart * cycles_per_tick);
#else
    struct timeval thistime;
    gettimeofday(&thistime, NULL);
    return (uint64_t)(cycles_per_second *
        (thistime.tv_sec + thistime.tv_usec / 1000000.0));
#endif
}
166 |
167 | #ifdef _MSC_VER
168 | int gettimeofday(struct timeval *tv, struct timezone *tz)
169 | {
170 | FILETIME ft;
171 | unsigned __int64 tmpres = 0;
172 | static int tzflag;
173 |
174 | if (NULL != tv)
175 | {
176 | GetSystemTimeAsFileTime(&ft);
177 |
178 | tmpres |= ft.dwHighDateTime;
179 | tmpres <<= 32;
180 | tmpres |= ft.dwLowDateTime;
181 |
182 | /*converting file time to unix epoch*/
183 | tmpres /= 10; /*convert into microseconds*/
184 | tmpres -= DELTA_EPOCH_IN_MICROSECS;
185 | tv->tv_sec = (long)(tmpres / 1000000UL);
186 | tv->tv_usec = (long)(tmpres % 1000000UL);
187 | }
188 |
189 | if (NULL != tz)
190 | {
191 | if (!tzflag)
192 | {
193 | _tzset();
194 | tzflag++;
195 | }
196 | tz->tz_minuteswest = _timezone / 60;
197 | tz->tz_dsttime = _daylight;
198 | }
199 |
200 | return 0;
201 | }
202 | #endif
203 |
// Elapsed time from *start to *end, in seconds (fractional).
double my_difftime(struct timeval * start, struct timeval * end)
{
    double whole = (double)(end->tv_sec - start->tv_sec);
    double micro = (double)(end->tv_usec - start->tv_usec);

    // when the seconds differ and the microsecond difference is negative,
    // borrow one second so the microsecond part becomes non-negative
    if (start->tv_sec != end->tv_sec && micro < 0) {
        whole -= 1;
        micro += 1000000;
    }

    return whole + micro / 1000000.;
}
225 |
226 |
227 | // ============================================================================
228 | // randomness
229 | // ============================================================================
230 | void get_random_seeds(rand_t *r) {
231 |
232 | uint32_t tmp_seed1, tmp_seed2;
233 |
234 | #ifndef WIN32
235 |
236 | FILE *rand_device = fopen("/dev/urandom", "r");
237 |
238 | if (rand_device != NULL) {
239 | fread(&tmp_seed1, sizeof(uint32_t), (size_t)1, rand_device);
240 | fread(&tmp_seed2, sizeof(uint32_t), (size_t)1, rand_device);
241 | fclose(rand_device);
242 | }
243 | else
244 |
245 | #endif
246 | {
247 | /* For everyone else, sample the current time,
248 | the high-res timer (hopefully not correlated to the
249 | current time), and the process ID. Multithreaded
250 | applications should fold in the thread ID too */
251 |
252 | uint64_t high_res_time = read_clock();
253 | tmp_seed1 = ((uint32_t)(high_res_time >> 32) ^
254 | (uint32_t)time(NULL)) *
255 | (uint32_t)getpid();
256 | tmp_seed2 = (uint32_t)high_res_time;
257 | }
258 |
259 | /* The final seeds are the result of a multiplicative
260 | hash of the initial seeds */
261 |
262 | r->low = tmp_seed1 * ((uint32_t)40499 * 65543);
263 | r->hi = tmp_seed2 * ((uint32_t)40499 * 65543);
264 | }
265 |
266 | // Knuth's 64 bit MMIX LCG, using a global 64 bit state variable.
267 | uint32_t spRand(uint64_t *state, uint32_t lower, uint32_t upper)
268 | {
269 | // advance the state of the LCG and return the appropriate result
270 | *state = 6364136223846793005ULL * (*state) + 1442695040888963407ULL;
271 | return lower + (uint32_t)(
272 | (double)(upper - lower) * (double)(*state >> 32) * INV_2_POW_32);
273 | }
274 |
275 | uint64_t spRand64_range(uint64_t *state, uint64_t lower, uint64_t upper)
276 | {
277 | // advance the state of the LCG and return the appropriate result
278 | *state = 6364136223846793005ULL * (*state) + 1442695040888963407ULL;
279 | return lower + (uint64_t)(
280 | (double)(upper - lower) * ((double)*state * INV_2_POW_64));
281 | }
282 |
// Advance the LCG in *state by one step and return the complete new
// 64-bit state (no range reduction).
uint64_t spRand64(uint64_t *state)
{
    const uint64_t next = 6364136223846793005ULL * (*state) + 1442695040888963407ULL;

    *state = next;
    return next;
}
290 |
291 | // ============================================================================
292 | // hashing
293 | // ============================================================================
294 |
// FNV-1 style hash of a single 64-bit value:
// http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
//
// Eight rounds, one per input byte from least to most significant.  Each
// round multiplies the running hash by the prime and then xors the input
// byte into the hash at that byte's own position.
uint64_t hash64(uint64_t in)
{
    const uint64_t prime = 1099511628211ULL;
    uint64_t hash = 14695981039346656037ULL;
    int lane;

    for (lane = 0; lane < 8; lane++)
    {
        // selects byte number "lane" of a 64-bit word
        const uint64_t lane_bits = 0xffULL << (8 * lane);

        hash *= prime;
        hash ^= (in & lane_bits);
    }

    return hash;
}
346 |
347 |
--------------------------------------------------------------------------------
/x64_bench/pmod.c:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021 by The Mayo Clinic, though its Special Purpose
2 | // Processor Development Group (SPPDG). All Rights Reserved Worldwide.
3 | // Licensed under the Apache License, Version 2.0 (the "License"); you may
4 | // not use this file except in compliance with the License. You may obtain
5 | // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
6 | // Unless required by applicable law or agreed to in writing, software
7 | // distributed under the License is distributed on an "AS IS" BASIS,
8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
9 | // including conditions of title, non-infringement, merchantability,
10 | // or fitness for a particular purpose
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 | // This file is a snapshot of a work in progress, originated by Mayo
14 | // Clinic SPPDG.
15 |
16 | /*
17 | Copyright (c) 2021, Ben Buhrow
18 | All rights reserved.
19 |
20 | Redistribution and use in source and binary forms, with or without
21 | modification, are permitted provided that the following conditions are met:
22 |
23 | 1. Redistributions of source code must retain the above copyright notice, this
24 | list of conditions and the following disclaimer.
25 | 2. Redistributions in binary form must reproduce the above copyright notice,
26 | this list of conditions and the following disclaimer in the documentation
27 | and/or other materials provided with the distribution.
28 |
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
30 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
32 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
33 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
35 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 |
40 | The views and conclusions contained in the software and documentation are those
41 | of the authors and should not be interpreted as representing official policies,
42 | either expressed or implied, of the FreeBSD Project.
43 | */
44 | #include "pmod.h"
45 |
46 |
47 | void pmodlib_init(pmod_t *pmod_state)
48 | {
49 | int i;
50 |
51 | // accomodate window sizes up to 8
52 | pmod_state->libpmod_gwin = (bignum **)malloc((1 << MAX_WINSIZE) * sizeof(bignum *));
53 | pmod_state->libpmod_gwin[0] = zInit();
54 |
55 | for (i = 1; i < (1 << MAX_WINSIZE); i++)
56 | {
57 | pmod_state->libpmod_gwin[i] = zInit();
58 | }
59 |
60 | return;
61 | }
62 |
63 | void pmodlib_free(pmod_t *pmod_state)
64 | {
65 | int i;
66 |
67 | for (i = 0; i < (1 << MAX_WINSIZE); i++)
68 | {
69 | zFree(pmod_state->libpmod_gwin[i]);
70 | }
71 | free(pmod_state->libpmod_gwin);
72 |
73 | return;
74 | }
75 |
76 | void lr_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s)
77 | {
78 | bignum *acc;
79 | int i;
80 | int j;
81 |
82 | acc = zInit();
83 | zCopy(mdata->one, acc);
84 |
85 | for (i = NWORDS - 1; i >= 0; i--)
86 | {
87 | for (j = 63; j >= 0; j--)
88 | {
89 | sqr_ptr(mdata, acc, acc, s);
90 | if (b->data[i] & (1ULL << j))
91 | mul_ptr(mdata, acc, a, acc, s);
92 | }
93 | }
94 |
95 | memset(mdata->mtmp1->data, 0, 2 * NWORDS * sizeof(base_t));
96 | zSet1(mdata->mtmp1, 1);
97 | mul_ptr(mdata, acc, mdata->mtmp1, c, s);
98 |
99 | // final check to ensure c < N
100 | i = 1;
101 | for (j = NWORDS - 1; j >= 0; j--)
102 | {
103 | if (c->data[j] > mdata->n->data[j])
104 | break;
105 |
106 | if (c->data[j] < mdata->n->data[j])
107 | {
108 | i = 0;
109 | break;
110 | }
111 | }
112 |
113 | if (i)
114 | {
115 | mpSub(c->data, mdata->n->data, c->data, NWORDS);
116 | }
117 |
118 | zFree(acc);
119 | c->size = NWORDS;
120 |
121 | return;
122 | }
123 |
124 | int get_winsize(void)
125 | {
126 | // the window size is based on minimizing the total number of multiplications
127 | // in the windowed exponentiation. experiments show that this is best;
128 | // the growing size of the table doesn't change the calculus, at least
129 | // on the KNL.
130 | int size;
131 | int muls;
132 | int minmuls = 99999999;
133 | int minsize = 4;
134 |
135 | for (size = 2; size <= 8; size++)
136 | {
137 | muls = (MAXBITS / size) + (1 << size);
138 | if (muls < minmuls)
139 | {
140 | minmuls = muls;
141 | minsize = size;
142 | }
143 | }
144 |
145 | return minsize;
146 | }
147 |
148 | int get_bitwin(bignum *b, int bitloc, int winsize, int winmask)
149 | {
150 | int bstr;
151 | int bitstart = (bitloc - winsize + 1);
152 | int word = bitloc / 64;
153 | int word2 = bitstart / 64;
154 |
155 | bitstart = bitstart % 64;
156 |
157 | if (word == word2)
158 | {
159 | bstr = (b->data[word] >> bitstart) & winmask;
160 | }
161 | else
162 | {
163 | int upperbits = (bitloc % 64) + 1;
164 |
165 | bstr = (b->data[word2] >> bitstart);
166 | bstr |= ((b->data[word]) << (winsize - upperbits));
167 | bstr &= winmask;
168 | }
169 |
170 | return bstr;
171 | }
172 |
173 | int get_oddbitwin(bignum *b, int bitloc, int winsize, int winmask, int *m)
174 | {
175 | int bstr;
176 | int bitstart = (bitloc - winsize + 1);
177 | int word = bitloc / 64;
178 | int word2 = bitstart / 64;
179 |
180 | bitstart = bitstart % 64;
181 |
182 | if (word == word2)
183 | {
184 | bstr = (b->data[word] >> bitstart) & winmask;
185 | }
186 | else
187 | {
188 | int upperbits = (bitloc % 64) + 1;
189 |
190 | bstr = (b->data[word2] >> bitstart);
191 | bstr |= ((b->data[word]) << (winsize - upperbits));
192 | bstr &= winmask;
193 | }
194 |
195 | *m = 0;
196 | while ((bstr & 1) == 0)
197 | {
198 | if (bstr == 0)
199 | break;
200 |
201 | (*m)++;
202 | bstr >>= 1;
203 | }
204 |
205 | return bstr;
206 | }
207 |
208 | void lrwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s)
209 | {
210 | bignum *acc;
211 | int i, j, bit = MAXBITS - 1;
212 | int k = get_winsize();
213 | bignum **g = pmod_state->libpmod_gwin; // storage for windowed method precomputation
214 | int mask;
215 | int bstr;
216 |
217 | mask = 0;
218 | for (j = 0; j < k; j++)
219 | {
220 | mask = (mask << 1) | 1;
221 | }
222 |
223 | acc = zInit();
224 | zCopy(mdata->one, acc);
225 |
226 | // precomputations, b^i for 0 <= i < 2^k
227 | memcpy(g[1]->data, a->data, NWORDS * sizeof(base_t));
228 | for (i = 2; i < (1 << k); i++)
229 | {
230 | mul_ptr(mdata, g[i - 1], a, g[i], s);
231 | }
232 |
233 | // L-R windowed exponentiation. Scan the exponent bit-vector
234 | // backward instead of flipping and shifting it.
235 | while (bit >= 0)
236 | {
237 | if (bit < k)
238 | {
239 | // grab the last bits of the exponent.
240 | // accommodates exponent lengths not divisible
241 | // by the window size
242 | mask = 0x0;
243 | for (j = 0; j < (bit + 1); j++)
244 | {
245 | sqr_ptr(mdata, acc, acc, s);
246 | mask = (mask << 1) | 1;
247 | }
248 |
249 | bstr = b->data[0] & mask;
250 | }
251 | else
252 | {
253 | // grab the next k bits of the exponent.
254 | bstr = get_bitwin(b, bit, k, mask);
255 | for (j = 0; j < k; j++)
256 | {
257 | sqr_ptr(mdata, acc, acc, s);
258 | }
259 | }
260 |
261 | if (bstr > 0)
262 | mul_ptr(mdata, acc, g[bstr], acc, s);
263 |
264 | bit -= k;
265 |
266 | }
267 |
268 | memset(mdata->mtmp1->data, 0, 2 * NWORDS * sizeof(base_t));
269 | zSet1(mdata->mtmp1, 1);
270 | mul_ptr(mdata, acc, mdata->mtmp1, c, s);
271 |
272 | // final check to ensure c < N
273 | i = 1;
274 | for (j = NWORDS - 1; j >= 0; j--)
275 | {
276 | if (c->data[j] > mdata->n->data[j])
277 | break;
278 |
279 | if (c->data[j] < mdata->n->data[j])
280 | {
281 | i = 0;
282 | break;
283 | }
284 | }
285 |
286 | if (i)
287 | {
288 | mpSub(c->data, mdata->n->data, c->data, NWORDS);
289 | }
290 | c->size = NWORDS;
291 |
292 | zFree(acc);
293 |
294 | return;
295 | }
296 |
297 | void lroddwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s)
298 | {
299 | bignum *acc;
300 | int i, j, bit = MAXBITS - 1;
301 | int k = get_winsize();
302 | bignum **g = pmod_state->libpmod_gwin; // storage for windowed method precomputation
303 | int mask;
304 | int bstr;
305 | int m;
306 |
307 | mask = 0;
308 | for (j = 0; j < k; j++)
309 | {
310 | mask = (mask << 1) | 1;
311 | }
312 |
313 | acc = zInit();
314 | zCopy(mdata->one, acc);
315 |
316 | // precomputations, b^i for 0 <= i < 2^k, i odd (except i = 2).
317 | // half the setup cost for minimal extra overhead while scanning
318 | // the exponent vector... not as secure because order of
319 | // operations (squaring/multiply) depends on exponent bits.
320 | memcpy(g[1]->data, a->data, NWORDS * sizeof(base_t));
321 | mul_ptr(mdata, g[1], a, g[2], s);
322 | //printf("g[%d] ", 2); zPrint(g[2]); printf("\n");
323 | for (i = 3; i < (1 << k); i += 2)
324 | {
325 | mul_ptr(mdata, g[i - 2], g[2], g[i], s);
326 | //printf("g[%d] ", i); zPrint(g[i]); printf("\n");
327 | }
328 |
329 | //printf("acc init "); zPrint(acc); printf("\n");
330 |
331 | // L-R windowed exponentiation. Scan the exponent bit-vector
332 | // backward instead of flipping and shifting it.
333 | while (bit >= 0)
334 | {
335 | if (bit < (k- 1))
336 | {
337 | // grab the last bits of the exponent.
338 | // accommodates exponent lengths not divisible
339 | // by the window size
340 | mask = 0x0;
341 | k = (bit + 1);
342 | for (j = 0; j < k; j++)
343 | {
344 | mask = (mask << 1) | 1;
345 | }
346 | }
347 |
348 | // grab the next k bits of the exponent.
349 | bstr = get_oddbitwin(b, bit, k, mask, &m);
350 | for (j = 0; j < (k - m); j++)
351 | {
352 | sqr_ptr(mdata, acc, acc, s);
353 | //printf("sqr bit %03d ", bit); zPrint(acc); printf("\n");
354 | }
355 |
356 | if (bstr > 0)
357 | {
358 | mul_ptr(mdata, acc, g[bstr], acc, s);
359 | //printf("mul bit %03d ", bit); zPrint(acc); printf("\n");
360 | }
361 |
362 | for (j = 0; j < m; j++)
363 | {
364 | sqr_ptr(mdata, acc, acc, s);
365 | //printf("sqr bit %03d ", bit); zPrint(acc); printf("\n");
366 | }
367 |
368 | bit -= k;
369 | }
370 |
371 | memset(mdata->mtmp1->data, 0, 2*NWORDS * sizeof(base_t));
372 | zSet1(mdata->mtmp1, 1);
373 | mul_ptr(mdata, acc, mdata->mtmp1, c, s);
374 |
375 | // final check to ensure c < N
376 | i = 1;
377 | for (j = NWORDS - 1; j >= 0; j--)
378 | {
379 | if (c->data[j] > mdata->n->data[j])
380 | break;
381 |
382 | if (c->data[j] < mdata->n->data[j])
383 | {
384 | i = 0;
385 | break;
386 | }
387 | }
388 |
389 | if (i)
390 | {
391 | mpSub(c->data, mdata->n->data, c->data, NWORDS);
392 | }
393 | c->size = NWORDS;
394 |
395 | zFree(acc);
396 |
397 | return;
398 | }
399 |
400 |
--------------------------------------------------------------------------------
/x64_bench/x64_arith.c:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 | #include "x64_arith.h"
30 |
// Single-word add with carry out.
//   *sum   = (u + v) mod 2^64
//   *carry = 1 if the addition overflowed 64 bits, else 0
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spAdd(uint64_t u, uint64_t v, uint64_t *sum, uint64_t *carry)
{
    uint64_t s, c;

    s = v;
    c = 0;

    __asm__("movq %2, %%rax     \n\t"
        "addq %%rax, %3     \n\t"   /* s += u, CF = carry out */
        "adcq $0, %4        \n\t"   /* c = CF */
        : "=r"(s), "=r"(c)
        : "r"(u), "0"(s), "1"(c)
        : "%rax", "memory", "cc");

    *sum = s;
    *carry = c;

    return;
}
50 |
// Three-operand single-word add.
//   *sum   = (u + v + w) mod 2^64
//   *carry = number of carries generated (0, 1 or 2)
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spAdd3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sum, uint64_t *carry)
{
    uint64_t s, c;

    s = v;
    c = 0;

    __asm__("movq %2, %%rax     \n\t"
        "addq %3, %%rax     \n\t"   /* rax = u + w */
        "adcq $0, %5        \n\t"   /* c += CF */
        "addq %%rax, %4     \n\t"   /* s = v + (u + w) */
        "adcq $0, %5        \n\t"   /* c += CF */
        : "=r"(s), "=r"(c)
        : "r"(u), "r"(w), "0"(s), "1"(c)
        : "%rax", "memory", "cc");

    *sum = s;
    *carry = c;

    return;
}
72 |
// Three-operand single-word subtract.
//   *sub    = (u - v - w) mod 2^64
//   *borrow = number of borrows generated (0, 1 or 2)
//
// The borrow counter is updated before the input w is read, so it is marked
// earlyclobber ("+&r"): without that the compiler is allowed to place w in
// the same register as the counter, which would corrupt w mid-sequence.
// Defined as an ordinary external function (no `__inline`) so that a linkable
// symbol always exists for callers.
void spSub3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sub, uint64_t *borrow)
{
    uint64_t s, b;

    s = v;
    b = 0;

    __asm__("movq %2, %%rax     \n\t"
        "subq %0, %%rax     \n\t"   /* rax = u - v */
        "adcq $0, %1        \n\t"   /* b += CF (borrow) */
        "subq %3, %%rax     \n\t"   /* rax -= w */
        "adcq $0, %1        \n\t"   /* b += CF (borrow) */
        "movq %%rax, %0     \n\t"
        : "+r"(s), "+&r"(b)
        : "r"(u), "r"(w)
        : "%rax", "memory", "cc");

    *sub = s;
    *borrow = b;

    return;
}
95 |
// Single-word subtract with borrow out.
//   *sub    = (u - v) mod 2^64
//   *borrow = 1 if u < v, else 0
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spSub(uint64_t u, uint64_t v, uint64_t *sub, uint64_t *borrow)
{
    uint64_t s, b;

    s = v;
    b = 0;

    __asm__("movq %2, %%rax     \n\t"
        "subq %3, %%rax     \n\t"   /* rax = u - v, CF = borrow */
        "adcq $0, %4        \n\t"   /* b = CF */
        "movq %%rax, %3     \n\t"
        : "=r"(s), "=r"(b)
        : "r"(u), "0"(s), "1"(b)
        : "%rax", "memory", "cc");

    *sub = s;
    *borrow = b;

    return;
}
116 |
// Divide the 128-bit value u[1]:u[0] (u[1] is the high word) by v.
//   *q = quotient, *r = remainder; the return value is always 0.
// Uses the x86-64 `divq` instruction, which raises a divide fault when v is 0
// or when the quotient does not fit in 64 bits (i.e. u[1] >= v); callers must
// avoid those inputs.
// Defined as an ordinary external function (no `__inline`) so that a linkable
// symbol always exists for callers.
uint64_t spDivide(uint64_t *q, uint64_t *r, uint64_t u[2], uint64_t v)
{
    *r = u[1];
    *q = u[0];
    __asm__("divq %4"
        : "=a"(*q), "=d"(*r)
        : "1"(*r), "0"(*q), "r"(v)
        : "cc");    /* divq leaves the arithmetic flags undefined */

    return 0;
}
127 |
// Full 64x64 -> 128-bit multiply.
//   *product = low 64 bits of u * v
//   *carry   = high 64 bits of u * v
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMultiply(uint64_t u, uint64_t v, uint64_t *product, uint64_t *carry)
{
    *product = v;
    *carry = u;

    __asm__("movq %2, %%rax \n\t"   /* rax = u */
        "mulq %3        \n\t"       /* rdx:rax = u * v */
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        : "=r"(*product), "=r"(*carry)
        : "1"(*carry), "0"(*product)
        : "%rax", "%rdx", "memory", "cc");

    return;
}
143 |
// Divide the 128-bit value u1:u0 (u1 is the high word) by v.
//   *q = quotient, *r = remainder; the return value is always 0.
// Uses the x86-64 `divq` instruction, which raises a divide fault when v is 0
// or when the quotient does not fit in 64 bits (i.e. u1 >= v); callers must
// avoid those inputs.
// Defined as an ordinary external function (no `__inline`) so that a linkable
// symbol always exists for callers.
uint64_t spDiv(uint64_t *q, uint64_t *r, uint64_t u1, uint64_t u0, uint64_t v)
{
    *r = u1;
    *q = u0;
    __asm__("divq %4"
        : "=a"(*q), "=d"(*r)
        : "1"(*r), "0"(*q), "r"(v)
        : "cc");    /* divq leaves the arithmetic flags undefined */

    return 0;
}
154 |
// Return (u1:u0) mod v, where u1 is the high word of a 128-bit dividend.
// Uses the x86-64 `divq` instruction, which raises a divide fault when v is 0
// or when the quotient does not fit in 64 bits (i.e. u1 >= v); callers must
// avoid those inputs.
// Defined as an ordinary external function (no `__inline`) so that a linkable
// symbol always exists for callers.
uint64_t spMod(uint64_t u1, uint64_t u0, uint64_t v)
{
    __asm__("divq %4"
        : "=a"(u0), "=d"(u1)
        : "1"(u1), "0"(u0), "r"(v)
        : "cc");    /* divq leaves the arithmetic flags undefined */

    /* rdx (now in u1) holds the remainder */
    return u1;
}
163 |
// Full 64x64 -> 128-bit multiply (same contract as spMultiply).
//   *product = low 64 bits of u * v
//   *carry   = high 64 bits of u * v
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMul(uint64_t u, uint64_t v, uint64_t *product, uint64_t *carry)
{
    *product = v;
    *carry = u;

    __asm__("movq %2, %%rax \n\t"   /* rax = v */
        "mulq %3        \n\t"       /* rdx:rax = v * u */
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        : "=r"(*product), "=r"(*carry)
        : "0"(*product), "1"(*carry)
        : "rax", "rdx", "cc");

    return;
}
179 |
// Multiply-add: computes the 128-bit value u * v + w.
//   *product = low 64 bits, *carry = high 64 bits
// The result cannot overflow 128 bits.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulAdd1(uint64_t u, uint64_t v, uint64_t w,
    uint64_t *product, uint64_t *carry)
{
    *product = v;
    *carry = u;

    __asm__("movq %2, %%rax \n\t"   /* rax = u */
        "mulq %3        \n\t"       /* rdx:rax = u * v */
        "addq %4, %%rax     \n\t"   /* add w to the low word */
        "adcq $0, %%rdx     \n\t"   /* propagate the carry */
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        : "=r"(*product), "=r"(*carry)
        : "1"(*carry), "0"(*product), "r"(w)
        : "rax", "rdx", "cc");

    return;
}
198 |
// Multiply-add with two addends: computes the 128-bit value u * v + w + c.
//   *product = low 64 bits, *carry = high 64 bits
// Even with all inputs at their maximum the result fits in 128 bits.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulAdd2(uint64_t u, uint64_t v, uint64_t w,
    uint64_t c, uint64_t *product, uint64_t *carry)
{
    *product = v;
    *carry = u;

    __asm__("movq %2, %%rax \n\t"   /* rax = u */
        "mulq %3        \n\t"       /* rdx:rax = u * v */
        "addq %4, %%rax     \n\t"   /* + w */
        "adcq $0, %%rdx     \n\t"
        "addq %5, %%rax     \n\t"   /* + c */
        "adcq $0, %%rdx     \n\t"
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        : "=r"(*product), "=r"(*carry)
        : "1"(*carry), "0"(*product), "r"(w), "r"(c)
        : "rax", "rdx", "cc");

    return;
}
219 |
// Multiply-add with two addends using the BMI2 `mulx` instruction:
// computes the 128-bit value u * v + w + c.
//   *product = low 64 bits, *carry = high 64 bits
// Requires a CPU with BMI2.
//
// maximum in all inputs won't overflow outputs:
// 0xffffffffffffffff ^ 2 + 2 * 0xffffffffffffffff = 0xffffffffffffffffffffffffffffffff
//
// The w + c partial sum is formed in a private read-write scratch operand.
// (Adding into an operand declared input-only is not permitted: the compiler
// assumes input registers are unchanged and may share them with outputs.)
// All written operands are earlyclobber because they are modified while
// inputs are still to be read.
// Defined as an ordinary external function (no `__inline`) so that a linkable
// symbol always exists for callers.
void spMulAdd2x(uint64_t u, uint64_t v, uint64_t w,
    uint64_t c, uint64_t *product, uint64_t *carry)
{
    uint64_t lo, hi;
    uint64_t addend = w;

    __asm__("addq %[cin], %[add]        \n\t"   /* addend = w + c, CF = carry out */
        "mulx %[mul], %[lo], %[hi]  \n\t"       /* hi:lo = rdx * v; mulx does not touch flags */
        "adcq $0, %[hi]             \n\t"       /* add carry of (w + c) into the high word */
        "addq %[add], %[lo]         \n\t"       /* low word += (w + c) */
        "adcq $0, %[hi]             \n\t"       /* carry prop into the high word */
        : [lo] "=&r"(lo), [hi] "=&r"(hi), [add] "+&r"(addend)
        : "d"(u), [mul] "r"(v), [cin] "r"(c)
        : "cc");

    *product = lo;
    *carry = hi;

    return;
}
241 |
// Multi-word subtract: w = u - n over sz 64-bit words (least significant
// word first).
//
// If a borrow remains after the top word it is subtracted from u[sz] and the
// result stored in w[sz], so in that case both arrays must provide sz + 1
// words. No extra word is touched when u >= n.
//
// Defined as an ordinary external function: this routine is called from other
// translation units, and a non-static `__inline` definition does not
// guarantee a linkable symbol under C99/C11 inline rules.
void mpSub(uint64_t * u, uint64_t * n, uint64_t * w, int sz)
{
    int i;
    uint64_t b = 0;

    for (i = 0; i < sz; i++)
    {
        spSub3(u[i], n[i], b, &w[i], &b);
    }

    // propagate a final borrow into the next word
    if (b)
        spSub(u[i], b, &w[i], &b);

    return;
}
258 |
// Subtract the single word n from the sz-word value u, storing into w
// (least significant word first). u[0] is always read, so sz is expected to
// be at least 1.
//
// If a borrow remains after the top word it is subtracted from u[sz] and the
// result stored in w[sz], so in that case both arrays must provide sz + 1
// words.
//
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void mpSub1(uint64_t * u, uint64_t n, uint64_t * w, int sz)
{
    int i = 0;
    uint64_t b;

    b = 0;
    spSub3(u[i], n, b, &w[i], &b);
    i++;

    // ripple the borrow through the remaining words
    while (i < sz)
    {
        spSub(u[i], b, &w[i], &b);
        i++;
    }

    // propagate a final borrow into the next word
    if (b)
        spSub(u[i], b, &w[i], &b);

    return;
}
278 |
// for use with product scanning approach...
// Treats w[0..2] as a three-word accumulator (w[0] least significant) and
// adds the 128-bit product u * v into it. A carry out of w[2] is discarded.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulAddc(uint64_t u, uint64_t v, uint64_t * w)
{
    __asm__("movq %0, %%rax \n\t"
        "mulq %1        \n\t"       /* rdx:rax = u * v */
        "movq 16(%2), %%r10     \n\t"   /* r10 = w[2] */
        "addq 0(%2), %%rax      \n\t"   /* low  += w[0] */
        "adcq 8(%2), %%rdx      \n\t"   /* high += w[1] + carry */
        "adcq $0, %%r10     \n\t"       /* w[2] += carry */
        "movq %%rax, 0(%2)      \n\t"
        "movq %%rdx, 8(%2)      \n\t"
        "movq %%r10, 16(%2)     \n\t"
        :
        : "r"(u), "r"(v), "r"(w)
        : "rax", "rdx", "r10", "cc", "memory");

    return;
}
300 |
// for use with product scanning approach...
// Treats w[0..2] as a three-word accumulator (w[0] least significant) and
// adds both 128-bit products u * v and n * s into it, propagating carries
// into w[2]. A carry out of w[2] is discarded.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMul2Acc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w)
{
    __asm__("movq %0, %%rax \n\t"
        "mulq %1        \n\t"           /* rdx:rax = u * v */
        "movq 16(%4), %%r10     \n\t"   /* r10 = w[2] */
        "addq 0(%4), %%rax      \n\t"
        "movq %%rax, %%r11      \n\t"   /* r11 = running low word */
        "adcq 8(%4), %%rdx      \n\t"
        "movq %%rdx, %%r12      \n\t"   /* r12 = running middle word */
        "adcq $0, %%r10     \n\t"
        "movq %2, %%rax     \n\t"
        "mulq %3        \n\t"           /* rdx:rax = n * s */
        "addq %%r11, %%rax      \n\t"
        "adcq %%r12, %%rdx      \n\t"
        "adcq $0, %%r10     \n\t"
        "movq %%rax, 0(%4)      \n\t"
        "movq %%rdx, 8(%4)      \n\t"
        "movq %%r10, 16(%4)     \n\t"
        :
        : "r"(u), "r"(v), "r"(n), "r"(s), "r"(w)
        : "rax", "rdx", "r10", "r11", "r12", "cc", "memory");

    return;
}
331 |
// for use with product scanning approach...
// Adds the 128-bit product u * v into the three-word accumulator w[0..2]
// (w[0] least significant), then rotates the accumulator down by one word:
// the updated middle word is stored to w[0], the updated top word to w[1],
// and w[2] is cleared. The updated low word is discarded.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulAddcr(uint64_t u, uint64_t v, uint64_t * w)
{
    __asm__("movq %0, %%rax \n\t"
        "mulq %1        \n\t"           /* rdx:rax = u * v */
        "movq 16(%2), %%r10     \n\t"   /* r10 = w[2] */
        "addq 0(%2), %%rax      \n\t"
        "adcq 8(%2), %%rdx      \n\t"
        "adcq $0, %%r10     \n\t"
        "xorq %%rax, %%rax      \n\t"   /* zero for the new top word */
        "movq %%rdx, 0(%2)      \n\t"   /* final output rotation */
        "movq %%r10, 8(%2)      \n\t"
        "movq %%rax, 16(%2)     \n\t"
        :
        : "r"(u), "r"(v), "r"(w)
        : "rax", "rdx", "r10", "cc", "memory");

    return;
}
355 |
// for use with sos squaring approach...
// Computes 2 * (u * v), adds it to the two-word value w[1]:w[0], and adds
// carryin to the second word. The low two words of the result are written
// back to w[0] and w[1]; *carryout receives everything that overflowed past
// w[1] (the doubling bit plus any addition carries).
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulDblAdd_1(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout)
{
    __asm__("movq %3, %%rax \n\t"
        "mulq %4        \n\t"           /* rdx:rax = u * v */
        "xorq %%r10, %%r10      \n\t"   /* r10 collects overflow */
        "addq %%rax, %%rax      \n\t"   /* double the product ... */
        "adcq %%rdx, %%rdx      \n\t"
        "adcq $0, %%r10     \n\t"       /* ... keeping the bit shifted out */
        "addq %5, %%rax     \n\t"       /* += w[0] */
        "adcq %6, %%rdx     \n\t"       /* += w[1] + carry */
        "adcq $0, %%r10     \n\t"
        "adcq %7, %%rdx     \n\t"       /* += carryin; CF is 0 here because r10 is tiny */
        "adcq $0, %%r10     \n\t"
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        "movq %%r10, %2     \n\t"
        : "=r"(w[0]), "=r"(w[1]), "=r"(*carryout)
        : "r"(u), "r"(v), "0"(w[0]), "1"(w[1]), "r"(carryin)
        : "rax", "rdx", "r10", "cc");

    return;
}
382 |
// for use with sos squaring approach...
// Computes 2 * (u * v), adds it to the two-word value w[1]:w[0], and adds
// carryin to the second word. The low two words of the result are written
// back to w[0] and w[1]; *carryout receives everything that overflowed past
// w[1].
// same approach as _1, except instead of add and adc we use shldq/shl
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulDblAdd_2(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout)
{
    __asm__("movq %3, %%rax \n\t"
        "mulq %4        \n\t"           /* rdx:rax = u * v */
        "xorq %%r10, %%r10      \n\t"   /* r10 collects overflow */
        "shldq $1, %%rax, %%rdx     \n\t"   /* high word doubled; CF = bit shifted out */
        "adcq $0, %%r10     \n\t"
        "shlq $1, %%rax     \n\t"       /* low word doubled */
        "addq %5, %%rax     \n\t"       /* += w[0] */
        "adcq %6, %%rdx     \n\t"       /* += w[1] + carry */
        "adcq $0, %%r10     \n\t"
        "adcq %7, %%rdx     \n\t"       /* += carryin; CF is 0 here because r10 is tiny */
        "adcq $0, %%r10     \n\t"
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        "movq %%r10, %2     \n\t"
        : "=r"(w[0]), "=r"(w[1]), "=r"(*carryout)
        : "r"(u), "r"(v), "0"(w[0]), "1"(w[1]), "r"(carryin)
        : "rax", "rdx", "r10", "cc");

    return;
}
410 |
// for use with fips squaring approach...
// Computes 2 * (u * v) and adds it into the three-word accumulator w[0..2]
// (w[0] least significant). A carry out of w[2] is discarded.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spMulDblAdd_3(uint64_t u, uint64_t v, uint64_t * w)
{
    __asm__("movq %3, %%rax \n\t"
        "mulq %4        \n\t"           /* rdx:rax = u * v */
        "movq %7, %%r10     \n\t"       /* r10 = w[2] */
        "addq %%rax, %%rax      \n\t"   /* double the product ... */
        "adcq %%rdx, %%rdx      \n\t"
        "adcq $0, %%r10     \n\t"       /* ... carrying the top bit into w[2] */
        "addq %5, %%rax     \n\t"       /* += w[0] */
        "adcq %6, %%rdx     \n\t"       /* += w[1] + carry */
        "adcq $0, %%r10     \n\t"
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        "movq %%r10, %2     \n\t"
        : "=r"(w[0]), "=r"(w[1]), "=r"(w[2])
        : "r"(u), "r"(v), "0"(w[0]), "1"(w[1]), "r"(w[2])
        : "rax", "rdx", "r10", "cc");

    return;
}
435 |
// for use with fips squaring approach on cross-terms...
// Adds 2 * (u * v) and then n * s into the three-word accumulator w[0..2]
// (w[0] least significant). A carry out of w[2] is discarded.
// Defined as an ordinary external function: a non-static `__inline`
// definition in a .c file is only an "inline definition" under C99/C11 and
// does not guarantee a linkable symbol for callers.
void spSqrMulAcc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w)
{
    __asm__("movq %3, %%rax \n\t"
        "mulq %4        \n\t"           /* rdx:rax = u * v */
        "movq %7, %%r10     \n\t"       /* r10 = w[2] */
        "addq %%rax, %%rax      \n\t"   /* double the product */
        "adcq %%rdx, %%rdx      \n\t"
        "adcq $0, %%r10     \n\t"
        "addq %5, %%rax     \n\t"       /* += w[0] */
        "movq %%rax, %%r11      \n\t"   /* r11 = running low word */
        "adcq %6, %%rdx     \n\t"       /* += w[1] + carry */
        "movq %%rdx, %%r12      \n\t"   /* r12 = running middle word */
        "adcq $0, %%r10     \n\t"
        "movq %8, %%rax     \n\t"
        "mulq %9        \n\t"           /* rdx:rax = n * s */
        "addq %%r11, %%rax      \n\t"
        "adcq %%r12, %%rdx      \n\t"
        "adcq $0, %%r10     \n\t"
        "movq %%rax, %0     \n\t"
        "movq %%rdx, %1     \n\t"
        "movq %%r10, %2     \n\t"
        : "=r"(w[0]), "=r"(w[1]), "=r"(w[2])
        : "r"(u), "r"(v), "0"(w[0]), "1"(w[1]), "r"(w[2]), "r"(n), "r"(s)
        : "rax", "rdx", "r10", "r11", "r12", "cc");

    return;
}
469 |
// Add the single word n to the sz-word value u, storing into w.
// assume u and w point to the same thing, so we
// can stop as soon as there is no carry.
// If a carry is still pending once index sz is reached, it is added to u[sz]
// and stored in w[sz], so the arrays need sz + 1 words in that case.
void mpAdd1b(uint64_t * u, uint64_t n, uint64_t * w, int sz)
{
    uint64_t carry = 0;
    int idx;

    spAdd3(u[0], n, carry, &w[0], &carry);

    // ripple only while a carry is outstanding
    for (idx = 1; (idx < sz) && (carry > 0); idx++)
    {
        spAdd(u[idx], carry, &w[idx], &carry);
    }

    if (carry)
    {
        spAdd(u[idx], carry, &w[idx], &carry);
    }
}
490 |
// Add the single word n to the sz-word value u, storing every word of the
// result into w (least significant word first).
// If a carry remains after the top word, it is added to u[sz] and stored in
// w[sz], so the arrays need sz + 1 words in that case.
void mpAdd1(uint64_t * u, uint64_t n, uint64_t * w, int sz)
{
    uint64_t carry = 0;
    int idx;

    spAdd3(u[0], n, carry, &w[0], &carry);

    // visit every remaining word, carry or not
    for (idx = 1; idx < sz; idx++)
    {
        spAdd(u[idx], carry, &w[idx], &carry);
    }

    if (carry)
    {
        spAdd(u[idx], carry, &w[idx], &carry);
    }
}
510 |
// Multi-word add: w = u + v over sz 64-bit words (least significant first).
// The final carry is always written to w[sz], so w must hold sz + 1 words.
void mpAdd(uint64_t * u, uint64_t * v, uint64_t * w, int sz)
{
    uint64_t carry = 0;
    int idx = 0;

    while (idx < sz)
    {
        spAdd3(u[idx], v[idx], carry, &w[idx], &carry);
        idx++;
    }

    // carry out becomes the extra top word
    w[idx] = carry;
}
525 |
526 |
--------------------------------------------------------------------------------
/vecarith.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2014, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 |
29 |
30 | Copyright (c) 2018 by The Mayo Clinic, though its Special Purpose
31 | Processor Development Group (SPPDG). All Rights Reserved Worldwide.
32 | Licensed under the Apache License, Version 2.0 (the "License"); you may
33 | not use this file except in compliance with the License. You may obtain
34 | a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
35 | Unless required by applicable law or agreed to in writing, software
36 | distributed under the License is distributed on an "AS IS" BASIS,
37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
38 | including conditions of title, non-infringement, merchantability,
39 | or fitness for a particular purpose
40 | See the License for the specific language governing permissions and
41 | limitations under the License.
42 | This file is a snapshot of a work in progress, originated by Mayo
43 | Clinic SPPDG.
44 | */
45 |
46 | #include
47 | #include
48 | #include
49 | #include
50 | #include
51 | #include
52 | #include
53 | #include
54 | #include //for gettimeofday using gcc
55 | #include
56 | #include
57 |
58 | // ============================================================================
59 | // vecarith config
60 | // ============================================================================
// Digit configuration. Defining BASE52 at build time selects 52-bit digits
// held in 64-bit words with VECLEN = 8; otherwise 32-bit digits held in
// 32-bit words with VECLEN = 16 are used.
//   DIGITBITS      bits per digit
//   base_t         unsigned storage type of one digit
//   base_signed_t  signed type of the same width
//   HALFBITS/HALFMASK  half the digit width and the matching mask
//   MAXDIGIT       largest digit value
//   HIBITMASK      top bit of a digit
#ifdef BASE52
#define DIGITBITS 52
#define base_t uint64_t
#define base_signed_t int64_t
// for 52-bit radix
#define HALFBITS 26
#define HALFMASK 0x3ffffff
#define MAXDIGIT 0xfffffffffffffULL
#define HIBITMASK 0x8000000000000ULL
#define VECLEN 8
#else
#define DIGITBITS 32
#define base_t uint32_t
#define base_signed_t int32_t
#define HALFBITS 16
#define HALFMASK 0xffff
#define MAXDIGIT 0xffffffff
#define HIBITMASK 0x80000000
#define VECLEN 16
#endif

// MAXBITS may be supplied by the build; it defaults to 512.
#ifndef MAXBITS
#define MAXBITS 512
#endif
// number of digits per bignum (integer division, so MAXBITS should be a
// multiple of DIGITBITS)
#define NWORDS (MAXBITS / DIGITBITS)
#define DEFINED 1
#define MAX_WINSIZE 8
88 |
89 | // ============================================================================
90 | // useful definitions
91 | // ============================================================================
// NOTE: MIN and MAX evaluate one of their arguments twice; avoid arguments
// with side effects. SIGN yields 1 for zero.
#define MIN(a,b) ((a) < (b)? (a) : (b))
#define MAX(a,b) ((a) > (b)? (a) : (b))
#define SIGN(a) ((a) < 0 ? -1 : 1)

// floating-point reciprocals of powers of two, plus pi and ln(2)
#define INV_2_POW_48 3.5527136788005009293556213378906e-15
#define INV_2_POW_52 2.2204460492503130808472633361816e-16
#define INV_2_POW_64 5.4210108624275221700372640043497e-20
#define INV_2_POW_26 1.490116119384765625e-8
#define INV_2_POW_32 2.3283064365386962890625e-10
#define PI 3.1415926535897932384626433832795
#define LN2 0.69314718055994530941723212145818
// string -> unsigned 64-bit conversion routine for the current compiler
#ifdef _MSC_VER
#define strto_uint64 _strtoui64
#else
#define strto_uint64 strtoull
#endif
108 |
109 | // portable 64-bit formatting
// printf conversion strings (without the leading '%') for 64-bit values
// (PRI*64) and for the BSC* family.
// NOTE(review): the MSVC/MinGW branch defines no BSC* macros, and targets
// matching none of the three branches get no definitions at all.
// NOTE(review): these names collide with <inttypes.h> if that header is also
// included - confirm it is not.
#if defined(_MSC_VER) || defined(__MINGW32__)
#define PRId64 "I64d"
#define PRIu64 "I64u"
#define PRIx64 "I64x"
#elif defined(__x86_64__)
#define PRId64 "ld"
#define PRIu64 "lu"
#define PRIx64 "lx"
#define BSCu "lu"
#define BSCx "lx"
#define BSCu0 "019lu" // base string conversion with leading zeros
#define BSCx0 "019lx" // base string conversion with leading zeros
#elif defined(__i386__)
#define PRId64 "lld"
#define PRIu64 "llu"
#define PRIx64 "llx"
#define BSCu "u"
#define BSCx "x"
#define BSCu0 "09u"
#define BSCx0 "09x"
#endif
131 |
132 |
// Declaration attribute requesting 64-byte alignment, spelled for the Intel
// compiler or for GCC-compatible compilers.
#if defined (__INTEL_COMPILER)
#define ALIGNED_MEM __declspec(align(64))
#else
#define ALIGNED_MEM __attribute__((aligned(64)))
#endif
138 |
139 |
140 | // ============================================================================
141 | // memory allocation
142 | // ============================================================================
// Release routine matching xmalloc_align(). Defined for every platform so
// that callers compile everywhere (previously it was only defined on the
// MSVC/MinGW and generic GCC paths).
#ifndef align_free
#if defined (_MSC_VER) || defined(__MINGW32__)
#define align_free _aligned_free
#else
#define align_free free
#endif
#endif

// Allocate len bytes aligned to a 64-byte boundary; release the block with
// align_free(). Prints a message and terminates the process with exit(-1) if
// the allocation fails, so the return value is never NULL.
//   - MSVC / MinGW : _aligned_malloc
//   - macOS        : posix_memalign (plain malloc gives no 64-byte guarantee)
//   - other GCC    : memalign
//   - anything else: plain malloc; 64-byte alignment is NOT guaranteed there
static __inline void * xmalloc_align(size_t len)
{
    void *ptr = NULL;

#if defined (_MSC_VER) || defined(__MINGW32__)
    ptr = _aligned_malloc(len, 64);
#elif defined (__APPLE__)
    // posix_memalign returns non-zero on failure and leaves ptr unspecified
    if (posix_memalign(&ptr, 64, len) != 0)
    {
        ptr = NULL;
    }
#elif defined (__GNUC__)
    ptr = memalign(64, len);
#else
    ptr = malloc(len);
#endif

    if (ptr == NULL) {
        printf("failed to allocate %u aligned bytes\n", (uint32_t)len); fflush(stdout);
        exit(-1);
    }

    return ptr;
}
164 |
// malloc() wrapper that never returns NULL: on failure it prints a message
// and terminates the process with exit(-1).
static __inline void * xmalloc(size_t len) {
    void *mem = malloc(len);

    if (mem != NULL) {
        return mem;
    }

    printf("failed to allocate %u bytes\n", (uint32_t)len); fflush(stdout);
    exit(-1);
}
173 |
// calloc() wrapper that never returns NULL: on failure it prints a message
// and terminates the process with exit(-1). The returned memory is zeroed.
static __inline void * xcalloc(size_t num, size_t len) {
    void *mem = calloc(num, len);

    if (mem != NULL) {
        return mem;
    }

    printf("failed to calloc %u bytes\n", (uint32_t)(num * len)); fflush(stdout);
    exit(-1);
}
182 |
// realloc() wrapper: returns the resized block, or prints a message and
// terminates the process with exit(-1) when realloc() returns NULL.
static __inline void * xrealloc(void *iptr, size_t len) {
    void *mem = realloc(iptr, len);

    if (mem != NULL) {
        return mem;
    }

    printf("failed to reallocate %u bytes\n", (uint32_t)len); fflush(stdout);
    exit(-1);
}
191 |
192 | // ============================================================================
193 | // vector bignum structure
194 | // ============================================================================
// Big-number container: a pointer to digit storage plus a size field.
// NOTE(review): per the section heading, data presumably holds VECLEN
// interleaved numbers per digit position - confirm against vecInit().
typedef struct
{
base_t *data;   // digit storage (base_t words)
int size;       // presumably the number of digits in use - TODO confirm
} bignum;
200 |
201 | // ============================================================================
202 | // montgomery arithmetic
203 | // ============================================================================
// State for Montgomery arithmetic; allocated by monty_alloc(), released by
// monty_free(), and passed to the vecmulmod/vecsqrmod/vecmodexp routines.
// NOTE(review): the field roles below are inferred from their names only -
// confirm against monty_init_vec().
typedef struct
{
bignum *r;
bignum *n;          // presumably the modulus
bignum *nhat;
bignum *vnhat;
bignum *rhat;
bignum *rmask;
bignum *one;        // presumably the value 1 in Montgomery form
bignum *mtmp1;      // mtmp1..mtmp4: presumably scratch bignums
bignum *mtmp2;
bignum *mtmp3;
bignum *mtmp4;
bignum **g; // storage for windowed method precomputation
base_t *vrho;
base_t rho;
} monty;
221 |
222 | monty* monty_alloc(void);
223 | void monty_free(monty *mdata);
224 | void monty_init_vec(monty *mdata, bignum * n, int verbose);
225 | int get_winsize(void);
226 | int get_bitwin(bignum* e, int bitloc, int winsize, int lane, int winmask);
227 | // 32-bit words, 16x
228 | int vec_montgomery_setup(bignum * a, bignum *r, bignum *rhat, base_t *rho);
229 | void vecmulmod(bignum *a, bignum *b, bignum *c, bignum *n, bignum *s, monty *mdata);
230 | void vecsqrmod(bignum *a, bignum *c, bignum *n, bignum *s, monty *mdata);
231 | void vecmodexp(bignum *d, bignum *b, bignum *e, bignum *m,
232 | bignum *s, bignum *one, monty *mdata);
233 | // 52-bit words, 8x
234 | int vec_montgomery_setup52(bignum * a, bignum *r, bignum *rhat, base_t *rho);
235 | void vecmulmod52(bignum *a, bignum *b, bignum *c, bignum *n, bignum *s, monty *mdata);
236 | void vecsqrmod52(bignum *a, bignum *c, bignum *n, bignum *s, monty *mdata);
237 | void vecmodexp52(bignum *d, bignum *b, bignum *e, bignum *m,
238 | bignum *s, bignum *one, monty *mdata);
239 |
240 | extern void(*vecmulmod_ptr)(bignum *, bignum *, bignum *, bignum *, bignum *, monty *);
241 | extern void(*vecsqrmod_ptr)(bignum *, bignum *, bignum *, bignum *, monty *);
242 | extern int(*montsetup_ptr)(bignum *, bignum *, bignum *, base_t *);
243 | extern void(*vecmodexp_ptr)(bignum *, bignum *, bignum *, bignum *, bignum *, bignum *, monty *m);
244 |
245 | // ============================================================================
246 | // vector bignum arithmetic and conversions
247 | // ============================================================================
248 | bignum * vecInit(void);
249 | void vecCopy(bignum * src, bignum * dest);
250 | void vecCopyn(bignum * src, bignum * dest, int size);
251 | void vecClear(bignum *n);
252 | void vecFree(bignum *n);
253 | void broadcast_bignum_to_vec(bignum *src, bignum *vec_dest);
254 | bignum * bignums_to_vec(bignum **src, int num);
255 | void insert_bignum_in_vec(bignum *src, bignum *vec_dest, int num);
256 | void extract_bignum_from_vec(bignum *vec_src, bignum *dest, int num);
257 | void copy_vec_lane(bignum *src, bignum *dest, int num, int size);
258 | uint32_t vec_gte(bignum * u, bignum * v);
259 | uint32_t vec_mask_gte(uint32_t mask, bignum* u, bignum* v);
260 | uint32_t vec_eq(base_t * u, base_t * v, int sz);
261 | uint32_t vec_bignum_mask_lshift_1(bignum * u, uint32_t wmask);
262 | void vec_bignum_mask_rshift_1(bignum * u, uint32_t wmask);
263 | void vec_bignum_mask_sub(bignum *a, bignum *b, bignum *c, uint32_t wmask);
264 |
265 | // ============================================================================
266 | // vector bignum52 arithmetic and conversions
267 | // ============================================================================
268 | int vec_montgomery_setup52(bignum * a, bignum *r, bignum *rhat, base_t *rho);
269 | void vec_bignum52_mask_sub(bignum *a, bignum *b, bignum *c, uint32_t wmask);
270 | void vec_bignum52_mask_rshift_1(bignum * u, uint32_t wmask);
271 | uint32_t vec_bignum52_mask_lshift_1(bignum * u, uint32_t wmask);
272 | uint32_t vec_eq52(base_t * u, base_t * v, int sz);
273 | uint32_t vec_gte52(bignum * u, bignum * v);
274 |
275 | // ---------------------------------------------------------------------
276 | // emulated instructions
277 | // ---------------------------------------------------------------------
// Emulated "multiply high" for sixteen unsigned 32-bit lanes: each result
// lane holds the upper 32 bits of the 64-bit product a[i] * b[i].
// _mm512_mul_epu32 only multiplies the even 32-bit lanes, so the odd lanes
// are handled by first swapping adjacent lanes (shuffle 0xB1).
__m512i __inline _mm512_mulhi_epu32(__m512i a, __m512i b)
{
__m512i t1 = _mm512_shuffle_epi32(a, 0xB1);
__m512i t2 = _mm512_shuffle_epi32(b, 0xB1);
__m512i evens = _mm512_mul_epu32(a, b);     // 64-bit products of even lanes
__m512i odds = _mm512_mul_epu32(t1, t2);    // 64-bit products of odd lanes
//return _mm512_mask_mov_epi32(_mm512_shuffle_epi32(evens, 0xB1), 0xaaaa, odds);
// even result lanes (mask 0x5555) take the high halves of `evens`, moved
// down by the shuffle; odd result lanes keep the high halves of `odds`.
return _mm512_mask_mov_epi32(odds, 0x5555, _mm512_shuffle_epi32(evens, 0xB1));
}
287 |
// Masked add-with-carry on sixteen 32-bit lanes.
//   lanes selected by m : result = a + b + (carry-in bit from c)
//   other lanes         : result = a
// *cout receives the per-lane carry-out mask.
// NOTE(review): the first compare is not restricted to m, so a lane outside
// m can still report a carry when a + b wraps; _mm512_mask_sbb_epi32 masks
// both compares. Confirm whether callers rely on this.
__m512i __inline _mm512_mask_adc_epi32(__m512i a, __mmask16 m, __mmask16 c, __m512i b, __mmask16 *cout)
{
__m512i t = _mm512_add_epi32(a, b);
*cout = _mm512_cmplt_epu32_mask(t, a);     // a + b wrapped
__m512i t2 = _mm512_mask_add_epi32(a, m, t, _mm512_maskz_set1_epi32(c, 1));
*cout = _mm512_kor(*cout, _mm512_mask_cmplt_epu32_mask(m, t2, t));  // adding the carry-in wrapped
return t2;
}
296 |
// Add-with-carry on sixteen 32-bit lanes, two-compare variant:
// returns a + b + (carry-in bit from c) per lane; *cout is set for a lane
// when either the a + b step or the carry-in step wrapped around.
__m512i __inline _mm512_adc_epi32_test1(__m512i a, __mmask16 c, __m512i b, __mmask16 *cout)
{
__m512i t = _mm512_add_epi32(a, b);
*cout = _mm512_cmplt_epu32_mask(t, a);     // a + b wrapped
__m512i t2 = _mm512_add_epi32(t, _mm512_maskz_set1_epi32(c, 1));
*cout = _mm512_kor(*cout, _mm512_cmplt_epu32_mask(t2, t));  // adding the carry-in wrapped
return t2;
}
305 |
// Add-with-carry on sixteen 32-bit lanes, single-compare variant:
// returns a + b + (carry-in bit from c) per lane. A lane reports carry-out
// when the sum is <= a and something non-zero was actually added (b != 0 or
// carry-in set).
__m512i __inline _mm512_adc_epi32_test2(__m512i a, __mmask16 c, __m512i b, __mmask16 *cout)
{
// looks like a slightly improved data dependency chain...
// but it tested slower for 1024-b inputs...
__m512i t = _mm512_add_epi32(a, b);
__mmask16 gt0 = _mm512_kor(_mm512_test_epi32_mask(b, b), c);   // lanes with a non-zero addend

t = _mm512_add_epi32(t, _mm512_maskz_set1_epi32(c, 1));
*cout = _mm512_kand(_mm512_cmple_epu32_mask(t, a), gt0);
return t;
}
317 |
// Add-with-carry on 16 x 32-bit lanes: returns a + b + carry-in per lane
// and stores the per-lane carry-out in *cout.
__m512i __inline _mm512_adc_epi32(__m512i a, __mmask16 c, __m512i b, __mmask16 *cout)
{
    __m512i total = _mm512_add_epi32(_mm512_add_epi32(a, b),
                                     _mm512_maskz_set1_epi32(c, 1));

    // total < a: the sum wrapped.
    // total == a with a carry-in: exactly 2^32 was added, which also wraps.
    __mmask16 wrapped = _mm512_cmplt_epu32_mask(total, a);
    __mmask16 full_turn = _mm512_cmpeq_epu32_mask(total, a) & c;

    *cout = wrapped | full_turn;
    return total;
}
325 |
// Propagate a carry: adds 1 to every lane of a whose bit is set in c and
// stores in *cout the lanes where that increment wrapped around.
__m512i __inline _mm512_addcarry_epi32(__m512i a, __mmask16 c, __mmask16 *cout)
{
    __m512i carry_in = _mm512_maskz_set1_epi32(c, 1);
    __m512i bumped = _mm512_add_epi32(a, carry_in);

    *cout = _mm512_cmplt_epu32_mask(bumped, a);
    return bumped;
}
332 |
// Propagate a borrow: subtracts 1 from every lane of a whose bit is set in
// c and stores the resulting borrow-out mask in *cout.
//   a    : minuend lanes
//   c    : borrow-in, one bit per lane
//   cout : receives the lanes in which the decrement wrapped below zero
// A lane borrows out only when it was zero AND a borrow actually came in.
// The unsigned "difference greater than minuend" test captures exactly that
// case (and mirrors the wrap test in _mm512_addcarry_epi32); testing a == 0
// alone would report a borrow for zero lanes that had no borrow-in.
__m512i __inline _mm512_subborrow_epi32(__m512i a, __mmask16 c, __mmask16 *cout)
{
    __m512i t = _mm512_sub_epi32(a, _mm512_maskz_set1_epi32(c, 1));
    *cout = _mm512_cmpgt_epu32_mask(t, a);
    return t;
}
339 |
// Masked subtract-with-borrow on 16 x 32-bit lanes.
//   a, b : minuend and subtrahend
//   m    : lanes to update; lanes not in m return a unchanged
//   c    : borrow-in, one bit per lane
//   cout : receives the borrow-out mask (only lanes in m can be set)
__m512i __inline _mm512_mask_sbb_epi32(__m512i a, __mmask16 m, __mmask16 c, __m512i b, __mmask16 *cout)
{
    __m512i borrow_in = _mm512_maskz_set1_epi32(c, 1);
    __m512i diff = _mm512_sub_epi32(a, b);
    __m512i result = _mm512_mask_sub_epi32(a, m, diff, borrow_in);

    // unsigned wrap-around shows up as "difference larger than the minuend"
    __mmask16 wrapped_ab = _mm512_mask_cmpgt_epu32_mask(m, diff, a);
    __mmask16 wrapped_bin = _mm512_mask_cmpgt_epu32_mask(m, result, diff);

    *cout = _mm512_kor(wrapped_ab, wrapped_bin);
    return result;
}
348 |
// Subtract-with-borrow on 16 x 32-bit lanes: returns a - b - borrow-in per
// lane and stores the per-lane borrow-out in *cout.
__m512i __inline _mm512_sbb_epi32(__m512i a, __mmask16 c, __m512i b, __mmask16 *cout)
{
    __m512i diff = _mm512_sub_epi32(a, b);
    __m512i result = _mm512_sub_epi32(diff, _mm512_maskz_set1_epi32(c, 1));

    // a borrow is produced if either of the two subtractions wrapped
    __mmask16 wrap_first = _mm512_cmpgt_epu32_mask(diff, a);
    __mmask16 wrap_second = _mm512_cmpgt_epu32_mask(result, diff);

    *cout = _mm512_kor(wrap_first, wrap_second);
    return result;
}
357 |
// Subtract-with-borrow on 8 x 64-bit lanes: returns a - b - borrow-in per
// lane and stores the per-lane borrow-out in *cout.
__m512i __inline _mm512_sbb_epi64(__m512i a, __mmask8 c, __m512i b, __mmask8 *cout)
{
    __m512i diff = _mm512_sub_epi64(a, b);
    __m512i result = _mm512_sub_epi64(diff, _mm512_maskz_set1_epi64(c, 1));

    // a borrow is produced if either of the two subtractions wrapped
    __mmask8 wrap_first = _mm512_cmpgt_epu64_mask(diff, a);
    __mmask8 wrap_second = _mm512_cmpgt_epu64_mask(result, diff);

    *cout = _mm512_kor(wrap_first, wrap_second);
    return result;
}
366 |
// Plain lane-wise add (no carry-in) that also reports, in *cout, which
// lanes overflowed 32 bits.
__m512i __inline _mm512_addsetc_epi32(__m512i a, __m512i b, __mmask16 *cout)
{
    __m512i sum = _mm512_add_epi32(a, b);

    // an unsigned sum smaller than one of its inputs has wrapped
    *cout = _mm512_cmplt_epu32_mask(sum, a);
    return sum;
}
373 |
// Plain lane-wise subtract (no borrow-in) that also reports, in *cout,
// which lanes needed a borrow, i.e. where b > a as unsigned values.
__m512i __inline _mm512_subsetc_epi32(__m512i a, __m512i b, __mmask16 *cout)
{
    __m512i diff = _mm512_sub_epi32(a, b);

    *cout = _mm512_cmpgt_epu32_mask(b, a);
    return diff;
}
380 |
// Split a vector of 16 x 32-bit values into two vectors of 8 x 64-bit
// values: *e64 gets the even-indexed lanes and *o64 the odd-indexed lanes,
// each zero-extended to 64 bits.
__inline void _mm512_epi32_to_eo64(__m512i a, __m512i *e64, __m512i *o64)
{
    // swap each even/odd pair so the odd lanes land in even position
    __m512i pair_swapped = _mm512_shuffle_epi32(a, 0xB1);

    // keeping only the even 32-bit lanes (mask 0x5555) clears the upper
    // half of every 64-bit element
    *e64 = _mm512_maskz_mov_epi32(0x5555, a);
    *o64 = _mm512_maskz_mov_epi32(0x5555, pair_swapped);
}
387 |
// Interleave the LOW 32 bits of the 64-bit elements of e64 (into even
// lanes) and o64 (into odd lanes) back into one 16 x 32-bit vector.
__inline __m512i _mm512_eo64lo_to_epi32(__m512i e64, __m512i o64)
{
    // move the low halves of o64 up into the odd 32-bit lanes
    __m512i odd_lanes = _mm512_shuffle_epi32(o64, 0xB1);

    // odd lanes (mask 0xAAAA) come from o64, even lanes stay from e64
    return _mm512_mask_mov_epi32(e64, 0xAAAA, odd_lanes);
}
392 |
// Interleave the HIGH 32 bits of the 64-bit elements of e64 (into even
// lanes) and o64 (into odd lanes) back into one 16 x 32-bit vector.
__inline __m512i _mm512_eo64hi_to_epi32(__m512i e64, __m512i o64)
{
    // move the high halves of e64 down into the even 32-bit lanes
    __m512i even_lanes = _mm512_shuffle_epi32(e64, 0xB1);

    // odd lanes (mask 0xAAAA) already hold the high halves of o64
    return _mm512_mask_mov_epi32(even_lanes, 0xAAAA, o64);
}
397 |
// Full 32x32 -> 64-bit multiply of two 16 x 32-bit vectors.  The sixteen
// products come back as two 8 x 64-bit vectors: *e64 holds the products of
// the even-indexed lanes of a and b, *o64 those of the odd-indexed lanes.
__inline void _mm512_mul_eo64_epi32(__m512i a, __m512i b, __m512i *e64, __m512i *o64)
{
    // the multiply only reads even lanes, so swap each lane pair to reach
    // the odd ones
    __m512i a_odd = _mm512_shuffle_epi32(a, 0xB1);
    __m512i b_odd = _mm512_shuffle_epi32(b, 0xB1);

    *e64 = _mm512_mul_epu32(a, b);
    *o64 = _mm512_mul_epu32(a_odd, b_odd);
}
413 |
// Mask of the 32-bit lanes of x whose lowest bit is clear.
#define _mm512_iseven_epi32(x) \
    _mm512_cmpeq_epi32_mask(_mm512_and_epi32((x), _mm512_set1_epi32(1)), _mm512_setzero_epi32())
416 |
// Mask of the 32-bit lanes of x whose lowest bit is set.
#define _mm512_isodd_epi32(x) \
    _mm512_cmpeq_epi32_mask(_mm512_and_epi32((x), _mm512_set1_epi32(1)), _mm512_set1_epi32(1))
419 |
420 |
421 |
--------------------------------------------------------------------------------
/x64_bench/main.c:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2021 by The Mayo Clinic, though its Special Purpose
2 | // Processor Development Group (SPPDG). All Rights Reserved Worldwide.
3 | // Licensed under the Apache License, Version 2.0 (the "License"); you may
4 | // not use this file except in compliance with the License. You may obtain
5 | // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
6 | // Unless required by applicable law or agreed to in writing, software
7 | // distributed under the License is distributed on an "AS IS" BASIS,
8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
9 | // including conditions of title, non-infringement, merchantability,
10 | // or fitness for a particular purpose
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 | // This file is a snapshot of a work in progress, originated by Mayo
14 | // Clinic SPPDG.
15 |
16 | /*
17 | Copyright (c) 2021, Ben Buhrow
18 | All rights reserved.
19 |
20 | Redistribution and use in source and binary forms, with or without
21 | modification, are permitted provided that the following conditions are met:
22 |
23 | 1. Redistributions of source code must retain the above copyright notice, this
24 | list of conditions and the following disclaimer.
25 | 2. Redistributions in binary form must reproduce the above copyright notice,
26 | this list of conditions and the following disclaimer in the documentation
27 | and/or other materials provided with the distribution.
28 |
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
30 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
32 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
33 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
35 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 |
40 | The views and conclusions contained in the software and documentation are those
41 | of the authors and should not be interpreted as representing official policies,
42 | either expressed or implied, of the FreeBSD Project.
43 | */
44 |
45 | // this file defines a test harness for various modular exponentiation routines.
46 | // We read command line options and execute the appropriate test(s).
47 | #include "util.h"
48 | #include "bigarith.h"
49 | #include "pmod.h"
50 | #include "monty_arith.h"
51 | #include "x64_arith.h"
52 | #include "gmp.h"
53 |
54 | void mul_test(int iterations, int verbose, uint64_t lcg_state, pmod_t* pmod_state)
55 | {
56 | int i, j, r;
57 | bignum* a, * b, * c, * n, * s;
58 | monty* mdata;
59 | uint64_t chksum = 0;
60 |
61 | struct timeval stopt; // stop time of this job
62 | struct timeval startt; // start time of this job
63 | double t_time = 0.;
64 |
65 | a = zInit();
66 | b = zInit();
67 | c = zInit();
68 | n = zInit();
69 | s = zInit();
70 | mdata = monty_alloc();
71 |
72 | for (j = 0; j < NWORDS; j++)
73 | a->data[j] = spRand64(&lcg_state);
74 | a->size = NWORDS;
75 |
76 | for (j = 0; j < NWORDS; j++)
77 | b->data[j] = spRand64(&lcg_state);
78 | b->size = NWORDS;
79 |
80 | for (j = 0; j < NWORDS; j++)
81 | n->data[j] = spRand64(&lcg_state);
82 | n->size = NWORDS;
83 |
84 | if ((n->data[0] & 1) == 0)
85 | n->data[0]++;
86 |
87 | // initialize the montgomery representation of this modulus.
88 | monty_init(mdata, n, verbose);
89 |
90 | if (verbose > 0)
91 | {
92 | printf("initial a = "); zPrint(a); printf("\n");
93 | printf("initial b = "); zPrint(b); printf("\n");
94 | printf("initial n = "); zPrint(n); printf("\n");
95 | }
96 |
97 | to_monty(mdata, a);
98 | to_monty(mdata, b);
99 |
100 | gettimeofday(&startt, NULL);
101 |
102 | for (i = 0; i < iterations; i++)
103 | {
104 | if (verbose > 1)
105 | {
106 | printf("test %d:\n", i);
107 | printf("a = "); zPrint(a); printf("\n");
108 | printf("b = "); zPrint(b); printf("\n");
109 | printf("n = "); zPrint(n); printf("\n");
110 | }
111 |
112 | mul_ptr(mdata, a, b, a, n);
113 | chksum += a->data[0];
114 |
115 | if (verbose > 0)
116 | {
117 | printf("result: "); zPrint(a); printf("\n");
118 | }
119 | }
120 |
121 | gettimeofday(&stopt, NULL);
122 | t_time = my_difftime(&startt, &stopt);
123 |
124 | if (verbose > 0)
125 | {
126 | printf("final result: "); zPrint(a); printf("\n");
127 | }
128 |
129 | printf("final chksum: %lu\n", chksum);
130 |
131 | printf("%d mulredc tests took %.4f seconds\n", iterations, t_time);
132 |
133 | zFree(a);
134 | zFree(b);
135 | zFree(c);
136 | zFree(n);
137 | zFree(s);
138 | monty_free(mdata);
139 |
140 | return;
141 | }
142 |
143 | void sqr_test(int iterations, int verbose, uint64_t lcg_state, pmod_t* pmod_state)
144 | {
145 | int i, j, r;
146 | bignum* a, * b, * c, * n, * s;
147 | monty* mdata;
148 | uint64_t chksum = 0;
149 |
150 | struct timeval stopt; // stop time of this job
151 | struct timeval startt; // start time of this job
152 | double t_time = 0.;
153 |
154 | a = zInit();
155 | b = zInit();
156 | c = zInit();
157 | n = zInit();
158 | s = zInit();
159 | mdata = monty_alloc();
160 |
161 | for (j = 0; j < NWORDS; j++)
162 | a->data[j] = spRand64(&lcg_state);
163 | a->size = NWORDS;
164 |
165 | for (j = 0; j < NWORDS; j++)
166 | n->data[j] = spRand64(&lcg_state);
167 | n->size = NWORDS;
168 |
169 | if ((n->data[0] & 1) == 0)
170 | n->data[0]++;
171 |
172 | // initialize the montgomery representation of this modulus.
173 | monty_init(mdata, n, verbose);
174 |
175 | if (verbose > 0)
176 | {
177 | printf("initial a = "); zPrint(a); printf("\n");
178 | printf("initial n = "); zPrint(n); printf("\n");
179 | }
180 |
181 | to_monty(mdata, a);
182 |
183 | gettimeofday(&startt, NULL);
184 |
185 | for (i = 0; i < iterations; i++)
186 | {
187 | if (verbose > 1)
188 | {
189 | printf("test %d:\n", i);
190 | printf("a = "); zPrint(a); printf("\n");
191 | printf("n = "); zPrint(n); printf("\n");
192 | }
193 |
194 | sqr_ptr(mdata, a, a, n);
195 | chksum += a->data[0];
196 |
197 | if (verbose > 1)
198 | {
199 | printf("result: "); zPrint(a); printf("\n");
200 | }
201 | }
202 |
203 | gettimeofday(&stopt, NULL);
204 | t_time = my_difftime(&startt, &stopt);
205 |
206 | if (verbose > 0)
207 | {
208 | printf("final result: "); zPrint(a); printf("\n");
209 | }
210 |
211 | printf("final chksum: %lu\n", chksum);
212 | printf("%d sqrredc tests took %.4f seconds\n", iterations, t_time);
213 |
214 | zFree(a);
215 | zFree(b);
216 | zFree(c);
217 | zFree(n);
218 | zFree(s);
219 | monty_free(mdata);
220 |
221 | return;
222 | }
223 |
224 | void monty_test(int iterations, int verbose, uint64_t lcg_state, pmod_t *pmod_state)
225 | {
226 | int i, j, r;
227 | bignum *a, *b, *c, *n, *s;
228 | monty *mdata;
229 |
230 | struct timeval stopt; // stop time of this job
231 | struct timeval startt; // start time of this job
232 | double t_time = 0.;
233 |
234 | a = zInit();
235 | b = zInit();
236 | c = zInit();
237 | n = zInit();
238 | s = zInit();
239 | mdata = monty_alloc();
240 |
241 | gettimeofday(&startt, NULL);
242 |
243 | for (i = 0; i < iterations; i++)
244 | {
245 | for (j = 0; j < NWORDS; j++)
246 | a->data[j] = spRand64(&lcg_state);
247 | a->size = NWORDS;
248 |
249 | for (j = 0; j < NWORDS; j++)
250 | b->data[j] = spRand64(&lcg_state);
251 | b->size = NWORDS;
252 |
253 | for (j = 0; j < NWORDS; j++)
254 | n->data[j] = spRand64(&lcg_state);
255 | n->size = NWORDS;
256 |
257 | if ((n->data[0] & 1) == 0)
258 | n->data[0]++;
259 |
260 | // initialize the montgomery representation of this modulus.
261 | monty_init(mdata, n, verbose);
262 |
263 | if (verbose)
264 | {
265 | printf("test %d:\n", i);
266 | printf("a = "); zPrint(a); printf("\n");
267 | printf("b = "); zPrint(b); printf("\n");
268 | printf("n = "); zPrint(n); printf("\n");
269 | }
270 |
271 | to_monty(mdata, a);
272 |
273 | lroddwin_powm(pmod_state, mdata, c, a, b, n, s);
274 |
275 | if (verbose)
276 | {
277 | printf("result: "); zPrint(c); printf("\n");
278 | }
279 | }
280 |
281 | gettimeofday(&stopt, NULL);
282 | t_time = my_difftime(&startt, &stopt);
283 |
284 | printf("%d powm tests took %.4f seconds\n", iterations, t_time);
285 |
286 | zFree(a);
287 | zFree(b);
288 | zFree(c);
289 | zFree(n);
290 | zFree(s);
291 | monty_free(mdata);
292 |
293 | return;
294 | }
295 |
296 | int main(int argc, char **argv)
297 | {
298 | struct timeval stopt; // stop time of this job
299 | struct timeval startt; // start time of this job
300 | double t_time = 0.;
301 | int iterations = 1000;
302 | int seed;
303 | uint64_t *lcg_state;
304 | pmod_t *pmod_state;
305 | int verbose = 0;
306 | int taskid, numtasks;
307 |
308 | if (argc > 1)
309 | {
310 | iterations = atoi(argv[1]);
311 | }
312 |
313 | if (argc > 2)
314 | {
315 | verbose = atoi(argv[2]);
316 | }
317 |
318 | if (argc > 3)
319 | {
320 | seed = atoi(argv[3]);
321 | }
322 | else
323 | {
324 | gettimeofday(&startt, NULL);
325 | seed = hash64((startt.tv_usec));
326 | }
327 |
328 |
329 | lcg_state = (uint64_t *)malloc(1 * sizeof(uint64_t));
330 | lcg_state[0] = hash64((seed));
331 | pmod_state = (pmod_t *)malloc(sizeof(pmod_t));
332 | pmodlib_init(pmod_state);
333 |
334 | printf("commencing benchmarks with MAXBITS = %d, NWORDS = %d\n",
335 | MAXBITS, NWORDS);
336 |
337 | // configure benchmark tests
338 | int do_pmod_tests = 1;
339 | int do_mulsqr_tests = 1;
340 |
341 | int bench_sos = 1;
342 | int bench_fios = 1;
343 | int bench_fips = 1;
344 | int bench_cios = 1;
345 | int bench_bps = 1;
346 | int bench_gmp = 1;
347 |
348 | if (do_mulsqr_tests)
349 | {
350 | if (bench_sos)
351 | {
352 | printf("commencing %d mulredc iterations using mulmod_sos\n", iterations);
353 | mul_ptr = &mulmod_sos;
354 | mul_test(iterations, verbose, lcg_state[0], pmod_state);
355 | }
356 |
357 | if (bench_cios)
358 | {
359 | printf("commencing %d mulredc iterations using mulmod_cios\n", iterations);
360 | mul_ptr = &mulmod_cios;
361 | mul_test(iterations, verbose, lcg_state[0], pmod_state);
362 | }
363 |
364 | if (bench_bps)
365 | {
366 | printf("commencing %d mulredc iterations using mulmod_bps\n", iterations);
367 | mul_ptr = &mulmod_bps;
368 | mul_test(iterations, verbose, lcg_state[0], pmod_state);
369 | }
370 |
371 | if (bench_fios)
372 | {
373 | printf("commencing %d mulredc iterations using mulmod_fios\n", iterations);
374 | mul_ptr = &mulmod_fios;
375 | mul_test(iterations, verbose, lcg_state[0], pmod_state);
376 | }
377 |
378 | if (bench_fips)
379 | {
380 | printf("commencing %d mulredc iterations using mulmod_fips\n", iterations);
381 | mul_ptr = &mulmod_fips;
382 | mul_test(iterations, verbose, lcg_state[0], pmod_state);
383 | }
384 |
385 | if (bench_sos)
386 | {
387 | printf("commencing %d sqrredc iterations using sqrmod_sos\n", iterations);
388 | sqr_ptr = &sqrmod_sos;
389 | sqr_test(iterations, verbose, lcg_state[0], pmod_state);
390 | }
391 |
392 | if (bench_cios)
393 | {
394 | printf("commencing %d sqrredc iterations using sqrmod_cios\n", iterations);
395 | sqr_ptr = &sqrmod_cios;
396 | sqr_test(iterations, verbose, lcg_state[0], pmod_state);
397 | }
398 |
399 | if (bench_bps)
400 | {
401 | printf("commencing %d sqrredc iterations using sqrmod_bps\n", iterations);
402 | sqr_ptr = &sqrmod_bps;
403 | sqr_test(iterations, verbose, lcg_state[0], pmod_state);
404 | }
405 |
406 | if (bench_fios)
407 | {
408 | printf("commencing %d sqrredc iterations using sqrmod_fios\n", iterations);
409 | sqr_ptr = &sqrmod_fios;
410 | sqr_test(iterations, verbose, lcg_state[0], pmod_state);
411 | }
412 |
413 | if (bench_fips)
414 | {
415 | printf("commencing %d sqrredc iterations using sqrmod_fips\n", iterations);
416 | sqr_ptr = &sqrmod_fips;
417 | sqr_test(iterations, verbose, lcg_state[0], pmod_state);
418 | }
419 |
420 | // gmp SOS mul
421 | if (bench_gmp)
422 | {
423 | mpz_t a, b, n, t, nhat, r, u;
424 | int i, j, k;
425 | struct timeval stopt; // stop time of this job
426 | struct timeval startt; // start time of this job
427 | double t_time = 0.;
428 | int numbits = MAXBITS;
429 | uint64_t lcg = lcg_state[0];
430 | uint64_t chksum = 0;
431 |
432 | mpz_init(a);
433 | mpz_init(b);
434 | mpz_init(nhat);
435 | mpz_init(r);
436 | mpz_init(n);
437 | mpz_init(t);
438 | mpz_init(u);
439 |
440 | gettimeofday(&startt, NULL);
441 |
442 | mpz_set_ui(a, 0);
443 | for (j = 0; j < NWORDS; j++)
444 | {
445 | uint64_t x = spRand64(&lcg);
446 | mpz_set_ui(t, x);
447 | mpz_mul_2exp(t, t, 64 * j);
448 | mpz_add(a, a, t);
449 | }
450 |
451 | mpz_set_ui(b, 0);
452 | for (j = 0; j < NWORDS; j++)
453 | {
454 | uint64_t x = spRand64(&lcg);
455 | mpz_set_ui(t, x);
456 | mpz_mul_2exp(t, t, 64 * j);
457 | mpz_add(b, b, t);
458 | }
459 |
460 | mpz_set_ui(n, 0);
461 | for (j = 0; j < NWORDS; j++)
462 | {
463 | uint64_t x = spRand64(&lcg);
464 | mpz_set_ui(t, x);
465 | mpz_mul_2exp(t, t, 64 * j);
466 | mpz_add(n, n, t);
467 | }
468 |
469 | if ((mpz_get_ui(n) & 1) == 0)
470 | {
471 | mpz_add_ui(n, n, 1);
472 | }
473 |
474 | printf("commencing %d mulredc iterations using gmp \n", iterations);
475 |
476 | if (verbose > 0)
477 | {
478 | gmp_printf("initial a: %Zx\n", a);
479 | gmp_printf("initial b: %Zx\n", b);
480 | gmp_printf("initial n: %Zx\n", n);
481 | }
482 |
483 | // monty setup
484 | mpz_set_ui(r, 1);
485 | mpz_mul_2exp(r, r, MAXBITS);
486 | mpz_invert(nhat, n, r);
487 | mpz_sub(nhat, r, nhat);
488 | mpz_mul(a, r, a);
489 | mpz_tdiv_r(a, a, n);
490 | mpz_mul(b, r, b);
491 | mpz_tdiv_r(b, b, n);
492 |
493 | gettimeofday(&startt, NULL);
494 | for (k = 0; k < iterations; k++)
495 | {
496 | mpz_mul(t, a, b);
497 | mpz_tdiv_r_2exp(a, t, MAXBITS);
498 | mpz_mul(u, a, nhat);
499 | mpz_tdiv_r_2exp(u, u, MAXBITS);
500 | mpz_mul(a, u, n);
501 | mpz_add(a, t, a);
502 | mpz_tdiv_q_2exp(a, a, MAXBITS);
503 | if (mpz_sizeinbase(a, 2) > MAXBITS)
504 | mpz_sub(a, a, n);
505 |
506 | if (verbose > 0)
507 | {
508 | gmp_printf("result: %Zx\n", a);
509 | }
510 | chksum += a->_mp_d[0];
511 | }
512 |
513 | gettimeofday(&stopt, NULL);
514 | t_time = my_difftime(&startt, &stopt);
515 |
516 | if (verbose > 0)
517 | {
518 | gmp_printf("final result: %Zx\n", a);
519 | }
520 |
521 | printf("final chksum: %lu\n", chksum);
522 | printf("%d mulredc tests took %.4f seconds\n", iterations, t_time);
523 |
524 | mpz_clear(a);
525 | mpz_clear(b);
526 | mpz_clear(r);
527 | mpz_clear(nhat);
528 | mpz_clear(n);
529 | mpz_clear(t);
530 | mpz_clear(u);
531 | }
532 |
533 | // gmp SOS sqr
534 | if (bench_gmp)
535 | {
536 | mpz_t a, b, n, t, nhat, r, u;
537 | int i, j, k;
538 | struct timeval stopt; // stop time of this job
539 | struct timeval startt; // start time of this job
540 | double t_time = 0.;
541 | int numbits = MAXBITS;
542 | uint64_t lcg = lcg_state[0];
543 | uint64_t chksum = 0;
544 |
545 | mpz_init(a);
546 | mpz_init(b);
547 | mpz_init(nhat);
548 | mpz_init(r);
549 | mpz_init(n);
550 | mpz_init(t);
551 | mpz_init(u);
552 |
553 | gettimeofday(&startt, NULL);
554 |
555 | mpz_set_ui(a, 0);
556 | for (j = 0; j < NWORDS; j++)
557 | {
558 | uint64_t x = spRand64(&lcg);
559 | mpz_set_ui(t, x);
560 | mpz_mul_2exp(t, t, 64 * j);
561 | mpz_add(a, a, t);
562 | }
563 |
564 | mpz_set_ui(n, 0);
565 | for (j = 0; j < NWORDS; j++)
566 | {
567 | uint64_t x = spRand64(&lcg);
568 | mpz_set_ui(t, x);
569 | mpz_mul_2exp(t, t, 64 * j);
570 | mpz_add(n, n, t);
571 | }
572 |
573 | if ((mpz_get_ui(n) & 1) == 0)
574 | {
575 | mpz_add_ui(n, n, 1);
576 | }
577 |
578 | printf("commencing %d sqrredc iterations using gmp \n", iterations);
579 |
580 | if (verbose > 0)
581 | {
582 | gmp_printf("initial a: %Zx\n", a);
583 | gmp_printf("initial n: %Zx\n", n);
584 | }
585 |
586 | // monty setup
587 | mpz_set_ui(r, 1);
588 | mpz_mul_2exp(r, r, MAXBITS);
589 | mpz_invert(nhat, n, r);
590 | mpz_sub(nhat, r, nhat);
591 | mpz_mul(a, r, a);
592 | mpz_tdiv_r(a, a, n);
593 |
594 | gettimeofday(&startt, NULL);
595 | for (k = 0; k < iterations; k++)
596 | {
597 | mpz_mul(t, a, a);
598 | mpz_tdiv_r_2exp(a, t, MAXBITS);
599 | mpz_mul(u, a, nhat);
600 | mpz_tdiv_r_2exp(u, u, MAXBITS);
601 | mpz_mul(a, u, n);
602 | mpz_add(a, t, a);
603 | mpz_tdiv_q_2exp(a, a, MAXBITS);
604 | if (mpz_sizeinbase(a, 2) > MAXBITS)
605 | mpz_sub(a, a, n);
606 |
607 | chksum += mpz_get_ui(a);
608 | if (verbose > 0)
609 | {
610 | gmp_printf("result: %Zx\n", a);
611 | }
612 | }
613 |
614 | gettimeofday(&stopt, NULL);
615 | t_time = my_difftime(&startt, &stopt);
616 |
617 | if (verbose > 0)
618 | {
619 | gmp_printf("final result: %Zx\n", a);
620 | }
621 |
622 | printf("final chksum: %lu\n", chksum);
623 | printf("%d sqrredc tests took %.4f seconds\n", iterations, t_time);
624 |
625 | mpz_clear(a);
626 | mpz_clear(b);
627 | mpz_clear(r);
628 | mpz_clear(nhat);
629 | mpz_clear(n);
630 | mpz_clear(t);
631 | mpz_clear(u);
632 | }
633 | }
634 |
635 | if (do_mulsqr_tests)
636 | iterations /= 10000;
637 |
638 | if (do_pmod_tests)
639 | {
640 | if (bench_sos)
641 | {
642 | printf("commencing %d powm iterations using mulmod_sos\n", iterations);
643 | mul_ptr = &mulmod_sos;
644 | sqr_ptr = &sqrmod_sos_mul;
645 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
646 | }
647 |
648 | if (bench_sos)
649 | {
650 | printf("commencing %d powm iterations using mulmod_sos and sqrmod_sos\n", iterations);
651 | mul_ptr = &mulmod_sos;
652 | sqr_ptr = &sqrmod_sos;
653 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
654 | }
655 |
656 | if (bench_cios)
657 | {
658 | printf("commencing %d powm iterations using mulmod_cios\n", iterations);
659 | mul_ptr = &mulmod_cios;
660 | sqr_ptr = &sqrmod_cios_mul;
661 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
662 | }
663 |
664 | if (bench_cios)
665 | {
666 | printf("commencing %d powm iterations using mulmod_cios and sqrmod_cios\n", iterations);
667 | mul_ptr = &mulmod_cios;
668 | sqr_ptr = &sqrmod_cios;
669 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
670 | }
671 |
672 | if (bench_bps)
673 | {
674 | printf("commencing %d powm iterations using mulmod_bps\n", iterations);
675 | mul_ptr = &mulmod_bps;
676 | sqr_ptr = &sqrmod_bps_mul;
677 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
678 | }
679 |
680 | if (bench_bps)
681 | {
682 | printf("commencing %d powm iterations using mulmod_bps and sqrmod_bps\n", iterations);
683 | mul_ptr = &mulmod_bps;
684 | sqr_ptr = &sqrmod_bps;
685 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
686 | }
687 |
688 | if (bench_fios)
689 | {
690 | printf("commencing %d powm iterations using mulmod_fios\n", iterations);
691 | mul_ptr = &mulmod_fios;
692 | sqr_ptr = &sqrmod_fios_mul;
693 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
694 | }
695 |
696 | if (bench_fios)
697 | {
698 | printf("commencing %d powm iterations using mulmod_fios and sqrmod_fios\n", iterations);
699 | mul_ptr = &mulmod_fios;
700 | sqr_ptr = &sqrmod_fios;
701 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
702 |
703 | }
704 |
705 | if (bench_fips)
706 | {
707 | printf("commencing %d powm iterations using mulmod_fips\n", iterations);
708 | mul_ptr = &mulmod_fips;
709 | sqr_ptr = &sqrmod_fips_mul;
710 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
711 | }
712 |
713 | if (bench_fips)
714 | {
715 | printf("commencing %d powm iterations using mulmod_fips and sqrmod_fips\n", iterations);
716 | mul_ptr = &mulmod_fips;
717 | sqr_ptr = &sqrmod_fips;
718 | monty_test(iterations, verbose, lcg_state[0], pmod_state);
719 | }
720 |
721 | // gmp comparison. These results won't match the ones above because
722 | // we use a different RNG (the builtin gmp RNG).
723 | if (bench_gmp)
724 | {
725 | mpz_t a, b, n, t, aa, bb;
726 | int i, j, k;
727 | struct timeval stopt; // stop time of this job
728 | struct timeval startt; // start time of this job
729 | double t_time = 0.;
730 | int numbits = MAXBITS;
731 | gmp_randstate_t gmp_randstate;
732 |
733 | mpz_init(a);
734 | mpz_init(b);
735 | mpz_init(aa);
736 | mpz_init(bb);
737 | mpz_init(n);
738 | mpz_init(t);
739 |
740 | gettimeofday(&startt, NULL);
741 | srand(42); // lcg_state[0]);
742 | gmp_randinit_default(gmp_randstate);
743 | gmp_randseed_ui(gmp_randstate, rand());
744 |
745 | printf("commencing %d powm iterations using gmp powm\n", iterations);
746 | mpz_urandomb(n, gmp_randstate, numbits);
747 | if (mpz_even_p(n))
748 | mpz_add_ui(n, n, 1);
749 |
750 | gettimeofday(&startt, NULL);
751 | for (k = 0; k < iterations; k++)
752 | {
753 | mpz_urandomb(a, gmp_randstate, numbits);
754 | mpz_urandomb(b, gmp_randstate, numbits);
755 |
756 | mpz_tdiv_r(a, a, n);
757 | mpz_tdiv_r(b, b, n);
758 |
759 | if (verbose)
760 | {
761 | printf("test %d:\n", k);
762 | printf("a = "); gmp_printf("%Zx\n", a);
763 | printf("b = "); gmp_printf("%Zx\n", b);
764 | printf("n = "); gmp_printf("%Zx\n", n);
765 | }
766 |
767 | mpz_set(aa, a);
768 | mpz_set(bb, b);
769 |
770 | mpz_powm(a, aa, bb, n);
771 |
772 | if (verbose)
773 | {
774 | gmp_printf("result: %Zx\n", a);
775 | }
776 | }
777 |
778 | gettimeofday(&stopt, NULL);
779 | t_time = my_difftime(&startt, &stopt);
780 |
781 | printf("%d powm tests took %.4f seconds\n", iterations, t_time);
782 |
783 | mpz_clear(a);
784 | mpz_clear(b);
785 | mpz_clear(aa);
786 | mpz_clear(bb);
787 | mpz_clear(n);
788 | mpz_clear(t);
789 | }
790 | }
791 |
792 | free(lcg_state);
793 | pmodlib_free(pmod_state);
794 | free(pmod_state);
795 |
796 | return 0;
797 | }
798 |
--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2014, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 |
29 |
30 | Copyright (c) 2018 by The Mayo Clinic, though its Special Purpose
31 | Processor Development Group (SPPDG). All Rights Reserved Worldwide.
32 | Licensed under the Apache License, Version 2.0 (the "License"); you may
33 | not use this file except in compliance with the License. You may obtain
34 | a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
35 | Unless required by applicable law or agreed to in writing, software
36 | distributed under the License is distributed on an "AS IS" BASIS,
37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
38 | including conditions of title, non-infringement, merchantability,
39 | or fitness for a particular purpose
40 | See the License for the specific language governing permissions and
41 | limitations under the License.
42 | This file is a snapshot of a work in progress, originated by Mayo
43 | Clinic SPPDG.
44 | */
45 |
46 | #include "vecarith.h"
47 | #include "gmp.h"
48 | #include "omp.h"
49 |
// LCG states; vecpmodtest() allocates one entry per thread and seeds
// entry t with hash64(t).
uint64_t *LCG_STATE;

// 64-bit linear congruential generator.  Steps *state once
// (state = state * multiplier + increment, modulo 2^64) and returns the
// new state, i.e. a value covering the full 64-bit range.
uint64_t spRand64(uint64_t *state)
{
    const uint64_t multiplier = 6364136223846793005ULL;
    const uint64_t increment = 1442695040888963407ULL;
    uint64_t next = multiplier * state[0] + increment;

    state[0] = next;
    return next;
}
59 |
// FNV-1-style hash of a 64-bit value, using the 64-bit FNV offset basis and
// prime:
// http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
// One round per input byte: the running hash is multiplied by the prime and
// then byte k of the input is XORed into byte k of the hash (the byte is
// mixed in at its own position, not into the low byte).
uint64_t hash64(uint64_t in)
{
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t h = 14695981039346656037ULL;
    int k;

    for (k = 0; k < 8; k++)
    {
        // selects byte k, least-significant byte first
        uint64_t byte_lane = 0xffULL << (8 * k);

        h *= fnv_prime;
        h ^= in & byte_lane;
    }

    return h;
}
111 |
// Elapsed time from *start to *end, in seconds (with a microsecond
// fraction).
double my_difftime(struct timeval * start, struct timeval * end)
{
    double whole = 0;   // full seconds
    double micro;       // leftover microseconds

    if (start->tv_sec != end->tv_sec) {
        // remainder of the starting second plus the part of the final one
        whole = end->tv_sec - (start->tv_sec + 1);
        micro = 1000000 - start->tv_usec;
        micro += end->tv_usec;

        // the two partial seconds may add up to one more full second
        if (micro >= 1000000) {
            micro -= 1000000;
            whole += 1;
        }
    }
    else {
        micro = end->tv_usec - start->tv_usec;
    }

    return whole + micro / 1000000.;
}
133 |
134 | void extract_bignum_from_vec_to_mpz(mpz_t dest, bignum *vec_src, int num, int sz)
135 | {
136 | int j;
137 |
138 | if (dest == NULL)
139 | {
140 | printf("invalid dest address in extract_vec_bignum_from_vec_to_mpz\n");
141 | }
142 |
143 | mpz_set_ui(dest, 0);
144 | for (j = sz - 1; j >= 0; j--)
145 | {
146 | mpz_mul_2exp(dest, dest, DIGITBITS);
147 | mpz_add_ui(dest, dest, vec_src->data[num + j * VECLEN]);
148 | }
149 |
150 | return;
151 | }
152 |
153 | void vecpmodtest(int do_verification, int threads, int verbose)
154 | {
155 | // test the pmod by comparing all results to those computed using
156 | // validated scalar code.
157 | double *elapsed_time;
158 | int t;
159 | //gmp_randstate_t rng_state;
160 |
161 | //gmp_randinit_default(rng_state);
162 | elapsed_time = (double *)malloc(threads * sizeof(double));
163 |
164 | LCG_STATE = (uint64_t *)malloc(threads * sizeof(uint64_t));
165 |
166 | for (t = 0; t < threads; t++)
167 | {
168 | LCG_STATE[t] = hash64(t);
169 | }
170 |
171 | printf("commencing test: all variable (random)\n");
172 | #pragma omp parallel num_threads(threads)
173 | {
174 | int i, j;
175 |
176 | // timing variables
177 | struct timeval stopt; // stop time of this job
178 | struct timeval startt; // start time of this job
179 | double t_time;
180 |
181 | mpz_t base, exp, mod, t1, t2;
182 |
183 | int loc_iterations;
184 | int tid = omp_get_thread_num();
185 | monty *mtest;
186 |
187 | // vector bignums
188 | bignum *b = vecInit();
189 | bignum *d = vecInit();
190 | bignum *m = vecInit();
191 | bignum *e = vecInit();
192 | bignum *s = vecInit();
193 | bignum *one = vecInit();
194 |
195 | mpz_init(base);
196 | mpz_init(exp);
197 | mpz_init(mod);
198 | mpz_init(t1);
199 | mpz_init(t2);
200 |
201 | //gmp_randseed_ui(rng_state, tid);
202 |
203 | // attempt to scale the number of iterations with input size
204 | // so this doesn't take forever.
205 | loc_iterations = 100000 * 2 / (NWORDS * DIGITBITS);
206 |
207 | if (MAXBITS >= 4096)
208 | loc_iterations *= 1;
209 | else if (MAXBITS >= 2048)
210 | loc_iterations *= 2;
211 | else if (MAXBITS >= 1024)
212 | loc_iterations *= 5;
213 | else if (MAXBITS >= 512)
214 | loc_iterations *= 10;
215 | else if (MAXBITS >= 256)
216 | loc_iterations *= 25;
217 | else
218 | loc_iterations *= 100;
219 |
220 | #ifdef BASE52
221 | loc_iterations *= 3;
222 | #endif
223 |
224 | #ifdef TARGET_KNL
225 | loc_iterations /= 3;
226 | #endif
227 |
228 | mtest = monty_alloc();
229 |
230 | #pragma omp barrier
231 |
232 | gettimeofday(&startt, NULL);
233 |
234 | for (j = 0; j < VECLEN; j++)
235 | {
236 | one->data[j] = 1;
237 | }
238 |
239 | printf("thread %d starting %d iterations\n", tid, loc_iterations);
240 |
241 | // now do the calculation "b^e % m" a bunch of times
242 | for (i = 0; i < loc_iterations; i++)
243 | {
244 |
245 | #ifdef BASE52
246 | //int tmp = ceil(MAXBITS / 64);
247 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8);
248 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8);
249 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8);
250 |
251 | for (j = 0; j < VECLEN; j++)
252 | {
253 | int k;
254 | for (k = 0; k < NWORDS; k++)
255 | {
256 | uint64_t r1 = spRand64(&LCG_STATE[t]);
257 | uint64_t r2 = spRand64(&LCG_STATE[t]);
258 | uint64_t r3 = spRand64(&LCG_STATE[t]);
259 |
260 | m->data[k * VECLEN + j] = r1 & MAXDIGIT;
261 | b->data[k * VECLEN + j] = r2 & MAXDIGIT;
262 | e->data[k * VECLEN + j] = r3 & MAXDIGIT;
263 | }
264 | }
265 |
266 | #else
267 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8);
268 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8);
269 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8);
270 |
271 | for (j = 0; j < VECLEN; j++)
272 | {
273 | int k;
274 | for (k = 0; k < NWORDS; k++)
275 | {
276 | uint64_t r1 = spRand64(&LCG_STATE[t]);
277 | uint64_t r2 = spRand64(&LCG_STATE[t]);
278 | uint64_t r3 = spRand64(&LCG_STATE[t]);
279 |
280 | m->data[k * VECLEN + j] = r1 & MAXDIGIT;
281 | b->data[k * VECLEN + j] = r2 & MAXDIGIT;
282 | e->data[k * VECLEN + j] = r3 & MAXDIGIT;
283 | }
284 | }
285 | #endif
286 | for (j = 0; j < VECLEN; j++)
287 | m->data[j] |= 0x1;
288 |
289 | if (0)
290 | {
291 | continue;
292 | }
293 |
294 | if (verbose > 1)
295 | {
296 | for (j = 0; j < VECLEN; j++)
297 | {
298 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
299 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS);
300 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS);
301 |
302 | gmp_printf("init%d:\n\tbase = %Zx\n\texp = %Zx\n\tmod = %Zx\n",
303 | j, base, exp, mod);
304 | }
305 | }
306 |
307 | // now we actually do the (vectorized) montgomery initialization
308 | // on our vector of random moduli.
309 | monty_init_vec(mtest, m, 0);
310 |
311 | if (verbose > 1)
312 | {
313 | for (j = 0; j < VECLEN; j++)
314 | {
315 | extract_bignum_from_vec_to_mpz(base, mtest->rhat, j, NWORDS);
316 | extract_bignum_from_vec_to_mpz(mod, mtest->one, j, NWORDS);
317 |
318 | gmp_printf("init%d:\n\trhat = %Zx\n\tone = %Zx\n\trho = %08x\n",
319 | j, base, mod, mtest->vrho[j]);
320 | }
321 | }
322 |
323 | vecmulmod_ptr(b, mtest->rhat, b, m, s, mtest); // monty rep
324 |
325 | if (verbose > 1)
326 | {
327 | for (j = 0; j < VECLEN; j++)
328 | {
329 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
330 |
331 | gmp_printf("monty(base%d) = %Zx\n", j, base);
332 | }
333 | }
334 |
335 | vecmodexp_ptr(d, b, e, m, s, mtest->one, mtest); // powm
336 | vecmulmod_ptr(d, one, d, m, s, mtest); // normal rep
337 |
338 | if (verbose > 1)
339 | {
340 | for (j = 0; j < VECLEN; j++)
341 | {
342 | extract_bignum_from_vec_to_mpz(base, d, j, NWORDS);
343 |
344 | gmp_printf("modexp%d = %Zx\n", j, base);
345 | }
346 | }
347 |
348 | // now verify each result
349 | if (do_verification)
350 | {
351 | vecmulmod_ptr(b, one, b, m, s, mtest); // normal rep
352 | for (j = 0; j < VECLEN; j++)
353 | {
354 | extract_bignum_from_vec_to_mpz(t1, d, j, NWORDS);
355 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
356 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS);
357 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS);
358 |
359 | mpz_powm(t2, base, exp, mod);
360 |
361 | if (verbose)
362 | {
363 | gmp_printf("iteration %d lane %d:\n\tgmp = %Zx\n\ttest = %Zx\n",
364 | i, j, t2, t1);
365 | }
366 |
367 | if (mpz_cmp(t1, t2) != 0)
368 | {
369 | gmp_printf("iteration %d error lane %d:\nbase = %Zx\nexp = %Zx\nmod = %Zx\ngmp = %Zx\ntest = %Zx\n",
370 | i, j, base, exp, mod, t2, t1);
371 | exit(1);
372 | }
373 |
374 | }
375 | }
376 | }
377 |
378 | monty_free(mtest);
379 |
380 | if ((tid == 0) && (do_verification == 1))
381 | printf("verified %d x 16 vecModExp (all variable) results\n", loc_iterations);
382 |
383 | gettimeofday(&stopt, NULL);
384 | t_time = my_difftime(&startt, &stopt);
385 | elapsed_time[tid] = t_time;
386 |
387 | if (tid == 0)
388 | printf("Test with %d iterations took %1.4f seconds.\n", loc_iterations, t_time);
389 |
390 | mpz_clear(t1);
391 | mpz_clear(t2);
392 | mpz_clear(base);
393 | mpz_clear(mod);
394 | mpz_clear(exp);
395 | vecFree(m);
396 | vecFree(b);
397 | vecFree(d);
398 | vecFree(s);
399 | vecFree(e);
400 | }
401 |
402 | {
403 | int i;
404 | double sum = 0.0;
405 | double min_t = 9999999999.;
406 | double max_t = 0.;
407 |
408 | for (i = 0; i < threads; i++)
409 | {
410 | sum += elapsed_time[i];
411 | if (elapsed_time[i] < min_t)
412 | min_t = elapsed_time[i];
413 | if (elapsed_time[i] > max_t)
414 | max_t = elapsed_time[i];
415 | }
416 |
417 | printf("average elapsed time = %1.4f\n", sum / threads);
418 | printf("min elapsed time = %1.4f\n", min_t);
419 | printf("max elapsed time = %1.4f\n", max_t);
420 | }
421 |
422 | free(elapsed_time);
423 | free(LCG_STATE);
424 |
425 | printf("\n\n");
426 |
427 | return;
428 | }
429 |
430 | void vecmultest(int do_verification, int threads, int verbose)
431 | {
432 | // test the pmod by comparing all results to those computed using
433 | // validated scalar code.
434 | double* elapsed_time;
435 | int t;
436 | //gmp_randstate_t rng_state;
437 |
438 | //gmp_randinit_default(rng_state);
439 | elapsed_time = (double*)malloc(threads * sizeof(double));
440 |
441 | LCG_STATE = (uint64_t*)malloc(threads * sizeof(uint64_t));
442 |
443 | for (t = 0; t < threads; t++)
444 | {
445 | LCG_STATE[t] = hash64(t);
446 | }
447 |
448 | do_verification = 0;
449 | printf("commencing test mulmod: all variable (random)\n");
450 | #pragma omp parallel num_threads(threads)
451 | {
452 | int i, j;
453 |
454 | // timing variables
455 | struct timeval stopt; // stop time of this job
456 | struct timeval startt; // start time of this job
457 | double t_time;
458 |
459 | mpz_t base, exp, mod, t1, t2;
460 |
461 | int loc_iterations;
462 | int tid = omp_get_thread_num();
463 | monty* mtest;
464 |
465 | // vector bignums
466 | bignum* b = vecInit();
467 | bignum* d = vecInit();
468 | bignum* m = vecInit();
469 | bignum* e = vecInit();
470 | bignum* s = vecInit();
471 | bignum* one = vecInit();
472 |
473 | mpz_init(base);
474 | mpz_init(exp);
475 | mpz_init(mod);
476 | mpz_init(t1);
477 | mpz_init(t2);
478 |
479 | //gmp_randseed_ui(rng_state, tid);
480 |
481 | // attempt to scale the number of iterations with input size
482 | // so this doesn't take forever.
483 | loc_iterations = 100000 * 2 / (NWORDS * DIGITBITS);
484 |
485 | if (MAXBITS >= 4096)
486 | loc_iterations *= 1;
487 | else if (MAXBITS >= 2048)
488 | loc_iterations *= 2;
489 | else if (MAXBITS >= 1024)
490 | loc_iterations *= 5;
491 | else if (MAXBITS >= 512)
492 | loc_iterations *= 10;
493 | else if (MAXBITS >= 256)
494 | loc_iterations *= 25;
495 | else
496 | loc_iterations *= 100;
497 |
498 | #ifdef BASE52
499 | loc_iterations *= 3;
500 | #endif
501 |
502 | #ifdef TARGET_KNL
503 | loc_iterations /= 3;
504 | #endif
505 |
506 | loc_iterations *= 20000;
507 | mtest = monty_alloc();
508 |
509 | #pragma omp barrier
510 |
511 | gettimeofday(&startt, NULL);
512 |
513 | for (j = 0; j < VECLEN; j++)
514 | {
515 | one->data[j] = 1;
516 | }
517 |
518 | #ifdef BASE52
519 | //int tmp = ceil(MAXBITS / 64);
520 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8);
521 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8);
522 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8);
523 |
524 | for (j = 0; j < VECLEN; j++)
525 | {
526 | int k;
527 | for (k = 0; k < NWORDS; k++)
528 | {
529 | uint64_t r1 = spRand64(&LCG_STATE[t]);
530 | uint64_t r2 = spRand64(&LCG_STATE[t]);
531 | uint64_t r3 = spRand64(&LCG_STATE[t]);
532 |
533 | m->data[k * VECLEN + j] = r1 & MAXDIGIT;
534 | b->data[k * VECLEN + j] = r2 & MAXDIGIT;
535 | e->data[k * VECLEN + j] = r3 & MAXDIGIT;
536 | }
537 | }
538 |
539 | #else
540 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8);
541 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8);
542 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8);
543 |
544 | for (j = 0; j < VECLEN; j++)
545 | {
546 | int k;
547 | for (k = 0; k < NWORDS; k++)
548 | {
549 | uint64_t r1 = spRand64(&LCG_STATE[t]);
550 | uint64_t r2 = spRand64(&LCG_STATE[t]);
551 | uint64_t r3 = spRand64(&LCG_STATE[t]);
552 |
553 | m->data[k * VECLEN + j] = r1 & MAXDIGIT;
554 | b->data[k * VECLEN + j] = r2 & MAXDIGIT;
555 | e->data[k * VECLEN + j] = r3 & MAXDIGIT;
556 | }
557 | }
558 | #endif
559 | for (j = 0; j < VECLEN; j++)
560 | m->data[j] |= 0x1;
561 |
562 | if (verbose > 1)
563 | {
564 | for (j = 0; j < VECLEN; j++)
565 | {
566 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
567 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS);
568 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS);
569 |
570 | gmp_printf("init%d:\n\tbase = %Zx\n\texp = %Zx\n\tmod = %Zx\n",
571 | j, base, exp, mod);
572 | }
573 | }
574 |
575 | // now we actually do the (vectorized) montgomery initialization
576 | // on our vector of random moduli.
577 | monty_init_vec(mtest, m, 0);
578 |
579 | if (verbose > 1)
580 | {
581 | for (j = 0; j < VECLEN; j++)
582 | {
583 | extract_bignum_from_vec_to_mpz(base, mtest->rhat, j, NWORDS);
584 | extract_bignum_from_vec_to_mpz(mod, mtest->one, j, NWORDS);
585 |
586 | gmp_printf("init%d:\n\trhat = %Zx\n\tone = %Zx\n\trho = %08x\n",
587 | j, base, mod, mtest->vrho[j]);
588 | }
589 | }
590 |
591 | vecmulmod_ptr(b, mtest->rhat, b, m, s, mtest); // monty rep
592 | vecmulmod_ptr(e, mtest->rhat, e, m, s, mtest); // monty rep
593 |
594 | printf("thread %d starting %d iterations\n", tid, loc_iterations);
595 |
596 | // now do the calculation "b^e % m" a bunch of times
597 | for (i = 0; i < loc_iterations; i++)
598 | {
599 | if (verbose > 1)
600 | {
601 | for (j = 0; j < VECLEN; j++)
602 | {
603 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
604 |
605 | gmp_printf("monty(base%d) = %Zx\n", j, base);
606 | }
607 | }
608 |
609 | vecmulmod_ptr(b, e, b, m, s, mtest);
610 |
611 | if (verbose > 1)
612 | {
613 | for (j = 0; j < VECLEN; j++)
614 | {
615 | extract_bignum_from_vec_to_mpz(base, d, j, NWORDS);
616 |
617 | gmp_printf("modexp%d = %Zx\n", j, base);
618 | }
619 | }
620 |
621 | // now verify each result
622 | if (do_verification)
623 | {
624 | vecmulmod_ptr(b, one, b, m, s, mtest); // normal rep
625 | for (j = 0; j < VECLEN; j++)
626 | {
627 | extract_bignum_from_vec_to_mpz(t1, d, j, NWORDS);
628 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
629 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS);
630 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS);
631 |
632 | mpz_powm(t2, base, exp, mod);
633 |
634 | if (verbose)
635 | {
636 | gmp_printf("iteration %d lane %d:\n\tgmp = %Zx\n\ttest = %Zx\n",
637 | i, j, t2, t1);
638 | }
639 |
640 | if (mpz_cmp(t1, t2) != 0)
641 | {
642 | gmp_printf("iteration %d error lane %d:\nbase = %Zx\nexp = %Zx\nmod = %Zx\ngmp = %Zx\ntest = %Zx\n",
643 | i, j, base, exp, mod, t2, t1);
644 | exit(1);
645 | }
646 |
647 | }
648 | }
649 | }
650 |
651 | monty_free(mtest);
652 |
653 | if ((tid == 0) && (do_verification == 1))
654 | printf("verified %d x 16 vecModExp (all variable) results\n", loc_iterations);
655 |
656 | gettimeofday(&stopt, NULL);
657 | t_time = my_difftime(&startt, &stopt);
658 | elapsed_time[tid] = t_time;
659 |
660 | if (tid == 0)
661 | printf("Test with %d iterations took %1.4f seconds.\n", loc_iterations, t_time);
662 |
663 | mpz_clear(t1);
664 | mpz_clear(t2);
665 | mpz_clear(base);
666 | mpz_clear(mod);
667 | mpz_clear(exp);
668 | vecFree(m);
669 | vecFree(b);
670 | vecFree(d);
671 | vecFree(s);
672 | vecFree(e);
673 | }
674 |
675 | {
676 | int i;
677 | double sum = 0.0;
678 | double min_t = 9999999999.;
679 | double max_t = 0.;
680 |
681 | for (i = 0; i < threads; i++)
682 | {
683 | sum += elapsed_time[i];
684 | if (elapsed_time[i] < min_t)
685 | min_t = elapsed_time[i];
686 | if (elapsed_time[i] > max_t)
687 | max_t = elapsed_time[i];
688 | }
689 |
690 | printf("average elapsed time = %1.4f\n", sum / threads);
691 | printf("min elapsed time = %1.4f\n", min_t);
692 | printf("max elapsed time = %1.4f\n", max_t);
693 | }
694 |
695 | free(elapsed_time);
696 | free(LCG_STATE);
697 |
698 | printf("\n\n");
699 |
700 | return;
701 | }
702 |
703 | void vecsqrtest(int do_verification, int threads, int verbose)
704 | {
705 | // test the pmod by comparing all results to those computed using
706 | // validated scalar code.
707 | double* elapsed_time;
708 | int t;
709 | //gmp_randstate_t rng_state;
710 |
711 | //gmp_randinit_default(rng_state);
712 | elapsed_time = (double*)malloc(threads * sizeof(double));
713 |
714 | LCG_STATE = (uint64_t*)malloc(threads * sizeof(uint64_t));
715 |
716 | for (t = 0; t < threads; t++)
717 | {
718 | LCG_STATE[t] = hash64(t);
719 | }
720 |
721 | do_verification = 0;
722 | printf("commencing test sqrmod: all variable (random)\n");
723 | #pragma omp parallel num_threads(threads)
724 | {
725 | int i, j;
726 |
727 | // timing variables
728 | struct timeval stopt; // stop time of this job
729 | struct timeval startt; // start time of this job
730 | double t_time;
731 |
732 | mpz_t base, exp, mod, t1, t2;
733 |
734 | int loc_iterations;
735 | int tid = omp_get_thread_num();
736 | monty* mtest;
737 |
738 | // vector bignums
739 | bignum* b = vecInit();
740 | bignum* d = vecInit();
741 | bignum* m = vecInit();
742 | bignum* e = vecInit();
743 | bignum* s = vecInit();
744 | bignum* one = vecInit();
745 |
746 | mpz_init(base);
747 | mpz_init(exp);
748 | mpz_init(mod);
749 | mpz_init(t1);
750 | mpz_init(t2);
751 |
752 | //gmp_randseed_ui(rng_state, tid);
753 |
754 | // attempt to scale the number of iterations with input size
755 | // so this doesn't take forever.
756 | loc_iterations = 100000 * 2 / (NWORDS * DIGITBITS);
757 |
758 | if (MAXBITS >= 4096)
759 | loc_iterations *= 1;
760 | else if (MAXBITS >= 2048)
761 | loc_iterations *= 2;
762 | else if (MAXBITS >= 1024)
763 | loc_iterations *= 5;
764 | else if (MAXBITS >= 512)
765 | loc_iterations *= 10;
766 | else if (MAXBITS >= 256)
767 | loc_iterations *= 25;
768 | else
769 | loc_iterations *= 100;
770 |
771 | #ifdef BASE52
772 | loc_iterations *= 3;
773 | #endif
774 |
775 | #ifdef TARGET_KNL
776 | loc_iterations /= 3;
777 | #endif
778 |
779 | loc_iterations *= 20000;
780 | mtest = monty_alloc();
781 |
782 | #pragma omp barrier
783 |
784 | gettimeofday(&startt, NULL);
785 |
786 | for (j = 0; j < VECLEN; j++)
787 | {
788 | one->data[j] = 1;
789 | }
790 |
791 | #ifdef BASE52
792 | //int tmp = ceil(MAXBITS / 64);
793 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8);
794 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8);
795 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8);
796 |
797 | for (j = 0; j < VECLEN; j++)
798 | {
799 | int k;
800 | for (k = 0; k < NWORDS; k++)
801 | {
802 | uint64_t r1 = spRand64(&LCG_STATE[t]);
803 | uint64_t r2 = spRand64(&LCG_STATE[t]);
804 | uint64_t r3 = spRand64(&LCG_STATE[t]);
805 |
806 | m->data[k * VECLEN + j] = r1 & MAXDIGIT;
807 | b->data[k * VECLEN + j] = r2 & MAXDIGIT;
808 | e->data[k * VECLEN + j] = r3 & MAXDIGIT;
809 | }
810 | }
811 |
812 | #else
813 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8);
814 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8);
815 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8);
816 |
817 | for (j = 0; j < VECLEN; j++)
818 | {
819 | int k;
820 | for (k = 0; k < NWORDS; k++)
821 | {
822 | uint64_t r1 = spRand64(&LCG_STATE[t]);
823 | uint64_t r2 = spRand64(&LCG_STATE[t]);
824 | uint64_t r3 = spRand64(&LCG_STATE[t]);
825 |
826 | m->data[k * VECLEN + j] = r1 & MAXDIGIT;
827 | b->data[k * VECLEN + j] = r2 & MAXDIGIT;
828 | e->data[k * VECLEN + j] = r3 & MAXDIGIT;
829 | }
830 | }
831 | #endif
832 | for (j = 0; j < VECLEN; j++)
833 | m->data[j] |= 0x1;
834 |
835 | if (verbose > 1)
836 | {
837 | for (j = 0; j < VECLEN; j++)
838 | {
839 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
840 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS);
841 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS);
842 |
843 | gmp_printf("init%d:\n\tbase = %Zx\n\texp = %Zx\n\tmod = %Zx\n",
844 | j, base, exp, mod);
845 | }
846 | }
847 |
848 | // now we actually do the (vectorized) montgomery initialization
849 | // on our vector of random moduli.
850 | monty_init_vec(mtest, m, 0);
851 |
852 | if (verbose > 1)
853 | {
854 | for (j = 0; j < VECLEN; j++)
855 | {
856 | extract_bignum_from_vec_to_mpz(base, mtest->rhat, j, NWORDS);
857 | extract_bignum_from_vec_to_mpz(mod, mtest->one, j, NWORDS);
858 |
859 | gmp_printf("init%d:\n\trhat = %Zx\n\tone = %Zx\n\trho = %08x\n",
860 | j, base, mod, mtest->vrho[j]);
861 | }
862 | }
863 |
864 | vecmulmod_ptr(b, mtest->rhat, b, m, s, mtest); // monty rep
865 | vecmulmod_ptr(e, mtest->rhat, e, m, s, mtest); // monty rep
866 |
867 | printf("thread %d starting %d iterations\n", tid, loc_iterations);
868 |
869 | // now do the calculation "b^e % m" a bunch of times
870 | for (i = 0; i < loc_iterations; i++)
871 | {
872 | if (verbose > 1)
873 | {
874 | for (j = 0; j < VECLEN; j++)
875 | {
876 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
877 |
878 | gmp_printf("monty(base%d) = %Zx\n", j, base);
879 | }
880 | }
881 |
882 | vecsqrmod_ptr(b, b, m, s, mtest);
883 |
884 | if (verbose > 1)
885 | {
886 | for (j = 0; j < VECLEN; j++)
887 | {
888 | extract_bignum_from_vec_to_mpz(base, d, j, NWORDS);
889 |
890 | gmp_printf("modexp%d = %Zx\n", j, base);
891 | }
892 | }
893 |
894 | // now verify each result
895 | if (do_verification)
896 | {
897 | vecmulmod_ptr(b, one, b, m, s, mtest); // normal rep
898 | for (j = 0; j < VECLEN; j++)
899 | {
900 | extract_bignum_from_vec_to_mpz(t1, d, j, NWORDS);
901 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS);
902 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS);
903 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS);
904 |
905 | mpz_powm(t2, base, exp, mod);
906 |
907 | if (verbose)
908 | {
909 | gmp_printf("iteration %d lane %d:\n\tgmp = %Zx\n\ttest = %Zx\n",
910 | i, j, t2, t1);
911 | }
912 |
913 | if (mpz_cmp(t1, t2) != 0)
914 | {
915 | gmp_printf("iteration %d error lane %d:\nbase = %Zx\nexp = %Zx\nmod = %Zx\ngmp = %Zx\ntest = %Zx\n",
916 | i, j, base, exp, mod, t2, t1);
917 | exit(1);
918 | }
919 |
920 | }
921 | }
922 | }
923 |
924 | monty_free(mtest);
925 |
926 | if ((tid == 0) && (do_verification == 1))
927 | printf("verified %d x 16 vecModExp (all variable) results\n", loc_iterations);
928 |
929 | gettimeofday(&stopt, NULL);
930 | t_time = my_difftime(&startt, &stopt);
931 | elapsed_time[tid] = t_time;
932 |
933 | if (tid == 0)
934 | printf("Test with %d iterations took %1.4f seconds.\n", loc_iterations, t_time);
935 |
936 | mpz_clear(t1);
937 | mpz_clear(t2);
938 | mpz_clear(base);
939 | mpz_clear(mod);
940 | mpz_clear(exp);
941 | vecFree(m);
942 | vecFree(b);
943 | vecFree(d);
944 | vecFree(s);
945 | vecFree(e);
946 | }
947 |
948 | {
949 | int i;
950 | double sum = 0.0;
951 | double min_t = 9999999999.;
952 | double max_t = 0.;
953 |
954 | for (i = 0; i < threads; i++)
955 | {
956 | sum += elapsed_time[i];
957 | if (elapsed_time[i] < min_t)
958 | min_t = elapsed_time[i];
959 | if (elapsed_time[i] > max_t)
960 | max_t = elapsed_time[i];
961 | }
962 |
963 | printf("average elapsed time = %1.4f\n", sum / threads);
964 | printf("min elapsed time = %1.4f\n", min_t);
965 | printf("max elapsed time = %1.4f\n", max_t);
966 | }
967 |
968 | free(elapsed_time);
969 | free(LCG_STATE);
970 |
971 | printf("\n\n");
972 |
973 | return;
974 | }
975 |
976 | int main(int argc, char **argv)
977 | {
978 | int threads;
979 | int do_verification = 1;
980 | int verbose = 0;
981 |
982 | if (argc < 2)
983 | {
984 | printf("usage: avx512_modexp $threads $do_verification\n");
985 | exit(1);
986 | }
987 | else if (argc == 3)
988 | {
989 | do_verification = atoi(argv[2]);
990 | }
991 |
992 | threads = atoi(argv[1]);
993 |
994 | printf("configured with MAXBITS = %d, DIGITBITS = %d, NUMWORDS = %d, VECLEN = %d\n",
995 | MAXBITS, DIGITBITS, NWORDS, VECLEN);
996 | printf("commencing modular exponentiation benchmarks\n"); fflush(stdout);
997 |
998 | #ifdef BASE52
999 | vecmulmod_ptr = &vecmulmod52;
1000 | vecsqrmod_ptr = &vecsqrmod52;
1001 | montsetup_ptr = &vec_montgomery_setup52;
1002 | vecmodexp_ptr = &vecmodexp52;
1003 | #else
1004 | vecmulmod_ptr = &vecmulmod;
1005 | vecsqrmod_ptr = &vecsqrmod;
1006 | montsetup_ptr = &vec_montgomery_setup;
1007 | vecmodexp_ptr = &vecmodexp;
1008 | #endif
1009 |
1010 | omp_set_num_threads(threads);
1011 | //vecmultest(do_verification, threads, verbose);
1012 | //vecsqrtest(do_verification, threads, verbose);
1013 | vecpmodtest(do_verification, threads, verbose);
1014 |
1015 | return 0;
1016 | }
1017 |
1018 |
1019 |
--------------------------------------------------------------------------------
/x64_bench/bigarith.c:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2021, Ben Buhrow
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 | The views and conclusions contained in the software and documentation are those
26 | of the authors and should not be interpreted as representing official policies,
27 | either expressed or implied, of the FreeBSD Project.
28 | */
29 |
30 | #include "bigarith.h"
31 |
32 | void zSet1(bignum *n, base_t d)
33 | {
34 | n->data[0] = d;
35 | n->size = 1;
36 | return;
37 | }
38 |
39 | int zBits(bignum * n)
40 | {
41 | if (n->size == 1)
42 | return spBits(n->data[0]);
43 | else
44 | return DIGITBITS*(n->size-1) + spBits(n->data[n->size-1]);
45 | }
46 |
47 | base_t spBits(base_t n)
48 | {
49 | int i = 0;
50 | while (n != 0)
51 | {
52 | n >>= 1;
53 | i++;
54 | }
55 | return i;
56 | }
57 |
58 | int ndigits_1(base_t n)
59 | {
60 | int i=0;
61 | while (n != 0)
62 | {
63 | n /= 10;
64 | i++;
65 | }
66 | if (i==0)
67 | i++;
68 | return i;
69 | }
70 |
71 | base_t spGCD(base_t x, base_t y)
72 | {
73 | base_t a,b,c;
74 | a=x; b=y;
75 | while (b != 0)
76 | {
77 | c=a%b;
78 | a=b;
79 | b=c;
80 | }
81 | return a;
82 | }
83 |
84 | void sp2big(base_t src, bignum * dest)
85 | {
86 | dest->data[0] = src;
87 | dest->size = 1;
88 | return;
89 | }
90 |
91 | void zClear(bignum * n)
92 | {
93 | int i;
94 | for (i = 0; i <= NWORDS; i++)
95 | n->data[i] = 0;
96 | n->size = 1;
97 | return;
98 | }
99 |
100 | void zClearFull(bignum * n)
101 | {
102 | int i;
103 | memset(n->data, 0, 2 * NWORDS * sizeof(base_t));
104 | n->size = 1;
105 | return;
106 | }
107 |
108 | bignum * zInit(void)
109 | {
110 | int i;
111 | size_t sz = 2 * (NWORDS + 4);
112 | bignum *n;
113 |
114 | n = (bignum *)malloc(sizeof(bignum));
115 |
116 | n->data = (base_t *)xmalloc_align(sz * sizeof(base_t));
117 | for (i = 0; i < sz; i++)
118 | {
119 | n->data[i] = 0;
120 | }
121 | n->size = 1;
122 |
123 | return n;
124 | }
125 |
126 | void zFree(bignum *n)
127 | {
128 | align_free(n->data);
129 | free(n);
130 | }
131 |
132 | void zPrint(bignum *n)
133 | {
134 | int j;
135 | for (j = MIN(n->size - 1, 2*NWORDS); j >= 0; j--)
136 | printf("%016lx", n->data[j]);
137 | return;
138 | }
139 |
140 | void zClamp(bignum * n)
141 | {
142 | int j;
143 | int sn = abs(n->size);
144 | int sign = n->size < 0;
145 |
146 | for (j = sn - 1; j >= 0; j--)
147 | {
148 | if (n->data[j] == 0)
149 | {
150 | sn--;
151 | }
152 | else
153 | break;
154 | }
155 |
156 | n->size = (sn == 0 ? 1 : sn);
157 | if (sign)
158 | n->size *= -1;
159 |
160 | return;
161 | }
162 |
163 | void zCopy(bignum * src, bignum * dest)
164 | {
165 | //physically copy the digits of u into the digits of v
166 | int su = abs(src->size);
167 | int i;
168 |
169 | //memcpy(dest->data, src->data, su * sizeof(base_t));
170 | for (i = 0; i < su; i++)
171 | {
172 | dest->data[i] = src->data[i];
173 | }
174 | dest->size = src->size;
175 | return;
176 | }
177 |
178 | void zAdd(bignum * u, bignum * v, bignum * w)
179 | {
180 | int i, su, sv;
181 | base_t *larger;
182 | base_t k;
183 | int n, m;
184 |
185 | if (u->size < 0)
186 | {
187 | if (v->size > 0)
188 | {
189 | //u is negative, v is not
190 | u->size *= -1;
191 | zSub(v, u, w);
192 | if (u != w)
193 | u->size *= -1;
194 | return;
195 | }
196 | }
197 | else if (v->size < 0)
198 | {
199 | //v is negative, u is not
200 | v->size *= -1;
201 | zSub(u, v, w);
202 | if (v != w)
203 | v->size *= -1;
204 | return;
205 | }
206 |
207 | su = abs(u->size);
208 | sv = abs(v->size);
209 |
210 | if (su >= sv)
211 | {
212 | larger = u->data;
213 | n = su;
214 | m = sv;
215 | }
216 | else
217 | {
218 | larger = v->data;
219 | n = sv;
220 | m = su;
221 | }
222 |
223 | k=0;
224 | for (i = 0; i < m; ++i)
225 | spAdd3(u->data[i], v->data[i], k, w->data + i, &k);
226 |
227 | for ( ; i < n; ++i)
228 | spAdd(larger[i], k, w->data + i, &k);
229 |
230 | w->size = n;
231 | if (k)
232 | {
233 | w->data[n] = k;
234 | w->size++;
235 | }
236 |
237 | // if one is negative then so is the other or we would be subtracting
238 | if (u->size < 0)
239 | w->size *= -1;
240 |
241 | return;
242 | }
243 |
244 | void zShortAdd(bignum * u, base_t v, bignum * w)
245 | {
246 | int i, su;
247 | base_t k;
248 |
249 | if (u->size < 0)
250 | {
251 | //u is negative
252 | u->size *= -1;
253 | zShortSub(u, v, w);
254 | w->size *= -1;
255 | u->size *= -1;
256 | return;
257 | }
258 |
259 | su = abs(u->size);
260 |
261 | zCopy(u,w);
262 |
263 | //add
264 | spAdd(u->data[0], v, w->data, &k);
265 |
266 | //add the carry
267 | spAdd(u->data[1], k, w->data + 1, &k);
268 |
269 | if (k)
270 | {
271 | //only rarely will the carry propagate more than one place
272 | //special case this.
273 | for (i = 2; i < su; ++i)
274 | spAdd(u->data[i], k, w->data + i, &k);
275 |
276 | w->size = u->size;
277 | if (k)
278 | {
279 | w->data[u->size] = k;
280 | w->size++;
281 | }
282 | }
283 |
284 | return;
285 | }
286 |
287 | int zSub(bignum * u, bignum * v, bignum * w)
288 | {
289 | base_t k = 0;
290 | int i, j, su, sv, sw, m, sign=0;
291 | base_t *bigger, *smaller;
292 |
293 | if (u->size < 0)
294 | {
295 | if (v->size > 0)
296 | {
297 | //u is negative, v is not, so really an addition
298 | u->size *= -1;
299 | zAdd(u, v, w);
300 | if (u != w)
301 | u->size *= -1;
302 | w->size *= -1;
303 | //printf("did an addition, result is neg\n");
304 | return 0;
305 | }
306 | else
307 | {
308 | //both are negative, so we really have -u + v or v - u
309 | v->size *= -1;
310 | u->size *= -1;
311 | zSub(v, u, w);
312 | if (v != w)
313 | v->size *= -1;
314 | if (u != w)
315 | u->size *= -1;
316 | //printf("both negative\n");
317 | return 0;
318 | }
319 | }
320 | else if (v->size < 0)
321 | {
322 | if (u->size > 0)
323 | {
324 | //v is negative, u is not, so really an addition
325 | v->size *= -1;
326 | zAdd(u, v, w);
327 | if (v != w)
328 | v->size *= -1;
329 | //printf("did an addition, result is pos\n");
330 | return 0;
331 | }
332 | }
333 |
334 | su = u->size;
335 | sv = v->size;
336 |
337 | if (su > sv)
338 | {
339 | bigger = u->data;
340 | smaller = v->data;
341 | sw = su;
342 | m = sv;
343 | goto beginsub;
344 | }
345 | if (su < sv)
346 | {
347 | bigger = v->data;
348 | smaller = u->data;
349 | sw = sv;
350 | m = su;
351 | sign=1;
352 | goto beginsub;
353 | }
354 |
355 | // same size
356 | m = su;
357 | sw = sv;
358 | for (i = su - 1; i >= 0; --i)
359 | {
360 | if (u->data[i] > v->data[i])
361 | {
362 | bigger = u->data;
363 | smaller = v->data;
364 | goto beginsub;
365 | }
366 | if (u->data[i] < v->data[i])
367 | {
368 | bigger = v->data;
369 | smaller = u->data;
370 | sign=1;
371 | goto beginsub;
372 | }
373 | }
374 |
375 | //equal if got to here
376 | w->size = 1;
377 | w->data[0] = 0;
378 | return 1;
379 |
380 | beginsub:
381 |
382 | for (j = 0; j < m; ++j)
383 | spSub3(bigger[j], smaller[j], k, w->data + j, &k);
384 |
385 |
386 | //if there is a leftover word that is != 0, then subtract any
387 | //carry and simply copy any other leftover words
388 |
389 | //if there is a leftover word that is == 0, then subtract with
390 | //borrow for the rest of the leftover words. this will happen rarely
391 |
392 | //leftover word?
393 | if (sw > m)
394 | {
395 | //not equal to zero?
396 | if (bigger[m] != 0)
397 | {
398 | //subtract any carry and copy the rest
399 | w->data[m] = bigger[m] - k;
400 | j = m + 1;
401 | m = sw;
402 | for(; j < m; j++)
403 | w->data[j] = bigger[j];
404 | }
405 | else
406 | {
407 | //equal to zero, need to subtract with borrow for the rest
408 | //of the leftover words.
409 | j = m;
410 | m = sw;
411 |
412 | for ( ; j < m; ++j)
413 | spSub3(bigger[j], 0, k, w->data + j, &k);
414 | }
415 | }
416 |
417 | w->size = sw;
418 | zClamp(w);
419 |
420 | if (sign)
421 | w->size *= -1;
422 |
423 | if (w->size == 0)
424 | w->size = 1;
425 |
426 | return 0;
427 | }
428 |
429 | void zShortSub(bignum * u, base_t v, bignum * w)
430 | {
431 | // w = u - v
432 | // assume both are initially positive; result can be negative
433 |
434 | int i, su = abs(u->size);
435 | base_t k = 0;
436 |
437 | su = abs(u->size);
438 | w->size = su;
439 |
440 | if (u->size < 0)
441 | {
442 | //u is negative, really an addition
443 | u->size *= -1;
444 | zShortAdd(u,v,w);
445 | u->size *= -1;
446 | w->size *= -1;
447 | return;
448 | }
449 |
450 | zCopy(u,w);
451 |
452 | //subtract
453 | spSub3(u->data[0],v,0,w->data,&k);
454 |
455 | //subtract the borrow
456 | spSub3(u->data[1],k,0,w->data+1,&k);
457 |
458 | if (k)
459 | {
460 | //propagate the borrow
461 | for (i=2;idata[i],0,k,w->data+i,&k);
463 | }
464 |
465 | //check if we lost the high digit
466 | if ((w->data[su - 1] == 0) && (su != 1))
467 | su--;
468 | w->size = su;
469 |
470 | //check for u < v
471 | if (k)
472 | {
473 | //then u < v, and result is negative
474 | w->data[0] = ~w->data[0];
475 | w->data[0]++;
476 | w->size *= -1;
477 | }
478 |
479 | return;
480 | }
481 |
482 | int zCompare(bignum * u, bignum * v)
483 | {
484 | //return 1 if u > v, -1 if u < v, 0 if equal
485 | int i,j,su,sv;
486 |
487 | i = u->size < 0;
488 | j = v->size < 0;
489 |
490 | su = abs(u->size);
491 | sv = abs(v->size);
492 |
493 | if (i > j)
494 | {
495 | //v pos, u neg
496 | //make sure both are not zero
497 | if ((u->data[0] == 0) && (su == 1) && (v->data[0] == 0) && (sv == 1))
498 | return 0;
499 | else
500 | return -1;
501 | }
502 | if (j > i)
503 | {
504 | //u pos, v neg
505 | //make sure both are not zero
506 | if ((u->data[0] == 0) && (su == 1) && (v->data[0] == 0) && (sv == 1))
507 | return 0;
508 | else
509 | return 1;
510 | }
511 |
512 | //check obvious
513 | if (j)
514 | { //both are negative
515 | if (su > sv) return -1;
516 | if (su < sv) return 1;
517 | }
518 | else
519 | { //both are positive
520 | if (su > sv) return 1;
521 | if (su < sv) return -1;
522 | }
523 |
524 | //if the numbers are both negative, then we'll need to switch the return value
525 | for (i = su - 1; i>=0; --i)
526 | {
527 | if (u->data[i] > v->data[i])
528 | return (1 - 2*j);
529 | if (u->data[i] < v->data[i])
530 | return (-1 + 2*j);
531 | }
532 |
533 | //equal if got to here
534 | return 0;
535 | }
536 |
537 | int zCompare1(bignum * u, base_t v)
538 | {
539 | // return 1 if u > v, -1 if u < v, 0 if equal.
540 | // single digit v is assumed to be positive.
541 | if (u->size < 0)
542 | {
543 | return -1;
544 | }
545 | else if (u->size > 1)
546 | {
547 | return 1;
548 | }
549 | else if (u->data[0] > v)
550 | {
551 | return 1;
552 | }
553 | else if (u->data[0] < v)
554 | {
555 | return -1;
556 | }
557 | else
558 | {
559 | return 0;
560 | }
561 | }
562 |
563 | base_t zShortDiv(bignum * u, base_t v, bignum * q)
564 | {
565 | // q = u/v
566 | // return the remainder
567 |
568 | int su = abs(u->size);
569 | int sign = u->size < 0 ? 1 : 0;
570 | int i;
571 | base_t rem = 0;
572 |
573 | q->size = su;
574 |
575 | i = su - 1;
576 | if (u->data[i] < v)
577 | {
578 | rem = u->data[i];
579 | q->data[i--] = 0;
580 | }
581 |
582 | while (i >= 0)
583 | {
584 | base_t quot1;
585 |
586 | #if DIGITBITS == 64
587 | __asm__ ("divq %4"
588 | : "=a"(quot1),"=d"(rem)
589 | : "1"(rem), "0"(u->data[i]), "r"(v) );
590 | #else
591 | __asm__ ("divl %4"
592 | : "=a"(quot1),"=d"(rem)
593 | : "1"(rem), "0"(u->data[i]), "r"(v) );
594 | #endif
595 |
596 | q->data[i] = quot1;
597 | i--;
598 | }
599 |
600 | //the quotient could be one limb smaller than the input
601 | if ((q->data[q->size - 1] == 0) && (q->size != 1))
602 | q->size--;
603 |
604 | if (sign)
605 | q->size *= -1;
606 |
607 | return rem;
608 | }
609 |
610 | void zDiv(bignum * u, bignum * v, bignum * q, bignum * r)
611 | {
612 | /*
613 | q = u \ v
614 | r = u mod v
615 | u is overwritten
616 |
617 | schoolbook long division. see knuth TAOCP, vol. 2
618 | */
619 |
620 | base_t v1=0,v2=0,d=0,k,qhat,rhat,uj2,tt[2],pp[2];
621 | int i,j,m,su,sv;
622 | int s =0,cmp,sdd,sd;
623 | unsigned int shift;
624 | base_t bitmask;
625 |
626 | su = abs(u->size);
627 | sv = abs(v->size);
628 | m = su-sv;
629 |
630 | //v > u, so just set q = 0 and r = u
631 | if (su < sv)
632 | {
633 | q->size = 1;
634 | zCopy(u,r);
635 |
636 | return;
637 | }
638 |
639 | if (sv == 1)
640 | {
641 | r->data[0] = zShortDiv(u, v->data[0], q);
642 | r->size = 1;
643 | s = (v->size < 0);
644 | if (s)
645 | {
646 | q->size *= -1;
647 | r->size *= -1;
648 | }
649 | return;
650 | }
651 |
652 | //u and v are the same length
653 | if (su == sv)
654 | {
655 | cmp = zCompare(u,v);
656 | //v > u, as above
657 | if (cmp < 0)
658 | {
659 | q->size = 1;
660 | zCopy(u,r);
661 | return;
662 | }
663 | else if (cmp == 0) //v == u, so set q = 1 and r = 0
664 | {
665 | q->size = 1;
666 | q->data[0] = 1;
667 | r->size = 1;
668 | r->data[0] = 0;
669 | return;
670 | }
671 | }
672 |
673 | //normalize v by left shifting until the high bit of v is set (v1 >= floor(2^31))
674 | bitmask = HIBITMASK;
675 | for (shift = 0; shift < DIGITBITS; ++shift)
676 | {
677 | if (v->data[sv-1] & bitmask)
678 | break;
679 | bitmask >>= 1;
680 | }
681 |
682 | //normalize v by shifting left (x2) shift number of times
683 | //overflow should never occur to v during normalization
684 | zShiftLeft(v,v,shift);
685 |
686 | //left shift u the same amount - may get an overflow here
687 | zShiftLeft(u,u,shift);
688 | if (abs(u->size) == su)
689 | { //no overflow - force extra digit
690 | if (u->size < 0)
691 | u->size--;
692 | else
693 | u->size++;
694 | u->data[su] = 0;
695 | su++;
696 | }
697 | else
698 | su++;
699 |
700 | //copy first two digits of v to local variables for quick access
701 | v1=v->data[sv-1];
702 | v2=v->data[sv-2];
703 |
704 | sdd=0;
705 | sd=0;
706 | //main loop
707 | for (j=0;j<=m;++j)
708 | {
709 | //calculate qhat
710 | tt[1] = u->data[su-j-1]; //first digit of normalized u
711 | tt[0] = u->data[su-j-2]; //second digit of normalized u
712 | if (tt[1] == v1)
713 | qhat = MAXDIGIT;
714 | else
715 | spDivide(&qhat, &rhat, tt, v1);
716 |
717 | //quick check if qhat is too big based on our initial guess involving
718 | //the first two digits of u and v.
719 | uj2 = u->data[su-j-3];
720 |
721 | while (1)
722 | {
723 | spMultiply(qhat,v1,&pp[0],&pp[1]);
724 | shortSubtract(tt,pp,tt);
725 | if (tt[1]) break;
726 | tt[1] = tt[0]; tt[0] = uj2;
727 | spMultiply(qhat,v2,&pp[0],&pp[1]);
728 | i = shortCompare(pp,tt); //p = v2*qhat, t = (uj*b+uj1-qhat*v1)*b + uj2
729 |
730 | if (i == 1)
731 | qhat--;
732 | else
733 | break;
734 | }
735 |
736 | //keep track of the significant digits
737 | if (qhat > 0)
738 | {
739 | sdd = sdd + 1 + sd;
740 | sd = 0;
741 | }
742 | else if (sdd != 0)
743 | sd++;
744 |
745 | //multiply and subtract, in situ
746 | k=0;
747 | for (i=0;idata[i],qhat,&pp[0],&pp[1]);
751 | spAdd(pp[0],k,&tt[0],&tt[1]);
752 | u->data[s] = u->data[s] - tt[0];
753 | //check if this result is negative, remember the borrow for the next digit
754 | if (u->data[s] > (u->data[s] + tt[0]))
755 | k = pp[1] + tt[1] + 1;
756 | else
757 | k = pp[1] + tt[1];
758 | }
759 |
760 | //if the final carry is bigger than the most significant digit of u, then qhat
761 | //was too big, i.e. qhat[v1v2...vn] > [u0u1u2...un]
762 | if (k > u->data[su-j-1])
763 | {
764 | //correct by decrementing qhat and adding back [v1v2...vn] to [u0u1...un]
765 | qhat--;
766 | //first subtract the final carry, yielding a negative number for [u0u1...un]
767 | u->data[su-j-1] -= k;
768 | //then add back v
769 | k=0;
770 | for (i=0;idata[su-j-sv+i-1],v->data[i],k,&u->data[su-j-sv+i-1],&k);
772 | u->data[su-j-1] += k;
773 | }
774 | else //else qhat was ok, subtract the final carry
775 | u->data[su-j-1] -= k;
776 |
777 | //set digit of q
778 | q->data[m-j] = qhat;
779 | }
780 | q->size = sdd+sd;
781 | zCopy(u,r);
782 |
783 | for (s=r->size - 1; s>=0; --s)
784 | {
785 | if ((r->data[s] == 0) && (r->size > 0))
786 | r->size--;
787 | else
788 | break;
789 | }
790 |
791 | //unnormalize.
792 | zShiftRight(v,v,shift);
793 | zShiftRight(r,r,shift);
794 |
795 | s = (u->size < 0) ^ (v->size < 0);
796 | if (s)
797 | {
798 | q->size *= -1;
799 | r->size *= -1;
800 | }
801 |
802 | return;
803 | }
804 |
805 | int shortCompare(base_t p[2], base_t t[2])
806 | {
807 | //utility function used in zDiv
808 | int i;
809 |
810 | for (i=1;i>=0;--i)
811 | {
812 | if (p[i] > t[i]) return 1;
813 | if (p[i] < t[i]) return -1;
814 | }
815 | return 0;
816 | }
817 |
818 | int shortSubtract(base_t u[2], base_t v[2], base_t w[2])
819 | {
820 | //utility function used in zDiv
821 | base_t j=0;
822 |
823 | w[0] = u[0] - v[0];
824 | if (w[0] > (MAXDIGIT - v[0]))
825 | {
826 | j=1;
827 | w[0] = w[0] + MAXDIGIT + 1;
828 | }
829 | w[1] = u[1] - v[1] - j;
830 |
831 | return 1;
832 | }
833 |
834 | void zMult(bignum * u, bignum * v, bignum * w, bignum *tmp)
835 | {
836 | //w = u*v
837 | base_t k = 0;
838 | int su, sv, i, j, signu, signv;
839 | base_t *wptr;
840 | int words = u->size;
841 |
842 | signu = u->size < 0;
843 | signv = v->size < 0;
844 |
845 | su = abs(u->size);
846 | sv = abs(v->size);
847 |
848 | //for each digit of u
849 | for (i = 0; i < su; ++i)
850 | {
851 | //take an inner product and add in-situ with the previous inner products
852 | k = 0;
853 | wptr = &tmp->data[i];
854 | for (j = 0; j < sv; ++j)
855 | {
856 | spMulAdd(u->data[i], v->data[j], wptr[j], k, &wptr[j], &k);
857 | }
858 | wptr[j] += k;
859 | }
860 | tmp->size = su + sv;
861 |
862 | zClamp(tmp);
863 |
864 | if (((u->size == 1) && (u->data[0] == 0)) || ((v->size == 1) && (v->data[0] == 0)))
865 | {
866 | w->size = 1;
867 | w->data[0] = 0;
868 | }
869 | else
870 | {
871 | zCopy(tmp, w);
872 |
873 | if (signu ^ signv)
874 | w->size *= -1;
875 | }
876 |
877 | return;
878 | }
879 |
880 | void zMul(bignum * u, bignum * v, bignum * w)
881 | {
882 | //w = u*v
883 | base_t k = 0;
884 | int su, sv, i, j, signu, signv;
885 | base_t *wptr;
886 | int words = u->size;
887 | bignum *tmp;
888 |
889 | signu = u->size < 0;
890 | signv = v->size < 0;
891 |
892 | tmp = zInit();
893 |
894 | su = abs(u->size);
895 | sv = abs(v->size);
896 |
897 | //for each digit of u
898 | for (i = 0; i < su; ++i)
899 | {
900 | //take an inner product and add in-situ with the previous inner products
901 | k=0;
902 | wptr = &tmp->data[i];
903 | for (j = 0; j < sv; ++j)
904 | {
905 | spMulAdd(u->data[i], v->data[j], wptr[j], k, &wptr[j], &k);
906 | }
907 | wptr[j] += k;
908 | }
909 | tmp->size = su+sv;
910 |
911 | zClamp(tmp);
912 |
913 | if (((u->size == 1) && (u->data[0] == 0)) || ((v->size == 1) && (v->data[0] == 0)))
914 | {
915 | w->size = 1;
916 | w->data[0] = 0;
917 | }
918 | else
919 | {
920 | zCopy(tmp, w);
921 |
922 | if (signu ^ signv)
923 | w->size *= -1;
924 | }
925 |
926 | zFree(tmp);
927 | return;
928 | }
929 |
930 | void zModMul(bignum * u, bignum * v, bignum * n, bignum * w)
931 | {
932 | bignum * t1, *t2;
933 |
934 | t1 = zInit();
935 | t2 = zInit();
936 |
937 | zMul(u,v,t1);
938 | zDiv(t1,n,t2,w);
939 |
940 | zFree(t1);
941 | zFree(t2);
942 | return;
943 | }
944 |
945 | void zModMuls(bignum * u, bignum * v, bignum * n, bignum * w, bignum *s1, bignum *s2)
946 | {
947 | zMul(u, v, s1);
948 | zDiv(s1, n, s2, w);
949 | return;
950 | }
951 |
952 | void zModExp(bignum *d, bignum *b, bignum *e, bignum *m)
953 | {
954 | // d = b^e mod m
955 | // all b and e vector elements can be different.
956 | // all m elements are the same.
957 | int i, word = 0, bit = 0;
958 | int j;
959 |
960 | bignum *s1, *s2, *bb, *t;
961 |
962 | s1 = zInit();
963 | s2 = zInit();
964 | bb = zInit();
965 | t = zInit();
966 |
967 | zCopy(b, bb);
968 | zSet1(d, 1);
969 |
970 | while (word < NWORDS)
971 | {
972 | if (e->data[word] & (1 << bit))
973 | {
974 | zModMuls(d, bb, m, d, s1, s2);
975 | }
976 |
977 | zModMuls(bb, bb, m, bb, s1, s2);
978 |
979 | bit++;
980 | if (bit == 32)
981 | {
982 | bit = 0;
983 | word++;
984 | }
985 | }
986 |
987 | zFree(s1);
988 | zFree(s2);
989 | zFree(bb);
990 | zFree(t);
991 | return;
992 | }
993 |
994 | void zShortMul(bignum * u, base_t v, bignum * w)
995 | {
996 | //w = u * v
997 | //schoolbook multiplication, see knuth TAOCP, vol. 2
998 | base_t k=0;
999 | long i;
1000 | long su;
1001 |
1002 | su = abs(u->size);
1003 |
1004 | //inner product
1005 | for (i = 0; i < su; ++i)
1006 | spMulAdd(u->data[i], v, 0, k, &w->data[i], &k);
1007 |
1008 | //if still have a carry, add a digit to w
1009 | if (k)
1010 | {
1011 | w->data[su]=k;
1012 | su++;
1013 | }
1014 |
1015 | if (v == 0)
1016 | {
1017 | w->size = 1;
1018 | }
1019 | else
1020 | {
1021 | w->size = su;
1022 |
1023 | if (u->size < 0)
1024 | w->size *= -1;
1025 | }
1026 |
1027 | return;
1028 | }
1029 |
1030 | void zSqr(bignum * x, bignum * w)
1031 | {
1032 | //this routine is faster than the generic comba sqr on MSVC x86_32 builds.
1033 | bignum *t;
1034 |
1035 | t = zInit();
1036 |
1037 | zCopy(x, t);
1038 | zMul(x, x, w);
1039 |
1040 | zFree(t);
1041 |
1042 | return;
1043 | }
1044 |
1045 | void zShiftLeft(bignum * a, bignum * b, int x)
1046 | {
1047 | /* Computes a = b << x */
1048 | int i,wordshift;
1049 | int y;
1050 | int sb,j;
1051 | base_t mask, carry, nextcarry;
1052 |
1053 | wordshift = x / DIGITBITS;
1054 | x = x % DIGITBITS;
1055 |
1056 | //create a mask for the bits that will overflow each digit
1057 | mask = HIBITMASK;
1058 | for (i = 1; i < x; ++i)
1059 | mask = (mask >> 1) | mask;
1060 |
1061 | if (x == 0) mask = 0x0;
1062 |
1063 | sb = abs(b->size);
1064 | a->size = sb;
1065 |
1066 | //for each digit, remember the highest x bits using the mask, then shift.
1067 | //the highest x bits becomes the lowest x bits for the next digit
1068 | y = DIGITBITS - x;
1069 | carry = 0;
1070 | for (j = 0; j < sb; ++j)
1071 | {
1072 | nextcarry = (b->data[j] & mask) >> y;
1073 | a->data[j] = (b->data[j] << x) | carry;
1074 | carry = nextcarry;
1075 | }
1076 |
1077 | if (carry)
1078 | {
1079 | a->data[sb] = carry;
1080 | a->size++;
1081 | }
1082 |
1083 | if (wordshift)
1084 | {
1085 | //now shift by any full words
1086 | for (i=a->size - 1;i>=0;i--)
1087 | a->data[i+wordshift] = a->data[i];
1088 | //zero out the ones that were shifted
1089 | for (i=wordshift-1;i>=0;i--)
1090 | a->data[i] = 0;
1091 | a->size += wordshift;
1092 | }
1093 |
1094 | if (b->size < 0)
1095 | a->size *= -1;
1096 |
1097 | return;
1098 | }
1099 |
1100 | void zShiftLeft_1(bignum * a, bignum * b)
1101 | {
1102 | /* Computes a = b << 1 */
1103 | int i;
1104 | int y;
1105 | int sb, j;
1106 | base_t mask, carry, nextcarry;
1107 |
1108 | //create a mask for the bits that will overflow each digit
1109 | mask = HIBITMASK;
1110 | sb = abs(b->size);
1111 | a->size = sb;
1112 |
1113 | //for each digit, remember the highest x bits using the mask, then shift.
1114 | //the highest x bits becomes the lowest x bits for the next digit
1115 | y = DIGITBITS - 1;
1116 | carry = 0;
1117 | for (j = 0; j < sb; ++j)
1118 | {
1119 | nextcarry = (b->data[j] & mask) >> y;
1120 | a->data[j] = (b->data[j] << 1) | carry;
1121 | carry = nextcarry;
1122 | }
1123 |
1124 | if (carry)
1125 | {
1126 | a->data[sb] = carry;
1127 | a->size++;
1128 | }
1129 |
1130 | if (b->size < 0)
1131 | a->size *= -1;
1132 |
1133 | return;
1134 | }
1135 |
1136 | void zShiftRight(bignum * a, bignum * b, int x)
1137 | { /* Computes a = b >> x */
1138 | int i, y, sign, wordshift;
1139 | int sb;
1140 | base_t mask, carry, nextcarry;
1141 |
1142 | wordshift = x / DIGITBITS;
1143 | x = x % DIGITBITS;
1144 |
1145 | //create a mask for the bits that will overflow each digit
1146 | mask = 0x1;
1147 | for (i = 1; i < x; ++i)
1148 | {
1149 | mask = (mask << 1) | mask;
1150 | }
1151 | if (x == 0) mask = 0x0;
1152 |
1153 | sign =( b->size < 0);
1154 | sb = abs(b->size);
1155 | a->size = sb;
1156 |
1157 | //for each digit, remember the lowest x bits using the mask, then shift.
1158 | //the lowest x bits becomes the highest x bits for the next digit
1159 | y = DIGITBITS - x;
1160 | carry = 0;
1161 | for (i = sb - 1; i >= 0; --i)
1162 | {
1163 | nextcarry = (b->data[i] & mask) << y;
1164 | a->data[i] = b->data[i] >> x | carry;
1165 | carry = nextcarry;
1166 | }
1167 |
1168 | if ((a->data[sb-1] == 0) && (a->size > 1))
1169 | a->size--;
1170 |
1171 | if (wordshift)
1172 | {
1173 | //now shift by any full words
1174 | for (i=0;isize - 1;i++)
1175 | a->data[i] = a->data[i+wordshift];
1176 | //zero out the ones that were shifted
1177 | a->size -= wordshift;
1178 | }
1179 |
1180 | if (sign)
1181 | a->size *= -1;
1182 |
1183 | return;
1184 | }
1185 |
1186 | void zShiftRight_1(bignum * a, bignum * b)
1187 | { /* Computes a = b >> x */
1188 | int i, sign;
1189 | int sb;
1190 | base_t mask, carry, nextcarry;
1191 |
1192 | //create a mask for the bits that will overflow each digit
1193 | mask = 0x1;
1194 |
1195 | sign = (b->size < 0);
1196 | sb = abs(b->size);
1197 | a->size = sb;
1198 |
1199 | //for each digit, remember the lowest x bits using the mask, then shift.
1200 | //the lowest x bits becomes the highest x bits for the next digit
1201 | carry = 0;
1202 | for (i = sb - 1; i >= 0; --i)
1203 | {
1204 | nextcarry = (b->data[i] & mask) << 31;
1205 | a->data[i] = (b->data[i] >> 1) | carry;
1206 | carry = nextcarry;
1207 | }
1208 |
1209 | if ((a->data[sb - 1] == 0) && (a->size > 1))
1210 | a->size--;
1211 |
1212 | if (sign)
1213 | a->size *= -1;
1214 |
1215 | return;
1216 | }
1217 |
1218 | int zLEGCD(bignum *u, bignum *v, bignum *w)
1219 | {
1220 | //use the Lehman-Euclid algorithm to calculate GCD(u,v) = w
1221 | //Algorithm L in Knuth, 4.5.2 p. 329
1222 | //assumes u,v nonnegative
1223 |
1224 | base_t aa,bb,cc,dd;
1225 | int i,j,k,it;
1226 | base_signed_t a,b,c,d,t;
1227 | base_t up,vdp, q1, q2;
1228 | base_t mask;
1229 | bignum *y, *zz; //t and w, in knuth
1230 | bignum *x; //tmp variable
1231 | bignum *uu, *vv; //so u and v don't get destroyed
1232 | bignum *uh, *vh;
1233 | base_t udp[2],vp[2];
1234 |
1235 |
1236 | #if DIGITBITS == 32
1237 | mask = 0xff000000;
1238 | #else
1239 | mask = 0xff00000000000000;
1240 | #endif
1241 |
1242 | i = zCompare1(u,0);
1243 | j = zCompare1(v,0);
1244 |
1245 | if (i == 0)
1246 | {
1247 | zCopy(v,w);
1248 | return 1;
1249 | }
1250 | if (j == 0)
1251 | {
1252 | zCopy(u,w);
1253 | return 1;
1254 | }
1255 |
1256 | //temp variables should be twice as big as the input, to make room
1257 | //for intermediate operations. w should be as big as the biggest input.
1258 | i = u->size;
1259 | j = v->size;
1260 | if (j > i)
1261 | i = j;
1262 |
1263 | y = zInit();
1264 | zz = zInit();
1265 | x = zInit();
1266 | uu = zInit();
1267 | vv = zInit();
1268 | uh = zInit();
1269 | vh = zInit();
1270 |
1271 | //put bigger number in uu, other in vv.
1272 | i = zCompare(u,v);
1273 | if (i >= 0)
1274 | {
1275 | zCopy(u,uu);
1276 | zCopy(v,vv);
1277 | }
1278 | else
1279 | {
1280 | zCopy(v,uu);
1281 | zCopy(u,vv);
1282 | }
1283 |
1284 | j=0;
1285 | while (vv->size > 1)
1286 | {
1287 | //Step L1
1288 | for (it=vv->size;itsize;it++)
1289 | vv->data[it]=0;
1290 | vv->size = uu->size;
1291 | //get the most significant 32 bits of u and v, such that uhat >= vhat
1292 | uh->data[1] = uu->data[uu->size - 1];
1293 | uh->data[0] = uu->data[uu->size - 2];
1294 | vh->data[1] = vv->data[vv->size - 1];
1295 | vh->data[0] = vv->data[vv->size - 2];
1296 | uh->size = vh->size = 2;
1297 |
1298 | //rightshift until uhat is a single word
1299 | //0xff000000 is magic
1300 | if ((uh->data[1] & mask) > 0)
1301 | {
1302 | uh->data[0] = uh->data[1];
1303 | vh->data[0] = vh->data[1];
1304 | }
1305 | else
1306 | {
1307 | i=0;
1308 | aa=uh->data[1];
1309 | while ((aa & MAXDIGIT) != 0)
1310 | {
1311 | aa >>= 1;
1312 | i++;
1313 | }
1314 | zShiftRight(uh,uh,i);
1315 | zShiftRight(vh,vh,i);
1316 | }
1317 |
1318 | //make u',v',u'',v''
1319 | up = uh->data[0];
1320 | vdp = vh->data[0];
1321 | if (up == MAXDIGIT)
1322 | {
1323 | udp[0] = 0;
1324 | udp[1] = 1;
1325 | }
1326 | else
1327 | {
1328 | udp[0] = up+1;
1329 | udp[1] = 0;
1330 | }
1331 |
1332 | if (vdp == MAXDIGIT)
1333 | {
1334 | vp[0] = 0;
1335 | vp[1] = 1;
1336 | }
1337 | else
1338 | {
1339 | vp[0] = vdp+1;
1340 | vp[1] = 0;
1341 | }
1342 |
1343 | a=1; b=0; c=0; d=1;
1344 |
1345 | k=0;
1346 | while (1)
1347 | {
1348 | //Step L2:
1349 | /*test quotient, protecting for overflow. the conditions:
1350 | 0 <= uhat + a <= 2^32
1351 | 0 <= uhat + b < 2^32
1352 | 0 <= vhat + c < 2^32
1353 | 0 <= vhat + d <= 2^32
1354 | will always hold. hence only need to check for the case where
1355 | uhat == MAX_DIGIT and a = 1 or vhat == MAX_DIGIT and d = 1
1356 | */
1357 |
1358 | //first check for /0
1359 | if (((vp[0] == 0) && (vp[1] == 0)) || (vdp == 0))
1360 | break;
1361 |
1362 | //u''/v''
1363 | if (udp[1] == 1)
1364 | {
1365 | spDivide(&q2,(base_t *)&t,udp,vdp);
1366 | }
1367 | else
1368 | q2 = udp[0]/vdp;
1369 |
1370 | //u'/v'
1371 | if (vp[1] >= 1)
1372 | q1=0;
1373 | else
1374 | q1 = up/vp[0];
1375 |
1376 | if (q1 != q2)
1377 | break;
1378 |
1379 | //Step L3: Emulate Euclid
1380 | t=a-q1*c;
1381 | a=c;
1382 | c=t;
1383 | t=b-q1*d;
1384 | b=d;
1385 | d=t;
1386 | t=up-q1*vp[0];
1387 | up=vp[0];
1388 | vp[0]=t;
1389 | t=udp[0]-q2*vdp;
1390 | udp[0]=vdp;
1391 | vdp=t;
1392 | k++;
1393 | if (k>10000)
1394 | goto free;
1395 | }
1396 |
1397 | //Step L4: multiprecision step
1398 | if (b==0)
1399 | {
1400 | for (i=vv->size-1;i>=0;i--)
1401 | {
1402 | if (vv->data[i] == 0)
1403 | vv->size--;
1404 | else
1405 | break;
1406 | }
1407 | zDiv(uu,vv,y,x); //y = u mod v
1408 | zCopy(vv,uu); //u = v
1409 | zCopy(x,vv); //v = y
1410 | }
1411 | else
1412 | {
1413 | //aa=abs(a);
1414 | //bb=abs(b);
1415 | //cc=abs(c);
1416 | //dd=abs(d);
1417 | if (a<0)
1418 | aa = -a;
1419 | else
1420 | aa = a;
1421 | if (b<0)
1422 | bb = -b;
1423 | else
1424 | bb = b;
1425 | if (c<0)
1426 | cc = -c;
1427 | else
1428 | cc = c;
1429 | if (d<0)
1430 | dd = -d;
1431 | else
1432 | dd = d;
1433 | zShortMul(uu,aa,y); //y = A*u
1434 | zShortMul(vv,bb,x); //y = y + B*v
1435 | if (a<0)
1436 | {
1437 | zSub(x,y,x);
1438 | zCopy(x,y);
1439 | }
1440 | else if (b<0)
1441 | zSub(y,x,y);
1442 | else
1443 | zAdd(y,x,y);
1444 |
1445 | zShortMul(uu,cc,zz); //z = c*u
1446 | zShortMul(vv,dd,x); //z = z + d*v
1447 | if (c<0)
1448 | {
1449 | zSub(x,zz,x);
1450 | zCopy(x,zz);
1451 | }
1452 | else // (d<0)
1453 | zSub(zz,x,zz);
1454 |
1455 | zCopy(y,uu); //u = y;
1456 | zCopy(zz,vv); //v = z;
1457 | }
1458 | j++;
1459 | if (j>10000)
1460 | goto free;
1461 | }
1462 |
1463 | //here, the size of v is 1, so finish up with regular GCD
1464 | zBinGCD(uu,vv,w);
1465 |
1466 | free:
1467 | zFree(y);
1468 | zFree(zz);
1469 | zFree(x);
1470 | zFree(uu);
1471 | zFree(vv);
1472 | zFree(uh);
1473 | zFree(vh);
1474 | return 1;
1475 | }
1476 |
1477 | int zBinGCD(bignum *u, bignum *v, bignum *w)
1478 | {
1479 | //computes w = gcd(u,v)
1480 | //follows algorithm B. p.321 Knuth Vol. 2
1481 |
1482 | bignum *uu, *vv, *t;
1483 | long i=0,j;
1484 | int k,sz;
1485 |
1486 | sz = abs(u->size);
1487 | if (abs(v->size) > sz)
1488 | {
1489 | sz = abs(v->size);
1490 | }
1491 |
1492 | i = zCompare1(u, 0);
1493 | j = zCompare1(v, 0);
1494 | if (i == 0)
1495 | {
1496 | zCopy(v,w);
1497 | return 1;
1498 | }
1499 | if (j == 0)
1500 | {
1501 | zCopy(u,w);
1502 | return 1;
1503 | }
1504 |
1505 | uu = zInit();
1506 | vv = zInit();
1507 | t = zInit();
1508 |
1509 | zCopy(u,uu);
1510 | zCopy(v,vv);
1511 |
1512 | //find power of 2 such that u and v are not both even
1513 | k = 0;
1514 | while(((uu->data[0] & 0x1) == 0) && ((vv->data[0] & 0x1) == 0))
1515 | {
1516 | zShiftRight(uu,uu,1);
1517 | zShiftRight(vv,vv,1);
1518 | k++;
1519 | }
1520 |
1521 | j=0;
1522 | do
1523 | {
1524 | if ((uu->data[0] & 0x1) == 0)
1525 | zShiftRight(uu,uu,1);
1526 | else if ((vv->data[0] & 0x1) == 0)
1527 | zShiftRight(vv,vv,1);
1528 | else
1529 | {
1530 | zSub(uu,vv,t);
1531 | zShiftRight(t,t,1);
1532 | if (zCompare(uu,vv) < 0)
1533 | zCopy(t,vv);
1534 | else
1535 | zCopy(t,uu);
1536 | }
1537 | ++j;
1538 | if (j>= 10000)
1539 | break;
1540 | } while (zCompare1(uu, 0) > 0);
1541 |
1542 | zClear(w);
1543 | w->data[0] = 1;
1544 | zShiftLeft(w,w,k);
1545 | zMul(w,vv,uu);
1546 | zCopy(uu,w);
1547 |
1548 | zFree(uu);
1549 | zFree(vv);
1550 | zFree(t);
1551 | return j;
1552 | }
1553 |
1554 | void xGCD(bignum *a, bignum *b, bignum *x, bignum *y, bignum *g)
1555 | {
1556 | //compute the extended GCD of a, b, returning g = GCD(a,b) and x, y
1557 | //such that ax + by = GCD(a,b) if a,b are coprime
1558 | bignum *t1, *t2, *t3, *u, *v, *r, *R, *q, *tmp;
1559 |
1560 | // int i;
1561 | /*
1562 |
1563 | Step 1:
1564 | if a < b then
1565 | Set u=0, v=1, and r=b
1566 | Set U=1, V=0, and R=a
1567 | else
1568 | Set u=1, v=0, and r=a
1569 | Set U=0, V=1, and R=b
1570 |
1571 | Step 2:
1572 | if R = 0 then return r (for the gcd) and no inverses exist.
1573 | if R = 1 then return R (for the gcd), V (for the inverse a(mod b)) and U (for the inverse of b(mod a)).
1574 |
1575 | Step 3:
1576 | Calculate q = int(r/R)
1577 | Calculate t1 = u - U*q
1578 | Calculate t2 = v - V*q
1579 | Calculate t3 = r - R*q
1580 | set u=U, v=V, r=R
1581 | set U=t1, V=t2, R=t3
1582 | goto Step 2.
1583 | */
1584 |
1585 | tmp = zInit();
1586 | t1 = zInit();
1587 | t2 = zInit();
1588 | t3 = zInit();
1589 | q = zInit();
1590 | r = zInit();
1591 | R = zInit();
1592 | u = zInit();
1593 | v = zInit();
1594 |
1595 | //need to check for temp allocation
1596 |
1597 | zClear(x);
1598 | zClear(y);
1599 |
1600 |
1601 | if (zCompare(a,b) < 0)
1602 | {
1603 | u->data[0]=0;
1604 | v->data[0]=1;
1605 | zCopy(b,r);
1606 | x->data[0]=1;
1607 | y->data[0]=0;
1608 | zCopy(a,R);
1609 | }
1610 | else
1611 | {
1612 | u->data[0]=1;
1613 | v->data[0]=0;
1614 | zCopy(a,r);
1615 | x->data[0]=0;
1616 | y->data[0]=1;
1617 | zCopy(b,R);
1618 | }
1619 |
1620 | while (1)
1621 | {
1622 | if (zCompare1(R, 0) == 0)
1623 | {
1624 | zCopy(r,g);
1625 | x->data[0] = 0;
1626 | x->size = 1;
1627 | y->data[0] = 0;
1628 | y->size = 1;
1629 | break;
1630 | }
1631 |
1632 | if (zCompare1(R, 1) == 0)
1633 | {
1634 | zCopy(R,g);
1635 | break;
1636 | }
1637 |
1638 | zCopy(r,tmp);
1639 | zDiv(tmp,R,q,t3); //q = int(r/R), t3 = r % R
1640 |
1641 | zMul(q,x,tmp); //t1 = u - U*q
1642 | zSub(u,tmp,t1);
1643 |
1644 | zMul(q,y,tmp); //t2 = v - V*q
1645 | zSub(v,tmp,t2);
1646 |
1647 | zCopy(x,u);
1648 | zCopy(y,v);
1649 | zCopy(R,r);
1650 |
1651 | zCopy(t1,x);
1652 | zCopy(t2,y);
1653 | zCopy(t3,R);
1654 |
1655 | //printf("iteration %d: x = %s\n", i, z2decstr(x));
1656 | //printf("iteration %d: y = %s\n", i, z2decstr(y));
1657 | //printf("iteration %d: g = %s\n", i, z2decstr(g));
1658 | //printf("iteration %d: r = %s\n", i, z2decstr(r));
1659 | //printf("iteration %d: R = %s\n", i, z2decstr(R));
1660 | //printf("iteration %d: q = %s\n", i, z2decstr(q));
1661 | //printf("iteration %d: u = %s\n", i, z2decstr(u));
1662 | //printf("iteration %d: v = %s\n", i, z2decstr(v));
1663 | //i++;
1664 | }
1665 |
1666 | if (x->size < 0)
1667 | {
1668 | x->size *= -1;
1669 | zSub(b,x,x);
1670 | }
1671 |
1672 | if (y->size < 0)
1673 | {
1674 | y->size *= -1;
1675 | zSub(a,y,y);
1676 | }
1677 |
1678 | zFree(tmp);
1679 | zFree(t1);
1680 | zFree(t2);
1681 | zFree(t3);
1682 | zFree(q);
1683 | zFree(r);
1684 | zFree(R);
1685 | zFree(u);
1686 | zFree(v);
1687 | return;
1688 | }
1689 |
1690 | void str2hexz(char in[], bignum * u)
1691 | {
1692 | // convert a string to a bigint
1693 | char *s2,*s;
1694 | char **ptr = NULL;
1695 |
1696 | // assume input is base10, we convert 9 digits at a time (32 bit words)
1697 | int i,j,su,base=10,step=9;
1698 | bignum * t;
1699 |
1700 | // allocate space for a temporary bignum
1701 | t = zInit();
1702 |
1703 | // work with a copy of in (because the first step in the conversion process
1704 | // inserts null characters into the string...). This could probably be changed.
1705 | s = (char *)malloc(8192*sizeof(char));
1706 | strcpy(s,in);
1707 |
1708 | // compute how many 9-digit decimal words we have in the string
1709 | su = strlen(s)/step + (strlen(s)%step != 0);
1710 |
1711 | // read 9 characters of s at a time into a base-10 bignum, 'u'
1712 | j=0;
1713 | for (i=0;idata[j] = strtoul(s2,ptr,base);
1717 | s2[0] = '\0';
1718 | j++;
1719 | }
1720 |
1721 | if (strlen(s) > 0)
1722 | {
1723 | s2 = s;
1724 | ptr = &s2;
1725 | t->data[j] = strtoul(s2,ptr,base);
1726 | }
1727 | t->size = j+1;
1728 |
1729 | // now convert the base-10 bignum to a binary bignum
1730 | zDec2Hex(t,u);
1731 |
1732 | // clear the upper words, if any
1733 | for (i = u->size; i < NWORDS; i++)
1734 | {
1735 | u->data[i] = 0;
1736 | }
1737 |
1738 | free(s);
1739 | zFree(t);
1740 | return;
1741 | }
1742 |
1743 | void zDec2Hex(bignum * u, bignum * v)
1744 | {
1745 | // convert u[] in dec to v[] in hex by multiplying the ith digit by (1e9)*i
1746 | // and adding to the previous digits
1747 |
1748 | bignum * a, *b, *vv;
1749 | base_t d = MAX_DEC_WORD;
1750 | int i, j;
1751 |
1752 | a = zInit();
1753 | b = zInit();
1754 | vv = zInit();
1755 | zClear(v);
1756 |
1757 | a->data[0] = 1;
1758 | for (i = 0; i < u->size; i++)
1759 | {
1760 | zShortMul(a, u->data[i], b);
1761 | zAdd(vv, b, vv);
1762 | zShortMul(a, d, a);
1763 | }
1764 |
1765 | zClamp(vv);
1766 | zCopy(vv, v);
1767 |
1768 | zFree(a);
1769 | zFree(b);
1770 | zFree(vv);
1771 |
1772 | return;
1773 | }
1774 |
1775 | void zHex2Dec(bignum * u, bignum * v)
1776 | {
1777 | //convert u[] in hex to v[] in decimal by repeatedly dividing
1778 | //u by 1e9 = 0x3b9aca00
1779 | //the remainder of the ith division is the ith decimal digit.
1780 | //when the quotient = 0, stop
1781 |
1782 | bignum * a, *b;
1783 | base_t d = MAX_DEC_WORD;
1784 | base_t r = 0;
1785 | int su = u->size;
1786 | //because decimal takes more room than hex to store
1787 |
1788 | a = zInit();
1789 | b = zInit();
1790 | zClear(v);
1791 |
1792 | zCopy(u, a);
1793 | v->size = 1;
1794 | do
1795 | {
1796 | r = zShortDiv(a, d, b);
1797 | v->data[v->size - 1] = r;
1798 | v->size++;
1799 | zCopy(b, a);
1800 | } while (zCompare1(a, 0) != 0);
1801 | v->size--;
1802 |
1803 | zFree(a);
1804 | zFree(b);
1805 | return;
1806 | }
1807 |
1808 | char *z2decstr(bignum * n)
1809 | {
1810 | //pass in a pointer to a string. if necessary, this routine will
1811 | //reallocate space for the string to accomodate its size. If this happens
1812 | //the pointer to the string's (likely) new location is automatically
1813 | //updated and returned.
1814 | bignum * a;
1815 | int i,sza,sign = 0;
1816 | char *tmp, *s;
1817 | int nchars, j;
1818 |
1819 | a = zInit();
1820 |
1821 | s = (char *)malloc(8192 * sizeof(char));
1822 |
1823 | strcpy(s,"");
1824 | nchars = 1;
1825 | if (n->size < 0)
1826 | {
1827 | sign = 1;
1828 | n->size *= -1;
1829 | sprintf(s, "-");
1830 | nchars++;
1831 | }
1832 |
1833 | zHex2Dec(n, a);
1834 | sza = abs(a->size);
1835 |
1836 | tmp = (char *)malloc((DEC_DIGIT_PER_WORD + 10) * sizeof(char));
1837 |
1838 | //print first word
1839 | #if DIGITBITS == 64
1840 | sprintf(s,"%s%lu", s, a->data[sza - 1]);
1841 | #else
1842 | sprintf(s, "%s%u", s, a->data[sza - 1]);
1843 | #endif
1844 | nchars += ndigits_1(a->data[sza-1]) - 1;
1845 |
1846 | //print the rest
1847 | for (i=sza - 2; i >= 0; i--)
1848 | {
1849 | #if DIGITBITS == 64
1850 | sprintf(tmp,"%019lu",a->data[i]);
1851 | #else
1852 | sprintf(tmp, "%09u", a->data[i]);
1853 | #endif
1854 | memcpy(s + nchars, tmp, DEC_DIGIT_PER_WORD * sizeof(char));
1855 | nchars += DEC_DIGIT_PER_WORD;
1856 | }
1857 | s[nchars] = '\0';
1858 |
1859 | free(tmp);
1860 | zFree(a);
1861 |
1862 | if (sign)
1863 | {
1864 | n->size *= -1;
1865 | }
1866 |
1867 | return s;
1868 | }
1869 |
1870 | void spMulAdd(base_t u, base_t v, base_t w, base_t t, base_t *lower, base_t *carry)
1871 | {
1872 | base_t k,p;
1873 | spMultiply(u,v,&p,carry);
1874 | spAdd3(p,w,t,lower,&k);
1875 | *carry += k;
1876 | return;
1877 | }
1878 |
1879 | void spMulMod(base_t u, base_t v, base_t m, base_t *w)
1880 | {
1881 | base_t p[2];
1882 | base_t q;
1883 |
1884 | spMultiply(u,v,&p[0],&p[1]);
1885 | spDivide(&q,w,p,m);
1886 |
1887 | return;
1888 | }
1889 |
1890 |
--------------------------------------------------------------------------------