├── README.md ├── vecarith.vcxproj.filters ├── vecarith.sln ├── x64_bench ├── Makefile ├── pmod.h ├── x64_arith.h ├── monty_arith.h ├── bigarith.h ├── util.h ├── util.c ├── pmod.c ├── x64_arith.c ├── main.c └── bigarith.c ├── common.c ├── Makefile ├── vecarith.vcxproj ├── vecarith.h └── main.c /README.md: -------------------------------------------------------------------------------- 1 | # avx512_modexp 2 | A test library for computing modular exponentiation in parallel using AVX-512 vector arithmetic 3 | 4 | Verified to work with gcc 7.3.0, gcc 11.1.0 and icc 18.0.3. 5 | 6 | build with (required) 7 | make SKYLAKEX=1 8 | 9 | optionally add this to build line to change the length of test numbers (N needs to be a multiple of 128) 10 | MAXBITS=N 11 | 12 | optionally add this to build line to change the compiler to gcc-7.3.0 from the default icc 13 | COMPILER=gcc730 14 | 15 | optionally add this to build line to change the compiler to gcc-11.1.0 from the default icc 16 | COMPILER=gcc11 17 | 18 | optionally add this to build line to use the double precision FMA arithmetic instead of 32-bit integer arithmetic. 19 | If this is specified then MAXBITS must be a multiple of 208 20 | BASE52=1 21 | 22 | 23 | Run the executable with 2 arguments: number of threads and whether or not to verify all of the results using GMP. 
24 | For example, to run with 4 threads and skip verification: 25 | ./avx512_modexp 4 0 26 | -------------------------------------------------------------------------------- /vecarith.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Header Files 20 | 21 | 22 | 23 | 24 | Source Files 25 | 26 | 27 | Source Files 28 | 29 | 30 | Source Files 31 | 32 | 33 | Source Files 34 | 35 | 36 | -------------------------------------------------------------------------------- /vecarith.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.271 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vecarith", "vecarith.vcxproj", "{F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x64.ActiveCfg = Debug|x64 17 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x64.Build.0 = Debug|x64 18 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x86.ActiveCfg = Debug|Win32 19 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Debug|x86.Build.0 = Debug|Win32 20 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x64.ActiveCfg = Release|x64 21 | 
{F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x64.Build.0 = Release|x64 22 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x86.ActiveCfg = Release|Win32 23 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {DE0A4EE7-27A6-4F85-AB25-7EAB760DF485} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /x64_bench/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2014, Ben Buhrow 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 2. Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | # 25 | # The views and conclusions contained in the software and documentation are those 26 | # of the authors and should not be interpreted as representing official policies, 27 | # either expressed or implied, of the FreeBSD Project. 28 | # 29 | # 30 | 31 | 32 | #--------------------------- EXAMPLE BUILDS ------------------------- 33 | # make MAXBITS=256 34 | # make MAXBITS=512 35 | 36 | 37 | 38 | #--------------------------- flags ------------------------- 39 | CC = gcc 40 | WARN_FLAGS = -Wall #-W -Wconversion 41 | OPT_FLAGS = -O3 42 | INC = -I. 
-I../gmp-6.1.2/include/ 43 | LIBS = -L../gmp-6.1.2/lib/ 44 | BINNAME = pmod_bench 45 | 46 | #--------------------------- make options ------------------------- 47 | 48 | 49 | ifeq ($(COMPILER),gcc73) 50 | CC = gcc-7.3.0 51 | endif 52 | 53 | ifdef MAXBITS 54 | CFLAGS += -DMAXBITS=$(MAXBITS) 55 | endif 56 | 57 | ifdef VERBOSE 58 | CFLAGS += -DVERBOSE=$(VERBOSE) 59 | endif 60 | 61 | CFLAGS += -g $(OPT_FLAGS) $(WARN_FLAGS) $(INC) 62 | LIBS += -lm -lgmp 63 | 64 | #--------------------------- file lists ------------------------- 65 | SRCS = \ 66 | bigarith.c \ 67 | x64_arith.c \ 68 | monty_arith.c \ 69 | pmod.c \ 70 | main.c \ 71 | util.c 72 | 73 | OBJS = $(SRCS:.c=.o) 74 | 75 | #---------------------------Header file lists ------------------------- 76 | HEAD = \ 77 | monty_arith.h \ 78 | bigarith.h \ 79 | x64_arith.h \ 80 | pmod.h \ 81 | util.h 82 | 83 | #---------------------------Make Targets ------------------------- 84 | 85 | all: $(OBJS) 86 | rm -f libpmod.a 87 | ar r libpmod.a $(OBJS) 88 | ranlib libpmod.a 89 | $(CC) $(CFLAGS) $(OBJS) -o $(BINNAME) libpmod.a $(LIBS) 90 | 91 | 92 | clean: 93 | rm -f $(OBJS) 94 | 95 | #---------------------------Build Rules ------------------------- 96 | 97 | 98 | %$(OBJ_EXT): %.c $(HEAD) 99 | $(CC) $(CFLAGS) -c -o $@ $< 100 | 101 | -------------------------------------------------------------------------------- /x64_bench/pmod.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 by The Mayo Clinic, though its Special Purpose 2 | // Processor Development Group (SPPDG). All Rights Reserved Worldwide. 3 | // Licensed under the Apache License, Version 2.0 (the "License"); you may 4 | // not use this file except in compliance with the License. You may obtain 5 | // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 
6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 9 | // including conditions of title, non-infringement, merchantability, 10 | // or fitness for a particular purpose 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | // This file is a snapshot of a work in progress, originated by Mayo 14 | // Clinic SPPDG. 15 | 16 | /* 17 | Copyright (c) 2021, Ben Buhrow 18 | All rights reserved. 19 | 20 | Redistribution and use in source and binary forms, with or without 21 | modification, are permitted provided that the following conditions are met: 22 | 23 | 1. Redistributions of source code must retain the above copyright notice, this 24 | list of conditions and the following disclaimer. 25 | 2. Redistributions in binary form must reproduce the above copyright notice, 26 | this list of conditions and the following disclaimer in the documentation 27 | and/or other materials provided with the distribution. 28 | 29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 30 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 32 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 33 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 35 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 38 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
39 | 40 | The views and conclusions contained in the software and documentation are those 41 | of the authors and should not be interpreted as representing official policies, 42 | either expressed or implied, of the FreeBSD Project. 43 | */ 44 | 45 | #ifndef _PMOD_H 46 | #define _PMOD_H 47 | 48 | // Modular exponentiation relies on a big-integer math library and libraries 49 | // that perform modular arithmetic. We define routines that use a 50 | // homegrown bigint library. 51 | #include "bigarith.h" 52 | #include "monty_arith.h" 53 | 54 | typedef struct 55 | { 56 | bignum **libpmod_gwin; 57 | } pmod_t; 58 | 59 | #define MAX_WINSIZE 8 60 | 61 | int get_winsize(void); 62 | int get_bitwin(bignum *b, int bitloc, int winsize, int winmask); 63 | void lr_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s); 64 | void lrwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s); 65 | void lroddwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s); 66 | 67 | void pmodlib_init(pmod_t *pmod_state); 68 | void pmodlib_free(pmod_t *pmod_state); 69 | 70 | 71 | 72 | #endif -------------------------------------------------------------------------------- /x64_bench/x64_arith.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | #ifndef _X64_ARITH_H 31 | #define _X64_ARITH_H 32 | 33 | // this file declares special routines for low-level x64 arithmetic 34 | // used as subroutines for modular multiplication. 
35 | #include 36 | 37 | 38 | 39 | 40 | void spAdd(uint64_t u, uint64_t v, uint64_t *sum, uint64_t *carry); 41 | void spAdd3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sum, uint64_t *carry); 42 | void spSub3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sub, uint64_t *borrow); 43 | void spSub(uint64_t u, uint64_t v, uint64_t *sub, uint64_t *borrow); 44 | uint64_t spDivide(uint64_t *q, uint64_t *r, uint64_t u[2], uint64_t v); 45 | void spMultiply(uint64_t u, uint64_t v, uint64_t *product, uint64_t *carry); 46 | uint64_t spDiv(uint64_t *q, uint64_t *r, uint64_t u1, uint64_t u0, uint64_t v); 47 | uint64_t spMod(uint64_t u1, uint64_t u0, uint64_t v); 48 | void spMul(uint64_t u, uint64_t v, uint64_t *product, uint64_t *carry); 49 | void spMulAdd1(uint64_t u, uint64_t v, uint64_t w, 50 | uint64_t *product, uint64_t *carry); 51 | void spMulAdd2(uint64_t u, uint64_t v, uint64_t w, 52 | uint64_t c, uint64_t *product, uint64_t *carry); 53 | void spMulAdd2x(uint64_t u, uint64_t v, uint64_t w, 54 | uint64_t c, uint64_t *product, uint64_t *carry); 55 | void spMulAddc(uint64_t u, uint64_t v, uint64_t * w); 56 | void spSqrMulAcc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w); 57 | void mpSub(uint64_t * u, uint64_t * n, uint64_t * w, int sz); 58 | void mpSub1(uint64_t * u, uint64_t n, uint64_t * w, int sz); 59 | void mpAdd1(uint64_t * u, uint64_t n, uint64_t * w, int sz); 60 | void spMul2Acc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w); 61 | void spMulAddcr(uint64_t u, uint64_t v, uint64_t * w); 62 | void spMulDblAdd_1(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout); 63 | void spMulDblAdd_2(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout); 64 | void spMulDblAdd_3(uint64_t u, uint64_t v, uint64_t * w); 65 | void mpAdd1b(uint64_t * u, uint64_t n, uint64_t * w, int sz); 66 | 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- 
/x64_bench/monty_arith.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 
28 | */ 29 | 30 | #ifndef _MONTY_H 31 | #define _MONTY_H 32 | 33 | #include "bigarith.h" 34 | 35 | /* montgomery arithmetic operations */ 36 | typedef struct 37 | { 38 | bignum *r; 39 | bignum *n; 40 | bignum *np; 41 | bignum *nhat; 42 | bignum *vnhat; 43 | bignum *rhat; 44 | bignum *rmask; 45 | bignum *one; 46 | bignum *mtmp1; 47 | bignum *mtmp2; 48 | bignum *mtmp3; 49 | bignum *mtmp4; 50 | base_t rho; 51 | } monty; 52 | 53 | // montgomery arithmetic setup and conversion 54 | void to_monty(monty *mdata, bignum * x); 55 | monty * monty_alloc(); 56 | void monty_init(monty * in, bignum * n, int verbose); 57 | void monty_free(monty *mdata); 58 | 59 | // pointers to the current mul/sqr scanning technique 60 | void(*mul_ptr)(monty *, bignum *, bignum *, bignum *, bignum *); 61 | void(*sqr_ptr)(monty *, bignum *, bignum *, bignum *); 62 | 63 | // montgomery multipliers for various scanning techniques 64 | void mulmod_sos(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s); 65 | void mulmod_cios(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s); 66 | void mulmod_fios(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s); 67 | void mulmod_fips(monty *mdata, bignum * u, bignum * v, bignum * w, bignum * s); 68 | void mulmod_bps(monty* mdata, bignum* u, bignum* v, bignum* w, bignum* s); 69 | 70 | // montgomery squaring that just calls mulmod for various scanning techniques 71 | void sqrmod_sos_mul(monty *mdata, bignum * u, bignum * w, bignum * s); 72 | void sqrmod_fips_mul(monty *mdata, bignum * u, bignum * w, bignum * s); 73 | void sqrmod_fios_mul(monty *mdata, bignum * u, bignum * w, bignum * s); 74 | void sqrmod_cios_mul(monty *mdata, bignum * u, bignum * w, bignum * s); 75 | void sqrmod_bps_mul(monty* mdata, bignum* u, bignum* w, bignum* s); 76 | 77 | // specialized montgomery squaring for various scanning techniques 78 | void sqrmod_sos(monty *mdata, bignum * u, bignum * w, bignum * s); 79 | void sqrmod_cios(monty* mdata, bignum* u, bignum* 
w, bignum* s); 80 | void sqrmod_fios(monty *mdata, bignum * u, bignum * w, bignum * s); 81 | void sqrmod_fips(monty* mdata, bignum* u, bignum* w, bignum* s); 82 | void sqrmod_bps(monty* mdata, bignum* u, bignum* w, bignum* s); 83 | 84 | #endif // _MONTY_H 85 | -------------------------------------------------------------------------------- /x64_bench/bigarith.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | #ifndef _BIG_ARITH_H 31 | #define _BIG_ARITH_H 32 | 33 | #include 34 | #include 35 | #include "util.h" 36 | #include "x64_arith.h" 37 | 38 | #define DIGITBITS 64 39 | 40 | #ifdef _MSC_VER 41 | #define base_t unsigned long long 42 | #define base_signed_t long long 43 | #define MAXDIGIT 0xffffffffffffffff 44 | #define HIBITMASK 0x8000000000000000 45 | #define MAX_DEC_WORD 0x8AC7230489E80000ULL 46 | #define DEC_DIGIT_PER_WORD 19 47 | #define HEX_DIGIT_PER_WORD 16 48 | #define HALFMASK 0xffffffff 49 | #define HALFBITS 32 50 | #else 51 | #if DIGITBITS == 64 52 | 53 | #define base_t uint64_t 54 | #define base_signed_t int64_t 55 | #define HALFBITS 32 56 | #define HALFMASK 0xffffffff 57 | #define MAXDIGIT 0xffffffffffffffff 58 | #define HIBITMASK 0x8000000000000000 59 | #define VECLEN 8 60 | #define MAX_DEC_WORD 0x8AC7230489E80000ULL 61 | #define DEC_DIGIT_PER_WORD 19 62 | #define HEX_DIGIT_PER_WORD 16 63 | 64 | #else 65 | 66 | #define base_t uint32_t 67 | #define base_signed_t int32_t 68 | #define HALFBITS 16 69 | #define HALFMASK 0xffff 70 | #define MAXDIGIT 0xffffffff 71 | #define HIBITMASK 0x80000000 72 | #define VECLEN 16 73 | #define MAX_DEC_WORD 0x3b9aca00 74 | #define DEC_DIGIT_PER_WORD 9 75 | #define HEX_DIGIT_PER_WORD 8 76 | 77 | #endif 78 | #endif 79 | 80 | // supported: 81 | // any N divisible by 32 for MAXBITS < 512 82 | // any N divisible by 128 MAXBITS >= 128 83 | #ifndef MAXBITS 84 | #define MAXBITS 512 85 | #endif 86 | 87 | #define NWORDS (MAXBITS / DIGITBITS) 88 | 89 | typedef struct 90 | { 91 | base_t *data; 92 | int size; 93 | } bignum; 94 | 95 | 96 | /* basic arithmetic operations: fixed allocation, variable sized, non-signed */ 97 | int zBits(bignum * n); 98 | base_t spBits(base_t n); 99 | void 
zSet1(bignum *dest, base_t value); 100 | void zCopy(bignum * src, bignum * dest); 101 | void zAdd(bignum * u, bignum * v, bignum * w); 102 | void zShortAdd(bignum * u, base_t v, bignum * w); 103 | int zSub(bignum * u, bignum * v, bignum * w); 104 | void zShortSub(bignum * u, base_t v, bignum * w); 105 | int zCompare(bignum * u, bignum * v); 106 | int zCompare1(bignum * u, base_t v); 107 | base_t zShortDiv(bignum * u, base_t v, bignum * q); 108 | void zDiv(bignum * u, bignum * v, bignum * q, bignum * r); 109 | int shortCompare(base_t p[2], base_t t[2]); 110 | int shortSubtract(base_t u[2], base_t v[2], base_t w[2]); 111 | void zMul(bignum * u, bignum * v, bignum * w); 112 | void zMult(bignum * u, bignum * v, bignum * w, bignum *tmp); 113 | void zModMul(bignum * u, bignum * v, bignum * n, bignum * w); 114 | void zShortMul(bignum * u, base_t v, bignum * w); 115 | void zSqr(bignum * x, bignum * w); 116 | void zShiftLeft(bignum * a, bignum * b, int x); 117 | void zShiftLeft_1(bignum * a, bignum * b); 118 | void zShiftRight(bignum * a, bignum * b, int x); 119 | void zShiftRight_1(bignum * a, bignum * b); 120 | void spAdd(base_t u, base_t v, base_t *sum, base_t *carry); 121 | void spAdd3(base_t u, base_t v, base_t w, base_t *sum, base_t *carry); 122 | void spSub3(base_t u, base_t v, base_t w, base_t *sub, base_t *borrow); 123 | void spSub(base_t u, base_t v, base_t *sub, base_t *borrow); 124 | base_t spDivide(base_t *q, base_t *r, base_t u[2], base_t v); 125 | void spMultiply(base_t u, base_t v, base_t *product, base_t *carry); 126 | void spMulAdd(base_t u, base_t v, base_t w, base_t t, base_t *lower, base_t *carry); 127 | void spMulMod(base_t u, base_t v, base_t m, base_t *w); 128 | void sp2big(base_t src, bignum * dest); 129 | void zClear(bignum * n); 130 | void zClearFull(bignum * n); 131 | void zClamp(bignum * n); 132 | void zPrint(bignum *n); 133 | int ndigits_1(base_t n); 134 | bignum * zInit(void); 135 | void zFree(bignum *n); 136 | void xGCD(bignum *a, bignum *b, 
bignum *x, bignum *y, bignum *g); 137 | int zBinGCD(bignum *u, bignum *v, bignum *w); 138 | int zLEGCD(bignum *u, bignum *v, bignum *w); 139 | base_t spGCD(base_t x, base_t y); 140 | void zModMuls(bignum * u, bignum * v, bignum * n, bignum * w, bignum *s1, bignum *s2); 141 | void zModExp(bignum *d, bignum *b, bignum *e, bignum *m); 142 | 143 | void str2hexz(char in[], bignum * u); 144 | void zDec2Hex(bignum * u, bignum * v); 145 | char *z2decstr(bignum * n); 146 | void zHex2Dec(bignum * u, bignum * v); 147 | 148 | #endif // _BIGARITH_H 149 | 150 | -------------------------------------------------------------------------------- /common.c: -------------------------------------------------------------------------------- 1 | #include "vecarith.h" 2 | 3 | void(*vecmulmod_ptr)(bignum*, bignum*, bignum*, bignum*, bignum*, monty*); 4 | void(*vecsqrmod_ptr)(bignum*, bignum*, bignum*, bignum*, monty*); 5 | int(*montsetup_ptr)(bignum*, bignum*, bignum*, base_t*); 6 | void(*vecmodexp_ptr)(bignum*, bignum*, bignum*, bignum*, bignum*, bignum*, monty* m); 7 | 8 | int get_winsize(void) 9 | { 10 | // the window size is based on minimizing the total number of multiplications 11 | // in the windowed exponentiation. experiments show that this is best; 12 | // the growing size of the table doesn't change the calculus, at least 13 | // on the KNL. 
14 | int size; 15 | int muls; 16 | int minmuls = 99999999; 17 | int minsize = 4; 18 | 19 | for (size = 2; size <= 8; size++) 20 | { 21 | muls = (NWORDS * DIGITBITS / size) + (1 << size); 22 | if (muls < minmuls) 23 | { 24 | minmuls = muls; 25 | minsize = size; 26 | } 27 | } 28 | 29 | return minsize; 30 | } 31 | 32 | int get_bitwin(bignum *e, int bitloc, int winsize, int lane, int winmask) 33 | { 34 | int bstr; 35 | int bitstart = (bitloc - winsize + 1); 36 | int word = bitloc / DIGITBITS; 37 | int word2 = bitstart / DIGITBITS; 38 | 39 | bitstart = bitstart % DIGITBITS; 40 | 41 | if (word == word2) 42 | { 43 | bstr = (e->data[lane + word * VECLEN] >> bitstart) & winmask; 44 | } 45 | else 46 | { 47 | int upperbits = (bitloc % DIGITBITS) + 1; 48 | 49 | bstr = (e->data[lane + word2 * VECLEN] >> bitstart); 50 | bstr |= ((e->data[lane + word * VECLEN]) << (winsize - upperbits)); 51 | bstr &= winmask; 52 | } 53 | 54 | return bstr; 55 | } 56 | 57 | 58 | bignum * vecInit(void) 59 | { 60 | int i; 61 | size_t sz = VECLEN * (2 * NWORDS + 4); 62 | bignum *n; 63 | n = (bignum *)malloc(sizeof(bignum)); 64 | 65 | n->data = (base_t *)xmalloc_align(sz * sizeof(base_t)); 66 | if (n->data == NULL) 67 | { 68 | printf("could not allocate memory\n"); 69 | exit(2); 70 | } 71 | 72 | for (i = 0; i < sz; i++) 73 | { 74 | n->data[i] = 0; 75 | } 76 | n->size = 1; 77 | 78 | return n; 79 | } 80 | 81 | void vecCopy(bignum * src, bignum * dest) 82 | { 83 | //physically copy the digits of u into the digits of v 84 | int su = VECLEN * (2 * NWORDS + 1); 85 | 86 | memcpy(dest->data, src->data, su * sizeof(base_t)); 87 | dest->size = src->size; // = NWORDS; 88 | return; 89 | } 90 | 91 | void vecCopyn(bignum * src, bignum * dest, int size) 92 | { 93 | //physically copy the digits of u into the digits of v 94 | int su = VECLEN * size; 95 | 96 | memcpy(dest->data, src->data, su * sizeof(base_t)); 97 | dest->size = size; 98 | return; 99 | } 100 | 101 | void vecClear(bignum *n) 102 | { 103 | memset(n->data, 
0, VECLEN*(2 * NWORDS + 1) * sizeof(base_t)); 104 | return; 105 | } 106 | 107 | void vecFree(bignum *n) 108 | { 109 | align_free(n->data); 110 | free(n); 111 | } 112 | 113 | void copy_vec_lane(bignum *src, bignum *dest, int num, int size) 114 | { 115 | int j; 116 | 117 | for (j = 0; j < size; j++) 118 | { 119 | dest->data[num + j * VECLEN] = src->data[num + j * VECLEN]; 120 | } 121 | 122 | return; 123 | } 124 | 125 | void monty_init_vec(monty *mdata, bignum * n, int verbose) 126 | { 127 | int j; 128 | // for a input modulus n, initialize constants for 129 | // montogomery representation 130 | // this assumes that n is relatively prime to 2, i.e. is odd. 131 | // In this version we assume the input monty structure has 132 | // already been allocated and we just perform the calculations. 133 | 134 | if (verbose) 135 | printf("initializing montgomery representation\n"); 136 | 137 | memset(mdata->n->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t)); 138 | memset(mdata->r->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t)); 139 | memset(mdata->rhat->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t)); 140 | memset(mdata->one->data, 0, (2 * NWORDS * VECLEN + 1) * sizeof(base_t)); 141 | memset(mdata->vrho, 0, VECLEN * sizeof(base_t)); 142 | 143 | vecCopy(n, mdata->n); 144 | montsetup_ptr(mdata->n, mdata->r, mdata->rhat, mdata->vrho); 145 | 146 | for (j = 0; j < VECLEN; j++) 147 | { 148 | mdata->one->data[j] = 1; 149 | } 150 | 151 | vecmulmod_ptr(mdata->one, mdata->rhat, mdata->one, n, mdata->mtmp1, mdata); // monty rep 152 | vecCopyn(mdata->one, mdata->g[0], NWORDS); 153 | 154 | return; 155 | 156 | } 157 | 158 | monty* monty_alloc(void) 159 | { 160 | int i; 161 | monty *mdata = (monty *)malloc(sizeof(monty)); 162 | 163 | mdata->r = vecInit(); 164 | mdata->n = vecInit(); 165 | mdata->nhat = vecInit(); 166 | mdata->vnhat = vecInit(); 167 | mdata->rhat = vecInit(); 168 | mdata->rmask = vecInit(); 169 | mdata->one = vecInit(); 170 | mdata->mtmp1 = vecInit(); 171 | 
mdata->mtmp2 = vecInit(); 172 | mdata->mtmp3 = vecInit(); 173 | mdata->mtmp4 = vecInit(); 174 | 175 | mdata->g = (bignum **)malloc((1 << MAX_WINSIZE) * sizeof(bignum *)); 176 | mdata->g[0] = vecInit(); 177 | 178 | for (i = 1; i < (1 << MAX_WINSIZE); i++) 179 | { 180 | mdata->g[i] = vecInit(); 181 | } 182 | 183 | mdata->vrho = (base_t *)xmalloc_align(VECLEN * sizeof(base_t)); 184 | 185 | return mdata; 186 | } 187 | 188 | void monty_free(monty *mdata) 189 | { 190 | int i; 191 | 192 | vecFree(mdata->mtmp1); 193 | vecFree(mdata->mtmp2); 194 | vecFree(mdata->mtmp3); 195 | vecFree(mdata->mtmp4); 196 | vecFree(mdata->rhat); 197 | vecFree(mdata->one); 198 | vecFree(mdata->n); 199 | vecFree(mdata->nhat); 200 | vecFree(mdata->r); 201 | align_free(mdata->vrho); 202 | 203 | for (i = 0; i < (1 << MAX_WINSIZE); i++) 204 | { 205 | vecFree(mdata->g[i]); 206 | } 207 | free(mdata->g); 208 | 209 | return; 210 | } 211 | 212 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2014, Ben Buhrow 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, this 9 | # list of conditions and the following disclaimer. 10 | # 2. Redistributions in binary form must reproduce the above copyright notice, 11 | # this list of conditions and the following disclaimer in the documentation 12 | # and/or other materials provided with the distribution. 13 | # 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | # 25 | # The views and conclusions contained in the software and documentation are those 26 | # of the authors and should not be interpreted as representing official policies, 27 | # either expressed or implied, of the FreeBSD Project. 28 | # 29 | # 30 | # Copyright (c) 2018 by The Mayo Clinic, though its Special Purpose 31 | # Processor Development Group (SPPDG). All Rights Reserved Worldwide. 32 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 33 | # not use this file except in compliance with the License. You may obtain 34 | # a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 35 | # Unless required by applicable law or agreed to in writing, software 36 | # distributed under the License is distributed on an "AS IS" BASIS, 37 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 38 | # including conditions of title, non-infringement, merchantability, 39 | # or fitness for a particular purpose 40 | # See the License for the specific language governing permissions and 41 | # limitations under the License. 42 | # This file is a snapshot of a work in progress, originated by Mayo 43 | # Clinic SPPDG. 
44 | 45 | 46 | 47 | 48 | #--------------------------- flags ------------------------- 49 | CC = icc 50 | #CFLAGS = -g -march=core2 -mtune=core2 51 | #CFLAGS = -static 52 | #CFLAGS = -S -fsource-asm 53 | WARN_FLAGS = -Wall #-W -Wconversion 54 | OPT_FLAGS = -O2 55 | INC = -I. 56 | LIBS = 57 | BINNAME = avx512_modexp 58 | CFLAGS += -I../gmp_install/gmp-6.2.0/include/ 59 | CFLAGS += -L../gmp_install/gmp-6.2.0/lib/ 60 | CFLAGS += -g -gdwarf-4 61 | 62 | #--------------------------- make options ------------------------- 63 | 64 | 65 | ifeq ($(COMPILER),mingw) 66 | # NOTE: Using -fcall-used instead of -ffixed is much better and still works. 67 | # -fcall-used simply prevents the named registers from being saved/restored while 68 | # -ffixed prevents them from being used at all. The code benefits a lot from being 69 | # able to use all 32 zmm registers. 70 | CC = gcc 71 | BINNAME = avx512_modexp_mingw 72 | CFLAGS += -fopenmp 73 | CFLAGS += -fcall-used-xmm16 -fcall-used-xmm17 -fcall-used-xmm18 -fcall-used-xmm19 74 | CFLAGS += -fcall-used-xmm20 -fcall-used-xmm21 -fcall-used-xmm22 -fcall-used-xmm23 75 | CFLAGS += -fcall-used-xmm24 -fcall-used-xmm25 -fcall-used-xmm26 -fcall-used-xmm27 76 | CFLAGS += -fcall-used-xmm28 -fcall-used-xmm29 -fcall-used-xmm30 -fcall-used-xmm31 77 | else ifeq ($(COMPILER),gcc730) 78 | CC = gcc-7.3.0 79 | CFLAGS += -fopenmp 80 | else ifeq ($(COMPILER),gcc11) 81 | CC = gcc-11.1.0 82 | CFLAGS += -fopenmp 83 | else 84 | CFLAGS += -qopenmp 85 | endif 86 | 87 | ifdef MAXBITS 88 | CFLAGS += -DMAXBITS=$(MAXBITS) 89 | endif 90 | 91 | ifdef BASE52 92 | CFLAGS += -DBASE52 93 | endif 94 | 95 | ifeq ($(KNL),1) 96 | ifeq ($(COMPILER),gcc) 97 | CFLAGS += -march=knl -DTARGET_KNL 98 | else 99 | CFLAGS += -xMIC-AVX512 -DTARGET_KNL 100 | endif 101 | OBJ_EXT = .o 102 | BINNAME := ${BINNAME:%=%_knl} 103 | else 104 | OBJ_EXT = .o 105 | ifeq ($(SKYLAKEX),1) 106 | CFLAGS += -DSKYLAKEX 107 | ifeq ($(COMPILER),icc) 108 | CFLAGS += -march=skylake-avx512 -DTARGET_KNL 109 | else 
110 | CFLAGS += -march=skylake-avx512 -DTARGET_KNL 111 | endif 112 | else 113 | OPT_FLAGS += -mavx 114 | endif 115 | endif 116 | 117 | 118 | ifeq ($(CC),icc) 119 | CFLAGS += -qmkl 120 | endif 121 | 122 | 123 | ifeq ($(PROFILE),1) 124 | CFLAGS += -pg 125 | BINNAME := ${BINNAME:%=%_prof} 126 | endif 127 | 128 | 129 | CFLAGS += -g $(OPT_FLAGS) $(WARN_FLAGS) $(INC) 130 | 131 | ifeq ($(STATIC),1) 132 | CFLAGS += -static-intel -static 133 | LIBS += -L/usr/lib/x86_64-redhat-linux6E/lib64/ -lm 134 | else 135 | LIBS += -lm -lgmp 136 | endif 137 | 138 | 139 | #--------------------------- file lists ------------------------- 140 | SRCS = \ 141 | common.c \ 142 | vecarith52.c \ 143 | vecarith.c \ 144 | main.c 145 | 146 | OBJS = $(SRCS:.c=$(OBJ_EXT)) 147 | 148 | 149 | 150 | #---------------------------Header file lists ------------------------- 151 | HEAD = \ 152 | vecarith.h 153 | 154 | #---------------------------Make Targets ------------------------- 155 | 156 | all: $(OBJS) 157 | rm -f libvecarith.a 158 | ar r libvecarith.a $(OBJS) 159 | ranlib libvecarith.a 160 | $(CC) $(CFLAGS) $(OBJS) -o $(BINNAME) libvecarith.a $(LIBS) 161 | 162 | 163 | clean: 164 | rm -f $(OBJS) 165 | 166 | #---------------------------Build Rules ------------------------- 167 | 168 | 169 | %$(OBJ_EXT): %.c $(HEAD) 170 | $(CC) $(CFLAGS) -c -o $@ $< 171 | 172 | -------------------------------------------------------------------------------- /vecarith.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | {F2AFAF8C-D04A-437F-9EAB-2B635C6FB1DB} 24 | vecarith 25 | 10.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v143 32 | MultiByte 33 | 34 | 35 | Application 36 | false 37 | v143 38 | true 39 | MultiByte 40 | 41 | 42 | Application 43 | true 44 | v143 45 | MultiByte 46 | 47 | 48 | Application 49 | false 50 | v143 51 
| true 52 | MultiByte 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | Level3 76 | Disabled 77 | true 78 | true 79 | 80 | 81 | 82 | 83 | Level3 84 | Disabled 85 | true 86 | true 87 | 88 | 89 | 90 | 91 | Level3 92 | MaxSpeed 93 | true 94 | true 95 | true 96 | true 97 | 98 | 99 | true 100 | true 101 | 102 | 103 | 104 | 105 | Level3 106 | MaxSpeed 107 | true 108 | true 109 | true 110 | true 111 | Y:\projects\mpir-3.0.0;%(AdditionalIncludeDirectories) 112 | 113 | 114 | true 115 | true 116 | Y:\projects\mpir-3.0.0\lib\x64\Release\bkup_mpir_gc;%(AdditionalLibraryDirectories) 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /x64_bench/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | #ifndef _UTIL_H 31 | #define _UTIL_H 32 | 33 | // ============================================================================ 34 | // some standard headers 35 | // ============================================================================ 36 | #include 37 | #include 38 | #include 39 | #include // for uint32_t, etc. 40 | #include 41 | #include 42 | #include 43 | #if defined(WIN32) 44 | #define WIN32_LEAN_AND_MEAN 45 | #include 46 | #include 47 | #include 48 | #include 49 | #endif 50 | 51 | #ifndef _MSC_VER 52 | #include //for gettimeofday using gcc 53 | #include 54 | #endif 55 | 56 | 57 | // ============================================================================ 58 | // useful definitions 59 | // ============================================================================ 60 | #define MIN(a,b) ((a) < (b)? (a) : (b)) 61 | #define MAX(a,b) ((a) > (b)? (a) : (b)) 62 | #define SIGN(a) ((a) < 0 ? 
-1 : 1) 63 | 64 | #define INV_2_POW_48 3.5527136788005009293556213378906e-15 65 | #define INV_2_POW_52 2.2204460492503130808472633361816e-16 66 | #define INV_2_POW_64 5.4210108624275221700372640043497e-20 67 | #define INV_2_POW_26 1.490116119384765625e-8 68 | #define INV_2_POW_32 2.3283064365386962890625e-10 69 | #define PI 3.1415926535897932384626433832795 70 | #define HBAR 6.58211928000e-7 // (ev * ns) 71 | #define INV_HBAR 1.519267514702347e+06 // (eV * ns)^-1 72 | #define INV_HBAR_RESIDUE 5.873436907300274 // 1/hbar mod 2*pi 73 | 74 | #define INLINE __inline 75 | #define LOWER(x) ((x) & HALFMASK) 76 | #define UPPER(x) ((x) >> HALFBITS) 77 | #define strto_uint64 strtoull 78 | #define DEC 10 79 | #define HEX 16 80 | #define DEFINED 1 81 | #ifdef NOTDEF 82 | #undef NOTDEF 83 | #endif 84 | 85 | // portable 64-bit formatting and aligned memory 86 | #if defined(_MSC_VER) || defined(__MINGW32__) 87 | #define PRId64 "I64d" 88 | #define PRIu64 "I64u" 89 | #define PRIx64 "I64x" 90 | 91 | #define align_free _aligned_free 92 | #define ALIGNED_MEM __declspec(align(64)) 93 | 94 | #elif defined(__x86_64__) 95 | 96 | #define align_free free 97 | #if defined (__INTEL_COMPILER) 98 | #define ALIGNED_MEM __declspec(align(64)) 99 | #else 100 | #define ALIGNED_MEM __attribute__((aligned(64))) 101 | #endif 102 | 103 | #define PRId64 "ld" 104 | #define PRIu64 "lu" 105 | #define PRIx64 "lx" 106 | #define BSCu "lu" 107 | #define BSCx "lx" 108 | #define BSCu0 "019lu" // base string conversion with leading zeros 109 | #define BSCx0 "019lx" // base string conversion with leading zeros 110 | #elif defined(__i386__) 111 | 112 | #define align_free free 113 | #if defined (__INTEL_COMPILER) 114 | #define ALIGNED_MEM __declspec(align(64)) 115 | #else 116 | #define ALIGNED_MEM __attribute__((aligned(64))) 117 | #endif 118 | 119 | #define PRId64 "lld" 120 | #define PRIu64 "llu" 121 | #define PRIx64 "llx" 122 | #define BSCu "u" 123 | #define BSCx "x" 124 | #define BSCu0 "09u" 125 | #define BSCx0 
// ============================================================================
// memory allocation
//
// Thin wrappers with an abort-on-failure policy: each prints a diagnostic and
// calls exit(-1) when the underlying allocator returns NULL, so callers never
// see a NULL result.  Sizes are printed with %zu so that requests of 4 GiB or
// more are reported correctly.
// ============================================================================

// Allocate len bytes, requesting 64-byte alignment where the platform branch
// provides an aligned allocator (_aligned_malloc / memalign).  The __APPLE__
// and generic fallbacks use plain malloc and request no specific alignment.
// Release the result with align_free.
static __inline void * xmalloc_align(size_t len)
{
#if defined (_MSC_VER) || defined(__MINGW32__)
    void *ptr = _aligned_malloc(len, 64);
#elif defined (__APPLE__)
    void *ptr = malloc(len);
#elif defined (__GNUC__)
    void *ptr = memalign(64, len);
#define align_free free
#else
    void *ptr = malloc(len);
#endif

    if (ptr == NULL) {
        printf("failed to allocate %zu aligned bytes\n", len);
        exit(-1);
    }

    return ptr;
}

// malloc that never returns NULL: exits the process on failure.
static __inline void * xmalloc(size_t len) {
    void *ptr = malloc(len);
    if (ptr == NULL) {
        printf("failed to allocate %zu bytes\n", len);
        exit(-1);
    }
    return ptr;
}

// calloc that never returns NULL: exits the process on failure.
// The two factors are printed separately because their product may wrap.
static __inline void * xcalloc(size_t num, size_t len) {
    void *ptr = calloc(num, len);
    if (ptr == NULL) {
        printf("failed to calloc %zu x %zu bytes\n", num, len);
        exit(-1);
    }
    return ptr;
}

// realloc that never returns NULL: exits the process on failure.
// The result is held in a separate variable so the caller's original
// pointer is not overwritten before the check.
static __inline void * xrealloc(void *iptr, size_t len) {
    void *ptr = realloc(iptr, len);
    if (ptr == NULL) {
        printf("failed to reallocate %zu bytes\n", len);
        exit(-1);
    }
    return ptr;
}
// qsort comparator for arrays of uint32_t, ascending order.
// Returns -1, 0 or 1 when *x is less than, equal to, or greater than *y.
static int qcomp_uint32(const void *x, const void *y)
{
    const uint32_t lhs = *(const uint32_t *)x;
    const uint32_t rhs = *(const uint32_t *)y;

    // each comparison yields 0 or 1, so the difference is exactly -1/0/+1
    // and cannot overflow the way a plain subtraction could
    return (lhs > rhs) - (lhs < rhs);
}
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 
28 | */ 29 | 30 | #include "util.h" 31 | 32 | 33 | // ============================================================================ 34 | // precision time 35 | // ============================================================================ 36 | #ifdef _MSC_VER 37 | 38 | /* Core aware timing on Windows, courtesy of Brian Gladman */ 39 | 40 | #if defined( _WIN64 ) 41 | 42 | #define current_processor_number GetCurrentProcessorNumber 43 | 44 | #else 45 | 46 | unsigned long current_processor_number(void) 47 | { 48 | __asm 49 | { 50 | mov eax,1 51 | cpuid 52 | shr ebx,24 53 | mov eax, ebx 54 | } 55 | } 56 | 57 | #endif 58 | 59 | int lock_thread_to_core(void) 60 | { DWORD_PTR afp, afs; 61 | 62 | if(GetProcessAffinityMask(GetCurrentProcess(), &afp, &afs)) 63 | { 64 | afp &= (DWORD_PTR)(1 << current_processor_number()); 65 | if(SetThreadAffinityMask(GetCurrentThread(), afp)) 66 | return EXIT_SUCCESS; 67 | } 68 | return EXIT_FAILURE; 69 | } 70 | 71 | int unlock_thread_from_core(void) 72 | { DWORD_PTR afp, afs; 73 | 74 | if(GetProcessAffinityMask(GetCurrentProcess(), &afp, &afs)) 75 | { 76 | if(SetThreadAffinityMask(GetCurrentThread(), afp)) 77 | return EXIT_SUCCESS; 78 | } 79 | return EXIT_FAILURE; 80 | } 81 | 82 | double cycles_per_second = 0.0; 83 | double ticks_per_second = 0.0; 84 | double cycles_per_tick = 0.0; 85 | 86 | uint64_t measure_processor_speed(int millisec) 87 | { unsigned long long cycles; 88 | 89 | lock_thread_to_core(); 90 | cycles = __rdtsc(); 91 | Sleep(millisec); 92 | cycles = __rdtsc() - cycles; 93 | unlock_thread_from_core(); 94 | cycles_per_second = 10.0 * (double)cycles; 95 | 96 | if(ticks_per_second == 0.0) 97 | { LARGE_INTEGER ll; 98 | QueryPerformanceFrequency(&ll); 99 | ticks_per_second = (double)ll.QuadPart; 100 | cycles_per_tick = cycles_per_second / ticks_per_second; 101 | } 102 | return cycles; 103 | } 104 | 105 | double get_tsc_time(void) 106 | { 107 | if(cycles_per_second == 0.0) 108 | measure_processor_speed(100); 109 | return __rdtsc() / 
cycles_per_second; 110 | } 111 | 112 | double get_pfc_time(void) 113 | { LARGE_INTEGER ll; 114 | 115 | if(ticks_per_second == 0.0) 116 | measure_processor_speed(100); 117 | QueryPerformanceCounter(&ll); 118 | return ll.QuadPart / ticks_per_second; 119 | } 120 | 121 | #else 122 | 123 | double cycles_per_second = 0.0; 124 | 125 | uint64_t measure_processor_speed(int millisec) 126 | { 127 | uint64_t cycles; 128 | struct timeval start, stop; 129 | double t_time; 130 | 131 | gettimeofday(&start,NULL); 132 | 133 | cycles = read_clock(); 134 | do 135 | { 136 | gettimeofday (&stop, NULL); 137 | t_time = my_difftime (&start, &stop); 138 | } 139 | while (t_time*1000 < millisec); 140 | cycles = read_clock() - cycles; 141 | 142 | return cycles; /* return cycles per second */ 143 | } 144 | 145 | #endif 146 | 147 | 148 | uint64_t read_clock(void) 149 | { 150 | #if defined(__GNUC__) && (defined(__i386__) || defined(GCC_ASM64X) ) 151 | uint32_t lo, hi; 152 | asm("rdtsc":"=d"(hi),"=a"(lo)); 153 | return (uint64)hi << 32 | lo; 154 | 155 | #elif defined(_MSC_VER) 156 | LARGE_INTEGER ll; 157 | QueryPerformanceCounter(&ll); 158 | return (uint64_t)(ll.QuadPart * cycles_per_tick); 159 | #else 160 | struct timeval thistime; 161 | gettimeofday(&thistime, NULL); 162 | return (uint64_t)(cycles_per_second * 163 | (thistime.tv_sec + thistime.tv_usec / 1000000.0)); 164 | #endif 165 | } 166 | 167 | #ifdef _MSC_VER 168 | int gettimeofday(struct timeval *tv, struct timezone *tz) 169 | { 170 | FILETIME ft; 171 | unsigned __int64 tmpres = 0; 172 | static int tzflag; 173 | 174 | if (NULL != tv) 175 | { 176 | GetSystemTimeAsFileTime(&ft); 177 | 178 | tmpres |= ft.dwHighDateTime; 179 | tmpres <<= 32; 180 | tmpres |= ft.dwLowDateTime; 181 | 182 | /*converting file time to unix epoch*/ 183 | tmpres /= 10; /*convert into microseconds*/ 184 | tmpres -= DELTA_EPOCH_IN_MICROSECS; 185 | tv->tv_sec = (long)(tmpres / 1000000UL); 186 | tv->tv_usec = (long)(tmpres % 1000000UL); 187 | } 188 | 189 | if (NULL != tz) 
190 | { 191 | if (!tzflag) 192 | { 193 | _tzset(); 194 | tzflag++; 195 | } 196 | tz->tz_minuteswest = _timezone / 60; 197 | tz->tz_dsttime = _daylight; 198 | } 199 | 200 | return 0; 201 | } 202 | #endif 203 | 204 | double my_difftime(struct timeval * start, struct timeval * end) 205 | { 206 | double secs; 207 | double usecs; 208 | 209 | if (start->tv_sec == end->tv_sec) { 210 | secs = 0; 211 | usecs = end->tv_usec - start->tv_usec; 212 | } 213 | else { 214 | usecs = 1000000 - start->tv_usec; 215 | secs = end->tv_sec - (start->tv_sec + 1); 216 | usecs += end->tv_usec; 217 | if (usecs >= 1000000) { 218 | usecs -= 1000000; 219 | secs += 1; 220 | } 221 | } 222 | 223 | return secs + usecs / 1000000.; 224 | } 225 | 226 | 227 | // ============================================================================ 228 | // randomness 229 | // ============================================================================ 230 | void get_random_seeds(rand_t *r) { 231 | 232 | uint32_t tmp_seed1, tmp_seed2; 233 | 234 | #ifndef WIN32 235 | 236 | FILE *rand_device = fopen("/dev/urandom", "r"); 237 | 238 | if (rand_device != NULL) { 239 | fread(&tmp_seed1, sizeof(uint32_t), (size_t)1, rand_device); 240 | fread(&tmp_seed2, sizeof(uint32_t), (size_t)1, rand_device); 241 | fclose(rand_device); 242 | } 243 | else 244 | 245 | #endif 246 | { 247 | /* For everyone else, sample the current time, 248 | the high-res timer (hopefully not correlated to the 249 | current time), and the process ID. 
Multithreaded 250 | applications should fold in the thread ID too */ 251 | 252 | uint64_t high_res_time = read_clock(); 253 | tmp_seed1 = ((uint32_t)(high_res_time >> 32) ^ 254 | (uint32_t)time(NULL)) * 255 | (uint32_t)getpid(); 256 | tmp_seed2 = (uint32_t)high_res_time; 257 | } 258 | 259 | /* The final seeds are the result of a multiplicative 260 | hash of the initial seeds */ 261 | 262 | r->low = tmp_seed1 * ((uint32_t)40499 * 65543); 263 | r->hi = tmp_seed2 * ((uint32_t)40499 * 65543); 264 | } 265 | 266 | // Knuth's 64 bit MMIX LCG, using a global 64 bit state variable. 267 | uint32_t spRand(uint64_t *state, uint32_t lower, uint32_t upper) 268 | { 269 | // advance the state of the LCG and return the appropriate result 270 | *state = 6364136223846793005ULL * (*state) + 1442695040888963407ULL; 271 | return lower + (uint32_t)( 272 | (double)(upper - lower) * (double)(*state >> 32) * INV_2_POW_32); 273 | } 274 | 275 | uint64_t spRand64_range(uint64_t *state, uint64_t lower, uint64_t upper) 276 | { 277 | // advance the state of the LCG and return the appropriate result 278 | *state = 6364136223846793005ULL * (*state) + 1442695040888963407ULL; 279 | return lower + (uint64_t)( 280 | (double)(upper - lower) * ((double)*state * INV_2_POW_64)); 281 | } 282 | 283 | uint64_t spRand64(uint64_t *state) 284 | { 285 | // advance the state of the LCG and return the appropriate result. 
// 64-bit hash built from the FNV offset basis and prime.
// Eight rounds, one per byte of the input: each round multiplies the running
// hash by the prime and then XORs byte `lane` of the input into the same
// byte position of the hash.
// NOTE(review): textbook FNV-1 folds every input byte into the low byte of
// the hash; this routine keeps each byte in its own lane, so its values are
// presumably not interchangeable with reference FNV-1 output -- confirm
// before relying on that.
uint64_t hash64(uint64_t in)
{
    const uint64_t fnv_prime = 1099511628211ULL;
    uint64_t digest = 14695981039346656037ULL;
    int lane;

    for (lane = 0; lane < 8; lane++)
    {
        const uint64_t lane_bits = 0xffULL << (8 * lane);

        digest *= fnv_prime;
        // XOR only the selected byte; all other bytes pass through unchanged
        digest ^= (in & lane_bits);
    }

    return digest;
}
/x64_bench/pmod.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 by The Mayo Clinic, though its Special Purpose 2 | // Processor Development Group (SPPDG). All Rights Reserved Worldwide. 3 | // Licensed under the Apache License, Version 2.0 (the "License"); you may 4 | // not use this file except in compliance with the License. You may obtain 5 | // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 9 | // including conditions of title, non-infringement, merchantability, 10 | // or fitness for a particular purpose 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | // This file is a snapshot of a work in progress, originated by Mayo 14 | // Clinic SPPDG. 15 | 16 | /* 17 | Copyright (c) 2021, Ben Buhrow 18 | All rights reserved. 19 | 20 | Redistribution and use in source and binary forms, with or without 21 | modification, are permitted provided that the following conditions are met: 22 | 23 | 1. Redistributions of source code must retain the above copyright notice, this 24 | list of conditions and the following disclaimer. 25 | 2. Redistributions in binary form must reproduce the above copyright notice, 26 | this list of conditions and the following disclaimer in the documentation 27 | and/or other materials provided with the distribution. 28 | 29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 30 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 32 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 33 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 35 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 38 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 | 40 | The views and conclusions contained in the software and documentation are those 41 | of the authors and should not be interpreted as representing official policies, 42 | either expressed or implied, of the FreeBSD Project. 43 | */ 44 | #include "pmod.h" 45 | 46 | 47 | void pmodlib_init(pmod_t *pmod_state) 48 | { 49 | int i; 50 | 51 | // accomodate window sizes up to 8 52 | pmod_state->libpmod_gwin = (bignum **)malloc((1 << MAX_WINSIZE) * sizeof(bignum *)); 53 | pmod_state->libpmod_gwin[0] = zInit(); 54 | 55 | for (i = 1; i < (1 << MAX_WINSIZE); i++) 56 | { 57 | pmod_state->libpmod_gwin[i] = zInit(); 58 | } 59 | 60 | return; 61 | } 62 | 63 | void pmodlib_free(pmod_t *pmod_state) 64 | { 65 | int i; 66 | 67 | for (i = 0; i < (1 << MAX_WINSIZE); i++) 68 | { 69 | zFree(pmod_state->libpmod_gwin[i]); 70 | } 71 | free(pmod_state->libpmod_gwin); 72 | 73 | return; 74 | } 75 | 76 | void lr_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s) 77 | { 78 | bignum *acc; 79 | int i; 80 | int j; 81 | 82 | acc = zInit(); 83 | zCopy(mdata->one, acc); 84 | 85 | for (i = NWORDS - 1; i >= 0; i--) 86 | { 87 | for (j = 63; j >= 0; j--) 88 | { 89 | sqr_ptr(mdata, acc, acc, s); 90 | if (b->data[i] & (1ULL << j)) 91 | mul_ptr(mdata, acc, a, acc, s); 92 | } 93 | } 94 | 95 | memset(mdata->mtmp1->data, 0, 2 * NWORDS * sizeof(base_t)); 96 | zSet1(mdata->mtmp1, 1); 97 | mul_ptr(mdata, acc, mdata->mtmp1, c, s); 98 
| 99 | // final check to ensure c < N 100 | i = 1; 101 | for (j = NWORDS - 1; j >= 0; j--) 102 | { 103 | if (c->data[j] > mdata->n->data[j]) 104 | break; 105 | 106 | if (c->data[j] < mdata->n->data[j]) 107 | { 108 | i = 0; 109 | break; 110 | } 111 | } 112 | 113 | if (i) 114 | { 115 | mpSub(c->data, mdata->n->data, c->data, NWORDS); 116 | } 117 | 118 | zFree(acc); 119 | c->size = NWORDS; 120 | 121 | return; 122 | } 123 | 124 | int get_winsize(void) 125 | { 126 | // the window size is based on minimizing the total number of multiplications 127 | // in the windowed exponentiation. experiments show that this is best; 128 | // the growing size of the table doesn't change the calculus, at least 129 | // on the KNL. 130 | int size; 131 | int muls; 132 | int minmuls = 99999999; 133 | int minsize = 4; 134 | 135 | for (size = 2; size <= 8; size++) 136 | { 137 | muls = (MAXBITS / size) + (1 << size); 138 | if (muls < minmuls) 139 | { 140 | minmuls = muls; 141 | minsize = size; 142 | } 143 | } 144 | 145 | return minsize; 146 | } 147 | 148 | int get_bitwin(bignum *b, int bitloc, int winsize, int winmask) 149 | { 150 | int bstr; 151 | int bitstart = (bitloc - winsize + 1); 152 | int word = bitloc / 64; 153 | int word2 = bitstart / 64; 154 | 155 | bitstart = bitstart % 64; 156 | 157 | if (word == word2) 158 | { 159 | bstr = (b->data[word] >> bitstart) & winmask; 160 | } 161 | else 162 | { 163 | int upperbits = (bitloc % 64) + 1; 164 | 165 | bstr = (b->data[word2] >> bitstart); 166 | bstr |= ((b->data[word]) << (winsize - upperbits)); 167 | bstr &= winmask; 168 | } 169 | 170 | return bstr; 171 | } 172 | 173 | int get_oddbitwin(bignum *b, int bitloc, int winsize, int winmask, int *m) 174 | { 175 | int bstr; 176 | int bitstart = (bitloc - winsize + 1); 177 | int word = bitloc / 64; 178 | int word2 = bitstart / 64; 179 | 180 | bitstart = bitstart % 64; 181 | 182 | if (word == word2) 183 | { 184 | bstr = (b->data[word] >> bitstart) & winmask; 185 | } 186 | else 187 | { 188 | int 
upperbits = (bitloc % 64) + 1; 189 | 190 | bstr = (b->data[word2] >> bitstart); 191 | bstr |= ((b->data[word]) << (winsize - upperbits)); 192 | bstr &= winmask; 193 | } 194 | 195 | *m = 0; 196 | while ((bstr & 1) == 0) 197 | { 198 | if (bstr == 0) 199 | break; 200 | 201 | (*m)++; 202 | bstr >>= 1; 203 | } 204 | 205 | return bstr; 206 | } 207 | 208 | void lrwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s) 209 | { 210 | bignum *acc; 211 | int i, j, bit = MAXBITS - 1; 212 | int k = get_winsize(); 213 | bignum **g = pmod_state->libpmod_gwin; // storage for windowed method precomputation 214 | int mask; 215 | int bstr; 216 | 217 | mask = 0; 218 | for (j = 0; j < k; j++) 219 | { 220 | mask = (mask << 1) | 1; 221 | } 222 | 223 | acc = zInit(); 224 | zCopy(mdata->one, acc); 225 | 226 | // precomputations, b^i for 0 <= i < 2^k 227 | memcpy(g[1]->data, a->data, NWORDS * sizeof(base_t)); 228 | for (i = 2; i < (1 << k); i++) 229 | { 230 | mul_ptr(mdata, g[i - 1], a, g[i], s); 231 | } 232 | 233 | // L-R windowed exponentiation. Scan the exponent bit-vector 234 | // backward instead of flipping and shifting it. 235 | while (bit >= 0) 236 | { 237 | if (bit < k) 238 | { 239 | // grab the last bits of the exponent. 240 | // accommodates exponent lengths not divisible 241 | // by the window size 242 | mask = 0x0; 243 | for (j = 0; j < (bit + 1); j++) 244 | { 245 | sqr_ptr(mdata, acc, acc, s); 246 | mask = (mask << 1) | 1; 247 | } 248 | 249 | bstr = b->data[0] & mask; 250 | } 251 | else 252 | { 253 | // grab the next k bits of the exponent. 
254 | bstr = get_bitwin(b, bit, k, mask); 255 | for (j = 0; j < k; j++) 256 | { 257 | sqr_ptr(mdata, acc, acc, s); 258 | } 259 | } 260 | 261 | if (bstr > 0) 262 | mul_ptr(mdata, acc, g[bstr], acc, s); 263 | 264 | bit -= k; 265 | 266 | } 267 | 268 | memset(mdata->mtmp1->data, 0, 2 * NWORDS * sizeof(base_t)); 269 | zSet1(mdata->mtmp1, 1); 270 | mul_ptr(mdata, acc, mdata->mtmp1, c, s); 271 | 272 | // final check to ensure c < N 273 | i = 1; 274 | for (j = NWORDS - 1; j >= 0; j--) 275 | { 276 | if (c->data[j] > mdata->n->data[j]) 277 | break; 278 | 279 | if (c->data[j] < mdata->n->data[j]) 280 | { 281 | i = 0; 282 | break; 283 | } 284 | } 285 | 286 | if (i) 287 | { 288 | mpSub(c->data, mdata->n->data, c->data, NWORDS); 289 | } 290 | c->size = NWORDS; 291 | 292 | zFree(acc); 293 | 294 | return; 295 | } 296 | 297 | void lroddwin_powm(pmod_t *pmod_state, monty *mdata, bignum *c, bignum *a, bignum *b, bignum *n, bignum *s) 298 | { 299 | bignum *acc; 300 | int i, j, bit = MAXBITS - 1; 301 | int k = get_winsize(); 302 | bignum **g = pmod_state->libpmod_gwin; // storage for windowed method precomputation 303 | int mask; 304 | int bstr; 305 | int m; 306 | 307 | mask = 0; 308 | for (j = 0; j < k; j++) 309 | { 310 | mask = (mask << 1) | 1; 311 | } 312 | 313 | acc = zInit(); 314 | zCopy(mdata->one, acc); 315 | 316 | // precomputations, b^i for 0 <= i < 2^k, i odd (except i = 2). 317 | // half the setup cost for minimal extra overhead while scanning 318 | // the exponent vector... not as secure because order of 319 | // operations (squaring/multiply) depends on exponent bits. 
320 | memcpy(g[1]->data, a->data, NWORDS * sizeof(base_t)); 321 | mul_ptr(mdata, g[1], a, g[2], s); 322 | //printf("g[%d] ", 2); zPrint(g[2]); printf("\n"); 323 | for (i = 3; i < (1 << k); i += 2) 324 | { 325 | mul_ptr(mdata, g[i - 2], g[2], g[i], s); 326 | //printf("g[%d] ", i); zPrint(g[i]); printf("\n"); 327 | } 328 | 329 | //printf("acc init "); zPrint(acc); printf("\n"); 330 | 331 | // L-R windowed exponentiation. Scan the exponent bit-vector 332 | // backward instead of flipping and shifting it. 333 | while (bit >= 0) 334 | { 335 | if (bit < (k- 1)) 336 | { 337 | // grab the last bits of the exponent. 338 | // accommodates exponent lengths not divisible 339 | // by the window size 340 | mask = 0x0; 341 | k = (bit + 1); 342 | for (j = 0; j < k; j++) 343 | { 344 | mask = (mask << 1) | 1; 345 | } 346 | } 347 | 348 | // grab the next k bits of the exponent. 349 | bstr = get_oddbitwin(b, bit, k, mask, &m); 350 | for (j = 0; j < (k - m); j++) 351 | { 352 | sqr_ptr(mdata, acc, acc, s); 353 | //printf("sqr bit %03d ", bit); zPrint(acc); printf("\n"); 354 | } 355 | 356 | if (bstr > 0) 357 | { 358 | mul_ptr(mdata, acc, g[bstr], acc, s); 359 | //printf("mul bit %03d ", bit); zPrint(acc); printf("\n"); 360 | } 361 | 362 | for (j = 0; j < m; j++) 363 | { 364 | sqr_ptr(mdata, acc, acc, s); 365 | //printf("sqr bit %03d ", bit); zPrint(acc); printf("\n"); 366 | } 367 | 368 | bit -= k; 369 | } 370 | 371 | memset(mdata->mtmp1->data, 0, 2*NWORDS * sizeof(base_t)); 372 | zSet1(mdata->mtmp1, 1); 373 | mul_ptr(mdata, acc, mdata->mtmp1, c, s); 374 | 375 | // final check to ensure c < N 376 | i = 1; 377 | for (j = NWORDS - 1; j >= 0; j--) 378 | { 379 | if (c->data[j] > mdata->n->data[j]) 380 | break; 381 | 382 | if (c->data[j] < mdata->n->data[j]) 383 | { 384 | i = 0; 385 | break; 386 | } 387 | } 388 | 389 | if (i) 390 | { 391 | mpSub(c->data, mdata->n->data, c->data, NWORDS); 392 | } 393 | c->size = NWORDS; 394 | 395 | zFree(acc); 396 | 397 | return; 398 | } 399 | 400 | 
-------------------------------------------------------------------------------- /x64_bench/x64_arith.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 
/*
 * Plain (non-inline) declaration.  Under C99/C11 inline rules this makes the
 * definition below an external definition, so unoptimized builds and callers
 * in other translation units still link.
 */
void spSub3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sub, uint64_t *borrow);

/*
 * Single-precision subtract with borrow-in: computes u - v - w.
 *
 *   u, v    minuend and subtrahend.
 *   w       additional value to subtract (the incoming borrow in mpSub).
 *   sub     receives the low 64 bits of u - v - w.
 *   borrow  receives the number of borrows generated (0, 1 or 2).
 */
__inline void spSub3(uint64_t u, uint64_t v, uint64_t w, uint64_t *sub, uint64_t *borrow)
{
    uint64_t s, b;

    s = v;
    b = 0;

    /*
     * b is updated by the first adcq before w is read by the second subq,
     * so b must be early-clobber ("+&r").  Without it GCC may place w and b
     * in the same register when both hold the same value (e.g. both zero on
     * the first limb), and the first borrow would then be subtracted in
     * place of w.
     */
    __asm__("movq %[u], %%rax \n\t"
        "subq %[s], %%rax \n\t"     /* rax = u - v */
        "adcq $0, %[b] \n\t"        /* record borrow from u - v */
        "subq %[w], %%rax \n\t"     /* rax = u - v - w */
        "adcq $0, %[b] \n\t"        /* record borrow from subtracting w */
        "movq %%rax, %[s] \n\t"
        : [s] "+r"(s), [b] "+&r"(b)
        : [u] "r"(u), [w] "r"(w)
        : "rax", "memory", "cc");

    *sub = s;
    *borrow = b;

    return;
}
/*
 * Single-precision multiply-add: computes the 128-bit value u * v + w.
 *
 *   u, v     64-bit factors.
 *   w        64-bit addend.
 *   product  receives the low 64 bits of the result.
 *   carry    receives the high 64 bits of the result.
 *
 * *product and *carry are first loaded with v and u because the asm operands
 * are tied to those outputs ("0"/"1" matching constraints).
 */
__inline void spMulAdd1(uint64_t u, uint64_t v, uint64_t w,
    uint64_t *product, uint64_t *carry)
{
    *product = v;
    *carry = u;

    __asm__("movq %2, %%rax \n\t"   /* rax = u (tied to *carry) */
        "mulq %3 \n\t"              /* rdx:rax = u * v */
        "addq %4, %%rax \n\t"       /* low word += w */
        "adcq $0, %%rdx \n\t"       /* propagate carry into the high word */
        "movq %%rax, %0 \n\t"
        "movq %%rdx, %1 \n\t"
        : "=r"(*product), "=r"(*carry)
        : "1"(*carry), "0"(*product), "r"(w)
        : "rax", "rdx", "cc");

    return;
}
/*
 * Plain (non-inline) declaration.  Under C99/C11 inline rules this makes the
 * definition below an external definition, so unoptimized builds and callers
 * in other translation units still link.
 */
void spMulAdd2x(uint64_t u, uint64_t v, uint64_t w,
    uint64_t c, uint64_t *product, uint64_t *carry);

/*
 * Single-precision multiply with two addends, using MULX (requires BMI2):
 * computes the 128-bit value u * v + w + c.
 *
 *   u, v     64-bit factors.
 *   w        current output word to accumulate into.
 *   c        previous carry.
 *   product  receives the low 64 bits of the result.
 *   carry    receives the high 64 bits of the result.
 *
 * The result cannot overflow 128 bits:
 * 0xffffffffffffffff ^ 2 + 2 * 0xffffffffffffffff = 0xffffffffffffffffffffffffffffffff
 */
__inline void spMulAdd2x(uint64_t u, uint64_t v, uint64_t w,
    uint64_t c, uint64_t *product, uint64_t *carry)
{
    uint64_t lo = v;
    uint64_t hi = u;
    uint64_t acc = w;   /* private copy: the asm adds c into it, so it must be
                           a read-write operand, not an input-only one */

    /* MULX leaves the flags untouched, so the carry produced by the first
       addq is still available to the adcq that follows the multiply. */
    __asm__("movq %[hi], %%rdx \n\t"            /* implicit MULX source = u */
        "addq %[cin], %[acc] \n\t"              /* add current output to previous carry */
        "mulx %[lo], %[lo], %[hi] \n\t"         /* hi:lo = u * v */
        "adcq $0, %[hi] \n\t"                   /* add carry into himul result */
        "addq %[acc], %[lo] \n\t"               /* lowmul + current output + previous carry */
        "adcq $0, %[hi] \n\t"                   /* carry prop into himul */
        : [lo] "+r"(lo), [hi] "+r"(hi), [acc] "+r"(acc)
        : [cin] "r"(c)
        : "rdx", "cc");

    *product = lo;
    *carry = hi;

    return;
}
/*
 * Double multiply-accumulate into a three-word accumulator:
 *     (w[2] : w[1] : w[0]) += u * v + n * s
 *
 *   u, v  first pair of 64-bit factors.
 *   n, s  second pair of 64-bit factors.
 *   w     pointer to three consecutive 64-bit words; all three are read
 *         and rewritten (w[2] collects the carries out of w[1]).
 */
__inline void spMul2Acc(uint64_t u, uint64_t v, uint64_t n, uint64_t s, uint64_t * w)
{
    // for use with product scanning approach...
    // multiply u*v.
    // add result into w[0] and w[1]; the carry out goes into w[2].
    // multiply n*s.
    // add result into w[0] and w[1]; the carry out goes into w[2].

    __asm__("movq %0, %%rax \n\t"
        "mulq %1 \n\t"                  /* rdx:rax = u * v */
        "movq 16(%4), %%r10 \n\t"       /* r10 = w[2] */
        "addq 0(%4), %%rax \n\t"        /* low product + w[0] */
        "movq %%rax, %%r11 \n\t"        /* park partial low word (mov keeps flags) */
        "adcq 8(%4), %%rdx \n\t"        /* high product + w[1] + carry */
        "movq %%rdx, %%r12 \n\t"        /* park partial high word (mov keeps flags) */
        "adcq $0, %%r10 \n\t"           /* carry into the third word */
        "movq %2, %%rax \n\t"
        "mulq %3 \n\t"                  /* rdx:rax = n * s */
        "addq %%r11, %%rax \n\t"
        "adcq %%r12, %%rdx \n\t"
        "adcq $0, %%r10 \n\t"
        "movq %%rax, 0(%4) \n\t"
        "movq %%rdx, 8(%4) \n\t"
        "movq %%r10, 16(%4) \n\t"
        :
        : "r"(u), "r"(v), "r"(n), "r"(s), "r"(w)
        : "rax", "rdx", "r10", "r11", "r12", "cc", "memory");

    return;
}
/*
 * Doubled multiply-add for squaring cross terms:
 *     (*carryout : w[1] : w[0]) = 2 * u * v + (w[1] : w[0]) + (carryin << 64)
 *
 *   u, v      64-bit factors.
 *   carryin   value added into the second result word.
 *   w         pointer to two 64-bit words that are read and rewritten.
 *   carryout  receives whatever overflows past w[1].
 */
__inline void spMulDblAdd_1(uint64_t u, uint64_t v, uint64_t carryin, uint64_t * w, uint64_t *carryout)
{
    // for use with sos squaring approach...
    // multiply u*v and add carryin to the 2nd result word.
    // add result twice into w[0] and w[1] and return any further carryout.

    __asm__("movq %3, %%rax \n\t"
        "mulq %4 \n\t"                  /* rdx:rax = u * v */
        "xorq %%r10, %%r10 \n\t"        /* r10 collects the overflow word */
        "addq %%rax, %%rax \n\t"        /* double the 128-bit product ... */
        "adcq %%rdx, %%rdx \n\t"
        "adcq $0, %%r10 \n\t"           /* ... keeping the bit shifted out */
        "addq %5, %%rax \n\t"           /* + w[0] */
        "adcq %6, %%rdx \n\t"           /* + w[1] + carry */
        "adcq $0, %%r10 \n\t"
        "adcq %7, %%rdx \n\t"           /* + carryin; the previous adc into r10 cannot
                                           overflow, so no stray carry is added here */
        "adcq $0, %%r10 \n\t"
        "movq %%rax, %0 \n\t"
        "movq %%rdx, %1 \n\t"
        "movq %%r10, %2 \n\t"
        : "=r"(w[0]), "=r"(w[1]), "=r"(*carryout)
        : "r"(u), "r"(v), "0"(w[0]), "1"(w[1]), "r"(carryin)
        : "rax", "rdx", "r10", "cc");

    return;
}
/*
 * Doubled multiply-accumulate into a three-word accumulator:
 *     (w[2] : w[1] : w[0]) += 2 * u * v
 *
 *   u, v  64-bit factors.
 *   w     pointer to three 64-bit words that are read and rewritten.
 */
__inline void spMulDblAdd_3(uint64_t u, uint64_t v, uint64_t * w)
{
    // for use with fips squaring approach...
    // multiply u*v.
    // add result twice into w[0], w[1], and w[2].

    __asm__("movq %3, %%rax \n\t"
        "mulq %4 \n\t"                  /* rdx:rax = u * v */
        "movq %7, %%r10 \n\t"           /* r10 = w[2] */
        "addq %%rax, %%rax \n\t"        /* double the 128-bit product ... */
        "adcq %%rdx, %%rdx \n\t"
        "adcq $0, %%r10 \n\t"           /* ... carrying the shifted-out bit into w[2] */
        "addq %5, %%rax \n\t"           /* + w[0] */
        "adcq %6, %%rdx \n\t"           /* + w[1] + carry */
        "adcq $0, %%r10 \n\t"
        "movq %%rax, %0 \n\t"
        "movq %%rdx, %1 \n\t"
        "movq %%r10, %2 \n\t"
        : "=r"(w[0]), "=r"(w[1]), "=r"(w[2])
        : "r"(u), "r"(v), "0"(w[0]), "1"(w[1]), "r"(w[2])
        : "rax", "rdx", "r10", "cc");

    return;
}
/*
 * w = u + n for a single-word addend n, stopping as soon as the carry dies.
 *
 * Intended for the in-place case (u and w refer to the same words), where
 * the untouched upper words are already correct.  A carry that survives
 * past word sz-1 is added into word sz, so both arrays must provide that
 * extra word.
 */
void mpAdd1b(uint64_t * u, uint64_t n, uint64_t * w, int sz)
{
    uint64_t carry = 0;
    int idx;

    spAdd3(u[0], n, carry, &w[0], &carry);

    for (idx = 1; (idx < sz) && (carry > 0); idx++)
    {
        spAdd(u[idx], carry, &w[idx], &carry);
    }

    if (carry)
        spAdd(u[idx], carry, &w[idx], &carry);

    return;
}

/*
 * w = u + n for a single-word addend n, visiting every one of the sz words.
 *
 * A carry out of word sz-1 is added into word sz, so both arrays must
 * provide that extra word.
 */
void mpAdd1(uint64_t * u, uint64_t n, uint64_t * w, int sz)
{
    uint64_t carry = 0;
    int idx;

    spAdd3(u[0], n, carry, &w[0], &carry);

    for (idx = 1; idx < sz; idx++)
    {
        spAdd(u[idx], carry, &w[idx], &carry);
    }

    if (carry)
        spAdd(u[idx], carry, &w[idx], &carry);

    return;
}

/*
 * w = u + v over sz words.  The final carry is always stored in w[sz],
 * so w must have room for sz + 1 words.
 */
void mpAdd(uint64_t * u, uint64_t * v, uint64_t * w, int sz)
{
    uint64_t carry = 0;
    int idx = 0;

    while (idx < sz)
    {
        spAdd3(u[idx], v[idx], carry, &w[idx], &carry);
        idx++;
    }
    w[idx] = carry;

    return;
}
| Copyright (c) 2014, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | 29 | 30 | Copyright (c) 2018 by The Mayo Clinic, though its Special Purpose 31 | Processor Development Group (SPPDG). All Rights Reserved Worldwide. 32 | Licensed under the Apache License, Version 2.0 (the "License"); you may 33 | not use this file except in compliance with the License. You may obtain 34 | a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 
35 | Unless required by applicable law or agreed to in writing, software 36 | distributed under the License is distributed on an "AS IS" BASIS, 37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 38 | including conditions of title, non-infringement, merchantability, 39 | or fitness for a particular purpose 40 | See the License for the specific language governing permissions and 41 | limitations under the License. 42 | This file is a snapshot of a work in progress, originated by Mayo 43 | Clinic SPPDG. 44 | */ 45 | 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include //for gettimeofday using gcc 55 | #include 56 | #include 57 | 58 | // ============================================================================ 59 | // vecarith config 60 | // ============================================================================ 61 | #ifdef BASE52 62 | #define DIGITBITS 52 63 | #define base_t uint64_t 64 | #define base_signed_t int64_t 65 | // for 52-bit radix 66 | #define HALFBITS 26 67 | #define HALFMASK 0x3ffffff 68 | #define MAXDIGIT 0xfffffffffffffULL 69 | #define HIBITMASK 0x8000000000000ULL 70 | #define VECLEN 8 71 | #else 72 | #define DIGITBITS 32 73 | #define base_t uint32_t 74 | #define base_signed_t int32_t 75 | #define HALFBITS 16 76 | #define HALFMASK 0xffff 77 | #define MAXDIGIT 0xffffffff 78 | #define HIBITMASK 0x80000000 79 | #define VECLEN 16 80 | #endif 81 | 82 | #ifndef MAXBITS 83 | #define MAXBITS 512 84 | #endif 85 | #define NWORDS (MAXBITS / DIGITBITS) 86 | #define DEFINED 1 87 | #define MAX_WINSIZE 8 88 | 89 | // ============================================================================ 90 | // useful definitions 91 | // ============================================================================ 92 | #define MIN(a,b) ((a) < (b)? (a) : (b)) 93 | #define MAX(a,b) ((a) > (b)? (a) : (b)) 94 | #define SIGN(a) ((a) < 0 ? 
-1 : 1) 95 | 96 | #define INV_2_POW_48 3.5527136788005009293556213378906e-15 97 | #define INV_2_POW_52 2.2204460492503130808472633361816e-16 98 | #define INV_2_POW_64 5.4210108624275221700372640043497e-20 99 | #define INV_2_POW_26 1.490116119384765625e-8 100 | #define INV_2_POW_32 2.3283064365386962890625e-10 101 | #define PI 3.1415926535897932384626433832795 102 | #define LN2 0.69314718055994530941723212145818 103 | #ifdef _MSC_VER 104 | #define strto_uint64 _strtoui64 105 | #else 106 | #define strto_uint64 strtoull 107 | #endif 108 | 109 | // portable 64-bit formatting 110 | #if defined(_MSC_VER) || defined(__MINGW32__) 111 | #define PRId64 "I64d" 112 | #define PRIu64 "I64u" 113 | #define PRIx64 "I64x" 114 | #elif defined(__x86_64__) 115 | #define PRId64 "ld" 116 | #define PRIu64 "lu" 117 | #define PRIx64 "lx" 118 | #define BSCu "lu" 119 | #define BSCx "lx" 120 | #define BSCu0 "019lu" // base string conversion with leading zeros 121 | #define BSCx0 "019lx" // base string conversion with leading zeros 122 | #elif defined(__i386__) 123 | #define PRId64 "lld" 124 | #define PRIu64 "llu" 125 | #define PRIx64 "llx" 126 | #define BSCu "u" 127 | #define BSCx "x" 128 | #define BSCu0 "09u" 129 | #define BSCx0 "09x" 130 | #endif 131 | 132 | 133 | #if defined (__INTEL_COMPILER) 134 | #define ALIGNED_MEM __declspec(align(64)) 135 | #else 136 | #define ALIGNED_MEM __attribute__((aligned(64))) 137 | #endif 138 | 139 | 140 | // ============================================================================ 141 | // memory allocation 142 | // ============================================================================ 143 | static __inline void * xmalloc_align(size_t len) 144 | { 145 | #if defined (_MSC_VER) || defined(__MINGW32__) 146 | void *ptr = _aligned_malloc(len, 64); 147 | #define align_free _aligned_free 148 | #elif defined (__APPLE__) 149 | void *ptr = malloc(len); 150 | #elif defined (__GNUC__) 151 | void *ptr = memalign(64, len); 152 | #define align_free free 153 | #else 
/*
 * malloc wrapper that aborts the program on allocation failure.
 *
 * A NULL result for a zero-byte request is not a failure (malloc may
 * legitimately return NULL in that case) and is passed through to the caller.
 */
static __inline void * xmalloc(size_t len) {
    void *ptr = malloc(len);
    if (ptr == NULL && len != 0) {
        // %zu prints the full size_t; the old (uint32_t) cast truncated
        // requests of 4 GiB and above in the diagnostic.
        printf("failed to allocate %zu bytes\n", len); fflush(stdout);
        exit(-1);
    }
    return ptr;
}

/*
 * calloc wrapper that aborts the program on allocation failure.
 *
 * Returns zero-initialized storage for num objects of len bytes each.
 * A NULL result for an empty request (num == 0 or len == 0) is passed through.
 */
static __inline void * xcalloc(size_t num, size_t len) {
    void *ptr = calloc(num, len);
    if (ptr == NULL && num != 0 && len != 0) {
        // Saturate the reported size: num * len itself may have overflowed,
        // which is one of the reasons calloc can fail.
        size_t total = (num > SIZE_MAX / len) ? SIZE_MAX : num * len;
        printf("failed to calloc %zu bytes\n", total); fflush(stdout);
        exit(-1);
    }
    return ptr;
}

/*
 * realloc wrapper that aborts the program on allocation failure.
 *
 * With len == 0 some C libraries release the block and return NULL; that is
 * not an out-of-memory condition, so NULL is returned to the caller instead
 * of terminating the process.
 */
static __inline void * xrealloc(void *iptr, size_t len) {
    void *ptr = realloc(iptr, len);
    if (ptr == NULL && len != 0) {
        printf("failed to reallocate %zu bytes\n", len); fflush(stdout);
        exit(-1);
    }
    return ptr;
}
int get_winsize(void); 226 | int get_bitwin(bignum* e, int bitloc, int winsize, int lane, int winmask); 227 | // 32-bit words, 16x 228 | int vec_montgomery_setup(bignum * a, bignum *r, bignum *rhat, base_t *rho); 229 | void vecmulmod(bignum *a, bignum *b, bignum *c, bignum *n, bignum *s, monty *mdata); 230 | void vecsqrmod(bignum *a, bignum *c, bignum *n, bignum *s, monty *mdata); 231 | void vecmodexp(bignum *d, bignum *b, bignum *e, bignum *m, 232 | bignum *s, bignum *one, monty *mdata); 233 | // 52-bit words, 8x 234 | int vec_montgomery_setup52(bignum * a, bignum *r, bignum *rhat, base_t *rho); 235 | void vecmulmod52(bignum *a, bignum *b, bignum *c, bignum *n, bignum *s, monty *mdata); 236 | void vecsqrmod52(bignum *a, bignum *c, bignum *n, bignum *s, monty *mdata); 237 | void vecmodexp52(bignum *d, bignum *b, bignum *e, bignum *m, 238 | bignum *s, bignum *one, monty *mdata); 239 | 240 | extern void(*vecmulmod_ptr)(bignum *, bignum *, bignum *, bignum *, bignum *, monty *); 241 | extern void(*vecsqrmod_ptr)(bignum *, bignum *, bignum *, bignum *, monty *); 242 | extern int(*montsetup_ptr)(bignum *, bignum *, bignum *, base_t *); 243 | extern void(*vecmodexp_ptr)(bignum *, bignum *, bignum *, bignum *, bignum *, bignum *, monty *m); 244 | 245 | // ============================================================================ 246 | // vector bignum arithmetic and conversions 247 | // ============================================================================ 248 | bignum * vecInit(void); 249 | void vecCopy(bignum * src, bignum * dest); 250 | void vecCopyn(bignum * src, bignum * dest, int size); 251 | void vecClear(bignum *n); 252 | void vecFree(bignum *n); 253 | void broadcast_bignum_to_vec(bignum *src, bignum *vec_dest); 254 | bignum * bignums_to_vec(bignum **src, int num); 255 | void insert_bignum_in_vec(bignum *src, bignum *vec_dest, int num); 256 | void extract_bignum_from_vec(bignum *vec_src, bignum *dest, int num); 257 | void copy_vec_lane(bignum *src, bignum 
/*
 * Emulated "multiply high" for sixteen unsigned 32-bit lanes: every lane of
 * the result holds the upper 32 bits of the 64-bit product a[i] * b[i].
 *
 * _mm512_mul_epu32 only multiplies the even 32-bit lanes, so the odd lanes
 * are swapped into even position (shuffle 0xB1 exchanges neighbours) and
 * multiplied separately.  The odd products already carry their high half in
 * an odd lane; the even products have theirs moved down by one more swap.
 */
__m512i __inline _mm512_mulhi_epu32(__m512i a, __m512i b)
{
    __m512i a_swapped = _mm512_shuffle_epi32(a, 0xB1);
    __m512i b_swapped = _mm512_shuffle_epi32(b, 0xB1);
    __m512i prod_even = _mm512_mul_epu32(a, b);
    __m512i prod_odd = _mm512_mul_epu32(a_swapped, b_swapped);
    __m512i hi_even = _mm512_shuffle_epi32(prod_even, 0xB1);

    // even result lanes (mask 0x5555) come from hi_even, odd ones from prod_odd
    return _mm512_mask_mov_epi32(prod_odd, 0x5555, hi_even);
}
/*
 * Per-lane borrow propagation: subtracts the borrow-in bit from each
 * unsigned 32-bit lane.
 *
 *   a     sixteen unsigned 32-bit lanes.
 *   c     borrow-in mask; 1 is subtracted from every lane whose bit is set.
 *   cout  receives the borrow-out mask: set for lanes where the subtraction
 *         wrapped, i.e. the lane was 0 and its borrow-in bit was set.
 *
 * Returns a - c per lane.
 */
__m512i __inline _mm512_subborrow_epi32(__m512i a, __mmask16 c, __mmask16 *cout)
{
    __m512i t = _mm512_sub_epi32(a, _mm512_maskz_set1_epi32(c, 1));
    // A wrap leaves t above a.  Testing only a == 0 also flagged lanes whose
    // borrow-in was clear, where nothing was subtracted.  This matches the
    // wrap test used by _mm512_sbb_epi32.
    *cout = _mm512_cmpgt_epu32_mask(t, a);
    return t;
}
/*
 * Per-lane subtract with borrow for sixteen unsigned 32-bit lanes.
 *
 *   a, b  minuend and subtrahend lanes.
 *   c     borrow-in mask; 1 more is subtracted from lanes whose bit is set.
 *   cout  receives the borrow-out mask: set for lanes where either a - b or
 *         the following borrow-in subtraction wrapped.
 *
 * Returns a - b - c per lane.
 */
__m512i __inline _mm512_sbb_epi32(__m512i a, __mmask16 c, __m512i b, __mmask16 *cout)
{
    __m512i borrow_in = _mm512_maskz_set1_epi32(c, 1);
    __m512i diff = _mm512_sub_epi32(a, b);
    __m512i result = _mm512_sub_epi32(diff, borrow_in);

    // an unsigned subtraction wrapped exactly when its result exceeds its minuend
    __mmask16 wrapped_first = _mm512_cmpgt_epu32_mask(diff, a);
    __mmask16 wrapped_second = _mm512_cmpgt_epu32_mask(result, diff);

    *cout = _mm512_kor(wrapped_first, wrapped_second);
    return result;
}
multiply the 16-element 32-bit vectors a and b to produce two 8-element 401 | // 64-bit vector products e64 and o64, where e64 is the even elements 402 | // of a*b and o64 is the odd elements of a*b 403 | //__m512i t1 = _mm512_shuffle_epi32(a, 0xB1); 404 | //__m512i t2 = _mm512_shuffle_epi32(b, 0xB1); 405 | 406 | //_mm512_shuffle_epi32(a, 0xB1); 407 | //_mm512_shuffle_epi32(b, 0xB1); 408 | *e64 = _mm512_mul_epu32(a, b); 409 | *o64 = _mm512_mul_epu32(_mm512_shuffle_epi32(a, 0xB1), _mm512_shuffle_epi32(b, 0xB1)); 410 | 411 | return; 412 | } 413 | 414 | #define _mm512_iseven_epi32(x) \ 415 | _mm512_cmp_epi32_mask(_mm512_setzero_epi32(), _mm512_and_epi32((x), _mm512_set1_epi32(1)), _MM_CMPINT_EQ) 416 | 417 | #define _mm512_isodd_epi32(x) \ 418 | _mm512_cmp_epi32_mask(_mm512_set1_epi32(1), _mm512_and_epi32((x), _mm512_set1_epi32(1)), _MM_CMPINT_EQ) 419 | 420 | 421 | -------------------------------------------------------------------------------- /x64_bench/main.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 by The Mayo Clinic, though its Special Purpose 2 | // Processor Development Group (SPPDG). All Rights Reserved Worldwide. 3 | // Licensed under the Apache License, Version 2.0 (the "License"); you may 4 | // not use this file except in compliance with the License. You may obtain 5 | // a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 9 | // including conditions of title, non-infringement, merchantability, 10 | // or fitness for a particular purpose 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | // This file is a snapshot of a work in progress, originated by Mayo 14 | // Clinic SPPDG. 
15 | 16 | /* 17 | Copyright (c) 2021, Ben Buhrow 18 | All rights reserved. 19 | 20 | Redistribution and use in source and binary forms, with or without 21 | modification, are permitted provided that the following conditions are met: 22 | 23 | 1. Redistributions of source code must retain the above copyright notice, this 24 | list of conditions and the following disclaimer. 25 | 2. Redistributions in binary form must reproduce the above copyright notice, 26 | this list of conditions and the following disclaimer in the documentation 27 | and/or other materials provided with the distribution. 28 | 29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 30 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 32 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 33 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 34 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 35 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 38 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 | 40 | The views and conclusions contained in the software and documentation are those 41 | of the authors and should not be interpreted as representing official policies, 42 | either expressed or implied, of the FreeBSD Project. 43 | */ 44 | 45 | // this file defines a test harness for various modular exponentiation routines. 46 | // We read command line options and execute the appropriate test(s). 
47 | #include "util.h" 48 | #include "bigarith.h" 49 | #include "pmod.h" 50 | #include "monty_arith.h" 51 | #include "x64_arith.h" 52 | #include "gmp.h" 53 | 54 | void mul_test(int iterations, int verbose, uint64_t lcg_state, pmod_t* pmod_state) 55 | { 56 | int i, j, r; 57 | bignum* a, * b, * c, * n, * s; 58 | monty* mdata; 59 | uint64_t chksum = 0; 60 | 61 | struct timeval stopt; // stop time of this job 62 | struct timeval startt; // start time of this job 63 | double t_time = 0.; 64 | 65 | a = zInit(); 66 | b = zInit(); 67 | c = zInit(); 68 | n = zInit(); 69 | s = zInit(); 70 | mdata = monty_alloc(); 71 | 72 | for (j = 0; j < NWORDS; j++) 73 | a->data[j] = spRand64(&lcg_state); 74 | a->size = NWORDS; 75 | 76 | for (j = 0; j < NWORDS; j++) 77 | b->data[j] = spRand64(&lcg_state); 78 | b->size = NWORDS; 79 | 80 | for (j = 0; j < NWORDS; j++) 81 | n->data[j] = spRand64(&lcg_state); 82 | n->size = NWORDS; 83 | 84 | if ((n->data[0] & 1) == 0) 85 | n->data[0]++; 86 | 87 | // initialize the montgomery representation of this modulus. 
88 | monty_init(mdata, n, verbose); 89 | 90 | if (verbose > 0) 91 | { 92 | printf("initial a = "); zPrint(a); printf("\n"); 93 | printf("initial b = "); zPrint(b); printf("\n"); 94 | printf("initial n = "); zPrint(n); printf("\n"); 95 | } 96 | 97 | to_monty(mdata, a); 98 | to_monty(mdata, b); 99 | 100 | gettimeofday(&startt, NULL); 101 | 102 | for (i = 0; i < iterations; i++) 103 | { 104 | if (verbose > 1) 105 | { 106 | printf("test %d:\n", i); 107 | printf("a = "); zPrint(a); printf("\n"); 108 | printf("b = "); zPrint(b); printf("\n"); 109 | printf("n = "); zPrint(n); printf("\n"); 110 | } 111 | 112 | mul_ptr(mdata, a, b, a, n); 113 | chksum += a->data[0]; 114 | 115 | if (verbose > 0) 116 | { 117 | printf("result: "); zPrint(a); printf("\n"); 118 | } 119 | } 120 | 121 | gettimeofday(&stopt, NULL); 122 | t_time = my_difftime(&startt, &stopt); 123 | 124 | if (verbose > 0) 125 | { 126 | printf("final result: "); zPrint(a); printf("\n"); 127 | } 128 | 129 | printf("final chksum: %lu\n", chksum); 130 | 131 | printf("%d mulredc tests took %.4f seconds\n", iterations, t_time); 132 | 133 | zFree(a); 134 | zFree(b); 135 | zFree(c); 136 | zFree(n); 137 | zFree(s); 138 | monty_free(mdata); 139 | 140 | return; 141 | } 142 | 143 | void sqr_test(int iterations, int verbose, uint64_t lcg_state, pmod_t* pmod_state) 144 | { 145 | int i, j, r; 146 | bignum* a, * b, * c, * n, * s; 147 | monty* mdata; 148 | uint64_t chksum = 0; 149 | 150 | struct timeval stopt; // stop time of this job 151 | struct timeval startt; // start time of this job 152 | double t_time = 0.; 153 | 154 | a = zInit(); 155 | b = zInit(); 156 | c = zInit(); 157 | n = zInit(); 158 | s = zInit(); 159 | mdata = monty_alloc(); 160 | 161 | for (j = 0; j < NWORDS; j++) 162 | a->data[j] = spRand64(&lcg_state); 163 | a->size = NWORDS; 164 | 165 | for (j = 0; j < NWORDS; j++) 166 | n->data[j] = spRand64(&lcg_state); 167 | n->size = NWORDS; 168 | 169 | if ((n->data[0] & 1) == 0) 170 | n->data[0]++; 171 | 172 | // initialize 
the montgomery representation of this modulus. 173 | monty_init(mdata, n, verbose); 174 | 175 | if (verbose > 0) 176 | { 177 | printf("initial a = "); zPrint(a); printf("\n"); 178 | printf("initial n = "); zPrint(n); printf("\n"); 179 | } 180 | 181 | to_monty(mdata, a); 182 | 183 | gettimeofday(&startt, NULL); 184 | 185 | for (i = 0; i < iterations; i++) 186 | { 187 | if (verbose > 1) 188 | { 189 | printf("test %d:\n", i); 190 | printf("a = "); zPrint(a); printf("\n"); 191 | printf("n = "); zPrint(n); printf("\n"); 192 | } 193 | 194 | sqr_ptr(mdata, a, a, n); 195 | chksum += a->data[0]; 196 | 197 | if (verbose > 1) 198 | { 199 | printf("result: "); zPrint(a); printf("\n"); 200 | } 201 | } 202 | 203 | gettimeofday(&stopt, NULL); 204 | t_time = my_difftime(&startt, &stopt); 205 | 206 | if (verbose > 0) 207 | { 208 | printf("final result: "); zPrint(a); printf("\n"); 209 | } 210 | 211 | printf("final chksum: %lu\n", chksum); 212 | printf("%d sqrredc tests took %.4f seconds\n", iterations, t_time); 213 | 214 | zFree(a); 215 | zFree(b); 216 | zFree(c); 217 | zFree(n); 218 | zFree(s); 219 | monty_free(mdata); 220 | 221 | return; 222 | } 223 | 224 | void monty_test(int iterations, int verbose, uint64_t lcg_state, pmod_t *pmod_state) 225 | { 226 | int i, j, r; 227 | bignum *a, *b, *c, *n, *s; 228 | monty *mdata; 229 | 230 | struct timeval stopt; // stop time of this job 231 | struct timeval startt; // start time of this job 232 | double t_time = 0.; 233 | 234 | a = zInit(); 235 | b = zInit(); 236 | c = zInit(); 237 | n = zInit(); 238 | s = zInit(); 239 | mdata = monty_alloc(); 240 | 241 | gettimeofday(&startt, NULL); 242 | 243 | for (i = 0; i < iterations; i++) 244 | { 245 | for (j = 0; j < NWORDS; j++) 246 | a->data[j] = spRand64(&lcg_state); 247 | a->size = NWORDS; 248 | 249 | for (j = 0; j < NWORDS; j++) 250 | b->data[j] = spRand64(&lcg_state); 251 | b->size = NWORDS; 252 | 253 | for (j = 0; j < NWORDS; j++) 254 | n->data[j] = spRand64(&lcg_state); 255 | n->size = 
NWORDS; 256 | 257 | if ((n->data[0] & 1) == 0) 258 | n->data[0]++; 259 | 260 | // initialize the montgomery representation of this modulus. 261 | monty_init(mdata, n, verbose); 262 | 263 | if (verbose) 264 | { 265 | printf("test %d:\n", i); 266 | printf("a = "); zPrint(a); printf("\n"); 267 | printf("b = "); zPrint(b); printf("\n"); 268 | printf("n = "); zPrint(n); printf("\n"); 269 | } 270 | 271 | to_monty(mdata, a); 272 | 273 | lroddwin_powm(pmod_state, mdata, c, a, b, n, s); 274 | 275 | if (verbose) 276 | { 277 | printf("result: "); zPrint(c); printf("\n"); 278 | } 279 | } 280 | 281 | gettimeofday(&stopt, NULL); 282 | t_time = my_difftime(&startt, &stopt); 283 | 284 | printf("%d powm tests took %.4f seconds\n", iterations, t_time); 285 | 286 | zFree(a); 287 | zFree(b); 288 | zFree(c); 289 | zFree(n); 290 | zFree(s); 291 | monty_free(mdata); 292 | 293 | return; 294 | } 295 | 296 | int main(int argc, char **argv) 297 | { 298 | struct timeval stopt; // stop time of this job 299 | struct timeval startt; // start time of this job 300 | double t_time = 0.; 301 | int iterations = 1000; 302 | int seed; 303 | uint64_t *lcg_state; 304 | pmod_t *pmod_state; 305 | int verbose = 0; 306 | int taskid, numtasks; 307 | 308 | if (argc > 1) 309 | { 310 | iterations = atoi(argv[1]); 311 | } 312 | 313 | if (argc > 2) 314 | { 315 | verbose = atoi(argv[2]); 316 | } 317 | 318 | if (argc > 3) 319 | { 320 | seed = atoi(argv[3]); 321 | } 322 | else 323 | { 324 | gettimeofday(&startt, NULL); 325 | seed = hash64((startt.tv_usec)); 326 | } 327 | 328 | 329 | lcg_state = (uint64_t *)malloc(1 * sizeof(uint64_t)); 330 | lcg_state[0] = hash64((seed)); 331 | pmod_state = (pmod_t *)malloc(sizeof(pmod_t)); 332 | pmodlib_init(pmod_state); 333 | 334 | printf("commencing benchmarks with MAXBITS = %d, NWORDS = %d\n", 335 | MAXBITS, NWORDS); 336 | 337 | // configure benchmark tests 338 | int do_pmod_tests = 1; 339 | int do_mulsqr_tests = 1; 340 | 341 | int bench_sos = 1; 342 | int bench_fios = 1; 343 | 
int bench_fips = 1; 344 | int bench_cios = 1; 345 | int bench_bps = 1; 346 | int bench_gmp = 1; 347 | 348 | if (do_mulsqr_tests) 349 | { 350 | if (bench_sos) 351 | { 352 | printf("commencing %d mulredc iterations using mulmod_sos\n", iterations); 353 | mul_ptr = &mulmod_sos; 354 | mul_test(iterations, verbose, lcg_state[0], pmod_state); 355 | } 356 | 357 | if (bench_cios) 358 | { 359 | printf("commencing %d mulredc iterations using mulmod_cios\n", iterations); 360 | mul_ptr = &mulmod_cios; 361 | mul_test(iterations, verbose, lcg_state[0], pmod_state); 362 | } 363 | 364 | if (bench_bps) 365 | { 366 | printf("commencing %d mulredc iterations using mulmod_bps\n", iterations); 367 | mul_ptr = &mulmod_bps; 368 | mul_test(iterations, verbose, lcg_state[0], pmod_state); 369 | } 370 | 371 | if (bench_fios) 372 | { 373 | printf("commencing %d mulredc iterations using mulmod_fios\n", iterations); 374 | mul_ptr = &mulmod_fios; 375 | mul_test(iterations, verbose, lcg_state[0], pmod_state); 376 | } 377 | 378 | if (bench_fips) 379 | { 380 | printf("commencing %d mulredc iterations using mulmod_fips\n", iterations); 381 | mul_ptr = &mulmod_fips; 382 | mul_test(iterations, verbose, lcg_state[0], pmod_state); 383 | } 384 | 385 | if (bench_sos) 386 | { 387 | printf("commencing %d sqrredc iterations using sqrmod_sos\n", iterations); 388 | sqr_ptr = &sqrmod_sos; 389 | sqr_test(iterations, verbose, lcg_state[0], pmod_state); 390 | } 391 | 392 | if (bench_cios) 393 | { 394 | printf("commencing %d sqrredc iterations using sqrmod_cios\n", iterations); 395 | sqr_ptr = &sqrmod_cios; 396 | sqr_test(iterations, verbose, lcg_state[0], pmod_state); 397 | } 398 | 399 | if (bench_bps) 400 | { 401 | printf("commencing %d sqrredc iterations using sqrmod_bps\n", iterations); 402 | sqr_ptr = &sqrmod_bps; 403 | sqr_test(iterations, verbose, lcg_state[0], pmod_state); 404 | } 405 | 406 | if (bench_fios) 407 | { 408 | printf("commencing %d sqrredc iterations using sqrmod_fios\n", iterations); 409 | 
sqr_ptr = &sqrmod_fios; 410 | sqr_test(iterations, verbose, lcg_state[0], pmod_state); 411 | } 412 | 413 | if (bench_fips) 414 | { 415 | printf("commencing %d sqrredc iterations using sqrmod_fips\n", iterations); 416 | sqr_ptr = &sqrmod_fips; 417 | sqr_test(iterations, verbose, lcg_state[0], pmod_state); 418 | } 419 | 420 | // gmp SOS mul 421 | if (bench_gmp) 422 | { 423 | mpz_t a, b, n, t, nhat, r, u; 424 | int i, j, k; 425 | struct timeval stopt; // stop time of this job 426 | struct timeval startt; // start time of this job 427 | double t_time = 0.; 428 | int numbits = MAXBITS; 429 | uint64_t lcg = lcg_state[0]; 430 | uint64_t chksum = 0; 431 | 432 | mpz_init(a); 433 | mpz_init(b); 434 | mpz_init(nhat); 435 | mpz_init(r); 436 | mpz_init(n); 437 | mpz_init(t); 438 | mpz_init(u); 439 | 440 | gettimeofday(&startt, NULL); 441 | 442 | mpz_set_ui(a, 0); 443 | for (j = 0; j < NWORDS; j++) 444 | { 445 | uint64_t x = spRand64(&lcg); 446 | mpz_set_ui(t, x); 447 | mpz_mul_2exp(t, t, 64 * j); 448 | mpz_add(a, a, t); 449 | } 450 | 451 | mpz_set_ui(b, 0); 452 | for (j = 0; j < NWORDS; j++) 453 | { 454 | uint64_t x = spRand64(&lcg); 455 | mpz_set_ui(t, x); 456 | mpz_mul_2exp(t, t, 64 * j); 457 | mpz_add(b, b, t); 458 | } 459 | 460 | mpz_set_ui(n, 0); 461 | for (j = 0; j < NWORDS; j++) 462 | { 463 | uint64_t x = spRand64(&lcg); 464 | mpz_set_ui(t, x); 465 | mpz_mul_2exp(t, t, 64 * j); 466 | mpz_add(n, n, t); 467 | } 468 | 469 | if ((mpz_get_ui(n) & 1) == 0) 470 | { 471 | mpz_add_ui(n, n, 1); 472 | } 473 | 474 | printf("commencing %d mulredc iterations using gmp \n", iterations); 475 | 476 | if (verbose > 0) 477 | { 478 | gmp_printf("initial a: %Zx\n", a); 479 | gmp_printf("initial b: %Zx\n", b); 480 | gmp_printf("initial n: %Zx\n", n); 481 | } 482 | 483 | // monty setup 484 | mpz_set_ui(r, 1); 485 | mpz_mul_2exp(r, r, MAXBITS); 486 | mpz_invert(nhat, n, r); 487 | mpz_sub(nhat, r, nhat); 488 | mpz_mul(a, r, a); 489 | mpz_tdiv_r(a, a, n); 490 | mpz_mul(b, r, b); 491 | 
mpz_tdiv_r(b, b, n); 492 | 493 | gettimeofday(&startt, NULL); 494 | for (k = 0; k < iterations; k++) 495 | { 496 | mpz_mul(t, a, b); 497 | mpz_tdiv_r_2exp(a, t, MAXBITS); 498 | mpz_mul(u, a, nhat); 499 | mpz_tdiv_r_2exp(u, u, MAXBITS); 500 | mpz_mul(a, u, n); 501 | mpz_add(a, t, a); 502 | mpz_tdiv_q_2exp(a, a, MAXBITS); 503 | if (mpz_sizeinbase(a, 2) > MAXBITS) 504 | mpz_sub(a, a, n); 505 | 506 | if (verbose > 0) 507 | { 508 | gmp_printf("result: %Zx\n", a); 509 | } 510 | chksum += a->_mp_d[0]; 511 | } 512 | 513 | gettimeofday(&stopt, NULL); 514 | t_time = my_difftime(&startt, &stopt); 515 | 516 | if (verbose > 0) 517 | { 518 | gmp_printf("final result: %Zx\n", a); 519 | } 520 | 521 | printf("final chksum: %lu\n", chksum); 522 | printf("%d mulredc tests took %.4f seconds\n", iterations, t_time); 523 | 524 | mpz_clear(a); 525 | mpz_clear(b); 526 | mpz_clear(r); 527 | mpz_clear(nhat); 528 | mpz_clear(n); 529 | mpz_clear(t); 530 | mpz_clear(u); 531 | } 532 | 533 | // gmp SOS sqr 534 | if (bench_gmp) 535 | { 536 | mpz_t a, b, n, t, nhat, r, u; 537 | int i, j, k; 538 | struct timeval stopt; // stop time of this job 539 | struct timeval startt; // start time of this job 540 | double t_time = 0.; 541 | int numbits = MAXBITS; 542 | uint64_t lcg = lcg_state[0]; 543 | uint64_t chksum = 0; 544 | 545 | mpz_init(a); 546 | mpz_init(b); 547 | mpz_init(nhat); 548 | mpz_init(r); 549 | mpz_init(n); 550 | mpz_init(t); 551 | mpz_init(u); 552 | 553 | gettimeofday(&startt, NULL); 554 | 555 | mpz_set_ui(a, 0); 556 | for (j = 0; j < NWORDS; j++) 557 | { 558 | uint64_t x = spRand64(&lcg); 559 | mpz_set_ui(t, x); 560 | mpz_mul_2exp(t, t, 64 * j); 561 | mpz_add(a, a, t); 562 | } 563 | 564 | mpz_set_ui(n, 0); 565 | for (j = 0; j < NWORDS; j++) 566 | { 567 | uint64_t x = spRand64(&lcg); 568 | mpz_set_ui(t, x); 569 | mpz_mul_2exp(t, t, 64 * j); 570 | mpz_add(n, n, t); 571 | } 572 | 573 | if ((mpz_get_ui(n) & 1) == 0) 574 | { 575 | mpz_add_ui(n, n, 1); 576 | } 577 | 578 | printf("commencing %d 
sqrredc iterations using gmp \n", iterations); 579 | 580 | if (verbose > 0) 581 | { 582 | gmp_printf("initial a: %Zx\n", a); 583 | gmp_printf("initial n: %Zx\n", n); 584 | } 585 | 586 | // monty setup 587 | mpz_set_ui(r, 1); 588 | mpz_mul_2exp(r, r, MAXBITS); 589 | mpz_invert(nhat, n, r); 590 | mpz_sub(nhat, r, nhat); 591 | mpz_mul(a, r, a); 592 | mpz_tdiv_r(a, a, n); 593 | 594 | gettimeofday(&startt, NULL); 595 | for (k = 0; k < iterations; k++) 596 | { 597 | mpz_mul(t, a, a); 598 | mpz_tdiv_r_2exp(a, t, MAXBITS); 599 | mpz_mul(u, a, nhat); 600 | mpz_tdiv_r_2exp(u, u, MAXBITS); 601 | mpz_mul(a, u, n); 602 | mpz_add(a, t, a); 603 | mpz_tdiv_q_2exp(a, a, MAXBITS); 604 | if (mpz_sizeinbase(a, 2) > MAXBITS) 605 | mpz_sub(a, a, n); 606 | 607 | chksum += mpz_get_ui(a); 608 | if (verbose > 0) 609 | { 610 | gmp_printf("result: %Zx\n", a); 611 | } 612 | } 613 | 614 | gettimeofday(&stopt, NULL); 615 | t_time = my_difftime(&startt, &stopt); 616 | 617 | if (verbose > 0) 618 | { 619 | gmp_printf("final result: %Zx\n", a); 620 | } 621 | 622 | printf("final chksum: %lu\n", chksum); 623 | printf("%d sqrredc tests took %.4f seconds\n", iterations, t_time); 624 | 625 | mpz_clear(a); 626 | mpz_clear(b); 627 | mpz_clear(r); 628 | mpz_clear(nhat); 629 | mpz_clear(n); 630 | mpz_clear(t); 631 | mpz_clear(u); 632 | } 633 | } 634 | 635 | if (do_mulsqr_tests) 636 | iterations /= 10000; 637 | 638 | if (do_pmod_tests) 639 | { 640 | if (bench_sos) 641 | { 642 | printf("commencing %d powm iterations using mulmod_sos\n", iterations); 643 | mul_ptr = &mulmod_sos; 644 | sqr_ptr = &sqrmod_sos_mul; 645 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 646 | } 647 | 648 | if (bench_sos) 649 | { 650 | printf("commencing %d powm iterations using mulmod_sos and sqrmod_sos\n", iterations); 651 | mul_ptr = &mulmod_sos; 652 | sqr_ptr = &sqrmod_sos; 653 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 654 | } 655 | 656 | if (bench_cios) 657 | { 658 | printf("commencing %d powm 
iterations using mulmod_cios\n", iterations); 659 | mul_ptr = &mulmod_cios; 660 | sqr_ptr = &sqrmod_cios_mul; 661 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 662 | } 663 | 664 | if (bench_cios) 665 | { 666 | printf("commencing %d powm iterations using mulmod_cios and sqrmod_cios\n", iterations); 667 | mul_ptr = &mulmod_cios; 668 | sqr_ptr = &sqrmod_cios; 669 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 670 | } 671 | 672 | if (bench_bps) 673 | { 674 | printf("commencing %d powm iterations using mulmod_bps\n", iterations); 675 | mul_ptr = &mulmod_bps; 676 | sqr_ptr = &sqrmod_bps_mul; 677 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 678 | } 679 | 680 | if (bench_bps) 681 | { 682 | printf("commencing %d powm iterations using mulmod_bps and sqrmod_bps\n", iterations); 683 | mul_ptr = &mulmod_bps; 684 | sqr_ptr = &sqrmod_bps; 685 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 686 | } 687 | 688 | if (bench_fios) 689 | { 690 | printf("commencing %d powm iterations using mulmod_fios\n", iterations); 691 | mul_ptr = &mulmod_fios; 692 | sqr_ptr = &sqrmod_fios_mul; 693 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 694 | } 695 | 696 | if (bench_fios) 697 | { 698 | printf("commencing %d powm iterations using mulmod_fios and sqrmod_fios\n", iterations); 699 | mul_ptr = &mulmod_fios; 700 | sqr_ptr = &sqrmod_fios; 701 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 702 | 703 | } 704 | 705 | if (bench_fips) 706 | { 707 | printf("commencing %d powm iterations using mulmod_fips\n", iterations); 708 | mul_ptr = &mulmod_fips; 709 | sqr_ptr = &sqrmod_fips_mul; 710 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 711 | } 712 | 713 | if (bench_fips) 714 | { 715 | printf("commencing %d powm iterations using mulmod_fips and sqrmod_fips\n", iterations); 716 | mul_ptr = &mulmod_fips; 717 | sqr_ptr = &sqrmod_fips; 718 | monty_test(iterations, verbose, lcg_state[0], pmod_state); 719 | } 720 | 
721 | // gmp comparison. These results won't match the ones above because 722 | // we use a different RNG (the builtin gmp RNG). 723 | if (bench_gmp) 724 | { 725 | mpz_t a, b, n, t, aa, bb; 726 | int i, j, k; 727 | struct timeval stopt; // stop time of this job 728 | struct timeval startt; // start time of this job 729 | double t_time = 0.; 730 | int numbits = MAXBITS; 731 | gmp_randstate_t gmp_randstate; 732 | 733 | mpz_init(a); 734 | mpz_init(b); 735 | mpz_init(aa); 736 | mpz_init(bb); 737 | mpz_init(n); 738 | mpz_init(t); 739 | 740 | gettimeofday(&startt, NULL); 741 | srand(42); // lcg_state[0]); 742 | gmp_randinit_default(gmp_randstate); 743 | gmp_randseed_ui(gmp_randstate, rand()); 744 | 745 | printf("commencing %d powm iterations using gmp powm\n", iterations); 746 | mpz_urandomb(n, gmp_randstate, numbits); 747 | if (mpz_even_p(n)) 748 | mpz_add_ui(n, n, 1); 749 | 750 | gettimeofday(&startt, NULL); 751 | for (k = 0; k < iterations; k++) 752 | { 753 | mpz_urandomb(a, gmp_randstate, numbits); 754 | mpz_urandomb(b, gmp_randstate, numbits); 755 | 756 | mpz_tdiv_r(a, a, n); 757 | mpz_tdiv_r(b, b, n); 758 | 759 | if (verbose) 760 | { 761 | printf("test %d:\n", k); 762 | printf("a = "); gmp_printf("%Zx\n", a); 763 | printf("b = "); gmp_printf("%Zx\n", b); 764 | printf("n = "); gmp_printf("%Zx\n", n); 765 | } 766 | 767 | mpz_set(aa, a); 768 | mpz_set(bb, b); 769 | 770 | mpz_powm(a, aa, bb, n); 771 | 772 | if (verbose) 773 | { 774 | gmp_printf("result: %Zx\n", a); 775 | } 776 | } 777 | 778 | gettimeofday(&stopt, NULL); 779 | t_time = my_difftime(&startt, &stopt); 780 | 781 | printf("%d powm tests took %.4f seconds\n", iterations, t_time); 782 | 783 | mpz_clear(a); 784 | mpz_clear(b); 785 | mpz_clear(aa); 786 | mpz_clear(bb); 787 | mpz_clear(n); 788 | mpz_clear(t); 789 | } 790 | } 791 | 792 | free(lcg_state); 793 | pmodlib_free(pmod_state); 794 | free(pmod_state); 795 | 796 | return 0; 797 | } 798 | 
-------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2014, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 28 | 29 | 30 | Copyright (c) 2018 by The Mayo Clinic, though its Special Purpose 31 | Processor Development Group (SPPDG). All Rights Reserved Worldwide. 
32 | Licensed under the Apache License, Version 2.0 (the "License"); you may 33 | not use this file except in compliance with the License. You may obtain 34 | a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. 35 | Unless required by applicable law or agreed to in writing, software 36 | distributed under the License is distributed on an "AS IS" BASIS, 37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 38 | including conditions of title, non-infringement, merchantability, 39 | or fitness for a particular purpose 40 | See the License for the specific language governing permissions and 41 | limitations under the License. 42 | This file is a snapshot of a work in progress, originated by Mayo 43 | Clinic SPPDG. 44 | */ 45 | 46 | #include "vecarith.h" 47 | #include "gmp.h" 48 | #include "omp.h" 49 | 50 | uint64_t *LCG_STATE; 51 | 52 | uint64_t spRand64(uint64_t *state) 53 | { 54 | // advance the state of the LCG and return the appropriate result. 55 | // assume lower = 0 and upper = maxint 56 | *state = 6364136223846793005ULL * (*state) + 1442695040888963407ULL; 57 | return *state; 58 | } 59 | 60 | // FNV-1 hash algorithm: 61 | // http://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function 62 | uint64_t hash64(uint64_t in) 63 | { 64 | uint64_t hash = 14695981039346656037ULL; 65 | uint64_t prime = 1099511628211ULL; 66 | uint64_t hash_mask; 67 | uint64_t xor; 68 | 69 | hash = hash * prime; 70 | hash_mask = 0xffffffffffffff00ULL; 71 | xor = hash ^ in; 72 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 73 | 74 | hash = hash * prime; 75 | hash_mask = 0xffffffffffff00ffULL; 76 | xor = hash ^ in; 77 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 78 | 79 | hash = hash * prime; 80 | hash_mask = 0xffffffffff00ffffULL; 81 | xor = hash ^ in; 82 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 83 | 84 | hash = hash * prime; 85 | hash_mask = 0xffffffff00ffffffULL; 86 | xor = hash ^ in; 87 | hash = (hash & 
hash_mask) | (xor & (~hash_mask)); 88 | 89 | hash = hash * prime; 90 | hash_mask = 0xffffff00ffffffffULL; 91 | xor = hash ^ in; 92 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 93 | 94 | hash = hash * prime; 95 | hash_mask = 0xffff00ffffffffffULL; 96 | xor = hash ^ in; 97 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 98 | 99 | hash = hash * prime; 100 | hash_mask = 0xff00ffffffffffffULL; 101 | xor = hash ^ in; 102 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 103 | 104 | hash = hash * prime; 105 | hash_mask = 0x00ffffffffffffffULL; 106 | xor = hash ^ in; 107 | hash = (hash & hash_mask) | (xor & (~hash_mask)); 108 | 109 | return hash; 110 | } 111 | 112 | double my_difftime(struct timeval * start, struct timeval * end) 113 | { 114 | double secs; 115 | double usecs; 116 | 117 | if (start->tv_sec == end->tv_sec) { 118 | secs = 0; 119 | usecs = end->tv_usec - start->tv_usec; 120 | } 121 | else { 122 | usecs = 1000000 - start->tv_usec; 123 | secs = end->tv_sec - (start->tv_sec + 1); 124 | usecs += end->tv_usec; 125 | if (usecs >= 1000000) { 126 | usecs -= 1000000; 127 | secs += 1; 128 | } 129 | } 130 | 131 | return secs + usecs / 1000000.; 132 | } 133 | 134 | void extract_bignum_from_vec_to_mpz(mpz_t dest, bignum *vec_src, int num, int sz) 135 | { 136 | int j; 137 | 138 | if (dest == NULL) 139 | { 140 | printf("invalid dest address in extract_vec_bignum_from_vec_to_mpz\n"); 141 | } 142 | 143 | mpz_set_ui(dest, 0); 144 | for (j = sz - 1; j >= 0; j--) 145 | { 146 | mpz_mul_2exp(dest, dest, DIGITBITS); 147 | mpz_add_ui(dest, dest, vec_src->data[num + j * VECLEN]); 148 | } 149 | 150 | return; 151 | } 152 | 153 | void vecpmodtest(int do_verification, int threads, int verbose) 154 | { 155 | // test the pmod by comparing all results to those computed using 156 | // validated scalar code. 
157 | double *elapsed_time; 158 | int t; 159 | //gmp_randstate_t rng_state; 160 | 161 | //gmp_randinit_default(rng_state); 162 | elapsed_time = (double *)malloc(threads * sizeof(double)); 163 | 164 | LCG_STATE = (uint64_t *)malloc(threads * sizeof(uint64_t)); 165 | 166 | for (t = 0; t < threads; t++) 167 | { 168 | LCG_STATE[t] = hash64(t); 169 | } 170 | 171 | printf("commencing test: all variable (random)\n"); 172 | #pragma omp parallel num_threads(threads) 173 | { 174 | int i, j; 175 | 176 | // timing variables 177 | struct timeval stopt; // stop time of this job 178 | struct timeval startt; // start time of this job 179 | double t_time; 180 | 181 | mpz_t base, exp, mod, t1, t2; 182 | 183 | int loc_iterations; 184 | int tid = omp_get_thread_num(); 185 | monty *mtest; 186 | 187 | // vector bignums 188 | bignum *b = vecInit(); 189 | bignum *d = vecInit(); 190 | bignum *m = vecInit(); 191 | bignum *e = vecInit(); 192 | bignum *s = vecInit(); 193 | bignum *one = vecInit(); 194 | 195 | mpz_init(base); 196 | mpz_init(exp); 197 | mpz_init(mod); 198 | mpz_init(t1); 199 | mpz_init(t2); 200 | 201 | //gmp_randseed_ui(rng_state, tid); 202 | 203 | // attempt to scale the number of iterations with input size 204 | // so this doesn't take forever. 
205 | loc_iterations = 100000 * 2 / (NWORDS * DIGITBITS); 206 | 207 | if (MAXBITS >= 4096) 208 | loc_iterations *= 1; 209 | else if (MAXBITS >= 2048) 210 | loc_iterations *= 2; 211 | else if (MAXBITS >= 1024) 212 | loc_iterations *= 5; 213 | else if (MAXBITS >= 512) 214 | loc_iterations *= 10; 215 | else if (MAXBITS >= 256) 216 | loc_iterations *= 25; 217 | else 218 | loc_iterations *= 100; 219 | 220 | #ifdef BASE52 221 | loc_iterations *= 3; 222 | #endif 223 | 224 | #ifdef TARGET_KNL 225 | loc_iterations /= 3; 226 | #endif 227 | 228 | mtest = monty_alloc(); 229 | 230 | #pragma omp barrier 231 | 232 | gettimeofday(&startt, NULL); 233 | 234 | for (j = 0; j < VECLEN; j++) 235 | { 236 | one->data[j] = 1; 237 | } 238 | 239 | printf("thread %d starting %d iterations\n", tid, loc_iterations); 240 | 241 | // now do the calculation "b^e % m" a bunch of times 242 | for (i = 0; i < loc_iterations; i++) 243 | { 244 | 245 | #ifdef BASE52 246 | //int tmp = ceil(MAXBITS / 64); 247 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8); 248 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8); 249 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8); 250 | 251 | for (j = 0; j < VECLEN; j++) 252 | { 253 | int k; 254 | for (k = 0; k < NWORDS; k++) 255 | { 256 | uint64_t r1 = spRand64(&LCG_STATE[t]); 257 | uint64_t r2 = spRand64(&LCG_STATE[t]); 258 | uint64_t r3 = spRand64(&LCG_STATE[t]); 259 | 260 | m->data[k * VECLEN + j] = r1 & MAXDIGIT; 261 | b->data[k * VECLEN + j] = r2 & MAXDIGIT; 262 | e->data[k * VECLEN + j] = r3 & MAXDIGIT; 263 | } 264 | } 265 | 266 | #else 267 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8); 268 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8); 269 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8); 270 | 271 | for (j = 0; j < VECLEN; j++) 272 | { 273 | int k; 274 | for (k = 0; k < NWORDS; k++) 275 | { 276 | uint64_t r1 = spRand64(&LCG_STATE[t]); 277 | uint64_t r2 = spRand64(&LCG_STATE[t]); 278 | uint64_t r3 = spRand64(&LCG_STATE[t]); 279 | 280 | m->data[k * VECLEN + j] = r1 & 
MAXDIGIT; 281 | b->data[k * VECLEN + j] = r2 & MAXDIGIT; 282 | e->data[k * VECLEN + j] = r3 & MAXDIGIT; 283 | } 284 | } 285 | #endif 286 | for (j = 0; j < VECLEN; j++) 287 | m->data[j] |= 0x1; 288 | 289 | if (0) 290 | { 291 | continue; 292 | } 293 | 294 | if (verbose > 1) 295 | { 296 | for (j = 0; j < VECLEN; j++) 297 | { 298 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 299 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS); 300 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS); 301 | 302 | gmp_printf("init%d:\n\tbase = %Zx\n\texp = %Zx\n\tmod = %Zx\n", 303 | j, base, exp, mod); 304 | } 305 | } 306 | 307 | // now we actually do the (vectorized) montgomery initialization 308 | // on our vector of random moduli. 309 | monty_init_vec(mtest, m, 0); 310 | 311 | if (verbose > 1) 312 | { 313 | for (j = 0; j < VECLEN; j++) 314 | { 315 | extract_bignum_from_vec_to_mpz(base, mtest->rhat, j, NWORDS); 316 | extract_bignum_from_vec_to_mpz(mod, mtest->one, j, NWORDS); 317 | 318 | gmp_printf("init%d:\n\trhat = %Zx\n\tone = %Zx\n\trho = %08x\n", 319 | j, base, mod, mtest->vrho[j]); 320 | } 321 | } 322 | 323 | vecmulmod_ptr(b, mtest->rhat, b, m, s, mtest); // monty rep 324 | 325 | if (verbose > 1) 326 | { 327 | for (j = 0; j < VECLEN; j++) 328 | { 329 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 330 | 331 | gmp_printf("monty(base%d) = %Zx\n", j, base); 332 | } 333 | } 334 | 335 | vecmodexp_ptr(d, b, e, m, s, mtest->one, mtest); // powm 336 | vecmulmod_ptr(d, one, d, m, s, mtest); // normal rep 337 | 338 | if (verbose > 1) 339 | { 340 | for (j = 0; j < VECLEN; j++) 341 | { 342 | extract_bignum_from_vec_to_mpz(base, d, j, NWORDS); 343 | 344 | gmp_printf("modexp%d = %Zx\n", j, base); 345 | } 346 | } 347 | 348 | // now verify each result 349 | if (do_verification) 350 | { 351 | vecmulmod_ptr(b, one, b, m, s, mtest); // normal rep 352 | for (j = 0; j < VECLEN; j++) 353 | { 354 | extract_bignum_from_vec_to_mpz(t1, d, j, NWORDS); 355 | 
extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 356 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS); 357 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS); 358 | 359 | mpz_powm(t2, base, exp, mod); 360 | 361 | if (verbose) 362 | { 363 | gmp_printf("iteration %d lane %d:\n\tgmp = %Zx\n\ttest = %Zx\n", 364 | i, j, t2, t1); 365 | } 366 | 367 | if (mpz_cmp(t1, t2) != 0) 368 | { 369 | gmp_printf("iteration %d error lane %d:\nbase = %Zx\nexp = %Zx\nmod = %Zx\ngmp = %Zx\ntest = %Zx\n", 370 | i, j, base, exp, mod, t2, t1); 371 | exit(1); 372 | } 373 | 374 | } 375 | } 376 | } 377 | 378 | monty_free(mtest); 379 | 380 | if ((tid == 0) && (do_verification == 1)) 381 | printf("verified %d x 16 vecModExp (all variable) results\n", loc_iterations); 382 | 383 | gettimeofday(&stopt, NULL); 384 | t_time = my_difftime(&startt, &stopt); 385 | elapsed_time[tid] = t_time; 386 | 387 | if (tid == 0) 388 | printf("Test with %d iterations took %1.4f seconds.\n", loc_iterations, t_time); 389 | 390 | mpz_clear(t1); 391 | mpz_clear(t2); 392 | mpz_clear(base); 393 | mpz_clear(mod); 394 | mpz_clear(exp); 395 | vecFree(m); 396 | vecFree(b); 397 | vecFree(d); 398 | vecFree(s); 399 | vecFree(e); 400 | } 401 | 402 | { 403 | int i; 404 | double sum = 0.0; 405 | double min_t = 9999999999.; 406 | double max_t = 0.; 407 | 408 | for (i = 0; i < threads; i++) 409 | { 410 | sum += elapsed_time[i]; 411 | if (elapsed_time[i] < min_t) 412 | min_t = elapsed_time[i]; 413 | if (elapsed_time[i] > max_t) 414 | max_t = elapsed_time[i]; 415 | } 416 | 417 | printf("average elapsed time = %1.4f\n", sum / threads); 418 | printf("min elapsed time = %1.4f\n", min_t); 419 | printf("max elapsed time = %1.4f\n", max_t); 420 | } 421 | 422 | free(elapsed_time); 423 | free(LCG_STATE); 424 | 425 | printf("\n\n"); 426 | 427 | return; 428 | } 429 | 430 | void vecmultest(int do_verification, int threads, int verbose) 431 | { 432 | // test the pmod by comparing all results to those computed using 433 | // validated scalar 
code. 434 | double* elapsed_time; 435 | int t; 436 | //gmp_randstate_t rng_state; 437 | 438 | //gmp_randinit_default(rng_state); 439 | elapsed_time = (double*)malloc(threads * sizeof(double)); 440 | 441 | LCG_STATE = (uint64_t*)malloc(threads * sizeof(uint64_t)); 442 | 443 | for (t = 0; t < threads; t++) 444 | { 445 | LCG_STATE[t] = hash64(t); 446 | } 447 | 448 | do_verification = 0; 449 | printf("commencing test mulmod: all variable (random)\n"); 450 | #pragma omp parallel num_threads(threads) 451 | { 452 | int i, j; 453 | 454 | // timing variables 455 | struct timeval stopt; // stop time of this job 456 | struct timeval startt; // start time of this job 457 | double t_time; 458 | 459 | mpz_t base, exp, mod, t1, t2; 460 | 461 | int loc_iterations; 462 | int tid = omp_get_thread_num(); 463 | monty* mtest; 464 | 465 | // vector bignums 466 | bignum* b = vecInit(); 467 | bignum* d = vecInit(); 468 | bignum* m = vecInit(); 469 | bignum* e = vecInit(); 470 | bignum* s = vecInit(); 471 | bignum* one = vecInit(); 472 | 473 | mpz_init(base); 474 | mpz_init(exp); 475 | mpz_init(mod); 476 | mpz_init(t1); 477 | mpz_init(t2); 478 | 479 | //gmp_randseed_ui(rng_state, tid); 480 | 481 | // attempt to scale the number of iterations with input size 482 | // so this doesn't take forever. 
483 | loc_iterations = 100000 * 2 / (NWORDS * DIGITBITS); 484 | 485 | if (MAXBITS >= 4096) 486 | loc_iterations *= 1; 487 | else if (MAXBITS >= 2048) 488 | loc_iterations *= 2; 489 | else if (MAXBITS >= 1024) 490 | loc_iterations *= 5; 491 | else if (MAXBITS >= 512) 492 | loc_iterations *= 10; 493 | else if (MAXBITS >= 256) 494 | loc_iterations *= 25; 495 | else 496 | loc_iterations *= 100; 497 | 498 | #ifdef BASE52 499 | loc_iterations *= 3; 500 | #endif 501 | 502 | #ifdef TARGET_KNL 503 | loc_iterations /= 3; 504 | #endif 505 | 506 | loc_iterations *= 20000; 507 | mtest = monty_alloc(); 508 | 509 | #pragma omp barrier 510 | 511 | gettimeofday(&startt, NULL); 512 | 513 | for (j = 0; j < VECLEN; j++) 514 | { 515 | one->data[j] = 1; 516 | } 517 | 518 | #ifdef BASE52 519 | //int tmp = ceil(MAXBITS / 64); 520 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8); 521 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8); 522 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8); 523 | 524 | for (j = 0; j < VECLEN; j++) 525 | { 526 | int k; 527 | for (k = 0; k < NWORDS; k++) 528 | { 529 | uint64_t r1 = spRand64(&LCG_STATE[t]); 530 | uint64_t r2 = spRand64(&LCG_STATE[t]); 531 | uint64_t r3 = spRand64(&LCG_STATE[t]); 532 | 533 | m->data[k * VECLEN + j] = r1 & MAXDIGIT; 534 | b->data[k * VECLEN + j] = r2 & MAXDIGIT; 535 | e->data[k * VECLEN + j] = r3 & MAXDIGIT; 536 | } 537 | } 538 | 539 | #else 540 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8); 541 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8); 542 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8); 543 | 544 | for (j = 0; j < VECLEN; j++) 545 | { 546 | int k; 547 | for (k = 0; k < NWORDS; k++) 548 | { 549 | uint64_t r1 = spRand64(&LCG_STATE[t]); 550 | uint64_t r2 = spRand64(&LCG_STATE[t]); 551 | uint64_t r3 = spRand64(&LCG_STATE[t]); 552 | 553 | m->data[k * VECLEN + j] = r1 & MAXDIGIT; 554 | b->data[k * VECLEN + j] = r2 & MAXDIGIT; 555 | e->data[k * VECLEN + j] = r3 & MAXDIGIT; 556 | } 557 | } 558 | #endif 559 | for (j = 0; j < VECLEN; j++) 
560 | m->data[j] |= 0x1; 561 | 562 | if (verbose > 1) 563 | { 564 | for (j = 0; j < VECLEN; j++) 565 | { 566 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 567 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS); 568 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS); 569 | 570 | gmp_printf("init%d:\n\tbase = %Zx\n\texp = %Zx\n\tmod = %Zx\n", 571 | j, base, exp, mod); 572 | } 573 | } 574 | 575 | // now we actually do the (vectorized) montgomery initialization 576 | // on our vector of random moduli. 577 | monty_init_vec(mtest, m, 0); 578 | 579 | if (verbose > 1) 580 | { 581 | for (j = 0; j < VECLEN; j++) 582 | { 583 | extract_bignum_from_vec_to_mpz(base, mtest->rhat, j, NWORDS); 584 | extract_bignum_from_vec_to_mpz(mod, mtest->one, j, NWORDS); 585 | 586 | gmp_printf("init%d:\n\trhat = %Zx\n\tone = %Zx\n\trho = %08x\n", 587 | j, base, mod, mtest->vrho[j]); 588 | } 589 | } 590 | 591 | vecmulmod_ptr(b, mtest->rhat, b, m, s, mtest); // monty rep 592 | vecmulmod_ptr(e, mtest->rhat, e, m, s, mtest); // monty rep 593 | 594 | printf("thread %d starting %d iterations\n", tid, loc_iterations); 595 | 596 | // now do the calculation "b^e % m" a bunch of times 597 | for (i = 0; i < loc_iterations; i++) 598 | { 599 | if (verbose > 1) 600 | { 601 | for (j = 0; j < VECLEN; j++) 602 | { 603 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 604 | 605 | gmp_printf("monty(base%d) = %Zx\n", j, base); 606 | } 607 | } 608 | 609 | vecmulmod_ptr(b, e, b, m, s, mtest); 610 | 611 | if (verbose > 1) 612 | { 613 | for (j = 0; j < VECLEN; j++) 614 | { 615 | extract_bignum_from_vec_to_mpz(base, d, j, NWORDS); 616 | 617 | gmp_printf("modexp%d = %Zx\n", j, base); 618 | } 619 | } 620 | 621 | // now verify each result 622 | if (do_verification) 623 | { 624 | vecmulmod_ptr(b, one, b, m, s, mtest); // normal rep 625 | for (j = 0; j < VECLEN; j++) 626 | { 627 | extract_bignum_from_vec_to_mpz(t1, d, j, NWORDS); 628 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 629 | 
extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS); 630 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS); 631 | 632 | mpz_powm(t2, base, exp, mod); 633 | 634 | if (verbose) 635 | { 636 | gmp_printf("iteration %d lane %d:\n\tgmp = %Zx\n\ttest = %Zx\n", 637 | i, j, t2, t1); 638 | } 639 | 640 | if (mpz_cmp(t1, t2) != 0) 641 | { 642 | gmp_printf("iteration %d error lane %d:\nbase = %Zx\nexp = %Zx\nmod = %Zx\ngmp = %Zx\ntest = %Zx\n", 643 | i, j, base, exp, mod, t2, t1); 644 | exit(1); 645 | } 646 | 647 | } 648 | } 649 | } 650 | 651 | monty_free(mtest); 652 | 653 | if ((tid == 0) && (do_verification == 1)) 654 | printf("verified %d x 16 vecModExp (all variable) results\n", loc_iterations); 655 | 656 | gettimeofday(&stopt, NULL); 657 | t_time = my_difftime(&startt, &stopt); 658 | elapsed_time[tid] = t_time; 659 | 660 | if (tid == 0) 661 | printf("Test with %d iterations took %1.4f seconds.\n", loc_iterations, t_time); 662 | 663 | mpz_clear(t1); 664 | mpz_clear(t2); 665 | mpz_clear(base); 666 | mpz_clear(mod); 667 | mpz_clear(exp); 668 | vecFree(m); 669 | vecFree(b); 670 | vecFree(d); 671 | vecFree(s); 672 | vecFree(e); 673 | } 674 | 675 | { 676 | int i; 677 | double sum = 0.0; 678 | double min_t = 9999999999.; 679 | double max_t = 0.; 680 | 681 | for (i = 0; i < threads; i++) 682 | { 683 | sum += elapsed_time[i]; 684 | if (elapsed_time[i] < min_t) 685 | min_t = elapsed_time[i]; 686 | if (elapsed_time[i] > max_t) 687 | max_t = elapsed_time[i]; 688 | } 689 | 690 | printf("average elapsed time = %1.4f\n", sum / threads); 691 | printf("min elapsed time = %1.4f\n", min_t); 692 | printf("max elapsed time = %1.4f\n", max_t); 693 | } 694 | 695 | free(elapsed_time); 696 | free(LCG_STATE); 697 | 698 | printf("\n\n"); 699 | 700 | return; 701 | } 702 | 703 | void vecsqrtest(int do_verification, int threads, int verbose) 704 | { 705 | // test the pmod by comparing all results to those computed using 706 | // validated scalar code. 
707 | double* elapsed_time; 708 | int t; 709 | //gmp_randstate_t rng_state; 710 | 711 | //gmp_randinit_default(rng_state); 712 | elapsed_time = (double*)malloc(threads * sizeof(double)); 713 | 714 | LCG_STATE = (uint64_t*)malloc(threads * sizeof(uint64_t)); 715 | 716 | for (t = 0; t < threads; t++) 717 | { 718 | LCG_STATE[t] = hash64(t); 719 | } 720 | 721 | do_verification = 0; 722 | printf("commencing test sqrmod: all variable (random)\n"); 723 | #pragma omp parallel num_threads(threads) 724 | { 725 | int i, j; 726 | 727 | // timing variables 728 | struct timeval stopt; // stop time of this job 729 | struct timeval startt; // start time of this job 730 | double t_time; 731 | 732 | mpz_t base, exp, mod, t1, t2; 733 | 734 | int loc_iterations; 735 | int tid = omp_get_thread_num(); 736 | monty* mtest; 737 | 738 | // vector bignums 739 | bignum* b = vecInit(); 740 | bignum* d = vecInit(); 741 | bignum* m = vecInit(); 742 | bignum* e = vecInit(); 743 | bignum* s = vecInit(); 744 | bignum* one = vecInit(); 745 | 746 | mpz_init(base); 747 | mpz_init(exp); 748 | mpz_init(mod); 749 | mpz_init(t1); 750 | mpz_init(t2); 751 | 752 | //gmp_randseed_ui(rng_state, tid); 753 | 754 | // attempt to scale the number of iterations with input size 755 | // so this doesn't take forever. 
756 | loc_iterations = 100000 * 2 / (NWORDS * DIGITBITS); 757 | 758 | if (MAXBITS >= 4096) 759 | loc_iterations *= 1; 760 | else if (MAXBITS >= 2048) 761 | loc_iterations *= 2; 762 | else if (MAXBITS >= 1024) 763 | loc_iterations *= 5; 764 | else if (MAXBITS >= 512) 765 | loc_iterations *= 10; 766 | else if (MAXBITS >= 256) 767 | loc_iterations *= 25; 768 | else 769 | loc_iterations *= 100; 770 | 771 | #ifdef BASE52 772 | loc_iterations *= 3; 773 | #endif 774 | 775 | #ifdef TARGET_KNL 776 | loc_iterations /= 3; 777 | #endif 778 | 779 | loc_iterations *= 20000; 780 | mtest = monty_alloc(); 781 | 782 | #pragma omp barrier 783 | 784 | gettimeofday(&startt, NULL); 785 | 786 | for (j = 0; j < VECLEN; j++) 787 | { 788 | one->data[j] = 1; 789 | } 790 | 791 | #ifdef BASE52 792 | //int tmp = ceil(MAXBITS / 64); 793 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8); 794 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8); 795 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8); 796 | 797 | for (j = 0; j < VECLEN; j++) 798 | { 799 | int k; 800 | for (k = 0; k < NWORDS; k++) 801 | { 802 | uint64_t r1 = spRand64(&LCG_STATE[t]); 803 | uint64_t r2 = spRand64(&LCG_STATE[t]); 804 | uint64_t r3 = spRand64(&LCG_STATE[t]); 805 | 806 | m->data[k * VECLEN + j] = r1 & MAXDIGIT; 807 | b->data[k * VECLEN + j] = r2 & MAXDIGIT; 808 | e->data[k * VECLEN + j] = r3 & MAXDIGIT; 809 | } 810 | } 811 | 812 | #else 813 | memset(m->data, 0, MAXBITS * 2 * VECLEN / 8); 814 | memset(e->data, 0, MAXBITS * 2 * VECLEN / 8); 815 | memset(b->data, 0, MAXBITS * 2 * VECLEN / 8); 816 | 817 | for (j = 0; j < VECLEN; j++) 818 | { 819 | int k; 820 | for (k = 0; k < NWORDS; k++) 821 | { 822 | uint64_t r1 = spRand64(&LCG_STATE[t]); 823 | uint64_t r2 = spRand64(&LCG_STATE[t]); 824 | uint64_t r3 = spRand64(&LCG_STATE[t]); 825 | 826 | m->data[k * VECLEN + j] = r1 & MAXDIGIT; 827 | b->data[k * VECLEN + j] = r2 & MAXDIGIT; 828 | e->data[k * VECLEN + j] = r3 & MAXDIGIT; 829 | } 830 | } 831 | #endif 832 | for (j = 0; j < VECLEN; j++) 
833 | m->data[j] |= 0x1; 834 | 835 | if (verbose > 1) 836 | { 837 | for (j = 0; j < VECLEN; j++) 838 | { 839 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 840 | extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS); 841 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS); 842 | 843 | gmp_printf("init%d:\n\tbase = %Zx\n\texp = %Zx\n\tmod = %Zx\n", 844 | j, base, exp, mod); 845 | } 846 | } 847 | 848 | // now we actually do the (vectorized) montgomery initialization 849 | // on our vector of random moduli. 850 | monty_init_vec(mtest, m, 0); 851 | 852 | if (verbose > 1) 853 | { 854 | for (j = 0; j < VECLEN; j++) 855 | { 856 | extract_bignum_from_vec_to_mpz(base, mtest->rhat, j, NWORDS); 857 | extract_bignum_from_vec_to_mpz(mod, mtest->one, j, NWORDS); 858 | 859 | gmp_printf("init%d:\n\trhat = %Zx\n\tone = %Zx\n\trho = %08x\n", 860 | j, base, mod, mtest->vrho[j]); 861 | } 862 | } 863 | 864 | vecmulmod_ptr(b, mtest->rhat, b, m, s, mtest); // monty rep 865 | vecmulmod_ptr(e, mtest->rhat, e, m, s, mtest); // monty rep 866 | 867 | printf("thread %d starting %d iterations\n", tid, loc_iterations); 868 | 869 | // now do the calculation "b^e % m" a bunch of times 870 | for (i = 0; i < loc_iterations; i++) 871 | { 872 | if (verbose > 1) 873 | { 874 | for (j = 0; j < VECLEN; j++) 875 | { 876 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 877 | 878 | gmp_printf("monty(base%d) = %Zx\n", j, base); 879 | } 880 | } 881 | 882 | vecsqrmod_ptr(b, b, m, s, mtest); 883 | 884 | if (verbose > 1) 885 | { 886 | for (j = 0; j < VECLEN; j++) 887 | { 888 | extract_bignum_from_vec_to_mpz(base, d, j, NWORDS); 889 | 890 | gmp_printf("modexp%d = %Zx\n", j, base); 891 | } 892 | } 893 | 894 | // now verify each result 895 | if (do_verification) 896 | { 897 | vecmulmod_ptr(b, one, b, m, s, mtest); // normal rep 898 | for (j = 0; j < VECLEN; j++) 899 | { 900 | extract_bignum_from_vec_to_mpz(t1, d, j, NWORDS); 901 | extract_bignum_from_vec_to_mpz(base, b, j, NWORDS); 902 | 
extract_bignum_from_vec_to_mpz(exp, e, j, NWORDS); 903 | extract_bignum_from_vec_to_mpz(mod, m, j, NWORDS); 904 | 905 | mpz_powm(t2, base, exp, mod); 906 | 907 | if (verbose) 908 | { 909 | gmp_printf("iteration %d lane %d:\n\tgmp = %Zx\n\ttest = %Zx\n", 910 | i, j, t2, t1); 911 | } 912 | 913 | if (mpz_cmp(t1, t2) != 0) 914 | { 915 | gmp_printf("iteration %d error lane %d:\nbase = %Zx\nexp = %Zx\nmod = %Zx\ngmp = %Zx\ntest = %Zx\n", 916 | i, j, base, exp, mod, t2, t1); 917 | exit(1); 918 | } 919 | 920 | } 921 | } 922 | } 923 | 924 | monty_free(mtest); 925 | 926 | if ((tid == 0) && (do_verification == 1)) 927 | printf("verified %d x 16 vecModExp (all variable) results\n", loc_iterations); 928 | 929 | gettimeofday(&stopt, NULL); 930 | t_time = my_difftime(&startt, &stopt); 931 | elapsed_time[tid] = t_time; 932 | 933 | if (tid == 0) 934 | printf("Test with %d iterations took %1.4f seconds.\n", loc_iterations, t_time); 935 | 936 | mpz_clear(t1); 937 | mpz_clear(t2); 938 | mpz_clear(base); 939 | mpz_clear(mod); 940 | mpz_clear(exp); 941 | vecFree(m); 942 | vecFree(b); 943 | vecFree(d); 944 | vecFree(s); 945 | vecFree(e); 946 | } 947 | 948 | { 949 | int i; 950 | double sum = 0.0; 951 | double min_t = 9999999999.; 952 | double max_t = 0.; 953 | 954 | for (i = 0; i < threads; i++) 955 | { 956 | sum += elapsed_time[i]; 957 | if (elapsed_time[i] < min_t) 958 | min_t = elapsed_time[i]; 959 | if (elapsed_time[i] > max_t) 960 | max_t = elapsed_time[i]; 961 | } 962 | 963 | printf("average elapsed time = %1.4f\n", sum / threads); 964 | printf("min elapsed time = %1.4f\n", min_t); 965 | printf("max elapsed time = %1.4f\n", max_t); 966 | } 967 | 968 | free(elapsed_time); 969 | free(LCG_STATE); 970 | 971 | printf("\n\n"); 972 | 973 | return; 974 | } 975 | 976 | int main(int argc, char **argv) 977 | { 978 | int threads; 979 | int do_verification = 1; 980 | int verbose = 0; 981 | 982 | if (argc < 2) 983 | { 984 | printf("usage: avx512_modexp $threads $do_verification\n"); 985 | 
exit(1); 986 | } 987 | else if (argc == 3) 988 | { 989 | do_verification = atoi(argv[2]); 990 | } 991 | 992 | threads = atoi(argv[1]); 993 | 994 | printf("configured with MAXBITS = %d, DIGITBITS = %d, NUMWORDS = %d, VECLEN = %d\n", 995 | MAXBITS, DIGITBITS, NWORDS, VECLEN); 996 | printf("commencing modular exponentiation benchmarks\n"); fflush(stdout); 997 | 998 | #ifdef BASE52 999 | vecmulmod_ptr = &vecmulmod52; 1000 | vecsqrmod_ptr = &vecsqrmod52; 1001 | montsetup_ptr = &vec_montgomery_setup52; 1002 | vecmodexp_ptr = &vecmodexp52; 1003 | #else 1004 | vecmulmod_ptr = &vecmulmod; 1005 | vecsqrmod_ptr = &vecsqrmod; 1006 | montsetup_ptr = &vec_montgomery_setup; 1007 | vecmodexp_ptr = &vecmodexp; 1008 | #endif 1009 | 1010 | omp_set_num_threads(threads); 1011 | //vecmultest(do_verification, threads, verbose); 1012 | //vecsqrtest(do_verification, threads, verbose); 1013 | vecpmodtest(do_verification, threads, verbose); 1014 | 1015 | return 0; 1016 | } 1017 | 1018 | 1019 | -------------------------------------------------------------------------------- /x64_bench/bigarith.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2021, Ben Buhrow 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are those 26 | of the authors and should not be interpreted as representing official policies, 27 | either expressed or implied, of the FreeBSD Project. 
28 | */ 29 | 30 | #include "bigarith.h" 31 | 32 | void zSet1(bignum *n, base_t d) 33 | { 34 | n->data[0] = d; 35 | n->size = 1; 36 | return; 37 | } 38 | 39 | int zBits(bignum * n) 40 | { 41 | if (n->size == 1) 42 | return spBits(n->data[0]); 43 | else 44 | return DIGITBITS*(n->size-1) + spBits(n->data[n->size-1]); 45 | } 46 | 47 | base_t spBits(base_t n) 48 | { 49 | int i = 0; 50 | while (n != 0) 51 | { 52 | n >>= 1; 53 | i++; 54 | } 55 | return i; 56 | } 57 | 58 | int ndigits_1(base_t n) 59 | { 60 | int i=0; 61 | while (n != 0) 62 | { 63 | n /= 10; 64 | i++; 65 | } 66 | if (i==0) 67 | i++; 68 | return i; 69 | } 70 | 71 | base_t spGCD(base_t x, base_t y) 72 | { 73 | base_t a,b,c; 74 | a=x; b=y; 75 | while (b != 0) 76 | { 77 | c=a%b; 78 | a=b; 79 | b=c; 80 | } 81 | return a; 82 | } 83 | 84 | void sp2big(base_t src, bignum * dest) 85 | { 86 | dest->data[0] = src; 87 | dest->size = 1; 88 | return; 89 | } 90 | 91 | void zClear(bignum * n) 92 | { 93 | int i; 94 | for (i = 0; i <= NWORDS; i++) 95 | n->data[i] = 0; 96 | n->size = 1; 97 | return; 98 | } 99 | 100 | void zClearFull(bignum * n) 101 | { 102 | int i; 103 | memset(n->data, 0, 2 * NWORDS * sizeof(base_t)); 104 | n->size = 1; 105 | return; 106 | } 107 | 108 | bignum * zInit(void) 109 | { 110 | int i; 111 | size_t sz = 2 * (NWORDS + 4); 112 | bignum *n; 113 | 114 | n = (bignum *)malloc(sizeof(bignum)); 115 | 116 | n->data = (base_t *)xmalloc_align(sz * sizeof(base_t)); 117 | for (i = 0; i < sz; i++) 118 | { 119 | n->data[i] = 0; 120 | } 121 | n->size = 1; 122 | 123 | return n; 124 | } 125 | 126 | void zFree(bignum *n) 127 | { 128 | align_free(n->data); 129 | free(n); 130 | } 131 | 132 | void zPrint(bignum *n) 133 | { 134 | int j; 135 | for (j = MIN(n->size - 1, 2*NWORDS); j >= 0; j--) 136 | printf("%016lx", n->data[j]); 137 | return; 138 | } 139 | 140 | void zClamp(bignum * n) 141 | { 142 | int j; 143 | int sn = abs(n->size); 144 | int sign = n->size < 0; 145 | 146 | for (j = sn - 1; j >= 0; j--) 147 | { 148 | if 
(n->data[j] == 0) 149 | { 150 | sn--; 151 | } 152 | else 153 | break; 154 | } 155 | 156 | n->size = (sn == 0 ? 1 : sn); 157 | if (sign) 158 | n->size *= -1; 159 | 160 | return; 161 | } 162 | 163 | void zCopy(bignum * src, bignum * dest) 164 | { 165 | //physically copy the digits of u into the digits of v 166 | int su = abs(src->size); 167 | int i; 168 | 169 | //memcpy(dest->data, src->data, su * sizeof(base_t)); 170 | for (i = 0; i < su; i++) 171 | { 172 | dest->data[i] = src->data[i]; 173 | } 174 | dest->size = src->size; 175 | return; 176 | } 177 | 178 | void zAdd(bignum * u, bignum * v, bignum * w) 179 | { 180 | int i, su, sv; 181 | base_t *larger; 182 | base_t k; 183 | int n, m; 184 | 185 | if (u->size < 0) 186 | { 187 | if (v->size > 0) 188 | { 189 | //u is negative, v is not 190 | u->size *= -1; 191 | zSub(v, u, w); 192 | if (u != w) 193 | u->size *= -1; 194 | return; 195 | } 196 | } 197 | else if (v->size < 0) 198 | { 199 | //v is negative, u is not 200 | v->size *= -1; 201 | zSub(u, v, w); 202 | if (v != w) 203 | v->size *= -1; 204 | return; 205 | } 206 | 207 | su = abs(u->size); 208 | sv = abs(v->size); 209 | 210 | if (su >= sv) 211 | { 212 | larger = u->data; 213 | n = su; 214 | m = sv; 215 | } 216 | else 217 | { 218 | larger = v->data; 219 | n = sv; 220 | m = su; 221 | } 222 | 223 | k=0; 224 | for (i = 0; i < m; ++i) 225 | spAdd3(u->data[i], v->data[i], k, w->data + i, &k); 226 | 227 | for ( ; i < n; ++i) 228 | spAdd(larger[i], k, w->data + i, &k); 229 | 230 | w->size = n; 231 | if (k) 232 | { 233 | w->data[n] = k; 234 | w->size++; 235 | } 236 | 237 | // if one is negative then so is the other or we would be subtracting 238 | if (u->size < 0) 239 | w->size *= -1; 240 | 241 | return; 242 | } 243 | 244 | void zShortAdd(bignum * u, base_t v, bignum * w) 245 | { 246 | int i, su; 247 | base_t k; 248 | 249 | if (u->size < 0) 250 | { 251 | //u is negative 252 | u->size *= -1; 253 | zShortSub(u, v, w); 254 | w->size *= -1; 255 | u->size *= -1; 256 | return; 257 
| } 258 | 259 | su = abs(u->size); 260 | 261 | zCopy(u,w); 262 | 263 | //add 264 | spAdd(u->data[0], v, w->data, &k); 265 | 266 | //add the carry 267 | spAdd(u->data[1], k, w->data + 1, &k); 268 | 269 | if (k) 270 | { 271 | //only rarely will the carry propagate more than one place 272 | //special case this. 273 | for (i = 2; i < su; ++i) 274 | spAdd(u->data[i], k, w->data + i, &k); 275 | 276 | w->size = u->size; 277 | if (k) 278 | { 279 | w->data[u->size] = k; 280 | w->size++; 281 | } 282 | } 283 | 284 | return; 285 | } 286 | 287 | int zSub(bignum * u, bignum * v, bignum * w) 288 | { 289 | base_t k = 0; 290 | int i, j, su, sv, sw, m, sign=0; 291 | base_t *bigger, *smaller; 292 | 293 | if (u->size < 0) 294 | { 295 | if (v->size > 0) 296 | { 297 | //u is negative, v is not, so really an addition 298 | u->size *= -1; 299 | zAdd(u, v, w); 300 | if (u != w) 301 | u->size *= -1; 302 | w->size *= -1; 303 | //printf("did an addition, result is neg\n"); 304 | return 0; 305 | } 306 | else 307 | { 308 | //both are negative, so we really have -u + v or v - u 309 | v->size *= -1; 310 | u->size *= -1; 311 | zSub(v, u, w); 312 | if (v != w) 313 | v->size *= -1; 314 | if (u != w) 315 | u->size *= -1; 316 | //printf("both negative\n"); 317 | return 0; 318 | } 319 | } 320 | else if (v->size < 0) 321 | { 322 | if (u->size > 0) 323 | { 324 | //v is negative, u is not, so really an addition 325 | v->size *= -1; 326 | zAdd(u, v, w); 327 | if (v != w) 328 | v->size *= -1; 329 | //printf("did an addition, result is pos\n"); 330 | return 0; 331 | } 332 | } 333 | 334 | su = u->size; 335 | sv = v->size; 336 | 337 | if (su > sv) 338 | { 339 | bigger = u->data; 340 | smaller = v->data; 341 | sw = su; 342 | m = sv; 343 | goto beginsub; 344 | } 345 | if (su < sv) 346 | { 347 | bigger = v->data; 348 | smaller = u->data; 349 | sw = sv; 350 | m = su; 351 | sign=1; 352 | goto beginsub; 353 | } 354 | 355 | // same size 356 | m = su; 357 | sw = sv; 358 | for (i = su - 1; i >= 0; --i) 359 | { 360 | if 
(u->data[i] > v->data[i]) 361 | { 362 | bigger = u->data; 363 | smaller = v->data; 364 | goto beginsub; 365 | } 366 | if (u->data[i] < v->data[i]) 367 | { 368 | bigger = v->data; 369 | smaller = u->data; 370 | sign=1; 371 | goto beginsub; 372 | } 373 | } 374 | 375 | //equal if got to here 376 | w->size = 1; 377 | w->data[0] = 0; 378 | return 1; 379 | 380 | beginsub: 381 | 382 | for (j = 0; j < m; ++j) 383 | spSub3(bigger[j], smaller[j], k, w->data + j, &k); 384 | 385 | 386 | //if there is a leftover word that is != 0, then subtract any 387 | //carry and simply copy any other leftover words 388 | 389 | //if there is a leftover word that is == 0, then subtract with 390 | //borrow for the rest of the leftover words. this will happen rarely 391 | 392 | //leftover word? 393 | if (sw > m) 394 | { 395 | //not equal to zero? 396 | if (bigger[m] != 0) 397 | { 398 | //subtract any carry and copy the rest 399 | w->data[m] = bigger[m] - k; 400 | j = m + 1; 401 | m = sw; 402 | for(; j < m; j++) 403 | w->data[j] = bigger[j]; 404 | } 405 | else 406 | { 407 | //equal to zero, need to subtract with borrow for the rest 408 | //of the leftover words. 
409 | j = m; 410 | m = sw; 411 | 412 | for ( ; j < m; ++j) 413 | spSub3(bigger[j], 0, k, w->data + j, &k); 414 | } 415 | } 416 | 417 | w->size = sw; 418 | zClamp(w); 419 | 420 | if (sign) 421 | w->size *= -1; 422 | 423 | if (w->size == 0) 424 | w->size = 1; 425 | 426 | return 0; 427 | } 428 | 429 | void zShortSub(bignum * u, base_t v, bignum * w) 430 | { 431 | // w = u - v 432 | // assume both are initially positive; result can be negative 433 | 434 | int i, su = abs(u->size); 435 | base_t k = 0; 436 | 437 | su = abs(u->size); 438 | w->size = su; 439 | 440 | if (u->size < 0) 441 | { 442 | //u is negative, really an addition 443 | u->size *= -1; 444 | zShortAdd(u,v,w); 445 | u->size *= -1; 446 | w->size *= -1; 447 | return; 448 | } 449 | 450 | zCopy(u,w); 451 | 452 | //subtract 453 | spSub3(u->data[0],v,0,w->data,&k); 454 | 455 | //subtract the borrow 456 | spSub3(u->data[1],k,0,w->data+1,&k); 457 | 458 | if (k) 459 | { 460 | //propagate the borrow 461 | for (i=2;idata[i],0,k,w->data+i,&k); 463 | } 464 | 465 | //check if we lost the high digit 466 | if ((w->data[su - 1] == 0) && (su != 1)) 467 | su--; 468 | w->size = su; 469 | 470 | //check for u < v 471 | if (k) 472 | { 473 | //then u < v, and result is negative 474 | w->data[0] = ~w->data[0]; 475 | w->data[0]++; 476 | w->size *= -1; 477 | } 478 | 479 | return; 480 | } 481 | 482 | int zCompare(bignum * u, bignum * v) 483 | { 484 | //return 1 if u > v, -1 if u < v, 0 if equal 485 | int i,j,su,sv; 486 | 487 | i = u->size < 0; 488 | j = v->size < 0; 489 | 490 | su = abs(u->size); 491 | sv = abs(v->size); 492 | 493 | if (i > j) 494 | { 495 | //v pos, u neg 496 | //make sure both are not zero 497 | if ((u->data[0] == 0) && (su == 1) && (v->data[0] == 0) && (sv == 1)) 498 | return 0; 499 | else 500 | return -1; 501 | } 502 | if (j > i) 503 | { 504 | //u pos, v neg 505 | //make sure both are not zero 506 | if ((u->data[0] == 0) && (su == 1) && (v->data[0] == 0) && (sv == 1)) 507 | return 0; 508 | else 509 | return 1; 510 
| } 511 | 512 | //check obvious 513 | if (j) 514 | { //both are negative 515 | if (su > sv) return -1; 516 | if (su < sv) return 1; 517 | } 518 | else 519 | { //both are positive 520 | if (su > sv) return 1; 521 | if (su < sv) return -1; 522 | } 523 | 524 | //if the numbers are both negative, then we'll need to switch the return value 525 | for (i = su - 1; i>=0; --i) 526 | { 527 | if (u->data[i] > v->data[i]) 528 | return (1 - 2*j); 529 | if (u->data[i] < v->data[i]) 530 | return (-1 + 2*j); 531 | } 532 | 533 | //equal if got to here 534 | return 0; 535 | } 536 | 537 | int zCompare1(bignum * u, base_t v) 538 | { 539 | // return 1 if u > v, -1 if u < v, 0 if equal. 540 | // single digit v is assumed to be positive. 541 | if (u->size < 0) 542 | { 543 | return -1; 544 | } 545 | else if (u->size > 1) 546 | { 547 | return 1; 548 | } 549 | else if (u->data[0] > v) 550 | { 551 | return 1; 552 | } 553 | else if (u->data[0] < v) 554 | { 555 | return -1; 556 | } 557 | else 558 | { 559 | return 0; 560 | } 561 | } 562 | 563 | base_t zShortDiv(bignum * u, base_t v, bignum * q) 564 | { 565 | // q = u/v 566 | // return the remainder 567 | 568 | int su = abs(u->size); 569 | int sign = u->size < 0 ? 
1 : 0; 570 | int i; 571 | base_t rem = 0; 572 | 573 | q->size = su; 574 | 575 | i = su - 1; 576 | if (u->data[i] < v) 577 | { 578 | rem = u->data[i]; 579 | q->data[i--] = 0; 580 | } 581 | 582 | while (i >= 0) 583 | { 584 | base_t quot1; 585 | 586 | #if DIGITBITS == 64 587 | __asm__ ("divq %4" 588 | : "=a"(quot1),"=d"(rem) 589 | : "1"(rem), "0"(u->data[i]), "r"(v) ); 590 | #else 591 | __asm__ ("divl %4" 592 | : "=a"(quot1),"=d"(rem) 593 | : "1"(rem), "0"(u->data[i]), "r"(v) ); 594 | #endif 595 | 596 | q->data[i] = quot1; 597 | i--; 598 | } 599 | 600 | //the quotient could be one limb smaller than the input 601 | if ((q->data[q->size - 1] == 0) && (q->size != 1)) 602 | q->size--; 603 | 604 | if (sign) 605 | q->size *= -1; 606 | 607 | return rem; 608 | } 609 | 610 | void zDiv(bignum * u, bignum * v, bignum * q, bignum * r) 611 | { 612 | /* 613 | q = u \ v 614 | r = u mod v 615 | u is overwritten 616 | 617 | schoolbook long division. see knuth TAOCP, vol. 2 618 | */ 619 | 620 | base_t v1=0,v2=0,d=0,k,qhat,rhat,uj2,tt[2],pp[2]; 621 | int i,j,m,su,sv; 622 | int s =0,cmp,sdd,sd; 623 | unsigned int shift; 624 | base_t bitmask; 625 | 626 | su = abs(u->size); 627 | sv = abs(v->size); 628 | m = su-sv; 629 | 630 | //v > u, so just set q = 0 and r = u 631 | if (su < sv) 632 | { 633 | q->size = 1; 634 | zCopy(u,r); 635 | 636 | return; 637 | } 638 | 639 | if (sv == 1) 640 | { 641 | r->data[0] = zShortDiv(u, v->data[0], q); 642 | r->size = 1; 643 | s = (v->size < 0); 644 | if (s) 645 | { 646 | q->size *= -1; 647 | r->size *= -1; 648 | } 649 | return; 650 | } 651 | 652 | //u and v are the same length 653 | if (su == sv) 654 | { 655 | cmp = zCompare(u,v); 656 | //v > u, as above 657 | if (cmp < 0) 658 | { 659 | q->size = 1; 660 | zCopy(u,r); 661 | return; 662 | } 663 | else if (cmp == 0) //v == u, so set q = 1 and r = 0 664 | { 665 | q->size = 1; 666 | q->data[0] = 1; 667 | r->size = 1; 668 | r->data[0] = 0; 669 | return; 670 | } 671 | } 672 | 673 | //normalize v by left shifting 
until the high bit of v is set (v1 >= floor(2^31)) 674 | bitmask = HIBITMASK; 675 | for (shift = 0; shift < DIGITBITS; ++shift) 676 | { 677 | if (v->data[sv-1] & bitmask) 678 | break; 679 | bitmask >>= 1; 680 | } 681 | 682 | //normalize v by shifting left (x2) shift number of times 683 | //overflow should never occur to v during normalization 684 | zShiftLeft(v,v,shift); 685 | 686 | //left shift u the same amount - may get an overflow here 687 | zShiftLeft(u,u,shift); 688 | if (abs(u->size) == su) 689 | { //no overflow - force extra digit 690 | if (u->size < 0) 691 | u->size--; 692 | else 693 | u->size++; 694 | u->data[su] = 0; 695 | su++; 696 | } 697 | else 698 | su++; 699 | 700 | //copy first two digits of v to local variables for quick access 701 | v1=v->data[sv-1]; 702 | v2=v->data[sv-2]; 703 | 704 | sdd=0; 705 | sd=0; 706 | //main loop 707 | for (j=0;j<=m;++j) 708 | { 709 | //calculate qhat 710 | tt[1] = u->data[su-j-1]; //first digit of normalized u 711 | tt[0] = u->data[su-j-2]; //second digit of normalized u 712 | if (tt[1] == v1) 713 | qhat = MAXDIGIT; 714 | else 715 | spDivide(&qhat, &rhat, tt, v1); 716 | 717 | //quick check if qhat is too big based on our initial guess involving 718 | //the first two digits of u and v. 
719 | uj2 = u->data[su-j-3]; 720 | 721 | while (1) 722 | { 723 | spMultiply(qhat,v1,&pp[0],&pp[1]); 724 | shortSubtract(tt,pp,tt); 725 | if (tt[1]) break; 726 | tt[1] = tt[0]; tt[0] = uj2; 727 | spMultiply(qhat,v2,&pp[0],&pp[1]); 728 | i = shortCompare(pp,tt); //p = v2*qhat, t = (uj*b+uj1-qhat*v1)*b + uj2 729 | 730 | if (i == 1) 731 | qhat--; 732 | else 733 | break; 734 | } 735 | 736 | //keep track of the significant digits 737 | if (qhat > 0) 738 | { 739 | sdd = sdd + 1 + sd; 740 | sd = 0; 741 | } 742 | else if (sdd != 0) 743 | sd++; 744 | 745 | //multiply and subtract, in situ 746 | k=0; 747 | for (i=0;idata[i],qhat,&pp[0],&pp[1]); 751 | spAdd(pp[0],k,&tt[0],&tt[1]); 752 | u->data[s] = u->data[s] - tt[0]; 753 | //check if this result is negative, remember the borrow for the next digit 754 | if (u->data[s] > (u->data[s] + tt[0])) 755 | k = pp[1] + tt[1] + 1; 756 | else 757 | k = pp[1] + tt[1]; 758 | } 759 | 760 | //if the final carry is bigger than the most significant digit of u, then qhat 761 | //was too big, i.e. qhat[v1v2...vn] > [u0u1u2...un] 762 | if (k > u->data[su-j-1]) 763 | { 764 | //correct by decrementing qhat and adding back [v1v2...vn] to [u0u1...un] 765 | qhat--; 766 | //first subtract the final carry, yielding a negative number for [u0u1...un] 767 | u->data[su-j-1] -= k; 768 | //then add back v 769 | k=0; 770 | for (i=0;idata[su-j-sv+i-1],v->data[i],k,&u->data[su-j-sv+i-1],&k); 772 | u->data[su-j-1] += k; 773 | } 774 | else //else qhat was ok, subtract the final carry 775 | u->data[su-j-1] -= k; 776 | 777 | //set digit of q 778 | q->data[m-j] = qhat; 779 | } 780 | q->size = sdd+sd; 781 | zCopy(u,r); 782 | 783 | for (s=r->size - 1; s>=0; --s) 784 | { 785 | if ((r->data[s] == 0) && (r->size > 0)) 786 | r->size--; 787 | else 788 | break; 789 | } 790 | 791 | //unnormalize. 
792 | zShiftRight(v,v,shift); 793 | zShiftRight(r,r,shift); 794 | 795 | s = (u->size < 0) ^ (v->size < 0); 796 | if (s) 797 | { 798 | q->size *= -1; 799 | r->size *= -1; 800 | } 801 | 802 | return; 803 | } 804 | 805 | int shortCompare(base_t p[2], base_t t[2]) 806 | { 807 | //utility function used in zDiv 808 | int i; 809 | 810 | for (i=1;i>=0;--i) 811 | { 812 | if (p[i] > t[i]) return 1; 813 | if (p[i] < t[i]) return -1; 814 | } 815 | return 0; 816 | } 817 | 818 | int shortSubtract(base_t u[2], base_t v[2], base_t w[2]) 819 | { 820 | //utility function used in zDiv 821 | base_t j=0; 822 | 823 | w[0] = u[0] - v[0]; 824 | if (w[0] > (MAXDIGIT - v[0])) 825 | { 826 | j=1; 827 | w[0] = w[0] + MAXDIGIT + 1; 828 | } 829 | w[1] = u[1] - v[1] - j; 830 | 831 | return 1; 832 | } 833 | 834 | void zMult(bignum * u, bignum * v, bignum * w, bignum *tmp) 835 | { 836 | //w = u*v 837 | base_t k = 0; 838 | int su, sv, i, j, signu, signv; 839 | base_t *wptr; 840 | int words = u->size; 841 | 842 | signu = u->size < 0; 843 | signv = v->size < 0; 844 | 845 | su = abs(u->size); 846 | sv = abs(v->size); 847 | 848 | //for each digit of u 849 | for (i = 0; i < su; ++i) 850 | { 851 | //take an inner product and add in-situ with the previous inner products 852 | k = 0; 853 | wptr = &tmp->data[i]; 854 | for (j = 0; j < sv; ++j) 855 | { 856 | spMulAdd(u->data[i], v->data[j], wptr[j], k, &wptr[j], &k); 857 | } 858 | wptr[j] += k; 859 | } 860 | tmp->size = su + sv; 861 | 862 | zClamp(tmp); 863 | 864 | if (((u->size == 1) && (u->data[0] == 0)) || ((v->size == 1) && (v->data[0] == 0))) 865 | { 866 | w->size = 1; 867 | w->data[0] = 0; 868 | } 869 | else 870 | { 871 | zCopy(tmp, w); 872 | 873 | if (signu ^ signv) 874 | w->size *= -1; 875 | } 876 | 877 | return; 878 | } 879 | 880 | void zMul(bignum * u, bignum * v, bignum * w) 881 | { 882 | //w = u*v 883 | base_t k = 0; 884 | int su, sv, i, j, signu, signv; 885 | base_t *wptr; 886 | int words = u->size; 887 | bignum *tmp; 888 | 889 | signu = u->size < 
0; 890 | signv = v->size < 0; 891 | 892 | tmp = zInit(); 893 | 894 | su = abs(u->size); 895 | sv = abs(v->size); 896 | 897 | //for each digit of u 898 | for (i = 0; i < su; ++i) 899 | { 900 | //take an inner product and add in-situ with the previous inner products 901 | k=0; 902 | wptr = &tmp->data[i]; 903 | for (j = 0; j < sv; ++j) 904 | { 905 | spMulAdd(u->data[i], v->data[j], wptr[j], k, &wptr[j], &k); 906 | } 907 | wptr[j] += k; 908 | } 909 | tmp->size = su+sv; 910 | 911 | zClamp(tmp); 912 | 913 | if (((u->size == 1) && (u->data[0] == 0)) || ((v->size == 1) && (v->data[0] == 0))) 914 | { 915 | w->size = 1; 916 | w->data[0] = 0; 917 | } 918 | else 919 | { 920 | zCopy(tmp, w); 921 | 922 | if (signu ^ signv) 923 | w->size *= -1; 924 | } 925 | 926 | zFree(tmp); 927 | return; 928 | } 929 | 930 | void zModMul(bignum * u, bignum * v, bignum * n, bignum * w) 931 | { 932 | bignum * t1, *t2; 933 | 934 | t1 = zInit(); 935 | t2 = zInit(); 936 | 937 | zMul(u,v,t1); 938 | zDiv(t1,n,t2,w); 939 | 940 | zFree(t1); 941 | zFree(t2); 942 | return; 943 | } 944 | 945 | void zModMuls(bignum * u, bignum * v, bignum * n, bignum * w, bignum *s1, bignum *s2) 946 | { 947 | zMul(u, v, s1); 948 | zDiv(s1, n, s2, w); 949 | return; 950 | } 951 | 952 | void zModExp(bignum *d, bignum *b, bignum *e, bignum *m) 953 | { 954 | // d = b^e mod m 955 | // all b and e vector elements can be different. 956 | // all m elements are the same. 
957 | int i, word = 0, bit = 0; 958 | int j; 959 | 960 | bignum *s1, *s2, *bb, *t; 961 | 962 | s1 = zInit(); 963 | s2 = zInit(); 964 | bb = zInit(); 965 | t = zInit(); 966 | 967 | zCopy(b, bb); 968 | zSet1(d, 1); 969 | 970 | while (word < NWORDS) 971 | { 972 | if (e->data[word] & (1 << bit)) 973 | { 974 | zModMuls(d, bb, m, d, s1, s2); 975 | } 976 | 977 | zModMuls(bb, bb, m, bb, s1, s2); 978 | 979 | bit++; 980 | if (bit == 32) 981 | { 982 | bit = 0; 983 | word++; 984 | } 985 | } 986 | 987 | zFree(s1); 988 | zFree(s2); 989 | zFree(bb); 990 | zFree(t); 991 | return; 992 | } 993 | 994 | void zShortMul(bignum * u, base_t v, bignum * w) 995 | { 996 | //w = u * v 997 | //schoolbook multiplication, see knuth TAOCP, vol. 2 998 | base_t k=0; 999 | long i; 1000 | long su; 1001 | 1002 | su = abs(u->size); 1003 | 1004 | //inner product 1005 | for (i = 0; i < su; ++i) 1006 | spMulAdd(u->data[i], v, 0, k, &w->data[i], &k); 1007 | 1008 | //if still have a carry, add a digit to w 1009 | if (k) 1010 | { 1011 | w->data[su]=k; 1012 | su++; 1013 | } 1014 | 1015 | if (v == 0) 1016 | { 1017 | w->size = 1; 1018 | } 1019 | else 1020 | { 1021 | w->size = su; 1022 | 1023 | if (u->size < 0) 1024 | w->size *= -1; 1025 | } 1026 | 1027 | return; 1028 | } 1029 | 1030 | void zSqr(bignum * x, bignum * w) 1031 | { 1032 | //this routine is faster than the generic comba sqr on MSVC x86_32 builds. 
1033 | bignum *t; 1034 | 1035 | t = zInit(); 1036 | 1037 | zCopy(x, t); 1038 | zMul(x, x, w); 1039 | 1040 | zFree(t); 1041 | 1042 | return; 1043 | } 1044 | 1045 | void zShiftLeft(bignum * a, bignum * b, int x) 1046 | { 1047 | /* Computes a = b << x */ 1048 | int i,wordshift; 1049 | int y; 1050 | int sb,j; 1051 | base_t mask, carry, nextcarry; 1052 | 1053 | wordshift = x / DIGITBITS; 1054 | x = x % DIGITBITS; 1055 | 1056 | //create a mask for the bits that will overflow each digit 1057 | mask = HIBITMASK; 1058 | for (i = 1; i < x; ++i) 1059 | mask = (mask >> 1) | mask; 1060 | 1061 | if (x == 0) mask = 0x0; 1062 | 1063 | sb = abs(b->size); 1064 | a->size = sb; 1065 | 1066 | //for each digit, remember the highest x bits using the mask, then shift. 1067 | //the highest x bits becomes the lowest x bits for the next digit 1068 | y = DIGITBITS - x; 1069 | carry = 0; 1070 | for (j = 0; j < sb; ++j) 1071 | { 1072 | nextcarry = (b->data[j] & mask) >> y; 1073 | a->data[j] = (b->data[j] << x) | carry; 1074 | carry = nextcarry; 1075 | } 1076 | 1077 | if (carry) 1078 | { 1079 | a->data[sb] = carry; 1080 | a->size++; 1081 | } 1082 | 1083 | if (wordshift) 1084 | { 1085 | //now shift by any full words 1086 | for (i=a->size - 1;i>=0;i--) 1087 | a->data[i+wordshift] = a->data[i]; 1088 | //zero out the ones that were shifted 1089 | for (i=wordshift-1;i>=0;i--) 1090 | a->data[i] = 0; 1091 | a->size += wordshift; 1092 | } 1093 | 1094 | if (b->size < 0) 1095 | a->size *= -1; 1096 | 1097 | return; 1098 | } 1099 | 1100 | void zShiftLeft_1(bignum * a, bignum * b) 1101 | { 1102 | /* Computes a = b << 1 */ 1103 | int i; 1104 | int y; 1105 | int sb, j; 1106 | base_t mask, carry, nextcarry; 1107 | 1108 | //create a mask for the bits that will overflow each digit 1109 | mask = HIBITMASK; 1110 | sb = abs(b->size); 1111 | a->size = sb; 1112 | 1113 | //for each digit, remember the highest x bits using the mask, then shift. 
1114 | //the highest x bits becomes the lowest x bits for the next digit 1115 | y = DIGITBITS - 1; 1116 | carry = 0; 1117 | for (j = 0; j < sb; ++j) 1118 | { 1119 | nextcarry = (b->data[j] & mask) >> y; 1120 | a->data[j] = (b->data[j] << 1) | carry; 1121 | carry = nextcarry; 1122 | } 1123 | 1124 | if (carry) 1125 | { 1126 | a->data[sb] = carry; 1127 | a->size++; 1128 | } 1129 | 1130 | if (b->size < 0) 1131 | a->size *= -1; 1132 | 1133 | return; 1134 | } 1135 | 1136 | void zShiftRight(bignum * a, bignum * b, int x) 1137 | { /* Computes a = b >> x */ 1138 | int i, y, sign, wordshift; 1139 | int sb; 1140 | base_t mask, carry, nextcarry; 1141 | 1142 | wordshift = x / DIGITBITS; 1143 | x = x % DIGITBITS; 1144 | 1145 | //create a mask for the bits that will overflow each digit 1146 | mask = 0x1; 1147 | for (i = 1; i < x; ++i) 1148 | { 1149 | mask = (mask << 1) | mask; 1150 | } 1151 | if (x == 0) mask = 0x0; 1152 | 1153 | sign =( b->size < 0); 1154 | sb = abs(b->size); 1155 | a->size = sb; 1156 | 1157 | //for each digit, remember the lowest x bits using the mask, then shift. 
1158 | //the lowest x bits becomes the highest x bits for the next digit 1159 | y = DIGITBITS - x; 1160 | carry = 0; 1161 | for (i = sb - 1; i >= 0; --i) 1162 | { 1163 | nextcarry = (b->data[i] & mask) << y; 1164 | a->data[i] = b->data[i] >> x | carry; 1165 | carry = nextcarry; 1166 | } 1167 | 1168 | if ((a->data[sb-1] == 0) && (a->size > 1)) 1169 | a->size--; 1170 | 1171 | if (wordshift) 1172 | { 1173 | //now shift by any full words 1174 | for (i=0;isize - 1;i++) 1175 | a->data[i] = a->data[i+wordshift]; 1176 | //zero out the ones that were shifted 1177 | a->size -= wordshift; 1178 | } 1179 | 1180 | if (sign) 1181 | a->size *= -1; 1182 | 1183 | return; 1184 | } 1185 | 1186 | void zShiftRight_1(bignum * a, bignum * b) 1187 | { /* Computes a = b >> x */ 1188 | int i, sign; 1189 | int sb; 1190 | base_t mask, carry, nextcarry; 1191 | 1192 | //create a mask for the bits that will overflow each digit 1193 | mask = 0x1; 1194 | 1195 | sign = (b->size < 0); 1196 | sb = abs(b->size); 1197 | a->size = sb; 1198 | 1199 | //for each digit, remember the lowest x bits using the mask, then shift. 1200 | //the lowest x bits becomes the highest x bits for the next digit 1201 | carry = 0; 1202 | for (i = sb - 1; i >= 0; --i) 1203 | { 1204 | nextcarry = (b->data[i] & mask) << 31; 1205 | a->data[i] = (b->data[i] >> 1) | carry; 1206 | carry = nextcarry; 1207 | } 1208 | 1209 | if ((a->data[sb - 1] == 0) && (a->size > 1)) 1210 | a->size--; 1211 | 1212 | if (sign) 1213 | a->size *= -1; 1214 | 1215 | return; 1216 | } 1217 | 1218 | int zLEGCD(bignum *u, bignum *v, bignum *w) 1219 | { 1220 | //use the Lehman-Euclid algorithm to calculate GCD(u,v) = w 1221 | //Algorithm L in Knuth, 4.5.2 p. 
329 1222 | //assumes u,v nonnegative 1223 | 1224 | base_t aa,bb,cc,dd; 1225 | int i,j,k,it; 1226 | base_signed_t a,b,c,d,t; 1227 | base_t up,vdp, q1, q2; 1228 | base_t mask; 1229 | bignum *y, *zz; //t and w, in knuth 1230 | bignum *x; //tmp variable 1231 | bignum *uu, *vv; //so u and v don't get destroyed 1232 | bignum *uh, *vh; 1233 | base_t udp[2],vp[2]; 1234 | 1235 | 1236 | #if DIGITBITS == 32 1237 | mask = 0xff000000; 1238 | #else 1239 | mask = 0xff00000000000000; 1240 | #endif 1241 | 1242 | i = zCompare1(u,0); 1243 | j = zCompare1(v,0); 1244 | 1245 | if (i == 0) 1246 | { 1247 | zCopy(v,w); 1248 | return 1; 1249 | } 1250 | if (j == 0) 1251 | { 1252 | zCopy(u,w); 1253 | return 1; 1254 | } 1255 | 1256 | //temp variables should be twice as big as the input, to make room 1257 | //for intermediate operations. w should be as big as the biggest input. 1258 | i = u->size; 1259 | j = v->size; 1260 | if (j > i) 1261 | i = j; 1262 | 1263 | y = zInit(); 1264 | zz = zInit(); 1265 | x = zInit(); 1266 | uu = zInit(); 1267 | vv = zInit(); 1268 | uh = zInit(); 1269 | vh = zInit(); 1270 | 1271 | //put bigger number in uu, other in vv. 
1272 | i = zCompare(u,v); 1273 | if (i >= 0) 1274 | { 1275 | zCopy(u,uu); 1276 | zCopy(v,vv); 1277 | } 1278 | else 1279 | { 1280 | zCopy(v,uu); 1281 | zCopy(u,vv); 1282 | } 1283 | 1284 | j=0; 1285 | while (vv->size > 1) 1286 | { 1287 | //Step L1 1288 | for (it=vv->size;itsize;it++) 1289 | vv->data[it]=0; 1290 | vv->size = uu->size; 1291 | //get the most significant 32 bits of u and v, such that uhat >= vhat 1292 | uh->data[1] = uu->data[uu->size - 1]; 1293 | uh->data[0] = uu->data[uu->size - 2]; 1294 | vh->data[1] = vv->data[vv->size - 1]; 1295 | vh->data[0] = vv->data[vv->size - 2]; 1296 | uh->size = vh->size = 2; 1297 | 1298 | //rightshift until uhat is a single word 1299 | //0xff000000 is magic 1300 | if ((uh->data[1] & mask) > 0) 1301 | { 1302 | uh->data[0] = uh->data[1]; 1303 | vh->data[0] = vh->data[1]; 1304 | } 1305 | else 1306 | { 1307 | i=0; 1308 | aa=uh->data[1]; 1309 | while ((aa & MAXDIGIT) != 0) 1310 | { 1311 | aa >>= 1; 1312 | i++; 1313 | } 1314 | zShiftRight(uh,uh,i); 1315 | zShiftRight(vh,vh,i); 1316 | } 1317 | 1318 | //make u',v',u'',v'' 1319 | up = uh->data[0]; 1320 | vdp = vh->data[0]; 1321 | if (up == MAXDIGIT) 1322 | { 1323 | udp[0] = 0; 1324 | udp[1] = 1; 1325 | } 1326 | else 1327 | { 1328 | udp[0] = up+1; 1329 | udp[1] = 0; 1330 | } 1331 | 1332 | if (vdp == MAXDIGIT) 1333 | { 1334 | vp[0] = 0; 1335 | vp[1] = 1; 1336 | } 1337 | else 1338 | { 1339 | vp[0] = vdp+1; 1340 | vp[1] = 0; 1341 | } 1342 | 1343 | a=1; b=0; c=0; d=1; 1344 | 1345 | k=0; 1346 | while (1) 1347 | { 1348 | //Step L2: 1349 | /*test quotient, protecting for overflow. the conditions: 1350 | 0 <= uhat + a <= 2^32 1351 | 0 <= uhat + b < 2^32 1352 | 0 <= vhat + c < 2^32 1353 | 0 <= vhat + d <= 2^32 1354 | will always hold. 
hence only need to check for the case where 1355 | uhat == MAX_DIGIT and a = 1 or vhat == MAX_DIGIT and d = 1 1356 | */ 1357 | 1358 | //first check for /0 1359 | if (((vp[0] == 0) && (vp[1] == 0)) || (vdp == 0)) 1360 | break; 1361 | 1362 | //u''/v'' 1363 | if (udp[1] == 1) 1364 | { 1365 | spDivide(&q2,(base_t *)&t,udp,vdp); 1366 | } 1367 | else 1368 | q2 = udp[0]/vdp; 1369 | 1370 | //u'/v' 1371 | if (vp[1] >= 1) 1372 | q1=0; 1373 | else 1374 | q1 = up/vp[0]; 1375 | 1376 | if (q1 != q2) 1377 | break; 1378 | 1379 | //Step L3: Emulate Euclid 1380 | t=a-q1*c; 1381 | a=c; 1382 | c=t; 1383 | t=b-q1*d; 1384 | b=d; 1385 | d=t; 1386 | t=up-q1*vp[0]; 1387 | up=vp[0]; 1388 | vp[0]=t; 1389 | t=udp[0]-q2*vdp; 1390 | udp[0]=vdp; 1391 | vdp=t; 1392 | k++; 1393 | if (k>10000) 1394 | goto free; 1395 | } 1396 | 1397 | //Step L4: multiprecision step 1398 | if (b==0) 1399 | { 1400 | for (i=vv->size-1;i>=0;i--) 1401 | { 1402 | if (vv->data[i] == 0) 1403 | vv->size--; 1404 | else 1405 | break; 1406 | } 1407 | zDiv(uu,vv,y,x); //y = u mod v 1408 | zCopy(vv,uu); //u = v 1409 | zCopy(x,vv); //v = y 1410 | } 1411 | else 1412 | { 1413 | //aa=abs(a); 1414 | //bb=abs(b); 1415 | //cc=abs(c); 1416 | //dd=abs(d); 1417 | if (a<0) 1418 | aa = -a; 1419 | else 1420 | aa = a; 1421 | if (b<0) 1422 | bb = -b; 1423 | else 1424 | bb = b; 1425 | if (c<0) 1426 | cc = -c; 1427 | else 1428 | cc = c; 1429 | if (d<0) 1430 | dd = -d; 1431 | else 1432 | dd = d; 1433 | zShortMul(uu,aa,y); //y = A*u 1434 | zShortMul(vv,bb,x); //y = y + B*v 1435 | if (a<0) 1436 | { 1437 | zSub(x,y,x); 1438 | zCopy(x,y); 1439 | } 1440 | else if (b<0) 1441 | zSub(y,x,y); 1442 | else 1443 | zAdd(y,x,y); 1444 | 1445 | zShortMul(uu,cc,zz); //z = c*u 1446 | zShortMul(vv,dd,x); //z = z + d*v 1447 | if (c<0) 1448 | { 1449 | zSub(x,zz,x); 1450 | zCopy(x,zz); 1451 | } 1452 | else // (d<0) 1453 | zSub(zz,x,zz); 1454 | 1455 | zCopy(y,uu); //u = y; 1456 | zCopy(zz,vv); //v = z; 1457 | } 1458 | j++; 1459 | if (j>10000) 1460 | goto free; 1461 | } 
1462 | 1463 | //here, the size of v is 1, so finish up with regular GCD 1464 | zBinGCD(uu,vv,w); 1465 | 1466 | free: 1467 | zFree(y); 1468 | zFree(zz); 1469 | zFree(x); 1470 | zFree(uu); 1471 | zFree(vv); 1472 | zFree(uh); 1473 | zFree(vh); 1474 | return 1; 1475 | } 1476 | 1477 | int zBinGCD(bignum *u, bignum *v, bignum *w) 1478 | { 1479 | //computes w = gcd(u,v) 1480 | //follows algorithm B. p.321 Knuth Vol. 2 1481 | 1482 | bignum *uu, *vv, *t; 1483 | long i=0,j; 1484 | int k,sz; 1485 | 1486 | sz = abs(u->size); 1487 | if (abs(v->size) > sz) 1488 | { 1489 | sz = abs(v->size); 1490 | } 1491 | 1492 | i = zCompare1(u, 0); 1493 | j = zCompare1(v, 0); 1494 | if (i == 0) 1495 | { 1496 | zCopy(v,w); 1497 | return 1; 1498 | } 1499 | if (j == 0) 1500 | { 1501 | zCopy(u,w); 1502 | return 1; 1503 | } 1504 | 1505 | uu = zInit(); 1506 | vv = zInit(); 1507 | t = zInit(); 1508 | 1509 | zCopy(u,uu); 1510 | zCopy(v,vv); 1511 | 1512 | //find power of 2 such that u and v are not both even 1513 | k = 0; 1514 | while(((uu->data[0] & 0x1) == 0) && ((vv->data[0] & 0x1) == 0)) 1515 | { 1516 | zShiftRight(uu,uu,1); 1517 | zShiftRight(vv,vv,1); 1518 | k++; 1519 | } 1520 | 1521 | j=0; 1522 | do 1523 | { 1524 | if ((uu->data[0] & 0x1) == 0) 1525 | zShiftRight(uu,uu,1); 1526 | else if ((vv->data[0] & 0x1) == 0) 1527 | zShiftRight(vv,vv,1); 1528 | else 1529 | { 1530 | zSub(uu,vv,t); 1531 | zShiftRight(t,t,1); 1532 | if (zCompare(uu,vv) < 0) 1533 | zCopy(t,vv); 1534 | else 1535 | zCopy(t,uu); 1536 | } 1537 | ++j; 1538 | if (j>= 10000) 1539 | break; 1540 | } while (zCompare1(uu, 0) > 0); 1541 | 1542 | zClear(w); 1543 | w->data[0] = 1; 1544 | zShiftLeft(w,w,k); 1545 | zMul(w,vv,uu); 1546 | zCopy(uu,w); 1547 | 1548 | zFree(uu); 1549 | zFree(vv); 1550 | zFree(t); 1551 | return j; 1552 | } 1553 | 1554 | void xGCD(bignum *a, bignum *b, bignum *x, bignum *y, bignum *g) 1555 | { 1556 | //compute the extended GCD of a, b, returning g = GCD(a,b) and x, y 1557 | //such that ax + by = GCD(a,b) if a,b are 
coprime 1558 | bignum *t1, *t2, *t3, *u, *v, *r, *R, *q, *tmp; 1559 | 1560 | // int i; 1561 | /* 1562 | 1563 | Step 1: 1564 | if a < b then 1565 | Set u=0, v=1, and r=b 1566 | Set U=1, V=0, and R=a 1567 | else 1568 | Set u=1, v=0, and r=a 1569 | Set U=0, V=1, and R=b 1570 | 1571 | Step 2: 1572 | if R = 0 then return r (for the gcd) and no inverses exist. 1573 | if R = 1 then return R (for the gcd), V (for the inverse a(mod b)) and U (for the inverse of b(mod a)). 1574 | 1575 | Step 3: 1576 | Calculate q = int(r/R) 1577 | Calculate t1 = u - U*q 1578 | Calculate t2 = v - V*q 1579 | Calculate t3 = r - R*q 1580 | set u=U, v=V, r=R 1581 | set U=t1, V=t2, R=t3 1582 | goto Step 2. 1583 | */ 1584 | 1585 | tmp = zInit(); 1586 | t1 = zInit(); 1587 | t2 = zInit(); 1588 | t3 = zInit(); 1589 | q = zInit(); 1590 | r = zInit(); 1591 | R = zInit(); 1592 | u = zInit(); 1593 | v = zInit(); 1594 | 1595 | //need to check for temp allocation 1596 | 1597 | zClear(x); 1598 | zClear(y); 1599 | 1600 | 1601 | if (zCompare(a,b) < 0) 1602 | { 1603 | u->data[0]=0; 1604 | v->data[0]=1; 1605 | zCopy(b,r); 1606 | x->data[0]=1; 1607 | y->data[0]=0; 1608 | zCopy(a,R); 1609 | } 1610 | else 1611 | { 1612 | u->data[0]=1; 1613 | v->data[0]=0; 1614 | zCopy(a,r); 1615 | x->data[0]=0; 1616 | y->data[0]=1; 1617 | zCopy(b,R); 1618 | } 1619 | 1620 | while (1) 1621 | { 1622 | if (zCompare1(R, 0) == 0) 1623 | { 1624 | zCopy(r,g); 1625 | x->data[0] = 0; 1626 | x->size = 1; 1627 | y->data[0] = 0; 1628 | y->size = 1; 1629 | break; 1630 | } 1631 | 1632 | if (zCompare1(R, 1) == 0) 1633 | { 1634 | zCopy(R,g); 1635 | break; 1636 | } 1637 | 1638 | zCopy(r,tmp); 1639 | zDiv(tmp,R,q,t3); //q = int(r/R), t3 = r % R 1640 | 1641 | zMul(q,x,tmp); //t1 = u - U*q 1642 | zSub(u,tmp,t1); 1643 | 1644 | zMul(q,y,tmp); //t2 = v - V*q 1645 | zSub(v,tmp,t2); 1646 | 1647 | zCopy(x,u); 1648 | zCopy(y,v); 1649 | zCopy(R,r); 1650 | 1651 | zCopy(t1,x); 1652 | zCopy(t2,y); 1653 | zCopy(t3,R); 1654 | 1655 | //printf("iteration %d: x = 
%s\n", i, z2decstr(x)); 1656 | //printf("iteration %d: y = %s\n", i, z2decstr(y)); 1657 | //printf("iteration %d: g = %s\n", i, z2decstr(g)); 1658 | //printf("iteration %d: r = %s\n", i, z2decstr(r)); 1659 | //printf("iteration %d: R = %s\n", i, z2decstr(R)); 1660 | //printf("iteration %d: q = %s\n", i, z2decstr(q)); 1661 | //printf("iteration %d: u = %s\n", i, z2decstr(u)); 1662 | //printf("iteration %d: v = %s\n", i, z2decstr(v)); 1663 | //i++; 1664 | } 1665 | 1666 | if (x->size < 0) 1667 | { 1668 | x->size *= -1; 1669 | zSub(b,x,x); 1670 | } 1671 | 1672 | if (y->size < 0) 1673 | { 1674 | y->size *= -1; 1675 | zSub(a,y,y); 1676 | } 1677 | 1678 | zFree(tmp); 1679 | zFree(t1); 1680 | zFree(t2); 1681 | zFree(t3); 1682 | zFree(q); 1683 | zFree(r); 1684 | zFree(R); 1685 | zFree(u); 1686 | zFree(v); 1687 | return; 1688 | } 1689 | 1690 | void str2hexz(char in[], bignum * u) 1691 | { 1692 | // convert a string to a bigint 1693 | char *s2,*s; 1694 | char **ptr = NULL; 1695 | 1696 | // assume input is base10, we convert 9 digits at a time (32 bit words) 1697 | int i,j,su,base=10,step=9; 1698 | bignum * t; 1699 | 1700 | // allocate space for a temporary bignum 1701 | t = zInit(); 1702 | 1703 | // work with a copy of in (because the first step in the conversion process 1704 | // inserts null characters into the string...). This could probably be changed. 
1705 | s = (char *)malloc(8192*sizeof(char)); 1706 | strcpy(s,in); 1707 | 1708 | // compute how many 9-digit decimal words we have in the string 1709 | su = strlen(s)/step + (strlen(s)%step != 0); 1710 | 1711 | // read 9 characters of s at a time into a base-10 bignum, 'u' 1712 | j=0; 1713 | for (i=0;idata[j] = strtoul(s2,ptr,base); 1717 | s2[0] = '\0'; 1718 | j++; 1719 | } 1720 | 1721 | if (strlen(s) > 0) 1722 | { 1723 | s2 = s; 1724 | ptr = &s2; 1725 | t->data[j] = strtoul(s2,ptr,base); 1726 | } 1727 | t->size = j+1; 1728 | 1729 | // now convert the base-10 bignum to a binary bignum 1730 | zDec2Hex(t,u); 1731 | 1732 | // clear the upper words, if any 1733 | for (i = u->size; i < NWORDS; i++) 1734 | { 1735 | u->data[i] = 0; 1736 | } 1737 | 1738 | free(s); 1739 | zFree(t); 1740 | return; 1741 | } 1742 | 1743 | void zDec2Hex(bignum * u, bignum * v) 1744 | { 1745 | // convert u[] in dec to v[] in hex by multiplying the ith digit by (1e9)*i 1746 | // and adding to the previous digits 1747 | 1748 | bignum * a, *b, *vv; 1749 | base_t d = MAX_DEC_WORD; 1750 | int i, j; 1751 | 1752 | a = zInit(); 1753 | b = zInit(); 1754 | vv = zInit(); 1755 | zClear(v); 1756 | 1757 | a->data[0] = 1; 1758 | for (i = 0; i < u->size; i++) 1759 | { 1760 | zShortMul(a, u->data[i], b); 1761 | zAdd(vv, b, vv); 1762 | zShortMul(a, d, a); 1763 | } 1764 | 1765 | zClamp(vv); 1766 | zCopy(vv, v); 1767 | 1768 | zFree(a); 1769 | zFree(b); 1770 | zFree(vv); 1771 | 1772 | return; 1773 | } 1774 | 1775 | void zHex2Dec(bignum * u, bignum * v) 1776 | { 1777 | //convert u[] in hex to v[] in decimal by repeatedly dividing 1778 | //u by 1e9 = 0x3b9aca00 1779 | //the remainder of the ith division is the ith decimal digit. 
1780 | //when the quotient = 0, stop 1781 | 1782 | bignum * a, *b; 1783 | base_t d = MAX_DEC_WORD; 1784 | base_t r = 0; 1785 | int su = u->size; 1786 | //because decimal takes more room than hex to store 1787 | 1788 | a = zInit(); 1789 | b = zInit(); 1790 | zClear(v); 1791 | 1792 | zCopy(u, a); 1793 | v->size = 1; 1794 | do 1795 | { 1796 | r = zShortDiv(a, d, b); 1797 | v->data[v->size - 1] = r; 1798 | v->size++; 1799 | zCopy(b, a); 1800 | } while (zCompare1(a, 0) != 0); 1801 | v->size--; 1802 | 1803 | zFree(a); 1804 | zFree(b); 1805 | return; 1806 | } 1807 | 1808 | char *z2decstr(bignum * n) 1809 | { 1810 | //pass in a pointer to a string. if necessary, this routine will 1811 | //reallocate space for the string to accomodate its size. If this happens 1812 | //the pointer to the string's (likely) new location is automatically 1813 | //updated and returned. 1814 | bignum * a; 1815 | int i,sza,sign = 0; 1816 | char *tmp, *s; 1817 | int nchars, j; 1818 | 1819 | a = zInit(); 1820 | 1821 | s = (char *)malloc(8192 * sizeof(char)); 1822 | 1823 | strcpy(s,""); 1824 | nchars = 1; 1825 | if (n->size < 0) 1826 | { 1827 | sign = 1; 1828 | n->size *= -1; 1829 | sprintf(s, "-"); 1830 | nchars++; 1831 | } 1832 | 1833 | zHex2Dec(n, a); 1834 | sza = abs(a->size); 1835 | 1836 | tmp = (char *)malloc((DEC_DIGIT_PER_WORD + 10) * sizeof(char)); 1837 | 1838 | //print first word 1839 | #if DIGITBITS == 64 1840 | sprintf(s,"%s%lu", s, a->data[sza - 1]); 1841 | #else 1842 | sprintf(s, "%s%u", s, a->data[sza - 1]); 1843 | #endif 1844 | nchars += ndigits_1(a->data[sza-1]) - 1; 1845 | 1846 | //print the rest 1847 | for (i=sza - 2; i >= 0; i--) 1848 | { 1849 | #if DIGITBITS == 64 1850 | sprintf(tmp,"%019lu",a->data[i]); 1851 | #else 1852 | sprintf(tmp, "%09u", a->data[i]); 1853 | #endif 1854 | memcpy(s + nchars, tmp, DEC_DIGIT_PER_WORD * sizeof(char)); 1855 | nchars += DEC_DIGIT_PER_WORD; 1856 | } 1857 | s[nchars] = '\0'; 1858 | 1859 | free(tmp); 1860 | zFree(a); 1861 | 1862 | if (sign) 1863 | 
{ 1864 | n->size *= -1; 1865 | } 1866 | 1867 | return s; 1868 | } 1869 | 1870 | void spMulAdd(base_t u, base_t v, base_t w, base_t t, base_t *lower, base_t *carry) 1871 | { 1872 | base_t k,p; 1873 | spMultiply(u,v,&p,carry); 1874 | spAdd3(p,w,t,lower,&k); 1875 | *carry += k; 1876 | return; 1877 | } 1878 | 1879 | void spMulMod(base_t u, base_t v, base_t m, base_t *w) 1880 | { 1881 | base_t p[2]; 1882 | base_t q; 1883 | 1884 | spMultiply(u,v,&p[0],&p[1]); 1885 | spDivide(&q,w,p,m); 1886 | 1887 | return; 1888 | } 1889 | 1890 | --------------------------------------------------------------------------------