├── .github
    ├── dependabot.yml
    └── workflows
    │   └── ci.yml
├── .gitignore
├── LICENSE
├── README.md
├── config-fermat.sh
├── docs
    ├── Fermat-testing.md
    ├── Mfactor_buildnotes.txt
    ├── a.txt
    ├── alderlake.txt
    ├── b.txt
    ├── brent-suyama.txt
    ├── c.txt
    ├── d.txt
    ├── dct.txt
    ├── fgt.txt
    ├── gerbicz.txt
    ├── gpuowl_stats.txt
    ├── hwloc_test.txt
    ├── irrational.txt
    ├── knc.txt
    ├── nt.txt
    ├── pm1.txt
    ├── pm1_compare.png
    ├── pm1_compare.txt
    ├── pm1_d210.txt
    ├── pm1_d330.txt
    ├── pm1_d420.txt
    ├── pm1_d660.txt
    ├── pm1_d840.txt
    ├── predefs_archlinux.txt
    ├── predefs_linux.txt
    ├── predefs_mac.txt
    ├── predefs_power9.txt
    ├── prp_proof.txt
    ├── prp_proof_examples.txt
    ├── qs.txt
    └── todo.txt
├── help.txt
├── makemake.sh
└── src
    ├── Mdata.h
    ├── Mlucas.c
    ├── Mlucas.h
    ├── align.h
    ├── br.c
    ├── carry.h
    ├── carry_dbg.h
    ├── carry_gcc32.h
    ├── carry_gcc64.h
    ├── dft_macro.c
    ├── dft_macro.h
    ├── dft_sine_term_opt.c.txt
    ├── f2psp.h
    ├── f2psp_3_5.h
    ├── fac_test_dat128.h
    ├── fac_test_dat192.h
    ├── fac_test_dat256.h
    ├── fac_test_dat64.h
    ├── fac_test_dat96.h
    ├── factor.c
    ├── factor.h
    ├── factor_test.h
    ├── fermat_mod_square.c
    ├── fgt_m61.c
    ├── fgt_m61.h
    ├── float_intrin.h
    ├── gcd_lehmer.c.txt
    ├── gcd_lehmer.h
    ├── genFFT_mul.h
    ├── getRealTime.c
    ├── get_cpuid.c
    ├── get_fft_radices.c
    ├── get_fp_rnd_const.c
    ├── get_preferred_fft_radix.c
    ├── gpu_iface.cu
    ├── gpu_iface.h
    ├── gpu_sieve.cu
    ├── imul256_macro.h
    ├── imul_macro.c
    ├── imul_macro.h
    ├── imul_macro0.h
    ├── imul_macro1.h
    ├── masterdefs.h
    ├── mers_mod_square.c
    ├── mi64.c
    ├── mi64.h
    ├── mi64_new.c.txt
    ├── pairFFT_mul.c
    ├── pair_square.c
    ├── pair_square.h
    ├── platform.h
    ├── pm1.c
    ├── prefetch.h
    ├── qfcheb.c.txt
    ├── qfcheb.h
    ├── qfloat.c
    ├── qfloat.h
    ├── radix09_sse_macro.h
    ├── radix1008_avx_negadwt_consts.h
    ├── radix1008_ditN_cy_dif1.c
    ├── radix1008_main_carry_loop.h
    ├── radix1024.h
    ├── radix1024_avx_negadwt_consts.h
    ├── radix1024_ditN_cy_dif1.c
    ├── radix1024_main_carry_loop.h
    ├── radix1024_twiddles.h
    ├── radix104_ditN_cy_dif1.c
    ├── radix10_ditN_cy_dif1.c
    ├── radix112_ditN_cy_dif1.c
    ├── radix11_ditN_cy_dif1.c
    ├── radix11_sse_macro.h
    ├── radix120_ditN_cy_dif1.c
    ├── radix128.h
    ├── radix128_ditN_cy_dif1.c
    ├── radix128_main_carry_loop.h
    ├── radix128_twiddles.h
    ├── radix12_ditN_cy_dif1.c
    ├── radix12_main_carry_loop.h
    ├── radix13.h
    ├── radix13_ditN_cy_dif1.c
    ├── radix13_sse_macro.h
    ├── radix144_ditN_cy_dif1.c
    ├── radix144_main_carry_loop.h
    ├── radix14_ditN_cy_dif1.c
    ├── radix15_ditN_cy_dif1.c
    ├── radix15_sse_macro.h
    ├── radix16.h
    ├── radix160_ditN_cy_dif1.c
    ├── radix160_main_carry_loop.h
    ├── radix16_dif_dit_pass.c
    ├── radix16_dif_dit_pass_asm.h
    ├── radix16_ditN_cy_dif1.c
    ├── radix16_ditN_cy_dif1_asm.h
    ├── radix16_dyadic_square.c
    ├── radix16_dyadic_square_gcc64.h
    ├── radix16_main_carry_loop.h
    ├── radix16_pairFFT_mul.c
    ├── radix16_utils_asm.h
    ├── radix16_wrapper_ini.c
    ├── radix16_wrapper_square.c
    ├── radix16_wrapper_square_gcc32.h
    ├── radix16_wrapper_square_gcc64.h
    ├── radix176_ditN_cy_dif1.c
    ├── radix176_main_carry_loop.h
    ├── radix17_dft.h
    ├── radix17_ditN_cy_dif1.c
    ├── radix18_ditN_cy_dif1.c
    ├── radix192_ditN_cy_dif1.c
    ├── radix192_main_carry_loop.h
    ├── radix208_ditN_cy_dif1.c
    ├── radix208_main_carry_loop.h
    ├── radix20_ditN_cy_dif1.c
    ├── radix20_ditN_cy_dif1_gcc32.h
    ├── radix20_ditN_cy_dif1_gcc64.h
    ├── radix20_main_carry_loop.h
    ├── radix224_ditN_cy_dif1.c
    ├── radix224_main_carry_loop.h
    ├── radix22_ditN_cy_dif1.c
    ├── radix240_ditN_cy_dif1.c
    ├── radix240_main_carry_loop.h
    ├── radix24_ditN_cy_dif1.c
    ├── radix24_ditN_cy_dif1_gcc32.h
    ├── radix24_ditN_cy_dif1_gcc64.h
    ├── radix24_main_carry_loop.h
    ├── radix256.h
    ├── radix256_ditN_cy_dif1.c
    ├── radix256_main_carry_loop.h
    ├── radix256_twiddles.h
    ├── radix26_ditN_cy_dif1.c
    ├── radix288_ditN_cy_dif1.c
    ├── radix288_main_carry_loop.h
    ├── radix28_ditN_cy_dif1.c
    ├── radix28_ditN_cy_dif1_gcc32.h
    ├── radix28_ditN_cy_dif1_gcc64.h
    ├── radix28_main_carry_loop.h
    ├── radix30_ditN_cy_dif1.c
    ├── radix31.h
    ├── radix31_ditN_cy_dif1.c
    ├── radix32.h
    ├── radix320_ditN_cy_dif1.c
    ├── radix320_main_carry_loop.h
    ├── radix32_dif_dit_pass.c
    ├── radix32_dif_dit_pass_asm.h
    ├── radix32_ditN_cy_dif1.c
    ├── radix32_ditN_cy_dif1_asm.h
    ├── radix32_dyadic_square.c
    ├── radix32_dyadic_square_gcc64.h
    ├── radix32_main_carry_loop.h
    ├── radix32_utils_asm.h
    ├── radix32_wrapper_ini.c
    ├── radix32_wrapper_square.c
    ├── radix32_wrapper_square_gcc32.h
    ├── radix32_wrapper_square_gcc64.h
    ├── radix352_ditN_cy_dif1.c
    ├── radix352_main_carry_loop.h
    ├── radix36_ditN_cy_dif1.c
    ├── radix36_main_carry_loop.h
    ├── radix384_ditN_cy_dif1.c
    ├── radix384_main_carry_loop.h
    ├── radix4032.h
    ├── radix4032_avx_negadwt_consts.h
    ├── radix4032_ditN_cy_dif1.c
    ├── radix4032_main_carry_loop.h
    ├── radix40_ditN_cy_dif1.c
    ├── radix40_main_carry_loop.h
    ├── radix44_ditN_cy_dif1.c
    ├── radix44_main_carry_loop.h
    ├── radix48_ditN_cy_dif1.c
    ├── radix48_main_carry_loop.h
    ├── radix512.h
    ├── radix512_ditN_cy_dif1.c
    ├── radix52_ditN_cy_dif1.c
    ├── radix52_main_carry_loop.h
    ├── radix56_ditN_cy_dif1.c
    ├── radix56_main_carry_loop.h
    ├── radix5_ditN_cy_dif1.c
    ├── radix60_ditN_cy_dif1.c
    ├── radix60_main_carry_loop.h
    ├── radix63_ditN_cy_dif1.c
    ├── radix63_main_carry_loop.h
    ├── radix64.h
    ├── radix64_ditN_cy_dif1.c
    ├── radix64_main_carry_loop.h
    ├── radix6_ditN_cy_dif1.c
    ├── radix72_ditN_cy_dif1.c
    ├── radix768_ditN_cy_dif1.c
    ├── radix768_main_carry_loop.h
    ├── radix7_ditN_cy_dif1.c
    ├── radix80_ditN_cy_dif1.c
    ├── radix88_ditN_cy_dif1.c
    ├── radix8_dif_dit_pass.c
    ├── radix8_dif_dit_pass_asm.h
    ├── radix8_ditN_cy_dif1.c
    ├── radix960_avx_negadwt_consts.h
    ├── radix960_ditN_cy_dif1.c
    ├── radix960_main_carry_loop.h
    ├── radix96_ditN_cy_dif1.c
    ├── radix992_ditN_cy_dif1.c
    ├── radix992_main_carry_loop.h
    ├── radix9_ditN_cy_dif1.c
    ├── rng_isaac.c
    ├── rng_isaac.h
    ├── sse2_macro.h
    ├── sse2_macro_gcc32.h
    ├── sse2_macro_gcc64.h
    ├── test_fft_radix.c
    ├── test_fft_radix.c.txt
    ├── threadpool.c
    ├── threadpool.h
    ├── twopmodq.c
    ├── twopmodq100.c
    ├── twopmodq100.h
    ├── twopmodq128.c
    ├── twopmodq128_96.c
    ├── twopmodq160.c
    ├── twopmodq192.c
    ├── twopmodq256.c
    ├── twopmodq64_test.c
    ├── twopmodq80.c
    ├── twopmodq80.h
    ├── twopmodq96.c
    ├── types.c
    ├── types.h
    ├── util.c
    └── util.h


/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "monthly"
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | obj*/
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Actions Status](https://github.com/primesearch/Mlucas/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/primesearch/Mlucas/actions/workflows/ci.yml)
  2 | 
  3 | # Mlucas
  4 | Ernst Mayer's Mlucas and Mfactor programs for GIMPS
  5 | 
  6 | [Ernst Mayer passed away unexpectedly](https://www.mersenneforum.org/showthread.php?t=28890) on September 10, 2023. This repository contains his posthumously released Mlucas v21 code, which is now maintained by the Great Internet Mersenne Prime Search (GIMPS) community. AutoPrimeNet (the Python PrimeNet program) previously bundled with Mlucas is now maintained in a [separate repository](https://github.com/tdulcet/AutoPrimeNet).
  7 | 
  8 | Mlucas and Mfactor are 100% open source programs. Mlucas is for [primality](https://en.wikipedia.org/wiki/Primality_test) and [P-1](https://en.wikipedia.org/wiki/Pollard%27s_p_%E2%88%92_1_algorithm) testing of [Mersenne](https://en.wikipedia.org/wiki/Mersenne_prime) and [Fermat](https://en.wikipedia.org/wiki/Fermat_number) numbers, including support for the [Lucas-Lehmer](https://en.wikipedia.org/wiki/Lucas%E2%80%93Lehmer_primality_test), [Probable prime](https://en.wikipedia.org/wiki/Probable_prime) (PRP) and [Pépin](https://en.wikipedia.org/wiki/P%C3%A9pin%27s_test) tests. Mfactor is for trial factoring. They support x86 Intel and AMD, ARM and other CPUs.
  9 | 
 10 | The original [Mlucas README](https://mersenneforum.org/mayer/README.html) is available for posterity and contains a lot of information, but note that it is no longer up to date. For more information about Mlucas v21, please see the [Ernst's Mlucas - the future](https://www.mersenneforum.org/showthread.php?t=28926) thread on the Mersenne Forum.
 11 | 
 12 | Feature | | Mlucas | Prime95/MPrime
 13 | --- | --- | ---: | ---:
 14 | **Architectures** | x86 | ✔️ | ✔️
 15 | \- | ARM | ✔️ | 
 16 | \- | Other | ✔️ | 
 17 | **Worktypes** | LL | ✔️ | ✔️
 18 | \- | PRP | ✔️ | ✔️
 19 | \- | P-1 | ✔️ | ✔️
 20 | \- | P+1 | | ✔️
 21 | \- | ECM | | ✔️
 22 | \- | Pépin | ✔️ | ✔️
 23 | **PRP** | Proofs | | ✔️
 24 | \- | Certs | | ✔️
 25 | **Error Checking** | Jacobi | | ✔️
 26 | \- | Gerbicz | ✔️ | ✔️
 27 | **Random Shifts** | | ✔️ | ✔️
 28 | **Interface** | CLI | ✔️ | MPrime only
 29 | \- | GUI | | Prime95 only
 30 | **Multiple Workers** | | Separate runs | ✔️
 31 | **PrimeNet Support** | | Separate program | ✔️
 32 | **Max FFT Length** | | 256M<br>(**512M** with 0 shift) | 32M (AVX) -<br>64M (AVX512)
 33 | **Largest Exponent** | | 4,294,967,231<br>(**8,937,021,911** with 0 shift) | 595,700,000 (AVX) -<br>1,169,000,000 (AVX512)
 34 | **Performance** | | ~50-90% | **100%**
 35 | **Free** 🆓 | | **Yes**, GPL | No, EULA
 36 | **100% Open Source** | | ✔️ | Mostly
 37 | **Claim Full EFF Awards** | | ✔️ | 
 38 | 
 39 | ## Usage
 40 | 
 41 | ### Automatic method
 42 | 
 43 | Linux users can use the [Mlucas install script](https://github.com/tdulcet/Distributed-Computing-Scripts#mlucas) to automatically download, build, set up and run Mlucas, including downloading, setting up and running [AutoPrimeNet](https://github.com/tdulcet/AutoPrimeNet) for automated PrimeNet assignments.
 44 | 
 45 | ### Manual method
 46 | 
 47 | Dependencies:
 48 | * Make
 49 | * GNU C or Clang compiler
 50 | * \*GNU Multiple Precision (GMP) library
 51 | * \*Portable Hardware Locality (hwloc) library
 52 | * \*Python 3
 53 | 
 54 | \* Optional
 55 | 
 56 | #### Download
 57 | 
 58 | ##### Linux
 59 | 
 60 | 1. Verify that the dependencies above are installed. On Debian and Ubuntu, run: `sudo apt update` and `sudo apt install build-essential libgmp-dev libhwloc-dev`.
 61 | 2. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `wget https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`.
 62 | 3. To download AutoPrimeNet, run: `wget -nv https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`.
 63 | 
 64 | ##### macOS
 65 | 
 66 | 1. Verify that the dependencies above are installed. Run: `brew install gmp hwloc`.
 67 | 2. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `curl -fLO https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`.
 68 | 3. To download AutoPrimeNet, run: `curl -sSfLO https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`.
 69 | 
 70 | ##### Windows
 71 | 
 72 | Native Windows builds are experimental. For now, Windows users should use the [Windows Subsystem for Linux](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) (WSL) and follow the [Linux](#linux) instructions above instead. To try an experimental native build anyway, use the following steps:
 73 | 
 74 | 1. Download and install [MSYS2](https://www.msys2.org/).
 75 | 2. Verify that the dependencies above are installed. With the MINGW64 environment, run: `pacman -S mingw-w64-x86_64-gmp mingw-w64-x86_64-hwloc`.
 76 | 3. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `wget https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`.
 77 | 4. To download AutoPrimeNet, run: `wget -nv https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`.
 78 | 
 79 | #### Build
 80 | 
 81 | 1. Change into the `Mlucas` directory. Run: `cd Mlucas` or `cd Mlucas-main` depending on which method one used to download it.
 82 | 2. Run:
 83 | 	* To build Mlucas: `bash makemake.sh [use_hwloc]`.
 84 | 	* To build Mfactor: `bash makemake.sh mfac [word]`, where `word` is optionally one of `1word`, `2word`, `3word`, `4word` or `nword`.
 85 | 
 86 | To build with Clang or another compiler instead of GCC, run: `export CC=<compiler>`, for example: `export CC=clang`.
 87 | 
 88 | #### Setup and Run
 89 | 
 90 | 1. Change into the `obj` directory. Run: `cd obj` or `cd obj_mfac` depending on whether one built Mlucas or Mfactor respectively.
 91 | 
 92 | This README is still in progress. For now, see the original [Mlucas README](https://mersenneforum.org/mayer/README.html), which has more information about how to set up and run Mlucas. Also see [Help](#help) below. Note that with Mlucas v21, if built with the hwloc library, one would want to use the new `-core` option instead of `-cpu`.
 93 | 
 94 | ## Help
 95 | 
 96 | The [help.txt](help.txt) file includes a variety of usage information not covered in the original [README](https://mersenneforum.org/mayer/README.html), concentrating largely on the Mlucas command line options. A separate documentation page covers [Fermat numbers](docs/Fermat-testing.md).
 97 | 
 98 | ## Contributing
 99 | 
100 | Pull requests welcome!
101 | 


--------------------------------------------------------------------------------
/config-fermat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Shell script for generating fermat.cfg; Mlucas output saved to config-fermat.log
 4 | 
 5 | ################################################################################
 6 | #                                                                              #
 7 | #   (C) 2024 by Catherine Cowie and Teal Dulcet.                               #
 8 | #                                                                              #
 9 | #  This program is free software; you can redistribute it and/or modify it     #
10 | #  under the terms of the GNU General Public License as published by the       #
11 | #  Free Software Foundation; either version 2 of the License, or (at your      #
12 | #  option) any later version.                                                  #
13 | #                                                                              #
14 | #  This program is distributed in the hope that it will be useful, but WITHOUT #
15 | #  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       #
16 | #  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   #
17 | #  more details.                                                               #
18 | #                                                                              #
19 | #  You should have received a copy of the GNU General Public License along     #
20 | #  with this program; see the file GPL.txt.  If not, you may view one at       #
21 | #  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  #
22 | #  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     #
23 | #  02111-1307, USA.                                                            #
24 | #                                                                              #
25 | ################################################################################
26 | 
27 | # Mlucas
28 | MLUCAS=./Mlucas
29 | 
30 | # Number of iterations (use 100, 1000, or 10000 to match pre-computed values)
31 | ITERS=100
32 | 
33 | # Minimum Fermat number (15 or greater)
34 | MIN=15
35 | 
36 | # Maximum Fermat number (33 or less)
37 | MAX=29
38 | 
39 | # Mlucas arguments
40 | ARGS=(
41 | 	"$@"
42 | 	# Add desired -cpu or -core settings here, or as following arguments, e.g. bash ../config-fermat.sh -cpu 0:3
43 | )
44 | 
45 | # First, tiny FFT lengths for F15 to F17 (note 4K is the smallest workable length without fiddly radix settings);
46 | FFTS=([2]=15 [4]=16 [7]=17 [8]=17)
47 | # Then, from small up to egregiously large FFTs for F18 to F33.
48 | # The largest FFT reached is 512M, if MAX is set to 33.
49 | # Note that large FFTs require considerable runtime at 10000 iterations.
50 | for ((n = 0; n < 16; ++n)); do
51 | 	m=$((1 << n))
52 | 	f=$((18 + n))
53 | 	for k in 15 16; do
54 | 		if [[ $k -eq 15 && $n -lt 11 ]]; then
55 | 			# k = 7 multiples (7K, 14K, ...) become unworkable after F28 (14M).
56 | 			FFTS[14 * m]=$f
57 | 		fi
58 | 		# k = 15, 16 should both be supported up to at least F32.
59 | 		FFTS[k * m]=$f
60 | 		if [[ $k -eq 15 && $n -gt 5 ]]; then
61 | 			# k = 63 is mostly supported for F24 (1008K) and above.
62 | 			FFTS[63 * m >> 2]=$f
63 | 		fi
64 | 	done
65 | done
66 | for fft in "${!FFTS[@]}"; do
67 | 	f=${FFTS[fft]}
68 | 	if [[ -n $MIN && $f -lt $MIN ]]; then
69 | 		continue
70 | 	elif [[ -n $MAX && $f -gt $MAX ]]; then
71 | 		break
72 | 	fi
73 | 	printf '\n\tTesting F%s (2^%s + 1),\tFFT length: %sK\n\n' "$f" $((1 << f)) "$fft"
74 | 	args=("${ARGS[@]}")
75 | 	# First we test the very fiddly F15 and then loop over F16 up to maximum
76 | 	if [[ $f -eq 15 ]]; then
77 | 		args+=(-radset 8,8,16)
78 | 	fi
79 | 	if [[ $f -le 17 || $f -ge 32 ]]; then
80 | 		args+=(-shift 0)
81 | 	fi
82 | 	time $MLUCAS -f "$f" -fft "$fft" -iters $ITERS "${args[@]}" 2>&1 | tee -a config-fermat.log | grep -i 'error\|warn\|assert\|writing\|pmax_rec\|fft radices'
83 | done
84 | 


--------------------------------------------------------------------------------
/docs/irrational.txt:
--------------------------------------------------------------------------------
 1 | 24 Feb 2022
 2 | Prove irrationality of sqrt(2) via N-R iteration formula?
 3 | 
 4 | Let f(x) = x^(-2) − c, applying N-R (dx = -f/f' = (c - 1/x^2)/(-2/x^3) = x.(1-cx^2)/2) to this yields a second-order iterative formula for the reciprocal square-root of the computationally efficient kind we seek, with a per-iteration cost of 1 ADD and 4 MUL:
 5 | 	x_n+1 = x*(3 - c*x^2)/2
 6 | Fixed point(s) x* of the iteration are given by dx = x*.(1-c.x*^2)/2 = 0, with nonzero solutions x* = +- 1/sqrt(c).
 7 | Assume x* rational, i.e. x* = 1/sqrt(c) = p/q. In terms of p and q our iteration is
 8 | 	x_n+1 = (p/q)*(3 - c*(p/q)^2)/2 = p.(3.q^2 - c.p^2)/(2.q^3), i.e. p' = p*(3*q^2 - c*p^2), q' = (2*q^3)
 9 | Example: c = 2, x0 = 1, x_n+1 = x_n*(3 - 2*x_n^2)/2 ... if x_n = p/q, have
10 | bc:
11 | p=q=1
12 | p *= (3*q^2-2*p^2); q = (2*q^3); g = gcd(p,q); p /= g; q /= g; print "gcd = ",g,": p = ",p,", q = ",q,"\n"
13 | n	x_n = p/q												factorization of p,q
14 | 0	1
15 | 1	1/2
16 | 2	5/8														5,2^3
17 | 3	355/512													5.71,2^9
18 | 4	94852805/134217728										5.23.71.11617,2^27
19 | 5	1709678476417571835487555/2417851639229258349412352		5.23.71.5741.8837.11617.355280903,2^81
20 | 6	p = 5.23.71.3023.5741.8837.11617.27509.355280903.70298580191725636724693742124090124808533, q = 2^243
21 | ...
22 | We observe that for each iteration, gcd(p',q') = 2. Also:
23 | 	o Once p has a given odd factor, subsequent iterations merely add more odd factors to p
24 | 	[Q: Are said odd factors all distinct, i.e. is p squarefree?]
25 | 	o q = 2^k, with k tripling on each iteration
26 | Q: Is there a similar trend for other initial choices of p,q?
27 | p0 = 4, q0 = 5:
28 | n	x_n = p/q
29 | 0	4/5
30 | 1	86/125
31 | 2	43.32083/5^9
32 | 3	43.32083.308933.24722741/2.5^27
33 | 4	43.1987.32083.197947.308933.5926127.24722741.51537769.1848407118139843/2^3.5^81; so, more observations:
34 | 	o Any power of 2 in p is reduced by 1 each iteration until p odd, q = 2.odd
35 | 	o Each distinct prime in the factorization of q has its power tripled each iteration
36 | 	o Assuming p0,q0 in reduced form (gcd(p,q) = 1), again we have gcd(p,q) = 2 each iteration.
37 | 
38 | Without loss of generality we can consider the initial iterate within the basin of monotone convergence and its p0,q0 reduced, i.e. gcd(p0,q0) = 1, thus p0=q0=1 or p0,q0 have opposite parity, and:
39 | 1: p0=q0=1 yields next-iterate p = 1, q = 2, thus of form [2] below.
40 | 2: For p0 odd, q0 even: both 2.p^2 and 3.q^2 even and numerator p*(3*q^2 - 2*p^2) = 2*odd, thus gcd(p',q') = 2
41 | 3: For p0 even, q0 odd: 2.p^2 even and 3.q^2 odd; p*(3*q^2 - 2*p^2) even, denominator 2.q^3 = 2*odd, thus gcd(p',q') = 2
42 | In case [3] the unreduced numerator is divisible by 2^k with k > 1; since (3*q^2 - 2*p^2) odd, said power of 2 is the same as contained in the input value p0, and the ensuing division by the gcd = 2 reduces it by 1, thus after k further iterations we fall into pattern [2] and remain there (e.g. p0,q0 = 4,5 give p = 86,1379569,... and q = 125,1953125,...; p0,q0 = 8,9 give p = 460,269358290,41100860142614334318305635,... and q = 729,387420489,58149737003040059690390169,...).
43 | Thus after a finite number of iterations we inevitably settle into pattern [2] and remain there, thus the iteration converges in the sense that p/q approaches a limit but p,q never do because their gcd remains fixed at 2. QED
44 | *** Not quite - need to show that (or if) gcd cannot include an odd prime ***
45 | For q0 = 2 that is easy - denominator = 2.q^3, if it starts as a power of 2 it stays there.
46 | 
47 | Now try c = 3: Iterative-update is p = p.(3.q^2 - c.p^2) = 3.p.(q^2 - p^2), q = (2.q^3).
48 | Again use p0 = 1, q0 = 2:
49 | n	x_n = p/q
50 | 1	3^2/2^4
51 | 2	3^3.5^2.7/2^13
52 | 3	3^4.5^2.7.3467.12917/2^40
53 | Denominator = 2^k, with k -> 3.k+1 on each iteration (k = 1,4,13,40,..., i.e. k = (3^(n+1)-1)/2); if p0 odd, numerator = odd.odd.(even-odd) always odd, hence sqrt(3) irrational.
54 | 
55 | Now try a (rational)^2, c = 9/16, yielding p = 3.p.(16.q^2 - 3.p^2), q = (2^5.q^3), same initial guess:
56 | n	x_n = p/q
57 | 0	1/2
58 | 1	3.61/2^8
59 | 2	3^2.61.107.1511/2^25
60 | ... this clearly converges -> 4/3, but here's the rub: p/q can converge in the sense of the limit as n -> oo, but p and q converge only in this same sense, i.e. there's no reason to expect gcd(p,q) to magically hit a nonzero value such that the resulting gcd-reduced p = 4 and q = 3 in a finite number of steps.
61 | 
62 | 


--------------------------------------------------------------------------------
/docs/pm1_compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/primesearch/Mlucas/5e6465318b8c656ffb83025229038f5c2614fa35/docs/pm1_compare.png


--------------------------------------------------------------------------------
/docs/pm1_compare.txt:
--------------------------------------------------------------------------------
 1 | P-1 relative modmul count for stage 2 with b1=1m, b2=30m, best-bigstep option as tabulated in the
 2 | comments preceding my pm1_bigstep_size() function, for various prime-pairing #memory buffers:
 3 | #buf	#modmul
 4 |   24 1.00000000000000000000
 5 |   40 0.94138773325629738116
 6 |   48 0.92740481217168044188
 7 |   72 0.86767843536398087761
 8 |   80 0.86074072083480994905
 9 |   96 0.83101581644392863201
10 |  120 0.80442105746400556719
11 |  144 0.78374955701528234292
12 |  160 0.77282735940798766528
13 |  168 0.76741692741880852926
14 |  192 0.75413606363080072042
15 |  200 0.74602321491854959665
16 |  216 0.74320938718907791436
17 |  240 0.72551239265493547959
18 |  280 0.70928837479334378021
19 |  320 0.69613179866071653512
20 |  336 0.69459499859756496969
21 |  360 0.68523479449708007988
22 |  384 0.68135948300814196113
23 |  400 0.67617411244897627841
24 |  432 0.67047815476500395537
25 |  440 0.66840613398769104328
26 |  480 0.66141411359107905758
27 |  520 0.65619459191939889563
28 |  528 0.65365901177877469167
29 |  560 0.65110495644613445796
30 |  576 0.64716694127520254129
31 |  600 0.64677560311704482025
32 |  624 0.64151017339247634195
33 |  672 0.63652914965409401857
34 |  720 0.63217516273564967452
35 |  768 0.62836031551149128950
36 |  816 0.62483771223376828414
37 |  864 0.62172716172343309377
38 |  912 0.61890717559662273489
39 |  960 0.61633464507196647144
40 | 1008 0.61408235120892139029
41 | 1040 0.61240950655001542398
42 | 1056 0.61196946106745180634
43 | 1104 0.60997358047541707745
44 | 1120 0.60748670765919877011
45 | 1200 0.60309241123090127025
46 | 1280 0.59918350848375218833
47 | 1360 0.59569673588145421035
48 | 1440 0.59258058682808385945
49 | 1520 0.58975556201254185610
50 | 1600 0.58720542566002623477
51 | 1680 0.58489434709511196806
52 | 1760 0.58283856209260102151
53 | 1824 0.58255863494084299501
54 | 1840 0.58092777935470073268
55 | 1920 0.57913848500066342734
56 | 2000 0.57749587247414732789
57 | 2080 0.57601449798704385170
58 | 2112 0.57561308245142284171
59 | 2160 0.57463221771166271689
60 | 2208 0.57365527195202720444
61 | 2240 0.57334175354205821477
62 | 2304 0.57187661482975670411
63 | 2400 0.57025303734956015046
64 | 


--------------------------------------------------------------------------------
/docs/pm1_d210.txt:
--------------------------------------------------------------------------------
 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 210, for various prime-pairing #memory buffers:
 2 | #buf	#modmul
 3 |   24	8482142
 4 |   72	7462541
 5 |  120	6903803
 6 |  168	6578211
 7 |  216	6369191
 8 |  264	6213292
 9 |  312	6102927
10 |  360	6019017
11 |  408	5948963
12 |  456	5893249
13 |  504	5848289
14 |  552	5809671
15 |  600	5779087
16 |  648	5750744
17 |  696	5727095
18 |  744	5706454
19 |  792	5687784
20 |  840	5671474
21 |  888	5657449
22 |  936	5643928
23 |  984	5631970
24 | 1032	5621610
25 | 1080	5610944
26 | 1128	5602213
27 | 1176	5594236
28 | 1224	5587383
29 | 1272	5579976
30 | 1320	5573772
31 | 1368	5568653
32 | 1416	5562780
33 | 1464	5557665
34 | 1512	5552816
35 | 1560	5548156
36 | 1608	5543827
37 | 1656	5539520
38 | 1704	5535999
39 | 1752	5532637
40 | 1800	5529301
41 | 1848	5526107
42 | 1896	5523275
43 | 1944	5520469
44 | 1992	5517760
45 | 2040	5515237
46 | 2088	5513037
47 | 2136	5510383
48 | 2184	5508344
49 | 2232	5506230
50 | 2280	5504316
51 | 2328	5502474
52 | 2376	5500748
53 | 


--------------------------------------------------------------------------------
/docs/pm1_d330.txt:
--------------------------------------------------------------------------------
 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 330, for various prime-pairing #memory buffers:
 2 | #buf	#modmul
 3 |   40	8047788
 4 |  120	7036479
 5 |  200	6488371
 6 |  280	6120697
 7 |  360	5927827
 8 |  440	5764103
 9 |  520	5645736
10 |  600	5550700
11 |  680	5481350
12 |  760	5426563
13 |  840	5378130
14 |  920	5339380
15 | 1000	5305896
16 | 1080	5275177
17 | 1160	5247276
18 | 1240	5225875
19 | 1320	5206908
20 | 1400	5188988
21 | 1480	5174005
22 | 1560	5159755
23 | 1640	5147646
24 | 1720	5134944
25 | 1800	5125084
26 | 1880	5115890
27 | 1960	5107361
28 | 2040	5099600
29 | 2120	5092087
30 | 2200	5085266
31 | 2280	5077999
32 | 2360	5072329
33 | 


--------------------------------------------------------------------------------
/docs/pm1_d420.txt:
--------------------------------------------------------------------------------
 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 420, for various prime-pairing #memory buffers:
 2 | #buf	#modmul
 3 |   48	7799105
 4 |  144	6767956
 5 |  240	6212962
 6 |  336	5887052
 7 |  432	5678594
 8 |  528	5520431
 9 |  624	5413006
10 |  720	5329443
11 |  816	5259481
12 |  912	5203978
13 | 1008	5159100
14 | 1104	5119750
15 | 1200	5088646
16 | 1296	5060438
17 | 1392	5036585
18 | 1488	5016366
19 | 1584	4997817
20 | 1680	4981221
21 | 1776	4966761
22 | 1872	4953372
23 | 1968	4941632
24 | 2064	4931285
25 | 2160	4921051
26 | 2256	4911881
27 | 2352	4904025
28 | 


--------------------------------------------------------------------------------
/docs/pm1_d660.txt:
--------------------------------------------------------------------------------
 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 660, for various prime-pairing #memory buffers:
 2 | #buf	#modmul
 3 |   80	7599932
 4 |  240	6585540
 5 |  400	6054425
 6 |  560	5683123
 7 |  720	5486768
 8 |  880	5324522
 9 | 1040	5204859
10 | 1200	5109664
11 | 1360	5038998
12 | 1520	4984967
13 | 1680	4937214
14 | 1840	4898839
15 | 2000	4865619
16 | 2160	4834781
17 | 2320	4807728
18 | 


--------------------------------------------------------------------------------
/docs/pm1_d840.txt:
--------------------------------------------------------------------------------
 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 840, for various prime-pairing #memory buffers:
 2 | #buf	#modmul
 3 |   96 7451024
 4 |  288 6422689
 5 |  480 5869480
 6 |  672 5542247
 7 |  864 5332830
 8 | 1056 5174753
 9 | 1248 5067407
10 | 1440 4984033
11 | 1632 4914157
12 | 1824 4858498
13 | 2016 4813486
14 | 2208 4773984
15 | 2400 4743455
16 | 


--------------------------------------------------------------------------------
/docs/predefs_mac.txt:
--------------------------------------------------------------------------------
  1 | #define __DBL_MIN_EXP__ (-1021)
  2 | #define __FLT_MIN__ 1.17549435e-38F
  3 | #define __DEC64_DEN__ 0.000000000000001E-383DD
  4 | #define TRUE 1
  5 | #define __CHAR_BIT__ 8
  6 | #define BIT_CLR(x,b) ( (x) &= ~(1 << (b)) )
  7 | #define CPU_NAME "x86_64"
  8 | #define ALIGN_VEC_U64(_p) ALIGN_UINT64(_p)
  9 | #define __WCHAR_MAX__ 2147483647
 10 | #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
 11 | #define __FLT_EVAL_METHOD__ 0
 12 | #define STRNEQN(s1,s2,n) ( strncmp(s1,s2,n))
 13 | #define ALIGN_f128(_p) (__float128 *)(((long)(_p) | 127)+1)
 14 | #define __DBL_MIN_10_EXP__ (-307)
 15 | #define __FINITE_MATH_ONLY__ 0
 16 | #define ALLOC_COMPLEX(_p,_n) (struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512)
 17 | #define L2_SZ_VD 3
 18 | #define ALIGN_COMPLEX(_p) (struct complex*)(((long)(_p) | 127)+1)
 19 | #define __DEC64_MAX_EXP__ 384
 20 | #define __SHRT_MAX__ 32767
 21 | #define __LDBL_MAX__ 1.18973149535723176502e+4932L
 22 | #define __APPLE_CC__ 5666
 23 | #define __UINTMAX_TYPE__ long unsigned int
 24 | #define __DEC32_EPSILON__ 1E-6DF
 25 | #define __block __attribute__((__blocks__(byref)))
 26 | #define ALLOC_INT64(_p,_n) (int64 *)realloc(_p,(_n)*sizeof(int64 )+256)
 27 | #define ALIGN_UINT64(_p) (uint64 *)(((long)(_p) | 63)+1)
 28 | #define STREQ(s1,s2) (!strcmp(s1,s2))
 29 | #define __SCHAR_MAX__ 127
 30 | #define HERE __LINE__, __FILE__
 31 | #define align_h_included 
 32 | #define __USER_LABEL_PREFIX__ _
 33 | #define __STDC_HOSTED__ 1
 34 | #define ALLOC_UINT128(_p,_n) (uint128 *)realloc(_p,(_n+_n)*sizeof(uint64 )+256)
 35 | #define __DEC64_MIN_EXP__ (-383)
 36 | #define BIT_SETC(x,b,condition) ( (x) |= ((condition) << (b)) )
 37 | #define __DBL_DIG__ 15
 38 | #define __FLT_EPSILON__ 1.19209290e-7F
 39 | #define ALLOC_POINTER(_p,_ptr_type,_n) (_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64)
 40 | #define __LDBL_MIN__ 3.36210314311209350626e-4932L
 41 | #define __DEC32_MAX__ 9.999999E96DF
 42 | #define OS_POSIX_COMPLIANT 
 43 | #define __strong 
 44 | #define COMPILER_NAME "Gnu C [or other compatible]"
 45 | #define __APPLE__ 1
 46 | #define __DECIMAL_DIG__ 21
 47 | #define SZ_VDM1 7
 48 | #define __LDBL_HAS_QUIET_NAN__ 1
 49 | #define ALLOC_DOUBLE(_p,_n) (double *)realloc(_p,(_n)*sizeof(double )+512)
 50 | #define __DYNAMIC__ 1
 51 | #define __GNUC__ 4
 52 | #define __MMX__ 1
 53 | #define __FLT_HAS_DENORM__ 1
 54 | #define ALLOC_VEC_DBL(_p,_n) ALLOC_DOUBLE(_p,_n)
 55 | #define __DBL_MAX__ 1.7976931348623157e+308
 56 | #define __DBL_HAS_INFINITY__ 1
 57 | #define ALLOC_FLOAT(_p,_n) (float *)realloc(_p,(_n)*sizeof(float )+256)
 58 | #define __DEC32_MIN_EXP__ (-95)
 59 | #define ALIGN_UINT128(_p) (uint128 *)(((long)(_p) | 63)+1)
 60 | #define OBJC_NEW_PROPERTIES 1
 61 | #define __LDBL_HAS_DENORM__ 1
 62 | #define __DEC32_MIN__ 1E-95DF
 63 | #define __weak __attribute__((objc_gc(weak)))
 64 | #define ALLOC_f128(_p,_n) (__float128 *)realloc(_p,(_n)*sizeof(__float128 )+512)
 65 | #define __DBL_MAX_EXP__ 1024
 66 | #define __DEC128_EPSILON__ 1E-33DL
 67 | #define __SSE2_MATH__ 1
 68 | #define STRNEQ(s1,s2) ( strcmp(s1,s2))
 69 | #define __amd64 1
 70 | #define __tune_core2__ 1
 71 | #define __LONG_LONG_MAX__ 9223372036854775807LL
 72 | #define IS_ODD(a) ( (int)(a) & 1)
 73 | #define NINT(x) floor(x + 0.5)
 74 | #define BIT_SET(x,b) ( (x) |= (1 << (b)) )
 75 | #define platform_h_included 
 76 | #define FP_MANTISSA_BITS_DOUBLE 64
 77 | #define __GXX_ABI_VERSION 1002
 78 | #define COMPILER_TYPE_GCC 
 79 | #define ALIGN_INT(_p) (int *)(((long)(_p) | 63)+1)
 80 | #define __FLT_MIN_EXP__ (-125)
 81 | #define DNINT(x) lrint((x))
 82 | #define __x86_64 1
 83 | #define CPU_SUBTYPE_NAME "Unknown CPU subtype"
 84 | #define __DBL_MIN__ 2.2250738585072014e-308
 85 | #define COMPILER_VERSION __VERSION__
 86 | #define ALIGN_VEC_DBL(_p) ALIGN_DOUBLE(_p)
 87 | #define __LP64__ 1
 88 | #define __DBL_HAS_QUIET_NAN__ 1
 89 | #define ALLOC_INT(_p,_n) (int *)realloc(_p,(_n)*sizeof(int )+256)
 90 | #define __DEC128_MIN__ 1E-6143DL
 91 | #define __REGISTER_PREFIX__ 
 92 | #define __DBL_HAS_DENORM__ 1
 93 | #define __NO_INLINE__ 1
 94 | #define __DEC_EVAL_METHOD__ 2
 95 | #define types_h_included 
 96 | #define __DEC128_MAX__ 9.999999999999999999999999999999999E6144DL
 97 | #define __FLT_MANT_DIG__ 24
 98 | #define __VERSION__ "4.2.1 (Apple Inc. build 5666) (dot 3)"
 99 | #define MOD_ADD32(__x,__y,__q,__z) { uint64 _xx = __x, _yy = __y, _qq = __q, _zz = __z; MOD_ADD64(_xx, _yy, _qq, _zz); __z = (uint32)_zz; }
100 | #define ALLOC_QFLOAT(_p,_n) ALLOC_UINT128(_p,_n)
101 | #define ARRAYS_DISJOINT(xarr,lenx,yarr,leny) ((yarr+leny <= xarr) || (yarr >= xarr+lenx))
102 | #define MOD_ADD64(__x,__y,__q,__z) { uint64 cy,tmp; tmp = __x + __y; cy = tmp < __x; __z = tmp - __q; cy -= __z > tmp; __z = __z + (cy & __q); }
103 | #define IS_EVEN(a) (~(int)(a) & 1)
104 | #define HACK_ALIGN_STACK_ODD() 
105 | #define __DEC64_EPSILON__ 1E-15DD
106 | #define __DEC128_MIN_EXP__ (-6143)
107 | #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ 1068
108 | #define __SIZE_TYPE__ long unsigned int
109 | #define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((long)(_p) | 63)+1)
110 | #define __DEC32_DEN__ 0.000001E-95DF
111 | #define CMUL(ar,ai,br,bi,cr,ci) { double __tmp = ar; ci = __tmp*bi + ai*br; cr = __tmp*br - ai*bi;}
112 | #define __FLT_RADIX__ 2
113 | #define __LDBL_EPSILON__ 1.08420217248550443401e-19L
114 | #define SGN(x,b) ((b) == 1 ? -(x) : (x))
115 | #define __SSE_MATH__ 1
116 | #define __k8 1
117 | #define __LDBL_DIG__ 18
118 | #define __x86_64__ 1
119 | #define OS_VERSION "[Unknown]"
120 | #define HACK_ALIGN_STACK_EVEN() 
121 | #define ABS(a) ((a) < 0 ? -(a) : (a))
122 | #define X32_ASM 
123 | #define __FLT_HAS_QUIET_NAN__ 1
124 | #define __FLT_MAX_10_EXP__ 38
125 | #define __LONG_MAX__ 9223372036854775807L
126 | #define __FLT_HAS_INFINITY__ 1
127 | #define __DEC64_MAX__ 9.999999999999999E384DD
128 | #define ALIGN_UINT(_p) (uint *)(((long)(_p) | 63)+1)
129 | #define __DEC64_MANT_DIG__ 16
130 | #define OS_TYPE 
131 | #define __DEC32_MAX_EXP__ 96
132 | #define __DEC128_DEN__ 0.000000000000000000000000000000001E-6143DL
133 | #define MOD_SUB32(__x,__y,__q,__z) { uint64 _xx = __x, _yy = __y, _qq = __q, _zz = __z; MOD_SUB64(_xx, _yy, _qq, _zz); __z = (uint32)_zz; }
134 | #define OS_TYPE_MACOSX 
135 | #define ALLOC_UINT64(_p,_n) (uint64 *)realloc(_p,(_n)*sizeof(uint64 )+256)
136 | #define __LITTLE_ENDIAN__ 1
137 | #define CPU_IS_X86_64 
138 | #define RE_IM_STRIDE 1
139 | #define MOD_SUB64(__x,__y,__q,__z) { uint64 bw,tmp; tmp = __x - __y; bw = tmp > __x; __z = tmp + __q; bw -= __z < tmp; __z = __z - (bw & __q); }
140 | #define __LDBL_MANT_DIG__ 64
141 | #define __CONSTANT_CFSTRINGS__ 1
142 | #define ALIGN_DOUBLE(_p) (double *)(((long)(_p) | 127)+1)
143 | #define ALLOC_UINT(_p,_n) (uint *)realloc(_p,(_n)*sizeof(uint )+256)
144 | #define __DEC32_MANT_DIG__ 7
145 | #define __k8__ 1
146 | #define __WCHAR_TYPE__ int
147 | #define FALSE 0
148 | #define __pic__ 2
149 | #define MULH64_FAST 
150 | #define __FLT_DIG__ 6
151 | #define __INT_MAX__ 2147483647
152 | #define ALIGN_INT64(_p) (int64 *)(((long)(_p) | 63)+1)
153 | #define __FLT_MAX_EXP__ 128
154 | #define __BLOCKS__ 1
155 | #define __DBL_MANT_DIG__ 53
156 | #define CPU_TYPE 
157 | #define __DEC64_MIN__ 1E-383DD
158 | #define __WINT_TYPE__ int
159 | #define __SSE__ 1
160 | #define __LDBL_MIN_EXP__ (-16381)
161 | #define __MACH__ 1
162 | #define X64_ASM 
163 | #define __amd64__ 1
164 | #define __LDBL_MAX_EXP__ 16384
165 | #define __SSP__ 1
166 | #define ARRAYS_OVERLAP(xarr,lenx,yarr,leny) !ARRAYS_DISJOINT(xarr,lenx,yarr,leny)
167 | #define __LDBL_MAX_10_EXP__ 4932
168 | #define __DBL_EPSILON__ 2.2204460492503131e-16
169 | #define _LP64 1
170 | #define __GNUC_PATCHLEVEL__ 1
171 | #define __LDBL_HAS_INFINITY__ 1
172 | #define __INTMAX_MAX__ 9223372036854775807L
173 | #define __FLT_DENORM_MIN__ 1.40129846e-45F
174 | #define __PIC__ 2
175 | #define OS_BITS 64
176 | #define __FLT_MAX__ 3.40282347e+38F
177 | #define __SSE2__ 1
178 | #define BIT_FLIP(x,b) ( (x) ^= (1 << (b)) )
179 | #define __FLT_MIN_10_EXP__ (-37)
180 | #define __INTMAX_TYPE__ long int
181 | #define __DEC128_MAX_EXP__ 6144
182 | #define ALLOC_VEC_U64(_p,_n) ALLOC_UINT64(_p,_n)
183 | #define __GNUC_MINOR__ 2
184 | #define __DBL_MAX_10_EXP__ 308
185 | #define SZ_VD 8
186 | #define __LDBL_DENORM_MIN__ 3.64519953188247460253e-4951L
187 | #define MAX(a,b) ((a) > (b) ? (a) : (b))
188 | #define __STDC__ 1
189 | #define __PTRDIFF_TYPE__ long int
190 | #define ALIGN_FLOAT(_p) (float *)(((long)(_p) | 63)+1)
191 | #define ALIGN_QFLOAT(_p) ALIGN_UINT128(_p)
192 | #define STREQN(s1,s2,n) (!strncmp(s1,s2,n))
193 | #define OS_NAME "OS X"
194 | #define __DEC128_MANT_DIG__ 34
195 | #define __LDBL_MIN_10_EXP__ (-4931)
196 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
197 | #define BIT_TEST(x,b) ( ((x) >> (b)) & 1 )
198 | #define __GNUC_GNU_INLINE__ 1
199 | #define COMPILER_TYPE 
200 | #define __SSE3__ 1
201 | 


--------------------------------------------------------------------------------
/docs/qs.txt:
--------------------------------------------------------------------------------
 1 | A p-1 run found the following 53-digit composite factor of M(109228331), which factors into p25*p29:
 2 | q = 67043584777242522312784510096836476580550779917618449 = 3258278300321182416433937 * 20576383782390150543028926977
 3 | The prime factors themselves have p-1 factorizations
 4 | p25-1 = p*2^4.11.113.305611.4907867
 5 | p29-1 = p*2^9.577.20929.28687.1062073, which is why a p-1 run to B1 = 10^6 and B2 > 10^6 found them both in stage 2.
 6 | 
 7 | For such composite factors of M(p) = q1.q2
 8 | = (2.k1.p+1).(2.k2.p+1) = 4.k1.k2.p^2 + 2.(k1+k2).p + 1 = (2.k1.k2.p + k1 + k2).2.p + 1 = 2.p.F + 1,
 9 | where F := (2.k1.k2.p + k1 + k2) = 306896499120006339347913821446363016297258904 ... Need to find k1,k2!
10 | Seems we could do better than e.g. ECM or QS on (n-1), since we have that k1,k2 must satisfy F == k1 + k2 (mod p).
11 | 
12 | If k1,k2 < p, things are easy: F/2p = k1.k2, i.e. quotient Q = k1.k2, remainder R = k1 + k2, can just brute-force loop over all k1 <= sqrt(Q) which divide Q, compute k2 = Q/k1, see if R = k1+k2. Will this work in the general case where k1,k2 may be quite a bit larger than 2p? For the above example, k1 = 14914987121432728, k2 = 94189774731567355648. The true 2.k1.k2.p = 306896499120006339347913727241673297608470528,
13 | whereas   q/2p = 306896499120006339347913821446363016297258904; difference too large to make the above idea workable.
14 | 
15 | Know F == (k1+k2) mod 2p; in our case (k1+k2) == 98326026 mod 2p, useless because requires a priori knowledge of k1,k2.
16 | 
17 | 5/23/21: Can at least do p-1 with S1 seed = p on n ... wait:
18 | [We open our next scene with a hand slapping the owner's forehead, accompanied by the utterance "doh!"]
19 | 
20 | Re above: In fact it seems silly to use powerful general-modulus factoring machinery like ECM or QS on such (p-1)-found factor-product composites. Here's why: say we have some product of prime factors F = f1*f2*...*fn discovered by running p-1 to stage bounds b1 and b2 on an input Mersenne M(p) (or other bignum modulus with factors of a known form, allowing p-1 to be 'seeded' with a component of same). BY DEFINITION, each prime factor f1-fn will be b1/b2-smooth, in the sense that fj = 2*p*C + 1, where C is a composite all of whose prime factors are <= b1, save possibly one outlier-prime factor > b1 and <= b2. Thus if we again run p-1 to bounds b1/b2, but now with arithmetic modulo the relatively tiny factor product F, we are guaranteed to resolve all the prime factors f1-fn - the only trick is that we will need to do multiple GCDs along the way in order to capture the individual prime factors f1,...,fn, rather than have this secondary p-1 run modulo F again produce the same composite GCD = F which the original p-1 run mod M(p) did. Again, though, since in the followup p-1 run we are working mod F, all the arithmetic is trivially cheap, including the needed GCDs.
21 | 
22 | ====================================
23 | 
24 | Use above example composite to work through the basics of ECM:
25 | [to-do!]
26 | 
27 | ====================================
28 | 
29 | Use above example composite to work through the basics of ECM and the Quadratic Sieve factorization algorithm.
30 | Wikipedia:
31 | 
32 | "The algorithm attempts to set up a congruence of squares modulo n (the integer to be factorized), which often leads to a factorization of n. The algorithm works in two phases: the data collection phase, where it collects information that may lead to a congruence of squares; and the data processing phase, where it puts all the data it has collected into a matrix and solves it to obtain a congruence of squares. The data collection phase can be easily parallelized to many processors, but the data processing phase requires large amounts of memory, and is difficult to parallelize efficiently over many nodes or if the processing nodes do not each have enough memory to store the whole matrix. The block Wiedemann algorithm can be used in the case of a few systems each capable of holding the matrix.
33 | 
34 | "The naive approach to finding a congruence of squares is to pick a random number, square it, and hope the least non-negative remainder modulo n is a perfect square (in the integers). For example, 80^2 mod 5959 is 441, which is 21^2. This approach finds a congruence of squares only rarely for large n, but when it does find one, more often than not, the congruence is nontrivial and the factorization is complete. This is roughly the basis of Fermat's factorization method."
35 | 
36 | 


--------------------------------------------------------------------------------
/src/align.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /****************************************************************************
 24 |  * We now include this header file if it was not included before.
 25 |  ****************************************************************************/
 26 | #ifndef align_h_included
 27 | #define align_h_included
 28 | 
 29 | #include "types.h"
 30 | 
 31 | /* These are for basic memory allocation, and to force alignment of array data on desired-byte boundaries.
 32 | We use the normally-not-recommended immediate-overwrite-of-pointer form of realloc() because if the returned
 33 | pointer is null we exit immediately, thus the resulting memory leak is never an issue.
 34 | 
 35 | In the Align macros we cast pointers to intptr_t (formerly long) to accommodate architectures which use 64-bit address arithmetic.
 36 | Note that rather than simply assuming sizeof(void *) <= sizeof(long), we check this at program invocation, in
 37 | util.c::check_nbits_in_types().
 38 | */
 39 | 
 40 | #define ALLOC_INT(_p,_n)	(int           *)realloc(_p,(_n)*sizeof(int           )+256)	// room for _n elements plus 256 slack bytes
 41 | #define ALIGN_INT(_p)		(int           *)(((intptr_t)(_p) | 63)+1)	// first 64-byte boundary strictly above _p
 42 | // Same pattern for every type below: ALLOC_* over-allocates via realloc(), ALIGN_* rounds up and never returns _p itself, so keep the ALLOC_* result for any later realloc()/free().
 43 | #define ALLOC_UINT(_p,_n)	(uint          *)realloc(_p,(_n)*sizeof(uint          )+256)
 44 | #define ALIGN_UINT(_p)		(uint          *)(((intptr_t)(_p) | 63)+1)
 45 | 
 46 | #define ALLOC_INT64(_p,_n)	(int64         *)realloc(_p,(_n)*sizeof(int64         )+256)
 47 | #define ALIGN_INT64(_p)		(int64         *)(((intptr_t)(_p) | 63)+1)
 48 | 
 49 | #define ALLOC_UINT64(_p,_n)	(uint64        *)realloc(_p,(_n)*sizeof(uint64        )+256)
 50 | #define ALIGN_UINT64(_p)	(uint64        *)(((intptr_t)(_p) | 63)+1)
 51 | // uint128 storage is sized as 2 uint64 words per element; _n is parenthesized and expanded only once, like the sibling macros.
 52 | #define ALLOC_UINT128(_p,_n)(uint128       *)realloc(_p,2*(_n)*sizeof(uint64     )+256)
 53 | #define ALIGN_UINT128(_p)	(uint128       *)(((intptr_t)(_p) | 63)+1)
 54 | 
 55 | #define ALLOC_FLOAT(_p,_n)	(float         *)realloc(_p,(_n)*sizeof(float         )+256)
 56 | #define ALIGN_FLOAT(_p)		(float         *)(((intptr_t)(_p) | 63)+1)
 57 | // The remaining types use 512 slack bytes and round up to a 128-byte boundary instead of 64.
 58 | #define ALLOC_DOUBLE(_p,_n)	(double        *)realloc(_p,(_n)*sizeof(double        )+512)
 59 | #define ALIGN_DOUBLE(_p)	(double        *)(((intptr_t)(_p) | 127)+1)
 60 | 
 61 | #define ALLOC_f128(_p,_n)	(__float128    *)realloc(_p,(_n)*sizeof(__float128    )+512)
 62 | #define ALIGN_f128(_p)		(__float128    *)(((intptr_t)(_p) | 127)+1)
 63 | 
 64 | #define ALLOC_COMPLEX(_p,_n)(struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512)
 65 | #define ALIGN_COMPLEX(_p)	(struct complex*)(((intptr_t)(_p) | 127)+1)
 66 | 
 67 | // Vector-double|uint64-alloc used by SIMD builds; register size difference between YMM and XMM taken care of by def of vec_dbl in types.h:
 68 | #ifdef USE_SSE2
 69 | 	// SIMD build: _n counts vec_dbl / vec_u64 elements; 512 slack bytes, and the ALIGN_* forms round up to the next 128-byte boundary.
 70 | 	#define ALLOC_VEC_DBL(_p,_n)(vec_dbl*)realloc(_p,(_n)*sizeof(vec_dbl)+512)
 71 | 	#define ALIGN_VEC_DBL(_p)	(vec_dbl*)(((intptr_t)(_p) | 127)+1)
 72 | 
 73 | 	#define ALLOC_VEC_U64(_p,_n)(vec_u64*)realloc(_p,(_n)*sizeof(vec_u64)+512)
 74 | 	#define ALIGN_VEC_U64(_p)	(vec_u64*)(((intptr_t)(_p) | 127)+1)
 75 | 
 76 | #else	// In scalar-mode simply use the above double|uint64 macros:
 77 | 	// Note: the U64 variants then inherit ALIGN_UINT64's 64-byte rounding, not the 128 bytes used in the SIMD branch.
 78 | 	#define ALLOC_VEC_DBL(_p,_n)	ALLOC_DOUBLE(_p,_n)
 79 | 	#define ALIGN_VEC_DBL(_p)		ALIGN_DOUBLE(_p)
 80 | 
 81 | 	#define ALLOC_VEC_U64(_p,_n)	ALLOC_UINT64(_p,_n)
 82 | 	#define ALIGN_VEC_U64(_p)		ALIGN_UINT64(_p)
 83 | 
 84 | #endif	// USE_SSE2
 85 | 
 86 | #define ALLOC_POINTER(_p,_ptr_type,_n)(_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64)	// generic form: element type is a macro argument; only 64 slack bytes
 87 | #define ALIGN_POINTER(_p,_ptr_type)	  (_ptr_type*)(((intptr_t)(_p) | 63)+1)	// advances by 1..64 bytes, i.e. at most the slack reserved above
 88 | // QFLOAT allocation and alignment simply forward to the UINT128 macros.
 89 | #define ALLOC_QFLOAT(_p,_n)	ALLOC_UINT128(_p,_n)
 90 | #define ALIGN_QFLOAT(_p)	ALIGN_UINT128(_p)
 91 | 
 92 | /*
 93 |  On the x86 family, alignment of the stack is very important
 94 |  This uses the GNU gcc  __builtin_alloca function to align doubles properly
 95 |  This is taken from GNU/FFTW package
 96 | */
 97 | #ifdef COMPILER_TYPE_GCC	/* real implementations exist only for GCC-type compilers ... */
 98 | #	if (defined(__i386))	/* ... and only when targeting 32-bit x86 */
 99 | #		define HACK_ALIGN_STACK_EVEN(){	/* if the address from __builtin_alloca(0) has any of its low 3 bits set, take 4 more bytes */	\
100 | 		if( (((uint64) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4);\
101 | 		}
102 | /* ODD variant: inverted test (note the '!') - takes 4 more bytes when the probed address IS a multiple of 8. */
103 | #		define HACK_ALIGN_STACK_ODD() {					\
104 | 		if(!(((uint64) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4);\
105 | 		}
106 | #	else	/* GCC on any other target: both macros expand to nothing */
107 | #		define HACK_ALIGN_STACK_EVEN() /* no-op */
108 | #		define HACK_ALIGN_STACK_ODD() /* no-op */
109 | #	endif
110 | #else	/* non-GCC compilers: both macros expand to nothing */
111 | #	define HACK_ALIGN_STACK_EVEN() /* no-op */
112 | #	define HACK_ALIGN_STACK_ODD() /* no-op */
113 | #endif
114 | 
115 | 
116 | #endif	/* align_h_included */
117 | 


--------------------------------------------------------------------------------
/src/f2psp.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2012 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /*******************************************************************************
 24 |    We now include this header file if it was not included before.
 25 | *******************************************************************************/
 26 | #ifndef f2psp_h_included
 27 | #define f2psp_h_included
 28 | 
 29 | #ifdef __cplusplus
 30 | extern "C" {
 31 | #endif
 32 | 
 33 | 	#define MI64_IS_DIV_BY_SCALAR32P_X8_SSE2(\
 34 | 		array_64x8inputs,	/* pointer variable addressing eight 64-bit inputs; must be 64-byte aligned */\
 35 | 		q,		/* 32-bit modulus (loaded via movd) */\
 36 | 		qinv,	/* must satisfy qinv == qinv*(2 - q*qinv) in uint32 arithmetic */\
 37 | 		retval	/* receives an 8-bit mask: bit j is set iff the final carry word for input j is zero */\
 38 | 	)\
 39 | 	{	/* 8-way SSE2 kernel in MSVC-style 32-bit inline asm; overwrites eax, ebx, ecx and xmm0-7 */\
 40 | 		DBG_ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\
 41 | 		DBG_ASSERT(((uint32)&(array_64x8inputs)[0] & 0x3f) == 0, "A-array not 64-byte aligned!");	/* check the macro argument itself, not an unrelated caller-scope name */\
 42 | 		__asm	mov	eax, array_64x8inputs	/* Assumes inputs a,b,c,d,... are 64-bit separated and &array_64x8inputs[0] is 64-byte aligned */\
 43 | 		__asm	lea	ebx, q\
 44 | 		__asm	lea	ecx, qinv\
 45 | 		__asm	movaps	xmm0,[eax     ]	/* ab: d3210 = [bhi|blo|ahi|alo] */\
 46 | 		__asm	movaps	xmm1,[eax+0x10]	/* cd: d3210 = [dhi|dlo|chi|clo] */\
 47 | 		__asm	movaps	xmm2,[eax+0x20]	/* ef: d3210 = [fhi|flo|ehi|elo] */\
 48 | 		__asm	movaps	xmm3,[eax+0x30]	/* gh: d3210 = [hhi|hlo|ghi|glo] */\
 49 | 		__asm	movaps	xmm6,xmm0	/* Circularly-permute [4,6,7] -> [6,7,4] here so the 2 packed outputs end up in xmm6,7 */\
 50 | 		__asm	movaps	xmm5,xmm1\
 51 | 		__asm	movaps	xmm7,xmm2\
 52 | 		__asm	movaps	xmm4,xmm3\
 53 | 		__asm	psrlq	xmm6, 32		/* d3210 = [  0|bhi|  0|ahi] */\
 54 | 		__asm	psrlq	xmm5, 32		/* d3210 = [  0|dhi|  0|chi] */\
 55 | 		__asm	psrlq	xmm7, 32		/* d3210 = [  0|fhi|  0|ehi] */\
 56 | 		__asm	psrlq	xmm4, 32		/* d3210 = [  0|hhi|  0|ghi] */\
 57 | 		__asm	psllq	xmm5, 32		/* d3210 = [dhi|  0|chi|  0] */\
 58 | 		__asm	psllq	xmm4, 32		/* d3210 = [hhi|  0|ghi|  0] */\
 59 | 		__asm	paddd	xmm6,xmm5		/* d3210 = [dhi|bhi|chi|ahi], xmm5 FREE */\
 60 | 		__asm	paddd	xmm7,xmm4		/* d3210 = [hhi|fhi|ghi|ehi], xmm4 FREE */\
 61 | 		__asm	movd	xmm4,[ebx]\
 62 | 		__asm	movd	xmm5,[ecx]\
 63 | 		__asm	pshufd	xmm4,xmm4,0x44	/* Broadcast q    to slots 0,2 of xmm4 */\
 64 | 		__asm	pshufd	xmm5,xmm5,0x44	/* Broadcast qinv to slots 0,2 of xmm5 */\
 65 | 		/* (a-h)[0]*qinv; Alas SSE2 has no 32-bit low-half packed MUL, so use 32x32->64 -bit and discard high halves */\
 66 | 		__asm	pmuludq	xmm0,xmm5\
 67 | 		__asm	pmuludq	xmm1,xmm5\
 68 | 		__asm	pmuludq	xmm2,xmm5\
 69 | 		__asm	pmuludq	xmm3,xmm5\
 70 | 		/* cy[0-7] = MULH32(tmp[0-7]*q) - high halves of above MULQs automatically get overwritten: */\
 71 | 		__asm	pmuludq	xmm0,xmm4\
 72 | 		__asm	pmuludq	xmm1,xmm4\
 73 | 		__asm	pmuludq	xmm2,xmm4\
 74 | 		__asm	pmuludq	xmm3,xmm4\
 75 | 		__asm	psrlq	xmm0, 32		/* d3210 = [  0|cy1|  0|cy0] */\
 76 | 		__asm	psrlq	xmm1, 32		/* d3210 = [  0|cy3|  0|cy2] */\
 77 | 		__asm	psrlq	xmm2, 32		/* d3210 = [  0|cy5|  0|cy4] */\
 78 | 		__asm	psrlq	xmm3, 32		/* d3210 = [  0|cy7|  0|cy6] */\
 79 | 		__asm	psllq	xmm1, 32		/* d3210 = [cy3|  0|cy2|  0] */\
 80 | 		__asm	psllq	xmm3, 32		/* d3210 = [cy7|  0|cy6|  0] */\
 81 | 		__asm	paddd	xmm0,xmm1		/* d3210 = [cy3|cy1|cy2|cy0], xmm1 FREE */\
 82 | 		__asm	paddd	xmm2,xmm3		/* d3210 = [cy7|cy5|cy6|cy4], xmm3 FREE */\
 83 | 		__asm	movaps	xmm3,xmm6		/* Copy of acbd[1] */\
 84 | 		__asm	movaps	xmm1,xmm7		/* Copy of efgh[1] */\
 85 | 		__asm	psubd	xmm6,xmm0		/* acbd[1] - cy0213, xmm0 FREE */\
 86 | 		__asm	psubd	xmm7,xmm2		/* egfh[1] - cy4657, xmm2 FREE */\
 87 | 		__asm	movaps	xmm2,xmm6		/* Copy of acbd[1] - cy0213 */\
 88 | 		__asm	movaps	xmm0,xmm7		/* Copy of efgh[1] - cy4657 */\
 89 | 		/* Had a borrow? Frickin' SSE2 only gives us signed packed-integer compares,\
 90 | 		so need to emulate unsigned (x > y) via signed (x ^ 0x80000000) < (y ^ 0x80000000): */\
 91 | 		__asm	pcmpeqd	xmm4,xmm4		/* All 1s  - will need to restore q to this register later */\
 92 | 		__asm	pslld	xmm4, 31		/* 4-way 0x80000000 */\
 93 | 		__asm	pxor	xmm6,xmm4		/* (acbd[1]-cy0213) ^ 0x80000000 */\
 94 | 		__asm	pxor	xmm7,xmm4		/* (egfh[1]-cy4657) ^ 0x80000000 */\
 95 | 		__asm	pxor	xmm3,xmm4		/* (acbd[1]) ^ 0x80000000 */\
 96 | 		__asm	pxor	xmm1,xmm4		/* (egfh[1]) ^ 0x80000000 */\
 97 | 		__asm	pcmpgtd	xmm6,xmm3		/* cy0213 = (acbd[1]-cy0213) > abcd[1], xmm3 FREE */\
 98 | 		__asm	pcmpgtd	xmm7,xmm1		/* cy4657 = (egfh[1]-cy4657) > efgh[1], xmm1 FREE */\
 99 | 		__asm	pshufd	xmm3,xmm2,0x31	/* xmm2 = [----|tmp1|----|tmp0], xmm3 = [----|tmp3|----|tmp2], don't care what's in ---- slots */\
100 | 		__asm	pshufd	xmm1,xmm0,0x31	/* xmm0 = [----|tmp5|----|tmp4], xmm1 = [----|tmp7|----|tmp6], don't care what's in ---- slots */\
101 | 		__asm	movd	xmm4,[ebx]		/* Restore q to xmm4 */\
102 | 		__asm	pshufd	xmm4,xmm4,0x44	/* Broadcast q    to slots 0,2 of xmm4 */\
103 | 		/* tmp[0-7]*qinv; Alas SSE2 has no 32-bit low-half packed MUL, so use 32x32->64 -bit and discard high halves */\
104 | 		__asm	pmuludq	xmm3,xmm5\
105 | 		__asm	pmuludq	xmm1,xmm5\
106 | 		__asm	pmuludq	xmm2,xmm5\
107 | 		__asm	pmuludq	xmm0,xmm5\
108 | 		/* Add carries 01/45, scatter carries 23/67 into slots of 01/45, add those...Since SSE2 compare result is ~()ed, add really means sub: */\
109 | 		__asm	psubd	xmm2,xmm6		/* xmm6 = [----|tmp1|----|tmp0], don't care what's in ---- slots */\
110 | 		__asm	psubd	xmm0,xmm7		/* xmm7 = [----|tmp5|----|tmp4], don't care what's in ---- slots */\
111 | 		__asm	pshufd	xmm6,xmm6,0x31\
112 | 		__asm	pshufd	xmm7,xmm7,0x31\
113 | 		__asm	psubd	xmm3,xmm6		/* xmm3 = [----|tmp3|----|tmp2], don't care what's in ---- slots */\
114 | 		__asm	psubd	xmm1,xmm7		/* xmm1 = [----|tmp7|----|tmp6], don't care what's in ---- slots */\
115 | 		/* cy[0-7] = MULH32(tmp[0-7]*q) - high halves of above MULQs automatically get overwritten: */\
116 | 		__asm	pmuludq	xmm2,xmm4\
117 | 		__asm	pmuludq	xmm0,xmm4\
118 | 		__asm	pmuludq	xmm3,xmm4\
119 | 		__asm	pmuludq	xmm1,xmm4\
120 | 		__asm	psrlq	xmm2, 32		/* d3210 = [  0|cy1|  0|cy0] */\
121 | 		__asm	psrlq	xmm0, 32		/* d3210 = [  0|cy5|  0|cy4] */\
122 | 		__asm	psrlq	xmm3, 32		/* d3210 = [  0|cy3|  0|cy2] */\
123 | 		__asm	psrlq	xmm1, 32		/* d3210 = [  0|cy7|  0|cy6] */\
124 | 		__asm	pshufd	xmm2,xmm2,0x58	/* [  0|  0|cy1|cy0] */\
125 | 		__asm	pshufd	xmm0,xmm0,0x58	/* [  0|  0|cy5|cy4] */\
126 | 		__asm	pshufd	xmm3,xmm3,0x85	/* [cy3|cy2|  0|  0] */\
127 | 		__asm	pshufd	xmm1,xmm1,0x85	/* [cy7|cy6|  0|  0] */\
128 | 		__asm	paddd	xmm2,xmm3		/* d3210 = [cy3|cy2|cy1|cy0] - in-order, so mask bit j below corresponds to input j */\
129 | 		__asm	paddd	xmm0,xmm1		/* d3210 = [cy7|cy6|cy5|cy4] */\
130 | 		__asm	pcmpgtd	xmm7,xmm7		/* All 0s */\
131 | 		__asm	pcmpeqd	xmm2,xmm7		/* retval[0-3] */\
132 | 		__asm	pcmpeqd	xmm0,xmm7		/* retval[4-7] */\
133 | 		__asm	movmskps eax,xmm2		/* retval[0-3] */\
134 | 		__asm	movmskps ebx,xmm0		/* retval[4-7] */\
135 | 		__asm	shl		 ebx, 4		/* retval[4-7] << 4 */\
136 | 		__asm	add		 eax,ebx	/* retval[0-7] */\
137 | 		__asm	mov	retval,  eax	\
138 | 	}
139 | 
140 | #ifdef __cplusplus
141 | }
142 | #endif
143 | 
144 | #endif	/* f2psp_h_included */
145 | 
146 | 


--------------------------------------------------------------------------------
/src/fac_test_dat192.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /****************************************************************************
 24 |  * We now include this header file if it was not included before.
 25 |  ****************************************************************************/
 26 | #ifndef fac_test_dat192_included
 27 | #define fac_test_dat192_included
 28 | 
 29 | 	#include "types.h"
 30 | 
 31 | 	struct testFac160{	// one test case for the fac160[] table: an exponent plus a factor held in three 64-bit words
 32 | 		uint32 p;	// exponent of the Mersenne number M(p) being tested
 33 | 		uint64 d2;	// appears to be the most-significant word: every fac160[] entry keeps it below 2^32, matching the "<= 160 bits" bound - confirm against the consumer
 34 | 		uint64 d1;	// middle word (presumably) - confirm
 35 | 		uint64 d0;	// least-significant word (presumably) - confirm
 36 | 	};
 37 | 
 38 | 	struct testFac192{	// same layout as testFac160; element type of the ffac192[] table
 39 | 		uint32 p;	// in ffac192[] this holds the Fermat index n of Fn = 2^2^n+1
 40 | 		uint64 d2;	// zero in every ffac192[] row
 41 | 		uint64 d1;	// zero in every ffac192[] row
 42 | 		uint64 d0;	// in ffac192[] this holds the multiplier k of the factor q = k.2^(n+2)+1
 43 | 	};
 44 | 
 45 | 	/*******************************************/
 46 | 	/*      Fermat-number test factors:        */
 47 | 	/*******************************************/
 48 | 
 49 | 	// Here interpret the above testFac struct as a minimalist [n,k]-pair format,
 50 | 	// where Fn = 2^2^n+1 is the Fermat number and q = k.2^(n+2)+1 the factor:
 51 | 	// To check any particular (alleged) factor q of Fn using Pari, use Mod(2,q)^(2^n)+1.
 52 | 
 53 | 	// Testcases with factors < 2^192:
 54 | 	static const struct testFac192 ffac192[] =
 55 | 	{	// Each row is {n, 0, 0, k} for the factor q = k.2^(n+2)+1 of Fn; a leading 2*, 4*, 8* or 32* folds extra powers of 2 into k (presumably because the factor is usually quoted with odd k and a larger exponent).
 56 | 		{ 86,0ull,0ull,	   20018578522347ull},		// 2012 M. Dangler & Rodenkirch
 57 | 		{ 88,0ull,0ull,	     119942751127ull},		// 2001 T. Nohara & Durman
 58 | 		{ 90,0ull,0ull,	     198922467387ull},		// 2001 P. Grobstich & Durman
 59 | 		{ 91,0ull,0ull,	             1421ull},		// 1977 D. E. Shippee
 60 | 		{ 93,0ull,0ull,2*	        92341ull},		// 1979 R. Baillie
 61 | 		{ 94,0ull,0ull,2*	 482524552001ull},		// 2001 P. Grobstich & Durman
 62 | 		{ 96,0ull,0ull,8*	3334131633063ull},		// 2008 M. Ptáček & Durman
 63 | 		{107,0ull,0ull,4*	   1289179925ull},		// 1992 G. B. Gostin
 64 | 		{116,0ull,0ull,4*	   3433149787ull},		// 1999 T. Taura
 65 | 		{122,0ull,0ull,	          5234775ull},		// 1986 G. B. Gostin
 66 | 		{125,0ull,0ull,	                5ull},		// 1956 R. M. Robinson
 67 | 		{133,0ull,0ull,	      88075576149ull},		// 2001 P. Samidoost & Durman
 68 | 		{142,0ull,0ull,2*	      8152599ull},		// 1986 G. B. Gostin
 69 | 		{144,0ull,0ull,2*	           17ull},		// 1956 R. M. Robinson
 70 | 		{146,0ull,0ull,	         37092477ull},		// 1987 G. B. Gostin
 71 | 		{147,0ull,0ull,	             3125ull},		// 1979 G. B. Gostin & P. B. McLaughlin
 72 | 		{147,0ull,0ull,	        124567335ull},		// 1990 G. B. Gostin
 73 | 		{150,0ull,0ull,32*	         1575ull},		// 1956 R. M. Robinson
 74 | 		{150,0ull,0ull,4*	         5439ull},		// 1980 G. B. Gostin & P. B. McLaughlin & H. Suyama
 75 | 		{0,0ull,0ull,0ull}	// all-zero row; presumably the end-of-table sentinel - confirm against the code that walks this table
 76 | 	};
 77 | 
 78 | 	/*******************************************/
 79 | 	/*      Mersenne-number test factors:      */
 80 | 	/*******************************************/
 81 | 
 82 | 	/* Factors > 128 but <= 160 bits. If desired, we can construct more test factors
 83 | 	by multiplying together a 64-bit factor q1 of M(p1) and a 96-bit factor q2 of M(p2)
 84 | 	and checking whether q1*q2 divides M(p1*p2).*/
 85 | 	static const struct testFac160 fac160[] =	/* Mersenne test cases: first field is the exponent p of M(p); the three 64-bit words that follow presumably hold the factor, most-significant word first - confirm against struct testFac160 */
 86 | 	{
 87 | 		{     629,       133ull,11545660419510266595ull,15875370168207932041ull},
 88 | 		{     631,      1394ull,15571349859840161706ull,  509892144742137431ull},
 89 | 		{     673,    121320ull, 4492854135134704005ull,14226674137430228263ull},
 90 | 		{     695,2649519282ull,14842833464112563611ull,10174116463236461383ull},
 91 | 		{     731, 655903171ull,17652352551621896287ull, 7660429456444636239ull},
 92 | 		{     805,1083827012ull,18314245293386716597ull, 2219421057460140527ull},
 93 | 		{     877,  13161208ull,18225246095436784582ull,12343089078196252631ull},
 94 | 		{     957,      4730ull,14663183769241509326ull, 8097149896429635207ull},
 95 | 		{     967,    215159ull,  881920578744577810ull,17184239148975426263ull},
 96 | 		{    1017, 212724356ull, 9900144438119899815ull,17733134473107607967ull},
 97 | 		{    1033,       261ull, 5238930328752646394ull, 2803405107698253561ull},
 98 | 		{    1087,         1ull, 4415476118538293365ull,16346425147370540471ull},
 99 | 		{    1087,     70130ull,11905462972019801043ull, 6167785434693019223ull},
100 | 		{    1131,   5800574ull,18429773635221665090ull,17951008765075981215ull},
101 | 		{    1157,  22381525ull,14500669099417213747ull,15903397166638806257ull},
102 | 		{    1283,        14ull, 3291757557782450881ull, 3893270457587058239ull},
103 | 		{    1319,      1552ull, 1390029428449091172ull,14288981644299514807ull},
104 | 		{    1483,      2674ull,14802171160149427175ull, 5085420234315110585ull},
105 | 		{    6659,       664ull,14291576310931480037ull, 4949688733053552967ull},
106 | 		{    8191,    617742ull, 6334326874596939334ull,11405337619840706193ull},
107 | 		{18031451,      2122ull, 5198971222801411122ull,12425019173815339143ull},	/* Note: composite factor! */
108 | 		{0,0ull,0ull,0ull}	/* All-zero row; presumably the end-of-table sentinel - confirm against the loop that walks this array */
109 | 	};
110 | 
111 | 	/* Factors > 160 but <= 192 bits. We can construct more test factors by multiplying
112 | 	together smaller factors of M(p) with multiple factors, or for exponents p1, p2, p3, ...
113 | 	and corresponding factors q1, q2, q3, ... , checking whether q1*q2*q3*...
114 | 	divides M(p1*p2*p3*...). */
115 | 	static const struct testFac192 fac192[] =	/* Mersenne test cases: first field is the exponent p of M(p); the three 64-bit words that follow presumably hold the factor, most-significant word first - confirm against struct testFac192 */
116 | 	{
117 | 		{     677,     157590042578912ull,10558642444782195772ull,  329809049266961143ull},
118 | 		{     773,       9118322195022ull, 1933308633079010416ull,17814616685598394119ull},
119 | 		{     971,      70286054459973ull,17012949627558354271ull, 3547755741880899889ull},
120 | 		{     997,  492416983078691417ull, 8040689323464953445ull,16007877010440112335ull},
121 | 		{    1001,         59364131986ull, 9565712986615012496ull,10050950882119470361ull},
122 | 		{0,0ull,0ull,0ull}	/* All-zero row; presumably the end-of-table sentinel - confirm against the loop that walks this array */
123 | 	};
124 | 
125 | #endif	/* #ifndef fac_test_dat192_included */
126 | 


--------------------------------------------------------------------------------
/src/fac_test_dat256.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /****************************************************************************
 24 |  * We now include this header file if it was not included before.
 25 |  ****************************************************************************/
 26 | #ifndef fac_test_dat256_included
 27 | #define fac_test_dat256_included
 28 | 
 29 | 	#include "types.h"
 30 | 
	/* One Mersenne test case kept in text form: two fixed 80-byte character buffers,
	so each string may hold at most 79 characters plus its terminating NUL.
	Field order matters - the tables below use positional initializers. */
	struct testFac256
	{
		char p[80];	/* first string of the pair (the exponent in the fac256 table) */
		char q[80];	/* second string of the pair (the factor in the fac256 table) */
	};
 35 | 
 36 | 	/*******************************************/
 37 | 	/*      Fermat-number test factors:        */
 38 | 	/*******************************************/
 39 | 
 40 | 	struct testFermFac{
 41 | 		uint32 n;
 42 | 		uint64 k;
 43 | 	};
 44 | 
 45 | 	// Here interpret the above testFac struct as a minimalist [n,k]-pair format,
 46 | 	// where Fn = 2^2^n+1 is the Fermat number and q = k.2^(n+2)+1 the factor:
 47 | 	// To check any particular (alleged) factor q of Fn using Pari, use Mod(2,q)^(2^n)+1.
 48 | 
 49 | 	// Testcases with factors < 2^256:
 50 | 	static const struct testFermFac ffac256[] =	// Fermat test cases, one {n, k} row each, read per the [n,k] note above (q = k.2^(n+2)+1)
 51 | 	{	// A leading 2* or 8* appears to fold extra powers of two into k - confirm against the consumer in factor.c
 52 | 		{164,2*	    1835601567ull},		// 1993 G. B. Gostin
 53 | 		{166,8*	 2674670937447ull},		// 2012 R. Maznichenko & Rodenkirch
 54 | 		{172,	   20569603303ull},		// 2001 L. N. Durman
 55 | 		{178,	     313047661ull},		// 1991 G. B. Gostin
 56 | 		{184,2*	     117012935ull},		// 1990 G. B. Gostin
 57 | 		{195,	48595346636925ull},		// 2014 S. Batalov & Woltman
 58 | 		{201,2*	          4845ull},		// 1980 G. B. Gostin & P. B. McLaughlin
 59 | 		{205,	        232905ull},		// 1984 W. Keller
 60 | 		{207,	             3ull},		// 1956 R. M. Robinson
 61 | 		{215,	         32111ull},		// 1980 H. Suyama
 62 | 		{226,2*	            15ull},		// 1956 R. M. Robinson
 63 | 		{228,2*	            29ull},		// 1956 R. M. Robinson
 64 | 		{0,0ull}	// All-zero row; presumably the end-of-table sentinel - confirm against the loop that walks this array
 65 | 	};
 66 | 
 67 | 	/*******************************************/
 68 | 	/*      Mersenne-number test factors:      */
 69 | 	/*******************************************/
 70 | 
 71 | 	/* 256-bit Factors are easier to give in character-string form: */
 72 | 	/* EWM: These are from my April 2006 shakedown runs of the P4WORD functionality -
 73 | 	ran ??? 64-65-digit test exponents up to k = 10^10; ??? had factors below this bound,
 74 | 	compared to ??? predicted by theory (Dickman's function).
 75 | 	*/
 76 | 	static const struct testFac256 fac256[] =	/* Rows are {p, q} decimal strings; the k in each trailing comment satisfies q = 2*k*p + 1 (hand-checked on the k = 20 and first k = 1 rows) */
 77 | 	{
 78 | 		{"1000000000000000000000000000000000000000000000000000000001059"			,"40000000000000000000000000000000000000000000000000000000042361"					},	/* k =          20 */
 79 | 		{"12160287649628674460477464915995054973742562690104903778198683593"		,"543592246870442485937175551111623340804481341938942752102988291735322287319"		},	/* k = 22351126163 */
 80 | 		{"20992192221842725502542568876717904946016534668049886272327917860857843"	,"41984384443685451005085137753435809892033069336099772544655835721715687"			},	/* k =           1 */
 81 | 		{"24247014121478057345510500801908699603302763478708108175450119307"		,"2079083331892761004876676951418337621569030224230467189523407626117207889809"		},	/* k = 42872976472 */
 82 | 		{"3082533446850352619311881710100031378387528865875332083814206171"			,"6165066893700705238623763420200062756775057731750664167628412343"					},	/* k =           1 */
 83 | 		{"32046927906821207388377814233562823608963208068222468012248261177"		,"192281567440927244330266885401376941653779248409334808073489567063"				},	/* k =           3 */
 84 | 		{"32046927906821207388377814233562823608963208068222468012248261177"		,"7261513394406617382132528927183000201554973316178529026895333500096431"			},	/* k =      113295 */
 85 | 		{"3444030707469211201913020330380197621101100449293215160842444859637669"	,"53389364027187712052055641161553823522309259164943421423379580214103144839"		},	/* k =        7751 */
 86 | 		{"3600113305305488204665213841469519415116094330572703657595919530921861"	,"1605650534166247739280685373295405659141778071435425831287780110791150007"		},	/* k =         223 */
 87 | 		{"3852254995466672782398645659611635488623057745649803559363456817432411"	,"22788707831582286845380020155651359827337650244785629920055214225748565104481"	},	/* k =     2957840 */
 88 | 		{"3873455283316355076479185358932261854896321329330898570642046752590709"	,"2921297999392661936999377930740968974773127205440094407601101388055871276457"		},	/* k =      377092 */
 89 | 		{"4088350865739177150968288747826569959957449066175834413752239709"			,"532990125664685046817433867476654272539732719859211180852051986382913"			},	/* k =       65184 */
 90 | 		{"41927056387293174872332083760112302991136793862708943879936201629"		,"586978789422104448212649172641572241875915114077925214319106822807"				},	/* k =           7 */
 91 | 		{"53710507922796892589235420199561121290219608640344181598136297747713099"	,"107421015845593785178470840399122242580439217280688363196272595495426199"			},	/* k =           1 */
 92 | 		{"54973742562690104903778198683593814657412680492564879855614537234786733"	,"769632395877661468652894781570313405203777526895908317978603521287014263"			},	/* k =           7 */
 93 | 		{"5509792592309907965473761255176567513575178296664547791745011299"			,"742776869444172678136618913571387191947269048779332840473151581151737887"			},	/* k =    67405157 */
 94 | 		{"570658748822569815793678976697422057505968344086973502014102067"			,"322107495328491256282531776450837995333351643082236449882652963072723913"			},	/* k =   282224268 */
 95 | 		{"62735676303544776280350450777235547105859548702790814356240145171"		,"19573531006705970199469340642497490697028179195270734079146925293353"				},	/* k =         156 */
 96 | 		{"62749567351885752724891227938183011949129833673362440656643086021"		,"6902452408707432799738035073200131314404281704069868472230739462311"				},	/* k =          55 */
 97 | 		{"6402474964732639141992726042699227967823547816360093417216412199"			,"59547637466852043611708058111909725657028150812842162510646420336832110759"		},	/* k =  4650360821 */
 98 | 		{"7195429162991930645537799140373404328752628889639958794757291746426357"	,"957164768977838582192020192849031737427989705415465878713793977276619713569"		},	/* k =       66512 */
 99 | 		{"7095890455635792122103334669749923563025494780249011419521238281"			,"93538027986191011753566157616643492407802072193242468532128963020143"				},	/* k =        6591 */
100 | 		{"83011949129833673362440656643086021394946395224737190702179860943"		,"2473258012374264464160556924024104921441032899325819859780746776935743"			},	/* k =       14897 */
101 | 		{"85102283345085048608250393021332197155184306354550076682829493041"		,"23658434769933643513093609259930350809141237166564921317826599065399"				},	/* k =         139 */
102 | 		{"9104140792886215078424516709087000699282120660418371806535567252532567"	,"207009953348646758453216660931220221900276859576592938137005728188085508447"		},	/* k =       11369 */
103 | 		{"9729971208443357326548938239119325974636673058360414281388303203"			,"215547993800818444194894669517580993600170370441819907413109392526239783"			},	/* k =    11076497 */
104 | 		{""	,""		},	/* k =             */
105 | 		{""	,""		},	/* k =             */
106 | 		{""	,""		},	/* k =             */
107 | 		{""	,""		},	/* k =             */
108 | 		{""	,""		},	/* k =             */
109 | 		{""	,""		},	/* k =             */
110 | 		{""	,""		},	/* k =             */
111 | 		{"",""}	/* Empty pair; presumably the end-of-table sentinel. NOTE(review): the unfilled rows above are identical to it, so a scan that stops on an empty string would stop at the first of them */
112 | 	};
113 | 
114 | #endif	/* #ifndef fac_test_dat256_included */
115 | 


--------------------------------------------------------------------------------
/src/fgt_m61.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /****************************************************************************
 24 |  * We now include this header file if it was not included before.
 25 |  ****************************************************************************/
 26 | #ifndef fgt_m61_h_included
 27 | #define fgt_m61_h_included
 28 | 
 29 | #include "util.h"
 30 | 
 31 | // Our modulus q = 2^61 - 1:
 32 | 
 33 | /***************/
 34 | // NB: Since args to these reduce macros will more often than not be expressions (e.g. qreduce(x - y + q4)),
 35 | // start each by copying arg into a local uint64, to ensure that any input expression only gets evaluated once:
 36 | /*
 37 | Returns x (mod q), but in the sense of a possible partial modular reduction: Outputs are in [0, B], where B = q+7.
 38 | Note: if x = q, QREDUCE returns q, not zero.
 39 | */
 40 | #define qreduce(x)	\
 41 | 	({ uint64 tmp = x;	\
 42 | 		tmp = (tmp >> 61) + (tmp & 0x1FFFFFFFFFFFFFFFull);	\
 43 | 		tmp;	})
 44 | 
 45 | // ...or this if you want to finish reducing a qreduce() output:
 46 | #define qreduce_finish(x)	\
 47 | 	({ uint64 tmp = x;	\
 48 | 		tmp -= (-(uint64)(tmp >= 0x1FFFFFFFFFFFFFFFull)) & 0x1FFFFFFFFFFFFFFFull;	\
 49 | 		tmp;	})
 50 | 
 51 | // Use this if you require a guaranteed-full reduction of x (mod q)...
 52 | #define qreduce_full(x)	\
 53 | 	({ uint64 tmp = x;	\
 54 | 		tmp = (tmp >> 61) + (tmp & 0x1FFFFFFFFFFFFFFFull);	\
 55 | 		tmp -= (-(uint64)(tmp >= 0x1FFFFFFFFFFFFFFFull)) & 0x1FFFFFFFFFFFFFFFull;	\
 56 | 		tmp;	})
 57 | 
 58 | /***************/
 59 | 
 60 | /*
 61 | Returns sqrt(1/2)*x (mod q).
 62 | sqrt(1/2) == 2^30 mod q, so the multiply can be effected via 2 shifts, an AND, and an add.
 63 | For normalized inputs (< q), Output is in [0, B30], where B30 = q + 7*2^30 = 2^61 + 2^33 - 2^30 - 1.
 64 | */
 65 | #define mul_i2(x)	(((x) << 30) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> 31)
 66 | 
 67 | /***************/
 68 | 
 69 | /*
 70 | Returns sqrt(2)*x (mod q).
 71 | sqrt(2) == 2^31 mod q, so the multiply can be effected via 2 shifts, an AND, and an add.
 72 | Outputs are in [0, B31], where B31 = q + 7*2^31 = 2^61 + 2^34 - 2^31 - 1.
 73 | */
 74 | #define mul_s2(x)	(((x) << 31) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> 30)
 75 | 
 76 | /***************/
 77 | 
 78 | /*
 79 | Returns 2^n * x (mod q). x is a uint64; The shift count n is assumed to be any kind of int, with value in [0,61].
 80 | 
 81 | If x only partially normalized (i.e. in [0, b]) on entry and n = 0, result is fully normalized, i.e. xout in [0,q].
 82 | If x unnormalized on entry and n = 0, the result is partially normalized, i.e. xout in [0,b].
 83 | The special case n = 61 leaves x unchanged.
 84 | 
 85 | For general operands x in [0,2^64-1] and n in [0,60], ((x << n) & q) is in [0, q - (2^n - 1)] = [0, 2^61 - 2^n]
 86 | and (x >> (61-n)) is in [0, 2^(3+n) - 1]. The sum is bounded above by 2^61 - 2^n + 2^(3+n) - 1 = q + 2^(3+n) - 2^n.
 87 | 
 88 | OK, let`s do some crude estimation for non-normalized inputs:
 89 | 
 90 | The sum is maximized for x = 2^64-1 and n = 60, giving 2^63 - 1 + 2^60 = 9*2^60 - 1 ~= 4.5*q,
 91 | i.e. inputs approximately in [0,8q] yield outputs approximately in [0,5q].
 92 | For x = 2^64-1 and n = 59, the sum is bounded by ~2.75*q, etc., approaching q+7 from above.
 93 | 
 94 | x = 2^63-1 and n = 60 gives q + 2^62 - 2^60 ~= 2.5*q .
 95 | x = 2^63-1 and n = 59 gives q + 2^61 - 2^59 ~= 1.75*q . This case is important in the between-forward-and-inverse-FFT
 96 | 							pair_square step, where we multiply inputs in [0,4q] by the modular inverse of 4 == 2^59.
 97 | x = 2^62-1 and n = 60 gives 2^61 - 1 + 2^60 = 3*2^60 - 1 ~= 1.5*q, i.e. inputs approximately in [0,2q]
 98 | 																	yield outputs approximately in [0,2q].
 99 | 
100 | NEGATIVE POWERS OF 2:
101 | 
102 | The modular analog of 1/2 (call it w) satisfies 2*w == 1 (mod q), thus w = (q+1)/2 = 2^60. More generally,
103 | any negative-integer power of 2 (mod q) satisfies 2^(-p) == 2^(61-p), with p < 61. We obtain the same
104 | result by simply analogizing the mul_pow2_modq macro to negative powers, and thus can effect multiply
105 | by 2^(-p) by simply calling the mul_pow2_modq macro with power-of-2 argument (61-p).
106 | 
107 | Thus e.g. to effect a modular x*(1/2) we call mul_pow2_modq(x,60).
108 | */
// Both n and the whole expansion are parenthesized. Without the parentheses around n, the documented
// negative-power usage mul_pow2_modq(x, 61-p) expanded to (x) >> (61-61-p), i.e. a negative shift count.
// NB: x and n are each evaluated twice, so pass no arguments with side effects; n must lie in [0,61].
#define mul_pow2_modq(x,n)	((((x) << (n)) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> (61-(n))))
110 | 
111 | /****** Prototypes for functions defined in fgt_m61.c are collected in util.h *******/
112 | 
113 | #endif	/* fgt_m61_h_included */
114 | 


--------------------------------------------------------------------------------
/src/gcd_lehmer.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /*******************************************************************************
24 |    We now include this header file if it was not included before.
25 | *******************************************************************************/
26 | #ifndef gcd_lehmer_h_included
27 | #define gcd_lehmer_h_included
28 | 
29 | #include "Mlucas.h"
30 | #include "genFFT_mul.h"
31 | 
32 | #ifdef __cplusplus
33 | extern "C" {
34 | #endif
35 | 
36 | /*******************************************************************************
37 |    Function prototypes. The corresponding function definitions will either
38 |    be in a {function name}.c file or (for cases where a .c file contains
39 |    multiple function definitions) in the given .c file:
40 | *******************************************************************************/
41 | 
42 | /* gcd_lehmer.c: */
43 | uint32	mi64_gcd(
44 | 	uint64 u[], uint64 v[], uint32 const ndim,
45 | 	const uint32 EGCD, uint64 Ap[], uint64 Bp[], uint32 *len_AB, uint32 *sign_AB,
46 | 	const uint32 HALF, uint64 Cp[], uint64 Dp[], uint32 *len_CD, uint32 *sign_CD, const uint32 len_targ);
47 | 
48 | uint32	matrix_vector_product_sub(uint64c abmul[], uint64c cdmul[], uint64 *uv_ptr[], uint32 len);
49 | uint32	matrix_vector_product_add(uint64c abmul[], uint64c cdmul[], uint64 *uv_ptr[], uint32 len);
50 | 
51 | int		CMP_LT_PROD192	(uint64 a, uint64 xlo, uint64 xhi, uint64 b, uint64 ylo, uint64 yhi);
52 | int		pprime192		(uint192 p, uint64 z);
53 | uint192	bitwise_mod192	(uint192 x, uint192 y);
54 | /*
55 | void	mv_dwtvarbase_to_int64	(x,p,m,u,ndim);
56 | */
57 | void	gcd_init();
58 | int		test_gcd();
59 | 
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 | 
64 | #endif	/* gcd_lehmer_h_included */
65 | 
66 | 


--------------------------------------------------------------------------------
/src/genFFT_mul.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /*******************************************************************************
24 |    We now include this header file if it was not included before.
25 | *******************************************************************************/
26 | #ifndef gen_fft_h_included
27 | #define gen_fft_h_included
28 | 
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | 
33 | /* Enumeration constant of the various supported values for the MODE argument to genFFT_mul().
34 | As the use of an enum implies, these modes are mutually exclusive:
35 | 
36 | 	Mode				Description
37 | 	----------------	-------------------------
38 | 	INIT_ARRAYS			Init FFT-related bit-reversal-index and roots-of-unity data, using input x-array for scratch storage
39 | 
40 | 	The rest assume the function has been previously called in INIT_ARRAYS mode for the FFT length in question:
41 | 
42 | 	FORWARD_FFT_ONLY	The fFFT of the input X-array is computed and stored in-place
43 | 	AUTO_SQUARE			The fFFT of the input X-array is computed, followed by a wrapper/dyadic-square step and an iFFT, all in-place.
44 | 	MUL_PRECOMPUTED		The X-array is assumed to contain an untransformed input vector, and the Y-array to contain a data vector which was previously-transformed by calling this routine in FORWARD_FFT_ONLY mode. The fFFT of the input X-array is computed, followed by a wrapper/dyadic-mul-with-Y-transform step and an iFFT. The result is returned in X; Y is unaffected. (I.e. this is designed for the common case where we have a constant vector which will be used to multiply many sets of inputs).
45 | 
46 | */
/* Mutually exclusive operating modes for the MODE argument of genFFT_mul().
Enumerators take the default values 0..3 in the order listed. */
enum mode {
	INIT_ARRAYS,		/* 0 */
	FORWARD_FFT_ONLY,	/* 1 */
	AUTO_SQUARE,		/* 2 */
	MUL_PRECOMPUTED		/* 3 */
};
48 | 
49 | /* genFFT_mul.c: */
50 | void  genFFT_mul(double x[], double y[], int n, int INIT_ARRAYS, int MODE);
51 | void  genFFT_mul_process_chunk(double a[], double ab_mul[], double cd_mul[], int n, struct complex rt0[], struct complex rt1[], int index[], int ii, int nradices_prim, int radix_prim[], int MODE);
52 | 
53 | /* Nov 2015 - moved updated versions of these to Mlucas.h:
54 | void pairFFT_mul(double x[], double y[], int n, int INIT_ARRAYS, int FORWARD_FFT_ONLY);
55 | void pairFFT_mul_process_chunk(double a[], double ab_mul[], double cd_mul[], int n, struct complex rt0[], struct complex rt1[], int index[], int ii, int nradices_prim, int radix_prim[], int FORWARD_FFT_ONLY, int skip_square);
56 | void radix16_pairFFT_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int INIT_ARRAYS, int FORWARD_FFT_ONLY, int skip_square);
57 | */
58 | 
59 | /* The complex/real wrapper and dyadic-mul step, combined with the final-fFFT/initial-iFFT radix pass: */
60 | void	radix16_genFFT_wrapper_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int MODE);
61 | void	radix32_genFFT_wrapper_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int MODE);
62 | 
63 | #ifdef __cplusplus
64 | }
65 | #endif
66 | 
67 | #endif	/* gen_fft_h_included */
68 | 
69 | 


--------------------------------------------------------------------------------
/src/getRealTime.c:
--------------------------------------------------------------------------------
  1 | // EWM: June 2014 - Code from http://nadeausoftware.com/articles/2012/04/c_c_tip_how_measure_elapsed_real_time_benchmarking
  2 | // for high-precision elapsed real time; thanks to Stephen Searle for finding this.
  3 | // Prototype for getRealTime() is in util.h .
  4 | 
  5 | /*
  6 |  * Author:  David Robert Nadeau
  7 |  * Site:    http://NadeauSoftware.com/
  8 |  * License: Creative Commons Attribution 3.0 Unported License
  9 |  *          http://creativecommons.org/licenses/by/3.0/deed.en_US
 10 |  */
 11 | 
 12 | #if defined(_WIN32)
 13 | 
 14 | 	#include <windows.h>
 15 | 
 16 | #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
 17 | 
 18 | 	#include <unistd.h>	/* POSIX flags */
 19 | 	#include <time.h>	/* clock_gettime(), time() */
 20 | 	#include <sys/time.h>	/* gethrtime(), gettimeofday() */
 21 | 
 22 | #if defined(__MACH__) && defined(__APPLE__)
 23 | 
 24 | 	#include <mach/mach.h>
 25 | 	#include <mach/mach_time.h>
 26 | 
 27 | #endif
 28 | 
 29 | #else
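30 | 	// EWM: Intended behavior is to default to the util.c:get_time_str() function if any misconfig is detected, instead of error-on-compile. NOTE: the #error on the next line still halts the build, so GRT_MISCONFIG only takes effect once that line is removed: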
 31 | 	#error "Unable to define getRealTime( ) for an unknown OS."
 32 | 	#define GRT_MISCONFIG
 33 | #endif
 34 | 
 35 | /**
 36 |  * Returns the real time, in seconds, or -1.0 if an error occurred.
 37 |  *
 38 |  * Time is measured since an arbitrary and OS-dependent start time.
 39 |  * The returned real time is only useful for computing an elapsed time
 40 |  * between two calls to this function.
 41 |  */
 42 | double getRealTime( )
 43 | {
 44 | #ifdef GRT_MISCONFIG
 45 | 	return -1.0;		// EWM: See above note
 46 | #elif defined(_WIN32)
 47 | 	FILETIME tm;
 48 | 	ULONGLONG t;
 49 | #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
 50 | 	/* Windows 8, Windows Server 2012 and later. ---------------- */
 51 | 	GetSystemTimePreciseAsFileTime( &tm );
 52 | #else
 53 | 	/* Windows 2000 and later. ---------------------------------- */
 54 | 	GetSystemTimeAsFileTime( &tm );
 55 | #endif
 56 | 	t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime;
 57 | 	return (double)t / 10000000.0;
 58 | 
 59 | #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__)))
 60 | 	/* HP-UX, Solaris. ------------------------------------------ */
 61 | 	return (double)gethrtime( ) / 1000000000.0;
 62 | 
 63 | #elif defined(__MACH__) && defined(__APPLE__)
 64 | 	/* OSX. ----------------------------------------------------- */
 65 | 	static double timeConvert = 0.0;
 66 | 	if ( timeConvert == 0.0 )
 67 | 	{
 68 | 		mach_timebase_info_data_t timeBase;
 69 | 		(void)mach_timebase_info( &timeBase );
 70 | 		timeConvert = (double)timeBase.numer /
 71 | 			(double)timeBase.denom /
 72 | 			1000000000.0;
 73 | 	}
 74 | 	return (double)mach_absolute_time( ) * timeConvert;
 75 | 
 76 | #elif defined(_POSIX_VERSION)
 77 | 	/* POSIX. --------------------------------------------------- */
 78 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
 79 | 	{
 80 | 		struct timespec ts;
 81 | #if defined(CLOCK_MONOTONIC_PRECISE)
 82 | 		/* BSD. --------------------------------------------- */
 83 | 		const clockid_t id = CLOCK_MONOTONIC_PRECISE;
 84 | #elif defined(CLOCK_MONOTONIC_RAW)
 85 | 		/* Linux. ------------------------------------------- */
 86 | 		const clockid_t id = CLOCK_MONOTONIC_RAW;
 87 | #elif defined(CLOCK_HIGHRES)
 88 | 		/* Solaris. ----------------------------------------- */
 89 | 		const clockid_t id = CLOCK_HIGHRES;
 90 | #elif defined(CLOCK_MONOTONIC)
 91 | 		/* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
 92 | 		const clockid_t id = CLOCK_MONOTONIC;
 93 | #elif defined(CLOCK_REALTIME)
 94 | 		/* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
 95 | 		const clockid_t id = CLOCK_REALTIME;
 96 | #else
 97 | 		const clockid_t id = (clockid_t)-1;	/* Unknown. */
 98 | #endif /* CLOCK_* */
 99 | 		if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
100 | 			return (double)ts.tv_sec +
101 | 				(double)ts.tv_nsec / 1000000000.0;
102 | 		/* Fall thru. */
103 | 	}
104 | #endif /* _POSIX_TIMERS */
105 | 
106 | 	/* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
107 | 	struct timeval tm;
108 | 	gettimeofday( &tm, NULL );
109 | 	return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
110 | #else
111 | 	return -1.0;		/* Failed. */
112 | #endif
113 | }
114 | 
115 | 


--------------------------------------------------------------------------------
/src/get_fp_rnd_const.c:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | #include "util.h"
24 | 
25 | /* Store in *RND_A and *RND_B the pair of round constants used for fast NINT emulation: */
26 | void	get_fp_rnd_const(double*RND_A, double*RND_B)
27 | {
28 | 	/* Baseline (53-bit-significand) values; the two spellings differ but both evaluate to 3*2^51: */
29 | 	const double base_a = 3.0*0x4000000*0x2000000, base_b = 12.0*0x2000000*0x1000000;
30 | #if(FP_MANTISSA_BITS_DOUBLE == 64)	/* X86 64-mantissa-bit register doubles - scale both by a further 2^11: */
31 | 	*RND_A = base_a*0x800;	*RND_B = base_b*0x800;
32 | 	fprintf(stderr,"INFO: using 64-bit-significand form of floating-double rounding constant for scalar-mode DNINT emulation.\n");
33 | #else	/* These assume IEEE64-compliant double-precision hardware arithmetic: */
34 | 	*RND_A = base_a;	*RND_B = base_b;
35 | 	fprintf(stderr,"INFO: using 53-bit-significand form of floating-double rounding constant for scalar-mode DNINT emulation. \n");
36 | #endif
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/src/gpu_iface.cu:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2012 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | // Thanks to Jason Papadopoulos for the original version of the GPU interface ... this is now so
 24 | // heavily modified by me that any resemblance to the original in the nontrivial details should be
 25 | // considered coincidental, and any faults strictly mine.
 26 | 
 27 | #include "gpu_iface.h"
 28 | // Compile-time diagnostics: use #warning to report which compiler and which code trajectory is processing this file:
 29 | #ifdef __CUDACC__
 30 | 	#warning using nvcc
 31 | 	#ifdef __CUDA_ARCH__
 32 | 		#warning device code trajectory
 33 | 		#if __CUDA_ARCH__ > 120
 34 | 			#warning compiling with double precision
 35 | 		#else
 36 | 			#warning compiling with single precision
 37 | 		#endif
 38 | 	#else
 39 | 		#warning nvcc host code trajectory
 40 | 	#endif
 41 | #else
 42 | 	#warning non-nvcc code trajectory
 43 | #endif
 44 | // OS_BITS is not set in this file: it must already be defined, as either 32 or 64, else compilation stops with an #error:
 45 | #ifndef OS_BITS
 46 | 	#error Bitness not defined!
 47 | #elif OS_BITS == 32
 48 | 	#warning compiling in 32-bit mode
 49 | #elif OS_BITS == 64
 50 | 	#warning compiling in 64-bit mode
 51 | #else
 52 | 	#error Bitness defined but not supported!
 53 | #endif
 54 | 
 55 | // 50 Ways to say "Houston, we have a problem":
 56 | char *
 57 | cuGetErrorMessage(CUresult result)
 58 | {
 59 | 	switch (result) {
 60 | 	case CUDA_SUCCESS: return "CUDA_SUCCESS";
 61 | 	case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
 62 | 	case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
 63 | 	case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED";
 64 | 	case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED";
 65 | 	case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE";
 66 | 	case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
 67 | 	case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE";
 68 | 	case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
 69 | 	case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
 70 | 	case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED";
 71 | 	case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED";
 72 | 	case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED";
 73 | 	case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED";
 74 | 	case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU";
 75 | 	case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED";
 76 | 	case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED";
 77 | 	case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
 78 | 	case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
 79 | 	case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE";
 80 | 	case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND";
 81 | 	case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY";
 82 | 	case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED";
 83 | 	case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
 84 | 	case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT";
 85 | 	case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
 86 | 	case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
 87 | 	default: return "CUDA: unexpected error";
 88 | 	}
 89 | }
 90 | 
 91 | // Read information on all usable GPUs (compute capability >= 2.x) into input arg; on return num_gpu = #devices stored:
 92 | void
 93 | gpu_init(gpu_config_t *gpu_config)
 94 | {
 95 | 	int ndev = 0, device, nkeep = 0;	// ndev is a plain int, matching the int* that cudaGetDeviceCount takes; nkeep = #devices stored so far
 96 | 	memset(gpu_config, 0, sizeof(gpu_config_t));
 97 | 	// CUDA_TRY is unusable here (runtime-API calls return cudaError_t, not CUresult), so check by hand; failure ==> 0 devices:
 98 | 	if (cudaGetDeviceCount(&ndev) != cudaSuccess) ndev = 0;
 99 | 	for (device = 0; device < ndev; device++)
100 | 	{
101 | 		// Query into a scratch struct, so that a rejected or unreadable device never lands in the caller's table:
102 | 		gpu_info_t info;
103 | 		if (cudaGetDeviceProperties(&info, device) != cudaSuccess) {
104 | 			printf("GPU #%d: unable to read device properties ... ignoring this device.\n",device);
105 | 		} else if (info.major < 2) {
106 | 			printf("GPU #%d compute capability %d.%d is less than min-supported 2.x ... ignoring this device.\n",device,info.major,info.minor);
107 | 		} else if (nkeep >= MAX_GPU) {	// gpu_info[] holds only MAX_GPU records - never write past its end
108 | 			printf("GPU #%d exceeds the MAX_GPU = %d table capacity ... ignoring this device.\n",device,MAX_GPU);
109 | 		} else {
110 | 			gpu_config->gpu_info[nkeep++] = info;
111 | 		}	// Note: Devices with cc = 2.x have (32 + 16*x) shader cores per multiprocessor (At least for x = 0 and 1 ... may need table for this)
112 | 	}
113 | 	gpu_config->num_gpu = nkeep;
114 | }
115 | 
116 | #ifdef GPU_IFACE_STANDALONE	// Standalone diagnostic build: print the properties of every GPU retained by gpu_init()
117 | 	int main(int argc, char *argv[])
118 | 	{
119 | 		gpu_config_t gpu_config;
120 | 		gpu_info_t ginfo;
121 | 		int32 igpu;
122 | 		(void)argc; (void)argv;	// Command-line args are not used
123 | 		gpu_init(&gpu_config);
124 | 		if (gpu_config.num_gpu > 0) {
125 | 			printf("Detected %d CUDA-enabled GPU devices.\n", gpu_config.num_gpu);
126 | 			for(igpu = 0; igpu < gpu_config.num_gpu; ++igpu) {
127 | 				ginfo = gpu_config.gpu_info[igpu];	// Conversion specs below must match the member types: %d for int, %zu for size_t
128 | 				printf("GPU #%d: %s v%d.%d\n", igpu, ginfo.name, ginfo.major, ginfo.minor);
129 | 				printf("clock_speed = %d MHz\n", ginfo.clockRate/1000);	// clockRate is in kHz
130 | 				printf("num_compute_units = %d\n", ginfo.multiProcessorCount);
131 | 				printf("constant_mem_size = %zu\n", ginfo.totalConstMem);	// size_t
132 | 				printf("shared_mem_size = %zu\n", ginfo.sharedMemPerBlock);	// size_t
133 | 				printf("global_mem_size = %zu\n", ginfo.totalGlobalMem);	// size_t; %u would also truncate sizes >= 4GB
134 | 				printf("registers_per_block = %d\n", ginfo.regsPerBlock);
135 | 				printf("max_threads_per_block = %d\n", ginfo.maxThreadsPerBlock);
136 | 				printf("can_overlap = %d\n", ginfo.deviceOverlap);
137 | 				printf("concurrent_kernels = %d\n", ginfo.concurrentKernels);
138 | 				printf("warp_size = %d\n", ginfo.warpSize);
139 | 				printf("max_thread_dim[3] = [%d,%d,%d]\n", ginfo.maxThreadsDim[0], ginfo.maxThreadsDim[1], ginfo.maxThreadsDim[2]);
140 | 				printf("max_grid_size[3] = [%d,%d,%d]\n", ginfo.maxGridSize[0], ginfo.maxGridSize[1], ginfo.maxGridSize[2]);
141 | 			}
142 | 			return 0;	// Returning from main is equivalent to exit() and needs no <stdlib.h> declaration
143 | 		} else {
144 | 			printf("ERROR: No CUDA-enabled GPUs found\n");
145 | 			return -1;
146 | 		}
147 | 	}
148 | #endif
149 | 
150 | 


--------------------------------------------------------------------------------
/src/gpu_iface.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | // Thanks to Jason Papadopoulos for the original version of the GPU interface ... this is now so
 24 | // heavily modified by me that any resemblance to the original in the nontrivial details should be
 25 | // considered coincidental, and any faults strictly mine.
 26 | 
 27 | #ifndef gpu_iface_h_included
 28 | #define gpu_iface_h_included
 29 | 
 30 | #ifndef GPU_IFACE_STANDALONE
 31 | 	// Non-standalone build assumes the non-main functions in this file will serve as GPU diagnostics
 32 | 	// for an Mlucas or Mfactor build, so require same compile flag as for the other sources in such a build:
 33 | 	#ifndef USE_GPU
 34 | 		#error Compilation of any source file using a gpu-specific header requires the user-defined preprocessor flag USE_GPU
 35 | 	#endif
 36 | 
 37 | 	#include "masterdefs.h"
 38 | 	#include "types.h"
 39 | #else
 40 | 	#include <stdio.h>
 41 | 	typedef int int32;
 42 | #endif
 43 | 
 44 | #include <cuda.h>
 45 | #include <cuda_runtime.h>
 46 | #include <driver_types.h>
 47 | 
 48 | #ifdef __cplusplus
 49 | extern "C" {
 50 | #endif
 51 | // Capacity of the gpu_info[] table inside gpu_config_t:
 52 | #define MAX_GPU 16
 53 | // A per-device record is simply the CUDA runtime's cudaDeviceProp struct; selected members are listed below:
 54 | typedef struct cudaDeviceProp gpu_info_t;
 55 | /*
 56 | cudaDeviceProp struct members:
 57 | 
 58 | int 	canMapHostMemory 	Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer.
 59 | int 	clockRate			Clock frequency in kilohertz.
 60 | int 	computeMode			Compute mode (See cudaComputeMode).
 61 | int 	deviceOverlap		Device can concurrently copy memory and execute a kernel.
 62 | int 	integrated			Device is integrated as opposed to discrete.
 63 | int 	kernelExecTimeoutEnabled 	Specified whether there is a run time limit on kernels.
 64 | int 	major				Major compute capability.
 65 | int 	minor				Minor compute capability.
 66 | int 	maxGridSize [3] 	Maximum size of each dimension of a grid.
 67 | int 	maxThreadsDim [3] 	Maximum size of each dimension of a block.
 68 | int 	maxThreadsPerBlock 	Maximum number of threads per block.
 69 | size_t 	memPitch			Maximum pitch in bytes allowed by memory copies.
 70 | int 	multiProcessorCount Number of multiprocessors on device.
 71 | char 	name [256]			ASCII string identifying device.
 72 | int 	regsPerBlock		32-bit registers available per block
 73 | size_t 	sharedMemPerBlock 	Shared memory available per block in bytes.
 74 | size_t 	textureAlignment 	Alignment requirement for textures.
 75 | size_t 	totalConstMem		Constant memory available on device in bytes.
 76 | size_t 	totalGlobalMem		Global memory available on device in bytes.
 77 | int 	warpSize			Warp size in threads.
 78 | */
 79 | // Table of GPUs filled in by gpu_init(): num_gpu = number of leading gpu_info[] entries in use, capacity = MAX_GPU:
 80 | typedef struct {
 81 | 	int32 num_gpu;
 82 | 	gpu_info_t gpu_info[MAX_GPU];
 83 | } gpu_config_t;
 84 | // Returns the enumerator name of a CUDA driver-API status code as a string literal (a fixed fallback text for unlisted codes):
 85 | char * cuGetErrorMessage(CUresult result);
 86 | // Zeroes *config, then stores the properties of each CUDA device of compute capability >= 2.x and sets config->num_gpu:
 87 | void gpu_init(gpu_config_t *config);
 88 | // Evaluate func (must yield a CUresult); unless CUDA_SUCCESS, print line number + error name and exit(-1). Expands to a bare {...} block:
 89 | #define CUDA_TRY(func) \
 90 | 	{ 			 				\
 91 | 		CUresult status = func;				\
 92 | 		if (status != CUDA_SUCCESS) {			\
 93 | 			printf("error (line %d): %s\n", __LINE__,\
 94 | 				cuGetErrorMessage(status));	\
 95 | 			exit(-1);				\
 96 | 		}						\
 97 | 	}
 98 | // Round offset up, in place, to the next multiple of pow2align (mask trick, so pow2align must be a power of 2); offset is evaluated twice:
 99 | #define CUDA_ALIGN_PARAM(offset, pow2align) \
100 | 	(offset) = ((offset) + (pow2align) - 1) & ~((pow2align) - 1)
101 | 
102 | #ifdef __cplusplus
103 | }
104 | #endif
105 | 
106 | #endif /* gpu_iface_h_included */
107 | 
108 | 


--------------------------------------------------------------------------------
/src/imul_macro.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /****************************************************************************
24 |  * We now include this header file if it was not included before.
25 |  ****************************************************************************/
26 | #ifndef imul_macro_h_included
27 | #define imul_macro_h_included
28 | 
29 | #include "imul_macro0.h"
30 | #include "imul_macro1.h"
31 | 
32 | #endif	/* imul_macro_h_included */
33 | 
34 | 


--------------------------------------------------------------------------------
/src/masterdefs.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /****************************************************************************
24 |  * We now include this header file if it was not included before.
25 |  ****************************************************************************/
26 | #ifndef masterdefs_h_included
27 | #define masterdefs_h_included
28 | 
29 | #include <assert.h>
30 | #include <ctype.h>
31 | #include <errno.h>
32 | #include <math.h>
33 | #include <signal.h>
34 | #include <stdarg.h>
35 | #include <stddef.h>
36 | #include <stdio.h>
37 | #include <stdint.h>
38 | #include <stdlib.h>
39 | #include <string.h>
40 | #include <strings.h>	// Nov 2021: Add to provide the POSIX case-insensitive string-compare functions strcasecmp() and strncasecmp();
41 | 						// cf. https://stackoverflow.com/questions/5820810/case-insensitive-string-comparison-in-c
42 | #include <time.h>
43 | 
44 | #ifdef macintosh
45 | 	#include <console.h>	/* Macintosh CW */
46 | #endif
47 | 
48 | #undef  EWM_DEBUG
49 | #define EWM_DEBUG		0	/* Set = 1 to turn on various debugging diagnostics, especially DBG_ASSERT, defined in util.c . */
50 | 
51 | /* cf. util.h|c : If debug enabled, alias DBG_ASSERT/DBG_WARN/DBG_INFO to ASSERT/WARN/INFO (ASSERT being a function defined
52 | in util.c), otherwise make each DBG_* invocation expand to nothing, i.e. alias the entire invocation to "Bolivian"
53 | (to paraphrase ex-heavyweight boxing champ Mike Tyson.) Note the DBG_ASSERT stub takes 2 args, the other two stubs take 4. */
54 | #if EWM_DEBUG
55 | 	#define DBG_ASSERT ASSERT
56 | 	#define DBG_WARN   WARN
57 | 	#define DBG_INFO   INFO
58 | #else	/* Bolivian - lump both the FILE and LINE args together as a single __here, that's why it looks like these take 1 less arg than the underlying functions: */
59 | 	#define DBG_ASSERT(__arg1, __arg2)	/* */
60 | 	#define DBG_WARN(__here, __arg2, __arg3, __arg4)	/* */
61 | 	#define DBG_INFO(__here, __arg2, __arg3, __arg4)	/* */
62 | #endif
63 | 
64 | /*******************************************************************************
65 |    Mlucas-specific master #defines:
66 | *******************************************************************************/
67 | 
68 | /* Set = 1 to do a simple FFT/IFFT-returns-original-inputs test
69 | (sans weighting and dyadic squaring) using pseudorandom inputs:
70 | */
71 | #undef  FFT_DEBUG
72 | #define FFT_DEBUG	0
73 | 
74 | #undef  NOBRANCH
75 | #define NOBRANCH	1	/* Switch between branched and branchless versions of various key sequences. */
76 | // Unlike the forced #undef/#define pairs in this file, LO_ADD is only a default - a pre-existing definition is left untouched:
77 | #ifndef	LO_ADD
78 | 	#define	LO_ADD		1	/* TRUE = use algorithm with more mul and fewer add */
79 | #endif
80 | 
81 | #undef	N_LEADING_RADICES
82 | #define	N_LEADING_RADICES	8	/* # of intervals we split adjacent power-of-2 transform lengths into */
83 | 
84 | #endif	/* masterdefs_h_included */
85 | 


--------------------------------------------------------------------------------
/src/pair_square.c:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | #include "Mlucas.h"
 24 | 
 25 | /***************/
 26 | 
 27 | /*
 28 | Macro versions of these are in pair_square.h, since radix32_wrapper_square.c also needs to inline those;
 29 | SSE2 macros for this are in sse2_macro_gcc64.h.
 30 | */
 31 | void pair_square(double *x1, double *y1, double *x2, double *y2, double c, double s)
 32 | {
 33 | /*
 34 | !   Given complex scalars H[j] = (*x1,*y1) and H[N-j] = (*x2,*y2) along with complex exponential E = (c,s),
 35 | !   calculates I[j] = H[j]^2 + {1 + exp(2*pi*I*j/N)}*{H[j]-H~[N-j]}^2/4 and its complex conjugate I~,
 36 | !   returns the former in H[j] and the latter in H[N-j], i.e. all 4 pointed-to doubles are overwritten. NOTE(review): the code below SUBTRACTS the {...}/4 term - presumably the sign is absorbed into the (c,s) supplied by the caller; confirm.
 37 | */
 38 | 	// Use that (H[j] - H~[N-j])^2 = H(j)^2 - 2*H(j)*H~(N-j) + H~(N-j)^2 to efficiently compute both (H[j]-H~[N-j])^2 and H[j]^2:
 39 | #if 0	// Disabled variant, kept for reference - see the cost note at its end:
 40 | 	double rt0,rt1,rt2,rt3,it1,it2,it3;
 41 | 	// H[j] = (r1,i1); H[N-j] = (r2,i2):
 42 | 	rt1 = *x1;	it1 = *y1;	rt2 = *x2;	it2 = *y2;	// H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)]
 43 | 												// = [(r1^2-i1^2) + (r2^2-i2^2) - 2.(r1.r2+i1.i2)] + 2.I.[(r1.i1-r2.i2) - (i1.r2-r1.i2)]
 44 | 	// Calculate cross product terms:
 45 | 	rt3 = rt1*rt2 + it1*it2; rt3 = rt3 + rt3;	// 2.(r1.r2 + i1.i2)
 46 | 	it3 = it1*rt2 - rt1*it2; it3 = it3 + it3;	// 2.(i1.r2 - r1.i2)
 47 | 	// Now calculate square terms and store back in the same temporaries:
 48 | 	rt0 = (rt1 + it1)*(rt1 - it1); it1 = rt1*it1; it1 = it1 + it1; rt1 = rt0;	// rt1,it1 = (r1^2-i1^2); 2.r1.i1
 49 | 	rt0 = (rt2 + it2)*(rt2 - it2); it2 = rt2*it2; it2 = it2 + it2; rt2 = rt0;	// rt2,it2 = (r2^2-i2^2); 2.r2.i2
 50 | 	// {1 + exp(2*pi*I*j/N)}*{H[j]-H~[N-j]}^2/4 :
 51 | 	rt3 = rt1 + rt2 - rt3;	// Re(H[j]-H~[N-j])
 52 | 	it3 = it1 - it2 - it3;	// Im(H[j]-H~[N-j])
 53 | 	rt0 = ((c + 1.0)*rt3 - s*it3)*0.25;
 54 | 	it3 = (s*rt3 + (c + 1.0)*it3)*0.25;
 55 | 	// And now complete and store the results:
 56 | 	*x1 = (rt1 - rt0);	// Re(I[j])
 57 | 	*y1 = (it1 - it3);	// Im(I[j])
 58 | 	// N-j terms are as above, but with the replacements: rt1<-->rt2, it1<-->it2, it3|-->-it3:
 59 | 	*x2 = (rt2 - rt0);
 60 | 	*y2 = (it2 + it3);
 61 | // Cost: [22 add, 12 mul], compared to [18 add, 18 mul] for generic-mul version ... seems too add-heavy.
 62 | #elif 0	// Quick test of mul version of this function, using square inputs (also disabled):
 63 | 	double re,im,tt;
 64 | /*...gather the 4 complex elements which are to be combined...*/
 65 | 		//	Re{H[j]}	Im{H[j]}	Re{I[j]}	Im{I[j]}	Re{H[N-j]}	Im{H[N-j]}	Re{I[N-j]}	Im{I[N-j]}
 66 | 	double r1 = *x1,	i1 = *y1,	r2 = *x1,	i2 = *y1,	r3 = *x2,	i3 = *y2,	r4 = *x2,	i4 = *y2;
 67 | // calculate 2nd square-like term and store in temp...
 68 | 	re = r3*r4 - i3*i4;	// re := Re{H(n2-j)*I(n2-j)}
 69 | 	im = r3*i4 + i3*r4;	// im := Im{H(n2-j)*I(n2-j)}
 70 | // calculate difference terms...
 71 | 	r3 = r1 - r3;		// r3 := Re{H(j)-H~(n2-j)}
 72 | 	i3 = i1 + i3;		// i3 := Im{H(j)-H~(n2-j)}
 73 | 	r4 = r2 - r4;		// r4 := Re{I(j)-I~(n2-j)}
 74 | 	i4 = i2 + i4;		// i4 := Im{I(j)-I~(n2-j)}
 75 | // now calculate 1st square-like term and store back in H(j) slot...
 76 | 	tt = r1*r2 - i1*i2;			// r1 := Re{H(j)*I(j)}
 77 | 	i1 = r1*i2 + i1*r2; r1 = tt;// i1 := Im{H(j)*I(j)}
 78 | // calculate the complex products to build the second term...
 79 | 	tt = r3*r4 - i3*i4;			// Re{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])}
 80 | 	i3 = r3*i4 + i3*r4; r3 = tt;// Im{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])}
 81 | 	tt = ((c + 1.0)*r3 - s*i3)*0.25;	// Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4}
 82 | 	i3 = (s*r3 + (c + 1.0)*i3)*0.25;	// Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4}
 83 | // and now complete and store the results.
 84 | 	*x1 = (r1-tt);	// Re{M(j)}
 85 | 	*y1 = (i1-i3);	// Im{M(j)}
 86 | // N-j terms are as above, but with the replacements: r1<-->r2, i1<-->i2, i3|-->-i3.
 87 | 	*x2 = (re-tt);	// Re{M(N-j)}
 88 | 	*y2 = (im+i3);	// Im{M(N-j)}
 89 | #else	// Live code path; cc and ss fold the factor 1/4 into (1 + c) and s up front:
 90 | 	double re,im,tt, r1 = *x1, i1 = *y1, r2 = *x2, i2 = *y2, cc = (c + 1.0)*0.25, ss = s*0.25;
 91 | 	// H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)]
 92 | // calculate 2nd square-like term and store in temp...
 93 | 	re = (r2+i2)*(r2-i2);	// re := Re{H(n2-j)^2}
 94 | 	im = r2*i2 + i2*r2;		// im := Im{H(n2-j)^2}
 95 | // calculate difference terms (r2,i2 are overwritten, which is why H(n2-j)^2 was squirreled away in re,im first)...
 96 | 	r2 = r1 - r2;			// r2 := Re{H(j)-H~(n2-j)}
 97 | 	i2 = i1 + i2;			// i2 := Im{H(j)-H~(n2-j)}
 98 | // now calculate 1st square-like term and store back in H(j) slot...
 99 | 	tt = (r1+i1)*(r1-i1);		// tt := Re{H(j)^2}, copied into r1 on the next line once the old r1 has been used
100 | 	i1 = r1*i1 + i1*r1; r1 = tt;// i1 := Im{H(j)^2}
101 | // calculate the complex products to build the second term...
102 | 	tt = (r2+i2)*(r2-i2);		// Re{(H[j] - H~[N/2-j])^2}
103 | 	i2 = r2*i2 + i2*r2; r2 = tt;// Im{(H[j] - H~[N/2-j])^2}
104 | 	tt = (cc*r2 - ss*i2);	// Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])^2/4}
105 | 	i2 = (ss*r2 + cc*i2);	// Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])^2/4}
106 | // and now complete and store the results.
107 | 	*x1 = (r1-tt);	// Re{M(j)}
108 | 	*y1 = (i1-i2);	// Im{M(j)}
109 | // N-j terms are as above, but with H(N-j)^2 = (re,im) in place of H(j)^2 = (r1,i1) and the imaginary part i2 of the 2nd term negated.
110 | 	*x2 = (re-tt);	// Re{M(N-j)}
111 | 	*y2 = (im+i2);	// Im{M(N-j)}
112 | // Cost: [19 add, 15 mul] ... or [16 add, 18 mul] if replace re-part-of-cmuls (r+i)*(r-i) with r^2-i^2.
113 | // Can save another [2 add, 2 mul] by precomputing cc = (c + 1.0)/4 and ss = s/4 (the declarations above compute cc, ss in this form).
114 | #endif
115 | }
116 | 
117 | // Jul 2019: This routine adapted from my vintage 1999 mersenne_pm1.f90 code, with input-index swap 2 <--> 3:
void pair_mul(
	double *x1, double *y1, double *x2, double *y2, const double sx3, const double sy3, const double sx4, const double sy4,
	const double c, const double s)
{
/*
	Paired complex multiply for one (j, N-j) index pair.

	In/out : H[j]   = (*x1,*y1),  H[N-j] = (*x2,*y2)   -- both overwritten with the results
	Const  : I[j]   = (sx3,sy3),  I[N-j] = (sx4,sy4)
	         E      = (c,s), a complex exponential supplied by the caller

	With W = (1 + E)/4 and T = W * (H[j] - H~[N-j]) * (I[j] - I~[N-j])   [~ = complex conjugate],
	the routine stores
	         H[j]   <- H[j]  *I[j]   - T
	         H[N-j] <- H[N-j]*I[N-j] - T~
	All four input values are read into locals before any store is made.

	Cost: 16 add, 16 mul, plus (1 add, 2 mul) for forming W from (c,s).
*/
	/* W = (1 + E)/4: */
	const double w_re = (c + 1.0)*0.25;
	const double w_im = s*0.25;

	/* Snapshot the in/out operands: */
	const double h1_re = *x1, h1_im = *y1;	/* H[j]   */
	const double h2_re = *x2, h2_im = *y2;	/* H[N-j] */

	/* P2 = H[N-j]*I[N-j]: */
	const double p2_re = h2_re*sx4 - h2_im*sy4;
	const double p2_im = h2_re*sy4 + h2_im*sx4;

	/* Difference terms DH = H[j] - H~[N-j] and DI = I[j] - I~[N-j]
	   (conjugation flips the sign of the imaginary part, hence the '+'): */
	const double dh_re = h1_re - h2_re;
	const double dh_im = h1_im + h2_im;
	const double di_re = sx3 - sx4;
	const double di_im = sy3 + sy4;

	/* P1 = H[j]*I[j]: */
	const double p1_re = h1_re*sx3 - h1_im*sy3;
	const double p1_im = h1_re*sy3 + h1_im*sx3;

	/* Q = DH*DI, then T = W*Q: */
	const double q_re = dh_re*di_re - dh_im*di_im;
	const double q_im = dh_re*di_im + dh_im*di_re;
	const double t_re = w_re*q_re - w_im*q_im;
	const double t_im = w_im*q_re + w_re*q_im;

	/* j-term: P1 - T */
	*x1 = p1_re - t_re;
	*y1 = p1_im - t_im;
	/* (N-j)-term: P2 - T~ (same real part of T, imaginary part negated) */
	*x2 = p2_re - t_re;
	*y2 = p2_im + t_im;
}
164 | 
165 | 


--------------------------------------------------------------------------------
/src/prefetch.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/primesearch/Mlucas/5e6465318b8c656ffb83025229038f5c2614fa35/src/prefetch.h


--------------------------------------------------------------------------------
/src/qfcheb.h:
--------------------------------------------------------------------------------
/* Base length unit (in chars) used to size the string buffer declared below. */
#define STR_MAX_LEN 1024
/* Shared character buffer of 2*STR_MAX_LEN chars; only declared here (extern), so the
   actual definition must live in some other translation unit. */
extern char cbuf[STR_MAX_LEN*2];
3 | 


--------------------------------------------------------------------------------
/src/radix1024.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /****************************************************************************
 24 |  * We now include this header file if it was not included before.
 25 |  ****************************************************************************/
#ifndef radix1024_included
#define radix1024_included

#include "radix512.h"

	/*
	Cosine/sine constants for the odd-order 1024th roots of unity in the first octant.
	Naming: c1024_XX = cos(XX*twopi/1024), s1024_XX = sin(XX*twopi/1024), where XX is the
	two-digit HEX index of the root (01,03,...,7f), giving 64 (c,s) pairs in total.
	Only odd indices are defined in this header, which pulls in radix512.h above.
	*/
	// 'bc -l' code for these: p2=8*a(1);d=p2/1024;t=-d;	t+=(d+d);c(t);s(t); [repeat 64 times]:
	// Of the odd-order 1024th roots, note that _4f,_53,_7f end up being unused by the radix-1024 DFT twiddles array:
	#define c1024_01 ((double)0.99998117528260114265)
	#define s1024_01 ((double)0.00613588464915447535)	/* exp(01*I*twopi/1024) */
	#define c1024_03 ((double)0.99983058179582342201)
	#define s1024_03 ((double)0.01840672990580482090)	/* exp(03*I*twopi/1024) */
	#define c1024_05 ((double)0.99952941750109316308)
	#define s1024_05 ((double)0.03067480317663662588)	/* exp(05*I*twopi/1024) */
	#define c1024_07 ((double)0.99907772775264538289)
	#define s1024_07 ((double)0.04293825693494082301)	/* exp(07*I*twopi/1024) */
	#define c1024_09 ((double)0.99847558057329475221)
	#define s1024_09 ((double)0.05519524434968993972)	/* exp(09*I*twopi/1024) */
	#define c1024_0b ((double)0.99772306664419160985)
	#define s1024_0b ((double)0.06744391956366405780)	/* exp(0b*I*twopi/1024) */
	#define c1024_0d ((double)0.99682029929116571498)
	#define s1024_0d ((double)0.07968243797143012103)	/* exp(0d*I*twopi/1024) */
	#define c1024_0f ((double)0.99576741446765979399)
	#define s1024_0f ((double)0.09190895649713272849)	/* exp(0f*I*twopi/1024) */
	#define c1024_11 ((double)0.99456457073425545213)
	#define s1024_11 ((double)0.10412163387205457897)	/* exp(11*I*twopi/1024) */
	#define c1024_13 ((double)0.99321194923479453312)
	#define s1024_13 ((double)0.11631863091190476708)	/* exp(13*I*twopi/1024) */
	#define c1024_15 ((double)0.99170975366909952288)
	#define s1024_15 ((double)0.12849811079379317243)	/* exp(15*I*twopi/1024) */
	#define c1024_17 ((double)0.99005821026229710553)
	#define s1024_17 ((double)0.14065823933284923051)	/* exp(17*I*twopi/1024) */
	#define c1024_19 ((double)0.98825756773074949143)
	#define s1024_19 ((double)0.15279718525844342750)	/* exp(19*I*twopi/1024) */
	#define c1024_1b ((double)0.98630809724459864790)
	#define s1024_1b ((double)0.16491312048996992118)	/* exp(1b*I*twopi/1024) */
	#define c1024_1d ((double)0.98421009238692907323)
	#define s1024_1d ((double)0.17700422041214875594)	/* exp(1d*I*twopi/1024) */
	#define c1024_1f ((double)0.98196386910955526412)
	#define s1024_1f ((double)0.18906866414980621248)	/* exp(1f*I*twopi/1024) */
	#define c1024_21 ((double)0.97956976568544053449)
	#define s1024_21 ((double)0.20110463484209191127)	/* exp(21*I*twopi/1024) */
	#define c1024_23 ((double)0.97702814265775435155)
	#define s1024_23 ((double)0.21311031991609137366)	/* exp(23*I*twopi/1024) */
	#define c1024_25 ((double)0.97433938278557586059)
	#define s1024_25 ((double)0.22508391135979283567)	/* exp(25*I*twopi/1024) */
	#define c1024_27 ((double)0.97150389098625177561)
	#define s1024_27 ((double)0.23702360599436720653)	/* exp(27*I*twopi/1024) */
	#define c1024_29 ((double)0.96852209427441731631)
	#define s1024_29 ((double)0.24892760574572016775)	/* exp(29*I*twopi/1024) */
	#define c1024_2b ((double)0.96539444169768937465)
	#define s1024_2b ((double)0.26079411791527551791)	/* exp(2b*I*twopi/1024) */
	#define c1024_2d ((double)0.96212140426904159553)
	#define s1024_2d ((double)0.27262135544994898410)	/* exp(2d*I*twopi/1024) */
	#define c1024_2f ((double)0.95870347489587155549)
	#define s1024_2f ((double)0.28440753721127184321)	/* exp(2f*I*twopi/1024) */
	#define c1024_31 ((double)0.95514116830577072162)
	#define s1024_31 ((double)0.29615088824362382370)	/* exp(31*I*twopi/1024) */
	#define c1024_33 ((double)0.95143502096900836968)
	#define s1024_33 ((double)0.30784964004153489325)	/* exp(33*I*twopi/1024) */
	#define c1024_35 ((double)0.94758559101774113480)
	#define s1024_35 ((double)0.31950203081601567745)	/* exp(35*I*twopi/1024) */
	#define c1024_37 ((double)0.94359345816196036165)
	#define s1024_37 ((double)0.33110630575987640127)	/* exp(37*I*twopi/1024) */
	#define c1024_39 ((double)0.93945922360218991213)
	#define s1024_39 ((double)0.34266071731199439711)	/* exp(39*I*twopi/1024) */
	#define c1024_3b ((double)0.93518350993894757782)
	#define s1024_3b ((double)0.35416352542049038186)	/* exp(3b*I*twopi/1024) */
	#define c1024_3d ((double)0.93076696107898373214)
	#define s1024_3d ((double)0.36561299780477386950)	/* exp(3d*I*twopi/1024) */
	#define c1024_3f ((double)0.92621024213831134218)
	#define s1024_3f ((double)0.37700741021641825620)	/* exp(3f*I*twopi/1024) */
	#define c1024_41 ((double)0.92151403934204194368)
	#define s1024_41 ((double)0.38834504669882629109)	/* exp(41*I*twopi/1024) */
	#define c1024_43 ((double)0.91667905992104266335)
	#define s1024_43 ((double)0.39962419984564682799)	/* exp(43*I*twopi/1024) */
	#define c1024_45 ((double)0.91170603200542985165)
	#define s1024_45 ((double)0.41084317105790394162)	/* exp(45*I*twopi/1024) */
	#define c1024_47 ((double)0.90659570451491536559)
	#define s1024_47 ((double)0.42200027079979968537)	/* exp(47*I*twopi/1024) */
	#define c1024_49 ((double)0.90134884704602201485)
	#define s1024_49 ((double)0.43309381885315196790)	/* exp(49*I*twopi/1024) */
	#define c1024_4b ((double)0.89596624975618515621)
	#define s1024_4b ((double)0.44412214457042923104)	/* exp(4b*I*twopi/1024) */
	#define c1024_4d ((double)0.89044872324475789026)
	#define s1024_4d ((double)0.45508358712634382292)	/* exp(4d*I*twopi/1024) */
	#define c1024_4f ((double)0.88479709843093778043)
	#define s1024_4f ((double)0.46597649576796617728)	/* exp(4f*I*twopi/1024) */
	#define c1024_51 ((double)0.87901222642863347817)
	#define s1024_51 ((double)0.47679923006332213271)	/* exp(51*I*twopi/1024) */
	#define c1024_53 ((double)0.87309497841829009899)
	#define s1024_53 ((double)0.48755016014843595399)	/* exp(53*I*twopi/1024) */
	#define c1024_55 ((double)0.86704624551569265185)
	#define s1024_55 ((double)0.49822766697278185175)	/* exp(55*I*twopi/1024) */
	#define c1024_57 ((double)0.86086693863776727973)
	#define s1024_57 ((double)0.50883014254310703626)	/* exp(57*I*twopi/1024) */
	#define c1024_59 ((double)0.85455798836540052117)
	#define s1024_59 ((double)0.51935599016558958668)	/* exp(59*I*twopi/1024) */
	#define c1024_5b ((double)0.84812034480329725170)
	#define s1024_5b ((double)0.52980362468629466753)	/* exp(5b*I*twopi/1024) */
	#define c1024_5d ((double)0.84155497743689841004)
	#define s1024_5d ((double)0.54017147272989288060)	/* exp(5d*I*twopi/1024) */
	#define c1024_5f ((double)0.83486287498638005676)
	#define s1024_5f ((double)0.55045797293660480227)	/* exp(5f*I*twopi/1024) */
	#define c1024_61 ((double)0.82804504525775575255)
	#define s1024_61 ((double)0.56066157619733602312)	/* exp(61*I*twopi/1024) */
	#define c1024_63 ((double)0.82110251499110467956)
	#define s1024_63 ((double)0.57078074588696727951)	/* exp(63*I*twopi/1024) */
	#define c1024_65 ((double)0.81403632970594836217)
	#define s1024_65 ((double)0.58081395809576454434)	/* exp(65*I*twopi/1024) */
	#define c1024_67 ((double)0.80684755354379927274)
	#define s1024_67 ((double)0.59075970185887422768)	/* exp(67*I*twopi/1024) */
	#define c1024_69 ((double)0.79953726910790503405)
	#define s1024_69 ((double)0.60061647938386892590)	/* exp(69*I*twopi/1024) */
	#define c1024_6b ((double)0.79210657730021235236)
	#define s1024_6b ((double)0.61038280627630945196)	/* exp(6b*I*twopi/1024) */
	#define c1024_6d ((double)0.78455659715557523362)
	#define s1024_6d ((double)0.62005721176328917788)	/* exp(6d*I*twopi/1024) */
	#define c1024_6f ((double)0.77688846567323245066)
	#define s1024_6f ((double)0.62963823891492702460)	/* exp(6f*I*twopi/1024) */
	#define c1024_71 ((double)0.76910333764557963998)
	#define s1024_71 ((double)0.63912444486377574303)	/* exp(71*I*twopi/1024) */
	#define c1024_73 ((double)0.76120238548426181469)
	#define s1024_73 ((double)0.64851440102211244430)	/* exp(73*I*twopi/1024) */
	#define c1024_75 ((double)0.75318679904361248316)
	#define s1024_75 ((double)0.65780669329707865614)	/* exp(75*I*twopi/1024) */
	#define c1024_77 ((double)0.74505778544146596311)
	#define s1024_77 ((double)0.66699992230363750586)	/* exp(77*I*twopi/1024) */
	#define c1024_79 ((double)0.73681656887736987581)
	#define s1024_79 ((double)0.67609270357531595956)	/* exp(79*I*twopi/1024) */
	#define c1024_7b ((double)0.72846439044822519723)
	#define s1024_7b ((double)0.68508366777270038056)	/* exp(7b*I*twopi/1024) */
	#define c1024_7d ((double)0.72000250796138162984)
	#define s1024_7d ((double)0.69397146088965400820)	/* exp(7d*I*twopi/1024) */
	#define c1024_7f ((double)0.71143219574521644231)
	#define s1024_7f ((double)0.70275474445722530165)	/* exp(7f*I*twopi/1024) */

#endif	/* #ifndef radix1024_included */
163 | 


--------------------------------------------------------------------------------
/src/radix128.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /****************************************************************************
24 |  * We now include this header file if it was not included before.
25 |  ****************************************************************************/
#ifndef radix128_included
#define radix128_included

#include "radix64.h"

	/*
	Cosine/sine constants for the odd-order 128th roots of unity in the first octant.
	Naming: c128_X = cos(X*twopi/128), s128_X = sin(X*twopi/128), where X is the single-digit
	HEX index of the root (1,3,...,f), giving 8 (c,s) pairs. Only odd indices are defined here.
	*/
	#define c128_1 ((double)0.99879545620517239271)
	#define s128_1 ((double)0.04906767432741801425)	/* exp(1*I*twopi/128) */
	#define c128_3 ((double)0.98917650996478097345)
	#define s128_3 ((double)0.14673047445536175165)	/* exp(3*I*twopi/128) */
	#define c128_5 ((double)0.97003125319454399260)
	#define s128_5 ((double)0.24298017990326388994)	/* exp(5*I*twopi/128) */
	#define c128_7 ((double)0.94154406518302077841)
	#define s128_7 ((double)0.33688985339222005068)	/* exp(7*I*twopi/128) */
	#define c128_9 ((double)0.90398929312344333158)
	#define s128_9 ((double)0.42755509343028209431)	/* exp(9*I*twopi/128) */
	#define c128_b ((double)0.85772861000027206990)
	#define s128_b ((double)0.51410274419322172658)	/* exp(b*I*twopi/128) */
	#define c128_d ((double)0.80320753148064490981)
	#define s128_d ((double)0.59569930449243334345)	/* exp(d*I*twopi/128) */
	#define c128_f ((double)0.74095112535495909118)
	#define s128_f ((double)0.67155895484701840061)	/* exp(f*I*twopi/128) */

#endif	/* #ifndef radix128_included */
49 | 


--------------------------------------------------------------------------------
/src/radix128_twiddles.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | #include "radix128.h"
24 | 
// Skip the usual include-this-header-file-if-it-was-not-included-before #ifndef wrapper,
// since this file is not for defines/typedefs and such but rather to store a lengthy const-array-declaration
// and thus needs to be inline-able in multiple places in a source file making use of it.
28 | 
/*
Twiddle table for the radix-128 DFT: 16 rows of 14 doubles each.
Row 0 consists of seven (1,0) pairs; every other row is built from ISRT2 and the
c16/s16, c32_*, c64_* and c128_* root-of-unity constants, with signs and c<->s swaps applied.
NOTE(review): each row reads as seven consecutive (Re, Im) pairs of complex twiddles -
confirm against the DFT macros that consume this table.
*/
const double DFT128_TWIDDLES[16][14] = {
	{ 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0 },
	{ 0,1,ISRT2,ISRT2,-ISRT2,ISRT2,c16,s16,-s16,c16,s16,c16,-c16,s16 },
	{ ISRT2,ISRT2,c16,s16,s16,c16,c32_1,s32_1,s32_3,c32_3,c32_3,s32_3,s32_1,c32_1 },
	{ -ISRT2,ISRT2,s16,c16,-c16,-s16,c32_3,s32_3,-c32_1,s32_1,-s32_1,c32_1,-s32_3,-c32_3 },
	{ c16,s16,c32_1,s32_1,c32_3,s32_3,c64_1,s64_1,c64_5,s64_5,c64_3,s64_3,c64_7,s64_7 },
	{ -s16,c16,s32_3,c32_3,-c32_1,s32_1,c64_5,s64_5,-c64_7,s64_7,s64_1,c64_1,-c64_3,-s64_3 },
	{ s16,c16,c32_3,s32_3,-s32_1,c32_1,c64_3,s64_3,s64_1,c64_1,s64_7,c64_7,-s64_5,c64_5 },
	{ -c16,s16,s32_1,c32_1,-s32_3,-c32_3,c64_7,s64_7,-c64_3,-s64_3,-s64_5,c64_5,s64_1,-c64_1 },
	{ c32_1,s32_1, c64_1,s64_1, c64_3,s64_3, c128_1,s128_1, c128_5,s128_5, c128_3,s128_3, c128_7,s128_7 },
	{ -s32_1,c32_1, s64_7,c64_7, -c64_5,s64_5, c128_9,s128_9, -s128_d,c128_d, s128_5,c128_5, -c128_1,s128_1 },
	{ s32_3,c32_3, c64_5,s64_5, s64_1,c64_1, c128_5,s128_5, s128_7,c128_7, c128_f,s128_f, -s128_3,c128_3 },
	{ -c32_3,s32_3, s64_3,c64_3, -c64_7,-s64_7, c128_d,s128_d, -c128_1,-s128_1, -s128_7,c128_7, -s128_5,-c128_5 },
	{ c32_3,s32_3, c64_3,s64_3, s64_7,c64_7, c128_3,s128_3, c128_f,s128_f, c128_9,s128_9, s128_b,c128_b },
	{ -s32_3,c32_3, s64_5,c64_5, -c64_1,-s64_1, c128_b,s128_b, -c128_9,s128_9, -s128_1,c128_1, -c128_d,-s128_d },
	{ s32_1,c32_1, c64_7,s64_7, -s64_5,c64_5, c128_7,s128_7, -s128_3,c128_3, s128_b,c128_b, -c128_f,s128_f },
	{ -c32_1,s32_1, s64_1,c64_1, -s64_3,-c64_3, c128_f,s128_f, -c128_b,-s128_b, -s128_d,c128_d, s128_9,-c128_9 }
};
47 | 
48 | 


--------------------------------------------------------------------------------
/src/radix15_sse_macro.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | /*******************************************************************************
 24 |    We now include this header file if it was not included before.
 25 | *******************************************************************************/
 26 | #ifndef radix15_sse_macro_h_included
 27 | #define radix15_sse_macro_h_included
 28 | 
 29 | #include "sse2_macro_gcc64.h"
 30 | 
 31 | /* General indexing for twiddleless radix-15 done as 3*radix-5 followed by 5*radix-3 is as for the scalar macro above:
 32 | RADIX_15_DIF(00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E)
 33 | ->
 34 | 	RADIX_05_DFT(i0,iC,i9,i6,i3, t0,t1,t2,t3,t4)
 35 | 	RADIX_05_DFT(iA,i7,i4,i1,iD, t5,t6,t7,t8,t9)
 36 | 	RADIX_05_DFT(i5,i2,iE,iB,i8, tA,tB,tC,tD,tE)
 37 | 
 38 | 	RADIX_03_DFT(t0,t5,tA, o0,o1,o2,)
	RADIX_03_DFT(t1,t6,tB, oD,oE,oC,)
 40 | 	RADIX_03_DFT(t2,t7,tC, o9,oA,oB,)
 41 | 	RADIX_03_DFT(t3,t8,tD, o8,o6,o7,)
 42 | 	RADIX_03_DFT(t4,t9,tE, o4,o5,o3,)
 43 | 
 44 | In our impl below, the __i are input pointers, which may overlap the __o outputs;
 45 | ..cc0 and cc1 are ptrs to the radix-3 and radix-5 SSE2 sincos constants (c3m1 and cn1);
 46 | __t0-E are ptr to scratch local storage (i.e. the address block pointed to by r00-r3e).
 47 | */
 48 | // Aug 2014: Need arbitrary-pointer-offsets to support I/O permutations needed by
 49 | // larger-radix DFTs of length 15 * 2^n
 50 | 
/*
SSE2_RADIX_15_DIF: radix-15 DIF DFT built as 3 x radix-5 followed by 5 x radix-3 sub-DFTs.
	__cc0     : forwarded unchanged to every SSE2_RADIX_03_DFT invocation (radix-3 constants).
	__cc1     : forwarded unchanged to every SSE2_RADIX_05_DFT_0TWIDDLE invocation (radix-5 constants).
	__i0-__iE : the 15 inputs, consumed by the radix-5 pass in permuted order.
	__t0-__tE : 15 intermediates; written by the radix-5 pass, read by the radix-3 pass.
	__o0-__oE : the 15 outputs, written by the radix-3 pass in permuted order.
Each argument is used exactly once as an input or output of a sub-DFT macro; the expansion is a
brace-enclosed compound statement.
*/
#define SSE2_RADIX_15_DIF(\
	__cc0, __cc1,\
	__i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
	__t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
	__o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE)\
{\
/* Pass 1: three radix-5 DFTs, permuted inputs --> t0-4, t5-9, tA-E: */\
	SSE2_RADIX_05_DFT_0TWIDDLE(__i0,__iC,__i9,__i6,__i3, __cc1, __t0,__t1,__t2,__t3,__t4);\
	SSE2_RADIX_05_DFT_0TWIDDLE(__iA,__i7,__i4,__i1,__iD, __cc1, __t5,__t6,__t7,__t8,__t9);\
	SSE2_RADIX_05_DFT_0TWIDDLE(__i5,__i2,__iE,__iB,__i8, __cc1, __tA,__tB,__tC,__tD,__tE);\
\
/* Pass 2: five radix-3 DFTs, each combining t[k],t[k+5],t[k+10] into a permuted output triplet: */\
	SSE2_RADIX_03_DFT(__t0,__t5,__tA, __cc0, __o0,__o1,__o2);\
	SSE2_RADIX_03_DFT(__t1,__t6,__tB, __cc0, __oD,__oE,__oC);\
	SSE2_RADIX_03_DFT(__t2,__t7,__tC, __cc0, __o9,__oA,__oB);\
	SSE2_RADIX_03_DFT(__t3,__t8,__tD, __cc0, __o8,__o6,__o7);\
	SSE2_RADIX_03_DFT(__t4,__t9,__tE, __cc0, __o4,__o5,__o3);\
}
 67 | 
/*
SSE2_RADIX_15_DIT: radix-15 DIT DFT built as 5 x radix-3 followed by 3 x radix-5 sub-DFTs,
i.e. the two passes of SSE2_RADIX_15_DIF in the opposite order.
	__cc0     : forwarded unchanged to every SSE2_RADIX_03_DFT invocation (radix-3 constants).
	__cc1     : forwarded unchanged to every SSE2_RADIX_05_DFT_0TWIDDLE invocation (radix-5 constants).
	__i0-__iE : the 15 inputs, consumed by the radix-3 pass in permuted order.
	__t0-__tE : 15 intermediates; written by the radix-3 pass, read by the radix-5 pass.
	__o0-__oE : the 15 outputs, written by the radix-5 pass in permuted order.
The expansion is a brace-enclosed compound statement.
*/
#define SSE2_RADIX_15_DIT(\
	__cc0, __cc1,\
	__i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
	__t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
	__o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE)\
{\
/* Swap the 2nd pair of each output triplet to effect iDFT: */\
	SSE2_RADIX_03_DFT(__i0,__i2,__i1, __cc0, __t0,__t2,__t1);\
	SSE2_RADIX_03_DFT(__i8,__i7,__i6, __cc0, __t3,__t5,__t4);\
	SSE2_RADIX_03_DFT(__iD,__iC,__iE, __cc0, __t6,__t8,__t7);\
	SSE2_RADIX_03_DFT(__i4,__i3,__i5, __cc0, __t9,__tB,__tA);\
	SSE2_RADIX_03_DFT(__i9,__iB,__iA, __cc0, __tC,__tE,__tD);\
\
/* Output perm here is 0123456789abcde --> 05a6b1c2738d9e4: */\
/* (i.e. intermediate t[k] is routed to the output whose index is the k-th hex digit of that string) */\
	SSE2_RADIX_05_DFT_0TWIDDLE(__t0,__t3,__t6,__t9,__tC, __cc1, __o0,__o6,__oC,__o3,__o9);\
	SSE2_RADIX_05_DFT_0TWIDDLE(__t1,__t4,__t7,__tA,__tD, __cc1, __o5,__oB,__o2,__o8,__oE);\
	SSE2_RADIX_05_DFT_0TWIDDLE(__t2,__t5,__t8,__tB,__tE, __cc1, __oA,__o1,__o7,__oD,__o4);\
}
 86 | 
 87 | // Cost: 12 DP-math, 17 vector MOV for each of the two side-by-side 3-DFTs in SSE2_RADIX_03_DFT_X2
 88 | //       38 DP-math, 31 vector MOV for each of the two side-by-side 5-DFTs in SSE2_RADIX_05_DFT_0TWIDDLE_X2. Thus
 89 | // 150 DP-math, 144 vector MOV for each of the two side-by-side 15-DFTs in each of these two [DIF and DIT] 15-DFT macro-of-macros.
 90 | // Compare to van-Buskirk 13-DFT: 198 DP-math, 168 vector MOV.
/*
SSE2_RADIX_15_DIF_X2: two independent radix-15 DIF DFTs done side by side, using the _X2
(paired) variants of the radix-5 and radix-3 sub-DFT macros. Index permutations are identical to
those of SSE2_RADIX_15_DIF, applied in parallel to both data sets.
	__cc0, __cc1 : forwarded to the radix-3 and radix-5 sub-DFT macros, respectively.
	__two        : forwarded to every SSE2_RADIX_05_DFT_0TWIDDLE_X2 invocation
	               (presumably a pointer to a vector constant 2.0 - confirm in sse2_macro_gcc64.h).
	DFT #1 : inputs __i0-__iE, intermediates __s0-__sE, outputs __o0-__oE.
	DFT #2 : inputs __j0-__jE, intermediates __t0-__tE, outputs __u0-__uE.
*/
#define SSE2_RADIX_15_DIF_X2(\
	__cc0, __cc1, __two,\
	__i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
	__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7,__s8,__s9,__sA,__sB,__sC,__sD,__sE,\
	__o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE,\
			__j0,__j1,__j2,__j3,__j4,__j5,__j6,__j7,__j8,__j9,__jA,__jB,__jC,__jD,__jE,\
			__t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
			__u0,__u1,__u2,__u3,__u4,__u5,__u6,__u7,__u8,__u9,__uA,__uB,__uC,__uD,__uE)\
{\
/* Pass 1: three paired radix-5 DFTs; left arg group = DFT #1 (i --> s), right arg group = DFT #2 (j --> t): */\
	SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __i0,__iC,__i9,__i6,__i3, __s0,__s1,__s2,__s3,__s4,	__j0,__jC,__j9,__j6,__j3, __t0,__t1,__t2,__t3,__t4);\
	SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __iA,__i7,__i4,__i1,__iD, __s5,__s6,__s7,__s8,__s9,	__jA,__j7,__j4,__j1,__jD, __t5,__t6,__t7,__t8,__t9);\
	SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __i5,__i2,__iE,__iB,__i8, __sA,__sB,__sC,__sD,__sE,	__j5,__j2,__jE,__jB,__j8, __tA,__tB,__tC,__tD,__tE);\
\
/* Pass 2: five paired radix-3 DFTs; DFT #1 (s --> o) and DFT #2 (t --> u) use the same output permutation: */\
	SSE2_RADIX_03_DFT_X2(__cc0, __s0,__s5,__sA, __o0,__o1,__o2,		__t0,__t5,__tA, __u0,__u1,__u2);\
	SSE2_RADIX_03_DFT_X2(__cc0, __s1,__s6,__sB, __oD,__oE,__oC,		__t1,__t6,__tB, __uD,__uE,__uC);\
	SSE2_RADIX_03_DFT_X2(__cc0, __s2,__s7,__sC, __o9,__oA,__oB,		__t2,__t7,__tC, __u9,__uA,__uB);\
	SSE2_RADIX_03_DFT_X2(__cc0, __s3,__s8,__sD, __o8,__o6,__o7,		__t3,__t8,__tD, __u8,__u6,__u7);\
	SSE2_RADIX_03_DFT_X2(__cc0, __s4,__s9,__sE, __o4,__o5,__o3,		__t4,__t9,__tE, __u4,__u5,__u3);\
}
110 | 
/*
SSE2_RADIX_15_DIT_X2: two independent radix-15 DIT DFTs done side by side, using the _X2
(paired) variants of the radix-3 and radix-5 sub-DFT macros. Index permutations are identical to
those of SSE2_RADIX_15_DIT, applied in parallel to both data sets.
	__cc0, __cc1 : forwarded to the radix-3 and radix-5 sub-DFT macros, respectively.
	__two        : forwarded to every SSE2_RADIX_05_DFT_0TWIDDLE_X2 invocation
	               (presumably a pointer to a vector constant 2.0 - confirm in sse2_macro_gcc64.h).
	DFT #1 : inputs __i0-__iE, intermediates __s0-__sE, outputs __o0-__oE.
	DFT #2 : inputs __j0-__jE, intermediates __t0-__tE, outputs __u0-__uE.
*/
#define SSE2_RADIX_15_DIT_X2(\
	__cc0, __cc1,__two,\
	__i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
	__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7,__s8,__s9,__sA,__sB,__sC,__sD,__sE,\
	__o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE,\
			__j0,__j1,__j2,__j3,__j4,__j5,__j6,__j7,__j8,__j9,__jA,__jB,__jC,__jD,__jE,\
			__t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
			__u0,__u1,__u2,__u3,__u4,__u5,__u6,__u7,__u8,__u9,__uA,__uB,__uC,__uD,__uE)\
{\
/* Swap the 2nd pair of each output triplet to effect iDFT: */\
	SSE2_RADIX_03_DFT_X2(__cc0, __i0,__i2,__i1, __s0,__s2,__s1,		__j0,__j2,__j1, __t0,__t2,__t1);\
	SSE2_RADIX_03_DFT_X2(__cc0, __i8,__i7,__i6, __s3,__s5,__s4,		__j8,__j7,__j6, __t3,__t5,__t4);\
	SSE2_RADIX_03_DFT_X2(__cc0, __iD,__iC,__iE, __s6,__s8,__s7,		__jD,__jC,__jE, __t6,__t8,__t7);\
	SSE2_RADIX_03_DFT_X2(__cc0, __i4,__i3,__i5, __s9,__sB,__sA,		__j4,__j3,__j5, __t9,__tB,__tA);\
	SSE2_RADIX_03_DFT_X2(__cc0, __i9,__iB,__iA, __sC,__sE,__sD,		__j9,__jB,__jA, __tC,__tE,__tD);\
\
/* Output perm here is 0123456789abcde --> 05a6b1c2738d9e4: */\
/* (same routing for both data sets: s[k] --> o[perm(k)] and t[k] --> u[perm(k)]) */\
	SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s0,__s3,__s6,__s9,__sC, __o0,__o6,__oC,__o3,__o9,	__t0,__t3,__t6,__t9,__tC, __u0,__u6,__uC,__u3,__u9);\
	SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s1,__s4,__s7,__sA,__sD, __o5,__oB,__o2,__o8,__oE,	__t1,__t4,__t7,__tA,__tD, __u5,__uB,__u2,__u8,__uE);\
	SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s2,__s5,__s8,__sB,__sE, __oA,__o1,__o7,__oD,__o4,	__t2,__t5,__t8,__tB,__tE, __uA,__u1,__u7,__uD,__u4);\
}
132 | 
133 | #endif	/* radix15_sse_macro_h_included */
134 | 
135 | 


--------------------------------------------------------------------------------
/src/radix16.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /****************************************************************************
24 |  * We now include this header file if it was not included before.
25 |  ****************************************************************************/
26 | #ifndef radix16_included
27 | #define radix16_included
28 | 
29 | 	#define c16    ((double)0.92387953251128675613)	/* cos(twopi/16) */
30 | 	#define s16    ((double)0.38268343236508977173)	/* sin(twopi/16), so that c16 + I*s16 = exp(  I*twopi/16) */
31 | 
32 | #endif	/* #ifndef radix16_included */
33 | 


--------------------------------------------------------------------------------
/src/radix16_wrapper_ini.c:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | #include "Mlucas.h"
 24 | 
 25 | /***************/
 26 | 
 27 | /* Initialize the ws_* index arrays used in radix16_wrapper_square, so that the processing of the [radix0] disjoint
 28 |    data blocks by that routine can be executed in parallel, if desired. The loop state (i,j1,j2,j2_start,k,m,blocklen,blocklen_sum) is function-static: the call with iblock == 0 resets it and stores it in slot [iblock] of each ws_* array; every other call resumes from the retained state via the jump_in label and, on reaching the next block boundary, stores the state in slot [iblock_next] and returns. Results therefore depend on call order, and the routine is not reentrant.
 29 | */
 30 | void radix16_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int radix_prim[], int ws_i[], int ws_j1[], int ws_j2[], int ws_j2_start[], int ws_k[], int ws_m[], int ws_blocklen[], int ws_blocklen_sum[])
 31 | {
 32 | 	static int i,j1,j2,j2_start,k,m,blocklen,blocklen_sum;	// Loop state retained between calls
 33 | 	int iblock_next;
 34 | 
 35 | 	if(iblock <= 1 && !(radix0 & 1))	// ws-slot to fill at the next block boundary: iblock+1 for iblock = 0,1 when radix0 is even, iblock+2 otherwise
 36 | 	  	iblock_next = iblock + 1;
 37 | 	else
 38 | 		iblock_next = iblock + 2;
 39 | 
 40 | 	if(iblock == 0)	// j1 = real-array index (double the complex-array index) of the 1st element of each floating pair.
 41 | 	{
 42 | 		// No need to init I and M here, since they are set by entry into the nested I/M loop in radix16_pairFFT_mul_square; ws_i[iblock] and ws_m[iblock] below thus receive whatever the statics currently hold:
 43 | 		j1           =  0;
 44 | 		j2           = 32;
 45 | 		j2_start     = j2;	// j2 = real-array index (double the complex-array index) of 2nd element of each floating pair.
 46 | 		k            =  0;
 47 | 		blocklen     = 16;	// = half of complex blocklength, since process 2 complex data for each value of loop index L.
 48 | 		blocklen_sum =  0;
 49 | 
 50 | 		ws_i           [iblock] = i           ;
 51 | 		ws_j1          [iblock] = j1          ;
 52 | 		ws_j2          [iblock] = j2          ;
 53 | 		ws_j2_start    [iblock] = j2_start    ;
 54 | 		ws_k           [iblock] = k           ;
 55 | 		ws_m           [iblock] = m           ;
 56 | 		ws_blocklen    [iblock] = blocklen    ;
 57 | 		ws_blocklen_sum[iblock] = blocklen_sum;
 58 | 	} else {
 59 | 		goto jump_in;	// Resume mid-loop from the static state left by the previous call
 60 | 	}
 61 | 
 62 | 	for(i = nradices_prim-5; i >= 0; i-- )	// Main loop: lower bound = nradices_prim - radix_now.
 63 | 	{										// Remember, radices get processed in reverse order here as in forward FFT.
 64 | 		for(m = 0; m < (blocklen-1)>>1; m += 8) // Do two 16-element sets per loop, so only execute loop half as many times as before.
 65 | 		{
 66 | 			// This tells us when we've reached the end of the current data block:
 67 | 			// Apr 2014: Must store intermediate product j1*radix0 in a 64-bit int to prevent overflow!
 68 | 			if(j1 && ((uint64)j1*radix0)%n == 0)
 69 | 			{
 70 | 				ws_i           [iblock_next] = i           ;
 71 | 				ws_j1          [iblock_next] = j1          ;
 72 | 				ws_j2          [iblock_next] = j2          ;
 73 | 				ws_j2_start    [iblock_next] = j2_start    ;
 74 | 				ws_k           [iblock_next] = k           ;
 75 | 				ws_m           [iblock_next] = m           ;
 76 | 				ws_blocklen    [iblock_next] = blocklen    ;
 77 | 				ws_blocklen_sum[iblock_next] = blocklen_sum;
 78 | 			//	printf("%8" PRIu64 "  %20" PRIu64 "  %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
 79 | 				return;
 80 | 			}
 81 | 	jump_in:	// Entry point for all blocks but the first: execution continues inside the loop body with the retained static i, m, j1, j2, k.
 82 | 			k += 2;	// increment sincos array index
 83 | 			// And update the data (j1 and j2) array indices:
 84 | 			j1 += 32;
 85 | 			j2 -= 32;
 86 | 		}
 87 | 	/*
 88 | 	!...Since the foregoing loop only gets executed half as many times as in the simple version, to properly position
 89 | 	!   ourselves in the data array for the start of the next block, need to bump up j1 by as much as would occur in a
 90 | 	!   second execution of the above loop. The exception is the first loop execution, where j1 needs to be doubled (32 x 2).
 91 | 	*/
 92 | 		j1 += (blocklen << 1);
 93 | 		if(j2_start == n-32) {
 94 | 		//	printf("(j2_start == n-32) return with j2_start = %d\n",j2_start);
 95 | 			return;
 96 | 		}
 97 | 
 98 | 	/*...Reset half-complex-blocklength for next pass. If K >> 1 has a zero trailing bit,
 99 | 		 we multiply the blocklength by K >> 1 in preparation for the final block.	*/
100 | 
101 | 		blocklen_sum += blocklen;
102 | 		blocklen = (blocklen_sum) * (radix_prim[i-1]-1);	// NOTE(review): this reads radix_prim[-1] if reached with i == 0; presumably the j2_start == n-32 return above always fires first on the final pass - confirm
103 | 
104 | 	/*...Next j2_start is previous one plus the (real) length of the current block = 4*(half-complex-blocklength) */
105 | 
106 | 		j2_start += (blocklen<<2);
107 | 		j2 = j2_start;			/* Reset j2 for start of the next block. */
108 | 	//	printf("newblock: blocklen = %8d blocklen_sum = %8d j2 = %8d\n",blocklen,blocklen_sum,j2);
109 | 	}	 /* End of Main loop */
110 | }
111 | 
112 | /*
113 | Jun 2014: Possible UMR bug? Note that, aside from i=1, only even-idx elts of the ws-arrays get inited ...
114 | so how do the odd-index reads not hose the result?
115 | 
116 | Using complex FFT radices        16         8        16        16        16
117 |                                           init ws_k[  0] =          0
118 |    65536               1048576    131040: init ws_k[  1] =       2048
119 |   131072               2097152    262112: init ws_k[  2] =       4096
120 |   262144               4194304    524256: init ws_k[  4] =       8192
121 |   327680               5242880    458720: init ws_k[  6] =      12288
122 |   524288               8388608   1048544: init ws_k[  8] =      16384
123 |   589824               9437184    983008: init ws_k[ 10] =      20480
124 |   655360              10485760    917472: init ws_k[ 12] =      24576
125 |   720896              11534336    851936: init ws_k[ 14] =      28672
126 | Mers_mod_square: Init threadpool of 1 threads
127 | Setting CPU = 0 affinity of worker thread id 0, mach_id = 3843
128 | radix16_wrapper_square with ws[]-index = 0
129 | stride = 32
130 | On entry: i = 0, j1,j2,j2_start = 0, 32, 32, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 16
131 | radix16_wrapper_square with ws[]-index = 1
132 | stride = 32
133 | On entry: i = 3, j1,j2,j2_start = 65536, 131040, 131040, k,m = 2048, 0, nrad_prim = 19, blocklen,sum = 16384
134 | radix16_wrapper_square with ws[]-index = 2
135 | stride = 32
136 | On entry: i = 2, j1,j2,j2_start = 131072, 262112, 262112, k,m = 4096, 0, nrad_prim = 19, blocklen,sum = 32768
137 | radix16_wrapper_square with ws[]-index = 3	<*** inited where? ***
138 | stride = 32						vvvv <*** j1 = 0, so no "jump_in": ***
139 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
140 | ========================
141 | So when j1 = 0 on entry we exit immediately via:
142 | 		if(j1 && ((uint64)j1*radix0)%n == 0)
143 | 		{
144 | 		//	fprintf(stderr,"(j1 && j1*radix0 == 0 (mod n)) check hit: returning\n");
145 | 			return;
146 | 		}
147 | ========================
148 | radix16_wrapper_square with ws[]-index = 4
149 | stride = 32
150 | On entry: i = 1, j1,j2,j2_start = 262144, 524256, 524256, k,m = 8192, 0, nrad_prim = 19, blocklen,sum = 65536
151 | radix16_wrapper_square with ws[]-index = 5	<*** j1 = 0
152 | stride = 32
153 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
154 | radix16_wrapper_square with ws[]-index = 6
155 | stride = 32
156 | On entry: i = 1, j1,j2,j2_start = 327680, 458720, 524256, k,m = 12288, 16384, nrad_prim = 19, blocklen,sum = 65536
157 | radix16_wrapper_square with ws[]-index = 7	<*** j1 = 0
158 | stride = 32
159 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
160 | radix16_wrapper_square with ws[]-index = 8
161 | stride = 32
162 | On entry: i = 0, j1,j2,j2_start = 524288, 1048544, 1048544, k,m = 16384, 0, nrad_prim = 19, blocklen,sum = 131072
163 | radix16_wrapper_square with ws[]-index = 9	<*** j1 = 0
164 | stride = 32
165 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
166 | radix16_wrapper_square with ws[]-index = 10
167 | stride = 32
168 | On entry: i = 0, j1,j2,j2_start = 589824, 983008, 1048544, k,m = 20480, 16384, nrad_prim = 19, blocklen,sum = 131072
169 | radix16_wrapper_square with ws[]-index = 11	<*** j1 = 0
170 | stride = 32
171 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
172 | radix16_wrapper_square with ws[]-index = 12
173 | stride = 32
174 | On entry: i = 0, j1,j2,j2_start = 655360, 917472, 1048544, k,m = 24576, 32768, nrad_prim = 19, blocklen,sum = 131072
175 | radix16_wrapper_square with ws[]-index = 13	<*** j1 = 0
176 | stride = 32
177 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
178 | radix16_wrapper_square with ws[]-index = 14
179 | stride = 32
180 | On entry: i = 0, j1,j2,j2_start = 720896, 851936, 1048544, k,m = 28672, 49152, nrad_prim = 19, blocklen,sum = 131072
181 | radix16_wrapper_square with ws[]-index = 15	<*** j1 = 0
182 | stride = 32
183 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
184 | 
185 | Thus, j1 = 0 is how the odd-idx uninit is handled in practice - BUT NEED TO ENSURE ALL THE J1-DATA ARE INITED = 0 AT OUTSET
186 | 
187 | Thus, switch ws_* allocs in mers_mod_square from malloc to calloc.
188 | 
189 | (Surprised this issue took so long to manifest...)
190 | */
191 | 


--------------------------------------------------------------------------------
/src/radix17_ditN_cy_dif1.c:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | #include "Mlucas.h"
 24 | #include "radix17_dft.h"
 25 | 
 26 | /***************/
 27 | 
 28 | int radix17_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], double wt1[], int si[], double base[], double baseinv[], int iter, double *fracmax, uint64 p)
 29 | {	// Stub: every argument is ignored and the function unconditionally returns 1.
 30 | 	return 1;
 31 | }
 32 | 
 33 | /***************/
 34 | 
 35 | void radix17_dif_pass1(double a[], int n)
 36 | {
 37 | /*
 38 | !...Acronym: DIF = Decimation In Frequency
 39 | !
 40 | !...Subroutine to perform an initial radix-17 complex DIF FFT pass on the data in the length-N real vector A.
 41 | !
 42 | !   See the documentation in radix16_dif_pass for further details on storage and indexing.
 43 | !
 44 | !   Given complex inputs (x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF,xG), we need the following outputs
 45 | !   (here cJ = cos(2*J*pi/17), sJ = sin(2*J*pi/17)):
 46 | !
 47 | !	X0 = C0,      where C0 = x0+   (x1+xG)+   (x2+xF)+   (x3+xE)+   (x4+xD)+   (x5+xC)+   (x6+xB)+   (x7+xA)+   (x8+x9),
 48 | !					the cosine terms below get massaged into the form of a length-8 cyclic convolution:
 49 | !	X1 = C1 + I*S1		C1 =
 50 | !	X2 = C2 + I*S2
 51 | !	X3 = C3 + I*S3
 52 | !	X4 = C4 + I*S4
 53 | !	X5 = C5 + I*S5
 54 | !	X6 = C6 + I*S6
 55 | !	X7 = C7 + I*S7
 56 | !	X8 = C8 + I*S8
 57 | !					and the sine terms get massaged into the form of a length-8 acyclic convolution:
 58 | !	X9 = C8 - I*S8
 59 | !	XA = C7 - I*S7
 60 | !	XB = C6 - I*S6
 61 | !	XC = C5 - I*S5
 62 | !	XD = C4 - I*S4
 63 | !	XE = C3 - I*S3
 64 | !	XF = C2 - I*S2
 65 | !	XG = C1 - I*S1
 66 | !
 67 | !   We refer to the terms C1-8 (which do not explicitly involve the imaginary constant I)
 68 | !   as the "cosine part" of the output, and S1-8 (those multiplied by I) as the "sine part."
 69 | !	Opcount for general odd-prime radix R (NOTE(review): the numeric figures on the following lines are the R = 11 values of these formulas, apparently carried over from the radix-11 routine; the formulas give 256 fmul, 320 fadd for R = 17 - TODO confirm against the actual RADIX_17_DFT macro):
 70 | !   Totals :                                                        100 FMUL, 140 FADD,		(R-1)^2 fmul	(R+3)*(R-1) fadd
 71 | !                                                        compared to 16 FMUL,  96 FADD for radix-12. (Ouch!)
 72 | !
 73 | !   Relative cost := #FADD/(radix*lg2(radix)) = 3.679 .
 74 | */
 75 | 	int j,j1,j2;
 76 | 	static int n17,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16, first_entry=TRUE;	// Index offsets are cached across calls
 77 | 
 78 | 	if(!first_entry && (n/17) != n17)	/* New runlength? If so, force the offsets below to be recomputed */
 79 | 	{
 80 | 		first_entry=TRUE;
 81 | 	}
 82 | 
 83 | /*...initialize things upon first entry	*/
 84 | 
 85 | 	if(first_entry)
 86 | 	{
 87 | 		first_entry=FALSE;
 88 | 		n17 = n/17;
 89 | 	// Constant index offsets for array load/stores are here (pK = K*n17 before padding):
 90 | 		p1  = n17;
 91 | 		p2  = p1 +p1;
 92 | 		p3  = p2 +p1;
 93 | 		p4  = p3 +p1;
 94 | 		p5  = p4 +p1;
 95 | 		p6  = p5 +p1;
 96 | 		p7  = p6 +p1;
 97 | 		p8  = p7 +p1;
 98 | 		p9  = p8 +p1;
 99 | 		p10 = p9 +p1;
100 | 		p11 = p10+p1;
101 | 		p12 = p11+p1;
102 | 		p13 = p12+p1;
103 | 		p14 = p13+p1;
104 | 		p15 = p14+p1;
105 | 		p16 = p15+p1;
106 | 
107 | 		p1  += ( (p1 >> DAT_BITS) << PAD_BITS );	// Same padded-array index adjustment as is applied to j1 in the loop below
108 | 		p2  += ( (p2 >> DAT_BITS) << PAD_BITS );
109 | 		p3  += ( (p3 >> DAT_BITS) << PAD_BITS );
110 | 		p4  += ( (p4 >> DAT_BITS) << PAD_BITS );
111 | 		p5  += ( (p5 >> DAT_BITS) << PAD_BITS );
112 | 		p6  += ( (p6 >> DAT_BITS) << PAD_BITS );
113 | 		p7  += ( (p7 >> DAT_BITS) << PAD_BITS );
114 | 		p8  += ( (p8 >> DAT_BITS) << PAD_BITS );
115 | 		p9  += ( (p9 >> DAT_BITS) << PAD_BITS );
116 | 		p10 += ( (p10>> DAT_BITS) << PAD_BITS );
117 | 		p11 += ( (p11>> DAT_BITS) << PAD_BITS );
118 | 		p12 += ( (p12>> DAT_BITS) << PAD_BITS );
119 | 		p13 += ( (p13>> DAT_BITS) << PAD_BITS );
120 | 		p14 += ( (p14>> DAT_BITS) << PAD_BITS );
121 | 		p15 += ( (p15>> DAT_BITS) << PAD_BITS );
122 | 		p16 += ( (p16>> DAT_BITS) << PAD_BITS );
123 | 	}
124 | 
125 | /*...The radix-17 pass is here.	*/
126 | 
127 | 	for(j=0; j < n17; j += 2)
128 | 	{
129 | 	#ifdef USE_SSE2
130 | 		j1 = (j & mask01) + br4[j&3];
131 | 			j1 =j1 + ( (j1>> DAT_BITS) << PAD_BITS );
132 | 	#else
133 | 			j1 = j + ( (j >> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
134 | 	#endif
135 | 		j2 = j1+RE_IM_STRIDE;
136 | 		/* Invoke the radix-17 DFT macro with both argument groups (the values a[...] and the addresses a+...) in natural index order 0,1,...,16: */
137 | 		RADIX_17_DFT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7],a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15],a[j1+p16],a[j2+p16]
138 | 					,a+j1 ,a+j2 ,a+j1+p1 ,a+j2+p1 ,a+j1+p2 ,a+j2+p2 ,a+j1+p3 ,a+j2+p3 ,a+j1+p4 ,a+j2+p4 ,a+j1+p5 ,a+j2+p5 ,a+j1+p6 ,a+j2+p6 ,a+j1+p7 ,a+j2+p7 ,a+j1+p8 ,a+j2+p8 ,a+j1+p9 ,a+j2+p9 ,a+j1+p10 ,a+j2+p10 ,a+j1+p11 ,a+j2+p11 ,a+j1+p12 ,a+j2+p12 ,a+j1+p13 ,a+j2+p13 ,a+j1+p14 ,a+j2+p14 ,a+j1+p15 ,a+j2+p15 ,a+j1+p16 ,a+j2+p16 );
139 | 	}
140 | }
141 | 
142 | /***************/
143 | 
144 | void radix17_dit_pass1(double a[], int n)
145 | {
146 | /*
147 | !...Acronym: DIT = Decimation In Time
148 | !
149 | !...Subroutine to perform a final radix-17 complex DIT FFT pass on the data in the length-N real vector A.
150 | !   The index offsets p1-p16 (multiples of n/17, with array padding applied) are function-static and are recomputed only when n/17 changes.
151 | !   See the documentation in radix16_dif_pass for further details on storage and indexing.
152 | */
153 | 	int j,j1,j2;
154 | 	static int n17,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16, first_entry=TRUE;	// Index offsets are cached across calls
155 | 
156 | 	if(!first_entry && (n/17) != n17)	/* New runlength? If so, force the offsets below to be recomputed */
157 | 	{
158 | 		first_entry=TRUE;
159 | 	}
160 | 
161 | /*...initialize things upon first entry	*/
162 | 
163 | 	if(first_entry)
164 | 	{
165 | 		first_entry=FALSE;
166 | 		n17 = n/17;
167 | 	// Constant index offsets for array load/stores are here (pK = K*n17 before padding):
168 | 		p1  = n17;
169 | 		p2  = p1 +p1;
170 | 		p3  = p2 +p1;
171 | 		p4  = p3 +p1;
172 | 		p5  = p4 +p1;
173 | 		p6  = p5 +p1;
174 | 		p7  = p6 +p1;
175 | 		p8  = p7 +p1;
176 | 		p9  = p8 +p1;
177 | 		p10 = p9 +p1;
178 | 		p11 = p10+p1;
179 | 		p12 = p11+p1;
180 | 		p13 = p12+p1;
181 | 		p14 = p13+p1;
182 | 		p15 = p14+p1;
183 | 		p16 = p15+p1;
184 | 
185 | 		p1  += ( (p1 >> DAT_BITS) << PAD_BITS );	// Same padded-array index adjustment as is applied to j1 in the loop below
186 | 		p2  += ( (p2 >> DAT_BITS) << PAD_BITS );
187 | 		p3  += ( (p3 >> DAT_BITS) << PAD_BITS );
188 | 		p4  += ( (p4 >> DAT_BITS) << PAD_BITS );
189 | 		p5  += ( (p5 >> DAT_BITS) << PAD_BITS );
190 | 		p6  += ( (p6 >> DAT_BITS) << PAD_BITS );
191 | 		p7  += ( (p7 >> DAT_BITS) << PAD_BITS );
192 | 		p8  += ( (p8 >> DAT_BITS) << PAD_BITS );
193 | 		p9  += ( (p9 >> DAT_BITS) << PAD_BITS );
194 | 		p10 += ( (p10>> DAT_BITS) << PAD_BITS );
195 | 		p11 += ( (p11>> DAT_BITS) << PAD_BITS );
196 | 		p12 += ( (p12>> DAT_BITS) << PAD_BITS );
197 | 		p13 += ( (p13>> DAT_BITS) << PAD_BITS );
198 | 		p14 += ( (p14>> DAT_BITS) << PAD_BITS );
199 | 		p15 += ( (p15>> DAT_BITS) << PAD_BITS );
200 | 		p16 += ( (p16>> DAT_BITS) << PAD_BITS );
201 | 	}
202 | 
203 | /*...The radix-17 pass is here.	*/
204 | 
205 | 	for(j=0; j < n17; j += 2)
206 | 	{
207 | 	#ifdef USE_SSE2
208 | 		j1 = (j & mask01) + br4[j&3];
209 | 			j1 =j1 + ( (j1>> DAT_BITS) << PAD_BITS );
210 | 	#else
211 | 			j1 = j + ( (j >> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
212 | 	#endif
213 | 		j2 = j1+RE_IM_STRIDE;
214 | 		// Call same radix-17 DFT macro as for DIF, but in the second (address) argument group keep index 0 in place and replace indices j = 1-16 with j*16%17 = 17-j, i.e. run in reverse order:
215 | 		RADIX_17_DFT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7],a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15],a[j1+p16],a[j2+p16]
216 | 					,a+j1 ,a+j2 ,a+j1+p16 ,a+j2+p16 ,a+j1+p15 ,a+j2+p15 ,a+j1+p14 ,a+j2+p14 ,a+j1+p13 ,a+j2+p13 ,a+j1+p12 ,a+j2+p12 ,a+j1+p11 ,a+j2+p11 ,a+j1+p10 ,a+j2+p10 ,a+j1+p9 ,a+j2+p9 ,a+j1+p8 ,a+j2+p8 ,a+j1+p7 ,a+j2+p7 ,a+j1+p6 ,a+j2+p6 ,a+j1+p5 ,a+j2+p5 ,a+j1+p4 ,a+j2+p4 ,a+j1+p3 ,a+j2+p3 ,a+j1+p2 ,a+j2+p2 ,a+j1+p1 ,a+j2+p1 );
217 | 	}
218 | }
219 | 


--------------------------------------------------------------------------------
/src/radix256.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | /****************************************************************************
24 |  * We now include this header file if it was not included before.
25 |  ****************************************************************************/
26 | #ifndef radix256_included
27 | #define radix256_included
28 | 
29 | #include "radix128.h"
30 | 
31 | 	#define c256_01 ((double)0.99969881869620422011)
32 | 	#define s256_01 ((double)0.02454122852291228802)	/* exp(01*I*twopi/256); in each pair below c256_XX + I*s256_XX = exp(XX*I*twopi/256), with the 2-character index XX in hexadecimal */
33 | 	#define c256_03 ((double)0.99729045667869021613)
34 | 	#define s256_03 ((double)0.07356456359966742351)	/* exp(03*I*twopi/256) */
35 | 	#define c256_05 ((double)0.99247953459870999816)
36 | 	#define s256_05 ((double)0.12241067519921619847)	/* exp(05*I*twopi/256) */
37 | 	#define c256_07 ((double)0.98527764238894124478)
38 | 	#define s256_07 ((double)0.17096188876030122632)	/* exp(07*I*twopi/256) */
39 | 	#define c256_09 ((double)0.97570213003852854447)
40 | 	#define s256_09 ((double)0.21910124015686979717)	/* exp(09*I*twopi/256) */
41 | 	#define c256_0b ((double)0.96377606579543986670)
42 | 	#define s256_0b ((double)0.26671275747489838626)	/* exp(0b*I*twopi/256) */
43 | 	#define c256_0d ((double)0.94952818059303666721)
44 | 	#define s256_0d ((double)0.31368174039889147658)	/* exp(0d*I*twopi/256) */
45 | 	#define c256_0f ((double)0.93299279883473888774)
46 | 	#define s256_0f ((double)0.35989503653498814869)	/* exp(0f*I*twopi/256) */
47 | 	#define c256_11 ((double)0.91420975570353065467)
48 | 	#define s256_11 ((double)0.40524131400498987082)	/* exp(11*I*twopi/256) */
49 | 	#define c256_13 ((double)0.89322430119551532038)
50 | 	#define s256_13 ((double)0.44961132965460659995)	/* exp(13*I*twopi/256) */
51 | 	#define c256_15 ((double)0.87008699110871141870)
52 | 	#define s256_15 ((double)0.49289819222978403677)	/* exp(15*I*twopi/256) */
53 | 	#define c256_17 ((double)0.84485356524970707332)
54 | 	#define s256_17 ((double)0.53499761988709721055)	/* exp(17*I*twopi/256) */
55 | 	#define c256_19 ((double)0.81758481315158369658)
56 | 	#define s256_19 ((double)0.57580819141784530063)	/* exp(19*I*twopi/256) */
57 | 	#define c256_1b ((double)0.78834642762660626210)
58 | 	#define s256_1b ((double)0.61523159058062684536)	/* exp(1b*I*twopi/256) */
59 | 	#define c256_1d ((double)0.75720884650648454767)
60 | 	#define s256_1d ((double)0.65317284295377676396)	/* exp(1d*I*twopi/256) */
61 | 	#define c256_1f ((double)0.72424708295146692105)
62 | 	#define s256_1f ((double)0.68954054473706692449)	/* exp(1f*I*twopi/256) */
63 | 
64 | #endif	/* #ifndef radix256_included */
65 | 


--------------------------------------------------------------------------------
/src/radix256_twiddles.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
23 | #include "radix256.h"
24 | 
25 | // Skip the usual include-this-header-file-if-it-was-not-included-before #ifndef wrapper,
26 | // since this file is not for defines/typedefs and such but rather to store a lengthy const-array-declaration
27 | // and thus needs to be inline-able in multiple places in a source file making use of it.
28 | // Table layout: 16 rows of 30 doubles, each row written as 15 two-element pairs.
29 | const double DFT256_TWIDDLES[16][30] = {
30 | 	{ 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0 },
31 | 	{ 0,1, ISRT2,ISRT2, -ISRT2,ISRT2, c16,s16, -s16,c16, s16,c16, -c16,s16, c32_1,s32_1, -s32_1,c32_1, s32_3,c32_3, -c32_3,s32_3, c32_3,s32_3, -s32_3,c32_3, s32_1,c32_1, -c32_1,s32_1 },
32 | 	{ ISRT2,ISRT2, c16,s16, s16,c16, c32_1,s32_1, s32_3,c32_3, c32_3,s32_3, s32_1,c32_1, c64_1,s64_1, s64_7,c64_7, c64_5,s64_5, s64_3,c64_3, c64_3,s64_3, s64_5,c64_5, c64_7,s64_7, s64_1,c64_1 },
33 | 	{ -ISRT2,ISRT2, s16,c16, -c16,-s16, c32_3,s32_3, -c32_1,s32_1, -s32_1,c32_1, -s32_3,-c32_3, c64_3,s64_3, -c64_5,s64_5, s64_1,c64_1, -c64_7,-s64_7, s64_7,c64_7, -c64_1,-s64_1, -s64_5,c64_5, -s64_3,-c64_3 },
34 | 	{ c16,s16, c32_1,s32_1, c32_3,s32_3, c64_1,s64_1, c64_5,s64_5, c64_3,s64_3, c64_7,s64_7, c128_1,s128_1, c128_9,s128_9, c128_5,s128_5, c128_d,s128_d, c128_3,s128_3, c128_b,s128_b, c128_7,s128_7, c128_f,s128_f },
35 | 	{ -s16,c16, s32_3,c32_3, -c32_1,s32_1, c64_5,s64_5, -c64_7,s64_7, s64_1,c64_1, -c64_3,-s64_3, c128_5,s128_5, -s128_d,c128_d, s128_7,c128_7, -c128_1,-s128_1, c128_f,s128_f, -c128_9,s128_9, -s128_3,c128_3, -c128_b,-s128_b },
36 | 	{ s16,c16, c32_3,s32_3, -s32_1,c32_1, c64_3,s64_3, s64_1,c64_1, s64_7,c64_7, -s64_5,c64_5, c128_3,s128_3, s128_5,c128_5, c128_f,s128_f, -s128_7,c128_7, c128_9,s128_9, -s128_1,c128_1, s128_b,c128_b, -s128_d,c128_d },
37 | 	{ -c16,s16, s32_1,c32_1, -s32_3,-c32_3, c64_7,s64_7, -c64_3,-s64_3, -s64_5,c64_5, s64_1,-c64_1, c128_7,s128_7, -c128_1,s128_1, -s128_3,c128_3, -s128_5,-c128_5, s128_b,c128_b, -c128_d,-s128_d, -c128_f,s128_f, s128_9,-c128_9 },
38 | 	{ c32_1,s32_1, c64_1,s64_1, c64_3,s64_3, c128_1,s128_1, c128_5,s128_5, c128_3,s128_3, c128_7,s128_7, c256_01,s256_01, c256_09,s256_09, c256_05,s256_05, c256_0d,s256_0d, c256_03,s256_03, c256_0b,s256_0b, c256_07,s256_07, c256_0f,s256_0f },
39 | 	{ -s32_1,c32_1, s64_7,c64_7, -c64_5,s64_5, c128_9,s128_9, -s128_d,c128_d, s128_5,c128_5, -c128_1,s128_1, c256_09,s256_09, -s256_11,c256_11, s256_13,c256_13, -c256_0b,s256_0b, c256_1b,s256_1b, -c256_1d,s256_1d, s256_01,c256_01, -c256_07,-s256_07 },
40 | 	{ s32_3,c32_3, c64_5,s64_5, s64_1,c64_1, c128_5,s128_5, s128_7,c128_7, c128_f,s128_f, -s128_3,c128_3, c256_05,s256_05, s256_13,c256_13, c256_19,s256_19, -s256_01,c256_01, c256_0f,s256_0f, s256_09,c256_09, s256_1d,c256_1d, -s256_0b,c256_0b },
41 | 	{ -c32_3,s32_3, s64_3,c64_3, -c64_7,-s64_7, c128_d,s128_d, -c128_1,-s128_1, -s128_7,c128_7, -s128_5,-c128_5, c256_0d,s256_0d, -c256_0b,s256_0b, -s256_01,c256_01, -s256_17,-c256_17, s256_19,c256_19, -c256_0f,-s256_0f, -s256_1b,c256_1b, s256_03,-c256_03 },
42 | 	{ c32_3,s32_3, c64_3,s64_3, s64_7,c64_7, c128_3,s128_3, c128_f,s128_f, c128_9,s128_9, s128_b,c128_b, c256_03,s256_03, c256_1b,s256_1b, c256_0f,s256_0f, s256_19,c256_19, c256_09,s256_09, s256_1f,c256_1f, c256_15,s256_15, s256_13,c256_13 },
43 | 	{ -s32_3,c32_3, s64_5,c64_5, -c64_1,-s64_1, c128_b,s128_b, -c128_9,s128_9, -s128_1,c128_1, -c128_d,-s128_d, c256_0b,s256_0b, -c256_1d,s256_1d, s256_09,c256_09, -c256_0f,-s256_0f, s256_1f,c256_1f, -c256_07,s256_07, -s256_0d,c256_0d, -s256_1b,-c256_1b },
44 | 	{ s32_1,c32_1, c64_7,s64_7, -s64_5,c64_5, c128_7,s128_7, -s128_3,c128_3, s128_b,c128_b, -c128_f,s128_f, c256_07,s256_07, s256_01,c256_01, s256_1d,c256_1d, -s256_1b,c256_1b, c256_15,s256_15, -s256_0d,c256_0d, s256_0f,c256_0f, -c256_17,s256_17 },
45 | 	{ -c32_1,s32_1, s64_1,c64_1, -s64_3,-c64_3, c128_f,s128_f, -c128_b,-s128_b, -s128_d,c128_d, s128_9,-c128_9, c256_0f,s256_0f, -c256_07,-s256_07, -s256_0b,c256_0b, s256_03,-c256_03, s256_13,c256_13, -s256_1b,-c256_1b, -c256_17,s256_17, c256_1f,-s256_1f }
46 | };
47 | 
48 | 


--------------------------------------------------------------------------------
/src/radix32.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
 23 | /****************************************************************************
 24 |  * Include guard: everything below is processed only on the first inclusion.
 25 |  ****************************************************************************/
 26 | #ifndef radix32_included
 27 | #define radix32_included
 28 | 
 29 | #include "radix16.h"
 30 | /* cN_k, sN_k = cos, sin of k*twopi/N, i.e. real and imaginary parts of exp(k*I*twopi/N); only k = 1 and 3 are defined here for N = 32: */
 31 | 	#define c32_1  ((double)0.98078528040323044912)	/* cos(1*twopi/32) */
 32 | 	#define s32_1  ((double)0.19509032201612826784)	/* sin(1*twopi/32): c32_1 + I*s32_1 = exp(1*I*twopi/32) */
 33 | 	#define c32_3  ((double)0.83146961230254523708)	/* cos(3*twopi/32) */
 34 | 	#define s32_3  ((double)0.55557023301960222473)	/* sin(3*twopi/32): c32_3 + I*s32_3 = exp(3*I*twopi/32) */
 35 | 
 36 | #endif	/* #ifndef radix32_included */
37 | 


--------------------------------------------------------------------------------
/src/radix32_wrapper_ini.c:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | #include "Mlucas.h"
 24 | 
 25 | /***************/
 26 | 
 27 | /* Initialize the various arrays of indices used in radix32_wrapper_square, so we can execute the processing of the [radix0] disjoint data blocks by that routine in parallel, if desired.
 28 |    Each call advances a function-static index walk (i,j1,j2,j2_start,k,m,blocklen,blocklen_sum) from where the previous call stopped until j1*radix0 is next a nonzero multiple of n, then snapshots that state into slot [iblock_next] of the ws_* arrays. A call with iblock == 0 first resets the walk and also fills slot [0]. The static state makes this routine non-reentrant; callers presumably invoke it for successive blocks in order, starting with iblock == 0 - TODO confirm against the caller.
 29 | */
 30 | void radix32_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int radix_prim[], int ws_i[], int ws_j1[], int ws_j2[], int ws_j2_start[], int ws_k[], int ws_m[], int ws_blocklen[], int ws_blocklen_sum[])
 31 | {
 32 | 	static int i,j1,j2,j2_start,k,m,blocklen,blocklen_sum;	// Persist between calls: calls with iblock != 0 resume the loops below via 'goto jump_in'
 33 | 	int iblock_next;	// Index of the ws_* slot written when the end of the current data block is detected
 34 | 
 35 | 	if(iblock <= 1 && !(radix0 & 1))	// Slot stride is 1 for iblock = 0,1 with even radix0, and 2 otherwise. NOTE(review): the rationale is not visible in this file.
 36 | 	  	iblock_next = iblock + 1;
 37 | 	else
 38 | 		iblock_next = iblock + 2;
 39 | 
 40 | 	if(iblock == 0)	// j1 = real-array index (double the complex-array index) of the 1st element of each floating pair.
 41 | 	{
 42 | 		// No need to init I and M here, since they are set by entry into the nested I/M loop (the original comment names radix16_pairFFT_mul_square here - NOTE(review): possibly a holdover, confirm which routine is meant). As a result ws_i[0] and ws_m[0] below just receive whatever the statics currently hold:
 43 | 		j1           =  0;
 44 | 		j2           = 64;
 45 | 		j2_start     = j2;	// j2 = real-array index (double the complex-array index) of 2nd element of each floating pair.
 46 | 		k            =  0;
 47 | 		blocklen     = 32;	// = half of complex blocklength, since process 2 complex data for each value of loop index L.
 48 | 		blocklen_sum =  0;
 49 | 
 50 | 		ws_i           [iblock] = i           ;
 51 | 		ws_j1          [iblock] = j1          ;
 52 | 		ws_j2          [iblock] = j2          ;
 53 | 		ws_j2_start    [iblock] = j2_start    ;
 54 | 		ws_k           [iblock] = k           ;
 55 | 		ws_m           [iblock] = m           ;
 56 | 		ws_blocklen    [iblock] = blocklen    ;
 57 | 		ws_blocklen_sum[iblock] = blocklen_sum;
 58 | 	} else {
 59 | 		goto jump_in;	// Resume mid-loop using the static state left behind by the previous call
 60 | 	}
 61 | 
 62 | 	for(i = nradices_prim-6; i >= 0; i-- )	// Main loop: lower bound = nradices_prim - radix_now.
 63 | 	{										// Remember, radices get processed in reverse order here as in forward FFT.
 64 | 		for(m = 0; m < (blocklen-1)>>1; m += 16) // Do two 32-element sets per loop, so only execute loop half as many times as before.
 65 | 		{
 66 | 			// This tells us when we've reached the end of the current data block:
 67 | 			// Apr 2014: Must store intermediate product j1*radix0 in a 64-bit int to prevent overflow!
 68 | 			if(j1 && ((uint64)j1*radix0)%n == 0)
 69 | 			{
 70 | 				ws_i           [iblock_next] = i           ;
 71 | 				ws_j1          [iblock_next] = j1          ;
 72 | 				ws_j2          [iblock_next] = j2          ;
 73 | 				ws_j2_start    [iblock_next] = j2_start    ;
 74 | 				ws_k           [iblock_next] = k           ;
 75 | 				ws_m           [iblock_next] = m           ;
 76 | 				ws_blocklen    [iblock_next] = blocklen    ;
 77 | 				ws_blocklen_sum[iblock_next] = blocklen_sum;
 78 | 			//	printf("%8" PRIu64 "  %20" PRIu64 "  %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
 79 | 				return;
 80 | 			}
 81 | 	jump_in:	// Entry point for all blocks but the first. The goto lands inside both for-loops without running their initializers, so i and m keep their saved static values.
 82 | 			k += 2;	// increment sincos array index
 83 | 			// And update the data (j1 and j2) array indices:
 84 | 			j1 += 64;
 85 | 			j2 -= 64;
 86 | 		}
 87 | 	/*
 88 | 	!...Since the foregoing loop only gets executed half as many times as in the simple version, to properly position
 89 | 	!   ourselves in the data array for the start of the next block, need to bump up j1 by as much as would occur in a
 90 | 	!   second execution of the above loop. The exception is the first loop execution, where j1 needs to be doubled (32 x 2).
 91 | 	*/
 92 | 		j1 += (blocklen << 1);
 93 | 
 94 | 		if(j2_start == n-64) {	// Early exit; note that no ws_* slot is written on this path
 95 | 		//	printf("(j2_start == n-64) return with j2_start = %d\n",j2_start);
 96 | 			return;
 97 | 		}
 98 | 
 99 | 	/*...Reset half-complex-blocklength for next pass. If K >> 1 has a zero trailing bit,
100 | 		 we multiply the blocklength by K >> 1 in preparation for the final block.	*/
101 | 
102 | 		blocklen_sum += blocklen;
103 | 		blocklen = (blocklen_sum) * (radix_prim[i-1]-1);	// NOTE(review): would read radix_prim[-1] if ever reached with i == 0; presumably the j2_start == n-64 return above fires first - confirm
104 | 
105 | 	/*...Next j2_start is previous one plus the (real) length of the current block = 4*(half-complex-blocklength) */
106 | 
107 | 		j2_start += (blocklen<<2);
108 | 		j2 = j2_start;			/* Reset j2 for start of the next block. */
109 | 	//	printf("newblock: blocklen = %8d blocklen_sum = %8d j2 = %8d\n",blocklen,blocklen_sum,j2);
110 | 	}	 /* End of Main loop */
111 | }
112 | 
113 | 


--------------------------------------------------------------------------------
/src/radix512.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
 23 | /****************************************************************************
 24 |  * Include guard: everything below is processed only on the first inclusion.
 25 |  ****************************************************************************/
 26 | #ifndef radix512_included
 27 | #define radix512_included
 28 | 
 29 | #include "radix256.h"
 30 | /* c512_kk, s512_kk = cos, sin of kk*twopi/512, where the 2-digit suffix kk is HEXADECIMAL; only odd kk from 0x01 through 0x3f are defined here: */
 31 | 	#define c512_01 ((double)0.99992470183914454092)
 32 | 	#define s512_01 ((double)0.01227153828571992607)	/* exp(0x01*I*twopi/512) */
 33 | 	#define c512_03 ((double)0.99932238458834950089)
 34 | 	#define s512_03 ((double)0.03680722294135883230)	/* exp(0x03*I*twopi/512) */
 35 | 	#define c512_05 ((double)0.99811811290014920712)
 36 | 	#define s512_05 ((double)0.06132073630220857774)	/* exp(0x05*I*twopi/512) */
 37 | 	#define c512_07 ((double)0.99631261218277801263)
 38 | 	#define s512_07 ((double)0.08579731234443989040)	/* exp(0x07*I*twopi/512) */
 39 | 	#define c512_09 ((double)0.99390697000235604155)
 40 | 	#define s512_09 ((double)0.11022220729388305873)	/* exp(0x09*I*twopi/512) */
 41 | 	#define c512_0b ((double)0.99090263542778002511)
 42 | 	#define s512_0b ((double)0.13458070850712618623)	/* exp(0x0b*I*twopi/512) */
 43 | 	#define c512_0d ((double)0.98730141815785838241)
 44 | 	#define s512_0d ((double)0.15885814333386144158)	/* exp(0x0d*I*twopi/512) */
 45 | 	#define c512_0f ((double)0.98310548743121632720)
 46 | 	#define s512_0f ((double)0.18303988795514095840)	/* exp(0x0f*I*twopi/512) */
 47 | 	#define c512_11 ((double)0.97831737071962763313)
 48 | 	#define s512_11 ((double)0.20711137619221854957)	/* exp(0x11*I*twopi/512) */
 49 | 	#define c512_13 ((double)0.97293995220556014550)
 50 | 	#define s512_13 ((double)0.23105810828067111950)	/* exp(0x13*I*twopi/512) */
 51 | 	#define c512_15 ((double)0.96697647104485210912)
 52 | 	#define s512_15 ((double)0.25486565960451457139)	/* exp(0x15*I*twopi/512) */
 53 | 	#define c512_17 ((double)0.96043051941556581124)
 54 | 	#define s512_17 ((double)0.27851968938505310503)	/* exp(0x17*I*twopi/512) */
 55 | 	#define c512_19 ((double)0.95330604035419383697)
 56 | 	#define s512_19 ((double)0.30200594931922806681)	/* exp(0x19*I*twopi/512) */
 57 | 	#define c512_1b ((double)0.94560732538052132579)
 58 | 	#define s512_1b ((double)0.32531029216226293393)	/* exp(0x1b*I*twopi/512) */
 59 | 	#define c512_1d ((double)0.93733901191257492328)
 60 | 	#define s512_1d ((double)0.34841868024943456820)	/* exp(0x1d*I*twopi/512) */
 61 | 	#define c512_1f ((double)0.92850608047321556602)
 62 | 	#define s512_1f ((double)0.37131719395183754318)	/* exp(0x1f*I*twopi/512) */
 63 | 	#define c512_21 ((double)0.91911385169005774400)
 64 | 	#define s512_21 ((double)0.39399204006104810836)	/* exp(0x21*I*twopi/512) */
 65 | 	#define c512_23 ((double)0.90916798309052237667)
 66 | 	#define s512_23 ((double)0.41642956009763718231)	/* exp(0x23*I*twopi/512) */
 67 | 	#define c512_25 ((double)0.89867446569395384316)
 68 | 	#define s512_25 ((double)0.43861623853852763738)	/* exp(0x25*I*twopi/512) */
 69 | 	#define c512_27 ((double)0.88763962040285394789)
 70 | 	#define s512_27 ((double)0.46053871095824002336)	/* exp(0x27*I*twopi/512) */
 71 | 	#define c512_29 ((double)0.87607009419540660724)
 72 | 	#define s512_29 ((double)0.48218377207912274823)	/* exp(0x29*I*twopi/512) */
 73 | 	#define c512_2b ((double)0.86397285612158673808)
 74 | 	#define s512_2b ((double)0.50353838372571755840)	/* exp(0x2b*I*twopi/512) */
 75 | 	#define c512_2d ((double)0.85135519310526514244)
 76 | 	#define s512_2d ((double)0.52458968267846890591)	/* exp(0x2d*I*twopi/512) */
 77 | 	#define c512_2f ((double)0.83822470555483804338)
 78 | 	#define s512_2f ((double)0.54532498842204642200)	/* exp(0x2f*I*twopi/512) */
 79 | 	#define c512_31 ((double)0.82458930278502526468)
 80 | 	#define s512_31 ((double)0.56573181078361319707)	/* exp(0x31*I*twopi/512) */
 81 | 	#define c512_33 ((double)0.81045719825259479195)
 82 | 	#define s512_33 ((double)0.58579785745643886000)	/* exp(0x33*I*twopi/512) */
 83 | 	#define c512_35 ((double)0.79583690460888353651)
 84 | 	#define s512_35 ((double)0.60551104140432551359)	/* exp(0x35*I*twopi/512) */
 85 | 	#define c512_37 ((double)0.78073722857209447856)
 86 | 	#define s512_37 ((double)0.62485948814238637675)	/* exp(0x37*I*twopi/512) */
 87 | 	#define c512_39 ((double)0.76516726562245892617)
 88 | 	#define s512_39 ((double)0.64383154288979146473)	/* exp(0x39*I*twopi/512) */
 89 | 	#define c512_3b ((double)0.74913639452345932577)
 90 | 	#define s512_3b ((double)0.66241577759017176077)	/* exp(0x3b*I*twopi/512) */
 91 | 	#define c512_3d ((double)0.73265427167241283493)
 92 | 	#define s512_3d ((double)0.68060099779545305024)	/* exp(0x3d*I*twopi/512) */
 93 | 	#define c512_3f ((double)0.71573082528381865446)
 94 | 	#define s512_3f ((double)0.69837624940897285320)	/* exp(0x3f*I*twopi/512) */
 95 | 
 96 | #endif	/* #ifndef radix512_included */
97 | 


--------------------------------------------------------------------------------
/src/radix63_main_carry_loop.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | // This main carry loop is the same for the unthreaded and multithreaded builds, so it is kept in a header file
 24 | // (can't use a macro because of the #if-enclosed stuff). Every variable and macro used below must be supplied by the including scope.
 25 | // Per (k,j) step: radix-63 DIT (7 radix-9 DITs, then 9 radix-7 DFTs) -> 63 carry-macro calls -> radix-63 DIF (9 radix-7 DFTs, then 7 radix-9 DIFs).
 26 | for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 27 | {
 28 | 	for(j = jstart; j < jhi; j += stride)
 29 | 	{
 30 | 		j1 =  j;
 31 | 		j1 = j1 + ( (j1 >> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
 32 | 		j2 = j1 + RE_IM_STRIDE;	/* a[j1+...], a[j2+...] are passed below as the (re,im) pair of each complex datum */
 33 | 
 34 | 	/*...The radix-63 DIT pass is here:	*/
 35 | 
 36 | 	//...gather the needed data (63 64-bit complex, i.e. 126 64-bit reals) and do 7 radix-9 transforms:
 37 | 		tptr = t; iptr = dit_iperm;	// Inputs are read from a[] at offsets p[dit_iperm[...]]; outputs go to 9 consecutive t[] entries per transform
 38 | 		for(l = 0; l < 7; l++) {
 39 | 			k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; k7 = p[*(iptr+7)]; k8 = p[*(iptr+8)];
 40 | 			RADIX_09_DIT(
 41 | 				a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],a[j1+k7],a[j2+k7],a[j1+k8],a[j2+k8],
 42 | 				tptr->re,tptr->im,(tptr+1)->re,(tptr+1)->im,(tptr+2)->re,(tptr+2)->im,(tptr+3)->re,(tptr+3)->im,(tptr+4)->re,(tptr+4)->im,(tptr+5)->re,(tptr+5)->im,(tptr+6)->re,(tptr+6)->im,(tptr+7)->re,(tptr+7)->im,(tptr+8)->re,(tptr+8)->im,
 43 | 				rt,it,re
 44 | 			);	tptr += 9; iptr += 9;
 45 | 		}
 46 | 	//...and now do 9 radix-7 transforms (inputs taken from t[] at stride 9, outputs written back to a[] at offsets p[dit_operm[...]]):
 47 | 		tptr = t; iptr = dit_operm;
 48 | 		for(l = 0; l < 9; l++) {
 49 | 			k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)];
 50 | 			RADIX_07_DFT(
 51 | 				tptr->re,tptr->im,(tptr+9)->re,(tptr+9)->im,(tptr+18)->re,(tptr+18)->im,(tptr+27)->re,(tptr+27)->im,(tptr+36)->re,(tptr+36)->im,(tptr+45)->re,(tptr+45)->im,(tptr+54)->re,(tptr+54)->im,
 52 | 				t00,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13,
 53 | 				a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],
 54 | 				uc1,us1,uc2,us2,uc3,us3, rt,it,re,im
 55 | 			);	tptr++; iptr += 7;
 56 | 		}
 57 | 
 58 | /*...Now do the carries. Since the outputs would
 59 | 	normally be getting dispatched to 63 separate blocks of the A-array, we need 63 separate carries.	*/
 60 | 
 61 | 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 62 | 	{
 63 | 		l= j & (nwt-1);
 64 | 		n_minus_sil   = n-si[l  ];
 65 | 		n_minus_silp1 = n-si[l+1];
 66 | 		sinwt   = si[nwt-l  ];
 67 | 		sinwtm1 = si[nwt-l-1];
 68 | 
 69 | 		wtl     =wt0[    l  ];
 70 | 		wtn     =wt0[nwt-l  ]*scale;	/* Include 1/(n/2) scale factor of inverse transform here...	*/
 71 | 		wtlp1   =wt0[    l+1];
 72 | 		wtnm1   =wt0[nwt-l-1]*scale;	/* ...and here.	*/
 73 | 
 74 | 		/*...set0 is slightly different from others; divide work into blocks of RADIX/4 macro calls, 1st set of which gets pulled out of loop: */
 75 | // Apr 2014: Fermat-mod works fine, but mers-mod barfs immediately with what looks like a bad a0 value,
 76 | // div-by-n/2 should give 16, but instead see
 77 | //	iter 1, full = 1, a0in =   15.492078993055555
 78 | //	iter 1, full = 1, a0out =   13.000000000000000
 79 | //	Iter = 1, maxerr =    0.492078993055555
 80 | //if(!j)printf("iter %d, full = %d, a0in = %20.15f\n",iter,full_pass,a[0]/(n>>1));
 81 | 		l = 0; addr = cy_r; itmp = bjmodn;	// l = carry index; addr and itmp step through cy_r[] and bjmodn[] in lockstep with it
 82 | 		jt = j1; jp = j2;
 83 | 	   cmplx_carry_norm_errcheck0(a[j1   ],a[j2   ],*addr,*itmp,0,prp_mult); ++l; ++addr; ++itmp;
 84 | 		// Next 15 quartets of macro calls done in loop (1 + 15*4 + 2 cleanup calls = 63 carries in all):
 85 | 		for(ntmp = 1; ntmp < 16; ntmp++) {
 86 | 			cmplx_carry_norm_errcheck(a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
 87 | 			cmplx_carry_norm_errcheck(a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
 88 | 			cmplx_carry_norm_errcheck(a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
 89 | 			jt = j1 + p[ntmp<<2]; jp = j2 + p[ntmp<<2];	// Advance the base offsets to the next group of 4 data
 90 | 			cmplx_carry_norm_errcheck(a[jt   ],a[jp   ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
 91 | 		}
 92 | 		// Cleanup of final 2 sets of carries:
 93 | 		cmplx_carry_norm_errcheck(a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
 94 | 		cmplx_carry_norm_errcheck(a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
 95 | //if(!j)printf("iter %d, full = %d, a0out = %20.15f\n",iter,full_pass,a[0]);
 96 | 		i =((uint32)(sw - bjmodn[0]) >> 31);	/* get ready for the next set...	*/
 97 | 		co2 = co3;	/* For all data but the first set in each j-block, co2=co3. Thus, after the first block of data is done
 98 | 					(and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1).	*/
 99 | 	}
100 | 	else	/* MODULUS_TYPE_FERMAT */
101 | 	{
102 | 		// Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2):
103 | 		ntmp = 0; addr = cy_r; addi = cy_i; ic = 0;	// ic = idx into icycle mini-array, gets incremented (mod ODD_RADIX) between macro calls
104 | 		jt = j1; jp = j2;	// 3 leading calls + 15 loop passes of 4 calls each = 63 carries in all
105 | 		fermat_carry_norm_errcheckB(a[jt   ],a[jp   ],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
106 | 		fermat_carry_norm_errcheckB(a[jt+p1],a[jp+p1],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
107 | 		fermat_carry_norm_errcheckB(a[jt+p2],a[jp+p2],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
108 | 		for(m = 1; m < 16; m++) {
109 | 			fermat_carry_norm_errcheckB(a[jt+p3],a[jp+p3],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
110 | 			jt = j1 + p[m<<2]; jp = j2 + p[m<<2];
111 | 			fermat_carry_norm_errcheckB(a[jt   ],a[jp   ],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
112 | 			fermat_carry_norm_errcheckB(a[jt+p1],a[jp+p1],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
113 | 			fermat_carry_norm_errcheckB(a[jt+p2],a[jp+p2],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi; ++ic;
114 | 		}
115 | 		for(l = 0; l < ODD_RADIX; l++) {
116 | 			icycle[l] += wts_idx_incr;	/* Inside the loop use this, as it is faster than general-mod '% nwt' */
117 | 			icycle[l] += ( (-(int)((uint32)icycle[l] >> 31)) & nwt);	/* Branchless wraparound: adds nwt iff the sign bit of icycle[l] is set */
118 | 		}
119 | 	}	/* if(MODULUS_TYPE == ...) */
120 | 
121 | 	/*...The radix-63 DIF pass is here:	*/
122 | 
123 | 	//...gather the needed data (63 64-bit complex, i.e. 126 64-bit reals) and do 9 radix-7 transforms:
124 | 		tptr = t; iptr = dif_iperm;	// Inputs are read from a[] at offsets p[dif_iperm[...]]; outputs go to t[] at stride 9
125 | 		for(l = 0; l < 9; l++) {
126 | 			k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)];
127 | 			RADIX_07_DFT(
128 | 				a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],
129 | 				t00,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13,
130 | 				tptr->re,tptr->im,(tptr+9)->re,(tptr+9)->im,(tptr+18)->re,(tptr+18)->im,(tptr+27)->re,(tptr+27)->im,(tptr+36)->re,(tptr+36)->im,(tptr+45)->re,(tptr+45)->im,(tptr+54)->re,(tptr+54)->im,
131 | 				uc1,us1,uc2,us2,uc3,us3, rt,it,re,im
132 | 			);	tptr++; iptr += 7;
133 | 		}
134 | 	//...and now do 7 radix-9 transforms (inputs taken from 9 consecutive t[] entries, outputs written back to a[] at offsets p[dif_operm[...]]):
135 | 		tptr = t; iptr = dif_operm;
136 | 		for(l = 0; l < 7; l++) {
137 | 			k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; k7 = p[*(iptr+7)]; k8 = p[*(iptr+8)];
138 | 			RADIX_09_DIF(
139 | 				tptr->re,tptr->im,(tptr+1)->re,(tptr+1)->im,(tptr+2)->re,(tptr+2)->im,(tptr+3)->re,(tptr+3)->im,(tptr+4)->re,(tptr+4)->im,(tptr+5)->re,(tptr+5)->im,(tptr+6)->re,(tptr+6)->im,(tptr+7)->re,(tptr+7)->im,(tptr+8)->re,(tptr+8)->im,
140 | 				a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],a[j1+k7],a[j2+k7],a[j1+k8],a[j2+k8],
141 | 				rt,it,re
142 | 			);	tptr += 9; iptr += 9;
143 | 		}
144 | 	}	/* end for(j = jstart; j < jhi; j += stride) */
145 | 
146 | 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)	/* Only the Mersenne-mod path shifts the j-range and the col/co3 indices between outer passes */
147 | 	{
148 | 		jstart += nwt;
149 | 		jhi    += nwt;
150 | 
151 | 		col += RADIX;
152 | 		co3 -= RADIX;
153 | 	}
154 | }	/* end for(k=1; k <= khi; k++) */
155 | 
156 | 


--------------------------------------------------------------------------------
/src/radix64.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | *                                                                              *
 3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
 4 | *                                                                              *
 5 | *  This program is free software; you can redistribute it and/or modify it     *
 6 | *  under the terms of the GNU General Public License as published by the       *
 7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 8 | *  option) any later version.                                                  *
 9 | *                                                                              *
10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
13 | *  more details.                                                               *
14 | *                                                                              *
15 | *  You should have received a copy of the GNU General Public License along     *
16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
19 | *  02111-1307, USA.                                                            *
20 | *                                                                              *
21 | *******************************************************************************/
22 | 
 23 | /****************************************************************************
 24 |  * Include guard: everything below is processed only on the first inclusion.
 25 |  ****************************************************************************/
 26 | #ifndef radix64_included
 27 | #define radix64_included
 28 | 
 29 | #include "radix32.h"
 30 | /* c64_k, s64_k = cos, sin of k*twopi/64, i.e. real and imaginary parts of exp(k*I*twopi/64); only odd k = 1,3,5,7 are defined here: */
 31 | 	#define c64_1  ((double)0.99518472667219688624)	/* cos(1*twopi/64) */
 32 | 	#define s64_1  ((double)0.09801714032956060199)	/* exp(1*I*twopi/64) */
 33 | 	#define c64_3  ((double)0.95694033573220886494)	/* cos(3*twopi/64) */
 34 | 	#define s64_3  ((double)0.29028467725446236764)	/* exp(3*I*twopi/64) */
 35 | 	#define c64_5  ((double)0.88192126434835502971)	/* cos(5*twopi/64) */
 36 | 	#define s64_5  ((double)0.47139673682599764856)	/* exp(5*I*twopi/64) */
 37 | 	#define c64_7  ((double)0.77301045336273696081)	/* cos(7*twopi/64) */
 38 | 	#define s64_7  ((double)0.63439328416364549822)	/* exp(7*I*twopi/64) */
 39 | 
 40 | #endif	/* #ifndef radix64_included */
41 | 


--------------------------------------------------------------------------------
/src/rng_isaac.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | ------------------------------------------------------------------------------
  3 | isaac64.c: My random number generator for 64-bit machines.
  4 | By Bob Jenkins, 1996.  Public Domain.
  5 | ------------------------------------------------------------------------------
  6 | */
  7 | 
  8 | #include <stdio.h>
  9 | #include "rng_isaac.h"
 10 | 
 11 | /* externs declared in rng_isaac.h: randrsl[] is written by isaac64() and, when rng_isaac_init() is called with a nonzero flag, read as the seed; randcnt is set to RANDSIZ by rng_isaac_init(). */
 12 | ub8 randrsl[RANDSIZ], randcnt;
 13 | /* File-private generator state: mm[] is filled by rng_isaac_init() and updated in place by isaac64(); aa, bb, cc are the accumulators carried between isaac64() calls. */
 14 | static    ub8 mm[RANDSIZ];
 15 | static    ub8 aa=0, bb=0, cc=0;
 16 | /* ind(mm,x): fetch the ub8 at byte offset (x & ((RANDSIZ-1)<<3)) from the start of mm, i.e. mm[(x>>3) & (RANDSIZ-1)] assuming sizeof(ub8) == 8 - TODO confirm in rng_isaac.h. */
 17 | #define ind(mm,x)  (*(ub8 *)((ub1 *)(mm) + ((x) & ((RANDSIZ-1)<<3))))
 18 | #define rngstep(mix,a,b,mm,m,m2,r,x) /* One generator step; NB: also uses a variable y from the enclosing scope, and post-increments m, m2 and r */ \
 19 | { \
 20 |   x = *m;  /* save the old state word */ \
 21 |   a = (mix) + *(m2++); /* fold the mixed accumulator into the partner word */ \
 22 |   *(m++) = y = ind(mm,x) + a + b; /* new state word */ \
 23 |   *(r++) = b = ind(mm,y>>RANDSIZL) + x; /* next output word, also becomes the new b */ \
 24 | }
 25 | /* isaac64(): refill randrsl[] with the next RANDSIZ 64-bit results, updating mm[] in place and the aa/bb/cc accumulators. */
 26 | void isaac64()
 27 | {
 28 |   register ub8 a,b,x,y,*m,*m2,*r,*mend;	/* y is used inside rngstep without being passed as a macro argument */
 29 |   r = randrsl;	/* Need a variable address pointer to feed to rngstep */
 30 |   a = aa; b = bb + (++cc);	/* cc is bumped once per call and folded into b */
 31 |   for (m = mm, mend = m2 = m+(RANDSIZ/2); m<mend; )	/* 1st half: m walks mm[0 .. RANDSIZ/2), m2 walks the upper half */
 32 |   {
 33 |     rngstep(~(a^(a<<21)), a, b, mm, m, m2, r, x);
 34 |     rngstep(  a^(a>>5)  , a, b, mm, m, m2, r, x);
 35 |     rngstep(  a^(a<<12) , a, b, mm, m, m2, r, x);
 36 |     rngstep(  a^(a>>33) , a, b, mm, m, m2, r, x);
 37 |   }
 38 |   for (m2 = mm; m2<mend; )	/* 2nd half: m continues through the upper half while m2 restarts at mm[0] */
 39 |   {
 40 |     rngstep(~(a^(a<<21)), a, b, mm, m, m2, r, x);
 41 |     rngstep(  a^(a>>5)  , a, b, mm, m, m2, r, x);
 42 |     rngstep(  a^(a<<12) , a, b, mm, m, m2, r, x);
 43 |     rngstep(  a^(a>>33) , a, b, mm, m, m2, r, x);
 44 |   }
 45 |   bb = b; aa = a;	/* save the accumulators for the next call */
 46 | }
 47 | 
 48 | #define mix(a,b,c,d,e,f,g,h) \
 49 | { \
 50 |    a-=e; f^=h>>9;  h+=a; \
 51 |    b-=f; g^=a<<9;  a+=b; \
 52 |    c-=g; h^=b>>23; b+=c; \
 53 |    d-=h; a^=c<<15; c+=d; \
 54 |    e-=a; b^=d>>14; d+=e; \
 55 |    f-=b; c^=e<<20; e+=f; \
 56 |    g-=c; d^=f>>17; f+=g; \
 57 |    h-=d; e^=g<<14; g+=h; \
 58 | }
 59 | 
 60 | void rng_isaac_init(word flag)
 61 | {
 62 |    word i;
 63 |    ub8 a,b,c,d,e,f,g,h;
 64 |    aa=bb=cc=(ub8)0;
 65 |    a=b=c=d=e=f=g=h=0x9E3779B97F4A7C13ull;  /* the golden ratio */
 66 | 
 67 |    for (i=0; i<4; ++i)                    /* scramble it */
 68 |    {
 69 |      mix(a,b,c,d,e,f,g,h);
 70 |    }
 71 | 
 72 |    for (i=0; i<RANDSIZ; i+=8)   /* fill in mm[] with messy stuff */
 73 |    {
 74 |      if (flag)                  /* use all the information in the seed */
 75 |      {
 76 |        a+=randrsl[i  ]; b+=randrsl[i+1]; c+=randrsl[i+2]; d+=randrsl[i+3];
 77 |        e+=randrsl[i+4]; f+=randrsl[i+5]; g+=randrsl[i+6]; h+=randrsl[i+7];
 78 |      }
 79 |      mix(a,b,c,d,e,f,g,h);
 80 |      mm[i  ]=a; mm[i+1]=b; mm[i+2]=c; mm[i+3]=d;
 81 |      mm[i+4]=e; mm[i+5]=f; mm[i+6]=g; mm[i+7]=h;
 82 |    }
 83 | 
 84 |    if (flag)
 85 |    {        /* do a second pass to make all of the seed affect all of mm */
 86 |      for (i=0; i<RANDSIZ; i+=8)
 87 |      {
 88 |        a+=mm[i  ]; b+=mm[i+1]; c+=mm[i+2]; d+=mm[i+3];
 89 |        e+=mm[i+4]; f+=mm[i+5]; g+=mm[i+6]; h+=mm[i+7];
 90 |        mix(a,b,c,d,e,f,g,h);
 91 |        mm[i  ]=a; mm[i+1]=b; mm[i+2]=c; mm[i+3]=d;
 92 |        mm[i+4]=e; mm[i+5]=f; mm[i+6]=g; mm[i+7]=h;
 93 |      }
 94 |    }
 95 | 
 96 |    isaac64();          /* fill in the first set of results */
 97 |    randcnt=RANDSIZ;    /* prepare to use the first set of results */
 98 | }
 99 | 
100 | 
#ifdef NEVER
/*int main() - Rename this for build purposes, since even with NEVER undefined, MSVC still views this as global main() */
/*
Self-test driver (compiled only when NEVER is defined): zeroes the generator
state, seeds via rng_isaac_init(TRUE), then prints two batches of RANDSIZ results
as 16 hex digits each (high 32 bits first), with no separators.
Always returns 0.
*/
int rng_isaac_main()
{
  word i,j;
  aa=bb=cc=(ub8)0;
  for (i=0; i<RANDSIZ; ++i) mm[i]=(ub8)0;
  rng_isaac_init(TRUE);
  for (i=0; i<2; ++i)
  {
    isaac64();
    for (j=0; j<RANDSIZ; ++j)
    {
      /* %lx requires an unsigned long argument; passing a bare ub4 is a format/argument mismatch: */
      printf("%.8lx%.8lx",(unsigned long)(ub4)(randrsl[j]>>32),(unsigned long)(ub4)randrsl[j]);
    }
  }
  return 0;
}
#endif
117 | 
118 | /*
119 | 11/25/05: EWM - modified to add 3 types of double-precision floating rand() calls:
120 | 
121 | 	- rng_isaac_rand_double() returns a random double via a 64-bit field
122 | 	which is (within the limits of the generator) a random 64-bit int;
123 | 
124 | 	- rng_isaac_rand_double_norm_pos() returns a random double with
125 | 	probability uniformly distributed in [0, 1), insofar as IEEE64 doubles
126 | 	are capable of distributing such values, excluding underflows;
127 | 
128 | 	- rng_isaac_rand_double_norm_pm1() returns a random double with
129 | 	probability uniformly distributed in (-1, 1), insofar as IEEE64 doubles
130 | 	are capable of distributing such values, excluding underflows;
131 | */
132 | double	rng_isaac_rand_double()
133 | {
134 | 	uint64 iran64;
135 | 	uint32 fexp;
136 | 
137 | 	/* Make sure resulting float will not be denormal: */
138 | 	for(;;)
139 | 	{
140 | 		iran64 = rng_isaac_rand();
141 | 		fexp = (uint32)(iran64 >> 52) & 0x7ff;
142 | 		if(fexp != 0 && fexp < 0x7f0) break;
143 | 	}
144 | 	return	*(double *)&iran64;
145 | }
146 | 
147 | /* Assumes IEEE64-compliant: */
148 | double	rng_isaac_rand_double_norm_pos()
149 | {
150 | 	/*
151 | 	Obtain a result in [0, 1) by merging a sign/exponent field = 0x3ff with
152 | 	random 52-bit mantissa (52-bit because the hidden bit is assumed 1 via the
153 | 	choice of exponent - we only randomly generate the non-hidden 52 bits),
154 | 	yielding a result in [1, 2), and subtracting 1:
155 | 	*/
156 | 	uint64 iran64, itmp64;
157 | 	double retval;
158 | 
159 | 	itmp64 = rng_isaac_rand();
160 | 	iran64 = 0x3FF0000000000000ull + (itmp64 & 0x000FFFFFFFFFFFFFull);
161 | 	retval=(*(double *)&iran64) - 1.0;
162 | 	/* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */
163 | 	if(retval < 0.0 || retval > 1.0)
164 | 	{
165 | 		sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
166 | 		ASSERT(0, cbuf);
167 | 	}
168 | 	return retval;
169 | }
170 | 
171 | 
172 | /* Assumes IEEE64-compliant: */
173 | double	rng_isaac_rand_double_norm_pm1()
174 | {
175 | 	/*
176 | 	Obtain a result in (-1, 1) by following the same procedure used in
177 | 	rng_isaac_rand_double_norm_pos to get a value in [0, 1) and multiplying
178 | 	the result by a random choice of -1 or +1. Note that this doubles the
179 | 	odds of getting a zero result, but we assume that won't be fatal -
180 | 	in essence one can consider that as though -0.0 and +0.0 were separate
181 | 	possible outputs, each occurring with probability equal to that of any
182 | 	of the discrete nonzero outputs.
183 | 	*/
184 | 	static double pm1[] = {-1.0, +1.0};
185 | 	double sign;
186 | 	uint64 itmp64, iran64;
187 | 	double retval;
188 | 
189 | 	itmp64 = rng_isaac_rand();
190 | 	sign = pm1[itmp64 >> 63];	/* Use high bit of iran64 for sign */
191 | 	iran64 = 0x3FF0000000000000ull + (itmp64 & 0x000FFFFFFFFFFFFFull);
192 | 	retval=sign*((*(double *)&iran64) - 1.0);
193 | 	/* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */
194 | 	if(retval < -1.0 || retval > 1.0)
195 | 	{
196 | 		sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
197 | 		ASSERT(0, cbuf);
198 | 	}
199 | 	return retval;
200 | }
201 | 
202 | 


--------------------------------------------------------------------------------
/src/rng_isaac.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | ------------------------------------------------------------------------------
 3 | isaac64.h: definitions for a random number generator
 4 | Bob Jenkins, 1996, Public Domain
 5 | ------------------------------------------------------------------------------
 6 | */
 7 | /****************************************************************************
 8 |  * We now include this header file if it was not included before.
 9 |  ****************************************************************************/
10 | #ifndef rng_isaac_h_included
11 | #define rng_isaac_h_included
12 | 
13 | /*
14 | 11/25/05: EWM -  typedefs to use standard int types defined in types.h :
15 | */
16 | #include	"Mdata.h"
17 | 
18 | #ifdef __cplusplus
19 | extern "C" {
20 | #endif
21 | 
22 | typedef  uint64  ub8;
23 | #define UB8MAXVAL 0xffffffffffffffffLL
24 | #define UB8BITS 64
25 | typedef  sint64  sb8;
26 | #define SB8MAXVAL 0x7fffffffffffffffLL
27 | typedef  uint32  ub4;	/* unsigned 4-byte quantities */
28 | #define UB4MAXVAL 0xffffffff
29 | typedef  sint32  sb4;
30 | #define UB4BITS 32
31 | #define SB4MAXVAL 0x7fffffff
32 | typedef  uint16  ub2;
33 | #define UB2MAXVAL 0xffff
34 | #define UB2BITS 16
35 | typedef  sint16  sb2;
36 | #define SB2MAXVAL 0x7fff
37 | typedef uint8	 ub1;
38 | #define UB1MAXVAL 0xff
39 | #define UB1BITS 8
40 | typedef sint8	 sb1;	/* signed 1-byte quantities */
41 | #define SB1MAXVAL 0x7f
42 | typedef int  	word;	/* fastest type available */
43 | 
44 | 
45 | #ifndef ISAAC64
46 | #define ISAAC64
47 | 
48 | #define RANDSIZL   (8)
49 | #define RANDSIZ    (1<<RANDSIZL)
50 | 
51 | extern ub8 randrsl[RANDSIZ], randcnt;
52 | 
53 | /*
54 | ------------------------------------------------------------------------------
55 |  If (flag==TRUE), then use the contents of randrsl[0..255] as the seed.
56 | ------------------------------------------------------------------------------
57 | */
58 | void rng_isaac_init(word flag);
59 | 
60 | void isaac64();
61 | 
62 | /*
63 | 11/25/05: EWM - modified to add 2 types of double-precision floating rand() calls:
64 | 
65 | 	- rng_isaac_rand_double() returns a random double via a 64-bit field
66 | 	which is (within the limits of the generator) a random 64-bit int;
67 | 
68 | 	- rng_isaac_rand_double_norm_pos() returns a random double with
69 | 	probability uniformly distributed in [0, 1), insofar as IEEE64 doubles
70 | 	are capable of distributing such values, excluding underflows;
71 | 
72 | 	- rng_isaac_rand_double_norm_pm1() returns a random double with
73 | 	probability uniformly distributed in (-1, 1), insofar as IEEE64 doubles
74 | 	are capable of distributing such values, excluding underflows;
75 | */
76 | double	rng_isaac_rand_double();
77 | 
78 | double	rng_isaac_rand_double_norm_pos();
79 | 
80 | double	rng_isaac_rand_double_norm_pm1();
81 | 
82 | /*
83 | ------------------------------------------------------------------------------
84 |  Call rand() to retrieve a single 64-bit random value
85 | ------------------------------------------------------------------------------
86 | */
87 | #define rng_isaac_rand() \
88 |    (!randcnt-- ? (isaac64(), randcnt=RANDSIZ-1, randrsl[randcnt]) : \
89 |                  randrsl[randcnt])
90 | 
91 | #endif  /* RAND */
92 | 
93 | #ifdef __cplusplus
94 | }
95 | #endif
96 | 
97 | #endif	/* rng_isaac_h_included */
98 | 
99 | 


--------------------------------------------------------------------------------
/src/threadpool.h:
--------------------------------------------------------------------------------
  1 | // EWM: This threadpool file has "more history" than the other Mlucas sources,
  2 | // thus I include public-license boilerplate from all 3 authors in the chain below.
  3 | // [1] First my standard GPL header covering the code including my customizations:
  4 | 
  5 | /*******************************************************************************
  6 | *                                                                              *
  7 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  8 | *                                                                              *
  9 | *  This program is free software; you can redistribute it and/or modify it     *
 10 | *  under the terms of the GNU General Public License as published by the       *
 11 | *  Free Software Foundation; either version 2 of the License, or (at your      *
 12 | *  option) any later version.                                                  *
 13 | *                                                                              *
 14 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 15 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 16 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 17 | *  more details.                                                               *
 18 | *                                                                              *
 19 | *  You should have received a copy of the GNU General Public License along     *
 20 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 21 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 22 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 23 | *  02111-1307, USA.                                                            *
 24 | *                                                                              *
 25 | *******************************************************************************/
 26 | 
 27 | // [2] The version I started with was sent by Jason Papadopoulos, whose latest analogs of
 28 | // same are available at
 29 | // http://sourceforge.net/p/msieve/code/HEAD/tree/trunk/include/thread.h [header]
 30 | // http://sourceforge.net/p/msieve/code/HEAD/tree/trunk/common/thread.c [C source].
 31 | 
 32 | /*--------------------------------------------------------------------
 33 | This source distribution is placed in the public domain by its author,
 34 | Jason Papadopoulos. You may use it for any purpose, free of charge,
 35 | without having to notify anyone. I disclaim any responsibility for any
 36 | errors.
 37 | 
 38 | Optionally, please be nice and tell me if you find this source to be
 39 | useful. Again optionally, if you add to the functionality present here
 40 | please consider making those additions public too, so that others may 
 41 | benefit from your work.	
 42 | 
 43 | $Id$
 44 | --------------------------------------------------------------------*/
 45 | 
 46 | // [3] Jason informs me he first started with Tomer Heber's code at
 47 | // http://sourceforge.net/projects/cthreadpool/ , which has the following
 48 | // license info (Note that BSD and GPL licenses are more-or-less compatible):
 49 | 
 50 | /*--------------------------------------------------------------------
 51 | The cthreadpool project is free and open source (BSD License).
 52 | If you are not familiar with the thread pool pattern please refer to:
 53 | http://en.wikipedia.org/wiki/Thread_pool_pattern
 54 | 
 55 | Instructions:
 56 | 1. In order to use the threadpool add threadpool.c and threadpool.h to
 57 | your project and compile it (compiling will create an object(o) file).
 58 | 2. Check the file threadpool.h for the API.
 59 | 3. Examples are available in the file example.c.
 60 | 
 61 | For questions, suggestions, bug reports or just comments please contact
 62 | me at: heber.tomer@gmail.com
 63 | --------------------------------------------------------------------*/
 64 | 
 65 | #ifndef _THREAD_H_
 66 | #define _THREAD_H_
 67 | 
 68 | #include "masterdefs.h"
 69 | #include "types.h"
 70 | #include "mi64.h"	// Sep 2016: Needed for enhanced affinity-setting functionality
 71 | 
 72 | #include <pthread.h>
 73 | 
 74 | #ifdef __cplusplus
 75 | extern "C"
 76 | {
 77 | #endif
 78 | 
 79 | /* mutexes ---------------------------------------------------------*/
 80 | 
 81 | #ifdef OS_TYPE_WINDOWS
 82 | typedef HANDLE mutex_t;
 83 | #else
 84 | typedef pthread_mutex_t mutex_t;
 85 | #endif
 86 | /*
 87 | static void mutex_init(mutex_t *m)
 88 | {
 89 | #ifdef OS_TYPE_WINDOWS
 90 | 	*m = CreateMutex(NULL, FALSE, NULL);
 91 | #else
 92 | 	pthread_mutex_init(m, NULL);
 93 | #endif
 94 | }
 95 | 
 96 | static void mutex_free(mutex_t *m)
 97 | {
 98 | #ifdef OS_TYPE_WINDOWS
 99 | 	CloseHandle(*m);
100 | #else
101 | 	pthread_mutex_destroy(m);
102 | #endif
103 | }
104 | 
105 | static void mutex_lock(mutex_t *m)
106 | {
107 | #ifdef OS_TYPE_WINDOWS
108 | 	WaitForSingleObject(*m, INFINITE);
109 | #else
110 | 	pthread_mutex_lock(m);
111 | #endif
112 | }
113 | 
114 | static void mutex_unlock(mutex_t *m)
115 | {
116 | #ifdef OS_TYPE_WINDOWS
117 | 	ReleaseMutex(*m);
118 | #else
119 | 	pthread_mutex_unlock(m);
120 | #endif
121 | }
122 | */
123 | /* a thread pool --------------------------------------------------*/
124 | 
/* Callback signatures: each receives an opaque data pointer and a thread number. */
typedef void (*init_func)    (void *data, int thread_num);
typedef void (*run_func)     (void *data, int thread_num);
typedef void (*shutdown_func)(void *data, int thread_num);

/* Per-thread control block: init/shutdown callbacks plus their data pointer.
NOTE(review): presumably invoked at thread start/exit - confirm against threadpool.c. */
typedef struct {
	init_func init;
	shutdown_func shutdown;
	void *data;
} thread_control_t;

/* Per-task control block: as thread_control_t, with an additional run callback. */
typedef struct {
	init_func init;
	run_func run;
	shutdown_func shutdown;
	void *data;
} task_control_t;

/* Queue of task pointers described by head/tail indices and current/maximum counts.
NOTE(review): looks like a fixed-capacity ring buffer - confirm against threadpool.c. */
struct threadpool_queue
{
	unsigned int head, tail;
	unsigned int num_tasks, max_tasks;
	void **tasks;
};

/* Startup record for one thread: its number, its owning pool and its control block. */
struct thread_init
{
	int thread_num;
	struct threadpool *pool;
	thread_control_t control;
};

/* The pool itself: two task queues, task storage, the thread arrays, and the
mutex/condition-variable pairs used to coordinate them. */
struct threadpool
{
	struct threadpool_queue tasks_queue;
	struct threadpool_queue free_tasks_queue;

	task_control_t *tasks;

	struct thread_init *thr_init;
	pthread_t *thr_arr;

	unsigned short num_of_threads;
	unsigned short num_of_cores;
	volatile unsigned short stop_flag;

	/* Synchronization objects grouped as in the original layout: */
	pthread_mutex_t free_tasks_mutex;
	pthread_cond_t  free_tasks_cond;
	pthread_cond_t  tasks_done_cond;

	pthread_mutex_t mutex;
	pthread_cond_t  new_tasks_cond;
};
179 | 
180 | struct threadpool* threadpool_init(
181 | 			int num_threads, 
182 | 			int num_cores, 
183 | 			int queue_size, 
184 | 			thread_control_t *t);
185 | 
186 | int threadpool_add_task(struct threadpool *pool, 
187 | 			task_control_t *t, 
188 | 			int blocking);
189 | 
190 | void threadpool_free(struct threadpool *pool);
191 | 
192 | /* returns zero if no pending tasks */
193 | int threadpool_drain(struct threadpool *pool,
194 | 			int blocking);
195 | 
196 | /********************* utility macros: ********************/
197 | 
198 | // Don't use any of these at present, but note MacOS has its own versions of these, in /usr/include/X11/Xthreads.h:
199 | #if 1
200 | 	static void * xmalloc(size_t len) {
201 | 		void *ptr = malloc(len);
202 | 		if (ptr == NULL) {
203 | 			printf("failed to allocate %u bytes\n", (uint32)len);
204 | 			exit(-1);
205 | 		}
206 | 		return ptr;
207 | 	}
208 | 	
209 | 	static void * xcalloc(size_t num, size_t len) {
210 | 		void *ptr = calloc(num, len);
211 | 		if (ptr == NULL) {
212 | 			printf("failed to calloc %u bytes\n", (uint32)(num * len));
213 | 			exit(-1);
214 | 		}
215 | 		return ptr;
216 | 	}
217 | 	
218 | 	static void * xrealloc(void *iptr, size_t len) {
219 | 		void *ptr = realloc(iptr, len);
220 | 		if (ptr == NULL) {
221 | 			printf("failed to reallocate %u bytes\n", (uint32)len);
222 | 			exit(-1);
223 | 		}
224 | 		return ptr;
225 | 	}
226 | #endif
227 | 
228 | #ifdef __cplusplus
229 | }
230 | #endif
231 | 
232 | #endif /* !_THREAD_H_ */
233 | 
234 | 


--------------------------------------------------------------------------------
/src/types.c:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | *                                                                              *
  3 | *   (C) 1997-2021 by Ernst W. Mayer.                                           *
  4 | *                                                                              *
  5 | *  This program is free software; you can redistribute it and/or modify it     *
  6 | *  under the terms of the GNU General Public License as published by the       *
  7 | *  Free Software Foundation; either version 2 of the License, or (at your      *
  8 | *  option) any later version.                                                  *
  9 | *                                                                              *
 10 | *  This program is distributed in the hope that it will be useful, but WITHOUT *
 11 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
 12 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
 13 | *  more details.                                                               *
 14 | *                                                                              *
 15 | *  You should have received a copy of the GNU General Public License along     *
 16 | *  with this program; see the file GPL.txt.  If not, you may view one at       *
 17 | *  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
 18 | *  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
 19 | *  02111-1307, USA.                                                            *
 20 | *                                                                              *
 21 | *******************************************************************************/
 22 | 
 23 | #include "types.h"
 24 | 
 25 | /* Useful extern constants to export: */
 26 | 
 27 | /* Multiword ints have word significance increasing from left to right: */
 28 | 
 29 | /* 5/04/2005: uint96/160s are really uint128/192s with upper 32 bits zero: */
 30 | const uint96  NIL96  = {(uint64)0, (uint32)0};
 31 | const uint96  ONE96  = {(uint64)1, (uint32)0};
 32 | const uint96  TWO96  = {(uint64)2, (uint32)0};
 33 | 
 34 | const uint128 NIL128 = {(uint64)0, (uint64)0};
 35 | const uint128 ONE128 = {(uint64)1, (uint64)0};
 36 | const uint128 TWO128 = {(uint64)2, (uint64)0};
 37 | 
 38 | const uint160 NIL160 = {(uint64)0, (uint64)0, (uint32)0};
 39 | const uint160 ONE160 = {(uint64)1, (uint64)0, (uint32)0};
 40 | const uint160 TWO160 = {(uint64)2, (uint64)0, (uint32)0};
 41 | 
 42 | const uint192 NIL192 = {(uint64)0, (uint64)0, (uint64)0};
 43 | const uint192 ONE192 = {(uint64)1, (uint64)0, (uint64)0};
 44 | const uint192 TWO192 = {(uint64)2, (uint64)0, (uint64)0};
 45 | 
 46 | const uint256 NIL256 = {(uint64)0, (uint64)0, (uint64)0, (uint64)0};
 47 | const uint256 ONE256 = {(uint64)1, (uint64)0, (uint64)0, (uint64)0};
 48 | const uint256 TWO256 = {(uint64)2, (uint64)0, (uint64)0, (uint64)0};
 49 | 
 50 | const uint512 NIL512 = {(uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0};
 51 | const uint512 ONE512 = {(uint64)1, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0};
 52 | const uint512 TWO512 = {(uint64)2, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0};
 53 | 
 54 | /* Nov 2021: Case-insensitive analog of strstr - used the code posted by 'chux' here:
 55 | 	https://stackoverflow.com/questions/27303062/strstr-function-like-that-ignores-upper-or-lower-case
 56 | */
 57 | #include <ctype.h>	// Needed for tolower ... this include is normally via masterdefs.h
 58 | char* stristr(const char* haystack, const char* needle) {
 59 | 	do {
 60 | 		const char* h = haystack;
 61 | 		const char* n = needle;
 62 | 		while (tolower((unsigned char) *h) == tolower((unsigned char ) *n) && *n) {
 63 | 			h++;
 64 | 			n++;
 65 | 		}
 66 | 		if (*n == 0) {
 67 | 			return (char *) haystack;
 68 | 		}
 69 | 	} while (*haystack++);
 70 | 	return 0;
 71 | }
 72 | 
 73 | /* Binary predicates for use of stdlib qsort(): */
 74 | int ncmp_int(const void * a, const void * b)	// Default-int compare predicate
 75 | {
 76 | 	return ( *(int*)a - *(int*)b );
 77 | }
 78 | 
 79 | int ncmp_uint32(const void * a, const void * b)	// Mnemonic: "Numeric CoMPare of UINT32 data"
 80 | {
 81 | 	uint32 diff = *(uint32*)a - *(uint32*)b;
 82 | 	uint32 borrow = 1 - ((diff > *(uint32*)a) << 1);	// -1 if (a < b), +1 otherwise
 83 | 	// If (diff > a) == 1, had a borrow, i.e. a < b, return -1.
 84 | 	// Otherwise return 0 if diff == 0, +1 if diff != 0. Can roll all 3 possibilities into one expression:
 85 | 	return ( borrow & -(diff != 0) );
 86 | 	/*
 87 | 	a < b: bw = -1, (diff != 0) = 1, -() = -1 ===> -1 & -1 = -1
 88 | 	a = b: bw = +1, (diff != 0) = 0, -() =  0 ===> +1 &  0 =  0
 89 | 	a > b: bw = +1, (diff != 0) = 1, -() = -1 ===> +1 & -1 = +1
 90 | 	*/
 91 | }
 92 | 
 93 | int ncmp_sint32(const void * a, const void * b)
 94 | {
 95 | 	return ( *(sint32*)a - *(sint32*)b );
 96 | }
 97 | 
 98 | int ncmp_uint64(const void * a, const void * b)
 99 | {
100 | 	uint64 diff = *(uint64*)a - *(uint64*)b;
101 | 	uint64 borrow = 1 - ((diff > *(uint64*)a) << 1);	// -1 if (a < b), +1 otherwise
102 | 	return ( borrow & -(diff != 0) );
103 | }
104 | 
105 | int ncmp_sint64(const void * a, const void * b)
106 | {
107 | 	return ( *(sint64*)a - *(sint64*)b );
108 | }
109 | 
110 | 


--------------------------------------------------------------------------------