├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── config-fermat.sh ├── docs ├── Fermat-testing.md ├── Mfactor_buildnotes.txt ├── a.txt ├── alderlake.txt ├── b.txt ├── brent-suyama.txt ├── c.txt ├── d.txt ├── dct.txt ├── fgt.txt ├── gerbicz.txt ├── gpuowl_stats.txt ├── hwloc_test.txt ├── irrational.txt ├── knc.txt ├── nt.txt ├── pm1.txt ├── pm1_compare.png ├── pm1_compare.txt ├── pm1_d210.txt ├── pm1_d330.txt ├── pm1_d420.txt ├── pm1_d660.txt ├── pm1_d840.txt ├── predefs_archlinux.txt ├── predefs_linux.txt ├── predefs_mac.txt ├── predefs_power9.txt ├── prp_proof.txt ├── prp_proof_examples.txt ├── qs.txt └── todo.txt ├── help.txt ├── makemake.sh └── src ├── Mdata.h ├── Mlucas.c ├── Mlucas.h ├── align.h ├── br.c ├── carry.h ├── carry_dbg.h ├── carry_gcc32.h ├── carry_gcc64.h ├── dft_macro.c ├── dft_macro.h ├── dft_sine_term_opt.c.txt ├── f2psp.h ├── f2psp_3_5.h ├── fac_test_dat128.h ├── fac_test_dat192.h ├── fac_test_dat256.h ├── fac_test_dat64.h ├── fac_test_dat96.h ├── factor.c ├── factor.h ├── factor_test.h ├── fermat_mod_square.c ├── fgt_m61.c ├── fgt_m61.h ├── float_intrin.h ├── gcd_lehmer.c.txt ├── gcd_lehmer.h ├── genFFT_mul.h ├── getRealTime.c ├── get_cpuid.c ├── get_fft_radices.c ├── get_fp_rnd_const.c ├── get_preferred_fft_radix.c ├── gpu_iface.cu ├── gpu_iface.h ├── gpu_sieve.cu ├── imul256_macro.h ├── imul_macro.c ├── imul_macro.h ├── imul_macro0.h ├── imul_macro1.h ├── masterdefs.h ├── mers_mod_square.c ├── mi64.c ├── mi64.h ├── mi64_new.c.txt ├── pairFFT_mul.c ├── pair_square.c ├── pair_square.h ├── platform.h ├── pm1.c ├── prefetch.h ├── qfcheb.c.txt ├── qfcheb.h ├── qfloat.c ├── qfloat.h ├── radix09_sse_macro.h ├── radix1008_avx_negadwt_consts.h ├── radix1008_ditN_cy_dif1.c ├── radix1008_main_carry_loop.h ├── radix1024.h ├── radix1024_avx_negadwt_consts.h ├── radix1024_ditN_cy_dif1.c ├── radix1024_main_carry_loop.h ├── radix1024_twiddles.h ├── radix104_ditN_cy_dif1.c ├── radix10_ditN_cy_dif1.c ├── radix112_ditN_cy_dif1.c ├── radix11_ditN_cy_dif1.c ├── radix11_sse_macro.h ├── radix120_ditN_cy_dif1.c ├── radix128.h ├── radix128_ditN_cy_dif1.c ├── radix128_main_carry_loop.h ├── radix128_twiddles.h ├── radix12_ditN_cy_dif1.c ├── radix12_main_carry_loop.h ├── radix13.h ├── radix13_ditN_cy_dif1.c ├── radix13_sse_macro.h ├── radix144_ditN_cy_dif1.c ├── radix144_main_carry_loop.h ├── radix14_ditN_cy_dif1.c ├── radix15_ditN_cy_dif1.c ├── radix15_sse_macro.h ├── radix16.h ├── radix160_ditN_cy_dif1.c ├── radix160_main_carry_loop.h ├── radix16_dif_dit_pass.c ├── radix16_dif_dit_pass_asm.h ├── radix16_ditN_cy_dif1.c ├── radix16_ditN_cy_dif1_asm.h ├── radix16_dyadic_square.c ├── radix16_dyadic_square_gcc64.h ├── radix16_main_carry_loop.h ├── radix16_pairFFT_mul.c ├── radix16_utils_asm.h ├── radix16_wrapper_ini.c ├── radix16_wrapper_square.c ├── radix16_wrapper_square_gcc32.h ├── radix16_wrapper_square_gcc64.h ├── radix176_ditN_cy_dif1.c ├── radix176_main_carry_loop.h ├── radix17_dft.h ├── radix17_ditN_cy_dif1.c ├── radix18_ditN_cy_dif1.c ├── radix192_ditN_cy_dif1.c ├── radix192_main_carry_loop.h ├── radix208_ditN_cy_dif1.c ├── radix208_main_carry_loop.h ├── radix20_ditN_cy_dif1.c ├── radix20_ditN_cy_dif1_gcc32.h ├── radix20_ditN_cy_dif1_gcc64.h ├── radix20_main_carry_loop.h ├── radix224_ditN_cy_dif1.c ├── radix224_main_carry_loop.h ├── radix22_ditN_cy_dif1.c ├── radix240_ditN_cy_dif1.c ├── radix240_main_carry_loop.h ├── radix24_ditN_cy_dif1.c ├── radix24_ditN_cy_dif1_gcc32.h ├── radix24_ditN_cy_dif1_gcc64.h ├── radix24_main_carry_loop.h ├── radix256.h ├── radix256_ditN_cy_dif1.c ├── radix256_main_carry_loop.h ├── radix256_twiddles.h ├── radix26_ditN_cy_dif1.c ├── radix288_ditN_cy_dif1.c ├── radix288_main_carry_loop.h ├── radix28_ditN_cy_dif1.c ├── radix28_ditN_cy_dif1_gcc32.h ├── radix28_ditN_cy_dif1_gcc64.h ├── radix28_main_carry_loop.h ├── radix30_ditN_cy_dif1.c ├── radix31.h ├── radix31_ditN_cy_dif1.c ├── radix32.h ├── radix320_ditN_cy_dif1.c ├── radix320_main_carry_loop.h ├── radix32_dif_dit_pass.c ├── radix32_dif_dit_pass_asm.h ├── radix32_ditN_cy_dif1.c ├── radix32_ditN_cy_dif1_asm.h ├── radix32_dyadic_square.c ├── radix32_dyadic_square_gcc64.h ├── radix32_main_carry_loop.h ├── radix32_utils_asm.h ├── radix32_wrapper_ini.c ├── radix32_wrapper_square.c ├── radix32_wrapper_square_gcc32.h ├── radix32_wrapper_square_gcc64.h ├── radix352_ditN_cy_dif1.c ├── radix352_main_carry_loop.h ├── radix36_ditN_cy_dif1.c ├── radix36_main_carry_loop.h ├── radix384_ditN_cy_dif1.c ├── radix384_main_carry_loop.h ├── radix4032.h ├── radix4032_avx_negadwt_consts.h ├── radix4032_ditN_cy_dif1.c ├── radix4032_main_carry_loop.h ├── radix40_ditN_cy_dif1.c ├── radix40_main_carry_loop.h ├── radix44_ditN_cy_dif1.c ├── radix44_main_carry_loop.h ├── radix48_ditN_cy_dif1.c ├── radix48_main_carry_loop.h ├── radix512.h ├── radix512_ditN_cy_dif1.c ├── radix52_ditN_cy_dif1.c ├── radix52_main_carry_loop.h ├── radix56_ditN_cy_dif1.c ├── radix56_main_carry_loop.h ├── radix5_ditN_cy_dif1.c ├── radix60_ditN_cy_dif1.c ├── radix60_main_carry_loop.h ├── radix63_ditN_cy_dif1.c ├── radix63_main_carry_loop.h ├── radix64.h ├── radix64_ditN_cy_dif1.c ├── radix64_main_carry_loop.h ├── radix6_ditN_cy_dif1.c ├── radix72_ditN_cy_dif1.c ├── radix768_ditN_cy_dif1.c ├── radix768_main_carry_loop.h ├── radix7_ditN_cy_dif1.c ├── radix80_ditN_cy_dif1.c ├── radix88_ditN_cy_dif1.c ├── radix8_dif_dit_pass.c ├── radix8_dif_dit_pass_asm.h ├── radix8_ditN_cy_dif1.c ├── radix960_avx_negadwt_consts.h ├── radix960_ditN_cy_dif1.c ├── radix960_main_carry_loop.h ├── radix96_ditN_cy_dif1.c ├── radix992_ditN_cy_dif1.c ├── radix992_main_carry_loop.h ├── radix9_ditN_cy_dif1.c ├── rng_isaac.c ├── rng_isaac.h ├── sse2_macro.h ├── sse2_macro_gcc32.h ├── sse2_macro_gcc64.h ├── test_fft_radix.c ├── test_fft_radix.c.txt ├── threadpool.c ├── threadpool.h ├── twopmodq.c ├── twopmodq100.c ├── twopmodq100.h ├── twopmodq128.c ├── twopmodq128_96.c ├── twopmodq160.c ├── twopmodq192.c ├── twopmodq256.c ├── twopmodq64_test.c ├── twopmodq80.c ├── twopmodq80.h ├── twopmodq96.c ├── types.c ├── types.h ├── util.c └── util.h /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | obj*/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Actions Status](https://github.com/primesearch/Mlucas/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/primesearch/Mlucas/actions/workflows/ci.yml) 2 | 3 | # Mlucas 4 | Ernst Mayer's Mlucas and Mfactor programs for GIMPS 5 | 6 | [Ernst Mayer passed away unexpectedly](https://www.mersenneforum.org/showthread.php?t=28890) on September 10, 2023. This repository contains his posthumously released Mlucas v21 code, which is now maintained by the Great Internet Mersenne Prime Search (GIMPS) community. AutoPrimeNet (the Python PrimeNet program) previously bundled with Mlucas is now maintained in a [separate repository](https://github.com/tdulcet/AutoPrimeNet). 7 | 8 | Mlucas and Mfactor are 100% open source programs. Mlucas is for [primality](https://en.wikipedia.org/wiki/Primality_test) and [P-1](https://en.wikipedia.org/wiki/Pollard%27s_p_%E2%88%92_1_algorithm) testing of [Mersenne](https://en.wikipedia.org/wiki/Mersenne_prime) and [Fermat](https://en.wikipedia.org/wiki/Fermat_number) numbers, including support for the [Lucas-Lehmer](https://en.wikipedia.org/wiki/Lucas%E2%80%93Lehmer_primality_test), [Probable prime](https://en.wikipedia.org/wiki/Probable_prime) (PRP) and [Pépin](https://en.wikipedia.org/wiki/P%C3%A9pin%27s_test) tests. Mfactor is for trial factoring. They support x86 Intel and AMD, ARM and other CPUs. 9 | 10 | The original [Mlucas README](https://mersenneforum.org/mayer/README.html) is available for posterity and contains a lot of information, but note that it is no longer up to date. For more information about Mlucas v21, please see the [Ernst's Mlucas - the future](https://www.mersenneforum.org/showthread.php?t=28926) thread on the Mersenne Forum. 11 | 12 | Feature | | Mlucas | Prime95/MPrime 13 | --- | --- | ---: | ---: 14 | **Architectures** | x86 | ✔️ | ✔️ 15 | \- | ARM | ✔️ | 16 | \- | Other | ✔️ | 17 | **Worktypes** | LL | ✔️ | ✔️ 18 | \- | PRP | ✔️ | ✔️ 19 | \- | P-1 | ✔️ | ✔️ 20 | \- | P+1 | | ✔️ 21 | \- | ECM | | ✔️ 22 | \- | Pépin | ✔️ | ✔️ 23 | **PRP** | Proofs | | ✔️ 24 | \- | Certs | | ✔️ 25 | **Error Checking** | Jacobi | | ✔️ 26 | \- | Gerbicz | ✔️ | ✔️ 27 | **Random Shifts** | | ✔️ | ✔️ 28 | **Interface** | CLI | ✔️ | MPrime only 29 | \- | GUI | | Prime95 only 30 | **Multiple Workers** | | Separate runs | ✔️ 31 | **PrimeNet Support** | | Separate program | ✔️ 32 | **Max FFT Length** | | 256M
(**512M** with 0 shift) | 32M (AVX) -
64M (AVX512) 33 | **Largest Exponent** | | 4,294,967,231
(**8,937,021,911** with 0 shift) | 595,700,000 (AVX) -
1,169,000,000 (AVX512) 34 | **Performance** | | ~50-90% | **100%** 35 | **Free** 🆓 | | **Yes**, GPL | No, EULA 36 | **100% Open Source** | | ✔️ | Mostly 37 | **Claim Full EFF Awards** | | ✔️ | 38 | 39 | ## Usage 40 | 41 | ### Automatic method 42 | 43 | Linux users can use the [Mlucas install script](https://github.com/tdulcet/Distributed-Computing-Scripts#mlucas) to automatically download, build, setup and run Mlucas, including downloading, setting up and running the [AutoPrimeNet](https://github.com/tdulcet/AutoPrimeNet) for automated PrimeNet assignments. 44 | 45 | ### Manual method 46 | 47 | Dependencies: 48 | * Make 49 | * GNU C or Clang compiler 50 | * \*GNU Multiple Precision (GMP) library 51 | * \*Portable Hardware Locality (hwloc) library 52 | * \*Python 3 53 | 54 | \* Optional 55 | 56 | #### Download 57 | 58 | ##### Linux 59 | 60 | 1. Verify that the dependencies above are installed. On Debian and Ubuntu, run: `sudo apt update` and `sudo apt install build-essential libgmp-dev libhwloc-dev`. 61 | 2. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `wget https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`. 62 | 3. To download AutoPrimeNet, run: `wget -nv https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`. 63 | 64 | ##### macOS 65 | 66 | 1. Verify that the dependencies above are installed. Run: `brew install gmp hwloc`. 67 | 2. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `curl -fLO https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`. 68 | 3. To download AutoPrimeNet, run: `curl -sSfLO https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`. 69 | 70 | ##### Windows 71 | 72 | Native Windows builds are experimental. For now, Windows users should use the [Windows Subsystem for Linux](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) (WSL) and follow the [Linux](#linux) instructions above instead. 73 | 74 | 1. Download and install [MSYS2](https://www.msys2.org/). 75 | 2. Verify that the dependencies above are installed. With the MINGW64 environment, run: `pacman -S mingw-w64-x86_64-gmp mingw-w64-x86_64-hwloc`. 76 | 3. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `wget https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`. 77 | 4. To download AutoPrimeNet, run: `wget -nv https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`. 78 | 79 | #### Build 80 | 81 | 1. Change into the `Mlucas` directory. Run: `cd Mlucas` or `cd Mlucas-main` depending on which method one used to download it. 82 | 2. Run: 83 | * To build Mlucas: `bash makemake.sh [use_hwloc]`. 84 | * To build Mfactor: `bash makemake.sh mfac [word]`, where `word` is optionally one of `1word`, `2word`, `3word`, `4word` or `nword`. 85 | 86 | To build with Clang or another compiler instead of GCC, run: `export CC=`, for example: `export CC=clang`. 87 | 88 | #### Setup and Run 89 | 90 | 1. Change into the `obj` directory. Run: `cd obj` or `cd obj_mfac` depending on if one built Mlucas or Mfactor respectively. 91 | 92 | This README is still in progress. For now, see the original [Mlucas README](https://mersenneforum.org/mayer/README.html), which has more information about how to setup and run Mlucas. Also see [Help](#help) below. Note that with Mlucas v21, if built with the hwloc library, one would want to use the new `-core` option instead of `-cpu`. 93 | 94 | ## Help 95 | 96 | The [help.txt](help.txt) file includes a variety of usage information not covered in the original [README](https://mersenneforum.org/mayer/README.html), concentrating largely on the Mlucas command line options. A separate documentation page covers [Fermat numbers](docs/Fermat-testing.md). 97 | 98 | ## Contributing 99 | 100 | Pull requests welcome! 101 | -------------------------------------------------------------------------------- /config-fermat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Shell script for generating fermat.cfg; Mlucas output saved to config-fermat.log 4 | 5 | ################################################################################ 6 | # # 7 | # (C) 2024 by Catherine Cowie and Teal Dulcet. # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify it # 10 | # under the terms of the GNU General Public License as published by the # 11 | # Free Software Foundation; either version 2 of the License, or (at your # 12 | # option) any later version. # 13 | # # 14 | # This program is distributed in the hope that it will be useful, but WITHOUT # 15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # 16 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # 17 | # more details. # 18 | # # 19 | # You should have received a copy of the GNU General Public License along # 20 | # with this program; see the file GPL.txt. If not, you may view one at # 21 | # http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the # 22 | # Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 23 | # 02111-1307, USA. # 24 | # # 25 | ################################################################################ 26 | 27 | # Mlucas 28 | MLUCAS=./Mlucas 29 | 30 | # Number of iterations (use 100, 1000, or 10000 to match pre-computed values) 31 | ITERS=100 32 | 33 | # Minimum Fermat number (15 or greater) 34 | MIN=15 35 | 36 | # Maximum Fermat number (33 or less) 37 | MAX=29 38 | 39 | # Mlucas arguments 40 | ARGS=( 41 | "$@" 42 | # Add desired -cpu or -core settings here, or as following arguments, e.g. bash ../config-fermat.sh -cpu 0:3 43 | ) 44 | 45 | # First, tiny FFT lengths for F15 to F17 (note 4K is the smallest workable length without fiddly radix settings); 46 | FFTS=([2]=15 [4]=16 [7]=17 [8]=17) 47 | # Then, from small up to egregiously large FFTs for F18 to F33. 48 | # The largest FFT reached is 512M, if MAX is set to 33. 49 | # Note that large FFTs require considerable runtime at 10000 iterations. 50 | for ((n = 0; n < 16; ++n)); do 51 | m=$((1 << n)) 52 | f=$((18 + n)) 53 | for k in 15 16; do 54 | if [[ $k -eq 15 && $n -lt 11 ]]; then 55 | # k = 7 multiples (7K, 14K, ...) become unworkable after F28 (14M). 56 | FFTS[14 * m]=$f 57 | fi 58 | # k = 15, 16 should both be supported up to at least F32. 59 | FFTS[k * m]=$f 60 | if [[ $k -eq 15 && $n -gt 5 ]]; then 61 | # k = 63 is mostly supported for F24 (1008K) and above. 62 | FFTS[63 * m >> 2]=$f 63 | fi 64 | done 65 | done 66 | for fft in "${!FFTS[@]}"; do 67 | f=${FFTS[fft]} 68 | if [[ -n $MIN && $f -lt $MIN ]]; then 69 | continue 70 | elif [[ -n $MAX && $f -gt $MAX ]]; then 71 | break 72 | fi 73 | printf '\n\tTesting F%s (2^%s + 1),\tFFT length: %sK\n\n' "$f" $((1 << f)) "$fft" 74 | args=("${ARGS[@]}") 75 | # First we test the very fiddly F15 and then loop over F16 up to maximum 76 | if [[ $f -eq 15 ]]; then 77 | args+=(-radset 8,8,16) 78 | fi 79 | if [[ $f -le 17 || $f -ge 32 ]]; then 80 | args+=(-shift 0) 81 | fi 82 | time $MLUCAS -f "$f" -fft "$fft" -iters $ITERS "${args[@]}" 2>&1 | tee -a config-fermat.log | grep -i 'error\|warn\|assert\|writing\|pmax_rec\|fft radices' 83 | done 84 | -------------------------------------------------------------------------------- /docs/irrational.txt: -------------------------------------------------------------------------------- 1 | 24 Feb 2022 2 | Prove irrationality of sqrt(2) via N-R iteration formula? 3 | 4 | Let f(x) = x^(-2) − c, applying N-R (dx = -f/f' = (c - 1/x^2)/(-2/x^3) = x.(1-cx^2)/2) to this yields a second-order iterative formula for the reciprocal square-root of the computationally efficient kind we seek, with a per-iteration cost of 1 ADD and 4 MUL: 5 | x_n+1 = x*(3 - c*x^2)/2 6 | Fixed point(s) x* of the iteration given by dx = x*.(1-cx*^2)/2) = 0, with solutions x* = +- 1/sqrt(c) . 7 | Assume x* rational, i.e. x* = 1/sqrt(c) = p/q. In terms of p and q our iteration is 8 | x_n+1 = (p/q)*(3 - c*(p/q)^2)/2 = p.(3.q^2 - 2.p^2)/(2.q^3), i.e. p' = p*(3*q^2 - 2*p^2), q' = (2*q^3) 9 | Example: c = 2, x0 = 1, x_n+1 = x_n*(3 - 2*x_n^2)/2 ... if x_n = p/q, have 10 | bc: 11 | p=q=1 12 | p *= (3*q^2-2*p^2); q = (2*q^3); g = gcd(p,q); p /= g; q /= g; print "gcd = ",g,": p = ",p,", q = ",q,"\n" 13 | n x_n = p/q factorization of p,q 14 | 0 1 15 | 1 1/2 16 | 2 5/8 5,2^3 17 | 3 355/512 5.71,2^9 18 | 4 94852805/134217728 5.23.71.11617,2^27 19 | 5 1709678476417571835487555/2417851639229258349412352 5.23.71.5741.8837.11617.355280903,2^81 20 | 6 p = 5.23.71.3023.5741.8837.11617.27509.355280903.70298580191725636724693742124090124808533, q = 2^243 21 | ... 22 | We observe that for each iteration, gcd(p',q') = 2. Also: 23 | o Once p has a given odd factor, subsequent iterations merely add more odd factors to p 24 | [Q: Are said odd factors all distinct, i.e. is p squarefree?] 25 | o q = 2^, with k tripling on each iteration 26 | Q: Is there a similar trend for other initial choices of p,q? 27 | p0 = 4, q0 = 5: 28 | n x_n = p/q 29 | 0 4/5 30 | 1 86/125 31 | 2 43.32083/5^9 32 | 3 43.32083.308933.24722741/2.5^27 33 | 4 43.1987.32083.197947.308933.5926127.24722741.51537769.1848407118139843/2^3.5^81; so, more observations: 34 | o Any power of 2 in p is reduced by 1 each iteration until p odd, q = 2.odd 35 | o Each distinct prime in the factorization of q has its power tripled each iteration 36 | o Assuming p0,q0 in reduced form (gcd(p,q) = 1), again we have gcd(p,q) = 2 each iteration. 37 | 38 | Without loss of generality we can consider the initial iterate within the basin of monotone convergence and its p0,q0 reduced, i.e. gcd(p0,q0) = 1, thus p0=q0=1 or p0,q0 have opposite parity, and: 39 | 1: p0=q0=1 yields next-iterate p = 1, q = 2, thus of form [2] below. 40 | 2: For p0 odd, q0 even: both 2.p^2 and 3.q^2 even and numerator p*(3*q^2 - 2*p^2) = 2*odd, thus gcd(p',q') = 2 41 | 3: For p0 even, q0 odd: 2.p^2 even and 3.q^2 odd; p*(3*q^2 - 2*p^2) even, denominator 2.q^3 = 2*odd, thus gcd(p',q') = 2 42 | In case [3] the unreduced numerator is divisible by 2^k with k > 1; since (3*q^2 - 2*p^2) odd, said power of 2 is the same as contained in the input value p0, and the ensuing division by the gcd = 2 reduces it by 1, thus after k further iterations we fall into pattern [2] and remain there (e.g. p0,q0 = 4,5 give p = 86,1379569,... and q = 125,1953125,...; p0,q0 = 8,9 give p = 460,269358290,41100860142614334318305635,... and q = 729,387420489,58149737003040059690390169,...). 43 | Thus after a finite number of iterations we inevitably settle into pattern [2] and remain there, thus the iteration converges in the sense that p/q approaches a limit but p,q never do because their gcd remains fixed at 2. QED 44 | *** Not quite - need to show that (or if) gcd cannot include an odd prime *** 45 | For q0 = 2 that is easy - denominator = 2.q^3, if it starts as a power of 2 it stays there. 46 | 47 | Now try c = 3: Iterative-update is p = p.(3.q^2 - c.p^2) = 3.p.(q^2 - p^2), q = (2.q^3). 48 | Again use p0 = 1, q0 = 2: 49 | n x_n = p/q 50 | 1 3^2/2^4 51 | 2 3^3.5^2.7/2^13 52 | 3 3^4.5^2.7.3467.12917/2^40 53 | Denominator = 2^k, k = (3.n+1); if p0 odd, numerator = odd.odd.(even-odd) always odd, hence sqrt(3) irrational. 54 | 55 | Now try a (rational)^2, c = 9/16, yielding p = 3.p.(16.q^2 - 3.p^2), q = (2^5.q^3), same initial guess: 56 | n x_n = p/q 57 | 0 1/2 58 | 1 3.61/2^8 59 | 2 3^2.61.107.1511/2^25 60 | ... this clearly converges -> 4/3, but here's the rub: p/q can converge in the sense of the limit as n -> oo, but p and q converge only in this same sense, i.e. there's no reason to expect gcd(p,q) to magically hit a nonzero value such that the resuling gcd-reduced p = 4 and q = 3 in a finite number of steps.. 61 | 62 | -------------------------------------------------------------------------------- /docs/pm1_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/primesearch/Mlucas/5e6465318b8c656ffb83025229038f5c2614fa35/docs/pm1_compare.png -------------------------------------------------------------------------------- /docs/pm1_compare.txt: -------------------------------------------------------------------------------- 1 | P-1 relative modmul count for stage 2 with b1=1m, b2=30m, best-bigstep option as tabulated in the 2 | comments preceding my pm1_bigstep_size() function, for various prime-pairing #memory buffers: 3 | #buf #modmul 4 | 24 1.00000000000000000000 5 | 40 0.94138773325629738116 6 | 48 0.92740481217168044188 7 | 72 0.86767843536398087761 8 | 80 0.86074072083480994905 9 | 96 0.83101581644392863201 10 | 120 0.80442105746400556719 11 | 144 0.78374955701528234292 12 | 160 0.77282735940798766528 13 | 168 0.76741692741880852926 14 | 192 0.75413606363080072042 15 | 200 0.74602321491854959665 16 | 216 0.74320938718907791436 17 | 240 0.72551239265493547959 18 | 280 0.70928837479334378021 19 | 320 0.69613179866071653512 20 | 336 0.69459499859756496969 21 | 360 0.68523479449708007988 22 | 384 0.68135948300814196113 23 | 400 0.67617411244897627841 24 | 432 0.67047815476500395537 25 | 440 0.66840613398769104328 26 | 480 0.66141411359107905758 27 | 520 0.65619459191939889563 28 | 528 0.65365901177877469167 29 | 560 0.65110495644613445796 30 | 576 0.64716694127520254129 31 | 600 0.64677560311704482025 32 | 624 0.64151017339247634195 33 | 672 0.63652914965409401857 34 | 720 0.63217516273564967452 35 | 768 0.62836031551149128950 36 | 816 0.62483771223376828414 37 | 864 0.62172716172343309377 38 | 912 0.61890717559662273489 39 | 960 0.61633464507196647144 40 | 1008 0.61408235120892139029 41 | 1040 0.61240950655001542398 42 | 1056 0.61196946106745180634 43 | 1104 0.60997358047541707745 44 | 1120 0.60748670765919877011 45 | 1200 0.60309241123090127025 46 | 1280 0.59918350848375218833 47 | 1360 0.59569673588145421035 48 | 1440 0.59258058682808385945 49 | 1520 0.58975556201254185610 50 | 1600 0.58720542566002623477 51 | 1680 0.58489434709511196806 52 | 1760 0.58283856209260102151 53 | 1824 0.58255863494084299501 54 | 1840 0.58092777935470073268 55 | 1920 0.57913848500066342734 56 | 2000 0.57749587247414732789 57 | 2080 0.57601449798704385170 58 | 2112 0.57561308245142284171 59 | 2160 0.57463221771166271689 60 | 2208 0.57365527195202720444 61 | 2240 0.57334175354205821477 62 | 2304 0.57187661482975670411 63 | 2400 0.57025303734956015046 64 | -------------------------------------------------------------------------------- /docs/pm1_d210.txt: -------------------------------------------------------------------------------- 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 210, for various prime-pairing #memory buffers: 2 | #buf #modmul 3 | 24 8482142 4 | 72 7462541 5 | 120 6903803 6 | 168 6578211 7 | 216 6369191 8 | 264 6213292 9 | 312 6102927 10 | 360 6019017 11 | 408 5948963 12 | 456 5893249 13 | 504 5848289 14 | 552 5809671 15 | 600 5779087 16 | 648 5750744 17 | 696 5727095 18 | 744 5706454 19 | 792 5687784 20 | 840 5671474 21 | 888 5657449 22 | 936 5643928 23 | 984 5631970 24 | 1032 5621610 25 | 1080 5610944 26 | 1128 5602213 27 | 1176 5594236 28 | 1224 5587383 29 | 1272 5579976 30 | 1320 5573772 31 | 1368 5568653 32 | 1416 5562780 33 | 1464 5557665 34 | 1512 5552816 35 | 1560 5548156 36 | 1608 5543827 37 | 1656 5539520 38 | 1704 5535999 39 | 1752 5532637 40 | 1800 5529301 41 | 1848 5526107 42 | 1896 5523275 43 | 1944 5520469 44 | 1992 5517760 45 | 2040 5515237 46 | 2088 5513037 47 | 2136 5510383 48 | 2184 5508344 49 | 2232 5506230 50 | 2280 5504316 51 | 2328 5502474 52 | 2376 5500748 53 | -------------------------------------------------------------------------------- /docs/pm1_d330.txt: -------------------------------------------------------------------------------- 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 330, for various prime-pairing #memory buffers: 2 | #buf #modmul 3 | 40 8047788 4 | 120 7036479 5 | 200 6488371 6 | 280 6120697 7 | 360 5927827 8 | 440 5764103 9 | 520 5645736 10 | 600 5550700 11 | 680 5481350 12 | 760 5426563 13 | 840 5378130 14 | 920 5339380 15 | 1000 5305896 16 | 1080 5275177 17 | 1160 5247276 18 | 1240 5225875 19 | 1320 5206908 20 | 1400 5188988 21 | 1480 5174005 22 | 1560 5159755 23 | 1640 5147646 24 | 1720 5134944 25 | 1800 5125084 26 | 1880 5115890 27 | 1960 5107361 28 | 2040 5099600 29 | 2120 5092087 30 | 2200 5085266 31 | 2280 5077999 32 | 2360 5072329 33 | -------------------------------------------------------------------------------- /docs/pm1_d420.txt: -------------------------------------------------------------------------------- 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 420, for various prime-pairing #memory buffers: 2 | #buf #modmul 3 | 48 7799105 4 | 144 6767956 5 | 240 6212962 6 | 336 5887052 7 | 432 5678594 8 | 528 5520431 9 | 624 5413006 10 | 720 5329443 11 | 816 5259481 12 | 912 5203978 13 | 1008 5159100 14 | 1104 5119750 15 | 1200 5088646 16 | 1296 5060438 17 | 1392 5036585 18 | 1488 5016366 19 | 1584 4997817 20 | 1680 4981221 21 | 1776 4966761 22 | 1872 4953372 23 | 1968 4941632 24 | 2064 4931285 25 | 2160 4921051 26 | 2256 4911881 27 | 2352 4904025 28 | -------------------------------------------------------------------------------- /docs/pm1_d660.txt: -------------------------------------------------------------------------------- 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 660, for various prime-pairing #memory buffers: 2 | #buf #modmul 3 | 80 7599932 4 | 240 6585540 5 | 400 6054425 6 | 560 5683123 7 | 720 5486768 8 | 880 5324522 9 | 1040 5204859 10 | 1200 5109664 11 | 1360 5038998 12 | 1520 4984967 13 | 1680 4937214 14 | 1840 4898839 15 | 2000 4865619 16 | 2160 4834781 17 | 2320 4807728 18 | -------------------------------------------------------------------------------- /docs/pm1_d840.txt: -------------------------------------------------------------------------------- 1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 840, for various prime-pairing #memory buffers: 2 | #buf #modmul 3 | 96 7451024 4 | 288 6422689 5 | 480 5869480 6 | 672 5542247 7 | 864 5332830 8 | 1056 5174753 9 | 1248 5067407 10 | 1440 4984033 11 | 1632 4914157 12 | 1824 4858498 13 | 2016 4813486 14 | 2208 4773984 15 | 2400 4743455 16 | -------------------------------------------------------------------------------- /docs/predefs_mac.txt: -------------------------------------------------------------------------------- 1 | #define __DBL_MIN_EXP__ (-1021) 2 | #define __FLT_MIN__ 1.17549435e-38F 3 | #define __DEC64_DEN__ 0.000000000000001E-383DD 4 | #define TRUE 1 5 | #define __CHAR_BIT__ 8 6 | #define BIT_CLR(x,b) ( (x) &= ~(1 << (b)) ) 7 | #define CPU_NAME "x86_64" 8 | #define ALIGN_VEC_U64(_p) ALIGN_UINT64(_p) 9 | #define __WCHAR_MAX__ 2147483647 10 | #define __DBL_DENORM_MIN__ 4.9406564584124654e-324 11 | #define __FLT_EVAL_METHOD__ 0 12 | #define STRNEQN(s1,s2,n) ( strncmp(s1,s2,n)) 13 | #define ALIGN_f128(_p) (__float128 *)(((long)(_p) | 127)+1) 14 | #define __DBL_MIN_10_EXP__ (-307) 15 | #define __FINITE_MATH_ONLY__ 0 16 | #define ALLOC_COMPLEX(_p,_n) (struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512) 17 | #define L2_SZ_VD 3 18 | #define ALIGN_COMPLEX(_p) (struct complex*)(((long)(_p) | 127)+1) 19 | #define __DEC64_MAX_EXP__ 384 20 | #define __SHRT_MAX__ 32767 21 | #define __LDBL_MAX__ 1.18973149535723176502e+4932L 22 | #define __APPLE_CC__ 5666 23 | #define __UINTMAX_TYPE__ long unsigned int 24 | #define __DEC32_EPSILON__ 1E-6DF 25 | #define __block __attribute__((__blocks__(byref))) 26 | #define ALLOC_INT64(_p,_n) (int64 *)realloc(_p,(_n)*sizeof(int64 )+256) 27 | #define ALIGN_UINT64(_p) (uint64 *)(((long)(_p) | 63)+1) 28 | #define STREQ(s1,s2) (!strcmp(s1,s2)) 29 | #define __SCHAR_MAX__ 127 30 | #define HERE __LINE__, __FILE__ 31 | #define align_h_included 32 | #define __USER_LABEL_PREFIX__ _ 33 | #define __STDC_HOSTED__ 1 34 | #define ALLOC_UINT128(_p,_n) (uint128 *)realloc(_p,(_n+_n)*sizeof(uint64 )+256) 35 | #define __DEC64_MIN_EXP__ (-383) 36 | #define BIT_SETC(x,b,condition) ( (x) |= ((condition) << (b)) ) 37 | #define __DBL_DIG__ 15 38 | #define __FLT_EPSILON__ 1.19209290e-7F 39 | #define ALLOC_POINTER(_p,_ptr_type,_n) (_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64) 40 | #define __LDBL_MIN__ 3.36210314311209350626e-4932L 41 | #define __DEC32_MAX__ 9.999999E96DF 42 | #define OS_POSIX_COMPLIANT 43 | #define __strong 44 | #define COMPILER_NAME "Gnu C [or other compatible]" 45 | #define __APPLE__ 1 46 | #define __DECIMAL_DIG__ 21 47 | #define SZ_VDM1 7 48 | #define __LDBL_HAS_QUIET_NAN__ 1 49 | #define ALLOC_DOUBLE(_p,_n) (double *)realloc(_p,(_n)*sizeof(double )+512) 50 | #define __DYNAMIC__ 1 51 | #define __GNUC__ 4 52 | #define __MMX__ 1 53 | #define __FLT_HAS_DENORM__ 1 54 | #define ALLOC_VEC_DBL(_p,_n) ALLOC_DOUBLE(_p,_n) 55 | #define __DBL_MAX__ 1.7976931348623157e+308 56 | #define __DBL_HAS_INFINITY__ 1 57 | #define ALLOC_FLOAT(_p,_n) (float *)realloc(_p,(_n)*sizeof(float )+256) 58 | #define __DEC32_MIN_EXP__ (-95) 59 | #define ALIGN_UINT128(_p) (uint128 *)(((long)(_p) | 63)+1) 60 | #define OBJC_NEW_PROPERTIES 1 61 | #define __LDBL_HAS_DENORM__ 1 62 | #define __DEC32_MIN__ 1E-95DF 63 | #define __weak __attribute__((objc_gc(weak))) 64 | #define ALLOC_f128(_p,_n) (__float128 *)realloc(_p,(_n)*sizeof(__float128 )+512) 65 | #define __DBL_MAX_EXP__ 1024 66 | #define __DEC128_EPSILON__ 1E-33DL 67 | #define __SSE2_MATH__ 1 68 | #define STRNEQ(s1,s2) ( strcmp(s1,s2)) 69 | #define __amd64 1 70 | #define __tune_core2__ 1 71 | #define __LONG_LONG_MAX__ 9223372036854775807LL 72 | #define IS_ODD(a) ( (int)(a) & 1) 73 | #define NINT(x) floor(x + 0.5) 74 | #define BIT_SET(x,b) ( (x) |= (1 << (b)) ) 75 | #define platform_h_included 76 | #define FP_MANTISSA_BITS_DOUBLE 64 77 | #define __GXX_ABI_VERSION 1002 78 | #define COMPILER_TYPE_GCC 79 | #define ALIGN_INT(_p) (int *)(((long)(_p) | 63)+1) 80 | #define __FLT_MIN_EXP__ (-125) 81 | #define DNINT(x) lrint((x)) 82 | #define __x86_64 1 83 | #define CPU_SUBTYPE_NAME "Unknown CPU subtype" 84 | #define __DBL_MIN__ 2.2250738585072014e-308 85 | #define COMPILER_VERSION __VERSION__ 86 | #define ALIGN_VEC_DBL(_p) ALIGN_DOUBLE(_p) 87 | #define __LP64__ 1 88 | #define __DBL_HAS_QUIET_NAN__ 1 89 | #define ALLOC_INT(_p,_n) (int *)realloc(_p,(_n)*sizeof(int )+256) 90 | #define __DEC128_MIN__ 1E-6143DL 91 | #define __REGISTER_PREFIX__ 92 | #define __DBL_HAS_DENORM__ 1 93 | #define __NO_INLINE__ 1 94 | #define __DEC_EVAL_METHOD__ 2 95 | #define types_h_included 96 | #define __DEC128_MAX__ 9.999999999999999999999999999999999E6144DL 97 | #define __FLT_MANT_DIG__ 24 98 | #define __VERSION__ "4.2.1 (Apple Inc. build 5666) (dot 3)" 99 | #define MOD_ADD32(__x,__y,__q,__z) { uint64 _xx = __x, _yy = __y, _qq = __q, _zz = __z; MOD_ADD64(_xx, _yy, _qq, _zz); __z = (uint32)_zz; } 100 | #define ALLOC_QFLOAT(_p,_n) ALLOC_UINT128(_p,_n) 101 | #define ARRAYS_DISJOINT(xarr,lenx,yarr,leny) ((yarr+leny <= xarr) || (yarr >= xarr+lenx)) 102 | #define MOD_ADD64(__x,__y,__q,__z) { uint64 cy,tmp; tmp = __x + __y; cy = tmp < __x; __z = tmp - __q; cy -= __z > tmp; __z = __z + (cy & __q); } 103 | #define IS_EVEN(a) (~(int)(a) & 1) 104 | #define HACK_ALIGN_STACK_ODD() 105 | #define __DEC64_EPSILON__ 1E-15DD 106 | #define __DEC128_MIN_EXP__ (-6143) 107 | #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ 1068 108 | #define __SIZE_TYPE__ long unsigned int 109 | #define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((long)(_p) | 63)+1) 110 | #define __DEC32_DEN__ 0.000001E-95DF 111 | #define CMUL(ar,ai,br,bi,cr,ci) { double __tmp = ar; ci = __tmp*bi + ai*br; cr = __tmp*br - ai*bi;} 112 | #define __FLT_RADIX__ 2 113 | #define __LDBL_EPSILON__ 1.08420217248550443401e-19L 114 | #define SGN(x,b) ((b) == 1 ? -(x) : (x)) 115 | #define __SSE_MATH__ 1 116 | #define __k8 1 117 | #define __LDBL_DIG__ 18 118 | #define __x86_64__ 1 119 | #define OS_VERSION "[Unknown]" 120 | #define HACK_ALIGN_STACK_EVEN() 121 | #define ABS(a) ((a) < 0 ? -(a) : (a)) 122 | #define X32_ASM 123 | #define __FLT_HAS_QUIET_NAN__ 1 124 | #define __FLT_MAX_10_EXP__ 38 125 | #define __LONG_MAX__ 9223372036854775807L 126 | #define __FLT_HAS_INFINITY__ 1 127 | #define __DEC64_MAX__ 9.999999999999999E384DD 128 | #define ALIGN_UINT(_p) (uint *)(((long)(_p) | 63)+1) 129 | #define __DEC64_MANT_DIG__ 16 130 | #define OS_TYPE 131 | #define __DEC32_MAX_EXP__ 96 132 | #define __DEC128_DEN__ 0.000000000000000000000000000000001E-6143DL 133 | #define MOD_SUB32(__x,__y,__q,__z) { uint64 _xx = __x, _yy = __y, _qq = __q, _zz = __z; MOD_SUB64(_xx, _yy, _qq, _zz); __z = (uint32)_zz; } 134 | #define OS_TYPE_MACOSX 135 | #define ALLOC_UINT64(_p,_n) (uint64 *)realloc(_p,(_n)*sizeof(uint64 )+256) 136 | #define __LITTLE_ENDIAN__ 1 137 | #define CPU_IS_X86_64 138 | #define RE_IM_STRIDE 1 139 | #define MOD_SUB64(__x,__y,__q,__z) { uint64 bw,tmp; tmp = __x - __y; bw = tmp > __x; __z = tmp + __q; bw -= __z < tmp; __z = __z - (bw & __q); } 140 | #define __LDBL_MANT_DIG__ 64 141 | #define __CONSTANT_CFSTRINGS__ 1 142 | #define ALIGN_DOUBLE(_p) (double *)(((long)(_p) | 127)+1) 143 | #define ALLOC_UINT(_p,_n) (uint *)realloc(_p,(_n)*sizeof(uint )+256) 144 | #define __DEC32_MANT_DIG__ 7 145 | #define __k8__ 1 146 | #define __WCHAR_TYPE__ int 147 | #define FALSE 0 148 | #define __pic__ 2 149 | #define MULH64_FAST 150 | #define __FLT_DIG__ 6 151 | #define __INT_MAX__ 2147483647 152 | #define ALIGN_INT64(_p) (int64 *)(((long)(_p) | 63)+1) 153 | #define __FLT_MAX_EXP__ 128 154 | #define __BLOCKS__ 1 155 | #define __DBL_MANT_DIG__ 53 156 | #define CPU_TYPE 157 | #define __DEC64_MIN__ 1E-383DD 158 | #define __WINT_TYPE__ int 159 | #define __SSE__ 1 160 | #define __LDBL_MIN_EXP__ (-16381) 161 | #define __MACH__ 1 162 | #define X64_ASM 163 | #define __amd64__ 1 164 | #define __LDBL_MAX_EXP__ 16384 165 | #define __SSP__ 1 166 | #define ARRAYS_OVERLAP(xarr,lenx,yarr,leny) !ARRAYS_DISJOINT(xarr,lenx,yarr,leny) 167 | #define __LDBL_MAX_10_EXP__ 4932 168 | #define __DBL_EPSILON__ 2.2204460492503131e-16 169 | #define _LP64 1 170 | #define __GNUC_PATCHLEVEL__ 1 171 | #define __LDBL_HAS_INFINITY__ 1 172 | #define __INTMAX_MAX__ 9223372036854775807L 173 | #define __FLT_DENORM_MIN__ 1.40129846e-45F 174 | #define __PIC__ 2 175 | #define OS_BITS 64 176 | #define __FLT_MAX__ 3.40282347e+38F 177 | #define __SSE2__ 1 178 | #define BIT_FLIP(x,b) ( (x) ^= (1 << (b)) ) 179 | #define __FLT_MIN_10_EXP__ (-37) 180 | #define __INTMAX_TYPE__ long int 181 | #define __DEC128_MAX_EXP__ 6144 182 | #define ALLOC_VEC_U64(_p,_n) ALLOC_UINT64(_p,_n) 183 | #define __GNUC_MINOR__ 2 184 | #define __DBL_MAX_10_EXP__ 308 185 | #define SZ_VD 8 186 | #define __LDBL_DENORM_MIN__ 3.64519953188247460253e-4951L 187 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 188 | #define __STDC__ 1 189 | #define __PTRDIFF_TYPE__ long int 190 | #define ALIGN_FLOAT(_p) (float *)(((long)(_p) | 63)+1) 191 | #define ALIGN_QFLOAT(_p) ALIGN_UINT128(_p) 192 | #define STREQN(s1,s2,n) (!strncmp(s1,s2,n)) 193 | #define OS_NAME "OS X" 194 | #define __DEC128_MANT_DIG__ 34 195 | #define __LDBL_MIN_10_EXP__ (-4931) 196 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 197 | #define BIT_TEST(x,b) ( ((x) >> (b)) & 1 ) 198 | #define __GNUC_GNU_INLINE__ 1 199 | #define COMPILER_TYPE 200 | #define __SSE3__ 1 201 | -------------------------------------------------------------------------------- /docs/qs.txt: -------------------------------------------------------------------------------- 1 | A p-1 run found the following 53-digit composite factor of M(109228331), which factors into p25*p29: 2 | q = 67043584777242522312784510096836476580550779917618449 = 3258278300321182416433937 * 20576383782390150543028926977 3 | The prime factors themselves have p-1 factorizations 4 | p25-1 = p*2^4.11.113.305611.4907867 5 | p29-1 = p*2^9.577.20929.28687.1062073, which is why a p-1 run to B1 = 10^6 and B2 > 10^6 found them both in stage 2. 6 | 7 | For such composite factors of M(p) = q1.q2 8 | = (2.k1.p+1).(2.k2.p+1) = 4.k1.k2.p^2 + 2.(k1+k2).p + 1 = (2.k1.k2.p + k1 + k2).2.p + 1 = 2.p.F + 1, 9 | where F := (2.k1.k2.p + k1 + k2) = 306896499120006339347913821446363016297258904 ... Need to find k1,k2! 10 | Seems we could do better than e.g. ECM or QS on (n-1), since we have that k1,k2 must satisfy F == k1 + k2 (mod p). 11 | 12 | If k1,k2 < p, things are easy: F/2p = k1.k2, i.e. quotient Q = k1.k2, remainder R = k1 + k2, can just brute-force loop over all k1 <= sqrt(Q) which divide Q, compute k2 = Q/k1, see if R = k1+k2. Will this work in the general case where k1,k2 may be quite a bit larger than 2p? For the above example, k1 = 14914987121432728, k2 = 94189774731567355648. The true 2.k1.k2.p = 306896499120006339347913727241673297608470528, 13 | whereas q/2p = 306896499120006339347913821446363016297258904; difference too large to make the above idea workable. 14 | 15 | Know F == (k1+k2) mod 2p; in our case (k1+k2) == 98326026 mod 2p, useless because requires a priori knowledge of k1,k2. 16 | 17 | 5/23/21: Can at least do p-1 with S1 seed = p on n ... wait: 18 | [We open our next scene with a hand slapping the owner's forehead, accompanied by the utterance "doh!"] 19 | 20 | Re above: In fact it seems silly to use powerful general-modulus factoring machinery like ECM or QS on such (p-1)-found factor-product composites. Here's why: say we have some product of prime factors F = f1*f2*...*fn discovered by running p-1 to stage bounds b1 and b2 on an input Mersenne M(p) (or other bigum modulus with factors of a known form, allowing p-1 to be 'seeded' with a component of same). BY DEFINITION, each prime factor f1-fn will be b1/b2-smooth, in the sense than fj = 2*p*C + 1, where C is a composite all of whose prime factors are <= b1, save possibly one outlier-prime factor > b1 and <= b2. Thus if we again run p-1 to bounds b1/b2, but now with arithmetic modulo the relatively tiny factor product F, we are guaranteed to resolve all the prime factors f1-fn - the only trick is that we will need to do multiple GCDs along the way in order to capture the individual prime factors f1,...,fn, rather than have this secondary p-1 run modulo F again produce the same composite GCD = F which the original p-1 run mod M(p) did. Again, though, since in the followup p-1 run we are working mod F, all the arithmetic is trivially cheap, including the needed GCDs. 21 | 22 | ==================================== 23 | 24 | Use above example composite to work through the basics of ECM: 25 | [to-do!] 26 | 27 | ==================================== 28 | 29 | Use above example composite to work through the basics of ECM and the Quadratic Sieve factorization algorithm. 30 | Wikipedia: 31 | 32 | "The algorithm attempts to set up a congruence of squares modulo n (the integer to be factorized), which often leads to a factorization of n. The algorithm works in two phases: the data collection phase, where it collects information that may lead to a congruence of squares; and the data processing phase, where it puts all the data it has collected into a matrix and solves it to obtain a congruence of squares. The data collection phase can be easily parallelized to many processors, but the data processing phase requires large amounts of memory, and is difficult to parallelize efficiently over many nodes or if the processing nodes do not each have enough memory to store the whole matrix. The block Wiedemann algorithm can be used in the case of a few systems each capable of holding the matrix. 33 | 34 | "The naive approach to finding a congruence of squares is to pick a random number, square it, and hope the least non-negative remainder modulo n is a perfect square (in the integers). For example, 802 mod 5959 is 441, which is 212. This approach finds a congruence of squares only rarely for large n, but when it does find one, more often than not, the congruence is nontrivial and the factorization is complete. This is roughly the basis of Fermat's factorization method." 35 | 36 | -------------------------------------------------------------------------------- /src/align.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef align_h_included 27 | #define align_h_included 28 | 29 | #include "types.h" 30 | 31 | /* These are for basic memory allocation, and to force alignment of array data on desired-byte boundaries. 32 | We use the normally-not-recommended immediate-overwrite-of-pointer form of realloc() because if the returned 33 | pointer is null we exit immediately, thus the resulting memory leak is never an issue. 34 | 35 | In the Align macros we cast pointers to longto accommodate architectures which use 64-bit address arithmetic. 36 | Note that rather than simply assuming sizeof(void *) <= sizeof(long), we check this at program invocation, in 37 | util.c::check_nbits_in_types()> 38 | */ 39 | 40 | #define ALLOC_INT(_p,_n) (int *)realloc(_p,(_n)*sizeof(int )+256) 41 | #define ALIGN_INT(_p) (int *)(((intptr_t)(_p) | 63)+1) 42 | 43 | #define ALLOC_UINT(_p,_n) (uint *)realloc(_p,(_n)*sizeof(uint )+256) 44 | #define ALIGN_UINT(_p) (uint *)(((intptr_t)(_p) | 63)+1) 45 | 46 | #define ALLOC_INT64(_p,_n) (int64 *)realloc(_p,(_n)*sizeof(int64 )+256) 47 | #define ALIGN_INT64(_p) (int64 *)(((intptr_t)(_p) | 63)+1) 48 | 49 | #define ALLOC_UINT64(_p,_n) (uint64 *)realloc(_p,(_n)*sizeof(uint64 )+256) 50 | #define ALIGN_UINT64(_p) (uint64 *)(((intptr_t)(_p) | 63)+1) 51 | 52 | #define ALLOC_UINT128(_p,_n)(uint128 *)realloc(_p,(_n+_n)*sizeof(uint64 )+256) 53 | #define ALIGN_UINT128(_p) (uint128 *)(((intptr_t)(_p) | 63)+1) 54 | 55 | #define ALLOC_FLOAT(_p,_n) (float *)realloc(_p,(_n)*sizeof(float )+256) 56 | #define ALIGN_FLOAT(_p) (float *)(((intptr_t)(_p) | 63)+1) 57 | 58 | #define ALLOC_DOUBLE(_p,_n) (double *)realloc(_p,(_n)*sizeof(double )+512) 59 | #define ALIGN_DOUBLE(_p) (double *)(((intptr_t)(_p) | 127)+1) 60 | 61 | #define ALLOC_f128(_p,_n) (__float128 *)realloc(_p,(_n)*sizeof(__float128 )+512) 62 | #define ALIGN_f128(_p) (__float128 *)(((intptr_t)(_p) | 127)+1) 63 | 64 | #define ALLOC_COMPLEX(_p,_n)(struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512) 65 | #define ALIGN_COMPLEX(_p) (struct complex*)(((intptr_t)(_p) | 127)+1) 66 | 67 | // Vector-double|uint64-alloc used by SIMD builds; register size difference between YMM and XMM taken care of by def of vec_dbl in types.h: 68 | #ifdef USE_SSE2 69 | 70 | #define ALLOC_VEC_DBL(_p,_n)(vec_dbl*)realloc(_p,(_n)*sizeof(vec_dbl)+512) 71 | #define ALIGN_VEC_DBL(_p) (vec_dbl*)(((intptr_t)(_p) | 127)+1) 72 | 73 | #define ALLOC_VEC_U64(_p,_n)(vec_u64*)realloc(_p,(_n)*sizeof(vec_u64)+512) 74 | #define ALIGN_VEC_U64(_p) (vec_u64*)(((intptr_t)(_p) | 127)+1) 75 | 76 | #else // In scalar-mode simply use the above double|uint64 macros: 77 | 78 | #define ALLOC_VEC_DBL(_p,_n) ALLOC_DOUBLE(_p,_n) 79 | #define ALIGN_VEC_DBL(_p) ALIGN_DOUBLE(_p) 80 | 81 | #define ALLOC_VEC_U64(_p,_n) ALLOC_UINT64(_p,_n) 82 | #define ALIGN_VEC_U64(_p) ALIGN_UINT64(_p) 83 | 84 | #endif 85 | 86 | #define ALLOC_POINTER(_p,_ptr_type,_n)(_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64) 87 | #define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((intptr_t)(_p) | 63)+1) 88 | 89 | #define ALLOC_QFLOAT(_p,_n) ALLOC_UINT128(_p,_n) 90 | #define ALIGN_QFLOAT(_p) ALIGN_UINT128(_p) 91 | 92 | /* 93 | On the x86 family, alignment of the stack is very important 94 | This uses the GNU gcc __builtin_alloca function to align doubles properly 95 | This is taken from GNU/FFTW package 96 | */ 97 | #ifdef COMPILER_TYPE_GCC 98 | # if (defined(__i386)) 99 | # define HACK_ALIGN_STACK_EVEN(){ \ 100 | if( (((uint64) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4);\ 101 | } 102 | 103 | # define HACK_ALIGN_STACK_ODD() { \ 104 | if(!(((uint64) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4);\ 105 | } 106 | # else 107 | # define HACK_ALIGN_STACK_EVEN() /* */ 108 | # define HACK_ALIGN_STACK_ODD() /* */ 109 | # endif 110 | #else 111 | # define HACK_ALIGN_STACK_EVEN() /* */ 112 | # define HACK_ALIGN_STACK_ODD() /* */ 113 | #endif 114 | 115 | 116 | #endif /* align_h_included */ 117 | -------------------------------------------------------------------------------- /src/f2psp.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2012 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /******************************************************************************* 24 | We now include this header file if it was not included before. 25 | *******************************************************************************/ 26 | #ifndef f2psp_h_included 27 | #define f2psp_h_included 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #define MI64_IS_DIV_BY_SCALAR32P_X8_SSE2(\ 34 | array_64x8inputs,\ 35 | q, \ 36 | qinv, \ 37 | retval \ 38 | )\ 39 | {\ 40 | DBG_ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\ 41 | DBG_ASSERT(((uint32)&a[0] & 0x3f) == 0, "A-array not 64-byte aligned!");\ 42 | __asm mov eax, array_64x8inputs /* Assumes inputs a,b,c,d,... are 64-bit separated and &a[0} is 64-byte aligned */\ 43 | __asm lea ebx, q\ 44 | __asm lea ecx, qinv\ 45 | __asm movaps xmm0,[eax ] /* ab: d3210 = [bhi|blo|ahi|alo] */\ 46 | __asm movaps xmm1,[eax+0x10] /* cd: d3210 = [dhi|dlo|chi|clo] */\ 47 | __asm movaps xmm2,[eax+0x20] /* ef: d3210 = [fhi|flo|ehi|elo] */\ 48 | __asm movaps xmm3,[eax+0x30] /* gh: d3210 = [hhi|hlo|ghi|glo] */\ 49 | __asm movaps xmm6,xmm0 /* Circularly-permute [4,6,7] -> [6,7,4] here so the 2 packed outputs end up in xmm6,7 */\ 50 | __asm movaps xmm5,xmm1\ 51 | __asm movaps xmm7,xmm2\ 52 | __asm movaps xmm4,xmm3\ 53 | __asm psrlq xmm6, 32 /* d3210 = [ 0|bhi| 0|ahi] */\ 54 | __asm psrlq xmm5, 32 /* d3210 = [ 0|dhi| 0|chi] */\ 55 | __asm psrlq xmm7, 32 /* d3210 = [ 0|fhi| 0|ehi] */\ 56 | __asm psrlq xmm4, 32 /* d3210 = [ 0|hhi| 0|ghi] */\ 57 | __asm psllq xmm5, 32 /* d3210 = [dhi| 0|chi| 0] */\ 58 | __asm psllq xmm4, 32 /* d3210 = [hhi| 0|ghi| 0] */\ 59 | __asm paddd xmm6,xmm5 /* d3210 = [dhi|bhi|chi|ahi], xmm5 FREE */\ 60 | __asm paddd xmm7,xmm4 /* d3210 = [hhi|fhi|ghi|ehi], xmm4 FREE */\ 61 | __asm movd xmm4,[ebx]\ 62 | __asm movd xmm5,[ecx]\ 63 | __asm pshufd xmm4,xmm4,0x44 /* Broadcast q to slots 0,2 of xmm4 */\ 64 | __asm pshufd xmm5,xmm5,0x44 /* Broadcast qinv to slots 0,2 of xmm5 */\ 65 | /* (a-h)[0]*qinv; Alas SSE2 has no 32-bit low-half packed MUL, so use 32x32->64 -bit and discard high halves */\ 66 | __asm pmuludq xmm0,xmm5\ 67 | __asm pmuludq xmm1,xmm5\ 68 | __asm pmuludq xmm2,xmm5\ 69 | __asm pmuludq xmm3,xmm5\ 70 | /* cy[0-7] = MULH32(tmp[0-7]*q) - high halves of above MULQs automatically get overwritten: */\ 71 | __asm pmuludq xmm0,xmm4\ 72 | __asm pmuludq xmm1,xmm4\ 73 | __asm pmuludq xmm2,xmm4\ 74 | __asm pmuludq xmm3,xmm4\ 75 | __asm psrlq xmm0, 32 /* d3210 = [ 0|cy1| 0|cy0] */\ 76 | __asm psrlq xmm1, 32 /* d3210 = [ 0|cy3| 0|cy2] */\ 77 | __asm psrlq xmm2, 32 /* d3210 = [ 0|cy5| 0|cy4] */\ 78 | __asm psrlq xmm3, 32 /* d3210 = [ 0|cy7| 0|cy6] */\ 79 | __asm psllq xmm1, 32 /* d3210 = [cy3| 0|cy2| 0] */\ 80 | __asm psllq xmm3, 32 /* d3210 = [cy7| 0|cy6| 0] */\ 81 | __asm paddd xmm0,xmm1 /* d3210 = [cy3|cy1|cy2|cy0], xmm1 FREE */\ 82 | __asm paddd xmm2,xmm3 /* d3210 = [cy7|cy5|cy6|cy4], xmm3 FREE */\ 83 | __asm movaps xmm3,xmm6 /* Copy of acbd[1] */\ 84 | __asm movaps xmm1,xmm7 /* Copy of efgh[1] */\ 85 | __asm psubd xmm6,xmm0 /* acbd[1] - cy0213, xmm0 FREE */\ 86 | __asm psubd xmm7,xmm2 /* egfh[1] - cy4657, xmm2 FREE */\ 87 | __asm movaps xmm2,xmm6 /* Copy of acbd[1] - cy0213 */\ 88 | __asm movaps xmm0,xmm7 /* Copy of efgh[1] - cy4657 */\ 89 | /* Had a borrow? Frickin' SSE2 only gives us signed packed-integer compares,\ 90 | so need to emulate unsigned (x > y) via signed (x ^ 0x80000000) < (y ^ 0x80000000): */\ 91 | __asm pcmpeqd xmm4,xmm4 /* All 1s - will need to restore q to this register later */\ 92 | __asm pslld xmm4, 31 /* 4-way 0x80000000 */\ 93 | __asm pxor xmm6,xmm4 /* (acbd[1]-cy0213) ^ 0x80000000 */\ 94 | __asm pxor xmm7,xmm4 /* (egfh[1]-cy4657) ^ 0x80000000 */\ 95 | __asm pxor xmm3,xmm4 /* (acbd[1]) ^ 0x80000000 */\ 96 | __asm pxor xmm1,xmm4 /* (egfh[1]) ^ 0x80000000 */\ 97 | __asm pcmpgtd xmm6,xmm3 /* cy0213 = (acbd[1]-cy0213) > abcd[1], xmm3 FREE */\ 98 | __asm pcmpgtd xmm7,xmm1 /* cy4657 = (egfh[1]-cy4657) > efgh[1], xmm1 FREE */\ 99 | __asm pshufd xmm3,xmm2,0x31 /* xmm2 = [----|tmp1|----|tmp0], xmm3 = [----|tmp3|----|tmp2], don't care what's in ---- slots */\ 100 | __asm pshufd xmm1,xmm0,0x31 /* xmm0 = [----|tmp5|----|tmp4], xmm1 = [----|tmp7|----|tmp6], don't care what's in ---- slots */\ 101 | __asm movd xmm4,[ebx] /* Restore q to xmm4 */\ 102 | __asm pshufd xmm4,xmm4,0x44 /* Broadcast q to slots 0,2 of xmm4 */\ 103 | /* tmp[0-7]*qinv; Alas SSE2 has no 32-bit low-half packed MUL, so use 32x32->64 -bit and discard high halves */\ 104 | __asm pmuludq xmm3,xmm5\ 105 | __asm pmuludq xmm1,xmm5\ 106 | __asm pmuludq xmm2,xmm5\ 107 | __asm pmuludq xmm0,xmm5\ 108 | /* Add carries 01/45, scatter carries 23/67 into slots of 01/45, add those...Since SSE2 compare result is ~()ed, add really means sub: */\ 109 | __asm psubd xmm2,xmm6 /* xmm6 = [----|tmp1|----|tmp0], don't care what's in ---- slots */\ 110 | __asm psubd xmm0,xmm7 /* xmm7 = [----|tmp5|----|tmp4], don't care what's in ---- slots */\ 111 | __asm pshufd xmm6,xmm6,0x31\ 112 | __asm pshufd xmm7,xmm7,0x31\ 113 | __asm psubd xmm3,xmm6 /* xmm3 = [----|tmp3|----|tmp2], don't care what's in ---- slots */\ 114 | __asm psubd xmm1,xmm7 /* xmm1 = [----|tmp7|----|tmp6], don't care what's in ---- slots */\ 115 | /* cy[0-7] = MULH32(tmp[0-7]*q) - high halves of above MULQs automatically get overwritten: */\ 116 | __asm pmuludq xmm2,xmm4\ 117 | __asm pmuludq xmm0,xmm4\ 118 | __asm pmuludq xmm3,xmm4\ 119 | __asm pmuludq xmm1,xmm4\ 120 | __asm psrlq xmm2, 32 /* d3210 = [ 0|cy1| 0|cy0] */\ 121 | __asm psrlq xmm0, 32 /* d3210 = [ 0|cy5| 0|cy4] */\ 122 | __asm psrlq xmm3, 32 /* d3210 = [ 0|cy3| 0|cy2] */\ 123 | __asm psrlq xmm1, 32 /* d3210 = [ 0|cy7| 0|cy6] */\ 124 | __asm pshufd xmm2,xmm2,0x58 /* [ 0| 0|cy1|cy0] */\ 125 | __asm pshufd xmm0,xmm0,0x58 /* [ 0| 0|cy5|cy4] */\ 126 | __asm pshufd xmm3,xmm3,0x85 /* [cy3|cy2| 0| 0] */\ 127 | __asm pshufd xmm1,xmm1,0x85 /* [cy7|cy6| 0| 0] */\ 128 | __asm paddd xmm2,xmm3 /* d3210 = [cy3|cy1|cy2|cy0] */\ 129 | __asm paddd xmm0,xmm1 /* d3210 = [cy7|cy5|cy6|cy4] */\ 130 | __asm pcmpgtd xmm7,xmm7 /* All 0s */\ 131 | __asm pcmpeqd xmm2,xmm7 /* retval[0-3] */\ 132 | __asm pcmpeqd xmm0,xmm7 /* retval[4-7] */\ 133 | __asm movmskps eax,xmm2 /* retval[0-3] */\ 134 | __asm movmskps ebx,xmm0 /* retval[4-7] */\ 135 | __asm shl ebx, 4 /* retval[4-7] << 4 */\ 136 | __asm add eax,ebx /* retval[0-7] */\ 137 | __asm mov retval, eax \ 138 | } 139 | 140 | #ifdef __cplusplus 141 | } 142 | #endif 143 | 144 | #endif /* f2psp_h_included */ 145 | 146 | -------------------------------------------------------------------------------- /src/fac_test_dat192.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef fac_test_dat192_included 27 | #define fac_test_dat192_included 28 | 29 | #include "types.h" 30 | 31 | struct testFac160{ 32 | uint32 p; 33 | uint64 d2; 34 | uint64 d1; 35 | uint64 d0; 36 | }; 37 | 38 | struct testFac192{ 39 | uint32 p; 40 | uint64 d2; 41 | uint64 d1; 42 | uint64 d0; 43 | }; 44 | 45 | /*******************************************/ 46 | /* Fermat-number test factors: */ 47 | /*******************************************/ 48 | 49 | // Here interpret the above testFac struct as a minimalist [n,k]-pair format, 50 | // where Fn = 2^2^n+1 is the Fermat number and q = k.2^(n+2)+1 the factor: 51 | // To check any particular (alleged) factor q of Fn using Pari, use Mod(2,q)^(2^n)+1. 52 | 53 | // Testcases with factors < 2^192: 54 | static const struct testFac192 ffac192[] = 55 | { 56 | { 86,0ull,0ull, 20018578522347ull}, // 2012 M. Dangler & Rodenkirch 57 | { 88,0ull,0ull, 119942751127ull}, // 2001 T. Nohara & Durman 58 | { 90,0ull,0ull, 198922467387ull}, // 2001 P. Grobstich & Durman 59 | { 91,0ull,0ull, 1421ull}, // 1977 D. E. Shippee 60 | { 93,0ull,0ull,2* 92341ull}, // 1979 R. Baillie 61 | { 94,0ull,0ull,2* 482524552001ull}, // 2001 P. Grobstich & Durman 62 | { 96,0ull,0ull,8* 3334131633063ull}, // 2008 M. Ptáček & Durman 63 | {107,0ull,0ull,4* 1289179925ull}, // 1992 G. B. Gostin 64 | {116,0ull,0ull,4* 3433149787ull}, // 1999 T. Taura 65 | {122,0ull,0ull, 5234775ull}, // 1986 G. B. Gostin 66 | {125,0ull,0ull, 5ull}, // 1956 R. M. Robinson 67 | {133,0ull,0ull, 88075576149ull}, // 2001 P. Samidoost & Durman 68 | {142,0ull,0ull,2* 8152599ull}, // 1986 G. B. Gostin 69 | {144,0ull,0ull,2* 17ull}, // 1956 R. M. Robinson 70 | {146,0ull,0ull, 37092477ull}, // 1987 G. B. Gostin 71 | {147,0ull,0ull, 3125ull}, // 1979 G. B. Gostin & P. B. McLaughlin 72 | {147,0ull,0ull, 124567335ull}, // 1990 G. B. Gostin 73 | {150,0ull,0ull,32* 1575ull}, // 1956 R. M. Robinson 74 | {150,0ull,0ull,4* 5439ull}, // 1980 G. B. Gostin & P. B. McLaughlin & H. Suyama 75 | {0,0ull,0ull,0ull} 76 | }; 77 | 78 | /*******************************************/ 79 | /* Mersenne-number test factors: */ 80 | /*******************************************/ 81 | 82 | /* Factors > 128 but <= 160 bits. If desired, we can construct more test factors 83 | by multiplying together a 64-bit factor q1 of M(p1) and a 96-bit factor q2 of M(p2) 84 | and checking whether q1*q2 divides M(p1*p2).*/ 85 | static const struct testFac160 fac160[] = 86 | { 87 | { 629, 133ull,11545660419510266595ull,15875370168207932041ull}, 88 | { 631, 1394ull,15571349859840161706ull, 509892144742137431ull}, 89 | { 673, 121320ull, 4492854135134704005ull,14226674137430228263ull}, 90 | { 695,2649519282ull,14842833464112563611ull,10174116463236461383ull}, 91 | { 731, 655903171ull,17652352551621896287ull, 7660429456444636239ull}, 92 | { 805,1083827012ull,18314245293386716597ull, 2219421057460140527ull}, 93 | { 877, 13161208ull,18225246095436784582ull,12343089078196252631ull}, 94 | { 957, 4730ull,14663183769241509326ull, 8097149896429635207ull}, 95 | { 967, 215159ull, 881920578744577810ull,17184239148975426263ull}, 96 | { 1017, 212724356ull, 9900144438119899815ull,17733134473107607967ull}, 97 | { 1033, 261ull, 5238930328752646394ull, 2803405107698253561ull}, 98 | { 1087, 1ull, 4415476118538293365ull,16346425147370540471ull}, 99 | { 1087, 70130ull,11905462972019801043ull, 6167785434693019223ull}, 100 | { 1131, 5800574ull,18429773635221665090ull,17951008765075981215ull}, 101 | { 1157, 22381525ull,14500669099417213747ull,15903397166638806257ull}, 102 | { 1283, 14ull, 3291757557782450881ull, 3893270457587058239ull}, 103 | { 1319, 1552ull, 1390029428449091172ull,14288981644299514807ull}, 104 | { 1483, 2674ull,14802171160149427175ull, 5085420234315110585ull}, 105 | { 6659, 664ull,14291576310931480037ull, 4949688733053552967ull}, 106 | { 8191, 617742ull, 6334326874596939334ull,11405337619840706193ull}, 107 | {18031451, 2122ull, 5198971222801411122ull,12425019173815339143ull}, /* Note: composite factor! */ 108 | {0,0ull,0ull,0ull} 109 | }; 110 | 111 | /* Factors > 160 but <= 192 bits. We can construct more test factors by multiplying 112 | together smaller factors of M(p) with multiple factors, or for exponents p1, p2, p3, ... 113 | and corresponding factors q1, q2, q3, ... , checking whether q1*q2*q3*... 114 | divides M(p1*p2*p3*...). */ 115 | static const struct testFac192 fac192[] = 116 | { 117 | { 677, 157590042578912ull,10558642444782195772ull, 329809049266961143ull}, 118 | { 773, 9118322195022ull, 1933308633079010416ull,17814616685598394119ull}, 119 | { 971, 70286054459973ull,17012949627558354271ull, 3547755741880899889ull}, 120 | { 997, 492416983078691417ull, 8040689323464953445ull,16007877010440112335ull}, 121 | { 1001, 59364131986ull, 9565712986615012496ull,10050950882119470361ull}, 122 | {0,0ull,0ull,0ull} 123 | }; 124 | 125 | #endif /* #ifndef fac_test_dat192_included */ 126 | -------------------------------------------------------------------------------- /src/fac_test_dat256.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef fac_test_dat256_included 27 | #define fac_test_dat256_included 28 | 29 | #include "types.h" 30 | 31 | struct testFac256{ 32 | char p[80]; 33 | char q[80]; 34 | }; 35 | 36 | /*******************************************/ 37 | /* Fermat-number test factors: */ 38 | /*******************************************/ 39 | 40 | struct testFermFac{ 41 | uint32 n; 42 | uint64 k; 43 | }; 44 | 45 | // Here interpret the above testFac struct as a minimalist [n,k]-pair format, 46 | // where Fn = 2^2^n+1 is the Fermat number and q = k.2^(n+2)+1 the factor: 47 | // To check any particular (alleged) factor q of Fn using Pari, use Mod(2,q)^(2^n)+1. 48 | 49 | // Testcases with factors < 2^256: 50 | static const struct testFermFac ffac256[] = 51 | { 52 | {164,2* 1835601567ull}, // 1993 G. B. Gostin 53 | {166,8* 2674670937447ull}, // 2012 R. Maznichenko & Rodenkirch 54 | {172, 20569603303ull}, // 2001 L. N. Durman 55 | {178, 313047661ull}, // 1991 G. B. Gostin 56 | {184,2* 117012935ull}, // 1990 G. B. Gostin 57 | {195, 48595346636925ull}, // 2014 S. Batalov & Woltman 58 | {201,2* 4845ull}, // 1980 G. B. Gostin & P. B. McLaughlin 59 | {205, 232905ull}, // 1984 W. Keller 60 | {207, 3ull}, // 1956 R. M. Robinson 61 | {215, 32111ull}, // 1980 H. Suyama 62 | {226,2* 15ull}, // 1956 R. M. Robinson 63 | {228,2* 29ull}, // 1956 R. M. Robinson 64 | {0,0ull} 65 | }; 66 | 67 | /*******************************************/ 68 | /* Mersenne-number test factors: */ 69 | /*******************************************/ 70 | 71 | /* 256-bit Factors are easier to give in character-string form: */ 72 | /* EWM: These are from my April 2006 shakedown runs of the P4WORD functionality - 73 | ran ??? 64-65-digit test exponents up to k = 10^10; ??? had factors below this bound, 74 | compared to ??? predicted by theory (Dickman's function). 75 | */ 76 | static const struct testFac256 fac256[] = 77 | { 78 | {"1000000000000000000000000000000000000000000000000000000001059" ,"40000000000000000000000000000000000000000000000000000000042361" }, /* k = 20 */ 79 | {"12160287649628674460477464915995054973742562690104903778198683593" ,"543592246870442485937175551111623340804481341938942752102988291735322287319" }, /* k = 22351126163 */ 80 | {"20992192221842725502542568876717904946016534668049886272327917860857843" ,"41984384443685451005085137753435809892033069336099772544655835721715687" }, /* k = 1 */ 81 | {"24247014121478057345510500801908699603302763478708108175450119307" ,"2079083331892761004876676951418337621569030224230467189523407626117207889809" }, /* k = 42872976472 */ 82 | {"3082533446850352619311881710100031378387528865875332083814206171" ,"6165066893700705238623763420200062756775057731750664167628412343" }, /* k = 1 */ 83 | {"32046927906821207388377814233562823608963208068222468012248261177" ,"192281567440927244330266885401376941653779248409334808073489567063" }, /* k = 3 */ 84 | {"32046927906821207388377814233562823608963208068222468012248261177" ,"7261513394406617382132528927183000201554973316178529026895333500096431" }, /* k = 113295 */ 85 | {"3444030707469211201913020330380197621101100449293215160842444859637669" ,"53389364027187712052055641161553823522309259164943421423379580214103144839" }, /* k = 7751 */ 86 | {"3600113305305488204665213841469519415116094330572703657595919530921861" ,"1605650534166247739280685373295405659141778071435425831287780110791150007" }, /* k = 223 */ 87 | {"3852254995466672782398645659611635488623057745649803559363456817432411" ,"22788707831582286845380020155651359827337650244785629920055214225748565104481" }, /* k = 2957840 */ 88 | {"3873455283316355076479185358932261854896321329330898570642046752590709" ,"2921297999392661936999377930740968974773127205440094407601101388055871276457" }, /* k = 377092 */ 89 | {"4088350865739177150968288747826569959957449066175834413752239709" ,"532990125664685046817433867476654272539732719859211180852051986382913" }, /* k = 65184 */ 90 | {"41927056387293174872332083760112302991136793862708943879936201629" ,"586978789422104448212649172641572241875915114077925214319106822807" }, /* k = 7 */ 91 | {"53710507922796892589235420199561121290219608640344181598136297747713099" ,"107421015845593785178470840399122242580439217280688363196272595495426199" }, /* k = 1 */ 92 | {"54973742562690104903778198683593814657412680492564879855614537234786733" ,"769632395877661468652894781570313405203777526895908317978603521287014263" }, /* k = 7 */ 93 | {"5509792592309907965473761255176567513575178296664547791745011299" ,"742776869444172678136618913571387191947269048779332840473151581151737887" }, /* k = 67405157 */ 94 | {"570658748822569815793678976697422057505968344086973502014102067" ,"322107495328491256282531776450837995333351643082236449882652963072723913" }, /* k = 282224268 */ 95 | {"62735676303544776280350450777235547105859548702790814356240145171" ,"19573531006705970199469340642497490697028179195270734079146925293353" }, /* k = 156 */ 96 | {"62749567351885752724891227938183011949129833673362440656643086021" ,"6902452408707432799738035073200131314404281704069868472230739462311" }, /* k = 55 */ 97 | {"6402474964732639141992726042699227967823547816360093417216412199" ,"59547637466852043611708058111909725657028150812842162510646420336832110759" }, /* k = 4650360821 */ 98 | {"7195429162991930645537799140373404328752628889639958794757291746426357" ,"957164768977838582192020192849031737427989705415465878713793977276619713569" }, /* k = 66512 */ 99 | {"7095890455635792122103334669749923563025494780249011419521238281" ,"93538027986191011753566157616643492407802072193242468532128963020143" }, /* k = 6591 */ 100 | {"83011949129833673362440656643086021394946395224737190702179860943" ,"2473258012374264464160556924024104921441032899325819859780746776935743" }, /* k = 14897 */ 101 | {"85102283345085048608250393021332197155184306354550076682829493041" ,"23658434769933643513093609259930350809141237166564921317826599065399" }, /* k = 139 */ 102 | {"9104140792886215078424516709087000699282120660418371806535567252532567" ,"207009953348646758453216660931220221900276859576592938137005728188085508447" }, /* k = 11369 */ 103 | {"9729971208443357326548938239119325974636673058360414281388303203" ,"215547993800818444194894669517580993600170370441819907413109392526239783" }, /* k = 11076497 */ 104 | {"" ,"" }, /* k = */ 105 | {"" ,"" }, /* k = */ 106 | {"" ,"" }, /* k = */ 107 | {"" ,"" }, /* k = */ 108 | {"" ,"" }, /* k = */ 109 | {"" ,"" }, /* k = */ 110 | {"" ,"" }, /* k = */ 111 | {"",""} 112 | }; 113 | 114 | #endif /* #ifndef fac_test_dat256_included */ 115 | -------------------------------------------------------------------------------- /src/fgt_m61.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef fgt_m61_h_included 27 | #define fgt_m61_h_included 28 | 29 | #include "util.h" 30 | 31 | // Our modulus q = 2^61 - 1: 32 | 33 | /***************/ 34 | // NB: Since args to these reduce macros will more often than not be expressions (e.g. qreduce(x - y + q4)), 35 | // start each by copying arg into a local uint64, to ensure that any input expression only gets evaluated once: 36 | /* 37 | Returns x (mod q), but in the sense of a possible partial modular reduction: Outputs are in [0, B], where B = q+7. 38 | Note: if x = q, QREDUCE returns q, not zero. 39 | */ 40 | #define qreduce(x) \ 41 | ({ uint64 tmp = x; \ 42 | tmp = (tmp >> 61) + (tmp & 0x1FFFFFFFFFFFFFFFull); \ 43 | tmp; }) 44 | 45 | // ...or this if you want to finish reducing a qreduce() output: 46 | #define qreduce_finish(x) \ 47 | ({ uint64 tmp = x; \ 48 | tmp -= (-(uint64)(tmp >= 0x1FFFFFFFFFFFFFFFull)) & 0x1FFFFFFFFFFFFFFFull; \ 49 | tmp; }) 50 | 51 | // Use this if you require a guaranteed-full reduction of x (mod q)... 52 | #define qreduce_full(x) \ 53 | ({ uint64 tmp = x; \ 54 | tmp = (tmp >> 61) + (tmp & 0x1FFFFFFFFFFFFFFFull); \ 55 | tmp -= (-(uint64)(tmp >= 0x1FFFFFFFFFFFFFFFull)) & 0x1FFFFFFFFFFFFFFFull; \ 56 | tmp; }) 57 | 58 | /***************/ 59 | 60 | /* 61 | Returns sqrt(1/2)*x (mod q). 62 | sqrt(1/2) == 2^30 mod q, so the multiply can be effected via 2 shifts, an AND, and an add. 63 | For normalized inputs (< q), Output is in [0, B30], where B30 = q + 7*2^30 = 2^61 + 2^33 - 2^30 - 1. 64 | */ 65 | #define mul_i2(x) (((x) << 30) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> 31) 66 | 67 | /***************/ 68 | 69 | /* 70 | Returns sqrt(2)*x (mod q). 71 | sqrt(2) == 2^31 mod q, so the multiply can be effected via 2 shifts, an AND, and an add. 72 | Outputs are in [0, B31], where B31 = q + 7*2^31 = 2^61 + 2^34 - 2^31 - 1. 73 | */ 74 | #define mul_s2(x) (((x) << 31) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> 30) 75 | 76 | /***************/ 77 | 78 | /* 79 | Returns 2^n * x (mod q). x is a uint64; The shift count n is assumed to be any kind of int, with value in [0,61]. 80 | 81 | If x only partially normalized (i.e. in [0, b]) on entry and n = 0, result is fully normalized, i.e. xout in [0,q]. 82 | If x unnormalized on entry and n = 0, the result is partially normalized, i.e. xout in [0,b]. 83 | The special case n = 61 leaves x unchanged. 84 | 85 | For general operands x in [0,2^64-1] and n in [0,60], ((x << n) & q) is in [0, q - (2^n - 1)] = [0, 2^61 - 2^n] 86 | and (x >> (61-n)) is in [0, 2^(3+n) - 1]. The sum is bounded above by 2^61 - 2^n + 2^(3+n) - 1 = q + 2^(3+n) - 2^n. 87 | 88 | OK, let`s do some crude estimation for non-normalized inputs: 89 | 90 | The sum is maximized for x = 2^64-1 and n = 60, giving 2^63 - 1 + 2^60 = 9*2^60 - 1 ~= 4.5*q, 91 | i.e. inputs approximately in [0,8q] yield outputs approximately in [0,5q]. 92 | For x = 2^64-1 and n = 59, the sum is bounded by ~2.75*q, etc., approaching q+7 from above. 93 | 94 | x = 2^63-1 and n = 60 gives q + 2^62 - 2^60 ~= 2.5*q . 95 | x = 2^63-1 and n = 59 gives q + 2^61 - 2^59 ~= 1.75*q . This case is important in the between-forward-and-inverse-FFT 96 | pair_square step, where we multiply inputs in [0,4q] by the modular inverse of 4 == 2^59. 97 | x = 2^62-1 and n = 60 gives 2^61 - 1 + 2^60 = 3*2^60 - 1 ~= 1.5*q, i.e. inputs approximately in [0,2q] 98 | yield outputs approximately in [0,2q]. 99 | 100 | NEGATIVE POWERS OF 2: 101 | 102 | The modular analog of 1/2 (call it w) satisfies 2*w == 1 (mod q), thus w = (q+1)/2 = 2^60. More generally, 103 | any negative-integer power of 2 (mod q) satisfies 2^(-p) == 2^(61-p), with p < 61. We obtain the same 104 | result by simply analogizing the mul_pow2_modq macro to negative powers, and thus can effect multiply 105 | by 2^(-p) by simply calling the mul_pow2_modq macro with power-of-2 argument (61-p). 106 | 107 | Thus e.g. to effect a modular x*(1/2) we call mul_pow2_modq(x,60). 108 | */ 109 | #define mul_pow2_modq(x,n) (((x) << n) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> (61-n)) 110 | 111 | /****** Prototypes for functions defined in fgt_m61.c are collected in util.h *******/ 112 | 113 | #endif /* fgt_m61_h_included */ 114 | -------------------------------------------------------------------------------- /src/gcd_lehmer.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /******************************************************************************* 24 | We now include this header file if it was not included before. 25 | *******************************************************************************/ 26 | #ifndef gcd_lehmer_h_included 27 | #define gcd_lehmer_h_included 28 | 29 | #include "Mlucas.h" 30 | #include "genFFT_mul.h" 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | 36 | /******************************************************************************* 37 | Function prototypes. The corresponding function definitions will either 38 | be in a {function name}.c file or (for cases where a .c file contains 39 | multiple function definitions) in the given .c file: 40 | *******************************************************************************/ 41 | 42 | /* gcd_lehmer.c: */ 43 | uint32 mi64_gcd( 44 | uint64 u[], uint64 v[], uint32 const ndim, 45 | const uint32 EGCD, uint64 Ap[], uint64 Bp[], uint32 *len_AB, uint32 *sign_AB, 46 | const uint32 HALF, uint64 Cp[], uint64 Dp[], uint32 *len_CD, uint32 *sign_CD, const uint32 len_targ); 47 | 48 | uint32 matrix_vector_product_sub(uint64c abmul[], uint64c cdmul[], uint64 *uv_ptr[], uint32 len); 49 | uint32 matrix_vector_product_add(uint64c abmul[], uint64c cdmul[], uint64 *uv_ptr[], uint32 len); 50 | 51 | int CMP_LT_PROD192 (uint64 a, uint64 xlo, uint64 xhi, uint64 b, uint64 ylo, uint64 yhi); 52 | int pprime192 (uint192 p, uint64 z); 53 | uint192 bitwise_mod192 (uint192 x, uint192 y); 54 | /* 55 | void mv_dwtvarbase_to_int64 (x,p,m,u,ndim); 56 | */ 57 | void gcd_init(); 58 | int test_gcd(); 59 | 60 | #ifdef __cplusplus 61 | } 62 | #endif 63 | 64 | #endif /* gcd_lehmer_h_included */ 65 | 66 | -------------------------------------------------------------------------------- /src/genFFT_mul.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /******************************************************************************* 24 | We now include this header file if it was not included before. 25 | *******************************************************************************/ 26 | #ifndef gen_fft_h_included 27 | #define gen_fft_h_included 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | /* Enumeration constant of the various supported values for the MODE argument to genFFT_mul(). 34 | As the use of an enum implies, these modes are mutually exclusive: 35 | 36 | Mode Description 37 | ---------------- ------------------------- 38 | INIT_ARRAYS Init FFT-related bit-reversal-index and roots-of-unity data, using input x-array for scratch storage 39 | 40 | The rest assume the function has been previously called in INIT_ARRAYS mode for the FFT length in question: 41 | 42 | FORWARD_FFT_ONLY The fFFT of the input X-array is computed and stored in-place 43 | AUTO_SQUARE The fFFT of the input X-array is computed, followed by a wrapper/dyadic-square step and an iFFT, all in-place. 44 | MUL_PRECOMPUTED The X-array is assumed to contain an untransformed input vector, and the Y-array to contain a data vector which was previously-transformed by calling this routine in FORWARD_FFT_ONLY mode. The fFFT of the input X-array is computed, followed by a wrapper/dyadic-mul-with-Y-transform step and an iFFT. The result is returned in X; Y is unaffected. (I.e. this is designed for the common case where we have a constant vector which will be used to multiply many sets of inouts). 45 | 46 | */ 47 | enum mode {INIT_ARRAYS, FORWARD_FFT_ONLY, AUTO_SQUARE, MUL_PRECOMPUTED}; 48 | 49 | /* genFFT_mul.c: */ 50 | void genFFT_mul(double x[], double y[], int n, int INIT_ARRAYS, int MODE); 51 | void genFFT_mul_process_chunk(double a[], double ab_mul[], double cd_mul[], int n, struct complex rt0[], struct complex rt1[], int index[], int ii, int nradices_prim, int radix_prim[], int MODE); 52 | 53 | /* Nov 2015 - moved updated versions of these to Mlucas.h: 54 | void pairFFT_mul(double x[], double y[], int n, int INIT_ARRAYS, int FORWARD_FFT_ONLY); 55 | void pairFFT_mul_process_chunk(double a[], double ab_mul[], double cd_mul[], int n, struct complex rt0[], struct complex rt1[], int index[], int ii, int nradices_prim, int radix_prim[], int FORWARD_FFT_ONLY, int skip_square); 56 | void radix16_pairFFT_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int INIT_ARRAYS, int FORWARD_FFT_ONLY, int skip_square); 57 | */ 58 | 59 | /* The complex/rel wrapper and dyadic-mul step, combined with the final-fFFt/initial-iFFT radix pass: */ 60 | void radix16_genFFT_wrapper_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int MODE); 61 | void radix32_genFFT_wrapper_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int MODE); 62 | 63 | #ifdef __cplusplus 64 | } 65 | #endif 66 | 67 | #endif /* gen_fft_h_included */ 68 | 69 | -------------------------------------------------------------------------------- /src/getRealTime.c: -------------------------------------------------------------------------------- 1 | // EWM: June 2014 - Code from http://nadeausoftware.com/articles/2012/04/c_c_tip_how_measure_elapsed_real_time_benchmarking 2 | // for high-precision elapsed real time; thanks to Stephen Searle for finding this. 3 | // Prototype for getRealTime() is in util.h . 4 | 5 | /* 6 | * Author: David Robert Nadeau 7 | * Site: http://NadeauSoftware.com/ 8 | * License: Creative Commons Attribution 3.0 Unported License 9 | * http://creativecommons.org/licenses/by/3.0/deed.en_US 10 | */ 11 | 12 | #if defined(_WIN32) 13 | 14 | #include 15 | 16 | #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) 17 | 18 | #include /* POSIX flags */ 19 | #include /* clock_gettime(), time() */ 20 | #include /* gethrtime(), gettimeofday() */ 21 | 22 | #if defined(__MACH__) && defined(__APPLE__) 23 | 24 | #include 25 | #include 26 | 27 | #endif 28 | 29 | #else 30 | // EWM: Instead of error-on-compile we default to the util.c:get_time_str() function if any misconfig detected: 31 | #error "Unable to define getRealTime( ) for an unknown OS." 32 | #define GRT_MISCONFIG 33 | #endif 34 | 35 | /** 36 | * Returns the real time, in seconds, or -1.0 if an error occurred. 37 | * 38 | * Time is measured since an arbitrary and OS-dependent start time. 39 | * The returned real time is only useful for computing an elapsed time 40 | * between two calls to this function. 41 | */ 42 | double getRealTime( ) 43 | { 44 | #ifdef GRT_MISCONFIG 45 | return -1.0; // EWM: See above note 46 | #elif defined(_WIN32) 47 | FILETIME tm; 48 | ULONGLONG t; 49 | #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8 50 | /* Windows 8, Windows Server 2012 and later. ---------------- */ 51 | GetSystemTimePreciseAsFileTime( &tm ); 52 | #else 53 | /* Windows 2000 and later. ---------------------------------- */ 54 | GetSystemTimeAsFileTime( &tm ); 55 | #endif 56 | t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime; 57 | return (double)t / 10000000.0; 58 | 59 | #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__))) 60 | /* HP-UX, Solaris. ------------------------------------------ */ 61 | return (double)gethrtime( ) / 1000000000.0; 62 | 63 | #elif defined(__MACH__) && defined(__APPLE__) 64 | /* OSX. ----------------------------------------------------- */ 65 | static double timeConvert = 0.0; 66 | if ( timeConvert == 0.0 ) 67 | { 68 | mach_timebase_info_data_t timeBase; 69 | (void)mach_timebase_info( &timeBase ); 70 | timeConvert = (double)timeBase.numer / 71 | (double)timeBase.denom / 72 | 1000000000.0; 73 | } 74 | return (double)mach_absolute_time( ) * timeConvert; 75 | 76 | #elif defined(_POSIX_VERSION) 77 | /* POSIX. --------------------------------------------------- */ 78 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 79 | { 80 | struct timespec ts; 81 | #if defined(CLOCK_MONOTONIC_PRECISE) 82 | /* BSD. --------------------------------------------- */ 83 | const clockid_t id = CLOCK_MONOTONIC_PRECISE; 84 | #elif defined(CLOCK_MONOTONIC_RAW) 85 | /* Linux. ------------------------------------------- */ 86 | const clockid_t id = CLOCK_MONOTONIC_RAW; 87 | #elif defined(CLOCK_HIGHRES) 88 | /* Solaris. ----------------------------------------- */ 89 | const clockid_t id = CLOCK_HIGHRES; 90 | #elif defined(CLOCK_MONOTONIC) 91 | /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */ 92 | const clockid_t id = CLOCK_MONOTONIC; 93 | #elif defined(CLOCK_REALTIME) 94 | /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ 95 | const clockid_t id = CLOCK_REALTIME; 96 | #else 97 | const clockid_t id = (clockid_t)-1; /* Unknown. */ 98 | #endif /* CLOCK_* */ 99 | if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 ) 100 | return (double)ts.tv_sec + 101 | (double)ts.tv_nsec / 1000000000.0; 102 | /* Fall thru. */ 103 | } 104 | #endif /* _POSIX_TIMERS */ 105 | 106 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ 107 | struct timeval tm; 108 | gettimeofday( &tm, NULL ); 109 | return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; 110 | #else 111 | return -1.0; /* Failed. */ 112 | #endif 113 | } 114 | 115 | -------------------------------------------------------------------------------- /src/get_fp_rnd_const.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "util.h" 24 | 25 | /* Set the value of the round constant used for fast NINT emulation: */ 26 | void get_fp_rnd_const(double*RND_A, double*RND_B) 27 | { 28 | #if(FP_MANTISSA_BITS_DOUBLE == 64) /* X86 64-mantissa-bit register doubles: */ 29 | *RND_A = 3.0*0x4000000*0x2000000*0x800; 30 | *RND_B =12.0*0x2000000*0x1000000*0x800; 31 | fprintf(stderr,"INFO: using 64-bit-significand form of floating-double rounding constant for scalar-mode DNINT emulation.\n"); 32 | #else /* These assume IEEE64-compliant double-precision hardware arithmetic: */ 33 | *RND_A = 3.0*0x4000000*0x2000000; 34 | *RND_B =12.0*0x2000000*0x1000000; 35 | fprintf(stderr,"INFO: using 53-bit-significand form of floating-double rounding constant for scalar-mode DNINT emulation. \n"); 36 | #endif 37 | } 38 | 39 | -------------------------------------------------------------------------------- /src/gpu_iface.cu: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2012 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | // Thanks to Jason Papadopoulos for the original version of the GPU interface ... this is now so 24 | // heavily modified by me that any resemblance to the original in the nontrivial details should be 25 | // considered coincidental, and any faults strictly mine. 26 | 27 | #include "gpu_iface.h" 28 | 29 | #ifdef __CUDACC__ 30 | #warning using nvcc 31 | #ifdef __CUDA_ARCH__ 32 | #warning device code trajectory 33 | #if __CUDA_ARCH__ > 120 34 | #warning compiling with double precision 35 | #else 36 | #warning compiling with single precision 37 | #endif 38 | #else 39 | #warning nvcc host code trajectory 40 | #endif 41 | #else 42 | #warning non-nvcc code trajectory 43 | #endif 44 | 45 | #ifndef OS_BITS 46 | #error Bitness not defined! 47 | #elif OS_BITS == 32 48 | #warning compiling in 32-bit mode 49 | #elif OS_BITS == 64 50 | #warning compiling in 64-bit mode 51 | #else 52 | #error Bitness defined but not supported! 53 | #endif 54 | 55 | // 50 Ways to say "Houston, we have a problem": 56 | char * 57 | cuGetErrorMessage(CUresult result) 58 | { 59 | switch (result) { 60 | case CUDA_SUCCESS: return "CUDA_SUCCESS"; 61 | case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE"; 62 | case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY"; 63 | case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED"; 64 | case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED"; 65 | case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE"; 66 | case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE"; 67 | case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE"; 68 | case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT"; 69 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; 70 | case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED"; 71 | case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED"; 72 | case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED"; 73 | case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED"; 74 | case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU"; 75 | case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED"; 76 | case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED"; 77 | case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE"; 78 | case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND"; 79 | case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE"; 80 | case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND"; 81 | case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY"; 82 | case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED"; 83 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; 84 | case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT"; 85 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; 86 | case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN"; 87 | default: return "CUDA: unexpected error"; 88 | } 89 | } 90 | 91 | // Read information on all available GPUs into input arg: 92 | void 93 | gpu_init(gpu_config_t *gpu_config) 94 | { 95 | int32 device, nskip = 0; 96 | memset(gpu_config, 0, sizeof(gpu_config_t)); 97 | 98 | // CUDA_TRY(cudaGetDeviceCount(&gpu_config->num_gpu)) *** error: a value of type "cudaError_t" cannot be used to initialize an entity of type "CUresult" 99 | cudaGetDeviceCount(&gpu_config->num_gpu); 100 | for (device = 0; device < (int32)gpu_config->num_gpu; device++) 101 | { 102 | // Get pointer to info for [device]th GPU having the minimum required capability: 103 | gpu_info_t *info = gpu_config->gpu_info + device - nskip; 104 | // CUDA_TRY(cudaGetDeviceProperties(info, device)) *** error: a value of type "cudaError_t" cannot be used to initialize an entity of type "CUresult" 105 | cudaGetDeviceProperties(info, device); 106 | if(info->major < 2) { 107 | printf("GPU #%d compute capability %d.%d is less than min-supported 2.x ... ignoring this device.\n",device,info->major,info->minor); 108 | ++nskip; 109 | } 110 | // Note: Devices with cc = 2.x have (32 + 16*x) shader cores per multiprocessor (At least for x = 0 and 1 ... may need table for this 111 | } 112 | gpu_config->num_gpu -= nskip; 113 | return; 114 | } 115 | 116 | #ifdef GPU_IFACE_STANDALONE 117 | int main(int argc, char *argv[]) 118 | { 119 | gpu_config_t gpu_config; 120 | gpu_info_t ginfo; 121 | int32 igpu; 122 | 123 | gpu_init(&gpu_config); 124 | if (gpu_config.num_gpu > 0) { 125 | printf("Detected %u CUDA-enabled GPU devices.\n", gpu_config.num_gpu); 126 | for(igpu = 0; igpu < gpu_config.num_gpu; ++igpu) { 127 | ginfo = gpu_config.gpu_info[igpu]; 128 | printf("GPU #%u: %s v%u.%u\n", igpu, ginfo.name, ginfo.major, ginfo.minor); 129 | printf("clock_speed = %u MHz\n", ginfo.clockRate/1000); 130 | printf("num_compute_units = %u\n", ginfo.multiProcessorCount); 131 | printf("constant_mem_size = %u\n", ginfo.totalConstMem); 132 | printf("shared_mem_size = %u\n", ginfo.sharedMemPerBlock); 133 | printf("global_mem_size = %u\n", ginfo.totalGlobalMem); 134 | printf("registers_per_block = %u\n", ginfo.regsPerBlock); 135 | printf("max_threads_per_block = %u\n", ginfo.maxThreadsPerBlock); 136 | printf("can_overlap = %u\n", ginfo.deviceOverlap); 137 | printf("concurrent_kernels = %u\n", ginfo.concurrentKernels); 138 | printf("warp_size = %u\n", ginfo.warpSize); 139 | printf("max_thread_dim[3] = [%u,%u,%u]\n", ginfo.maxThreadsDim[0], ginfo.maxThreadsDim[1], ginfo.maxThreadsDim[2]); 140 | printf("max_grid_size[3] = [%u,%u,%u]\n", ginfo.maxGridSize[0], ginfo.maxGridSize[1], ginfo.maxGridSize[2]); 141 | } 142 | exit(0); 143 | } else { 144 | printf("ERROR: No CUDA-enabled GPUs found\n"); 145 | exit(-1); 146 | } 147 | } 148 | #endif 149 | 150 | -------------------------------------------------------------------------------- /src/gpu_iface.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | // Thanks to Jason Papadopoulos for the original version of the GPU interface ... this is now so 24 | // heavily modified by me that any resemblance to the original in the nontrivial details should be 25 | // considered coincidental, and any faults strictly mine. 26 | 27 | #ifndef gpu_iface_h_included 28 | #define gpu_iface_h_included 29 | 30 | #ifndef GPU_IFACE_STANDALONE 31 | // Non-standalone build assumes the non-main functions in this file will serve as GPU diagnostics 32 | // for an Mlucas or Mfactor build, so require same compile flag as for the other sources in such a build: 33 | #ifndef USE_GPU 34 | #error Compilation of any source file using a gpu-specific header requires the user-defined preprocessor flag USE_GPU 35 | #endif 36 | 37 | #include "masterdefs.h" 38 | #include "types.h" 39 | #else 40 | #include 41 | typedef int int32; 42 | #endif 43 | 44 | #include 45 | #include 46 | #include 47 | 48 | #ifdef __cplusplus 49 | extern "C" { 50 | #endif 51 | 52 | #define MAX_GPU 16 53 | 54 | typedef struct cudaDeviceProp gpu_info_t; 55 | /* 56 | cudaDeviceProp struct members: 57 | 58 | int canMapHostMemory Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer. 59 | int clockRate Clock frequency in kilohertz. 60 | int computeMode Compute mode (See cudaComputeMode). 61 | int deviceOverlap Device can concurrently copy memory and execute a kernel. 62 | int integrated Device is integrated as opposed to discrete. 63 | int kernelExecTimeoutEnabled Specified whether there is a run time limit on kernels. 64 | int major Major compute capability. 65 | int minor Minor compute capability. 66 | int maxGridSize [3] Maximum size of each dimension of a grid. 67 | int maxThreadsDim [3] Maximum size of each dimension of a block. 68 | int maxThreadsPerBlock Maximum number of threads per block. 69 | size_t memPitch Maximum pitch in bytes allowed by memory copies. 70 | int multiProcessorCount Number of multiprocessors on device. 71 | char name [256] ASCII string identifying device. 72 | int regsPerBlock 32-bit registers available per block 73 | size_t sharedMemPerBlock Shared memory available per block in bytes. 74 | size_t textureAlignment Alignment requirement for textures. 75 | size_t totalConstMem Constant memory available on device in bytes. 76 | size_t totalGlobalMem Global memory available on device in bytes. 77 | int warpSize Warp size in threads. 78 | */ 79 | 80 | typedef struct { 81 | int32 num_gpu; 82 | gpu_info_t gpu_info[MAX_GPU]; 83 | } gpu_config_t; 84 | 85 | char * cuGetErrorMessage(CUresult result); 86 | 87 | void gpu_init(gpu_config_t *config); 88 | 89 | #define CUDA_TRY(func) \ 90 | { \ 91 | CUresult status = func; \ 92 | if (status != CUDA_SUCCESS) { \ 93 | printf("error (line %d): %s\n", __LINE__,\ 94 | cuGetErrorMessage(status)); \ 95 | exit(-1); \ 96 | } \ 97 | } 98 | 99 | #define CUDA_ALIGN_PARAM(offset, pow2align) \ 100 | (offset) = ((offset) + (pow2align) - 1) & ~((pow2align) - 1) 101 | 102 | #ifdef __cplusplus 103 | } 104 | #endif 105 | 106 | #endif /* !gpu_iface_h_included_ */ 107 | 108 | -------------------------------------------------------------------------------- /src/imul_macro.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef imul_macro_h_included 27 | #define imul_macro_h_included 28 | 29 | #include "imul_macro0.h" 30 | #include "imul_macro1.h" 31 | 32 | #endif /* imul_macro_h_included */ 33 | 34 | -------------------------------------------------------------------------------- /src/masterdefs.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef masterdefs_h_included 27 | #define masterdefs_h_included 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include // Nov 2021: Add to provide POSIX case-insensitive string compare string compare strcasecmp() and strncasecmp(); 41 | // cf. https://stackoverflow.com/questions/5820810/case-insensitive-string-comparison-in-c 42 | #include 43 | 44 | #ifdef macintosh 45 | #include /* Macintosh CW */ 46 | #endif 47 | 48 | #undef EWM_DEBUG 49 | #define EWM_DEBUG 0 /* Set = 1 to turn on various debugging diagnostics, especially DBG_ASSERT, defined in util.c . */ 50 | 51 | /* cf. util.h|c : If debug enabled, alias DBG_ASSERT to ASSERT (a function defined 52 | in util.c), otherwise alias the entire 4-argument DBG_ASSERT invocation to "Bolivian" 53 | (to paraphrase ex-heavyweight boxing champ Mike Tyson.) */ 54 | #if EWM_DEBUG 55 | #define DBG_ASSERT ASSERT 56 | #define DBG_WARN WARN 57 | #define DBG_INFO INFO 58 | #else /* Bolivian - lump both the FILE and LINE args together as a single __here, that's why it looks like these take 1 less arg than the underlying functions: */ 59 | #define DBG_ASSERT(__arg1, __arg2) /* */ 60 | #define DBG_WARN(__here, __arg2, __arg3, __arg4) /* */ 61 | #define DBG_INFO(__here, __arg2, __arg3, __arg4) /* */ 62 | #endif 63 | 64 | /******************************************************************************* 65 | Mlucas-specific master #defines: 66 | *******************************************************************************/ 67 | 68 | /* Set = 1 to do a simple FFT/IFFT-returns-original-inputs test 69 | (sans weighting and dyadic squaring) using pseudorandom inputs: 70 | */ 71 | #undef FFT_DEBUG 72 | #define FFT_DEBUG 0 73 | 74 | #undef NOBRANCH 75 | #define NOBRANCH 1 /* Switch between branched and branchless versions of various key sequences. */ 76 | 77 | #ifndef LO_ADD 78 | #define LO_ADD 1 /* TRUE = use algorithm with more mul and fewer add */ 79 | #endif 80 | 81 | #undef N_LEADING_RADICES 82 | #define N_LEADING_RADICES 8 /* # of intervals we split adjacent power-of-2 transform lengths into */ 83 | 84 | #endif /* masterdefs_h_included */ 85 | -------------------------------------------------------------------------------- /src/pair_square.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "Mlucas.h" 24 | 25 | /***************/ 26 | 27 | /* 28 | Macro versions of these are in pair_square.h, since radix32_wrapper_square.c also needs to inline those; 29 | SSE2 macros for this are in sse2_macro_gcc64.h. 30 | */ 31 | void pair_square(double *x1, double *y1, double *x2, double *y2, double c, double s) 32 | { 33 | /* 34 | ! Given complex scalars H[j] = (x1,y1) and H[N-j] = (x2,y2) along with complex exponential E = (c,s), 35 | ! calculates I[j] = H[j]^2 + {1 + exp(2*pi*I*j/N)}*{H[j]-H~[N-j]}^2/4 and its complex conjugate I~, 36 | ! returns the former in H[j] and the latter in H[N-j]. 37 | */ 38 | // Use that (H[j] - H~[N-j])^2 = H(j)^2 - 2*H(j)*H~(N-j) + H~(N-j)^2 to efficiently compute both (H[j]-H~[N-j])^2 and H[j]^2: 39 | #if 0 40 | double rt0,rt1,rt2,rt3,it1,it2,it3; 41 | // H[j] = (r1,i1); H[N-j] = (r2,i2): 42 | rt1 = *x1; it1 = *y1; rt2 = *x2; it2 = *y2; // H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)] 43 | // = [(r1^2-i1^2) + (r2^2-i2^2) - 2.(r1.r2+i1.i2)] + 2.I.[(r1.i1-r2.i2) - (i1.r2-r1.i2)] 44 | // Calculate cross product terms: 45 | rt3 = rt1*rt2 + it1*it2; rt3 = rt3 + rt3; // 2.(r1.r2 + i1.i2) 46 | it3 = it1*rt2 - rt1*it2; it3 = it3 + it3; // 2.(i1.r2 - r1.i2) 47 | // Now calculate square terms and store back in the same temporaries: 48 | rt0 = (rt1 + it1)*(rt1 - it1); it1 = rt1*it1; it1 = it1 + it1; rt1 = rt0; // rt1,it1 = (r1^2-i1^2); 2.r1.i1 49 | rt0 = (rt2 + it2)*(rt2 - it2); it2 = rt2*it2; it2 = it2 + it2; rt2 = rt0; // rt2,it2 = (r2^2-i2^2); 2.r2.i2 50 | // {1 + exp(2*pi*I*j/N)}*{H[j]-H~[N-j]}^2/4 : 51 | rt3 = rt1 + rt2 - rt3; // Re(H[j]-H~[N-j]) 52 | it3 = it1 - it2 - it3; // Im(H[j]-H~[N-j]) 53 | rt0 = ((c + 1.0)*rt3 - s*it3)*0.25; 54 | it3 = (s*rt3 + (c + 1.0)*it3)*0.25; 55 | // And now complete and store the results: 56 | *x1 = (rt1 - rt0); // Re(I[j]) 57 | *y1 = (it1 - it3); // Im(I[j]) 58 | // N-j terms are as above, but with the replacements: rt1<-->rt2, it1<-->it2, it3|-->-it3: 59 | *x2 = (rt2 - rt0); 60 | *y2 = (it2 + it3); 61 | // Cost: [22 add, 12 mul], compared to [18 add, 18 mul] for generic-mul version ... seems too add-heavy. 62 | #elif 0 // Quick test of mul version of this function, using square inputs: 63 | double re,im,tt; 64 | /*...gather the 4 complex elements which are to be combined...*/ 65 | // Re{H[j]} Im{H[j]} Re{I[j]} Im{I[j]} Re{H[N-j]} Im{H[N-j]} Re{I[N-j]} Im{I[N-j]} 66 | double r1 = *x1, i1 = *y1, r2 = *x1, i2 = *y1, r3 = *x2, i3 = *y2, r4 = *x2, i4 = *y2; 67 | // calculate 2nd square-like term and store in temp... 68 | re = r3*r4 - i3*i4; // re := Re{H(n2-j)*I(n2-j)} 69 | im = r3*i4 + i3*r4; // im := Im{H(n2-j)*I(n2-j)} 70 | // calculate difference terms... 71 | r3 = r1 - r3; // r3 := Re{H(j)-H~(n2-j)} 72 | i3 = i1 + i3; // i3 := Im{H(j)-H~(n2-j)} 73 | r4 = r2 - r4; // r4 := Re{I(j)-I~(n2-j)} 74 | i4 = i2 + i4; // i4 := Im{I(j)-I~(n2-j)} 75 | // now calculate 1st square-like term and store back in H(j) slot... 76 | tt = r1*r2 - i1*i2; // r1 := Re{H(j)*I(j)} 77 | i1 = r1*i2 + i1*r2; r1 = tt;// i1 := Im{H(j)*I(j)} 78 | // calculate the complex products to build the second term... 79 | tt = r3*r4 - i3*i4; // Re{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])} 80 | i3 = r3*i4 + i3*r4; r3 = tt;// Im{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])} 81 | tt = ((c + 1.0)*r3 - s*i3)*0.25; // Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4} 82 | i3 = (s*r3 + (c + 1.0)*i3)*0.25; // Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4} 83 | // and now complete and store the results. 84 | *x1 = (r1-tt); // Re{M(j)} 85 | *y1 = (i1-i3); // Im{M(j)} 86 | // N-j terms are as above, but with the replacements: r1<-->r2, i1<-->i2, i3|-->-i3. 87 | *x2 = (re-tt); // Re{M(N-j)} 88 | *y2 = (im+i3); // Im{M(N-j)} 89 | #else 90 | double re,im,tt, r1 = *x1, i1 = *y1, r2 = *x2, i2 = *y2, cc = (c + 1.0)*0.25, ss = s*0.25; 91 | // H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)] 92 | // calculate 2nd square-like term and store in temp... 93 | re = (r2+i2)*(r2-i2); // re := Re{H(n2-j)^2} 94 | im = r2*i2 + i2*r2; // im := Im{H(n2-j)^2} 95 | // calculate difference terms... 96 | r2 = r1 - r2; // r2 := Re{H(j)-H~(n2-j)} 97 | i2 = i1 + i2; // i2 := Im{H(j)-H~(n2-j)} 98 | // now calculate 1st square-like term and store back in H(j) slot... 99 | tt = (r1+i1)*(r1-i1); // r1 := Re{H(j)^2} 100 | i1 = r1*i1 + i1*r1; r1 = tt;// i1 := Im{H(j)^2} 101 | // calculate the complex products to build the second term... 102 | tt = (r2+i2)*(r2-i2); // Re{(H[j] - H~[N/2-j])^2} 103 | i2 = r2*i2 + i2*r2; r2 = tt;// Im{(H[j] - H~[N/2-j])^2} 104 | tt = (cc*r2 - ss*i2); // Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])^2/4} 105 | i2 = (ss*r2 + cc*i2); // Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])^2/4} 106 | // and now complete and store the results. 107 | *x1 = (r1-tt); // Re{M(j)} 108 | *y1 = (i1-i2); // Im{M(j)} 109 | // N-j terms are as above, but with the replacements: r1<-->r2, i1<-->i2, i3|-->-i3. 110 | *x2 = (re-tt); // Re{M(N-j)} 111 | *y2 = (im+i2); // Im{M(N-j)} 112 | // Cost: [19 add, 15 mul] ... or [16 add, 18 mul] if replace re-part-of-cmuls (r+i)*(r-i) with r^2-i^2. 113 | // Can save another [2 add, 2 mul] by precomputing cc = (c + 1.0)/4 and ss = s/4. 114 | #endif 115 | } 116 | 117 | // Jul 2019: This routine adapted from my vintage 1999 mersenne_pm1.f90 code, with input-indec swap 2 <--> 3: 118 | void pair_mul( 119 | double *x1, double *y1, double *x2, double *y2, const double sx3, const double sy3, const double sx4, const double sy4, 120 | const double c, const double s) 121 | { 122 | /* 123 | ! Given complex scalars H[j] = (x1,y1), H[N-j] = (x2,y2) and (const)I[j] = (x3,y3), I[N-j] = (x4,y4) 124 | ! along with complex exponential E = (c,s), 125 | ! calculates M[j] = H[j]*I[j] + {1 + exp(4*pi*I*j/N)}*{H[j]-H~[N-j]}*{I[j]-I~[N-j]}/4 and its complex conjugate M~, 126 | ! returns the former in H[j] and the latter in H[N-j], thus overwriting those non-const inputs. 127 | */ 128 | double re,im,tt, cc = (c + 1.0)*0.25, ss = s*0.25; 129 | /*...gather the 4 complex elements which are to be combined...*/ 130 | // Re{H[j]} Im{H[j]} Re{H[N-j]} Im{H[N-j]} Re{I[j]} Im{I[j]} Re{I[N-j]} Im{I[N-j]} 131 | double r1 = *x1, i1 = *y1, r2 = *x2, i2 = *y2, r3 = sx3, i3 = sy3, r4 = sx4, i4 = sy4; 132 | 133 | /*...Have: H, H~, I, I~ need: H*I, H~*I~, H - H~, I - I~. Use the sequence: 134 | Find H~I~, store in tmp 135 | Find H-H~, store in H~ 136 | Find I-I~, store in I~ 137 | Find HI, store in H 138 | Store H~I~ in I 139 | */ 140 | // calculate 2nd square-like term and store in temp... 141 | re = r2*r4 - i2*i4; // re := Re{H(n2-j)*I(n2-j)} 142 | im = r2*i4 + i2*r4; // im := Im{H(n2-j)*I(n2-j)} 143 | // calculate difference terms... 144 | r2 = r1 - r2; // r2 := Re{H(j)-H~(n2-j)} 145 | i2 = i1 + i2; // i2 := Im{H(j)-H~(n2-j)} 146 | r4 = r3 - r4; // r4 := Re{I(j)-I~(n2-j)} 147 | i4 = i3 + i4; // i4 := Im{I(j)-I~(n2-j)} 148 | // now calculate 1st square-like term and store back in H(j) slot... 149 | tt = r1*r3 - i1*i3; // r1 := Re{H(j)*I(j)} 150 | i1 = r1*i3 + i1*r3; r1 = tt;// i1 := Im{H(j)*I(j)} 151 | // calculate the complex products to build the second term... 152 | tt = r2*r4 - i2*i4; // Re{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])} 153 | i2 = r2*i4 + i2*r4; r2 = tt;// Im{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])} 154 | tt = (cc*r2 - ss*i2); // Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4} 155 | i2 = (ss*r2 + cc*i2); // Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4} 156 | // and now complete and store the results. 157 | *x1 = (r1-tt); // Re{M(j)} 158 | *y1 = (i1-i2); // Im{M(j)} 159 | // N-j terms are as above, but with the replacements: r1<-->r3, i1<-->i3, i2|-->-i2. 160 | *x2 = (re-tt); // Re{M(N-j)} 161 | *y2 = (im+i2); // Im{M(N-j)} 162 | // Cost: 16 add, 16 mul [Ignoring the (1 add, 2 mul) cost of the cc,ss precomputation] 163 | } 164 | 165 | -------------------------------------------------------------------------------- /src/prefetch.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/primesearch/Mlucas/5e6465318b8c656ffb83025229038f5c2614fa35/src/prefetch.h -------------------------------------------------------------------------------- /src/qfcheb.h: -------------------------------------------------------------------------------- 1 | #define STR_MAX_LEN 1024 2 | extern char cbuf[STR_MAX_LEN*2]; 3 | -------------------------------------------------------------------------------- /src/radix1024.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix1024_included 27 | #define radix1024_included 28 | 29 | #include "radix512.h" 30 | 31 | // 'bc -l' code for these: p2=8*a(1);d=p2/1024;t=-d; t+=(d+d);c(t);s(t); [repeat 64 times]: 32 | // Of the odd-order 1024th roots, note that _4f,_53,_7f end up being unused by the radix-1024 DFT twiddles array: 33 | #define c1024_01 ((double)0.99998117528260114265) 34 | #define s1024_01 ((double)0.00613588464915447535) /* exp(01*I*twopi/1024) */ 35 | #define c1024_03 ((double)0.99983058179582342201) 36 | #define s1024_03 ((double)0.01840672990580482090) /* exp(03*I*twopi/1024) */ 37 | #define c1024_05 ((double)0.99952941750109316308) 38 | #define s1024_05 ((double)0.03067480317663662588) /* exp(05*I*twopi/1024) */ 39 | #define c1024_07 ((double)0.99907772775264538289) 40 | #define s1024_07 ((double)0.04293825693494082301) /* exp(07*I*twopi/1024) */ 41 | #define c1024_09 ((double)0.99847558057329475221) 42 | #define s1024_09 ((double)0.05519524434968993972) /* exp(09*I*twopi/1024) */ 43 | #define c1024_0b ((double)0.99772306664419160985) 44 | #define s1024_0b ((double)0.06744391956366405780) /* exp(0b*I*twopi/1024) */ 45 | #define c1024_0d ((double)0.99682029929116571498) 46 | #define s1024_0d ((double)0.07968243797143012103) /* exp(0d*I*twopi/1024) */ 47 | #define c1024_0f ((double)0.99576741446765979399) 48 | #define s1024_0f ((double)0.09190895649713272849) /* exp(0f*I*twopi/1024) */ 49 | #define c1024_11 ((double)0.99456457073425545213) 50 | #define s1024_11 ((double)0.10412163387205457897) /* exp(11*I*twopi/1024) */ 51 | #define c1024_13 ((double)0.99321194923479453312) 52 | #define s1024_13 ((double)0.11631863091190476708) /* exp(13*I*twopi/1024) */ 53 | #define c1024_15 ((double)0.99170975366909952288) 54 | #define s1024_15 ((double)0.12849811079379317243) /* exp(15*I*twopi/1024) */ 55 | #define c1024_17 ((double)0.99005821026229710553) 56 | #define s1024_17 ((double)0.14065823933284923051) /* exp(17*I*twopi/1024) */ 57 | #define c1024_19 ((double)0.98825756773074949143) 58 | #define s1024_19 ((double)0.15279718525844342750) /* exp(19*I*twopi/1024) */ 59 | #define c1024_1b ((double)0.98630809724459864790) 60 | #define s1024_1b ((double)0.16491312048996992118) /* exp(1b*I*twopi/1024) */ 61 | #define c1024_1d ((double)0.98421009238692907323) 62 | #define s1024_1d ((double)0.17700422041214875594) /* exp(1d*I*twopi/1024) */ 63 | #define c1024_1f ((double)0.98196386910955526412) 64 | #define s1024_1f ((double)0.18906866414980621248) /* exp(1f*I*twopi/1024) */ 65 | #define c1024_21 ((double)0.97956976568544053449) 66 | #define s1024_21 ((double)0.20110463484209191127) /* exp(21*I*twopi/1024) */ 67 | #define c1024_23 ((double)0.97702814265775435155) 68 | #define s1024_23 ((double)0.21311031991609137366) /* exp(23*I*twopi/1024) */ 69 | #define c1024_25 ((double)0.97433938278557586059) 70 | #define s1024_25 ((double)0.22508391135979283567) /* exp(25*I*twopi/1024) */ 71 | #define c1024_27 ((double)0.97150389098625177561) 72 | #define s1024_27 ((double)0.23702360599436720653) /* exp(27*I*twopi/1024) */ 73 | #define c1024_29 ((double)0.96852209427441731631) 74 | #define s1024_29 ((double)0.24892760574572016775) /* exp(29*I*twopi/1024) */ 75 | #define c1024_2b ((double)0.96539444169768937465) 76 | #define s1024_2b ((double)0.26079411791527551791) /* exp(2b*I*twopi/1024) */ 77 | #define c1024_2d ((double)0.96212140426904159553) 78 | #define s1024_2d ((double)0.27262135544994898410) /* exp(2d*I*twopi/1024) */ 79 | #define c1024_2f ((double)0.95870347489587155549) 80 | #define s1024_2f ((double)0.28440753721127184321) /* exp(2f*I*twopi/1024) */ 81 | #define c1024_31 ((double)0.95514116830577072162) 82 | #define s1024_31 ((double)0.29615088824362382370) /* exp(31*I*twopi/1024) */ 83 | #define c1024_33 ((double)0.95143502096900836968) 84 | #define s1024_33 ((double)0.30784964004153489325) /* exp(33*I*twopi/1024) */ 85 | #define c1024_35 ((double)0.94758559101774113480) 86 | #define s1024_35 ((double)0.31950203081601567745) /* exp(35*I*twopi/1024) */ 87 | #define c1024_37 ((double)0.94359345816196036165) 88 | #define s1024_37 ((double)0.33110630575987640127) /* exp(37*I*twopi/1024) */ 89 | #define c1024_39 ((double)0.93945922360218991213) 90 | #define s1024_39 ((double)0.34266071731199439711) /* exp(39*I*twopi/1024) */ 91 | #define c1024_3b ((double)0.93518350993894757782) 92 | #define s1024_3b ((double)0.35416352542049038186) /* exp(3b*I*twopi/1024) */ 93 | #define c1024_3d ((double)0.93076696107898373214) 94 | #define s1024_3d ((double)0.36561299780477386950) /* exp(3d*I*twopi/1024) */ 95 | #define c1024_3f ((double)0.92621024213831134218) 96 | #define s1024_3f ((double)0.37700741021641825620) /* exp(3f*I*twopi/1024) */ 97 | #define c1024_41 ((double)0.92151403934204194368) 98 | #define s1024_41 ((double)0.38834504669882629109) /* exp(41*I*twopi/1024) */ 99 | #define c1024_43 ((double)0.91667905992104266335) 100 | #define s1024_43 ((double)0.39962419984564682799) /* exp(43*I*twopi/1024) */ 101 | #define c1024_45 ((double)0.91170603200542985165) 102 | #define s1024_45 ((double)0.41084317105790394162) /* exp(45*I*twopi/1024) */ 103 | #define c1024_47 ((double)0.90659570451491536559) 104 | #define s1024_47 ((double)0.42200027079979968537) /* exp(47*I*twopi/1024) */ 105 | #define c1024_49 ((double)0.90134884704602201485) 106 | #define s1024_49 ((double)0.43309381885315196790) /* exp(49*I*twopi/1024) */ 107 | #define c1024_4b ((double)0.89596624975618515621) 108 | #define s1024_4b ((double)0.44412214457042923104) /* exp(4b*I*twopi/1024) */ 109 | #define c1024_4d ((double)0.89044872324475789026) 110 | #define s1024_4d ((double)0.45508358712634382292) /* exp(4d*I*twopi/1024) */ 111 | #define c1024_4f ((double)0.88479709843093778043) 112 | #define s1024_4f ((double)0.46597649576796617728) /* exp(4f*I*twopi/1024) */ 113 | #define c1024_51 ((double)0.87901222642863347817) 114 | #define s1024_51 ((double)0.47679923006332213271) /* exp(51*I*twopi/1024) */ 115 | #define c1024_53 ((double)0.87309497841829009899) 116 | #define s1024_53 ((double)0.48755016014843595399) /* exp(53*I*twopi/1024) */ 117 | #define c1024_55 ((double)0.86704624551569265185) 118 | #define s1024_55 ((double)0.49822766697278185175) /* exp(55*I*twopi/1024) */ 119 | #define c1024_57 ((double)0.86086693863776727973) 120 | #define s1024_57 ((double)0.50883014254310703626) /* exp(57*I*twopi/1024) */ 121 | #define c1024_59 ((double)0.85455798836540052117) 122 | #define s1024_59 ((double)0.51935599016558958668) /* exp(59*I*twopi/1024) */ 123 | #define c1024_5b ((double)0.84812034480329725170) 124 | #define s1024_5b ((double)0.52980362468629466753) /* exp(5b*I*twopi/1024) */ 125 | #define c1024_5d ((double)0.84155497743689841004) 126 | #define s1024_5d ((double)0.54017147272989288060) /* exp(5d*I*twopi/1024) */ 127 | #define c1024_5f ((double)0.83486287498638005676) 128 | #define s1024_5f ((double)0.55045797293660480227) /* exp(5f*I*twopi/1024) */ 129 | #define c1024_61 ((double)0.82804504525775575255) 130 | #define s1024_61 ((double)0.56066157619733602312) /* exp(61*I*twopi/1024) */ 131 | #define c1024_63 ((double)0.82110251499110467956) 132 | #define s1024_63 ((double)0.57078074588696727951) /* exp(63*I*twopi/1024) */ 133 | #define c1024_65 ((double)0.81403632970594836217) 134 | #define s1024_65 ((double)0.58081395809576454434) /* exp(65*I*twopi/1024) */ 135 | #define c1024_67 ((double)0.80684755354379927274) 136 | #define s1024_67 ((double)0.59075970185887422768) /* exp(67*I*twopi/1024) */ 137 | #define c1024_69 ((double)0.79953726910790503405) 138 | #define s1024_69 ((double)0.60061647938386892590) /* exp(69*I*twopi/1024) */ 139 | #define c1024_6b ((double)0.79210657730021235236) 140 | #define s1024_6b ((double)0.61038280627630945196) /* exp(6b*I*twopi/1024) */ 141 | #define c1024_6d ((double)0.78455659715557523362) 142 | #define s1024_6d ((double)0.62005721176328917788) /* exp(6d*I*twopi/1024) */ 143 | #define c1024_6f ((double)0.77688846567323245066) 144 | #define s1024_6f ((double)0.62963823891492702460) /* exp(6f*I*twopi/1024) */ 145 | #define c1024_71 ((double)0.76910333764557963998) 146 | #define s1024_71 ((double)0.63912444486377574303) /* exp(71*I*twopi/1024) */ 147 | #define c1024_73 ((double)0.76120238548426181469) 148 | #define s1024_73 ((double)0.64851440102211244430) /* exp(73*I*twopi/1024) */ 149 | #define c1024_75 ((double)0.75318679904361248316) 150 | #define s1024_75 ((double)0.65780669329707865614) /* exp(75*I*twopi/1024) */ 151 | #define c1024_77 ((double)0.74505778544146596311) 152 | #define s1024_77 ((double)0.66699992230363750586) /* exp(77*I*twopi/1024) */ 153 | #define c1024_79 ((double)0.73681656887736987581) 154 | #define s1024_79 ((double)0.67609270357531595956) /* exp(79*I*twopi/1024) */ 155 | #define c1024_7b ((double)0.72846439044822519723) 156 | #define s1024_7b ((double)0.68508366777270038056) /* exp(7b*I*twopi/1024) */ 157 | #define c1024_7d ((double)0.72000250796138162984) 158 | #define s1024_7d ((double)0.69397146088965400820) /* exp(7d*I*twopi/1024) */ 159 | #define c1024_7f ((double)0.71143219574521644231) 160 | #define s1024_7f ((double)0.70275474445722530165) /* exp(7f*I*twopi/1024) */ 161 | 162 | #endif /* #ifndef radix1024_included */ 163 | -------------------------------------------------------------------------------- /src/radix128.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix128_included 27 | #define radix128_included 28 | 29 | #include "radix64.h" 30 | 31 | #define c128_1 ((double)0.99879545620517239271) 32 | #define s128_1 ((double)0.04906767432741801425) /* exp(1*I*twopi/128) */ 33 | #define c128_3 ((double)0.98917650996478097345) 34 | #define s128_3 ((double)0.14673047445536175165) /* exp(3*I*twopi/128) */ 35 | #define c128_5 ((double)0.97003125319454399260) 36 | #define s128_5 ((double)0.24298017990326388994) /* exp(5*I*twopi/128) */ 37 | #define c128_7 ((double)0.94154406518302077841) 38 | #define s128_7 ((double)0.33688985339222005068) /* exp(7*I*twopi/128) */ 39 | #define c128_9 ((double)0.90398929312344333158) 40 | #define s128_9 ((double)0.42755509343028209431) /* exp(9*I*twopi/128) */ 41 | #define c128_b ((double)0.85772861000027206990) 42 | #define s128_b ((double)0.51410274419322172658) /* exp(b*I*twopi/128) */ 43 | #define c128_d ((double)0.80320753148064490981) 44 | #define s128_d ((double)0.59569930449243334345) /* exp(d*I*twopi/128) */ 45 | #define c128_f ((double)0.74095112535495909118) 46 | #define s128_f ((double)0.67155895484701840061) /* exp(f*I*twopi/128) */ 47 | 48 | #endif /* #ifndef radix128_included */ 49 | -------------------------------------------------------------------------------- /src/radix128_twiddles.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "radix128.h" 24 | 25 | // Skip the usual include-this-header-file-if-it-was-not-included-before #ifndef wapper, 26 | // since this file is not for defines/typedefs and such but rather to store a lengthy const-array-declaration 27 | // and thus needs to be inline-able in multiple places in a source filing making use of it. 28 | 29 | const double DFT128_TWIDDLES[16][14] = { 30 | { 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0 }, 31 | { 0,1,ISRT2,ISRT2,-ISRT2,ISRT2,c16,s16,-s16,c16,s16,c16,-c16,s16 }, 32 | { ISRT2,ISRT2,c16,s16,s16,c16,c32_1,s32_1,s32_3,c32_3,c32_3,s32_3,s32_1,c32_1 }, 33 | { -ISRT2,ISRT2,s16,c16,-c16,-s16,c32_3,s32_3,-c32_1,s32_1,-s32_1,c32_1,-s32_3,-c32_3 }, 34 | { c16,s16,c32_1,s32_1,c32_3,s32_3,c64_1,s64_1,c64_5,s64_5,c64_3,s64_3,c64_7,s64_7 }, 35 | { -s16,c16,s32_3,c32_3,-c32_1,s32_1,c64_5,s64_5,-c64_7,s64_7,s64_1,c64_1,-c64_3,-s64_3 }, 36 | { s16,c16,c32_3,s32_3,-s32_1,c32_1,c64_3,s64_3,s64_1,c64_1,s64_7,c64_7,-s64_5,c64_5 }, 37 | { -c16,s16,s32_1,c32_1,-s32_3,-c32_3,c64_7,s64_7,-c64_3,-s64_3,-s64_5,c64_5,s64_1,-c64_1 }, 38 | { c32_1,s32_1, c64_1,s64_1, c64_3,s64_3, c128_1,s128_1, c128_5,s128_5, c128_3,s128_3, c128_7,s128_7 }, 39 | { -s32_1,c32_1, s64_7,c64_7, -c64_5,s64_5, c128_9,s128_9, -s128_d,c128_d, s128_5,c128_5, -c128_1,s128_1 }, 40 | { s32_3,c32_3, c64_5,s64_5, s64_1,c64_1, c128_5,s128_5, s128_7,c128_7, c128_f,s128_f, -s128_3,c128_3 }, 41 | { -c32_3,s32_3, s64_3,c64_3, -c64_7,-s64_7, c128_d,s128_d, -c128_1,-s128_1, -s128_7,c128_7, -s128_5,-c128_5 }, 42 | { c32_3,s32_3, c64_3,s64_3, s64_7,c64_7, c128_3,s128_3, c128_f,s128_f, c128_9,s128_9, s128_b,c128_b }, 43 | { -s32_3,c32_3, s64_5,c64_5, -c64_1,-s64_1, c128_b,s128_b, -c128_9,s128_9, -s128_1,c128_1, -c128_d,-s128_d }, 44 | { s32_1,c32_1, c64_7,s64_7, -s64_5,c64_5, c128_7,s128_7, -s128_3,c128_3, s128_b,c128_b, -c128_f,s128_f }, 45 | { -c32_1,s32_1, s64_1,c64_1, -s64_3,-c64_3, c128_f,s128_f, -c128_b,-s128_b, -s128_d,c128_d, s128_9,-c128_9 } 46 | }; 47 | 48 | -------------------------------------------------------------------------------- /src/radix15_sse_macro.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /******************************************************************************* 24 | We now include this header file if it was not included before. 25 | *******************************************************************************/ 26 | #ifndef radix15_sse_macro_h_included 27 | #define radix15_sse_macro_h_included 28 | 29 | #include "sse2_macro_gcc64.h" 30 | 31 | /* General indexing for twiddleless radix-15 done as 3*radix-5 followed by 5*radix-3 is as for the scalar macro above: 32 | RADIX_15_DIF(00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E) 33 | -> 34 | RADIX_05_DFT(i0,iC,i9,i6,i3, t0,t1,t2,t3,t4) 35 | RADIX_05_DFT(iA,i7,i4,i1,iD, t5,t6,t7,t8,t9) 36 | RADIX_05_DFT(i5,i2,iE,iB,i8, tA,tB,tC,tD,tE) 37 | 38 | RADIX_03_DFT(t0,t5,tA, o0,o1,o2,) 39 | RADIX_03_DFT(t1,t6,tB, oD,oE,oB,) 40 | RADIX_03_DFT(t2,t7,tC, o9,oA,oB,) 41 | RADIX_03_DFT(t3,t8,tD, o8,o6,o7,) 42 | RADIX_03_DFT(t4,t9,tE, o4,o5,o3,) 43 | 44 | In our impl below, the __i are input pointers, which may overlap the __o outputs; 45 | ..cc0 and cc1 are ptrs to the radix-3 and radix-5 SSE2 sincos constants (c3m1 and cn1); 46 | __t0-E are ptr to scratch local storage (i.e. the address block pointed to by r00-r3e). 47 | */ 48 | // Aug 2014: Need arbitrary-pointer-offsets to support I/O permutations needed by 49 | // larger-radix DFTs of length 15 * 2^n 50 | 51 | #define SSE2_RADIX_15_DIF(\ 52 | __cc0, __cc1,\ 53 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\ 54 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\ 55 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE)\ 56 | {\ 57 | SSE2_RADIX_05_DFT_0TWIDDLE(__i0,__iC,__i9,__i6,__i3, __cc1, __t0,__t1,__t2,__t3,__t4);\ 58 | SSE2_RADIX_05_DFT_0TWIDDLE(__iA,__i7,__i4,__i1,__iD, __cc1, __t5,__t6,__t7,__t8,__t9);\ 59 | SSE2_RADIX_05_DFT_0TWIDDLE(__i5,__i2,__iE,__iB,__i8, __cc1, __tA,__tB,__tC,__tD,__tE);\ 60 | \ 61 | SSE2_RADIX_03_DFT(__t0,__t5,__tA, __cc0, __o0,__o1,__o2);\ 62 | SSE2_RADIX_03_DFT(__t1,__t6,__tB, __cc0, __oD,__oE,__oC);\ 63 | SSE2_RADIX_03_DFT(__t2,__t7,__tC, __cc0, __o9,__oA,__oB);\ 64 | SSE2_RADIX_03_DFT(__t3,__t8,__tD, __cc0, __o8,__o6,__o7);\ 65 | SSE2_RADIX_03_DFT(__t4,__t9,__tE, __cc0, __o4,__o5,__o3);\ 66 | } 67 | 68 | #define SSE2_RADIX_15_DIT(\ 69 | __cc0, __cc1,\ 70 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\ 71 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\ 72 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE)\ 73 | {\ 74 | /* Swap the 2nd pair of each output triplet to effect iDFT: */\ 75 | SSE2_RADIX_03_DFT(__i0,__i2,__i1, __cc0, __t0,__t2,__t1);\ 76 | SSE2_RADIX_03_DFT(__i8,__i7,__i6, __cc0, __t3,__t5,__t4);\ 77 | SSE2_RADIX_03_DFT(__iD,__iC,__iE, __cc0, __t6,__t8,__t7);\ 78 | SSE2_RADIX_03_DFT(__i4,__i3,__i5, __cc0, __t9,__tB,__tA);\ 79 | SSE2_RADIX_03_DFT(__i9,__iB,__iA, __cc0, __tC,__tE,__tD);\ 80 | \ 81 | /* Output perm here is 0123456789abcde --> 05a6b1c2738d9e4: */\ 82 | SSE2_RADIX_05_DFT_0TWIDDLE(__t0,__t3,__t6,__t9,__tC, __cc1, __o0,__o6,__oC,__o3,__o9);\ 83 | SSE2_RADIX_05_DFT_0TWIDDLE(__t1,__t4,__t7,__tA,__tD, __cc1, __o5,__oB,__o2,__o8,__oE);\ 84 | SSE2_RADIX_05_DFT_0TWIDDLE(__t2,__t5,__t8,__tB,__tE, __cc1, __oA,__o1,__o7,__oD,__o4);\ 85 | } 86 | 87 | // Cost: 12 DP-math, 17 vector MOV for each of the two side-by-side 3-DFTs in SSE2_RADIX_03_DFT_X2 88 | // 38 DP-math, 31 vector MOV for each of the two side-by-side 5-DFTs in SSE2_RADIX_05_DFT_0TWIDDLE_X2. Thus 89 | // 150 DP-math, 144 vector MOV for each of the two side-by-side 15-DFTs in each of these two [DIF and DIT] 15-DFT macro-of-macros. 90 | // Compare to van-Buskirk 13-DFT: 198 DP-math, 168 vector MOV. 91 | #define SSE2_RADIX_15_DIF_X2(\ 92 | __cc0, __cc1, __two,\ 93 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\ 94 | __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7,__s8,__s9,__sA,__sB,__sC,__sD,__sE,\ 95 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE,\ 96 | __j0,__j1,__j2,__j3,__j4,__j5,__j6,__j7,__j8,__j9,__jA,__jB,__jC,__jD,__jE,\ 97 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\ 98 | __u0,__u1,__u2,__u3,__u4,__u5,__u6,__u7,__u8,__u9,__uA,__uB,__uC,__uD,__uE)\ 99 | {\ 100 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __i0,__iC,__i9,__i6,__i3, __s0,__s1,__s2,__s3,__s4, __j0,__jC,__j9,__j6,__j3, __t0,__t1,__t2,__t3,__t4);\ 101 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __iA,__i7,__i4,__i1,__iD, __s5,__s6,__s7,__s8,__s9, __jA,__j7,__j4,__j1,__jD, __t5,__t6,__t7,__t8,__t9);\ 102 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __i5,__i2,__iE,__iB,__i8, __sA,__sB,__sC,__sD,__sE, __j5,__j2,__jE,__jB,__j8, __tA,__tB,__tC,__tD,__tE);\ 103 | \ 104 | SSE2_RADIX_03_DFT_X2(__cc0, __s0,__s5,__sA, __o0,__o1,__o2, __t0,__t5,__tA, __u0,__u1,__u2);\ 105 | SSE2_RADIX_03_DFT_X2(__cc0, __s1,__s6,__sB, __oD,__oE,__oC, __t1,__t6,__tB, __uD,__uE,__uC);\ 106 | SSE2_RADIX_03_DFT_X2(__cc0, __s2,__s7,__sC, __o9,__oA,__oB, __t2,__t7,__tC, __u9,__uA,__uB);\ 107 | SSE2_RADIX_03_DFT_X2(__cc0, __s3,__s8,__sD, __o8,__o6,__o7, __t3,__t8,__tD, __u8,__u6,__u7);\ 108 | SSE2_RADIX_03_DFT_X2(__cc0, __s4,__s9,__sE, __o4,__o5,__o3, __t4,__t9,__tE, __u4,__u5,__u3);\ 109 | } 110 | 111 | #define SSE2_RADIX_15_DIT_X2(\ 112 | __cc0, __cc1,__two,\ 113 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\ 114 | __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7,__s8,__s9,__sA,__sB,__sC,__sD,__sE,\ 115 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE,\ 116 | __j0,__j1,__j2,__j3,__j4,__j5,__j6,__j7,__j8,__j9,__jA,__jB,__jC,__jD,__jE,\ 117 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\ 118 | __u0,__u1,__u2,__u3,__u4,__u5,__u6,__u7,__u8,__u9,__uA,__uB,__uC,__uD,__uE)\ 119 | {\ 120 | /* Swap the 2nd pair of each output triplet to effect iDFT: */\ 121 | SSE2_RADIX_03_DFT_X2(__cc0, __i0,__i2,__i1, __s0,__s2,__s1, __j0,__j2,__j1, __t0,__t2,__t1);\ 122 | SSE2_RADIX_03_DFT_X2(__cc0, __i8,__i7,__i6, __s3,__s5,__s4, __j8,__j7,__j6, __t3,__t5,__t4);\ 123 | SSE2_RADIX_03_DFT_X2(__cc0, __iD,__iC,__iE, __s6,__s8,__s7, __jD,__jC,__jE, __t6,__t8,__t7);\ 124 | SSE2_RADIX_03_DFT_X2(__cc0, __i4,__i3,__i5, __s9,__sB,__sA, __j4,__j3,__j5, __t9,__tB,__tA);\ 125 | SSE2_RADIX_03_DFT_X2(__cc0, __i9,__iB,__iA, __sC,__sE,__sD, __j9,__jB,__jA, __tC,__tE,__tD);\ 126 | \ 127 | /* Output perm here is 0123456789abcde --> 05a6b1c2738d9e4: */\ 128 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s0,__s3,__s6,__s9,__sC, __o0,__o6,__oC,__o3,__o9, __t0,__t3,__t6,__t9,__tC, __u0,__u6,__uC,__u3,__u9);\ 129 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s1,__s4,__s7,__sA,__sD, __o5,__oB,__o2,__o8,__oE, __t1,__t4,__t7,__tA,__tD, __u5,__uB,__u2,__u8,__uE);\ 130 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s2,__s5,__s8,__sB,__sE, __oA,__o1,__o7,__oD,__o4, __t2,__t5,__t8,__tB,__tE, __uA,__u1,__u7,__uD,__u4);\ 131 | } 132 | 133 | #endif /* radix15_sse_macro_h_included */ 134 | 135 | -------------------------------------------------------------------------------- /src/radix16.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix16_included 27 | #define radix16_included 28 | 29 | #define c16 ((double)0.92387953251128675613) 30 | #define s16 ((double)0.38268343236508977173) /* exp( I*twopi/16) */ 31 | 32 | #endif /* #ifndef radix16_included */ 33 | -------------------------------------------------------------------------------- /src/radix16_wrapper_ini.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "Mlucas.h" 24 | 25 | /***************/ 26 | 27 | /* Initialize the various arrays of indices used in radix16_wrapper_square, so we can execute 28 | the processing of the [radix0] disjoint data blocks by that routine in parallel, if desired. 29 | */ 30 | void radix16_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int radix_prim[], int ws_i[], int ws_j1[], int ws_j2[], int ws_j2_start[], int ws_k[], int ws_m[], int ws_blocklen[], int ws_blocklen_sum[]) 31 | { 32 | static int i,j1,j2,j2_start,k,m,blocklen,blocklen_sum; 33 | int iblock_next; 34 | 35 | if(iblock <= 1 && !(radix0 & 1)) 36 | iblock_next = iblock + 1; 37 | else 38 | iblock_next = iblock + 2; 39 | 40 | if(iblock == 0) // j1 = real-array index (double the complex-array index) of the 1st element of each floating pair. 41 | { 42 | // No need to init I and M here, since they are set by entry into the nested I/M loop in radix16_pairFFT_mul_square: 43 | j1 = 0; 44 | j2 = 32; 45 | j2_start = j2; // j2 = real-array index (double the complex-array index) of 2nd element of each floating pair. 46 | k = 0; 47 | blocklen = 16; // = half of complex blocklength, since process 2 complex data for each value of loop index L. 48 | blocklen_sum = 0; 49 | 50 | ws_i [iblock] = i ; 51 | ws_j1 [iblock] = j1 ; 52 | ws_j2 [iblock] = j2 ; 53 | ws_j2_start [iblock] = j2_start ; 54 | ws_k [iblock] = k ; 55 | ws_m [iblock] = m ; 56 | ws_blocklen [iblock] = blocklen ; 57 | ws_blocklen_sum[iblock] = blocklen_sum; 58 | } else { 59 | goto jump_in; 60 | } 61 | 62 | for(i = nradices_prim-5; i >= 0; i-- ) // Main loop: lower bound = nradices_prim - radix_now. 63 | { // Remember, radices get processed in reverse order here as in forward FFT. 64 | for(m = 0; m < (blocklen-1)>>1; m += 8) // Do two 16-element sets per loop, so only execute loop half as many times as before. 65 | { 66 | // This tells us when we've reached the end of the current data block: 67 | // Apr 2014: Must store intermediate product j1*radix0 in a 64-bit int to prevent overflow! 68 | if(j1 && ((uint64)j1*radix0)%n == 0) 69 | { 70 | ws_i [iblock_next] = i ; 71 | ws_j1 [iblock_next] = j1 ; 72 | ws_j2 [iblock_next] = j2 ; 73 | ws_j2_start [iblock_next] = j2_start ; 74 | ws_k [iblock_next] = k ; 75 | ws_m [iblock_next] = m ; 76 | ws_blocklen [iblock_next] = blocklen ; 77 | ws_blocklen_sum[iblock_next] = blocklen_sum; 78 | // printf("%8" PRIu64 " %20" PRIu64 " %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k); 79 | return; 80 | } 81 | jump_in: // Entry point for all blocks but the first. 82 | k += 2; // increment sincos array index 83 | // And update the data (j1 and j2) array indices: 84 | j1 += 32; 85 | j2 -= 32; 86 | } 87 | /* 88 | !...Since the foregoing loop only gets executed half as many times as in the simple version, to properly position 89 | ! ourselves in the data array for the start of the next block, need to bump up j1 by as much as would occur in a 90 | ! second execution of the above loop. The exception is the first loop execution, where j1 needs to be doubled (32 x 2). 91 | */ 92 | j1 += (blocklen << 1); 93 | if(j2_start == n-32) { 94 | // printf("(j2_start == n-32) return with j2_start = %d\n",j2_start); 95 | return; 96 | } 97 | 98 | /*...Reset half-complex-blocklength for next pass. If K >> 1 has a zero trailing bit, 99 | we multiply the blocklength by K >> 1 in preparation for the final block. */ 100 | 101 | blocklen_sum += blocklen; 102 | blocklen = (blocklen_sum) * (radix_prim[i-1]-1); 103 | 104 | /*...Next j2_start is previous one plus the (real) length of the current block = 4*(half-complex-blocklength) */ 105 | 106 | j2_start += (blocklen<<2); 107 | j2 = j2_start; /* Reset j2 for start of the next block. */ 108 | // printf("newblock: blocklen = %8d blocklen_sum = %8d j2 = %8d\n",blocklen,blocklen_sum,j2); 109 | } /* End of Main loop */ 110 | } 111 | 112 | /* 113 | Jun 2014: Possible UMR bug? Note the aside from i=1, only even-idx elts of the ws-arrays get inited ... 114 | so how do the odd-index reads not hose the result? 115 | 116 | Using complex FFT radices 16 8 16 16 16 117 | init ws_k[ 0] = 0 118 | 65536 1048576 131040: init ws_k[ 1] = 2048 119 | 131072 2097152 262112: init ws_k[ 2] = 4096 120 | 262144 4194304 524256: init ws_k[ 4] = 8192 121 | 327680 5242880 458720: init ws_k[ 6] = 12288 122 | 524288 8388608 1048544: init ws_k[ 8] = 16384 123 | 589824 9437184 983008: init ws_k[ 10] = 20480 124 | 655360 10485760 917472: init ws_k[ 12] = 24576 125 | 720896 11534336 851936: init ws_k[ 14] = 28672 126 | Mers_mod_square: Init threadpool of 1 threads 127 | Setting CPU = 0 affinity of worker thread id 0, mach_id = 3843 128 | radix16_wrapper_square with ws[]-index = 0 129 | stride = 32 130 | On entry: i = 0, j1,j2,j2_start = 0, 32, 32, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 16 131 | radix16_wrapper_square with ws[]-index = 1 132 | stride = 32 133 | On entry: i = 3, j1,j2,j2_start = 65536, 131040, 131040, k,m = 2048, 0, nrad_prim = 19, blocklen,sum = 16384 134 | radix16_wrapper_square with ws[]-index = 2 135 | stride = 32 136 | On entry: i = 2, j1,j2,j2_start = 131072, 262112, 262112, k,m = 4096, 0, nrad_prim = 19, blocklen,sum = 32768 137 | radix16_wrapper_square with ws[]-index = 3 <*** inited where? *** 138 | stride = 32 vvvv <*** j1 = 0, so no "jump_in": *** 139 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 140 | ======================== 141 | So whn j1 = 0 on entry we exit immediately via: 142 | if(j1 && ((uint64)j1*radix0)%n == 0) 143 | { 144 | // fprintf(stderr,"(j1 && j1*radix0 == 0 (mod n)) check hit: returning\n"); 145 | return; 146 | } 147 | ======================== 148 | radix16_wrapper_square with ws[]-index = 4 149 | stride = 32 150 | On entry: i = 1, j1,j2,j2_start = 262144, 524256, 524256, k,m = 8192, 0, nrad_prim = 19, blocklen,sum = 65536 151 | radix16_wrapper_square with ws[]-index = 5 <*** j1 = 0 152 | stride = 32 153 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 154 | radix16_wrapper_square with ws[]-index = 6 155 | stride = 32 156 | On entry: i = 1, j1,j2,j2_start = 327680, 458720, 524256, k,m = 12288, 16384, nrad_prim = 19, blocklen,sum = 65536 157 | radix16_wrapper_square with ws[]-index = 7 <*** j1 = 0 158 | stride = 32 159 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 160 | radix16_wrapper_square with ws[]-index = 8 161 | stride = 32 162 | On entry: i = 0, j1,j2,j2_start = 524288, 1048544, 1048544, k,m = 16384, 0, nrad_prim = 19, blocklen,sum = 131072 163 | radix16_wrapper_square with ws[]-index = 9 <*** j1 = 0 164 | stride = 32 165 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 166 | radix16_wrapper_square with ws[]-index = 10 167 | stride = 32 168 | On entry: i = 0, j1,j2,j2_start = 589824, 983008, 1048544, k,m = 20480, 16384, nrad_prim = 19, blocklen,sum = 131072 169 | radix16_wrapper_square with ws[]-index = 11 <*** j1 = 0 170 | stride = 32 171 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 172 | radix16_wrapper_square with ws[]-index = 12 173 | stride = 32 174 | On entry: i = 0, j1,j2,j2_start = 655360, 917472, 1048544, k,m = 24576, 32768, nrad_prim = 19, blocklen,sum = 131072 175 | radix16_wrapper_square with ws[]-index = 13 <*** j1 = 0 176 | stride = 32 177 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 178 | radix16_wrapper_square with ws[]-index = 14 179 | stride = 32 180 | On entry: i = 0, j1,j2,j2_start = 720896, 851936, 1048544, k,m = 28672, 49152, nrad_prim = 19, blocklen,sum = 131072 181 | radix16_wrapper_square with ws[]-index = 15 <*** j1 = 0 182 | stride = 32 183 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0 184 | 185 | Thus, j1 = 0 is how the odd-idx uninit is handled in practice - BUT NEED TO ENSURE ALL THE J1-DATA ARE INITED = 0 AT OUTSET 186 | 187 | Thus, switch ws_* allocs in mers_mod_square from malloc to calloc. 188 | 189 | (Surprised this issue took so long to manifest...) 190 | */ 191 | -------------------------------------------------------------------------------- /src/radix17_ditN_cy_dif1.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "Mlucas.h" 24 | #include "radix17_dft.h" 25 | 26 | /***************/ 27 | 28 | int radix17_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], double wt1[], int si[], double base[], double baseinv[], int iter, double *fracmax, uint64 p) 29 | { 30 | return 1; 31 | } 32 | 33 | /***************/ 34 | 35 | void radix17_dif_pass1(double a[], int n) 36 | { 37 | /* 38 | !...Acronym: DIF = Decimation In Frequency 39 | ! 40 | !...Subroutine to perform an initial radix-17 complex DIF FFT pass on the data in the length-N real vector A. 41 | ! 42 | ! See the documentation in radix16_dif_pass for further details on storage and indexing. 43 | ! 44 | ! Given complex inputs (x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF,xG), we need the following outputs 45 | ! (here cJ = cos(2*J*pi/17), sJ = sin(2*J*pi/17)): 46 | ! 47 | ! X0 = C0, where C0 = x0+ (x1+xG)+ (x2+xF)+ (x3+xE)+ (x4+xD)+ (x5+xC)+ (x6+xB)+ (x7+xA)+ (x6+x9), 48 | ! the cosine terms below get massaged into the form of a length-8 cyclic convolution: 49 | ! X1 = C1 + I*S1 C1 = 50 | ! X2 = C2 + I*S2 51 | ! X3 = C3 + I*S3 52 | ! X4 = C4 + I*S4 53 | ! X5 = C5 + I*S5 54 | ! X6 = C6 + I*S6 55 | ! X7 = C7 + I*S7 56 | ! X8 = C8 + I*S8 57 | ! and the sine terms get massaged into the form of a length-8 acyclic convolution: 58 | ! X9 = C8 - I*S8 59 | ! XA = C7 - I*S7 60 | ! XB = C6 - I*S6 61 | ! XC = C5 - I*S5 62 | ! XD = C4 - I*S4 63 | ! XE = C3 - I*S3 64 | ! XF = C2 - I*S2 65 | ! XG = C1 - I*S1 66 | ! 67 | ! We refer to the terms C1-8 (which do not explicitly involving the imaginary constant I) 68 | ! as the "cosine part" of the output, and S1-8 (those multiplied by I) as the "sine part." 69 | ! Opcount for general odd-prime radix R: 70 | ! Totals : 100 FMUL, 140 FADD, (R-1)^2 fmul (R+3)*(R-1) fadd 71 | ! compared to 16 FMUL, 96 FADD for radix-12. (Ouch!) 72 | ! 73 | ! Relative cost := #FADD/(radix*lg2(radix)) = 3.679 . 74 | */ 75 | int j,j1,j2; 76 | static int n17,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16, first_entry=TRUE; 77 | 78 | if(!first_entry && (n/17) != n17) /* New runlength? */ 79 | { 80 | first_entry=TRUE; 81 | } 82 | 83 | /*...initialize things upon first entry */ 84 | 85 | if(first_entry) 86 | { 87 | first_entry=FALSE; 88 | n17 = n/17; 89 | // Constant index offsets for array load/stores are here: 90 | p1 = n17; 91 | p2 = p1 +p1; 92 | p3 = p2 +p1; 93 | p4 = p3 +p1; 94 | p5 = p4 +p1; 95 | p6 = p5 +p1; 96 | p7 = p6 +p1; 97 | p8 = p7 +p1; 98 | p9 = p8 +p1; 99 | p10 = p9 +p1; 100 | p11 = p10+p1; 101 | p12 = p11+p1; 102 | p13 = p12+p1; 103 | p14 = p13+p1; 104 | p15 = p14+p1; 105 | p16 = p15+p1; 106 | 107 | p1 += ( (p1 >> DAT_BITS) << PAD_BITS ); 108 | p2 += ( (p2 >> DAT_BITS) << PAD_BITS ); 109 | p3 += ( (p3 >> DAT_BITS) << PAD_BITS ); 110 | p4 += ( (p4 >> DAT_BITS) << PAD_BITS ); 111 | p5 += ( (p5 >> DAT_BITS) << PAD_BITS ); 112 | p6 += ( (p6 >> DAT_BITS) << PAD_BITS ); 113 | p7 += ( (p7 >> DAT_BITS) << PAD_BITS ); 114 | p8 += ( (p8 >> DAT_BITS) << PAD_BITS ); 115 | p9 += ( (p9 >> DAT_BITS) << PAD_BITS ); 116 | p10 += ( (p10>> DAT_BITS) << PAD_BITS ); 117 | p11 += ( (p11>> DAT_BITS) << PAD_BITS ); 118 | p12 += ( (p12>> DAT_BITS) << PAD_BITS ); 119 | p13 += ( (p13>> DAT_BITS) << PAD_BITS ); 120 | p14 += ( (p14>> DAT_BITS) << PAD_BITS ); 121 | p15 += ( (p15>> DAT_BITS) << PAD_BITS ); 122 | p16 += ( (p16>> DAT_BITS) << PAD_BITS ); 123 | } 124 | 125 | /*...The radix-17 pass is here. */ 126 | 127 | for(j=0; j < n17; j += 2) 128 | { 129 | #ifdef USE_SSE2 130 | j1 = (j & mask01) + br4[j&3]; 131 | j1 =j1 + ( (j1>> DAT_BITS) << PAD_BITS ); 132 | #else 133 | j1 = j + ( (j >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */ 134 | #endif 135 | j2 = j1+RE_IM_STRIDE; 136 | /* Call same radix-11 DFT macro as for DIF, but replace indices [0,1,2,3,4,5,6,7,8,9,10] with j*10%11, j = 0, ..., 10: */ 137 | RADIX_17_DFT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7],a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15],a[j1+p16],a[j2+p16] 138 | ,a+j1 ,a+j2 ,a+j1+p1 ,a+j2+p1 ,a+j1+p2 ,a+j2+p2 ,a+j1+p3 ,a+j2+p3 ,a+j1+p4 ,a+j2+p4 ,a+j1+p5 ,a+j2+p5 ,a+j1+p6 ,a+j2+p6 ,a+j1+p7 ,a+j2+p7 ,a+j1+p8 ,a+j2+p8 ,a+j1+p9 ,a+j2+p9 ,a+j1+p10 ,a+j2+p10 ,a+j1+p11 ,a+j2+p11 ,a+j1+p12 ,a+j2+p12 ,a+j1+p13 ,a+j2+p13 ,a+j1+p14 ,a+j2+p14 ,a+j1+p15 ,a+j2+p15 ,a+j1+p16 ,a+j2+p16 ); 139 | } 140 | } 141 | 142 | /***************/ 143 | 144 | void radix17_dit_pass1(double a[], int n) 145 | { 146 | /* 147 | !...Acronym: DIT = Decimation In Time 148 | ! 149 | !...Subroutine to perform a final radix-17 complex DIT FFT pass on the data in the length-N real vector A. 150 | ! 151 | ! See the documentation in radix16_dif_pass for further details on storage and indexing. 152 | */ 153 | int j,j1,j2; 154 | static int n17,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16, first_entry=TRUE; 155 | 156 | if(!first_entry && (n/17) != n17) /* New runlength? */ 157 | { 158 | first_entry=TRUE; 159 | } 160 | 161 | /*...initialize things upon first entry */ 162 | 163 | if(first_entry) 164 | { 165 | first_entry=FALSE; 166 | n17 = n/17; 167 | // Constant index offsets for array load/stores are here: 168 | p1 = n17; 169 | p2 = p1 +p1; 170 | p3 = p2 +p1; 171 | p4 = p3 +p1; 172 | p5 = p4 +p1; 173 | p6 = p5 +p1; 174 | p7 = p6 +p1; 175 | p8 = p7 +p1; 176 | p9 = p8 +p1; 177 | p10 = p9 +p1; 178 | p11 = p10+p1; 179 | p12 = p11+p1; 180 | p13 = p12+p1; 181 | p14 = p13+p1; 182 | p15 = p14+p1; 183 | p16 = p15+p1; 184 | 185 | p1 += ( (p1 >> DAT_BITS) << PAD_BITS ); 186 | p2 += ( (p2 >> DAT_BITS) << PAD_BITS ); 187 | p3 += ( (p3 >> DAT_BITS) << PAD_BITS ); 188 | p4 += ( (p4 >> DAT_BITS) << PAD_BITS ); 189 | p5 += ( (p5 >> DAT_BITS) << PAD_BITS ); 190 | p6 += ( (p6 >> DAT_BITS) << PAD_BITS ); 191 | p7 += ( (p7 >> DAT_BITS) << PAD_BITS ); 192 | p8 += ( (p8 >> DAT_BITS) << PAD_BITS ); 193 | p9 += ( (p9 >> DAT_BITS) << PAD_BITS ); 194 | p10 += ( (p10>> DAT_BITS) << PAD_BITS ); 195 | p11 += ( (p11>> DAT_BITS) << PAD_BITS ); 196 | p12 += ( (p12>> DAT_BITS) << PAD_BITS ); 197 | p13 += ( (p13>> DAT_BITS) << PAD_BITS ); 198 | p14 += ( (p14>> DAT_BITS) << PAD_BITS ); 199 | p15 += ( (p15>> DAT_BITS) << PAD_BITS ); 200 | p16 += ( (p16>> DAT_BITS) << PAD_BITS ); 201 | } 202 | 203 | /*...The radix-17 pass is here. */ 204 | 205 | for(j=0; j < n17; j += 2) 206 | { 207 | #ifdef USE_SSE2 208 | j1 = (j & mask01) + br4[j&3]; 209 | j1 =j1 + ( (j1>> DAT_BITS) << PAD_BITS ); 210 | #else 211 | j1 = j + ( (j >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */ 212 | #endif 213 | j2 = j1+RE_IM_STRIDE; 214 | // Call same radix-17 DFT macro as for DIF, but replace indices j = 1-16 with j*16%17, i.e. run in reverse order: 215 | RADIX_17_DFT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7],a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15],a[j1+p16],a[j2+p16] 216 | ,a+j1 ,a+j2 ,a+j1+p16 ,a+j2+p16 ,a+j1+p15 ,a+j2+p15 ,a+j1+p14 ,a+j2+p14 ,a+j1+p13 ,a+j2+p13 ,a+j1+p12 ,a+j2+p12 ,a+j1+p11 ,a+j2+p11 ,a+j1+p10 ,a+j2+p10 ,a+j1+p9 ,a+j2+p9 ,a+j1+p8 ,a+j2+p8 ,a+j1+p7 ,a+j2+p7 ,a+j1+p6 ,a+j2+p6 ,a+j1+p5 ,a+j2+p5 ,a+j1+p4 ,a+j2+p4 ,a+j1+p3 ,a+j2+p3 ,a+j1+p2 ,a+j2+p2 ,a+j1+p1 ,a+j2+p1 ); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/radix256.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix256_included 27 | #define radix256_included 28 | 29 | #include "radix128.h" 30 | 31 | #define c256_01 ((double)0.99969881869620422011) 32 | #define s256_01 ((double)0.02454122852291228802) /* exp(01*I*twopi/256) */ 33 | #define c256_03 ((double)0.99729045667869021613) 34 | #define s256_03 ((double)0.07356456359966742351) /* exp(03*I*twopi/256) */ 35 | #define c256_05 ((double)0.99247953459870999816) 36 | #define s256_05 ((double)0.12241067519921619847) /* exp(05*I*twopi/256) */ 37 | #define c256_07 ((double)0.98527764238894124478) 38 | #define s256_07 ((double)0.17096188876030122632) /* exp(07*I*twopi/256) */ 39 | #define c256_09 ((double)0.97570213003852854447) 40 | #define s256_09 ((double)0.21910124015686979717) /* exp(09*I*twopi/256) */ 41 | #define c256_0b ((double)0.96377606579543986670) 42 | #define s256_0b ((double)0.26671275747489838626) /* exp(0b*I*twopi/256) */ 43 | #define c256_0d ((double)0.94952818059303666721) 44 | #define s256_0d ((double)0.31368174039889147658) /* exp(0d*I*twopi/256) */ 45 | #define c256_0f ((double)0.93299279883473888774) 46 | #define s256_0f ((double)0.35989503653498814869) /* exp(0f*I*twopi/256) */ 47 | #define c256_11 ((double)0.91420975570353065467) 48 | #define s256_11 ((double)0.40524131400498987082) /* exp(11*I*twopi/256) */ 49 | #define c256_13 ((double)0.89322430119551532038) 50 | #define s256_13 ((double)0.44961132965460659995) /* exp(13*I*twopi/256) */ 51 | #define c256_15 ((double)0.87008699110871141870) 52 | #define s256_15 ((double)0.49289819222978403677) /* exp(15*I*twopi/256) */ 53 | #define c256_17 ((double)0.84485356524970707332) 54 | #define s256_17 ((double)0.53499761988709721055) /* exp(17*I*twopi/256) */ 55 | #define c256_19 ((double)0.81758481315158369658) 56 | #define s256_19 ((double)0.57580819141784530063) /* exp(19*I*twopi/256) */ 57 | #define c256_1b ((double)0.78834642762660626210) 58 | #define s256_1b ((double)0.61523159058062684536) /* exp(1b*I*twopi/256) */ 59 | #define c256_1d ((double)0.75720884650648454767) 60 | #define s256_1d ((double)0.65317284295377676396) /* exp(1d*I*twopi/256) */ 61 | #define c256_1f ((double)0.72424708295146692105) 62 | #define s256_1f ((double)0.68954054473706692449) /* exp(1f*I*twopi/256) */ 63 | 64 | #endif /* #ifndef radix256_included */ 65 | -------------------------------------------------------------------------------- /src/radix256_twiddles.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "radix256.h" 24 | 25 | // Skip the usual include-this-header-file-if-it-was-not-included-before #ifndef wapper, 26 | // since this file is not for defines/typedefs and such but rather to store a lengthy const-array-declaration 27 | // and thus needs to be inline-able in multiple places in a source filing making use of it. 28 | 29 | const double DFT256_TWIDDLES[16][30] = { 30 | { 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0 }, 31 | { 0,1, ISRT2,ISRT2, -ISRT2,ISRT2, c16,s16, -s16,c16, s16,c16, -c16,s16, c32_1,s32_1, -s32_1,c32_1, s32_3,c32_3, -c32_3,s32_3, c32_3,s32_3, -s32_3,c32_3, s32_1,c32_1, -c32_1,s32_1 }, 32 | { ISRT2,ISRT2, c16,s16, s16,c16, c32_1,s32_1, s32_3,c32_3, c32_3,s32_3, s32_1,c32_1, c64_1,s64_1, s64_7,c64_7, c64_5,s64_5, s64_3,c64_3, c64_3,s64_3, s64_5,c64_5, c64_7,s64_7, s64_1,c64_1 }, 33 | { -ISRT2,ISRT2, s16,c16, -c16,-s16, c32_3,s32_3, -c32_1,s32_1, -s32_1,c32_1, -s32_3,-c32_3, c64_3,s64_3, -c64_5,s64_5, s64_1,c64_1, -c64_7,-s64_7, s64_7,c64_7, -c64_1,-s64_1, -s64_5,c64_5, -s64_3,-c64_3 }, 34 | { c16,s16, c32_1,s32_1, c32_3,s32_3, c64_1,s64_1, c64_5,s64_5, c64_3,s64_3, c64_7,s64_7, c128_1,s128_1, c128_9,s128_9, c128_5,s128_5, c128_d,s128_d, c128_3,s128_3, c128_b,s128_b, c128_7,s128_7, c128_f,s128_f }, 35 | { -s16,c16, s32_3,c32_3, -c32_1,s32_1, c64_5,s64_5, -c64_7,s64_7, s64_1,c64_1, -c64_3,-s64_3, c128_5,s128_5, -s128_d,c128_d, s128_7,c128_7, -c128_1,-s128_1, c128_f,s128_f, -c128_9,s128_9, -s128_3,c128_3, -c128_b,-s128_b }, 36 | { s16,c16, c32_3,s32_3, -s32_1,c32_1, c64_3,s64_3, s64_1,c64_1, s64_7,c64_7, -s64_5,c64_5, c128_3,s128_3, s128_5,c128_5, c128_f,s128_f, -s128_7,c128_7, c128_9,s128_9, -s128_1,c128_1, s128_b,c128_b, -s128_d,c128_d }, 37 | { -c16,s16, s32_1,c32_1, -s32_3,-c32_3, c64_7,s64_7, -c64_3,-s64_3, -s64_5,c64_5, s64_1,-c64_1, c128_7,s128_7, -c128_1,s128_1, -s128_3,c128_3, -s128_5,-c128_5, s128_b,c128_b, -c128_d,-s128_d, -c128_f,s128_f, s128_9,-c128_9 }, 38 | { c32_1,s32_1, c64_1,s64_1, c64_3,s64_3, c128_1,s128_1, c128_5,s128_5, c128_3,s128_3, c128_7,s128_7, c256_01,s256_01, c256_09,s256_09, c256_05,s256_05, c256_0d,s256_0d, c256_03,s256_03, c256_0b,s256_0b, c256_07,s256_07, c256_0f,s256_0f }, 39 | { -s32_1,c32_1, s64_7,c64_7, -c64_5,s64_5, c128_9,s128_9, -s128_d,c128_d, s128_5,c128_5, -c128_1,s128_1, c256_09,s256_09, -s256_11,c256_11, s256_13,c256_13, -c256_0b,s256_0b, c256_1b,s256_1b, -c256_1d,s256_1d, s256_01,c256_01, -c256_07,-s256_07 }, 40 | { s32_3,c32_3, c64_5,s64_5, s64_1,c64_1, c128_5,s128_5, s128_7,c128_7, c128_f,s128_f, -s128_3,c128_3, c256_05,s256_05, s256_13,c256_13, c256_19,s256_19, -s256_01,c256_01, c256_0f,s256_0f, s256_09,c256_09, s256_1d,c256_1d, -s256_0b,c256_0b }, 41 | { -c32_3,s32_3, s64_3,c64_3, -c64_7,-s64_7, c128_d,s128_d, -c128_1,-s128_1, -s128_7,c128_7, -s128_5,-c128_5, c256_0d,s256_0d, -c256_0b,s256_0b, -s256_01,c256_01, -s256_17,-c256_17, s256_19,c256_19, -c256_0f,-s256_0f, -s256_1b,c256_1b, s256_03,-c256_03 }, 42 | { c32_3,s32_3, c64_3,s64_3, s64_7,c64_7, c128_3,s128_3, c128_f,s128_f, c128_9,s128_9, s128_b,c128_b, c256_03,s256_03, c256_1b,s256_1b, c256_0f,s256_0f, s256_19,c256_19, c256_09,s256_09, s256_1f,c256_1f, c256_15,s256_15, s256_13,c256_13 }, 43 | { -s32_3,c32_3, s64_5,c64_5, -c64_1,-s64_1, c128_b,s128_b, -c128_9,s128_9, -s128_1,c128_1, -c128_d,-s128_d, c256_0b,s256_0b, -c256_1d,s256_1d, s256_09,c256_09, -c256_0f,-s256_0f, s256_1f,c256_1f, -c256_07,s256_07, -s256_0d,c256_0d, -s256_1b,-c256_1b }, 44 | { s32_1,c32_1, c64_7,s64_7, -s64_5,c64_5, c128_7,s128_7, -s128_3,c128_3, s128_b,c128_b, -c128_f,s128_f, c256_07,s256_07, s256_01,c256_01, s256_1d,c256_1d, -s256_1b,c256_1b, c256_15,s256_15, -s256_0d,c256_0d, s256_0f,c256_0f, -c256_17,s256_17 }, 45 | { -c32_1,s32_1, s64_1,c64_1, -s64_3,-c64_3, c128_f,s128_f, -c128_b,-s128_b, -s128_d,c128_d, s128_9,-c128_9, c256_0f,s256_0f, -c256_07,-s256_07, -s256_0b,c256_0b, s256_03,-c256_03, s256_13,c256_13, -s256_1b,-c256_1b, -c256_17,s256_17, c256_1f,-s256_1f } 46 | }; 47 | 48 | -------------------------------------------------------------------------------- /src/radix32.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix32_included 27 | #define radix32_included 28 | 29 | #include "radix16.h" 30 | 31 | #define c32_1 ((double)0.98078528040323044912) 32 | #define s32_1 ((double)0.19509032201612826784) /* exp(1*I*twopi/32) */ 33 | #define c32_3 ((double)0.83146961230254523708) 34 | #define s32_3 ((double)0.55557023301960222473) /* exp(3*I*twopi/32) */ 35 | 36 | #endif /* #ifndef radix32_included */ 37 | -------------------------------------------------------------------------------- /src/radix32_wrapper_ini.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "Mlucas.h" 24 | 25 | /***************/ 26 | 27 | /* Initialize the various arrays of indices used in radix32_wrapper_square, so we can execute 28 | the processing of the [radix0] disjoint data blocks by that routine in parallel, if desired. 29 | */ 30 | void radix32_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int radix_prim[], int ws_i[], int ws_j1[], int ws_j2[], int ws_j2_start[], int ws_k[], int ws_m[], int ws_blocklen[], int ws_blocklen_sum[]) 31 | { 32 | static int i,j1,j2,j2_start,k,m,blocklen,blocklen_sum; 33 | int iblock_next; 34 | 35 | if(iblock <= 1 && !(radix0 & 1)) 36 | iblock_next = iblock + 1; 37 | else 38 | iblock_next = iblock + 2; 39 | 40 | if(iblock == 0) // j1 = real-array index (double the complex-array index) of the 1st element of each floating pair. 41 | { 42 | // No need to init I and M here, since they are set by entry into the nested I/M loop in radix16_pairFFT_mul_square: 43 | j1 = 0; 44 | j2 = 64; 45 | j2_start = j2; // j2 = real-array index (double the complex-array index) of 2nd element of each floating pair. 46 | k = 0; 47 | blocklen = 32; // = half of complex blocklength, since process 2 complex data for each value of loop index L. 48 | blocklen_sum = 0; 49 | 50 | ws_i [iblock] = i ; 51 | ws_j1 [iblock] = j1 ; 52 | ws_j2 [iblock] = j2 ; 53 | ws_j2_start [iblock] = j2_start ; 54 | ws_k [iblock] = k ; 55 | ws_m [iblock] = m ; 56 | ws_blocklen [iblock] = blocklen ; 57 | ws_blocklen_sum[iblock] = blocklen_sum; 58 | } else { 59 | goto jump_in; 60 | } 61 | 62 | for(i = nradices_prim-6; i >= 0; i-- ) // Main loop: lower bound = nradices_prim - radix_now. 63 | { // Remember, radices get processed in reverse order here as in forward FFT. 64 | for(m = 0; m < (blocklen-1)>>1; m += 16) // Do two 32-element sets per loop, so only execute loop half as many times as before. 65 | { 66 | // This tells us when we've reached the end of the current data block: 67 | // Apr 2014: Must store intermediate product j1*radix0 in a 64-bit int to prevent overflow! 68 | if(j1 && ((uint64)j1*radix0)%n == 0) 69 | { 70 | ws_i [iblock_next] = i ; 71 | ws_j1 [iblock_next] = j1 ; 72 | ws_j2 [iblock_next] = j2 ; 73 | ws_j2_start [iblock_next] = j2_start ; 74 | ws_k [iblock_next] = k ; 75 | ws_m [iblock_next] = m ; 76 | ws_blocklen [iblock_next] = blocklen ; 77 | ws_blocklen_sum[iblock_next] = blocklen_sum; 78 | // printf("%8" PRIu64 " %20" PRIu64 " %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k); 79 | return; 80 | } 81 | jump_in: // Entry point for all blocks but the first. 82 | k += 2; // increment sincos array index 83 | // And update the data (j1 and j2) array indices: 84 | j1 += 64; 85 | j2 -= 64; 86 | } 87 | /* 88 | !...Since the foregoing loop only gets executed half as many times as in the simple version, to properly position 89 | ! ourselves in the data array for the start of the next block, need to bump up j1 by as much as would occur in a 90 | ! second execution of the above loop. The exception is the first loop execution, where j1 needs to be doubled (32 x 2). 91 | */ 92 | j1 += (blocklen << 1); 93 | 94 | if(j2_start == n-64) { 95 | // printf("(j2_start == n-32) return with j2_start = %d\n",j2_start); 96 | return; 97 | } 98 | 99 | /*...Reset half-complex-blocklength for next pass. If K >> 1 has a zero trailing bit, 100 | we multiply the blocklength by K >> 1 in preparation for the final block. */ 101 | 102 | blocklen_sum += blocklen; 103 | blocklen = (blocklen_sum) * (radix_prim[i-1]-1); 104 | 105 | /*...Next j2_start is previous one plus the (real) length of the current block = 4*(half-complex-blocklength) */ 106 | 107 | j2_start += (blocklen<<2); 108 | j2 = j2_start; /* Reset j2 for start of the next block. */ 109 | // printf("newblock: blocklen = %8d blocklen_sum = %8d j2 = %8d\n",blocklen,blocklen_sum,j2); 110 | } /* End of Main loop */ 111 | } 112 | 113 | -------------------------------------------------------------------------------- /src/radix512.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix512_included 27 | #define radix512_included 28 | 29 | #include "radix256.h" 30 | 31 | #define c512_01 ((double)0.99992470183914454092) 32 | #define s512_01 ((double)0.01227153828571992607) /* exp(01*I*twopi/512) */ 33 | #define c512_03 ((double)0.99932238458834950089) 34 | #define s512_03 ((double)0.03680722294135883230) /* exp(03*I*twopi/512) */ 35 | #define c512_05 ((double)0.99811811290014920712) 36 | #define s512_05 ((double)0.06132073630220857774) /* exp(05*I*twopi/512) */ 37 | #define c512_07 ((double)0.99631261218277801263) 38 | #define s512_07 ((double)0.08579731234443989040) /* exp(07*I*twopi/512) */ 39 | #define c512_09 ((double)0.99390697000235604155) 40 | #define s512_09 ((double)0.11022220729388305873) /* exp(09*I*twopi/512) */ 41 | #define c512_0b ((double)0.99090263542778002511) 42 | #define s512_0b ((double)0.13458070850712618623) /* exp(0b*I*twopi/512) */ 43 | #define c512_0d ((double)0.98730141815785838241) 44 | #define s512_0d ((double)0.15885814333386144158) /* exp(0d*I*twopi/512) */ 45 | #define c512_0f ((double)0.98310548743121632720) 46 | #define s512_0f ((double)0.18303988795514095840) /* exp(0f*I*twopi/512) */ 47 | #define c512_11 ((double)0.97831737071962763313) 48 | #define s512_11 ((double)0.20711137619221854957) /* exp(11*I*twopi/512) */ 49 | #define c512_13 ((double)0.97293995220556014550) 50 | #define s512_13 ((double)0.23105810828067111950) /* exp(13*I*twopi/512) */ 51 | #define c512_15 ((double)0.96697647104485210912) 52 | #define s512_15 ((double)0.25486565960451457139) /* exp(15*I*twopi/512) */ 53 | #define c512_17 ((double)0.96043051941556581124) 54 | #define s512_17 ((double)0.27851968938505310503) /* exp(17*I*twopi/512) */ 55 | #define c512_19 ((double)0.95330604035419383697) 56 | #define s512_19 ((double)0.30200594931922806681) /* exp(19*I*twopi/512) */ 57 | #define c512_1b ((double)0.94560732538052132579) 58 | #define s512_1b ((double)0.32531029216226293393) /* exp(1b*I*twopi/512) */ 59 | #define c512_1d ((double)0.93733901191257492328) 60 | #define s512_1d ((double)0.34841868024943456820) /* exp(1d*I*twopi/512) */ 61 | #define c512_1f ((double)0.92850608047321556602) 62 | #define s512_1f ((double)0.37131719395183754318) /* exp(1f*I*twopi/512) */ 63 | #define c512_21 ((double)0.91911385169005774400) 64 | #define s512_21 ((double)0.39399204006104810836) /* exp(21*I*twopi/512) */ 65 | #define c512_23 ((double)0.90916798309052237667) 66 | #define s512_23 ((double)0.41642956009763718231) /* exp(23*I*twopi/512) */ 67 | #define c512_25 ((double)0.89867446569395384316) 68 | #define s512_25 ((double)0.43861623853852763738) /* exp(25*I*twopi/512) */ 69 | #define c512_27 ((double)0.88763962040285394789) 70 | #define s512_27 ((double)0.46053871095824002336) /* exp(27*I*twopi/512) */ 71 | #define c512_29 ((double)0.87607009419540660724) 72 | #define s512_29 ((double)0.48218377207912274823) /* exp(29*I*twopi/512) */ 73 | #define c512_2b ((double)0.86397285612158673808) 74 | #define s512_2b ((double)0.50353838372571755840) /* exp(2b*I*twopi/512) */ 75 | #define c512_2d ((double)0.85135519310526514244) 76 | #define s512_2d ((double)0.52458968267846890591) /* exp(2d*I*twopi/512) */ 77 | #define c512_2f ((double)0.83822470555483804338) 78 | #define s512_2f ((double)0.54532498842204642200) /* exp(2f*I*twopi/512) */ 79 | #define c512_31 ((double)0.82458930278502526468) 80 | #define s512_31 ((double)0.56573181078361319707) /* exp(31*I*twopi/512) */ 81 | #define c512_33 ((double)0.81045719825259479195) 82 | #define s512_33 ((double)0.58579785745643886000) /* exp(33*I*twopi/512) */ 83 | #define c512_35 ((double)0.79583690460888353651) 84 | #define s512_35 ((double)0.60551104140432551359) /* exp(35*I*twopi/512) */ 85 | #define c512_37 ((double)0.78073722857209447856) 86 | #define s512_37 ((double)0.62485948814238637675) /* exp(37*I*twopi/512) */ 87 | #define c512_39 ((double)0.76516726562245892617) 88 | #define s512_39 ((double)0.64383154288979146473) /* exp(39*I*twopi/512) */ 89 | #define c512_3b ((double)0.74913639452345932577) 90 | #define s512_3b ((double)0.66241577759017176077) /* exp(3b*I*twopi/512) */ 91 | #define c512_3d ((double)0.73265427167241283493) 92 | #define s512_3d ((double)0.68060099779545305024) /* exp(3d*I*twopi/512) */ 93 | #define c512_3f ((double)0.71573082528381865446) 94 | #define s512_3f ((double)0.69837624940897285320) /* exp(3f*I*twopi/512) */ 95 | 96 | #endif /* #ifndef radix512_included */ 97 | -------------------------------------------------------------------------------- /src/radix63_main_carry_loop.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | // This main loop is same for un-and-multithreaded, so stick into a header file 24 | // (can't use a macro because of the #if-enclosed stuff). 25 | 26 | for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */ 27 | { 28 | for(j = jstart; j < jhi; j += stride) 29 | { 30 | j1 = j; 31 | j1 = j1 + ( (j1 >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */ 32 | j2 = j1 + RE_IM_STRIDE; 33 | 34 | /*...The radix-63 DIT pass is here: */ 35 | 36 | //...gather the needed data (63 64-bit complex, i.e. 126 64-bit reals) and do 7 radix-9 transforms: 37 | tptr = t; iptr = dit_iperm; 38 | for(l = 0; l < 7; l++) { 39 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; k7 = p[*(iptr+7)]; k8 = p[*(iptr+8)]; 40 | RADIX_09_DIT( 41 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],a[j1+k7],a[j2+k7],a[j1+k8],a[j2+k8], 42 | tptr->re,tptr->im,(tptr+1)->re,(tptr+1)->im,(tptr+2)->re,(tptr+2)->im,(tptr+3)->re,(tptr+3)->im,(tptr+4)->re,(tptr+4)->im,(tptr+5)->re,(tptr+5)->im,(tptr+6)->re,(tptr+6)->im,(tptr+7)->re,(tptr+7)->im,(tptr+8)->re,(tptr+8)->im, 43 | rt,it,re 44 | ); tptr += 9; iptr += 9; 45 | } 46 | //...and now do 9 radix-7 transforms: 47 | tptr = t; iptr = dit_operm; 48 | for(l = 0; l < 9; l++) { 49 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; 50 | RADIX_07_DFT( 51 | tptr->re,tptr->im,(tptr+9)->re,(tptr+9)->im,(tptr+18)->re,(tptr+18)->im,(tptr+27)->re,(tptr+27)->im,(tptr+36)->re,(tptr+36)->im,(tptr+45)->re,(tptr+45)->im,(tptr+54)->re,(tptr+54)->im, 52 | t00,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13, 53 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6], 54 | uc1,us1,uc2,us2,uc3,us3, rt,it,re,im 55 | ); tptr++; iptr += 7; 56 | } 57 | 58 | /*...Now do the carries. Since the outputs would 59 | normally be getting dispatched to 63 separate blocks of the A-array, we need 63 separate carries. */ 60 | 61 | if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) 62 | { 63 | l= j & (nwt-1); 64 | n_minus_sil = n-si[l ]; 65 | n_minus_silp1 = n-si[l+1]; 66 | sinwt = si[nwt-l ]; 67 | sinwtm1 = si[nwt-l-1]; 68 | 69 | wtl =wt0[ l ]; 70 | wtn =wt0[nwt-l ]*scale; /* Include 1/(n/2) scale factor of inverse transform here... */ 71 | wtlp1 =wt0[ l+1]; 72 | wtnm1 =wt0[nwt-l-1]*scale; /* ...and here. */ 73 | 74 | /*...set0 is slightly different from others; divide work into blocks of RADIX/4 macro calls, 1st set of which gets pulled out of loop: */ 75 | // Apr 2014: Fermat-mod works fine, but mers-mod barfs immediately with what looks like a bad a0 value, 76 | // div-by-n/2 should give 16, but instead see 77 | // iter 1, full = 1, a0in = 15.492078993055555 78 | // iter 1, full = 1, a0out = 13.000000000000000 79 | // Iter = 1, maxerr = 0.492078993055555 80 | //if(!j)printf("iter %d, full = %d, a0in = %20.15f\n",iter,full_pass,a[0]/(n>>1)); 81 | l = 0; addr = cy_r; itmp = bjmodn; 82 | jt = j1; jp = j2; 83 | cmplx_carry_norm_errcheck0(a[j1 ],a[j2 ],*addr,*itmp,0,prp_mult); ++l; ++addr; ++itmp; 84 | // Next 15 quartets of macro calls done in loop: 85 | for(ntmp = 1; ntmp < 16; ntmp++) { 86 | cmplx_carry_norm_errcheck(a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; 87 | cmplx_carry_norm_errcheck(a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; 88 | cmplx_carry_norm_errcheck(a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; 89 | jt = j1 + p[ntmp<<2]; jp = j2 + p[ntmp<<2]; 90 | cmplx_carry_norm_errcheck(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; 91 | } 92 | // Cleanup of final 2 sets of carries: 93 | cmplx_carry_norm_errcheck(a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; 94 | cmplx_carry_norm_errcheck(a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; 95 | //if(!j)printf("iter %d, full = %d, a0out = %20.15f\n",iter,full_pass,a[0]); 96 | i =((uint32)(sw - bjmodn[0]) >> 31); /* get ready for the next set... */ 97 | co2 = co3; /* For all data but the first set in each j-block, co2=co3. Thus, after the first block of data is done 98 | (and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1). */ 99 | } 100 | else /* MODULUS_TYPE_FERMAT */ 101 | { 102 | // Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2): 103 | ntmp = 0; addr = cy_r; addi = cy_i; ic = 0; // ic = idx into icycle mini-array, gets incremented (mod ODD_RADIX) between macro calls 104 | jt = j1; jp = j2; 105 | fermat_carry_norm_errcheckB(a[jt ],a[jp ],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 106 | fermat_carry_norm_errcheckB(a[jt+p1],a[jp+p1],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 107 | fermat_carry_norm_errcheckB(a[jt+p2],a[jp+p2],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 108 | for(m = 1; m < 16; m++) { 109 | fermat_carry_norm_errcheckB(a[jt+p3],a[jp+p3],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 110 | jt = j1 + p[m<<2]; jp = j2 + p[m<<2]; 111 | fermat_carry_norm_errcheckB(a[jt ],a[jp ],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 112 | fermat_carry_norm_errcheckB(a[jt+p1],a[jp+p1],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 113 | fermat_carry_norm_errcheckB(a[jt+p2],a[jp+p2],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic; 114 | } 115 | for(l = 0; l < ODD_RADIX; l++) { 116 | icycle[l] += wts_idx_incr; /* Inside the loop use this, as it is faster than general-mod '% nwt' */ 117 | icycle[l] += ( (-(int)((uint32)icycle[l] >> 31)) & nwt); 118 | } 119 | } /* if(MODULUS_TYPE == ...) */ 120 | 121 | /*...The radix-63 DIF pass is here: */ 122 | 123 | //...gather the needed data (63 64-bit complex, i.e. 126 64-bit reals) and do 9 radix-7 transforms: 124 | tptr = t; iptr = dif_iperm; 125 | for(l = 0; l < 9; l++) { 126 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; 127 | RADIX_07_DFT( 128 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6], 129 | t00,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13, 130 | tptr->re,tptr->im,(tptr+9)->re,(tptr+9)->im,(tptr+18)->re,(tptr+18)->im,(tptr+27)->re,(tptr+27)->im,(tptr+36)->re,(tptr+36)->im,(tptr+45)->re,(tptr+45)->im,(tptr+54)->re,(tptr+54)->im, 131 | uc1,us1,uc2,us2,uc3,us3, rt,it,re,im 132 | ); tptr++; iptr += 7; 133 | } 134 | //...and now do 7 radix-9 transforms: 135 | tptr = t; iptr = dif_operm; 136 | for(l = 0; l < 7; l++) { 137 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; k7 = p[*(iptr+7)]; k8 = p[*(iptr+8)]; 138 | RADIX_09_DIF( 139 | tptr->re,tptr->im,(tptr+1)->re,(tptr+1)->im,(tptr+2)->re,(tptr+2)->im,(tptr+3)->re,(tptr+3)->im,(tptr+4)->re,(tptr+4)->im,(tptr+5)->re,(tptr+5)->im,(tptr+6)->re,(tptr+6)->im,(tptr+7)->re,(tptr+7)->im,(tptr+8)->re,(tptr+8)->im, 140 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],a[j1+k7],a[j2+k7],a[j1+k8],a[j2+k8], 141 | rt,it,re 142 | ); tptr += 9; iptr += 9; 143 | } 144 | } 145 | 146 | if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) 147 | { 148 | jstart += nwt; 149 | jhi += nwt; 150 | 151 | col += RADIX; 152 | co3 -= RADIX; 153 | } 154 | } /* end for(k=1; k <= khi; k++) */ 155 | 156 | -------------------------------------------------------------------------------- /src/radix64.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | /**************************************************************************** 24 | * We now include this header file if it was not included before. 25 | ****************************************************************************/ 26 | #ifndef radix64_included 27 | #define radix64_included 28 | 29 | #include "radix32.h" 30 | 31 | #define c64_1 ((double)0.99518472667219688624) 32 | #define s64_1 ((double)0.09801714032956060199) /* exp(1*I*twopi/64) */ 33 | #define c64_3 ((double)0.95694033573220886494) 34 | #define s64_3 ((double)0.29028467725446236764) /* exp(3*I*twopi/64) */ 35 | #define c64_5 ((double)0.88192126434835502971) 36 | #define s64_5 ((double)0.47139673682599764856) /* exp(5*I*twopi/64) */ 37 | #define c64_7 ((double)0.77301045336273696081) 38 | #define s64_7 ((double)0.63439328416364549822) /* exp(7*I*twopi/64) */ 39 | 40 | #endif /* #ifndef radix64_included */ 41 | -------------------------------------------------------------------------------- /src/rng_isaac.c: -------------------------------------------------------------------------------- 1 | /* 2 | ------------------------------------------------------------------------------ 3 | isaac64.c: My random number generator for 64-bit machines. 4 | By Bob Jenkins, 1996. Public Domain. 5 | ------------------------------------------------------------------------------ 6 | */ 7 | 8 | #include 9 | #include "rng_isaac.h" 10 | 11 | /* externs declared in rng_isaac.h: */ 12 | ub8 randrsl[RANDSIZ], randcnt; 13 | 14 | static ub8 mm[RANDSIZ]; 15 | static ub8 aa=0, bb=0, cc=0; 16 | 17 | #define ind(mm,x) (*(ub8 *)((ub1 *)(mm) + ((x) & ((RANDSIZ-1)<<3)))) 18 | #define rngstep(mix,a,b,mm,m,m2,r,x) \ 19 | { \ 20 | x = *m; \ 21 | a = (mix) + *(m2++); \ 22 | *(m++) = y = ind(mm,x) + a + b; \ 23 | *(r++) = b = ind(mm,y>>RANDSIZL) + x; \ 24 | } 25 | 26 | void isaac64() 27 | { 28 | register ub8 a,b,x,y,*m,*m2,*r,*mend; 29 | r = randrsl; /* Need a variable address pointer to feed to rngstep */ 30 | a = aa; b = bb + (++cc); 31 | for (m = mm, mend = m2 = m+(RANDSIZ/2); m>5) , a, b, mm, m, m2, r, x); 35 | rngstep( a^(a<<12) , a, b, mm, m, m2, r, x); 36 | rngstep( a^(a>>33) , a, b, mm, m, m2, r, x); 37 | } 38 | for (m2 = mm; m2>5) , a, b, mm, m, m2, r, x); 42 | rngstep( a^(a<<12) , a, b, mm, m, m2, r, x); 43 | rngstep( a^(a>>33) , a, b, mm, m, m2, r, x); 44 | } 45 | bb = b; aa = a; 46 | } 47 | 48 | #define mix(a,b,c,d,e,f,g,h) \ 49 | { \ 50 | a-=e; f^=h>>9; h+=a; \ 51 | b-=f; g^=a<<9; a+=b; \ 52 | c-=g; h^=b>>23; b+=c; \ 53 | d-=h; a^=c<<15; c+=d; \ 54 | e-=a; b^=d>>14; d+=e; \ 55 | f-=b; c^=e<<20; e+=f; \ 56 | g-=c; d^=f>>17; f+=g; \ 57 | h-=d; e^=g<<14; g+=h; \ 58 | } 59 | 60 | void rng_isaac_init(word flag) 61 | { 62 | word i; 63 | ub8 a,b,c,d,e,f,g,h; 64 | aa=bb=cc=(ub8)0; 65 | a=b=c=d=e=f=g=h=0x9E3779B97F4A7C13ull; /* the golden ratio */ 66 | 67 | for (i=0; i<4; ++i) /* scramble it */ 68 | { 69 | mix(a,b,c,d,e,f,g,h); 70 | } 71 | 72 | for (i=0; i>32),(ub4)randrsl[j]); 114 | } 115 | } 116 | #endif 117 | 118 | /* 119 | 11/25/05: EWM - modified to add 2 types of double-precision floating rand() calls: 120 | 121 | - rng_isaac_rand_double() returns a random double via a 64-bit field 122 | which is (within the limits of the generator) a random 64-bit int; 123 | 124 | - rng_isaac_rand_double_norm_pos() returns a random double with 125 | probability uniformly distributed in [0, 1), insofar as IEEE64 doubles 126 | are capable of distributing such values, excluding underflows; 127 | 128 | - rng_isaac_rand_double_norm_pm1() returns a random double with 129 | probability uniformly distributed in (-1, 1), insofar as IEEE64 doubles 130 | are capable of distributing such values, excluding underflows; 131 | */ 132 | double rng_isaac_rand_double() 133 | { 134 | uint64 iran64; 135 | uint32 fexp; 136 | 137 | /* Make sure resulting float will not be denormal: */ 138 | for(;;) 139 | { 140 | iran64 = rng_isaac_rand(); 141 | fexp = (uint32)(iran64 >> 52) & 0x7ff; 142 | if(fexp != 0 && fexp < 0x7f0) break; 143 | } 144 | return *(double *)&iran64; 145 | } 146 | 147 | /* Assumes IEEE64-compliant: */ 148 | double rng_isaac_rand_double_norm_pos() 149 | { 150 | /* 151 | Obtain a result in [0, 1) by merging a sign/exponent field = 0x3ff with 152 | random 52-bit mantissa (52-bit because the hidden bit is assumed 1 via the 153 | choice of exponent - we only randomly generate the non-hidden 52 bits), 154 | yielding a result in [1, 2), and subtracting 1: 155 | */ 156 | uint64 iran64, itmp64; 157 | double retval; 158 | 159 | itmp64 = rng_isaac_rand(); 160 | iran64 = 0x3FF0000000000000ull + (itmp64 & 0x000FFFFFFFFFFFFFull); 161 | retval=(*(double *)&iran64) - 1.0; 162 | /* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */ 163 | if(retval < 0.0 || retval > 1.0) 164 | { 165 | sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval); 166 | ASSERT(0, cbuf); 167 | } 168 | return retval; 169 | } 170 | 171 | 172 | /* Assumes IEEE64-compliant: */ 173 | double rng_isaac_rand_double_norm_pm1() 174 | { 175 | /* 176 | Obtain a result in (-1, 1) by following the same procedure used in 177 | rng_isaac_rand_double_norm_pos to get a value in [0, 1) and multiplying 178 | the result by a random choice of -1 or +1. Note that this doubles the 179 | odds of getting a zero result, but we assume that won't be fatal - 180 | in essence one can consider that as though -0.0 and +0.0 were separate 181 | possible outputs, each occurring with probability equal to that of any 182 | of the discrete nonzero outputs. 183 | */ 184 | static double pm1[] = {-1.0, +1.0}; 185 | double sign; 186 | uint64 itmp64, iran64; 187 | double retval; 188 | 189 | itmp64 = rng_isaac_rand(); 190 | sign = pm1[itmp64 >> 63]; /* Use high bit of iran64 for sign */ 191 | iran64 = 0x3FF0000000000000ull + (itmp64 & 0x000FFFFFFFFFFFFFull); 192 | retval=sign*((*(double *)&iran64) - 1.0); 193 | /* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */ 194 | if(retval < -1.0 || retval > 1.0) 195 | { 196 | sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval); 197 | ASSERT(0, cbuf); 198 | } 199 | return retval; 200 | } 201 | 202 | -------------------------------------------------------------------------------- /src/rng_isaac.h: -------------------------------------------------------------------------------- 1 | /* 2 | ------------------------------------------------------------------------------ 3 | isaac64.h: definitions for a random number generator 4 | Bob Jenkins, 1996, Public Domain 5 | ------------------------------------------------------------------------------ 6 | */ 7 | /**************************************************************************** 8 | * We now include this header file if it was not included before. 9 | ****************************************************************************/ 10 | #ifndef rng_isaac_h_included 11 | #define rng_isaac_h_included 12 | 13 | /* 14 | 11/25/05: EWM - typedefs to use standard int types defined in types.h : 15 | */ 16 | #include "Mdata.h" 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | typedef uint64 ub8; 23 | #define UB8MAXVAL 0xffffffffffffffffLL 24 | #define UB8BITS 64 25 | typedef sint64 sb8; 26 | #define SB8MAXVAL 0x7fffffffffffffffLL 27 | typedef uint32 ub4; /* unsigned 4-byte quantities */ 28 | #define UB4MAXVAL 0xffffffff 29 | typedef sint32 sb4; 30 | #define UB4BITS 32 31 | #define SB4MAXVAL 0x7fffffff 32 | typedef uint16 ub2; 33 | #define UB2MAXVAL 0xffff 34 | #define UB2BITS 16 35 | typedef sint16 sb2; 36 | #define SB2MAXVAL 0x7fff 37 | typedef uint8 ub1; 38 | #define UB1MAXVAL 0xff 39 | #define UB1BITS 8 40 | typedef sint8 sb1; /* signed 1-byte quantities */ 41 | #define SB1MAXVAL 0x7f 42 | typedef int word; /* fastest type available */ 43 | 44 | 45 | #ifndef ISAAC64 46 | #define ISAAC64 47 | 48 | #define RANDSIZL (8) 49 | #define RANDSIZ (1< 73 | 74 | #ifdef __cplusplus 75 | extern "C" 76 | { 77 | #endif 78 | 79 | /* mutexes ---------------------------------------------------------*/ 80 | 81 | #ifdef OS_TYPE_WINDOWS 82 | typedef HANDLE mutex_t; 83 | #else 84 | typedef pthread_mutex_t mutex_t; 85 | #endif 86 | /* 87 | static void mutex_init(mutex_t *m) 88 | { 89 | #ifdef OS_TYPE_WINDOWS 90 | *m = CreateMutex(NULL, FALSE, NULL); 91 | #else 92 | pthread_mutex_init(m, NULL); 93 | #endif 94 | } 95 | 96 | static void mutex_free(mutex_t *m) 97 | { 98 | #ifdef OS_TYPE_WINDOWS 99 | CloseHandle(*m); 100 | #else 101 | pthread_mutex_destroy(m); 102 | #endif 103 | } 104 | 105 | static void mutex_lock(mutex_t *m) 106 | { 107 | #ifdef OS_TYPE_WINDOWS 108 | WaitForSingleObject(*m, INFINITE); 109 | #else 110 | pthread_mutex_lock(m); 111 | #endif 112 | } 113 | 114 | static void mutex_unlock(mutex_t *m) 115 | { 116 | #ifdef OS_TYPE_WINDOWS 117 | ReleaseMutex(*m); 118 | #else 119 | pthread_mutex_unlock(m); 120 | #endif 121 | } 122 | */ 123 | /* a thread pool --------------------------------------------------*/ 124 | 125 | typedef void (*init_func)(void *data, int thread_num); 126 | typedef void (*run_func)(void *data, int thread_num); 127 | typedef void (*shutdown_func)(void *data, int thread_num); 128 | 129 | typedef struct { 130 | init_func init; 131 | shutdown_func shutdown; 132 | void *data; 133 | } thread_control_t; 134 | 135 | typedef struct { 136 | init_func init; 137 | run_func run; 138 | shutdown_func shutdown; 139 | void *data; 140 | } task_control_t; 141 | 142 | struct threadpool_queue 143 | { 144 | unsigned int head; 145 | unsigned int tail; 146 | unsigned int num_tasks; 147 | unsigned int max_tasks; 148 | void **tasks; 149 | }; 150 | 151 | struct thread_init 152 | { 153 | int thread_num; 154 | struct threadpool *pool; 155 | thread_control_t control; 156 | }; 157 | 158 | struct threadpool 159 | { 160 | struct threadpool_queue tasks_queue; 161 | struct threadpool_queue free_tasks_queue; 162 | 163 | task_control_t *tasks; 164 | 165 | struct thread_init *thr_init; 166 | pthread_t *thr_arr; 167 | 168 | unsigned short num_of_threads; 169 | unsigned short num_of_cores; 170 | volatile unsigned short stop_flag; 171 | 172 | pthread_mutex_t free_tasks_mutex; 173 | pthread_cond_t free_tasks_cond; 174 | pthread_cond_t tasks_done_cond; 175 | 176 | pthread_mutex_t mutex; 177 | pthread_cond_t new_tasks_cond; 178 | }; 179 | 180 | struct threadpool* threadpool_init( 181 | int num_threads, 182 | int num_cores, 183 | int queue_size, 184 | thread_control_t *t); 185 | 186 | int threadpool_add_task(struct threadpool *pool, 187 | task_control_t *t, 188 | int blocking); 189 | 190 | void threadpool_free(struct threadpool *pool); 191 | 192 | /* returns zero if no pending tasks */ 193 | int threadpool_drain(struct threadpool *pool, 194 | int blocking); 195 | 196 | /********************* utility macros: ********************/ 197 | 198 | // Don't use any of these at present, but note MacOS has its own versions of these, in /usr/include/X11/Xthreads.h: 199 | #if 1 200 | static void * xmalloc(size_t len) { 201 | void *ptr = malloc(len); 202 | if (ptr == NULL) { 203 | printf("failed to allocate %u bytes\n", (uint32)len); 204 | exit(-1); 205 | } 206 | return ptr; 207 | } 208 | 209 | static void * xcalloc(size_t num, size_t len) { 210 | void *ptr = calloc(num, len); 211 | if (ptr == NULL) { 212 | printf("failed to calloc %u bytes\n", (uint32)(num * len)); 213 | exit(-1); 214 | } 215 | return ptr; 216 | } 217 | 218 | static void * xrealloc(void *iptr, size_t len) { 219 | void *ptr = realloc(iptr, len); 220 | if (ptr == NULL) { 221 | printf("failed to reallocate %u bytes\n", (uint32)len); 222 | exit(-1); 223 | } 224 | return ptr; 225 | } 226 | #endif 227 | 228 | #ifdef __cplusplus 229 | } 230 | #endif 231 | 232 | #endif /* !_THREAD_H_ */ 233 | 234 | -------------------------------------------------------------------------------- /src/types.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * * 3 | * (C) 1997-2021 by Ernst W. Mayer. * 4 | * * 5 | * This program is free software; you can redistribute it and/or modify it * 6 | * under the terms of the GNU General Public License as published by the * 7 | * Free Software Foundation; either version 2 of the License, or (at your * 8 | * option) any later version. * 9 | * * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT * 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * 13 | * more details. * 14 | * * 15 | * You should have received a copy of the GNU General Public License along * 16 | * with this program; see the file GPL.txt. If not, you may view one at * 17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the * 18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 19 | * 02111-1307, USA. * 20 | * * 21 | *******************************************************************************/ 22 | 23 | #include "types.h" 24 | 25 | /* Useful extern constants to export: */ 26 | 27 | /* Multiword ints have word significance increasing from left to right: */ 28 | 29 | /* 5/04/2005: uint96/160s are really uint128/192s with upper 32 bits zero: */ 30 | const uint96 NIL96 = {(uint64)0, (uint32)0}; 31 | const uint96 ONE96 = {(uint64)1, (uint32)0}; 32 | const uint96 TWO96 = {(uint64)2, (uint32)0}; 33 | 34 | const uint128 NIL128 = {(uint64)0, (uint64)0}; 35 | const uint128 ONE128 = {(uint64)1, (uint64)0}; 36 | const uint128 TWO128 = {(uint64)2, (uint64)0}; 37 | 38 | const uint160 NIL160 = {(uint64)0, (uint64)0, (uint32)0}; 39 | const uint160 ONE160 = {(uint64)1, (uint64)0, (uint32)0}; 40 | const uint160 TWO160 = {(uint64)2, (uint64)0, (uint32)0}; 41 | 42 | const uint192 NIL192 = {(uint64)0, (uint64)0, (uint64)0}; 43 | const uint192 ONE192 = {(uint64)1, (uint64)0, (uint64)0}; 44 | const uint192 TWO192 = {(uint64)2, (uint64)0, (uint64)0}; 45 | 46 | const uint256 NIL256 = {(uint64)0, (uint64)0, (uint64)0, (uint64)0}; 47 | const uint256 ONE256 = {(uint64)1, (uint64)0, (uint64)0, (uint64)0}; 48 | const uint256 TWO256 = {(uint64)2, (uint64)0, (uint64)0, (uint64)0}; 49 | 50 | const uint512 NIL512 = {(uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0}; 51 | const uint512 ONE512 = {(uint64)1, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0}; 52 | const uint512 TWO512 = {(uint64)2, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0}; 53 | 54 | /* Nov 2021: Case-insensitive analog of strstr - used the code posted by 'chux' here: 55 | https://stackoverflow.com/questions/27303062/strstr-function-like-that-ignores-upper-or-lower-case 56 | */ 57 | #include // Needed for tolower ... this include is normally via masterdefs.h 58 | char* stristr(const char* haystack, const char* needle) { 59 | do { 60 | const char* h = haystack; 61 | const char* n = needle; 62 | while (tolower((unsigned char) *h) == tolower((unsigned char ) *n) && *n) { 63 | h++; 64 | n++; 65 | } 66 | if (*n == 0) { 67 | return (char *) haystack; 68 | } 69 | } while (*haystack++); 70 | return 0; 71 | } 72 | 73 | /* Binary predicates for use of stdlib qsort(): */ 74 | int ncmp_int(const void * a, const void * b) // Default-int compare predicate 75 | { 76 | return ( *(int*)a - *(int*)b ); 77 | } 78 | 79 | int ncmp_uint32(const void * a, const void * b) // Mnemonic: "Numeric CoMPare of UINT32 data" 80 | { 81 | uint32 diff = *(uint32*)a - *(uint32*)b; 82 | uint32 borrow = 1 - ((diff > *(uint32*)a) << 1); // -1 if (a < b), +1 otherwise 83 | // If (diff > a) == 1, had a borrow, i.e. a < b, return -1. 84 | // Otherwise return 0 if diff == 0, +1 if diff != 0. Can roll all 3 possibilities into one expression: 85 | return ( borrow & -(diff != 0) ); 86 | /* 87 | a < b: bw = -1, (diff != 0) = 1, -() = -1 ===> -1 & -1 = -1 88 | a = b: bw = +1, (diff != 0) = 0, -() = 0 ===> +1 & 0 = 0 89 | a > b: bw = +1, (diff != 0) = 1, -() = -1 ===> +1 & -1 = +1 90 | */ 91 | } 92 | 93 | int ncmp_sint32(const void * a, const void * b) 94 | { 95 | return ( *(sint32*)a - *(sint32*)b ); 96 | } 97 | 98 | int ncmp_uint64(const void * a, const void * b) 99 | { 100 | uint64 diff = *(uint64*)a - *(uint64*)b; 101 | uint64 borrow = 1 - ((diff > *(uint64*)a) << 1); // -1 if (a < b), +1 otherwise 102 | return ( borrow & -(diff != 0) ); 103 | } 104 | 105 | int ncmp_sint64(const void * a, const void * b) 106 | { 107 | return ( *(sint64*)a - *(sint64*)b ); 108 | } 109 | 110 | --------------------------------------------------------------------------------