├── .github
├── dependabot.yml
└── workflows
│ └── ci.yml
├── .gitignore
├── LICENSE
├── README.md
├── config-fermat.sh
├── docs
├── Fermat-testing.md
├── Mfactor_buildnotes.txt
├── a.txt
├── alderlake.txt
├── b.txt
├── brent-suyama.txt
├── c.txt
├── d.txt
├── dct.txt
├── fgt.txt
├── gerbicz.txt
├── gpuowl_stats.txt
├── hwloc_test.txt
├── irrational.txt
├── knc.txt
├── nt.txt
├── pm1.txt
├── pm1_compare.png
├── pm1_compare.txt
├── pm1_d210.txt
├── pm1_d330.txt
├── pm1_d420.txt
├── pm1_d660.txt
├── pm1_d840.txt
├── predefs_archlinux.txt
├── predefs_linux.txt
├── predefs_mac.txt
├── predefs_power9.txt
├── prp_proof.txt
├── prp_proof_examples.txt
├── qs.txt
└── todo.txt
├── help.txt
├── makemake.sh
└── src
├── Mdata.h
├── Mlucas.c
├── Mlucas.h
├── align.h
├── br.c
├── carry.h
├── carry_dbg.h
├── carry_gcc32.h
├── carry_gcc64.h
├── dft_macro.c
├── dft_macro.h
├── dft_sine_term_opt.c.txt
├── f2psp.h
├── f2psp_3_5.h
├── fac_test_dat128.h
├── fac_test_dat192.h
├── fac_test_dat256.h
├── fac_test_dat64.h
├── fac_test_dat96.h
├── factor.c
├── factor.h
├── factor_test.h
├── fermat_mod_square.c
├── fgt_m61.c
├── fgt_m61.h
├── float_intrin.h
├── gcd_lehmer.c.txt
├── gcd_lehmer.h
├── genFFT_mul.h
├── getRealTime.c
├── get_cpuid.c
├── get_fft_radices.c
├── get_fp_rnd_const.c
├── get_preferred_fft_radix.c
├── gpu_iface.cu
├── gpu_iface.h
├── gpu_sieve.cu
├── imul256_macro.h
├── imul_macro.c
├── imul_macro.h
├── imul_macro0.h
├── imul_macro1.h
├── masterdefs.h
├── mers_mod_square.c
├── mi64.c
├── mi64.h
├── mi64_new.c.txt
├── pairFFT_mul.c
├── pair_square.c
├── pair_square.h
├── platform.h
├── pm1.c
├── prefetch.h
├── qfcheb.c.txt
├── qfcheb.h
├── qfloat.c
├── qfloat.h
├── radix09_sse_macro.h
├── radix1008_avx_negadwt_consts.h
├── radix1008_ditN_cy_dif1.c
├── radix1008_main_carry_loop.h
├── radix1024.h
├── radix1024_avx_negadwt_consts.h
├── radix1024_ditN_cy_dif1.c
├── radix1024_main_carry_loop.h
├── radix1024_twiddles.h
├── radix104_ditN_cy_dif1.c
├── radix10_ditN_cy_dif1.c
├── radix112_ditN_cy_dif1.c
├── radix11_ditN_cy_dif1.c
├── radix11_sse_macro.h
├── radix120_ditN_cy_dif1.c
├── radix128.h
├── radix128_ditN_cy_dif1.c
├── radix128_main_carry_loop.h
├── radix128_twiddles.h
├── radix12_ditN_cy_dif1.c
├── radix12_main_carry_loop.h
├── radix13.h
├── radix13_ditN_cy_dif1.c
├── radix13_sse_macro.h
├── radix144_ditN_cy_dif1.c
├── radix144_main_carry_loop.h
├── radix14_ditN_cy_dif1.c
├── radix15_ditN_cy_dif1.c
├── radix15_sse_macro.h
├── radix16.h
├── radix160_ditN_cy_dif1.c
├── radix160_main_carry_loop.h
├── radix16_dif_dit_pass.c
├── radix16_dif_dit_pass_asm.h
├── radix16_ditN_cy_dif1.c
├── radix16_ditN_cy_dif1_asm.h
├── radix16_dyadic_square.c
├── radix16_dyadic_square_gcc64.h
├── radix16_main_carry_loop.h
├── radix16_pairFFT_mul.c
├── radix16_utils_asm.h
├── radix16_wrapper_ini.c
├── radix16_wrapper_square.c
├── radix16_wrapper_square_gcc32.h
├── radix16_wrapper_square_gcc64.h
├── radix176_ditN_cy_dif1.c
├── radix176_main_carry_loop.h
├── radix17_dft.h
├── radix17_ditN_cy_dif1.c
├── radix18_ditN_cy_dif1.c
├── radix192_ditN_cy_dif1.c
├── radix192_main_carry_loop.h
├── radix208_ditN_cy_dif1.c
├── radix208_main_carry_loop.h
├── radix20_ditN_cy_dif1.c
├── radix20_ditN_cy_dif1_gcc32.h
├── radix20_ditN_cy_dif1_gcc64.h
├── radix20_main_carry_loop.h
├── radix224_ditN_cy_dif1.c
├── radix224_main_carry_loop.h
├── radix22_ditN_cy_dif1.c
├── radix240_ditN_cy_dif1.c
├── radix240_main_carry_loop.h
├── radix24_ditN_cy_dif1.c
├── radix24_ditN_cy_dif1_gcc32.h
├── radix24_ditN_cy_dif1_gcc64.h
├── radix24_main_carry_loop.h
├── radix256.h
├── radix256_ditN_cy_dif1.c
├── radix256_main_carry_loop.h
├── radix256_twiddles.h
├── radix26_ditN_cy_dif1.c
├── radix288_ditN_cy_dif1.c
├── radix288_main_carry_loop.h
├── radix28_ditN_cy_dif1.c
├── radix28_ditN_cy_dif1_gcc32.h
├── radix28_ditN_cy_dif1_gcc64.h
├── radix28_main_carry_loop.h
├── radix30_ditN_cy_dif1.c
├── radix31.h
├── radix31_ditN_cy_dif1.c
├── radix32.h
├── radix320_ditN_cy_dif1.c
├── radix320_main_carry_loop.h
├── radix32_dif_dit_pass.c
├── radix32_dif_dit_pass_asm.h
├── radix32_ditN_cy_dif1.c
├── radix32_ditN_cy_dif1_asm.h
├── radix32_dyadic_square.c
├── radix32_dyadic_square_gcc64.h
├── radix32_main_carry_loop.h
├── radix32_utils_asm.h
├── radix32_wrapper_ini.c
├── radix32_wrapper_square.c
├── radix32_wrapper_square_gcc32.h
├── radix32_wrapper_square_gcc64.h
├── radix352_ditN_cy_dif1.c
├── radix352_main_carry_loop.h
├── radix36_ditN_cy_dif1.c
├── radix36_main_carry_loop.h
├── radix384_ditN_cy_dif1.c
├── radix384_main_carry_loop.h
├── radix4032.h
├── radix4032_avx_negadwt_consts.h
├── radix4032_ditN_cy_dif1.c
├── radix4032_main_carry_loop.h
├── radix40_ditN_cy_dif1.c
├── radix40_main_carry_loop.h
├── radix44_ditN_cy_dif1.c
├── radix44_main_carry_loop.h
├── radix48_ditN_cy_dif1.c
├── radix48_main_carry_loop.h
├── radix512.h
├── radix512_ditN_cy_dif1.c
├── radix52_ditN_cy_dif1.c
├── radix52_main_carry_loop.h
├── radix56_ditN_cy_dif1.c
├── radix56_main_carry_loop.h
├── radix5_ditN_cy_dif1.c
├── radix60_ditN_cy_dif1.c
├── radix60_main_carry_loop.h
├── radix63_ditN_cy_dif1.c
├── radix63_main_carry_loop.h
├── radix64.h
├── radix64_ditN_cy_dif1.c
├── radix64_main_carry_loop.h
├── radix6_ditN_cy_dif1.c
├── radix72_ditN_cy_dif1.c
├── radix768_ditN_cy_dif1.c
├── radix768_main_carry_loop.h
├── radix7_ditN_cy_dif1.c
├── radix80_ditN_cy_dif1.c
├── radix88_ditN_cy_dif1.c
├── radix8_dif_dit_pass.c
├── radix8_dif_dit_pass_asm.h
├── radix8_ditN_cy_dif1.c
├── radix960_avx_negadwt_consts.h
├── radix960_ditN_cy_dif1.c
├── radix960_main_carry_loop.h
├── radix96_ditN_cy_dif1.c
├── radix992_ditN_cy_dif1.c
├── radix992_main_carry_loop.h
├── radix9_ditN_cy_dif1.c
├── rng_isaac.c
├── rng_isaac.h
├── sse2_macro.h
├── sse2_macro_gcc32.h
├── sse2_macro_gcc64.h
├── test_fft_radix.c
├── test_fft_radix.c.txt
├── threadpool.c
├── threadpool.h
├── twopmodq.c
├── twopmodq100.c
├── twopmodq100.h
├── twopmodq128.c
├── twopmodq128_96.c
├── twopmodq160.c
├── twopmodq192.c
├── twopmodq256.c
├── twopmodq64_test.c
├── twopmodq80.c
├── twopmodq80.h
├── twopmodq96.c
├── types.c
├── types.h
├── util.c
└── util.h
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "github-actions"
4 | directory: "/"
5 | schedule:
6 | interval: "monthly"
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | obj*/
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![CI](https://github.com/primesearch/Mlucas/actions/workflows/ci.yml/badge.svg)](https://github.com/primesearch/Mlucas/actions/workflows/ci.yml)
2 |
3 | # Mlucas
4 | Ernst Mayer's Mlucas and Mfactor programs for GIMPS
5 |
6 | [Ernst Mayer passed away unexpectedly](https://www.mersenneforum.org/showthread.php?t=28890) on September 10, 2023. This repository contains his posthumously released Mlucas v21 code, which is now maintained by the Great Internet Mersenne Prime Search (GIMPS) community. AutoPrimeNet (the Python PrimeNet program) previously bundled with Mlucas is now maintained in a [separate repository](https://github.com/tdulcet/AutoPrimeNet).
7 |
8 | Mlucas and Mfactor are 100% open source programs. Mlucas is for [primality](https://en.wikipedia.org/wiki/Primality_test) and [P-1](https://en.wikipedia.org/wiki/Pollard%27s_p_%E2%88%92_1_algorithm) testing of [Mersenne](https://en.wikipedia.org/wiki/Mersenne_prime) and [Fermat](https://en.wikipedia.org/wiki/Fermat_number) numbers, including support for the [Lucas-Lehmer](https://en.wikipedia.org/wiki/Lucas%E2%80%93Lehmer_primality_test), [Probable prime](https://en.wikipedia.org/wiki/Probable_prime) (PRP) and [Pépin](https://en.wikipedia.org/wiki/P%C3%A9pin%27s_test) tests. Mfactor is for trial factoring. They support x86 Intel and AMD, ARM and other CPUs.
9 |
10 | The original [Mlucas README](https://mersenneforum.org/mayer/README.html) is available for posterity and contains a lot of information, but note that it is no longer up to date. For more information about Mlucas v21, please see the [Ernst's Mlucas - the future](https://www.mersenneforum.org/showthread.php?t=28926) thread on the Mersenne Forum.
11 |
12 | Feature | | Mlucas | Prime95/MPrime
13 | --- | --- | ---: | ---:
14 | **Architectures** | x86 | ✔️ | ✔️
15 | \- | ARM | ✔️ |
16 | \- | Other | ✔️ |
17 | **Worktypes** | LL | ✔️ | ✔️
18 | \- | PRP | ✔️ | ✔️
19 | \- | P-1 | ✔️ | ✔️
20 | \- | P+1 | | ✔️
21 | \- | ECM | | ✔️
22 | \- | Pépin | ✔️ | ✔️
23 | **PRP** | Proofs | | ✔️
24 | \- | Certs | | ✔️
25 | **Error Checking** | Jacobi | | ✔️
26 | \- | Gerbicz | ✔️ | ✔️
27 | **Random Shifts** | | ✔️ | ✔️
28 | **Interface** | CLI | ✔️ | MPrime only
29 | \- | GUI | | Prime95 only
30 | **Multiple Workers** | | Separate runs | ✔️
31 | **PrimeNet Support** | | Separate program | ✔️
32 | **Max FFT Length** | | 256M<br>(**512M** with 0 shift) | 32M (AVX) -<br>64M (AVX512)
33 | **Largest Exponent** | | 4,294,967,231<br>(**8,937,021,911** with 0 shift) | 595,700,000 (AVX) -<br>1,169,000,000 (AVX512)
34 | **Performance** | | ~50-90% | **100%**
35 | **Free** 🆓 | | **Yes**, GPL | No, EULA
36 | **100% Open Source** | | ✔️ | Mostly
37 | **Claim Full EFF Awards** | | ✔️ |
38 |
39 | ## Usage
40 |
41 | ### Automatic method
42 |
43 | Linux users can use the [Mlucas install script](https://github.com/tdulcet/Distributed-Computing-Scripts#mlucas) to automatically download, build, set up and run Mlucas, including downloading, setting up and running [AutoPrimeNet](https://github.com/tdulcet/AutoPrimeNet) for automated PrimeNet assignments.
44 |
45 | ### Manual method
46 |
47 | Dependencies:
48 | * Make
49 | * GNU C or Clang compiler
50 | * \*GNU Multiple Precision (GMP) library
51 | * \*Portable Hardware Locality (hwloc) library
52 | * \*Python 3
53 |
54 | \* Optional
55 |
56 | #### Download
57 |
58 | ##### Linux
59 |
60 | 1. Verify that the dependencies above are installed. On Debian and Ubuntu, run: `sudo apt update` and `sudo apt install build-essential libgmp-dev libhwloc-dev`.
61 | 2. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `wget https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`.
62 | 3. To download AutoPrimeNet, run: `wget -nv https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`.
63 |
64 | ##### macOS
65 |
66 | 1. Verify that the dependencies above are installed. Run: `brew install gmp hwloc`.
67 | 2. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `curl -fLO https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`.
68 | 3. To download AutoPrimeNet, run: `curl -sSfLO https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`.
69 |
70 | ##### Windows
71 |
72 | Native Windows builds are experimental. For now, Windows users should use the [Windows Subsystem for Linux](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) (WSL) and follow the [Linux](#linux) instructions above instead.
73 |
74 | 1. Download and install [MSYS2](https://www.msys2.org/).
75 | 2. Verify that the dependencies above are installed. With the MINGW64 environment, run: `pacman -S mingw-w64-x86_64-gmp mingw-w64-x86_64-hwloc`.
76 | 3. If one has git installed, just run: `git clone https://github.com/primesearch/Mlucas.git`. Otherwise, download the latest archive: `wget https://github.com/primesearch/Mlucas/archive/main.tar.gz` and then decompress the files: `tar -xzvf main.tar.gz`.
77 | 4. To download AutoPrimeNet, run: `wget -nv https://raw.github.com/tdulcet/AutoPrimeNet/main/autoprimenet.py`.
78 |
79 | #### Build
80 |
81 | 1. Change into the `Mlucas` directory. Run: `cd Mlucas` or `cd Mlucas-main` depending on which method one used to download it.
82 | 2. Run:
83 | * To build Mlucas: `bash makemake.sh [use_hwloc]`.
84 | * To build Mfactor: `bash makemake.sh mfac [word]`, where `word` is optionally one of `1word`, `2word`, `3word`, `4word` or `nword`.
85 |
86 | To build with Clang or another compiler instead of GCC, run: `export CC=<compiler>`, for example: `export CC=clang`.
87 |
88 | #### Setup and Run
89 |
90 | 1. Change into the `obj` directory. Run: `cd obj` or `cd obj_mfac` depending on whether one built Mlucas or Mfactor respectively.
91 |
92 | This README is still in progress. For now, see the original [Mlucas README](https://mersenneforum.org/mayer/README.html), which has more information about how to set up and run Mlucas. Also see [Help](#help) below. Note that with Mlucas v21, if built with the hwloc library, one would want to use the new `-core` option instead of `-cpu`.
93 |
94 | ## Help
95 |
96 | The [help.txt](help.txt) file includes a variety of usage information not covered in the original [README](https://mersenneforum.org/mayer/README.html), concentrating largely on the Mlucas command line options. A separate documentation page covers [Fermat numbers](docs/Fermat-testing.md).
97 |
98 | ## Contributing
99 |
100 | Pull requests welcome!
101 |
--------------------------------------------------------------------------------
/config-fermat.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Shell script for generating fermat.cfg; Mlucas output saved to config-fermat.log
4 |
5 | ################################################################################
6 | # #
7 | # (C) 2024 by Catherine Cowie and Teal Dulcet. #
8 | # #
9 | # This program is free software; you can redistribute it and/or modify it #
10 | # under the terms of the GNU General Public License as published by the #
11 | # Free Software Foundation; either version 2 of the License, or (at your #
12 | # option) any later version. #
13 | # #
14 | # This program is distributed in the hope that it will be useful, but WITHOUT #
15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
16 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
17 | # more details. #
18 | # #
19 | # You should have received a copy of the GNU General Public License along #
20 | # with this program; see the file GPL.txt. If not, you may view one at #
21 | # http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the #
22 | # Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #
23 | # 02111-1307, USA. #
24 | # #
25 | ################################################################################
26 |
27 | # Path to the Mlucas executable to run
28 | MLUCAS=./Mlucas
29 |
30 | # Number of iterations (use 100, 1000, or 10000 to match pre-computed values)
31 | ITERS=100
32 |
33 | # Minimum Fermat number (15 or greater)
34 | MIN=15
35 |
36 | # Maximum Fermat number (33 or less)
37 | MAX=29
38 |
39 | # Extra Mlucas arguments, appended unchanged to every run
40 | ARGS=(
41 | 	"$@"
42 | 	# Add desired -cpu or -core settings here, or as following arguments, e.g. bash ../config-fermat.sh -cpu 0:3
43 | )
44 |
45 | # FFTS maps an FFT length in K (the array index) to the Fermat number Fn (the value) tested at that length.
46 | # First, tiny FFT lengths for F15 to F17 (note 4K is the smallest workable length without fiddly radix settings);
47 | FFTS=([2]=15 [4]=16 [7]=17 [8]=17)
48 | # Then, from small up to egregiously large FFTs for F18 to F33.
49 | # The largest FFT reached is 512M, if MAX is set to 33.
50 | # Note that large FFTs require considerable runtime at 10000 iterations.
51 | for ((n = 0; n < 16; ++n)); do
52 | 	m=$((1 << n))
53 | 	f=$((18 + n))
54 | 	if ((n < 11)); then
55 | 		# k = 7 multiples (7K, 14K, ...) become unworkable after F28 (14M).
56 | 		FFTS[14 * m]=$f
57 | 	fi
58 | 	# k = 15, 16 should both be supported up to at least F32.
59 | 	FFTS[15 * m]=$f
60 | 	FFTS[16 * m]=$f
61 | 	if ((n > 5)); then
62 | 		# k = 63 is mostly supported for F24 (1008K) and above.
63 | 		FFTS[63 * m >> 2]=$f
64 | 	fi
65 | done
66 | # Run one timing test per FFT length. The keys of an indexed array expand in ascending
67 | # index order, so FFT lengths (and with them the Fermat numbers) are visited smallest
68 | # first; the 'break' on MAX below relies on that ordering.
69 | for fft in "${!FFTS[@]}"; do
70 | 	f=${FFTS[fft]}
71 | 	if [[ -n $MIN && $f -lt $MIN ]]; then
72 | 		continue
73 | 	elif [[ -n $MAX && $f -gt $MAX ]]; then
74 | 		break
75 | 	fi
76 | 	printf '\n\tTesting F%s (2^%s + 1),\tFFT length: %sK\n\n' "$f" $((1 << f)) "$fft"
77 | 	# Start from the user-supplied arguments and add the per-test extras
78 | 	args=("${ARGS[@]}")
79 | 	# The very fiddly F15 needs an explicit radix set
80 | 	if [[ $f -eq 15 ]]; then
81 | 		args+=(-radset 8,8,16)
82 | 	fi
83 | 	# The smallest (F15-F17) and largest (F32 and up) cases are run with a zero shift.
84 | 	# NOTE(review): presumably nonzero shifts are unsupported at these sizes - confirm.
85 | 	if [[ $f -le 17 || $f -ge 32 ]]; then
86 | 		args+=(-shift 0)
87 | 	fi
88 | 	# The complete output is appended to config-fermat.log; only lines matching the keywords are shown.
89 | 	# MLUCAS and ITERS are quoted so that a path containing spaces is passed as a single word.
90 | 	time "$MLUCAS" -f "$f" -fft "$fft" -iters "$ITERS" "${args[@]}" 2>&1 | tee -a config-fermat.log | grep -i 'error\|warn\|assert\|writing\|pmax_rec\|fft radices'
91 | done
92 |
--------------------------------------------------------------------------------
/docs/irrational.txt:
--------------------------------------------------------------------------------
1 | 24 Feb 2022
2 | Prove irrationality of sqrt(2) via N-R iteration formula?
3 |
4 | Let f(x) = x^(-2) − c, applying N-R (dx = -f/f' = (c - 1/x^2)/(-2/x^3) = x.(1-cx^2)/2) to this yields a second-order iterative formula for the reciprocal square-root of the computationally efficient kind we seek, with a per-iteration cost of 1 ADD and 4 MUL:
5 | x_n+1 = x*(3 - c*x^2)/2
6 | Fixed point(s) x* of the iteration given by dx = x*.(1-c.x*^2)/2 = 0, with nonzero solutions x* = +- 1/sqrt(c).
7 | Assume x* rational, i.e. x* = 1/sqrt(c) = p/q. In terms of p and q our iteration is
8 | x_n+1 = (p/q)*(3 - c*(p/q)^2)/2 = p.(3.q^2 - c.p^2)/(2.q^3), i.e. p' = p*(3*q^2 - c*p^2), q' = (2*q^3)
9 | Example: c = 2, x0 = 1, x_n+1 = x_n*(3 - 2*x_n^2)/2 ... if x_n = p/q, have
10 | bc:
11 | p=q=1
12 | p *= (3*q^2-2*p^2); q = (2*q^3); g = gcd(p,q); p /= g; q /= g; print "gcd = ",g,": p = ",p,", q = ",q,"\n"
13 | n x_n = p/q factorization of p,q
14 | 0 1
15 | 1 1/2
16 | 2 5/8 5,2^3
17 | 3 355/512 5.71,2^9
18 | 4 94852805/134217728 5.23.71.11617,2^27
19 | 5 1709678476417571835487555/2417851639229258349412352 5.23.71.5741.8837.11617.355280903,2^81
20 | 6 p = 5.23.71.3023.5741.8837.11617.27509.355280903.70298580191725636724693742124090124808533, q = 2^243
21 | ...
22 | We observe that for each iteration, gcd(p',q') = 2. Also:
23 | o Once p has a given odd factor, subsequent iterations merely add more odd factors to p
24 | [Q: Are said odd factors all distinct, i.e. is p squarefree?]
25 | o q = 2^k, with k tripling on each iteration
26 | Q: Is there a similar trend for other initial choices of p,q?
27 | p0 = 4, q0 = 5:
28 | n x_n = p/q
29 | 0 4/5
30 | 1 86/125
31 | 2 43.32083/5^9
32 | 3 43.32083.308933.24722741/2.5^27
33 | 4 43.1987.32083.197947.308933.5926127.24722741.51537769.1848407118139843/2^3.5^81; so, more observations:
34 | o Any power of 2 in p is reduced by 1 each iteration until p odd, q = 2.odd
35 | o Each distinct prime in the factorization of q has its power tripled each iteration
36 | o Assuming p0,q0 in reduced form (gcd(p,q) = 1), again we have gcd(p,q) = 2 each iteration.
37 |
38 | Without loss of generality we can consider the initial iterate within the basin of monotone convergence and its p0,q0 reduced, i.e. gcd(p0,q0) = 1, thus p0=q0=1 or p0,q0 have opposite parity, and:
39 | 1: p0=q0=1 yields next-iterate p = 1, q = 2, thus of form [2] below.
40 | 2: For p0 odd, q0 even: both 2.p^2 and 3.q^2 even and numerator p*(3*q^2 - 2*p^2) = 2*odd, thus gcd(p',q') = 2
41 | 3: For p0 even, q0 odd: 2.p^2 even and 3.q^2 odd; p*(3*q^2 - 2*p^2) even, denominator 2.q^3 = 2*odd, thus gcd(p',q') = 2
42 | In case [3] the unreduced numerator is divisible by 2^k with k > 1; since (3*q^2 - 2*p^2) odd, said power of 2 is the same as contained in the input value p0, and the ensuing division by the gcd = 2 reduces it by 1, thus after k further iterations we fall into pattern [2] and remain there (e.g. p0,q0 = 4,5 give p = 86,1379569,... and q = 125,1953125,...; p0,q0 = 8,9 give p = 460,269358290,41100860142614334318305635,... and q = 729,387420489,58149737003040059690390169,...).
43 | Thus after a finite number of iterations we inevitably settle into pattern [2] and remain there, thus the iteration converges in the sense that p/q approaches a limit but p,q never do because their gcd remains fixed at 2. QED
44 | *** Not quite - need to show that (or if) gcd cannot include an odd prime ***
45 | For q0 = 2 that is easy - denominator = 2.q^3, if it starts as a power of 2 it stays there.
46 |
47 | Now try c = 3: Iterative-update is p = p.(3.q^2 - c.p^2) = 3.p.(q^2 - p^2), q = (2.q^3).
48 | Again use p0 = 1, q0 = 2:
49 | n x_n = p/q
50 | 1 3^2/2^4
51 | 2 3^3.5^2.7/2^13
52 | 3 3^4.5^2.7.3467.12917/2^40
53 | Denominator = 2^k, with k -> 3.k+1 on each iteration, i.e. k = (3^(n+1) - 1)/2; if p0 odd, numerator = odd.odd.(even-odd) always odd, hence sqrt(3) irrational.
54 |
55 | Now try a (rational)^2, c = 9/16, yielding p = 3.p.(16.q^2 - 3.p^2), q = (2^5.q^3), same initial guess:
56 | n x_n = p/q
57 | 0 1/2
58 | 1 3.61/2^8
59 | 2 3^2.61.107.1511/2^25
60 | ... this clearly converges -> 4/3, but here's the rub: p/q can converge in the sense of the limit as n -> oo, but p and q converge only in this same sense, i.e. there's no reason to expect gcd(p,q) to magically hit a nonzero value such that the resulting gcd-reduced p = 4 and q = 3 in a finite number of steps.
61 |
62 |
--------------------------------------------------------------------------------
/docs/pm1_compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/primesearch/Mlucas/5e6465318b8c656ffb83025229038f5c2614fa35/docs/pm1_compare.png
--------------------------------------------------------------------------------
/docs/pm1_compare.txt:
--------------------------------------------------------------------------------
1 | P-1 relative modmul count for stage 2 with b1=1m, b2=30m, best-bigstep option as tabulated in the
2 | comments preceding my pm1_bigstep_size() function, for various prime-pairing #memory buffers:
3 | #buf #modmul
4 | 24 1.00000000000000000000
5 | 40 0.94138773325629738116
6 | 48 0.92740481217168044188
7 | 72 0.86767843536398087761
8 | 80 0.86074072083480994905
9 | 96 0.83101581644392863201
10 | 120 0.80442105746400556719
11 | 144 0.78374955701528234292
12 | 160 0.77282735940798766528
13 | 168 0.76741692741880852926
14 | 192 0.75413606363080072042
15 | 200 0.74602321491854959665
16 | 216 0.74320938718907791436
17 | 240 0.72551239265493547959
18 | 280 0.70928837479334378021
19 | 320 0.69613179866071653512
20 | 336 0.69459499859756496969
21 | 360 0.68523479449708007988
22 | 384 0.68135948300814196113
23 | 400 0.67617411244897627841
24 | 432 0.67047815476500395537
25 | 440 0.66840613398769104328
26 | 480 0.66141411359107905758
27 | 520 0.65619459191939889563
28 | 528 0.65365901177877469167
29 | 560 0.65110495644613445796
30 | 576 0.64716694127520254129
31 | 600 0.64677560311704482025
32 | 624 0.64151017339247634195
33 | 672 0.63652914965409401857
34 | 720 0.63217516273564967452
35 | 768 0.62836031551149128950
36 | 816 0.62483771223376828414
37 | 864 0.62172716172343309377
38 | 912 0.61890717559662273489
39 | 960 0.61633464507196647144
40 | 1008 0.61408235120892139029
41 | 1040 0.61240950655001542398
42 | 1056 0.61196946106745180634
43 | 1104 0.60997358047541707745
44 | 1120 0.60748670765919877011
45 | 1200 0.60309241123090127025
46 | 1280 0.59918350848375218833
47 | 1360 0.59569673588145421035
48 | 1440 0.59258058682808385945
49 | 1520 0.58975556201254185610
50 | 1600 0.58720542566002623477
51 | 1680 0.58489434709511196806
52 | 1760 0.58283856209260102151
53 | 1824 0.58255863494084299501
54 | 1840 0.58092777935470073268
55 | 1920 0.57913848500066342734
56 | 2000 0.57749587247414732789
57 | 2080 0.57601449798704385170
58 | 2112 0.57561308245142284171
59 | 2160 0.57463221771166271689
60 | 2208 0.57365527195202720444
61 | 2240 0.57334175354205821477
62 | 2304 0.57187661482975670411
63 | 2400 0.57025303734956015046
64 |
--------------------------------------------------------------------------------
/docs/pm1_d210.txt:
--------------------------------------------------------------------------------
1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 210, for various prime-pairing #memory buffers:
2 | #buf #modmul
3 | 24 8482142
4 | 72 7462541
5 | 120 6903803
6 | 168 6578211
7 | 216 6369191
8 | 264 6213292
9 | 312 6102927
10 | 360 6019017
11 | 408 5948963
12 | 456 5893249
13 | 504 5848289
14 | 552 5809671
15 | 600 5779087
16 | 648 5750744
17 | 696 5727095
18 | 744 5706454
19 | 792 5687784
20 | 840 5671474
21 | 888 5657449
22 | 936 5643928
23 | 984 5631970
24 | 1032 5621610
25 | 1080 5610944
26 | 1128 5602213
27 | 1176 5594236
28 | 1224 5587383
29 | 1272 5579976
30 | 1320 5573772
31 | 1368 5568653
32 | 1416 5562780
33 | 1464 5557665
34 | 1512 5552816
35 | 1560 5548156
36 | 1608 5543827
37 | 1656 5539520
38 | 1704 5535999
39 | 1752 5532637
40 | 1800 5529301
41 | 1848 5526107
42 | 1896 5523275
43 | 1944 5520469
44 | 1992 5517760
45 | 2040 5515237
46 | 2088 5513037
47 | 2136 5510383
48 | 2184 5508344
49 | 2232 5506230
50 | 2280 5504316
51 | 2328 5502474
52 | 2376 5500748
53 |
--------------------------------------------------------------------------------
/docs/pm1_d330.txt:
--------------------------------------------------------------------------------
1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 330, for various prime-pairing #memory buffers:
2 | #buf #modmul
3 | 40 8047788
4 | 120 7036479
5 | 200 6488371
6 | 280 6120697
7 | 360 5927827
8 | 440 5764103
9 | 520 5645736
10 | 600 5550700
11 | 680 5481350
12 | 760 5426563
13 | 840 5378130
14 | 920 5339380
15 | 1000 5305896
16 | 1080 5275177
17 | 1160 5247276
18 | 1240 5225875
19 | 1320 5206908
20 | 1400 5188988
21 | 1480 5174005
22 | 1560 5159755
23 | 1640 5147646
24 | 1720 5134944
25 | 1800 5125084
26 | 1880 5115890
27 | 1960 5107361
28 | 2040 5099600
29 | 2120 5092087
30 | 2200 5085266
31 | 2280 5077999
32 | 2360 5072329
33 |
--------------------------------------------------------------------------------
/docs/pm1_d420.txt:
--------------------------------------------------------------------------------
1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 420, for various prime-pairing #memory buffers:
2 | #buf #modmul
3 | 48 7799105
4 | 144 6767956
5 | 240 6212962
6 | 336 5887052
7 | 432 5678594
8 | 528 5520431
9 | 624 5413006
10 | 720 5329443
11 | 816 5259481
12 | 912 5203978
13 | 1008 5159100
14 | 1104 5119750
15 | 1200 5088646
16 | 1296 5060438
17 | 1392 5036585
18 | 1488 5016366
19 | 1584 4997817
20 | 1680 4981221
21 | 1776 4966761
22 | 1872 4953372
23 | 1968 4941632
24 | 2064 4931285
25 | 2160 4921051
26 | 2256 4911881
27 | 2352 4904025
28 |
--------------------------------------------------------------------------------
/docs/pm1_d660.txt:
--------------------------------------------------------------------------------
1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 660, for various prime-pairing #memory buffers:
2 | #buf #modmul
3 | 80 7599932
4 | 240 6585540
5 | 400 6054425
6 | 560 5683123
7 | 720 5486768
8 | 880 5324522
9 | 1040 5204859
10 | 1200 5109664
11 | 1360 5038998
12 | 1520 4984967
13 | 1680 4937214
14 | 1840 4898839
15 | 2000 4865619
16 | 2160 4834781
17 | 2320 4807728
18 |
--------------------------------------------------------------------------------
/docs/pm1_d840.txt:
--------------------------------------------------------------------------------
1 | P-1 (modmul count/10^7) for stage 2 with b1=5m, b2=150m, bigstep = 840, for various prime-pairing #memory buffers:
2 | #buf #modmul
3 | 96 7451024
4 | 288 6422689
5 | 480 5869480
6 | 672 5542247
7 | 864 5332830
8 | 1056 5174753
9 | 1248 5067407
10 | 1440 4984033
11 | 1632 4914157
12 | 1824 4858498
13 | 2016 4813486
14 | 2208 4773984
15 | 2400 4743455
16 |
--------------------------------------------------------------------------------
/docs/predefs_mac.txt:
--------------------------------------------------------------------------------
1 | #define __DBL_MIN_EXP__ (-1021)
2 | #define __FLT_MIN__ 1.17549435e-38F
3 | #define __DEC64_DEN__ 0.000000000000001E-383DD
4 | #define TRUE 1
5 | #define __CHAR_BIT__ 8
6 | #define BIT_CLR(x,b) ( (x) &= ~(1 << (b)) )
7 | #define CPU_NAME "x86_64"
8 | #define ALIGN_VEC_U64(_p) ALIGN_UINT64(_p)
9 | #define __WCHAR_MAX__ 2147483647
10 | #define __DBL_DENORM_MIN__ 4.9406564584124654e-324
11 | #define __FLT_EVAL_METHOD__ 0
12 | #define STRNEQN(s1,s2,n) ( strncmp(s1,s2,n))
13 | #define ALIGN_f128(_p) (__float128 *)(((long)(_p) | 127)+1)
14 | #define __DBL_MIN_10_EXP__ (-307)
15 | #define __FINITE_MATH_ONLY__ 0
16 | #define ALLOC_COMPLEX(_p,_n) (struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512)
17 | #define L2_SZ_VD 3
18 | #define ALIGN_COMPLEX(_p) (struct complex*)(((long)(_p) | 127)+1)
19 | #define __DEC64_MAX_EXP__ 384
20 | #define __SHRT_MAX__ 32767
21 | #define __LDBL_MAX__ 1.18973149535723176502e+4932L
22 | #define __APPLE_CC__ 5666
23 | #define __UINTMAX_TYPE__ long unsigned int
24 | #define __DEC32_EPSILON__ 1E-6DF
25 | #define __block __attribute__((__blocks__(byref)))
26 | #define ALLOC_INT64(_p,_n) (int64 *)realloc(_p,(_n)*sizeof(int64 )+256)
27 | #define ALIGN_UINT64(_p) (uint64 *)(((long)(_p) | 63)+1)
28 | #define STREQ(s1,s2) (!strcmp(s1,s2))
29 | #define __SCHAR_MAX__ 127
30 | #define HERE __LINE__, __FILE__
31 | #define align_h_included
32 | #define __USER_LABEL_PREFIX__ _
33 | #define __STDC_HOSTED__ 1
34 | #define ALLOC_UINT128(_p,_n) (uint128 *)realloc(_p,(_n+_n)*sizeof(uint64 )+256)
35 | #define __DEC64_MIN_EXP__ (-383)
36 | #define BIT_SETC(x,b,condition) ( (x) |= ((condition) << (b)) )
37 | #define __DBL_DIG__ 15
38 | #define __FLT_EPSILON__ 1.19209290e-7F
39 | #define ALLOC_POINTER(_p,_ptr_type,_n) (_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64)
40 | #define __LDBL_MIN__ 3.36210314311209350626e-4932L
41 | #define __DEC32_MAX__ 9.999999E96DF
42 | #define OS_POSIX_COMPLIANT
43 | #define __strong
44 | #define COMPILER_NAME "Gnu C [or other compatible]"
45 | #define __APPLE__ 1
46 | #define __DECIMAL_DIG__ 21
47 | #define SZ_VDM1 7
48 | #define __LDBL_HAS_QUIET_NAN__ 1
49 | #define ALLOC_DOUBLE(_p,_n) (double *)realloc(_p,(_n)*sizeof(double )+512)
50 | #define __DYNAMIC__ 1
51 | #define __GNUC__ 4
52 | #define __MMX__ 1
53 | #define __FLT_HAS_DENORM__ 1
54 | #define ALLOC_VEC_DBL(_p,_n) ALLOC_DOUBLE(_p,_n)
55 | #define __DBL_MAX__ 1.7976931348623157e+308
56 | #define __DBL_HAS_INFINITY__ 1
57 | #define ALLOC_FLOAT(_p,_n) (float *)realloc(_p,(_n)*sizeof(float )+256)
58 | #define __DEC32_MIN_EXP__ (-95)
59 | #define ALIGN_UINT128(_p) (uint128 *)(((long)(_p) | 63)+1)
60 | #define OBJC_NEW_PROPERTIES 1
61 | #define __LDBL_HAS_DENORM__ 1
62 | #define __DEC32_MIN__ 1E-95DF
63 | #define __weak __attribute__((objc_gc(weak)))
64 | #define ALLOC_f128(_p,_n) (__float128 *)realloc(_p,(_n)*sizeof(__float128 )+512)
65 | #define __DBL_MAX_EXP__ 1024
66 | #define __DEC128_EPSILON__ 1E-33DL
67 | #define __SSE2_MATH__ 1
68 | #define STRNEQ(s1,s2) ( strcmp(s1,s2))
69 | #define __amd64 1
70 | #define __tune_core2__ 1
71 | #define __LONG_LONG_MAX__ 9223372036854775807LL
72 | #define IS_ODD(a) ( (int)(a) & 1)
73 | #define NINT(x) floor(x + 0.5)
74 | #define BIT_SET(x,b) ( (x) |= (1 << (b)) )
75 | #define platform_h_included
76 | #define FP_MANTISSA_BITS_DOUBLE 64
77 | #define __GXX_ABI_VERSION 1002
78 | #define COMPILER_TYPE_GCC
79 | #define ALIGN_INT(_p) (int *)(((long)(_p) | 63)+1)
80 | #define __FLT_MIN_EXP__ (-125)
81 | #define DNINT(x) lrint((x))
82 | #define __x86_64 1
83 | #define CPU_SUBTYPE_NAME "Unknown CPU subtype"
84 | #define __DBL_MIN__ 2.2250738585072014e-308
85 | #define COMPILER_VERSION __VERSION__
86 | #define ALIGN_VEC_DBL(_p) ALIGN_DOUBLE(_p)
87 | #define __LP64__ 1
88 | #define __DBL_HAS_QUIET_NAN__ 1
89 | #define ALLOC_INT(_p,_n) (int *)realloc(_p,(_n)*sizeof(int )+256)
90 | #define __DEC128_MIN__ 1E-6143DL
91 | #define __REGISTER_PREFIX__
92 | #define __DBL_HAS_DENORM__ 1
93 | #define __NO_INLINE__ 1
94 | #define __DEC_EVAL_METHOD__ 2
95 | #define types_h_included
96 | #define __DEC128_MAX__ 9.999999999999999999999999999999999E6144DL
97 | #define __FLT_MANT_DIG__ 24
98 | #define __VERSION__ "4.2.1 (Apple Inc. build 5666) (dot 3)"
99 | #define MOD_ADD32(__x,__y,__q,__z) { uint64 _xx = __x, _yy = __y, _qq = __q, _zz = __z; MOD_ADD64(_xx, _yy, _qq, _zz); __z = (uint32)_zz; }
100 | #define ALLOC_QFLOAT(_p,_n) ALLOC_UINT128(_p,_n)
101 | #define ARRAYS_DISJOINT(xarr,lenx,yarr,leny) ((yarr+leny <= xarr) || (yarr >= xarr+lenx))
102 | #define MOD_ADD64(__x,__y,__q,__z) { uint64 cy,tmp; tmp = __x + __y; cy = tmp < __x; __z = tmp - __q; cy -= __z > tmp; __z = __z + (cy & __q); }
103 | #define IS_EVEN(a) (~(int)(a) & 1)
104 | #define HACK_ALIGN_STACK_ODD()
105 | #define __DEC64_EPSILON__ 1E-15DD
106 | #define __DEC128_MIN_EXP__ (-6143)
107 | #define __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ 1068
108 | #define __SIZE_TYPE__ long unsigned int
109 | #define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((long)(_p) | 63)+1)
110 | #define __DEC32_DEN__ 0.000001E-95DF
111 | #define CMUL(ar,ai,br,bi,cr,ci) { double __tmp = ar; ci = __tmp*bi + ai*br; cr = __tmp*br - ai*bi;}
112 | #define __FLT_RADIX__ 2
113 | #define __LDBL_EPSILON__ 1.08420217248550443401e-19L
114 | #define SGN(x,b) ((b) == 1 ? -(x) : (x))
115 | #define __SSE_MATH__ 1
116 | #define __k8 1
117 | #define __LDBL_DIG__ 18
118 | #define __x86_64__ 1
119 | #define OS_VERSION "[Unknown]"
120 | #define HACK_ALIGN_STACK_EVEN()
121 | #define ABS(a) ((a) < 0 ? -(a) : (a))
122 | #define X32_ASM
123 | #define __FLT_HAS_QUIET_NAN__ 1
124 | #define __FLT_MAX_10_EXP__ 38
125 | #define __LONG_MAX__ 9223372036854775807L
126 | #define __FLT_HAS_INFINITY__ 1
127 | #define __DEC64_MAX__ 9.999999999999999E384DD
128 | #define ALIGN_UINT(_p) (uint *)(((long)(_p) | 63)+1)
129 | #define __DEC64_MANT_DIG__ 16
130 | #define OS_TYPE
131 | #define __DEC32_MAX_EXP__ 96
132 | #define __DEC128_DEN__ 0.000000000000000000000000000000001E-6143DL
133 | #define MOD_SUB32(__x,__y,__q,__z) { uint64 _xx = __x, _yy = __y, _qq = __q, _zz = __z; MOD_SUB64(_xx, _yy, _qq, _zz); __z = (uint32)_zz; }
134 | #define OS_TYPE_MACOSX
135 | #define ALLOC_UINT64(_p,_n) (uint64 *)realloc(_p,(_n)*sizeof(uint64 )+256)
136 | #define __LITTLE_ENDIAN__ 1
137 | #define CPU_IS_X86_64
138 | #define RE_IM_STRIDE 1
139 | #define MOD_SUB64(__x,__y,__q,__z) { uint64 bw,tmp; tmp = __x - __y; bw = tmp > __x; __z = tmp + __q; bw -= __z < tmp; __z = __z - (bw & __q); }
140 | #define __LDBL_MANT_DIG__ 64
141 | #define __CONSTANT_CFSTRINGS__ 1
142 | #define ALIGN_DOUBLE(_p) (double *)(((long)(_p) | 127)+1)
143 | #define ALLOC_UINT(_p,_n) (uint *)realloc(_p,(_n)*sizeof(uint )+256)
144 | #define __DEC32_MANT_DIG__ 7
145 | #define __k8__ 1
146 | #define __WCHAR_TYPE__ int
147 | #define FALSE 0
148 | #define __pic__ 2
149 | #define MULH64_FAST
150 | #define __FLT_DIG__ 6
151 | #define __INT_MAX__ 2147483647
152 | #define ALIGN_INT64(_p) (int64 *)(((long)(_p) | 63)+1)
153 | #define __FLT_MAX_EXP__ 128
154 | #define __BLOCKS__ 1
155 | #define __DBL_MANT_DIG__ 53
156 | #define CPU_TYPE
157 | #define __DEC64_MIN__ 1E-383DD
158 | #define __WINT_TYPE__ int
159 | #define __SSE__ 1
160 | #define __LDBL_MIN_EXP__ (-16381)
161 | #define __MACH__ 1
162 | #define X64_ASM
163 | #define __amd64__ 1
164 | #define __LDBL_MAX_EXP__ 16384
165 | #define __SSP__ 1
166 | #define ARRAYS_OVERLAP(xarr,lenx,yarr,leny) !ARRAYS_DISJOINT(xarr,lenx,yarr,leny)
167 | #define __LDBL_MAX_10_EXP__ 4932
168 | #define __DBL_EPSILON__ 2.2204460492503131e-16
169 | #define _LP64 1
170 | #define __GNUC_PATCHLEVEL__ 1
171 | #define __LDBL_HAS_INFINITY__ 1
172 | #define __INTMAX_MAX__ 9223372036854775807L
173 | #define __FLT_DENORM_MIN__ 1.40129846e-45F
174 | #define __PIC__ 2
175 | #define OS_BITS 64
176 | #define __FLT_MAX__ 3.40282347e+38F
177 | #define __SSE2__ 1
178 | #define BIT_FLIP(x,b) ( (x) ^= (1 << (b)) )
179 | #define __FLT_MIN_10_EXP__ (-37)
180 | #define __INTMAX_TYPE__ long int
181 | #define __DEC128_MAX_EXP__ 6144
182 | #define ALLOC_VEC_U64(_p,_n) ALLOC_UINT64(_p,_n)
183 | #define __GNUC_MINOR__ 2
184 | #define __DBL_MAX_10_EXP__ 308
185 | #define SZ_VD 8
186 | #define __LDBL_DENORM_MIN__ 3.64519953188247460253e-4951L
187 | #define MAX(a,b) ((a) > (b) ? (a) : (b))
188 | #define __STDC__ 1
189 | #define __PTRDIFF_TYPE__ long int
190 | #define ALIGN_FLOAT(_p) (float *)(((long)(_p) | 63)+1)
191 | #define ALIGN_QFLOAT(_p) ALIGN_UINT128(_p)
192 | #define STREQN(s1,s2,n) (!strncmp(s1,s2,n))
193 | #define OS_NAME "OS X"
194 | #define __DEC128_MANT_DIG__ 34
195 | #define __LDBL_MIN_10_EXP__ (-4931)
196 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
197 | #define BIT_TEST(x,b) ( ((x) >> (b)) & 1 )
198 | #define __GNUC_GNU_INLINE__ 1
199 | #define COMPILER_TYPE
200 | #define __SSE3__ 1
201 |
--------------------------------------------------------------------------------
/docs/qs.txt:
--------------------------------------------------------------------------------
1 | A p-1 run found the following 53-digit composite factor of M(109228331), which factors into p25*p29:
2 | q = 67043584777242522312784510096836476580550779917618449 = 3258278300321182416433937 * 20576383782390150543028926977
3 | The prime factors themselves have p-1 factorizations
4 | p25-1 = p*2^4.11.113.305611.4907867
5 | p29-1 = p*2^9.577.20929.28687.1062073, which is why a p-1 run to B1 = 10^6 and B2 > 10^6 found them both in stage 2.
6 |
7 | For such composite factors of M(p) = q1.q2
8 | = (2.k1.p+1).(2.k2.p+1) = 4.k1.k2.p^2 + 2.(k1+k2).p + 1 = (2.k1.k2.p + k1 + k2).2.p + 1 = 2.p.F + 1,
9 | where F := (2.k1.k2.p + k1 + k2) = 306896499120006339347913821446363016297258904 ... Need to find k1,k2!
10 | Seems we could do better than e.g. ECM or QS on (n-1), since we have that k1,k2 must satisfy F == k1 + k2 (mod p).
11 |
12 | If k1,k2 < p, things are easy: F/2p = k1.k2, i.e. quotient Q = k1.k2, remainder R = k1 + k2, can just brute-force loop over all k1 <= sqrt(Q) which divide Q, compute k2 = Q/k1, see if R = k1+k2. Will this work in the general case where k1,k2 may be quite a bit larger than 2p? For the above example, k1 = 14914987121432728, k2 = 94189774731567355648. The true 2.k1.k2.p = 306896499120006339347913727241673297608470528,
13 | whereas q/2p = 306896499120006339347913821446363016297258904; difference too large to make the above idea workable.
14 |
15 | Know F == (k1+k2) mod 2p; in our case (k1+k2) == 98326026 mod 2p, useless because requires a priori knowledge of k1,k2.
16 |
17 | 5/23/21: Can at least do p-1 with S1 seed = p on n ... wait:
18 | [We open our next scene with a hand slapping the owner's forehead, accompanied by the utterance "doh!"]
19 |
20 | Re above: In fact it seems silly to use powerful general-modulus factoring machinery like ECM or QS on such (p-1)-found factor-product composites. Here's why: say we have some product of prime factors F = f1*f2*...*fn discovered by running p-1 to stage bounds b1 and b2 on an input Mersenne M(p) (or other bignum modulus with factors of a known form, allowing p-1 to be 'seeded' with a component of same). BY DEFINITION, each prime factor f1-fn will be b1/b2-smooth, in the sense that fj = 2*p*C + 1, where C is a composite all of whose prime factors are <= b1, save possibly one outlier-prime factor > b1 and <= b2. Thus if we again run p-1 to bounds b1/b2, but now with arithmetic modulo the relatively tiny factor product F, we are guaranteed to resolve all the prime factors f1-fn - the only trick is that we will need to do multiple GCDs along the way in order to capture the individual prime factors f1,...,fn, rather than have this secondary p-1 run modulo F again produce the same composite GCD = F which the original p-1 run mod M(p) did. Again, though, since in the followup p-1 run we are working mod F, all the arithmetic is trivially cheap, including the needed GCDs.
21 |
22 | ====================================
23 |
24 | Use above example composite to work through the basics of ECM:
25 | [to-do!]
26 |
27 | ====================================
28 |
29 | Use above example composite to work through the basics of ECM and the Quadratic Sieve factorization algorithm.
30 | Wikipedia:
31 |
32 | "The algorithm attempts to set up a congruence of squares modulo n (the integer to be factorized), which often leads to a factorization of n. The algorithm works in two phases: the data collection phase, where it collects information that may lead to a congruence of squares; and the data processing phase, where it puts all the data it has collected into a matrix and solves it to obtain a congruence of squares. The data collection phase can be easily parallelized to many processors, but the data processing phase requires large amounts of memory, and is difficult to parallelize efficiently over many nodes or if the processing nodes do not each have enough memory to store the whole matrix. The block Wiedemann algorithm can be used in the case of a few systems each capable of holding the matrix.
33 |
34 | "The naive approach to finding a congruence of squares is to pick a random number, square it, and hope the least non-negative remainder modulo n is a perfect square (in the integers). For example, 80^2 mod 5959 is 441, which is 21^2. This approach finds a congruence of squares only rarely for large n, but when it does find one, more often than not, the congruence is nontrivial and the factorization is complete. This is roughly the basis of Fermat's factorization method."
35 |
36 |
--------------------------------------------------------------------------------
/src/align.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef align_h_included
27 | #define align_h_included
28 |
29 | #include "types.h"
30 |
31 | /* These are for basic memory allocation, and to force alignment of array data on desired-byte boundaries.
32 | We use the normally-not-recommended immediate-overwrite-of-pointer form of realloc() because if the returned
33 | pointer is null we exit immediately, thus the resulting memory leak is never an issue.
34 |
35 | In the Align macros we cast pointers to intptr_t to accommodate architectures which use 64-bit address arithmetic.
36 | Note that rather than simply assuming sizeof(void *) <= sizeof(long), we check this at program invocation, in
37 | util.c::check_nbits_in_types().
38 | */
39 |
40 | #define ALLOC_INT(_p,_n) (int *)realloc(_p,(_n)*sizeof(int )+256) /* realloc _p to hold _n ints plus 256 padding bytes */
41 | #define ALIGN_INT(_p) (int *)(((intptr_t)(_p) | 63)+1) /* next 64-byte boundary strictly above _p; an already-aligned _p still advances by 64 bytes */
42 |
43 | #define ALLOC_UINT(_p,_n) (uint *)realloc(_p,(_n)*sizeof(uint )+256) /* same scheme for uint: _n elements + 256 padding bytes */
44 | #define ALIGN_UINT(_p) (uint *)(((intptr_t)(_p) | 63)+1) /* 64-byte round-up, as ALIGN_INT */
45 |
46 | #define ALLOC_INT64(_p,_n) (int64 *)realloc(_p,(_n)*sizeof(int64 )+256) /* _n int64 elements + 256 padding bytes */
47 | #define ALIGN_INT64(_p) (int64 *)(((intptr_t)(_p) | 63)+1) /* 64-byte round-up, as ALIGN_INT */
48 |
49 | #define ALLOC_UINT64(_p,_n) (uint64 *)realloc(_p,(_n)*sizeof(uint64 )+256) /* _n uint64 elements + 256 padding bytes */
50 | #define ALIGN_UINT64(_p) (uint64 *)(((intptr_t)(_p) | 63)+1) /* 64-byte round-up, as ALIGN_INT */
51 |
52 | #define ALLOC_UINT128(_p,_n)(uint128 *)realloc(_p,(_n)*2*sizeof(uint64 )+256) /* _n 128-bit elements sized as 2 uint64 each, plus 256 padding bytes; _n is parenthesized and evaluated once */
53 | #define ALIGN_UINT128(_p) (uint128 *)(((intptr_t)(_p) | 63)+1) /* next 64-byte boundary strictly above _p */
54 |
55 | #define ALLOC_FLOAT(_p,_n) (float *)realloc(_p,(_n)*sizeof(float )+256) /* _n floats plus 256 padding bytes */
56 | #define ALIGN_FLOAT(_p) (float *)(((intptr_t)(_p) | 63)+1) /* next 64-byte boundary strictly above _p */
57 |
58 | #define ALLOC_DOUBLE(_p,_n) (double *)realloc(_p,(_n)*sizeof(double )+512) /* _n doubles plus 512 padding bytes (double the padding of the integer/float macros) */
59 | #define ALIGN_DOUBLE(_p) (double *)(((intptr_t)(_p) | 127)+1) /* next 128-byte boundary strictly above _p; an already-aligned _p still advances by 128 bytes */
60 |
61 | #define ALLOC_f128(_p,_n) (__float128 *)realloc(_p,(_n)*sizeof(__float128 )+512) /* _n __float128 elements plus 512 padding bytes */
62 | #define ALIGN_f128(_p) (__float128 *)(((intptr_t)(_p) | 127)+1) /* 128-byte round-up, as ALIGN_DOUBLE */
63 |
64 | #define ALLOC_COMPLEX(_p,_n)(struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512) /* _n struct complex elements plus 512 padding bytes */
65 | #define ALIGN_COMPLEX(_p) (struct complex*)(((intptr_t)(_p) | 127)+1) /* 128-byte round-up, as ALIGN_DOUBLE */
66 |
67 | // Vector-double|uint64-alloc used by SIMD builds; register size difference between YMM and XMM taken care of by def of vec_dbl in types.h:
68 | #ifdef USE_SSE2
69 |
70 | #define ALLOC_VEC_DBL(_p,_n)(vec_dbl*)realloc(_p,(_n)*sizeof(vec_dbl)+512) /* _n vec_dbl elements plus 512 padding bytes */
71 | #define ALIGN_VEC_DBL(_p) (vec_dbl*)(((intptr_t)(_p) | 127)+1) /* next 128-byte boundary strictly above _p */
72 |
73 | #define ALLOC_VEC_U64(_p,_n)(vec_u64*)realloc(_p,(_n)*sizeof(vec_u64)+512) /* _n vec_u64 elements plus 512 padding bytes */
74 | #define ALIGN_VEC_U64(_p) (vec_u64*)(((intptr_t)(_p) | 127)+1) /* next 128-byte boundary strictly above _p */
75 |
76 | #else // In scalar-mode simply use the above double|uint64 macros:
77 |
78 | #define ALLOC_VEC_DBL(_p,_n) ALLOC_DOUBLE(_p,_n)
79 | #define ALIGN_VEC_DBL(_p) ALIGN_DOUBLE(_p)
80 |
81 | #define ALLOC_VEC_U64(_p,_n) ALLOC_UINT64(_p,_n) /* note: scalar uint64 variant pads 256 bytes and aligns to 64, unlike the SIMD branch (512/128) */
82 | #define ALIGN_VEC_U64(_p) ALIGN_UINT64(_p)
83 |
84 | #endif /* USE_SSE2 */
85 |
86 | #define ALLOC_POINTER(_p,_ptr_type,_n)(_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64) /* generic form: _n elements of _ptr_type plus only 64 padding bytes */
87 | #define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((intptr_t)(_p) | 63)+1) /* next 64-byte boundary strictly above _p, i.e. advances by at most 64 bytes */
88 |
89 | #define ALLOC_QFLOAT(_p,_n) ALLOC_UINT128(_p,_n) /* qfloat storage reuses the uint128 allocation macro */
90 | #define ALIGN_QFLOAT(_p) ALIGN_UINT128(_p) /* and the uint128 alignment macro */
91 |
92 | /*
93 | On the x86 family, alignment of the stack is very important.
94 | These macros use the GNU gcc __builtin_alloca function to align doubles properly: they test the low 3 bits of the address returned by __builtin_alloca(0) and conditionally allocate 4 more bytes.
95 | This is taken from GNU/FFTW package. Only GCC builds with __i386 defined get real code; everywhere else both macros expand to nothing.
96 | */
97 | #ifdef COMPILER_TYPE_GCC
98 | # if (defined(__i386))
99 | # define HACK_ALIGN_STACK_EVEN(){ \
100 | if( (((uint64) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4); /* address not a multiple of 8: allocate 4 bytes */\
101 | }
102 |
103 | # define HACK_ALIGN_STACK_ODD() { \
104 | if(!(((uint64) (__builtin_alloca(0))) & 0x7)) __builtin_alloca(4); /* address is a multiple of 8: allocate 4 bytes */\
105 | }
106 | # else
107 | # define HACK_ALIGN_STACK_EVEN() /* no-op: GCC, but not __i386 */
108 | # define HACK_ALIGN_STACK_ODD() /* no-op: GCC, but not __i386 */
109 | # endif
110 | #else
111 | # define HACK_ALIGN_STACK_EVEN() /* no-op: non-GCC compiler */
112 | # define HACK_ALIGN_STACK_ODD() /* no-op: non-GCC compiler */
113 | #endif
114 |
115 |
116 | #endif /* align_h_included */
117 |
--------------------------------------------------------------------------------
/src/f2psp.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2012 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /*******************************************************************************
24 | We now include this header file if it was not included before.
25 | *******************************************************************************/
26 | #ifndef f2psp_h_included
27 | #define f2psp_h_included
28 |
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 |
33 | #define MI64_IS_DIV_BY_SCALAR32P_X8_SSE2( /* 8-way SSE2 pass over eight 64-bit inputs using 32-bit q and qinv; MSVC-style __asm on 32-bit x86 registers; clobbers eax,ebx,ecx,xmm0-7 */\
34 | array_64x8inputs, /* in: pointer variable (loaded into eax) addressing eight consecutive 64-bit inputs; must be 64-byte aligned */\
35 | q, /* in: 32-bit modulus; must be an lvalue since its address is taken with lea */\
36 | qinv, /* in: inverse of q mod 2^32 (asserted below); must also be an lvalue */\
37 | retval /* out: 8-bit mask, bit i set iff the final carry for input i is zero (presumably: input i is divisible by q) */\
38 | )\
39 | {\
40 | DBG_ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\
41 | DBG_ASSERT(((uint32)(array_64x8inputs) & 0x3f) == 0, "A-array not 64-byte aligned!");\
42 | __asm mov eax, array_64x8inputs /* Assumes inputs a,b,c,d,... are 64-bit separated and &a[0] is 64-byte aligned */\
43 | __asm lea ebx, q\
44 | __asm lea ecx, qinv\
45 | __asm movaps xmm0,[eax ] /* ab: d3210 = [bhi|blo|ahi|alo] */\
46 | __asm movaps xmm1,[eax+0x10] /* cd: d3210 = [dhi|dlo|chi|clo] */\
47 | __asm movaps xmm2,[eax+0x20] /* ef: d3210 = [fhi|flo|ehi|elo] */\
48 | __asm movaps xmm3,[eax+0x30] /* gh: d3210 = [hhi|hlo|ghi|glo] */\
49 | __asm movaps xmm6,xmm0 /* Circularly-permute [4,6,7] -> [6,7,4] here so the 2 packed outputs end up in xmm6,7 */\
50 | __asm movaps xmm5,xmm1\
51 | __asm movaps xmm7,xmm2\
52 | __asm movaps xmm4,xmm3\
53 | __asm psrlq xmm6, 32 /* d3210 = [ 0|bhi| 0|ahi] */\
54 | __asm psrlq xmm5, 32 /* d3210 = [ 0|dhi| 0|chi] */\
55 | __asm psrlq xmm7, 32 /* d3210 = [ 0|fhi| 0|ehi] */\
56 | __asm psrlq xmm4, 32 /* d3210 = [ 0|hhi| 0|ghi] */\
57 | __asm psllq xmm5, 32 /* d3210 = [dhi| 0|chi| 0] */\
58 | __asm psllq xmm4, 32 /* d3210 = [hhi| 0|ghi| 0] */\
59 | __asm paddd xmm6,xmm5 /* d3210 = [dhi|bhi|chi|ahi], xmm5 FREE */\
60 | __asm paddd xmm7,xmm4 /* d3210 = [hhi|fhi|ghi|ehi], xmm4 FREE */\
61 | __asm movd xmm4,[ebx]\
62 | __asm movd xmm5,[ecx]\
63 | __asm pshufd xmm4,xmm4,0x44 /* Broadcast q to slots 0,2 of xmm4 */\
64 | __asm pshufd xmm5,xmm5,0x44 /* Broadcast qinv to slots 0,2 of xmm5 */\
65 | /* (a-h)[0]*qinv; Alas SSE2 has no 32-bit low-half packed MUL, so use 32x32->64 -bit and discard high halves */\
66 | __asm pmuludq xmm0,xmm5\
67 | __asm pmuludq xmm1,xmm5\
68 | __asm pmuludq xmm2,xmm5\
69 | __asm pmuludq xmm3,xmm5\
70 | /* cy[0-7] = MULH32(tmp[0-7]*q) - high halves of above MULQs automatically get overwritten: */\
71 | __asm pmuludq xmm0,xmm4\
72 | __asm pmuludq xmm1,xmm4\
73 | __asm pmuludq xmm2,xmm4\
74 | __asm pmuludq xmm3,xmm4\
75 | __asm psrlq xmm0, 32 /* d3210 = [ 0|cy1| 0|cy0] */\
76 | __asm psrlq xmm1, 32 /* d3210 = [ 0|cy3| 0|cy2] */\
77 | __asm psrlq xmm2, 32 /* d3210 = [ 0|cy5| 0|cy4] */\
78 | __asm psrlq xmm3, 32 /* d3210 = [ 0|cy7| 0|cy6] */\
79 | __asm psllq xmm1, 32 /* d3210 = [cy3| 0|cy2| 0] */\
80 | __asm psllq xmm3, 32 /* d3210 = [cy7| 0|cy6| 0] */\
81 | __asm paddd xmm0,xmm1 /* d3210 = [cy3|cy1|cy2|cy0], xmm1 FREE */\
82 | __asm paddd xmm2,xmm3 /* d3210 = [cy7|cy5|cy6|cy4], xmm3 FREE */\
83 | __asm movaps xmm3,xmm6 /* Copy of acbd[1] */\
84 | __asm movaps xmm1,xmm7 /* Copy of efgh[1] */\
85 | __asm psubd xmm6,xmm0 /* acbd[1] - cy0213, xmm0 FREE */\
86 | __asm psubd xmm7,xmm2 /* egfh[1] - cy4657, xmm2 FREE */\
87 | __asm movaps xmm2,xmm6 /* Copy of acbd[1] - cy0213 */\
88 | __asm movaps xmm0,xmm7 /* Copy of efgh[1] - cy4657 */\
89 | /* Had a borrow? Frickin' SSE2 only gives us signed packed-integer compares,\
90 | so need to emulate unsigned (x > y) via signed (x ^ 0x80000000) < (y ^ 0x80000000): */\
91 | __asm pcmpeqd xmm4,xmm4 /* All 1s - will need to restore q to this register later */\
92 | __asm pslld xmm4, 31 /* 4-way 0x80000000 */\
93 | __asm pxor xmm6,xmm4 /* (acbd[1]-cy0213) ^ 0x80000000 */\
94 | __asm pxor xmm7,xmm4 /* (egfh[1]-cy4657) ^ 0x80000000 */\
95 | __asm pxor xmm3,xmm4 /* (acbd[1]) ^ 0x80000000 */\
96 | __asm pxor xmm1,xmm4 /* (egfh[1]) ^ 0x80000000 */\
97 | __asm pcmpgtd xmm6,xmm3 /* cy0213 = (acbd[1]-cy0213) > abcd[1], xmm3 FREE */\
98 | __asm pcmpgtd xmm7,xmm1 /* cy4657 = (egfh[1]-cy4657) > efgh[1], xmm1 FREE */\
99 | __asm pshufd xmm3,xmm2,0x31 /* xmm2 = [----|tmp1|----|tmp0], xmm3 = [----|tmp3|----|tmp2], don't care what's in ---- slots */\
100 | __asm pshufd xmm1,xmm0,0x31 /* xmm0 = [----|tmp5|----|tmp4], xmm1 = [----|tmp7|----|tmp6], don't care what's in ---- slots */\
101 | __asm movd xmm4,[ebx] /* Restore q to xmm4 */\
102 | __asm pshufd xmm4,xmm4,0x44 /* Broadcast q to slots 0,2 of xmm4 */\
103 | /* tmp[0-7]*qinv; Alas SSE2 has no 32-bit low-half packed MUL, so use 32x32->64 -bit and discard high halves */\
104 | __asm pmuludq xmm3,xmm5\
105 | __asm pmuludq xmm1,xmm5\
106 | __asm pmuludq xmm2,xmm5\
107 | __asm pmuludq xmm0,xmm5\
108 | /* Add carries 01/45, scatter carries 23/67 into slots of 01/45, add those...Since SSE2 compare result is ~()ed, add really means sub: */\
109 | __asm psubd xmm2,xmm6 /* xmm6 = [----|tmp1|----|tmp0], don't care what's in ---- slots */\
110 | __asm psubd xmm0,xmm7 /* xmm7 = [----|tmp5|----|tmp4], don't care what's in ---- slots */\
111 | __asm pshufd xmm6,xmm6,0x31\
112 | __asm pshufd xmm7,xmm7,0x31\
113 | __asm psubd xmm3,xmm6 /* xmm3 = [----|tmp3|----|tmp2], don't care what's in ---- slots */\
114 | __asm psubd xmm1,xmm7 /* xmm1 = [----|tmp7|----|tmp6], don't care what's in ---- slots */\
115 | /* cy[0-7] = MULH32(tmp[0-7]*q) - high halves of above MULQs automatically get overwritten: */\
116 | __asm pmuludq xmm2,xmm4\
117 | __asm pmuludq xmm0,xmm4\
118 | __asm pmuludq xmm3,xmm4\
119 | __asm pmuludq xmm1,xmm4\
120 | __asm psrlq xmm2, 32 /* d3210 = [ 0|cy1| 0|cy0] */\
121 | __asm psrlq xmm0, 32 /* d3210 = [ 0|cy5| 0|cy4] */\
122 | __asm psrlq xmm3, 32 /* d3210 = [ 0|cy3| 0|cy2] */\
123 | __asm psrlq xmm1, 32 /* d3210 = [ 0|cy7| 0|cy6] */\
124 | __asm pshufd xmm2,xmm2,0x58 /* [ 0| 0|cy1|cy0] */\
125 | __asm pshufd xmm0,xmm0,0x58 /* [ 0| 0|cy5|cy4] */\
126 | __asm pshufd xmm3,xmm3,0x85 /* [cy3|cy2| 0| 0] */\
127 | __asm pshufd xmm1,xmm1,0x85 /* [cy7|cy6| 0| 0] */\
128 | __asm paddd xmm2,xmm3 /* d3210 = [cy3|cy2|cy1|cy0] */\
129 | __asm paddd xmm0,xmm1 /* d3210 = [cy7|cy6|cy5|cy4] */\
130 | __asm pcmpgtd xmm7,xmm7 /* All 0s */\
131 | __asm pcmpeqd xmm2,xmm7 /* retval[0-3] */\
132 | __asm pcmpeqd xmm0,xmm7 /* retval[4-7] */\
133 | __asm movmskps eax,xmm2 /* retval[0-3] */\
134 | __asm movmskps ebx,xmm0 /* retval[4-7] */\
135 | __asm shl ebx, 4 /* retval[4-7] << 4 */\
136 | __asm add eax,ebx /* retval[0-7] */\
137 | __asm mov retval, eax \
138 | }
139 |
140 | #ifdef __cplusplus
141 | }
142 | #endif
143 |
144 | #endif /* f2psp_h_included */
145 |
146 |
--------------------------------------------------------------------------------
/src/fac_test_dat192.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef fac_test_dat192_included
27 | #define fac_test_dat192_included
28 |
29 | #include "types.h"
30 |
31 | struct testFac160{ /* Row type of the fac160[] table in this header; same field layout as testFac192 */
32 | uint32 p; /* exponent p of the Mersenne number M(p) the factor belongs to */
33 | uint64 d2; /* presumably the most-significant 64-bit word of the factor - TODO confirm against the consumer */
34 | uint64 d1; /* presumably the middle 64-bit word */
35 | uint64 d0; /* presumably the least-significant 64-bit word */
36 | };
37 |
38 | struct testFac192{ /* Row type of the ffac192[] and fac192[] tables in this header */
39 | uint32 p; /* Mersenne exponent p in fac192[]; Fermat index n in ffac192[] */
40 | uint64 d2; /* presumably the most-significant 64-bit word of the factor - TODO confirm; always 0 in ffac192[] */
41 | uint64 d1; /* presumably the middle 64-bit word; always 0 in ffac192[] */
42 | uint64 d0; /* presumably the least-significant 64-bit word; holds the multiplier k in ffac192[] */
43 | };
44 |
45 | /*******************************************/
46 | /* Fermat-number test factors: */
47 | /*******************************************/
48 |
49 | // Here interpret the above testFac struct as a minimalist [n,k]-pair format,
50 | // where Fn = 2^2^n+1 is the Fermat number and q = k.2^(n+2)+1 the factor:
51 | // To check any particular (alleged) factor q of Fn using Pari, use Mod(2,q)^(2^n)+1.
52 |
53 | // Testcases with factors < 2^192:
54 | static const struct testFac192 ffac192[] =
55 | { // Rows are {n, 0, 0, k}; a leading 2*, 4*, ... multiplier presumably folds extra powers of 2 into k so that q = k.2^(n+2)+1 still holds
56 | { 86,0ull,0ull, 20018578522347ull}, // 2012 M. Dangler & Rodenkirch
57 | { 88,0ull,0ull, 119942751127ull}, // 2001 T. Nohara & Durman
58 | { 90,0ull,0ull, 198922467387ull}, // 2001 P. Grobstich & Durman
59 | { 91,0ull,0ull, 1421ull}, // 1977 D. E. Shippee
60 | { 93,0ull,0ull,2* 92341ull}, // 1979 R. Baillie
61 | { 94,0ull,0ull,2* 482524552001ull}, // 2001 P. Grobstich & Durman
62 | { 96,0ull,0ull,8* 3334131633063ull}, // 2008 M. Ptáček & Durman
63 | {107,0ull,0ull,4* 1289179925ull}, // 1992 G. B. Gostin
64 | {116,0ull,0ull,4* 3433149787ull}, // 1999 T. Taura
65 | {122,0ull,0ull, 5234775ull}, // 1986 G. B. Gostin
66 | {125,0ull,0ull, 5ull}, // 1956 R. M. Robinson
67 | {133,0ull,0ull, 88075576149ull}, // 2001 P. Samidoost & Durman
68 | {142,0ull,0ull,2* 8152599ull}, // 1986 G. B. Gostin
69 | {144,0ull,0ull,2* 17ull}, // 1956 R. M. Robinson
70 | {146,0ull,0ull, 37092477ull}, // 1987 G. B. Gostin
71 | {147,0ull,0ull, 3125ull}, // 1979 G. B. Gostin & P. B. McLaughlin
72 | {147,0ull,0ull, 124567335ull}, // 1990 G. B. Gostin
73 | {150,0ull,0ull,32* 1575ull}, // 1956 R. M. Robinson
74 | {150,0ull,0ull,4* 5439ull}, // 1980 G. B. Gostin & P. B. McLaughlin & H. Suyama
75 | {0,0ull,0ull,0ull} // all-zero row; presumably the end-of-table sentinel
76 | };
77 |
78 | /*******************************************/
79 | /* Mersenne-number test factors: */
80 | /*******************************************/
81 |
82 | /* Factors > 128 but <= 160 bits. If desired, we can construct more test factors
83 | by multiplying together a 64-bit factor q1 of M(p1) and a 96-bit factor q2 of M(p2)
84 | and checking whether q1*q2 divides M(p1*p2).*/
85 | static const struct testFac160 fac160[] =
86 | { // Rows are {p, d2, d1, d0}: exponent of M(p), then the factor as three 64-bit words (presumably most-significant first)
87 | { 629, 133ull,11545660419510266595ull,15875370168207932041ull},
88 | { 631, 1394ull,15571349859840161706ull, 509892144742137431ull},
89 | { 673, 121320ull, 4492854135134704005ull,14226674137430228263ull},
90 | { 695,2649519282ull,14842833464112563611ull,10174116463236461383ull},
91 | { 731, 655903171ull,17652352551621896287ull, 7660429456444636239ull},
92 | { 805,1083827012ull,18314245293386716597ull, 2219421057460140527ull},
93 | { 877, 13161208ull,18225246095436784582ull,12343089078196252631ull},
94 | { 957, 4730ull,14663183769241509326ull, 8097149896429635207ull},
95 | { 967, 215159ull, 881920578744577810ull,17184239148975426263ull},
96 | { 1017, 212724356ull, 9900144438119899815ull,17733134473107607967ull},
97 | { 1033, 261ull, 5238930328752646394ull, 2803405107698253561ull},
98 | { 1087, 1ull, 4415476118538293365ull,16346425147370540471ull},
99 | { 1087, 70130ull,11905462972019801043ull, 6167785434693019223ull},
100 | { 1131, 5800574ull,18429773635221665090ull,17951008765075981215ull},
101 | { 1157, 22381525ull,14500669099417213747ull,15903397166638806257ull},
102 | { 1283, 14ull, 3291757557782450881ull, 3893270457587058239ull},
103 | { 1319, 1552ull, 1390029428449091172ull,14288981644299514807ull},
104 | { 1483, 2674ull,14802171160149427175ull, 5085420234315110585ull},
105 | { 6659, 664ull,14291576310931480037ull, 4949688733053552967ull},
106 | { 8191, 617742ull, 6334326874596939334ull,11405337619840706193ull},
107 | {18031451, 2122ull, 5198971222801411122ull,12425019173815339143ull}, /* Note: composite factor! */
108 | {0,0ull,0ull,0ull} // all-zero row; presumably the end-of-table sentinel
109 | };
110 |
111 | /* Factors > 160 but <= 192 bits. We can construct more test factors by multiplying
112 | together smaller factors of M(p) with multiple factors, or for exponents p1, p2, p3, ...
113 | and corresponding factors q1, q2, q3, ... , checking whether q1*q2*q3*...
114 | divides M(p1*p2*p3*...). */
115 | static const struct testFac192 fac192[] =
116 | { // Rows are {p, d2, d1, d0}: exponent of M(p), then the factor as three 64-bit words (presumably most-significant first)
117 | { 677, 157590042578912ull,10558642444782195772ull, 329809049266961143ull},
118 | { 773, 9118322195022ull, 1933308633079010416ull,17814616685598394119ull},
119 | { 971, 70286054459973ull,17012949627558354271ull, 3547755741880899889ull},
120 | { 997, 492416983078691417ull, 8040689323464953445ull,16007877010440112335ull},
121 | { 1001, 59364131986ull, 9565712986615012496ull,10050950882119470361ull},
122 | {0,0ull,0ull,0ull} // all-zero row; presumably the end-of-table sentinel
123 | };
124 |
125 | #endif /* #ifndef fac_test_dat192_included */
126 |
--------------------------------------------------------------------------------
/src/fac_test_dat256.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef fac_test_dat256_included
27 | #define fac_test_dat256_included
28 |
29 | #include "types.h"
30 |
31 | struct testFac256{ /* Row type of fac256[]: an exponent/factor pair stored as decimal strings */
32 | char p[80]; /* exponent p as a decimal string; the buffer leaves room for 79 digits plus the terminating NUL */
33 | char q[80]; /* factor q as a decimal string, same 80-byte buffer size */
34 | };
35 |
36 | /*******************************************/
37 | /* Fermat-number test factors: */
38 | /*******************************************/
39 |
40 | struct testFermFac{ /* Row type of ffac256[]: one [n,k] pair describing a Fermat-number factor */
41 | uint32 n; /* index n of the Fermat number Fn = 2^2^n+1 */
42 | uint64 k; /* multiplier k in the factor q = k.2^(n+2)+1 */
43 | };
44 |
45 | // Here the above testFermFac struct serves as a minimalist [n,k]-pair format,
46 | // where Fn = 2^2^n+1 is the Fermat number and q = k.2^(n+2)+1 the factor:
47 | // To check any particular (alleged) factor q of Fn using Pari, use Mod(2,q)^(2^n)+1.
48 |
49 | // Testcases with factors < 2^256:
50 | static const struct testFermFac ffac256[] =
51 | { // Rows are {n, k}; a leading 2* or 8* multiplier presumably folds extra powers of 2 into k so that q = k.2^(n+2)+1 still holds
52 | {164,2* 1835601567ull}, // 1993 G. B. Gostin
53 | {166,8* 2674670937447ull}, // 2012 R. Maznichenko & Rodenkirch
54 | {172, 20569603303ull}, // 2001 L. N. Durman
55 | {178, 313047661ull}, // 1991 G. B. Gostin
56 | {184,2* 117012935ull}, // 1990 G. B. Gostin
57 | {195, 48595346636925ull}, // 2014 S. Batalov & Woltman
58 | {201,2* 4845ull}, // 1980 G. B. Gostin & P. B. McLaughlin
59 | {205, 232905ull}, // 1984 W. Keller
60 | {207, 3ull}, // 1956 R. M. Robinson
61 | {215, 32111ull}, // 1980 H. Suyama
62 | {226,2* 15ull}, // 1956 R. M. Robinson
63 | {228,2* 29ull}, // 1956 R. M. Robinson
64 | {0,0ull} // all-zero row; presumably the end-of-table sentinel
65 | };
66 |
67 | /*******************************************/
68 | /* Mersenne-number test factors: */
69 | /*******************************************/
70 |
71 | /* 256-bit factors are easier to give in character-string form: each entry is {p, q} as decimal strings, with the k of q = 2*k*p+1 noted in the trailing comment. */
72 | /* EWM: These are from my April 2006 shakedown runs of the P4WORD functionality -
73 | ran ??? 64-65-digit test exponents up to k = 10^10; ??? had factors below this bound,
74 | compared to ??? predicted by theory (Dickman's function).
75 | */
76 | static const struct testFac256 fac256[] =
77 | {
78 | {"1000000000000000000000000000000000000000000000000000000001059" ,"40000000000000000000000000000000000000000000000000000000042361" }, /* k = 20 */
79 | {"12160287649628674460477464915995054973742562690104903778198683593" ,"543592246870442485937175551111623340804481341938942752102988291735322287319" }, /* k = 22351126163 */
80 | {"20992192221842725502542568876717904946016534668049886272327917860857843" ,"41984384443685451005085137753435809892033069336099772544655835721715687" }, /* k = 1 */
81 | {"24247014121478057345510500801908699603302763478708108175450119307" ,"2079083331892761004876676951418337621569030224230467189523407626117207889809" }, /* k = 42872976472 */
82 | {"3082533446850352619311881710100031378387528865875332083814206171" ,"6165066893700705238623763420200062756775057731750664167628412343" }, /* k = 1 */
83 | {"32046927906821207388377814233562823608963208068222468012248261177" ,"192281567440927244330266885401376941653779248409334808073489567063" }, /* k = 3 */
84 | {"32046927906821207388377814233562823608963208068222468012248261177" ,"7261513394406617382132528927183000201554973316178529026895333500096431" }, /* k = 113295 */
85 | {"3444030707469211201913020330380197621101100449293215160842444859637669" ,"53389364027187712052055641161553823522309259164943421423379580214103144839" }, /* k = 7751 */
86 | {"3600113305305488204665213841469519415116094330572703657595919530921861" ,"1605650534166247739280685373295405659141778071435425831287780110791150007" }, /* k = 223 */
87 | {"3852254995466672782398645659611635488623057745649803559363456817432411" ,"22788707831582286845380020155651359827337650244785629920055214225748565104481" }, /* k = 2957840 */
88 | {"3873455283316355076479185358932261854896321329330898570642046752590709" ,"2921297999392661936999377930740968974773127205440094407601101388055871276457" }, /* k = 377092 */
89 | {"4088350865739177150968288747826569959957449066175834413752239709" ,"532990125664685046817433867476654272539732719859211180852051986382913" }, /* k = 65184 */
90 | {"41927056387293174872332083760112302991136793862708943879936201629" ,"586978789422104448212649172641572241875915114077925214319106822807" }, /* k = 7 */
91 | {"53710507922796892589235420199561121290219608640344181598136297747713099" ,"107421015845593785178470840399122242580439217280688363196272595495426199" }, /* k = 1 */
92 | {"54973742562690104903778198683593814657412680492564879855614537234786733" ,"769632395877661468652894781570313405203777526895908317978603521287014263" }, /* k = 7 */
93 | {"5509792592309907965473761255176567513575178296664547791745011299" ,"742776869444172678136618913571387191947269048779332840473151581151737887" }, /* k = 67405157 */
94 | {"570658748822569815793678976697422057505968344086973502014102067" ,"322107495328491256282531776450837995333351643082236449882652963072723913" }, /* k = 282224268 */
95 | {"62735676303544776280350450777235547105859548702790814356240145171" ,"19573531006705970199469340642497490697028179195270734079146925293353" }, /* k = 156 */
96 | {"62749567351885752724891227938183011949129833673362440656643086021" ,"6902452408707432799738035073200131314404281704069868472230739462311" }, /* k = 55 */
97 | {"6402474964732639141992726042699227967823547816360093417216412199" ,"59547637466852043611708058111909725657028150812842162510646420336832110759" }, /* k = 4650360821 */
98 | {"7195429162991930645537799140373404328752628889639958794757291746426357" ,"957164768977838582192020192849031737427989705415465878713793977276619713569" }, /* k = 66512 */
99 | {"7095890455635792122103334669749923563025494780249011419521238281" ,"93538027986191011753566157616643492407802072193242468532128963020143" }, /* k = 6591 */
100 | {"83011949129833673362440656643086021394946395224737190702179860943" ,"2473258012374264464160556924024104921441032899325819859780746776935743" }, /* k = 14897 */
101 | {"85102283345085048608250393021332197155184306354550076682829493041" ,"23658434769933643513093609259930350809141237166564921317826599065399" }, /* k = 139 */
102 | {"9104140792886215078424516709087000699282120660418371806535567252532567" ,"207009953348646758453216660931220221900276859576592938137005728188085508447" }, /* k = 11369 */
103 | {"9729971208443357326548938239119325974636673058360414281388303203" ,"215547993800818444194894669517580993600170370441819907413109392526239783" }, /* k = 11076497 */
104 | {"" ,"" }, /* k = (this and the following empty entries are unfilled placeholder slots) */
105 | {"" ,"" }, /* k = */
106 | {"" ,"" }, /* k = */
107 | {"" ,"" }, /* k = */
108 | {"" ,"" }, /* k = */
109 | {"" ,"" }, /* k = */
110 | {"" ,"" }, /* k = */
111 | {"",""} /* Final all-empty entry; NOTE(review): presumably a reader stops at the first empty p - confirm */
112 | };
113 |
114 | #endif /* #ifndef fac_test_dat256_included */
115 |
--------------------------------------------------------------------------------
/src/fgt_m61.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef fgt_m61_h_included
27 | #define fgt_m61_h_included
28 |
29 | #include "util.h"
30 |
31 | // Our modulus q = 2^61 - 1:
32 |
33 | /***************/
34 | // NB: Since args to these reduce macros will more often than not be expressions (e.g. qreduce(x - y + q4)),
35 | // start each by copying the parenthesized arg into a uniquely-named local uint64, so that the input expression gets evaluated exactly once and an arg that mentions a caller variable named 'tmp' cannot be shadowed:
36 | /*
37 | Returns x (mod q), but in the sense of a possible partial modular reduction: the bits of x above bit 60 are added to its low 61 bits, so outputs are in [0, B], where B = q+7.
38 | Note: if x = q, qreduce returns q, not zero. Implemented as a GCC-style statement expression.
39 | */
40 | #define qreduce(x) \
41 | ({ uint64 qreduce_tmp_ = (x); \
42 | qreduce_tmp_ = (qreduce_tmp_ >> 61) + (qreduce_tmp_ & 0x1FFFFFFFFFFFFFFFull); \
43 | qreduce_tmp_; })
44 |
45 | // ...or this if you want to finish reducing a qreduce() output: q is subtracted once (branch-free, via an all-ones mask) iff the input is >= q, so inputs in [0, 2q) map to [0, q). The arg is copied into a uniquely-named local, so it is evaluated once and may safely mention a caller variable named 'tmp':
46 | #define qreduce_finish(x) \
47 | ({ uint64 qreduce_finish_tmp_ = (x); \
48 | qreduce_finish_tmp_ -= (-(uint64)(qreduce_finish_tmp_ >= 0x1FFFFFFFFFFFFFFFull)) & 0x1FFFFFFFFFFFFFFFull; \
49 | qreduce_finish_tmp_; })
50 |
51 | // Use this if you require a guaranteed-full reduction of x (mod q): folds the bits above bit 60 into the low 61 bits (result <= q+7), then subtracts q once if that result is >= q, giving an output in [0, q). The arg is copied into a uniquely-named local, so it is evaluated once and may safely mention a caller variable named 'tmp':
52 | #define qreduce_full(x) \
53 | ({ uint64 qreduce_full_tmp_ = (x); \
54 | qreduce_full_tmp_ = (qreduce_full_tmp_ >> 61) + (qreduce_full_tmp_ & 0x1FFFFFFFFFFFFFFFull); \
55 | qreduce_full_tmp_ -= (-(uint64)(qreduce_full_tmp_ >= 0x1FFFFFFFFFFFFFFFull)) & 0x1FFFFFFFFFFFFFFFull; \
56 | qreduce_full_tmp_; })
57 |
58 | /***************/
59 |
60 | /*
61 | Returns sqrt(1/2)*x (mod q), partially reduced. x must be a uint64 and is evaluated twice, so must be free of side effects.
62 | sqrt(1/2) == 2^30 mod q, so the multiply can be effected via 2 shifts, an AND, and an add. The expansion is fully parenthesized so it can be used as an operand of a larger expression.
63 | For arbitrary uint64 inputs, output is in [0, B30], where B30 = q + 7*2^30 = 2^61 + 2^33 - 2^30 - 1; for normalized inputs (< q) the output is in [0, q].
64 | */
65 | #define mul_i2(x) ((((x) << 30) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> 31))
66 |
67 | /***************/
68 |
69 | /*
70 | Returns sqrt(2)*x (mod q), partially reduced. x must be a uint64 and is evaluated twice, so must be free of side effects.
71 | sqrt(2) == 2^31 mod q, so the multiply can be effected via 2 shifts, an AND, and an add. The expansion is fully parenthesized so it can be used as an operand of a larger expression.
72 | Outputs are in [0, B31], where B31 = q + 7*2^31 = 2^61 + 2^34 - 2^31 - 1.
73 | */
74 | #define mul_s2(x) ((((x) << 31) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> 30))
75 |
76 | /***************/
77 |
78 | /*
79 | Returns 2^n * x (mod q), partially reduced. x is a uint64; The shift count n is assumed to be any kind of int, with value in [0,61].
80 | Both args are parenthesized in the expansion (so n may be an expression such as 61-p), but each is evaluated twice and so must be free of side effects. Below, B = q+7.
81 | If x only partially normalized (i.e. in [0, B]) on entry and n = 0, result is fully normalized, i.e. xout in [0,q].
82 | If x unnormalized on entry and n = 0, the result is partially normalized, i.e. xout in [0,B].
83 | The special case n = 61 leaves x unchanged.
84 |
85 | For general operands x in [0,2^64-1] and n in [0,60], ((x << n) & q) is in [0, q - (2^n - 1)] = [0, 2^61 - 2^n]
86 | and (x >> (61-n)) is in [0, 2^(3+n) - 1]. The sum is bounded above by 2^61 - 2^n + 2^(3+n) - 1 = q + 2^(3+n) - 2^n.
87 |
88 | OK, let's do some crude estimation for non-normalized inputs:
89 |
90 | The sum is maximized for x = 2^64-1 and n = 60, giving 2^63 - 1 + 2^60 = 9*2^60 - 1 ~= 4.5*q,
91 | i.e. inputs approximately in [0,8q] yield outputs approximately in [0,5q].
92 | For x = 2^64-1 and n = 59, the sum is bounded by ~2.75*q, etc., approaching q+7 from above.
93 |
94 | x = 2^63-1 and n = 60 gives q + 2^62 - 2^60 ~= 2.5*q .
95 | x = 2^63-1 and n = 59 gives q + 2^61 - 2^59 ~= 1.75*q . This case is important in the between-forward-and-inverse-FFT
96 | pair_square step, where we multiply inputs in [0,4q] by the modular inverse of 4 == 2^59.
97 | x = 2^62-1 and n = 60 gives 2^61 - 1 + 2^60 = 3*2^60 - 1 ~= 1.5*q, i.e. inputs approximately in [0,2q]
98 | yield outputs approximately in [0,2q].
99 |
100 | NEGATIVE POWERS OF 2:
101 |
102 | The modular analog of 1/2 (call it w) satisfies 2*w == 1 (mod q), thus w = (q+1)/2 = 2^60. More generally,
103 | any negative-integer power of 2 (mod q) satisfies 2^(-p) == 2^(61-p), with p < 61. We obtain the same
104 | result by simply analogizing the mul_pow2_modq macro to negative powers, and thus can effect multiply
105 | by 2^(-p) by simply calling the mul_pow2_modq macro with power-of-2 argument (61-p).
106 |
107 | Thus e.g. to effect a modular x*(1/2) we call mul_pow2_modq(x,60).
108 | */
109 | #define mul_pow2_modq(x,n) ((((x) << (n)) & 0x1FFFFFFFFFFFFFFFull) + ((x) >> (61-(n))))
110 |
111 | /****** Prototypes for functions defined in fgt_m61.c are collected in util.h *******/
112 |
113 | #endif /* fgt_m61_h_included */
114 |
--------------------------------------------------------------------------------
/src/gcd_lehmer.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /*******************************************************************************
24 | We now include this header file if it was not included before.
25 | *******************************************************************************/
26 | #ifndef gcd_lehmer_h_included
27 | #define gcd_lehmer_h_included
28 |
29 | #include "Mlucas.h"
30 | #include "genFFT_mul.h"
31 |
32 | #ifdef __cplusplus
33 | extern "C" {
34 | #endif
35 |
36 | /*******************************************************************************
37 | Function prototypes. The corresponding function definitions will either
38 | be in a {function name}.c file or (for cases where a .c file contains
39 | multiple function definitions) in the given .c file:
40 | *******************************************************************************/
41 |
42 | /* Routines defined in gcd_lehmer.c: */
43 | uint32 mi64_gcd(
44 | uint64 u[], uint64 v[], uint32 const ndim, /* The two operand arrays plus a dimension; NOTE(review): presumably ndim counts 64-bit words of u and v - confirm */
45 | const uint32 EGCD, uint64 Ap[], uint64 Bp[], uint32 *len_AB, uint32 *sign_AB, /* EGCD flag with its A/B arrays, whose length and sign are passed by pointer */
46 | const uint32 HALF, uint64 Cp[], uint64 Dp[], uint32 *len_CD, uint32 *sign_CD, const uint32 len_targ); /* HALF flag with its C/D arrays, likewise, plus a target length */
47 |
48 | uint32 matrix_vector_product_sub(uint64c abmul[], uint64c cdmul[], uint64 *uv_ptr[], uint32 len); /* Takes constant ab/cd multiplier arrays and an array of pointers to the u/v vectors */
49 | uint32 matrix_vector_product_add(uint64c abmul[], uint64c cdmul[], uint64 *uv_ptr[], uint32 len); /* Same signature as the _sub variant */
50 |
51 | int CMP_LT_PROD192 (uint64 a, uint64 xlo, uint64 xhi, uint64 b, uint64 ylo, uint64 yhi); /* NOTE(review): by its name, compares the products a*(xhi:xlo) and b*(yhi:ylo) - confirm */
52 | int pprime192 (uint192 p, uint64 z);
53 | uint192 bitwise_mod192 (uint192 x, uint192 y);
54 | /* Disabled prototype (its parameter list carries no types), kept for reference:
55 | void mv_dwtvarbase_to_int64 (x,p,m,u,ndim);
56 | */
57 | void gcd_init(); /* NB: empty parameter list, i.e. parameters unspecified when compiled as C */
58 | int test_gcd(); /* Ditto */
59 |
60 | #ifdef __cplusplus
61 | }
62 | #endif
63 |
64 | #endif /* gcd_lehmer_h_included */
65 |
66 |
--------------------------------------------------------------------------------
/src/genFFT_mul.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /*******************************************************************************
24 | We now include this header file if it was not included before.
25 | *******************************************************************************/
26 | #ifndef gen_fft_h_included
27 | #define gen_fft_h_included
28 |
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 |
33 | /* Enumeration constants of the various supported values for the MODE argument to genFFT_mul().
34 | As the use of an enum implies, these modes are mutually exclusive:
35 |
36 | Mode Description
37 | ---------------- -------------------------
38 | INIT_ARRAYS Init FFT-related bit-reversal-index and roots-of-unity data, using input x-array for scratch storage
39 |
40 | The rest assume the function has been previously called in INIT_ARRAYS mode for the FFT length in question:
41 |
42 | FORWARD_FFT_ONLY The fFFT of the input X-array is computed and stored in-place
43 | AUTO_SQUARE The fFFT of the input X-array is computed, followed by a wrapper/dyadic-square step and an iFFT, all in-place.
44 | MUL_PRECOMPUTED The X-array is assumed to contain an untransformed input vector, and the Y-array to contain a data vector which was previously transformed by calling this routine in FORWARD_FFT_ONLY mode. The fFFT of the input X-array is computed, followed by a wrapper/dyadic-mul-with-Y-transform step and an iFFT. The result is returned in X; Y is unaffected. (I.e. this is designed for the common case where we have a constant vector which will be used to multiply many sets of inputs).
45 |
46 | */
47 | enum mode {INIT_ARRAYS, FORWARD_FFT_ONLY, AUTO_SQUARE, MUL_PRECOMPUTED}; /* No explicit values given, so these number 0,1,2,3 in the order listed */
48 |
49 | /* Routines defined in genFFT_mul.c: */
50 | void genFFT_mul(double x[], double y[], int n, int INIT_ARRAYS, int MODE); /* NB: the parameter named INIT_ARRAYS hides the like-named enum-mode constant within this prototype */
51 | void genFFT_mul_process_chunk(double a[], double ab_mul[], double cd_mul[], int n, struct complex rt0[], struct complex rt1[], int index[], int ii, int nradices_prim, int radix_prim[], int MODE);
52 |
53 | /* Nov 2015 - moved updated versions of these to Mlucas.h:
54 | void pairFFT_mul(double x[], double y[], int n, int INIT_ARRAYS, int FORWARD_FFT_ONLY);
55 | void pairFFT_mul_process_chunk(double a[], double ab_mul[], double cd_mul[], int n, struct complex rt0[], struct complex rt1[], int index[], int ii, int nradices_prim, int radix_prim[], int FORWARD_FFT_ONLY, int skip_square);
56 | void radix16_pairFFT_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int INIT_ARRAYS, int FORWARD_FFT_ONLY, int skip_square);
57 | */
58 |
59 | /* The complex/real wrapper and dyadic-mul step, combined with the final-fFFT/initial-iFFT radix pass (one routine each for radix 16 and radix 32, with identical argument lists): */
60 | void radix16_genFFT_wrapper_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int MODE);
61 | void radix32_genFFT_wrapper_mul(double uv[], double ab_mul[], double cd_mul[], int n, int radix0, struct complex rt0[], struct complex rt1[], int ii, int nradices_prim, int radix_prim[], int nloops, int incr, int MODE);
62 |
63 | #ifdef __cplusplus
64 | }
65 | #endif
66 |
67 | #endif /* gen_fft_h_included */
68 |
69 |
--------------------------------------------------------------------------------
/src/getRealTime.c:
--------------------------------------------------------------------------------
1 | // EWM: June 2014 - Code from http://nadeausoftware.com/articles/2012/04/c_c_tip_how_measure_elapsed_real_time_benchmarking
2 | // for high-precision elapsed real time; thanks to Stephen Searle for finding this.
3 | // Prototype for getRealTime() is in util.h .
4 |
5 | /*
6 | * Author: David Robert Nadeau
7 | * Site: http://NadeauSoftware.com/
8 | * License: Creative Commons Attribution 3.0 Unported License
9 | * http://creativecommons.org/licenses/by/3.0/deed.en_US
10 | */
11 |
12 | #if defined(_WIN32)
13 |
14 | #include <Windows.h> /* FILETIME, GetSystemTime[Precise]AsFileTime() */
15 |
16 | #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
17 |
18 | #include <unistd.h> /* POSIX flags */
19 | #include <time.h> /* clock_gettime(), time() */
20 | #include <sys/time.h> /* gethrtime(), gettimeofday() */
21 |
22 | #if defined(__MACH__) && defined(__APPLE__)
23 |
24 | #include <mach/mach.h>
25 | #include <mach/mach_time.h> /* mach_timebase_info(), mach_absolute_time() */
26 |
27 | #endif
28 |
29 | #else
30 | // EWM: Instead of error-on-compile we default to the util.c:get_time_str() function if any misconfig detected; the #error is thus disabled, so that the GRT_MISCONFIG branch of getRealTime() - which returns -1.0 - is actually reachable:
31 | //#error "Unable to define getRealTime( ) for an unknown OS."
32 | #define GRT_MISCONFIG
33 | #endif
34 |
35 | /**
36 | * Returns the real time, in seconds, or -1.0 if an error occurred or if no
37 | * supported timer API was detected at compile time (GRT_MISCONFIG defined).
38 | * Time is measured since an arbitrary and OS-dependent start time.
39 | * The returned real time is only useful for computing an elapsed time
40 | * between two calls to this function.
41 | */
42 | double getRealTime( )
43 | {
44 | #ifdef GRT_MISCONFIG
45 | return -1.0; // EWM: Unknown OS - see the note at the GRT_MISCONFIG define
46 | #elif defined(_WIN32)
47 | FILETIME tm;
48 | ULONGLONG t;
49 | #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
50 | /* Windows 8, Windows Server 2012 and later. ---------------- */
51 | GetSystemTimePreciseAsFileTime( &tm );
52 | #else
53 | /* Windows 2000 and later. ---------------------------------- */
54 | GetSystemTimeAsFileTime( &tm );
55 | #endif
56 | t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime; /* Merge the two 32-bit halves into one 64-bit tick count */
57 | return (double)t / 10000000.0; /* 10^7 ticks per second, i.e. 100-ns ticks */
58 |
59 | #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__)))
60 | /* HP-UX, Solaris. ------------------------------------------ */
61 | return (double)gethrtime( ) / 1000000000.0; /* Nanoseconds -> seconds */
62 |
63 | #elif defined(__MACH__) && defined(__APPLE__)
64 | /* OSX. ----------------------------------------------------- */
65 | static double timeConvert = 0.0; /* Seconds per mach_absolute_time() tick; computed on the first call and cached */
66 | if ( timeConvert == 0.0 )
67 | {
68 | mach_timebase_info_data_t timeBase;
69 | (void)mach_timebase_info( &timeBase );
70 | timeConvert = (double)timeBase.numer /
71 | (double)timeBase.denom /
72 | 1000000000.0;
73 | }
74 | return (double)mach_absolute_time( ) * timeConvert;
75 |
76 | #elif defined(_POSIX_VERSION)
77 | /* POSIX. --------------------------------------------------- */
78 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
79 | {
80 | struct timespec ts;
81 | /* Pick the first clock id in the following preference order which this platform defines: */
82 | #if defined(CLOCK_MONOTONIC_PRECISE)
83 | /* BSD. --------------------------------------------- */
84 | const clockid_t id = CLOCK_MONOTONIC_PRECISE;
85 | #elif defined(CLOCK_MONOTONIC_RAW)
86 | /* Linux. ------------------------------------------- */
87 | const clockid_t id = CLOCK_MONOTONIC_RAW;
88 | #elif defined(CLOCK_HIGHRES)
89 | /* Solaris. ----------------------------------------- */
90 | const clockid_t id = CLOCK_HIGHRES;
91 | #elif defined(CLOCK_MONOTONIC)
92 | /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
93 | const clockid_t id = CLOCK_MONOTONIC;
94 | #elif defined(CLOCK_REALTIME)
95 | /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
96 | const clockid_t id = CLOCK_REALTIME;
97 | #else
98 | const clockid_t id = (clockid_t)-1; /* Unknown. */
99 | #endif /* CLOCK_* */
100 | if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
101 | return (double)ts.tv_sec +
102 | (double)ts.tv_nsec / 1000000000.0;
103 | /* Fall thru to gettimeofday() below if no clock id was available or clock_gettime() failed. */
104 | }
105 | #endif /* _POSIX_TIMERS */
106 |
107 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
108 | struct timeval tm;
109 | gettimeofday( &tm, NULL ); /* NB: return value is not checked */
110 | return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
111 | #else
112 | return -1.0; /* Failed. */
113 | #endif
114 | }
114 |
115 |
--------------------------------------------------------------------------------
/src/get_fp_rnd_const.c:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "util.h"
24 |
25 | /* Store the round constant used for fast NINT emulation in both *RND_A and *RND_B, and report to stderr which form was chosen: */
26 | void get_fp_rnd_const(double*RND_A, double*RND_B)
27 | {
28 | #if(FP_MANTISSA_BITS_DOUBLE == 64) /* X86 64-mantissa-bit register doubles need an extra factor of 2^11: */
29 | const double rnd_scale = 0x800;
30 | const char *const info_msg = "INFO: using 64-bit-significand form of floating-double rounding constant for scalar-mode DNINT emulation.\n";
31 | #else /* IEEE64-compliant double-precision hardware arithmetic is assumed: */
32 | const double rnd_scale = 1;
33 | const char *const info_msg = "INFO: using 53-bit-significand form of floating-double rounding constant for scalar-mode DNINT emulation. \n";
34 | #endif
35 | *RND_A = *RND_B = 3.0*0x4000000*0x2000000*rnd_scale; /* 3*2^51 times the scale; equals the 12.0*0x2000000*0x1000000 form exactly */
36 | fprintf(stderr,"%s",info_msg);
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/src/gpu_iface.cu:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2012 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | // Thanks to Jason Papadopoulos for the original version of the GPU interface ... this is now so
24 | // heavily modified by me that any resemblance to the original in the nontrivial details should be
25 | // considered coincidental, and any faults strictly mine.
26 |
27 | #include "gpu_iface.h"
28 |
29 | #ifdef __CUDACC__ // Compile-time diagnostics: report which compiler and code path this translation unit is being built with
30 | #warning using nvcc
31 | #ifdef __CUDA_ARCH__ // Device-code pass
32 | #warning device code trajectory
33 | #if __CUDA_ARCH__ > 120 // Architectures above 1.2 are reported as double-precision builds
34 | #warning compiling with double precision
35 | #else
36 | #warning compiling with single precision
37 | #endif
38 | #else // Host-code pass under nvcc
39 | #warning nvcc host code trajectory
40 | #endif
41 | #else // Not being compiled by nvcc
42 | #warning non-nvcc code trajectory
43 | #endif
44 |
45 | #ifndef OS_BITS // OS_BITS must be defined as 32 or 64; anything else is a hard error
46 | #error Bitness not defined!
47 | #elif OS_BITS == 32
48 | #warning compiling in 32-bit mode
49 | #elif OS_BITS == 64
50 | #warning compiling in 64-bit mode
51 | #else
52 | #error Bitness defined but not supported!
53 | #endif
54 |
55 | // 50 Ways to say "Houston, we have a problem": map a CUresult status code to the name of its enumerator, or to a generic message for any code not listed.
56 | #define CU_RESULT_NAME(code) case code: return #code
57 | char *cuGetErrorMessage(CUresult result) {
58 | switch (result) {
59 | CU_RESULT_NAME(CUDA_SUCCESS);
60 | CU_RESULT_NAME(CUDA_ERROR_INVALID_VALUE);
61 | CU_RESULT_NAME(CUDA_ERROR_OUT_OF_MEMORY);
62 | CU_RESULT_NAME(CUDA_ERROR_NOT_INITIALIZED);
63 | CU_RESULT_NAME(CUDA_ERROR_DEINITIALIZED);
64 | CU_RESULT_NAME(CUDA_ERROR_NO_DEVICE);
65 | CU_RESULT_NAME(CUDA_ERROR_INVALID_DEVICE);
66 | CU_RESULT_NAME(CUDA_ERROR_INVALID_IMAGE);
67 | CU_RESULT_NAME(CUDA_ERROR_INVALID_CONTEXT);
68 | CU_RESULT_NAME(CUDA_ERROR_CONTEXT_ALREADY_CURRENT);
69 | CU_RESULT_NAME(CUDA_ERROR_MAP_FAILED);
70 | CU_RESULT_NAME(CUDA_ERROR_UNMAP_FAILED);
71 | CU_RESULT_NAME(CUDA_ERROR_ARRAY_IS_MAPPED);
72 | CU_RESULT_NAME(CUDA_ERROR_ALREADY_MAPPED);
73 | CU_RESULT_NAME(CUDA_ERROR_NO_BINARY_FOR_GPU);
74 | CU_RESULT_NAME(CUDA_ERROR_ALREADY_ACQUIRED);
75 | CU_RESULT_NAME(CUDA_ERROR_NOT_MAPPED);
76 | CU_RESULT_NAME(CUDA_ERROR_INVALID_SOURCE);
77 | CU_RESULT_NAME(CUDA_ERROR_FILE_NOT_FOUND);
78 | CU_RESULT_NAME(CUDA_ERROR_INVALID_HANDLE);
79 | CU_RESULT_NAME(CUDA_ERROR_NOT_FOUND);
80 | CU_RESULT_NAME(CUDA_ERROR_NOT_READY);
81 | CU_RESULT_NAME(CUDA_ERROR_LAUNCH_FAILED);
82 | CU_RESULT_NAME(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
83 | CU_RESULT_NAME(CUDA_ERROR_LAUNCH_TIMEOUT);
84 | CU_RESULT_NAME(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING);
85 | CU_RESULT_NAME(CUDA_ERROR_UNKNOWN);
86 | default: return "CUDA: unexpected error";
87 | }
88 | }
89 | #undef CU_RESULT_NAME
90 |
91 | // Read information on all available GPUs into input arg. Devices of compute capability < 2.x, or whose properties cannot be read, are skipped; on return gpu_config->num_gpu is the number of devices kept, whose info occupies the leading entries of gpu_config->gpu_info. If the device count itself cannot be obtained, num_gpu is set to 0.
92 | void
93 | gpu_init(gpu_config_t *gpu_config)
94 | {
95 | int32 device, nskip = 0;
96 | memset(gpu_config, 0, sizeof(gpu_config_t));
97 | // NB: The runtime-API calls below return cudaError_t rather than CUresult, so cannot be wrapped in CUDA_TRY - check their status explicitly instead:
98 | cudaError_t err = cudaGetDeviceCount(&gpu_config->num_gpu);
99 | if(err != cudaSuccess) { printf("ERROR: cudaGetDeviceCount failed: %s\n",cudaGetErrorString(err)); gpu_config->num_gpu = 0; return; }
100 | for (device = 0; device < (int32)gpu_config->num_gpu; device++)
101 | {
102 | // Get pointer to info slot for the [device]th GPU; the slot of a skipped device gets reused by the next device:
103 | gpu_info_t *info = gpu_config->gpu_info + device - nskip;
104 | err = cudaGetDeviceProperties(info, device);
105 | if(err != cudaSuccess) { printf("GPU #%d: cudaGetDeviceProperties failed (%s) ... ignoring this device.\n",device,cudaGetErrorString(err)); ++nskip; continue; }
106 | if(info->major < 2) {
107 | printf("GPU #%d compute capability %d.%d is less than min-supported 2.x ... ignoring this device.\n",device,info->major,info->minor);
108 | ++nskip;
109 | }
110 | // Note: Devices with cc = 2.x have (32 + 16*x) shader cores per multiprocessor (At least for x = 0 and 1 ... may need table for this)
111 | }
112 | gpu_config->num_gpu -= nskip;
113 | return;
114 | }
115 |
116 | #ifdef GPU_IFACE_STANDALONE // Standalone test driver: prints the properties of every usable GPU; exits with 0 if at least one was found, -1 otherwise
117 | int main(int argc, char *argv[])
118 | {
119 | gpu_config_t gpu_config;
120 | gpu_info_t ginfo;
121 | int32 igpu;
122 |
123 | gpu_init(&gpu_config);
124 | if (gpu_config.num_gpu > 0) {
125 | printf("Detected %u CUDA-enabled GPU devices.\n", gpu_config.num_gpu);
126 | for(igpu = 0; igpu < gpu_config.num_gpu; ++igpu) {
127 | ginfo = gpu_config.gpu_info[igpu];
128 | printf("GPU #%u: %s v%u.%u\n", igpu, ginfo.name, ginfo.major, ginfo.minor);
129 | printf("clock_speed = %u MHz\n", ginfo.clockRate/1000);
130 | printf("num_compute_units = %u\n", ginfo.multiProcessorCount);
131 | printf("constant_mem_size = %zu\n", (size_t)ginfo.totalConstMem); // The three memory sizes are size_t-valued, so use %zu: %u mismatches the arg width on 64-bit builds
132 | printf("shared_mem_size = %zu\n", (size_t)ginfo.sharedMemPerBlock);
133 | printf("global_mem_size = %zu\n", (size_t)ginfo.totalGlobalMem);
134 | printf("registers_per_block = %u\n", ginfo.regsPerBlock);
135 | printf("max_threads_per_block = %u\n", ginfo.maxThreadsPerBlock);
136 | printf("can_overlap = %u\n", ginfo.deviceOverlap);
137 | printf("concurrent_kernels = %u\n", ginfo.concurrentKernels);
138 | printf("warp_size = %u\n", ginfo.warpSize);
139 | printf("max_thread_dim[3] = [%u,%u,%u]\n", ginfo.maxThreadsDim[0], ginfo.maxThreadsDim[1], ginfo.maxThreadsDim[2]);
140 | printf("max_grid_size[3] = [%u,%u,%u]\n", ginfo.maxGridSize[0], ginfo.maxGridSize[1], ginfo.maxGridSize[2]);
141 | }
142 | exit(0);
143 | } else {
144 | printf("ERROR: No CUDA-enabled GPUs found\n");
145 | exit(-1);
146 | }
147 | }
148 | #endif
149 |
150 |
--------------------------------------------------------------------------------
/src/gpu_iface.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | // Thanks to Jason Papadopoulos for the original version of the GPU interface ... this is now so
24 | // heavily modified by me that any resemblance to the original in the nontrivial details should be
25 | // considered coincidental, and any faults strictly mine.
26 |
27 | #ifndef gpu_iface_h_included
28 | #define gpu_iface_h_included
29 |
30 | #ifndef GPU_IFACE_STANDALONE
31 | // Non-standalone build assumes the non-main functions in this file will serve as GPU diagnostics
32 | // for an Mlucas or Mfactor build, so require same compile flag as for the other sources in such a build:
33 | #ifndef USE_GPU
34 | #error Compilation of any source file using a gpu-specific header requires the user-defined preprocessor flag USE_GPU
35 | #endif
36 |
37 | #include "masterdefs.h"
38 | #include "types.h"
39 | #else
40 | #include
41 | typedef int int32;
42 | #endif
43 |
44 | #include
45 | #include
46 | #include
47 |
48 | #ifdef __cplusplus
49 | extern "C" {
50 | #endif
51 |
52 | #define MAX_GPU 16
53 |
54 | typedef struct cudaDeviceProp gpu_info_t;
55 | /*
56 | cudaDeviceProp struct members:
57 |
58 | int canMapHostMemory Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer.
59 | int clockRate Clock frequency in kilohertz.
60 | int computeMode Compute mode (See cudaComputeMode).
61 | int deviceOverlap Device can concurrently copy memory and execute a kernel.
62 | int integrated Device is integrated as opposed to discrete.
63 | int kernelExecTimeoutEnabled Specified whether there is a run time limit on kernels.
64 | int major Major compute capability.
65 | int minor Minor compute capability.
66 | int maxGridSize [3] Maximum size of each dimension of a grid.
67 | int maxThreadsDim [3] Maximum size of each dimension of a block.
68 | int maxThreadsPerBlock Maximum number of threads per block.
69 | size_t memPitch Maximum pitch in bytes allowed by memory copies.
70 | int multiProcessorCount Number of multiprocessors on device.
71 | char name [256] ASCII string identifying device.
72 | int regsPerBlock 32-bit registers available per block
73 | size_t sharedMemPerBlock Shared memory available per block in bytes.
74 | size_t textureAlignment Alignment requirement for textures.
75 | size_t totalConstMem Constant memory available on device in bytes.
76 | size_t totalGlobalMem Global memory available on device in bytes.
77 | int warpSize Warp size in threads.
78 | */
79 |
80 | typedef struct { // Result record filled in by gpu_init(): how many devices were accepted, plus their property structs
81 | int32 num_gpu; // Device count left after gpu_init() subtracts the devices it skipped as below the minimum compute capability
82 | gpu_info_t gpu_info[MAX_GPU]; // One cudaDeviceProp per device; fixed capacity of MAX_GPU (16) entries
83 | } gpu_config_t;
84 |
85 | char * cuGetErrorMessage(CUresult result);
86 |
87 | void gpu_init(gpu_config_t *config);
88 |
89 | #define CUDA_TRY(func) /* Evaluate func once; on any status other than CUDA_SUCCESS print the call-site line number and the cuGetErrorMessage() text, then exit(-1) */ \
90 | { /* Expands to a bare brace block, so an invocation needs no trailing ';' */ \
91 | CUresult status = func; /* Status is typed CUresult: notes in gpu_iface.cu report that calls returning cudaError_t fail to compile when wrapped in this macro */ \
92 | if (status != CUDA_SUCCESS) { \
93 | printf("error (line %d): %s\n", __LINE__,\
94 | cuGetErrorMessage(status)); \
95 | exit(-1); \
96 | } \
97 | }
98 |
99 | #define CUDA_ALIGN_PARAM(offset, pow2align) /* Round offset up, in place, to the next multiple of pow2align; the mask arithmetic is only correct when pow2align is a power of 2 */ \
100 | (offset) = ((offset) + (pow2align) - 1) & ~((pow2align) - 1) /* offset must be an lvalue; both arguments are expanded more than once, so pass side-effect-free expressions */
101 |
102 | #ifdef __cplusplus
103 | }
104 | #endif
105 |
106 | #endif /* gpu_iface_h_included */
107 |
108 |
--------------------------------------------------------------------------------
/src/imul_macro.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef imul_macro_h_included
27 | #define imul_macro_h_included
28 |
29 | #include "imul_macro0.h"
30 | #include "imul_macro1.h"
31 |
32 | #endif /* imul_macro_h_included */
33 |
34 |
--------------------------------------------------------------------------------
/src/masterdefs.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef masterdefs_h_included
27 | #define masterdefs_h_included
28 |
29 | #include
30 | #include
31 | #include
32 | #include
33 | #include
34 | #include
35 | #include
36 | #include
37 | #include
38 | #include
39 | #include
40 | #include // Nov 2021: Add to provide the POSIX case-insensitive string compares strcasecmp() and strncasecmp();
41 | // cf. https://stackoverflow.com/questions/5820810/case-insensitive-string-comparison-in-c
42 | #include
43 |
44 | #ifdef macintosh
45 | #include /* Macintosh CW */
46 | #endif
47 |
48 | #undef EWM_DEBUG
49 | #define EWM_DEBUG 0 /* Set = 1 to turn on various debugging diagnostics, especially DBG_ASSERT, defined in util.c . */
50 |
51 | /* cf. util.h|c : If debug enabled, alias DBG_ASSERT to ASSERT (a function defined
52 | in util.c), otherwise alias the entire 4-argument DBG_ASSERT invocation to "Bolivian"
53 | (to paraphrase ex-heavyweight boxing champ Mike Tyson.) */
54 | #if EWM_DEBUG /* Debug build: the DBG_* names become plain aliases for ASSERT, WARN and INFO. EWM_DEBUG is set to 0 just above, so the #else branch is the one compiled by default */
55 | #define DBG_ASSERT ASSERT
56 | #define DBG_WARN WARN
57 | #define DBG_INFO INFO
58 | #else /* Bolivian - every DBG_* invocation expands to nothing, so its arguments are never evaluated. Lump both the FILE and LINE args together as a single __here, that's why it looks like these take 1 less arg than the underlying functions: */
59 | #define DBG_ASSERT(__arg1, __arg2) /* expands to nothing */
60 | #define DBG_WARN(__here, __arg2, __arg3, __arg4) /* expands to nothing */
61 | #define DBG_INFO(__here, __arg2, __arg3, __arg4) /* expands to nothing */
62 | #endif /* EWM_DEBUG */
63 |
64 | /*******************************************************************************
65 | Mlucas-specific master #defines:
66 | *******************************************************************************/
67 |
68 | /* Set = 1 to do a simple FFT/IFFT-returns-original-inputs test
69 | (sans weighting and dyadic squaring) using pseudorandom inputs:
70 | */
71 | #undef FFT_DEBUG
72 | #define FFT_DEBUG 0
73 |
74 | #undef NOBRANCH
75 | #define NOBRANCH 1 /* Switch between branched and branchless versions of various key sequences. */
76 |
77 | #ifndef LO_ADD
78 | #define LO_ADD 1 /* TRUE = use algorithm with more mul and fewer add */
79 | #endif
80 |
81 | #undef N_LEADING_RADICES
82 | #define N_LEADING_RADICES 8 /* # of intervals we split adjacent power-of-2 transform lengths into */
83 |
84 | #endif /* masterdefs_h_included */
85 |
--------------------------------------------------------------------------------
/src/pair_square.c:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "Mlucas.h"
24 |
25 | /***************/
26 |
27 | /*
28 | Macro versions of these are in pair_square.h, since radix32_wrapper_square.c also needs to inline those;
29 | SSE2 macros for this are in sse2_macro_gcc64.h.
30 | */
31 | void pair_square(double *x1, double *y1, double *x2, double *y2, double c, double s) // In-place: all four pointer targets are read first, then overwritten with the results; c and s are only read
32 | {
33 | /*
34 | ! Given complex scalars H[j] = (*x1,*y1) and H[N-j] = (*x2,*y2) along with complex exponential E = (c,s),
35 | ! calculates I[j] = H[j]^2 + {1 + exp(2*pi*I*j/N)}*{H[j]-H~[N-j]}^2/4 (~ denotes complex conjugation) together with the matching N-j term, and returns the former in H[j] and the latter in H[N-j].
36 | ! As coded in the live #else variant, with T = ((c+1) + I*s)*{H[j]-H~[N-j]}^2/4 the outputs are H[j]^2 - T and H[N-j]^2 - T~, i.e. T is subtracted. NOTE(review): the '+' above presumably reflects the sign convention of the (c,s) passed by callers - confirm.
37 | */
38 | // Use that (H[j] - H~[N-j])^2 = H[j]^2 - 2*H[j]*H~[N-j] + H~[N-j]^2 to efficiently compute both (H[j]-H~[N-j])^2 and H[j]^2:
39 | #if 0 // Disabled variant built on the cross-product identity above; kept for reference only
40 | double rt0,rt1,rt2,rt3,it1,it2,it3;
41 | // H[j] = (r1,i1); H[N-j] = (r2,i2):
42 | rt1 = *x1; it1 = *y1; rt2 = *x2; it2 = *y2; // H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)]
43 | // = [(r1^2-i1^2) + (r2^2-i2^2) - 2.(r1.r2+i1.i2)] + 2.I.[(r1.i1-r2.i2) - (i1.r2-r1.i2)]
44 | // Calculate cross product terms:
45 | rt3 = rt1*rt2 + it1*it2; rt3 = rt3 + rt3; // 2.(r1.r2 + i1.i2)
46 | it3 = it1*rt2 - rt1*it2; it3 = it3 + it3; // 2.(i1.r2 - r1.i2)
47 | // Now calculate square terms and store back in the same temporaries:
48 | rt0 = (rt1 + it1)*(rt1 - it1); it1 = rt1*it1; it1 = it1 + it1; rt1 = rt0; // rt1,it1 = (r1^2-i1^2); 2.r1.i1
49 | rt0 = (rt2 + it2)*(rt2 - it2); it2 = rt2*it2; it2 = it2 + it2; rt2 = rt0; // rt2,it2 = (r2^2-i2^2); 2.r2.i2
50 | // {1 + exp(2*pi*I*j/N)}*{H[j]-H~[N-j]}^2/4 :
51 | rt3 = rt1 + rt2 - rt3; // Re{(H[j]-H~[N-j])^2}, per the expansion above
52 | it3 = it1 - it2 - it3; // Im{(H[j]-H~[N-j])^2}
53 | rt0 = ((c + 1.0)*rt3 - s*it3)*0.25;
54 | it3 = (s*rt3 + (c + 1.0)*it3)*0.25;
55 | // And now complete and store the results:
56 | *x1 = (rt1 - rt0); // Re(I[j])
57 | *y1 = (it1 - it3); // Im(I[j])
58 | // N-j terms are as above, but with the replacements: rt1<-->rt2, it1<-->it2, it3|-->-it3:
59 | *x2 = (rt2 - rt0);
60 | *y2 = (it2 + it3);
61 | // Cost: [22 add, 12 mul], compared to [18 add, 18 mul] for generic-mul version ... seems too add-heavy.
62 | #elif 0 // Quick test of mul version of this function, using square inputs (also disabled):
63 | double re,im,tt;
64 | /*...gather the 4 complex elements which are to be combined...*/
65 | // Re{H[j]} Im{H[j]} Re{I[j]} Im{I[j]} Re{H[N-j]} Im{H[N-j]} Re{I[N-j]} Im{I[N-j]}
66 | double r1 = *x1, i1 = *y1, r2 = *x1, i2 = *y1, r3 = *x2, i3 = *y2, r4 = *x2, i4 = *y2;
67 | // calculate 2nd square-like term and store in temp...
68 | re = r3*r4 - i3*i4; // re := Re{H(n2-j)*I(n2-j)}
69 | im = r3*i4 + i3*r4; // im := Im{H(n2-j)*I(n2-j)}
70 | // calculate difference terms...
71 | r3 = r1 - r3; // r3 := Re{H(j)-H~(n2-j)}
72 | i3 = i1 + i3; // i3 := Im{H(j)-H~(n2-j)}
73 | r4 = r2 - r4; // r4 := Re{I(j)-I~(n2-j)}
74 | i4 = i2 + i4; // i4 := Im{I(j)-I~(n2-j)}
75 | // now calculate 1st square-like term and store back in H(j) slot...
76 | tt = r1*r2 - i1*i2; // r1 := Re{H(j)*I(j)}
77 | i1 = r1*i2 + i1*r2; r1 = tt;// i1 := Im{H(j)*I(j)}
78 | // calculate the complex products to build the second term...
79 | tt = r3*r4 - i3*i4; // Re{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])}
80 | i3 = r3*i4 + i3*r4; r3 = tt;// Im{(H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])}
81 | tt = ((c + 1.0)*r3 - s*i3)*0.25; // Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4}
82 | i3 = (s*r3 + (c + 1.0)*i3)*0.25; // Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])*(I[j] - I~[N/2-j])/4}
83 | // and now complete and store the results.
84 | *x1 = (r1-tt); // Re{M(j)}
85 | *y1 = (i1-i3); // Im{M(j)}
86 | // N-j terms are as above, but with the replacements: r1<-->r2, i1<-->i2, i3|-->-i3.
87 | *x2 = (re-tt); // Re{M(N-j)}
88 | *y2 = (im+i3); // Im{M(N-j)}
89 | #else // Live variant: the only one of the three that is compiled
90 | double re,im,tt, r1 = *x1, i1 = *y1, r2 = *x2, i2 = *y2, cc = (c + 1.0)*0.25, ss = s*0.25; // (r1,i1) = H[j], (r2,i2) = H[N-j]; (cc,ss) = (1 + E)/4
91 | // H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)]
92 | // calculate 2nd square-like term and store in temp - this must precede the overwrite of (r2,i2) below...
93 | re = (r2+i2)*(r2-i2); // re := Re{H[N-j]^2}
94 | im = r2*i2 + i2*r2; // im := Im{H[N-j]^2}
95 | // calculate difference terms, overwriting (r2,i2)...
96 | r2 = r1 - r2; // r2 := Re{H[j]-H~[N-j]}
97 | i2 = i1 + i2; // i2 := Im{H[j]-H~[N-j]}
98 | // now calculate 1st square-like term and store back in the (r1,i1) temporaries...
99 | tt = (r1+i1)*(r1-i1); // tt := Re{H[j]^2}; held in tt because the next line still needs the old r1
100 | i1 = r1*i1 + i1*r1; r1 = tt;// i1 := Im{H[j]^2}; r1 := Re{H[j]^2}
101 | // square the difference term in place, then scale it by (1 + E)/4...
102 | tt = (r2+i2)*(r2-i2); // Re{(H[j] - H~[N-j])^2}
103 | i2 = r2*i2 + i2*r2; r2 = tt;// Im{(H[j] - H~[N-j])^2}
104 | tt = (cc*r2 - ss*i2); // tt := Re{T}, where T = (1 + E)*(H[j] - H~[N-j])^2/4
105 | i2 = (ss*r2 + cc*i2); // i2 := Im{T}
106 | // and now complete and store the results.
107 | *x1 = (r1-tt); // Re{H[j]^2 - T}
108 | *y1 = (i1-i2); // Im{H[j]^2 - T}
109 | // N-j terms are as above, but with (re,im) = H[N-j]^2 in place of (r1,i1) and the sign of Im{T} flipped, i.e. T~ is subtracted.
110 | *x2 = (re-tt); // Re{H[N-j]^2 - T~}
111 | *y2 = (im+i2); // Im{H[N-j]^2 - T~}
112 | // Cost: [19 add, 15 mul] ... or [16 add, 18 mul] if replace re-part-of-cmuls (r+i)*(r-i) with r^2-i^2.
113 | // Can save another [2 add, 2 mul] by precomputing cc = (c + 1.0)/4 and ss = s/4.
114 | #endif
115 | }
116 |
117 | // Jul 2019: This routine adapted from my vintage 1999 mersenne_pm1.f90 code, with input-index swap 2 <--> 3:
118 | void pair_mul(
119 | double *x1, double *y1, double *x2, double *y2, const double sx3, const double sy3, const double sx4, const double sy4,
120 | const double c, const double s)
121 | {
122 | /*
123 | ! Given complex scalars H[j] = (*x1,*y1), H[N-j] = (*x2,*y2) and (const)I[j] = (sx3,sy3), I[N-j] = (sx4,sy4)
124 | ! along with complex exponential E = (c,s),
125 | ! calculates M[j] = H[j]*I[j] + {1 + exp(4*pi*I*j/N)}*{H[j]-H~[N-j]}*{I[j]-I~[N-j]}/4 (~ denotes complex conjugation) together with the matching N-j term,
126 | ! returns the former in H[j] and the latter in H[N-j], thus overwriting those non-const inputs. As coded, with T = ((c+1) + I*s)*{H[j]-H~[N-j]}*{I[j]-I~[N-j]}/4 the outputs are H[j]*I[j] - T and H[N-j]*I[N-j] - T~, i.e. T is subtracted. NOTE(review): the '+' above presumably reflects the sign convention of the (c,s) passed by callers - confirm.
127 | */
128 | double re,im,tt, cc = (c + 1.0)*0.25, ss = s*0.25; // (cc,ss) = (1 + E)/4, computed once up front
129 | /*...gather the 4 complex elements which are to be combined...*/
130 | // Re{H[j]} Im{H[j]} Re{H[N-j]} Im{H[N-j]} Re{I[j]} Im{I[j]} Re{I[N-j]} Im{I[N-j]}
131 | double r1 = *x1, i1 = *y1, r2 = *x2, i2 = *y2, r3 = sx3, i3 = sy3, r4 = sx4, i4 = sy4;
132 |
133 | /*...Have: H[j], H[N-j], I[j], I[N-j]; need: H[j]*I[j], H[N-j]*I[N-j], H[j] - H~[N-j], I[j] - I~[N-j]. Use the sequence:
134 | Find H[N-j]*I[N-j], store in (re,im)
135 | Find H[j] - H~[N-j], store in (r2,i2)
136 | Find I[j] - I~[N-j], store in (r4,i4)
137 | Find H[j]*I[j], store in (r1,i1)
138 | (re,im) keeps H[N-j]*I[N-j] until the final store; the by-value I inputs are never written back
139 | */
140 | // calculate 2nd square-like term and store in temp - this must precede the overwrites of (r2,i2) and (r4,i4) below...
141 | re = r2*r4 - i2*i4; // re := Re{H[N-j]*I[N-j]}
142 | im = r2*i4 + i2*r4; // im := Im{H[N-j]*I[N-j]}
143 | // calculate difference terms...
144 | r2 = r1 - r2; // r2 := Re{H[j]-H~[N-j]}
145 | i2 = i1 + i2; // i2 := Im{H[j]-H~[N-j]}
146 | r4 = r3 - r4; // r4 := Re{I[j]-I~[N-j]}
147 | i4 = i3 + i4; // i4 := Im{I[j]-I~[N-j]}
148 | // now calculate 1st square-like term and store back in the (r1,i1) temporaries...
149 | tt = r1*r3 - i1*i3; // tt := Re{H[j]*I[j]}; held in tt because the next line still needs the old r1
150 | i1 = r1*i3 + i1*r3; r1 = tt;// i1 := Im{H[j]*I[j]}; r1 := Re{H[j]*I[j]}
151 | // calculate the complex products to build the second term...
152 | tt = r2*r4 - i2*i4; // Re{(H[j] - H~[N-j])*(I[j] - I~[N-j])}
153 | i2 = r2*i4 + i2*r4; r2 = tt;// Im{(H[j] - H~[N-j])*(I[j] - I~[N-j])}
154 | tt = (cc*r2 - ss*i2); // tt := Re{T}, where T = (1 + E)*(H[j] - H~[N-j])*(I[j] - I~[N-j])/4
155 | i2 = (ss*r2 + cc*i2); // i2 := Im{T}
156 | // and now complete and store the results.
157 | *x1 = (r1-tt); // Re{H[j]*I[j] - T}
158 | *y1 = (i1-i2); // Im{H[j]*I[j] - T}
159 | // N-j terms are as above, but with (re,im) = H[N-j]*I[N-j] in place of (r1,i1) and the sign of Im{T} flipped, i.e. T~ is subtracted.
160 | *x2 = (re-tt); // Re{H[N-j]*I[N-j] - T~}
161 | *y2 = (im+i2); // Im{H[N-j]*I[N-j] - T~}
162 | // Cost: 16 add, 16 mul [Ignoring the (1 add, 2 mul) cost of the cc,ss precomputation]
163 | }
164 |
165 |
--------------------------------------------------------------------------------
/src/prefetch.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/primesearch/Mlucas/5e6465318b8c656ffb83025229038f5c2614fa35/src/prefetch.h
--------------------------------------------------------------------------------
/src/qfcheb.h:
--------------------------------------------------------------------------------
1 | #define STR_MAX_LEN 1024
2 | extern char cbuf[STR_MAX_LEN*2];
3 |
--------------------------------------------------------------------------------
/src/radix1024.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix1024_included
27 | #define radix1024_included
28 |
29 | #include "radix512.h"
30 |
31 | // 'bc -l' code for these: p2=8*a(1);d=p2/1024;t=-d; t+=(d+d);c(t);s(t); [repeat 64 times]:
32 | // Of the odd-order 1024th roots, note that _4f,_53,_7f end up being unused by the radix-1024 DFT twiddles array:
33 | #define c1024_01 ((double)0.99998117528260114265)
34 | #define s1024_01 ((double)0.00613588464915447535) /* exp(01*I*twopi/1024) */
35 | #define c1024_03 ((double)0.99983058179582342201)
36 | #define s1024_03 ((double)0.01840672990580482090) /* exp(03*I*twopi/1024) */
37 | #define c1024_05 ((double)0.99952941750109316308)
38 | #define s1024_05 ((double)0.03067480317663662588) /* exp(05*I*twopi/1024) */
39 | #define c1024_07 ((double)0.99907772775264538289)
40 | #define s1024_07 ((double)0.04293825693494082301) /* exp(07*I*twopi/1024) */
41 | #define c1024_09 ((double)0.99847558057329475221)
42 | #define s1024_09 ((double)0.05519524434968993972) /* exp(09*I*twopi/1024) */
43 | #define c1024_0b ((double)0.99772306664419160985)
44 | #define s1024_0b ((double)0.06744391956366405780) /* exp(0b*I*twopi/1024) */
45 | #define c1024_0d ((double)0.99682029929116571498)
46 | #define s1024_0d ((double)0.07968243797143012103) /* exp(0d*I*twopi/1024) */
47 | #define c1024_0f ((double)0.99576741446765979399)
48 | #define s1024_0f ((double)0.09190895649713272849) /* exp(0f*I*twopi/1024) */
49 | #define c1024_11 ((double)0.99456457073425545213)
50 | #define s1024_11 ((double)0.10412163387205457897) /* exp(11*I*twopi/1024) */
51 | #define c1024_13 ((double)0.99321194923479453312)
52 | #define s1024_13 ((double)0.11631863091190476708) /* exp(13*I*twopi/1024) */
53 | #define c1024_15 ((double)0.99170975366909952288)
54 | #define s1024_15 ((double)0.12849811079379317243) /* exp(15*I*twopi/1024) */
55 | #define c1024_17 ((double)0.99005821026229710553)
56 | #define s1024_17 ((double)0.14065823933284923051) /* exp(17*I*twopi/1024) */
57 | #define c1024_19 ((double)0.98825756773074949143)
58 | #define s1024_19 ((double)0.15279718525844342750) /* exp(19*I*twopi/1024) */
59 | #define c1024_1b ((double)0.98630809724459864790)
60 | #define s1024_1b ((double)0.16491312048996992118) /* exp(1b*I*twopi/1024) */
61 | #define c1024_1d ((double)0.98421009238692907323)
62 | #define s1024_1d ((double)0.17700422041214875594) /* exp(1d*I*twopi/1024) */
63 | #define c1024_1f ((double)0.98196386910955526412)
64 | #define s1024_1f ((double)0.18906866414980621248) /* exp(1f*I*twopi/1024) */
65 | #define c1024_21 ((double)0.97956976568544053449)
66 | #define s1024_21 ((double)0.20110463484209191127) /* exp(21*I*twopi/1024) */
67 | #define c1024_23 ((double)0.97702814265775435155)
68 | #define s1024_23 ((double)0.21311031991609137366) /* exp(23*I*twopi/1024) */
69 | #define c1024_25 ((double)0.97433938278557586059)
70 | #define s1024_25 ((double)0.22508391135979283567) /* exp(25*I*twopi/1024) */
71 | #define c1024_27 ((double)0.97150389098625177561)
72 | #define s1024_27 ((double)0.23702360599436720653) /* exp(27*I*twopi/1024) */
73 | #define c1024_29 ((double)0.96852209427441731631)
74 | #define s1024_29 ((double)0.24892760574572016775) /* exp(29*I*twopi/1024) */
75 | #define c1024_2b ((double)0.96539444169768937465)
76 | #define s1024_2b ((double)0.26079411791527551791) /* exp(2b*I*twopi/1024) */
77 | #define c1024_2d ((double)0.96212140426904159553)
78 | #define s1024_2d ((double)0.27262135544994898410) /* exp(2d*I*twopi/1024) */
79 | #define c1024_2f ((double)0.95870347489587155549)
80 | #define s1024_2f ((double)0.28440753721127184321) /* exp(2f*I*twopi/1024) */
81 | #define c1024_31 ((double)0.95514116830577072162)
82 | #define s1024_31 ((double)0.29615088824362382370) /* exp(31*I*twopi/1024) */
83 | #define c1024_33 ((double)0.95143502096900836968)
84 | #define s1024_33 ((double)0.30784964004153489325) /* exp(33*I*twopi/1024) */
85 | #define c1024_35 ((double)0.94758559101774113480)
86 | #define s1024_35 ((double)0.31950203081601567745) /* exp(35*I*twopi/1024) */
87 | #define c1024_37 ((double)0.94359345816196036165)
88 | #define s1024_37 ((double)0.33110630575987640127) /* exp(37*I*twopi/1024) */
89 | #define c1024_39 ((double)0.93945922360218991213)
90 | #define s1024_39 ((double)0.34266071731199439711) /* exp(39*I*twopi/1024) */
91 | #define c1024_3b ((double)0.93518350993894757782)
92 | #define s1024_3b ((double)0.35416352542049038186) /* exp(3b*I*twopi/1024) */
93 | #define c1024_3d ((double)0.93076696107898373214)
94 | #define s1024_3d ((double)0.36561299780477386950) /* exp(3d*I*twopi/1024) */
95 | #define c1024_3f ((double)0.92621024213831134218)
96 | #define s1024_3f ((double)0.37700741021641825620) /* exp(3f*I*twopi/1024) */
97 | #define c1024_41 ((double)0.92151403934204194368)
98 | #define s1024_41 ((double)0.38834504669882629109) /* exp(41*I*twopi/1024) */
99 | #define c1024_43 ((double)0.91667905992104266335)
100 | #define s1024_43 ((double)0.39962419984564682799) /* exp(43*I*twopi/1024) */
101 | #define c1024_45 ((double)0.91170603200542985165)
102 | #define s1024_45 ((double)0.41084317105790394162) /* exp(45*I*twopi/1024) */
103 | #define c1024_47 ((double)0.90659570451491536559)
104 | #define s1024_47 ((double)0.42200027079979968537) /* exp(47*I*twopi/1024) */
105 | #define c1024_49 ((double)0.90134884704602201485)
106 | #define s1024_49 ((double)0.43309381885315196790) /* exp(49*I*twopi/1024) */
107 | #define c1024_4b ((double)0.89596624975618515621)
108 | #define s1024_4b ((double)0.44412214457042923104) /* exp(4b*I*twopi/1024) */
109 | #define c1024_4d ((double)0.89044872324475789026)
110 | #define s1024_4d ((double)0.45508358712634382292) /* exp(4d*I*twopi/1024) */
111 | #define c1024_4f ((double)0.88479709843093778043)
112 | #define s1024_4f ((double)0.46597649576796617728) /* exp(4f*I*twopi/1024) */
113 | #define c1024_51 ((double)0.87901222642863347817)
114 | #define s1024_51 ((double)0.47679923006332213271) /* exp(51*I*twopi/1024) */
115 | #define c1024_53 ((double)0.87309497841829009899)
116 | #define s1024_53 ((double)0.48755016014843595399) /* exp(53*I*twopi/1024) */
117 | #define c1024_55 ((double)0.86704624551569265185)
118 | #define s1024_55 ((double)0.49822766697278185175) /* exp(55*I*twopi/1024) */
119 | #define c1024_57 ((double)0.86086693863776727973)
120 | #define s1024_57 ((double)0.50883014254310703626) /* exp(57*I*twopi/1024) */
121 | #define c1024_59 ((double)0.85455798836540052117)
122 | #define s1024_59 ((double)0.51935599016558958668) /* exp(59*I*twopi/1024) */
123 | #define c1024_5b ((double)0.84812034480329725170)
124 | #define s1024_5b ((double)0.52980362468629466753) /* exp(5b*I*twopi/1024) */
125 | #define c1024_5d ((double)0.84155497743689841004)
126 | #define s1024_5d ((double)0.54017147272989288060) /* exp(5d*I*twopi/1024) */
127 | #define c1024_5f ((double)0.83486287498638005676)
128 | #define s1024_5f ((double)0.55045797293660480227) /* exp(5f*I*twopi/1024) */
129 | #define c1024_61 ((double)0.82804504525775575255)
130 | #define s1024_61 ((double)0.56066157619733602312) /* exp(61*I*twopi/1024) */
131 | #define c1024_63 ((double)0.82110251499110467956)
132 | #define s1024_63 ((double)0.57078074588696727951) /* exp(63*I*twopi/1024) */
133 | #define c1024_65 ((double)0.81403632970594836217)
134 | #define s1024_65 ((double)0.58081395809576454434) /* exp(65*I*twopi/1024) */
135 | #define c1024_67 ((double)0.80684755354379927274)
136 | #define s1024_67 ((double)0.59075970185887422768) /* exp(67*I*twopi/1024) */
137 | #define c1024_69 ((double)0.79953726910790503405)
138 | #define s1024_69 ((double)0.60061647938386892590) /* exp(69*I*twopi/1024) */
139 | #define c1024_6b ((double)0.79210657730021235236)
140 | #define s1024_6b ((double)0.61038280627630945196) /* exp(6b*I*twopi/1024) */
141 | #define c1024_6d ((double)0.78455659715557523362)
142 | #define s1024_6d ((double)0.62005721176328917788) /* exp(6d*I*twopi/1024) */
143 | #define c1024_6f ((double)0.77688846567323245066)
144 | #define s1024_6f ((double)0.62963823891492702460) /* exp(6f*I*twopi/1024) */
145 | #define c1024_71 ((double)0.76910333764557963998)
146 | #define s1024_71 ((double)0.63912444486377574303) /* exp(71*I*twopi/1024) */
147 | #define c1024_73 ((double)0.76120238548426181469)
148 | #define s1024_73 ((double)0.64851440102211244430) /* exp(73*I*twopi/1024) */
149 | #define c1024_75 ((double)0.75318679904361248316)
150 | #define s1024_75 ((double)0.65780669329707865614) /* exp(75*I*twopi/1024) */
151 | #define c1024_77 ((double)0.74505778544146596311)
152 | #define s1024_77 ((double)0.66699992230363750586) /* exp(77*I*twopi/1024) */
153 | #define c1024_79 ((double)0.73681656887736987581)
154 | #define s1024_79 ((double)0.67609270357531595956) /* exp(79*I*twopi/1024) */
155 | #define c1024_7b ((double)0.72846439044822519723)
156 | #define s1024_7b ((double)0.68508366777270038056) /* exp(7b*I*twopi/1024) */
157 | #define c1024_7d ((double)0.72000250796138162984)
158 | #define s1024_7d ((double)0.69397146088965400820) /* exp(7d*I*twopi/1024) */
159 | #define c1024_7f ((double)0.71143219574521644231)
160 | #define s1024_7f ((double)0.70275474445722530165) /* exp(7f*I*twopi/1024) */
161 |
162 | #endif /* #ifndef radix1024_included */
163 |
--------------------------------------------------------------------------------
/src/radix128.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix128_included
27 | #define radix128_included
28 |
29 | #include "radix64.h"
30 |
31 | #define c128_1 ((double)0.99879545620517239271) /* Naming: c128_J = cos(J*twopi/128), s128_J = sin(J*twopi/128), J a single hex digit; only odd J = 1..f are defined in this header. */
32 | #define s128_1 ((double)0.04906767432741801425) /* exp(0x1*I*twopi/128) */
33 | #define c128_3 ((double)0.98917650996478097345)
34 | #define s128_3 ((double)0.14673047445536175165) /* exp(0x3*I*twopi/128) */
35 | #define c128_5 ((double)0.97003125319454399260)
36 | #define s128_5 ((double)0.24298017990326388994) /* exp(0x5*I*twopi/128) */
37 | #define c128_7 ((double)0.94154406518302077841)
38 | #define s128_7 ((double)0.33688985339222005068) /* exp(0x7*I*twopi/128) */
39 | #define c128_9 ((double)0.90398929312344333158)
40 | #define s128_9 ((double)0.42755509343028209431) /* exp(0x9*I*twopi/128) */
41 | #define c128_b ((double)0.85772861000027206990)
42 | #define s128_b ((double)0.51410274419322172658) /* exp(0xb*I*twopi/128), i.e. J = 11 decimal */
43 | #define c128_d ((double)0.80320753148064490981)
44 | #define s128_d ((double)0.59569930449243334345) /* exp(0xd*I*twopi/128), i.e. J = 13 decimal */
45 | #define c128_f ((double)0.74095112535495909118)
46 | #define s128_f ((double)0.67155895484701840061) /* exp(0xf*I*twopi/128), i.e. J = 15 decimal */
47 |
48 | #endif /* #ifndef radix128_included */
49 |
--------------------------------------------------------------------------------
/src/radix128_twiddles.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "radix128.h"
24 |
25 | // Skip the usual include-this-header-file-if-it-was-not-included-before #ifndef wrapper,
26 | // since this file is not for defines/typedefs and such but rather to store a lengthy const-array-declaration
27 | // and thus needs to be inline-able in multiple places in a source file making use of it.
28 |
29 | const double DFT128_TWIDDLES[16][14] = { /* Each row holds 7 complex twiddles as (re,im) pairs. With E = exp(I*twopi/128), row r is E^(k*br(r)) for k = 4,2,6,1,5,3,7, where br(r) is the 4-bit bit-reversal of r. This reading assumes cN_J/sN_J = cos/sin(J*twopi/N) for the c16/c32/c64 constants, as for c128_J, and ISRT2 = 1/sqrt(2) - those are defined in other headers, TODO confirm. */
30 | { 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0 }, /* r = 0, br = 0: all twiddles = 1 */
31 | { 0,1,ISRT2,ISRT2,-ISRT2,ISRT2,c16,s16,-s16,c16,s16,c16,-c16,s16 }, /* r = 1, br = 8 */
32 | { ISRT2,ISRT2,c16,s16,s16,c16,c32_1,s32_1,s32_3,c32_3,c32_3,s32_3,s32_1,c32_1 }, /* r = 2, br = 4 */
33 | { -ISRT2,ISRT2,s16,c16,-c16,-s16,c32_3,s32_3,-c32_1,s32_1,-s32_1,c32_1,-s32_3,-c32_3 }, /* r = 3, br = 12 */
34 | { c16,s16,c32_1,s32_1,c32_3,s32_3,c64_1,s64_1,c64_5,s64_5,c64_3,s64_3,c64_7,s64_7 }, /* r = 4, br = 2 */
35 | { -s16,c16,s32_3,c32_3,-c32_1,s32_1,c64_5,s64_5,-c64_7,s64_7,s64_1,c64_1,-c64_3,-s64_3 }, /* r = 5, br = 10 */
36 | { s16,c16,c32_3,s32_3,-s32_1,c32_1,c64_3,s64_3,s64_1,c64_1,s64_7,c64_7,-s64_5,c64_5 }, /* r = 6, br = 6 */
37 | { -c16,s16,s32_1,c32_1,-s32_3,-c32_3,c64_7,s64_7,-c64_3,-s64_3,-s64_5,c64_5,s64_1,-c64_1 }, /* r = 7, br = 14 */
38 | { c32_1,s32_1, c64_1,s64_1, c64_3,s64_3, c128_1,s128_1, c128_5,s128_5, c128_3,s128_3, c128_7,s128_7 }, /* r = 8, br = 1 */
39 | { -s32_1,c32_1, s64_7,c64_7, -c64_5,s64_5, c128_9,s128_9, -s128_d,c128_d, s128_5,c128_5, -c128_1,s128_1 }, /* r = 9, br = 9 */
40 | { s32_3,c32_3, c64_5,s64_5, s64_1,c64_1, c128_5,s128_5, s128_7,c128_7, c128_f,s128_f, -s128_3,c128_3 }, /* r = 10, br = 5 */
41 | { -c32_3,s32_3, s64_3,c64_3, -c64_7,-s64_7, c128_d,s128_d, -c128_1,-s128_1, -s128_7,c128_7, -s128_5,-c128_5 }, /* r = 11, br = 13 */
42 | { c32_3,s32_3, c64_3,s64_3, s64_7,c64_7, c128_3,s128_3, c128_f,s128_f, c128_9,s128_9, s128_b,c128_b }, /* r = 12, br = 3 */
43 | { -s32_3,c32_3, s64_5,c64_5, -c64_1,-s64_1, c128_b,s128_b, -c128_9,s128_9, -s128_1,c128_1, -c128_d,-s128_d }, /* r = 13, br = 11 */
44 | { s32_1,c32_1, c64_7,s64_7, -s64_5,c64_5, c128_7,s128_7, -s128_3,c128_3, s128_b,c128_b, -c128_f,s128_f }, /* r = 14, br = 7 */
45 | { -c32_1,s32_1, s64_1,c64_1, -s64_3,-c64_3, c128_f,s128_f, -c128_b,-s128_b, -s128_d,c128_d, s128_9,-c128_9 } /* r = 15, br = 15 */
46 | };
47 |
48 |
--------------------------------------------------------------------------------
/src/radix15_sse_macro.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /*******************************************************************************
24 | We now include this header file if it was not included before.
25 | *******************************************************************************/
26 | #ifndef radix15_sse_macro_h_included
27 | #define radix15_sse_macro_h_included
28 |
29 | #include "sse2_macro_gcc64.h"
30 |
31 | /* General indexing for twiddleless radix-15 done as 3*radix-5 followed by 5*radix-3 is as for the scalar macro above:
32 | RADIX_15_DIF(00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E)
33 | ->
34 | RADIX_05_DFT(i0,iC,i9,i6,i3, t0,t1,t2,t3,t4)
35 | RADIX_05_DFT(iA,i7,i4,i1,iD, t5,t6,t7,t8,t9)
36 | RADIX_05_DFT(i5,i2,iE,iB,i8, tA,tB,tC,tD,tE)
37 |
38 | RADIX_03_DFT(t0,t5,tA, o0,o1,o2,)
39 | RADIX_03_DFT(t1,t6,tB, oD,oE,oC,)
40 | RADIX_03_DFT(t2,t7,tC, o9,oA,oB,)
41 | RADIX_03_DFT(t3,t8,tD, o8,o6,o7,)
42 | RADIX_03_DFT(t4,t9,tE, o4,o5,o3,)
43 |
44 | In our impl below, the __i are input pointers, which may overlap the __o outputs;
45 | ..cc0 and cc1 are ptrs to the radix-3 and radix-5 SSE2 sincos constants (c3m1 and cn1);
46 | __t0-E are ptr to scratch local storage (i.e. the address block pointed to by r00-r3e).
47 | */
48 | // Aug 2014: Need arbitrary-pointer-offsets to support I/O permutations needed by
49 | // larger-radix DFTs of length 15 * 2^n
50 | // SSE2_RADIX_15_DIF: radix-15 DIF DFT composed of three radix-5 DFTs (inputs __i* -> scratch __t*) followed by five radix-3 DFTs (scratch __t* -> outputs __o*). Takes 2 constant pointers plus 15 input, 15 scratch and 15 output arguments.
51 | #define SSE2_RADIX_15_DIF(\
52 | __cc0, __cc1,\
53 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
54 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
55 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE)\
56 | {/* Radix-5 stage: each call consumes 5 permuted inputs and fills 5 consecutive scratch slots; __cc1 is passed to every radix-5 call: */\
57 | SSE2_RADIX_05_DFT_0TWIDDLE(__i0,__iC,__i9,__i6,__i3, __cc1, __t0,__t1,__t2,__t3,__t4);\
58 | SSE2_RADIX_05_DFT_0TWIDDLE(__iA,__i7,__i4,__i1,__iD, __cc1, __t5,__t6,__t7,__t8,__t9);\
59 | SSE2_RADIX_05_DFT_0TWIDDLE(__i5,__i2,__iE,__iB,__i8, __cc1, __tA,__tB,__tC,__tD,__tE);\
60 | /* Radix-3 stage: combines scratch slots (n, n+5, n+10) for n = 0..4 and writes the permuted outputs; __cc0 is passed to every radix-3 call. Each of __o0..__oE is written exactly once: */\
61 | SSE2_RADIX_03_DFT(__t0,__t5,__tA, __cc0, __o0,__o1,__o2);\
62 | SSE2_RADIX_03_DFT(__t1,__t6,__tB, __cc0, __oD,__oE,__oC);\
63 | SSE2_RADIX_03_DFT(__t2,__t7,__tC, __cc0, __o9,__oA,__oB);\
64 | SSE2_RADIX_03_DFT(__t3,__t8,__tD, __cc0, __o8,__o6,__o7);\
65 | SSE2_RADIX_03_DFT(__t4,__t9,__tE, __cc0, __o4,__o5,__o3);\
66 | }
67 |
68 | #define SSE2_RADIX_15_DIT(\
69 | __cc0, __cc1,\
70 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
71 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
72 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE)\
73 | {/* Radix-15 DIT DFT: same argument list as SSE2_RADIX_15_DIF, but the stages run in the opposite order - five radix-3 DFTs (inputs __i* -> scratch __t*), then three radix-5 DFTs (scratch __t* -> outputs __o*). */\
74 | /* Swap the 2nd pair of each output triplet to effect iDFT (i.e. each call below lists its last two scratch outputs exchanged, e.g. __t0,__t2,__t1): */\
75 | SSE2_RADIX_03_DFT(__i0,__i2,__i1, __cc0, __t0,__t2,__t1);\
76 | SSE2_RADIX_03_DFT(__i8,__i7,__i6, __cc0, __t3,__t5,__t4);\
77 | SSE2_RADIX_03_DFT(__iD,__iC,__iE, __cc0, __t6,__t8,__t7);\
78 | SSE2_RADIX_03_DFT(__i4,__i3,__i5, __cc0, __t9,__tB,__tA);\
79 | SSE2_RADIX_03_DFT(__i9,__iB,__iA, __cc0, __tC,__tE,__tD);\
80 | \
81 | /* Output perm here is 0123456789abcde --> 05a6b1c2738d9e4 (read the three calls column-wise: (o0,o5,oA),(o6,oB,o1),(oC,o2,o7),(o3,o8,oD),(o9,oE,o4)): */\
82 | SSE2_RADIX_05_DFT_0TWIDDLE(__t0,__t3,__t6,__t9,__tC, __cc1, __o0,__o6,__oC,__o3,__o9);\
83 | SSE2_RADIX_05_DFT_0TWIDDLE(__t1,__t4,__t7,__tA,__tD, __cc1, __o5,__oB,__o2,__o8,__oE);\
84 | SSE2_RADIX_05_DFT_0TWIDDLE(__t2,__t5,__t8,__tB,__tE, __cc1, __oA,__o1,__o7,__oD,__o4);\
85 | }
86 |
87 | // Cost: 12 DP-math, 17 vector MOV for each of the two side-by-side 3-DFTs in SSE2_RADIX_03_DFT_X2
88 | // 38 DP-math, 31 vector MOV for each of the two side-by-side 5-DFTs in SSE2_RADIX_05_DFT_0TWIDDLE_X2. Thus
89 | // 150 DP-math, 144 vector MOV for each of the two side-by-side 15-DFTs in each of these two [DIF and DIT] 15-DFT macro-of-macros.
90 | // Compare to van-Buskirk 13-DFT: 198 DP-math, 168 vector MOV.
91 | #define SSE2_RADIX_15_DIF_X2(\
92 | __cc0, __cc1, __two,\
93 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
94 | __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7,__s8,__s9,__sA,__sB,__sC,__sD,__sE,\
95 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE,\
96 | __j0,__j1,__j2,__j3,__j4,__j5,__j6,__j7,__j8,__j9,__jA,__jB,__jC,__jD,__jE,\
97 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
98 | __u0,__u1,__u2,__u3,__u4,__u5,__u6,__u7,__u8,__u9,__uA,__uB,__uC,__uD,__uE)\
99 | {/* Two radix-15 DIF DFTs side by side, using the same index permutations as SSE2_RADIX_15_DIF. Presumably DFT #1 is (__i* -> __s* -> __o*) and DFT #2 is (__j* -> __t* -> __u*), judging by how the argument groups chain between the two stages - confirm against the _X2 helper macros. __cc1 and __two go to the radix-5 helper, __cc0 to the radix-3 helper. */\
100 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __i0,__iC,__i9,__i6,__i3, __s0,__s1,__s2,__s3,__s4, __j0,__jC,__j9,__j6,__j3, __t0,__t1,__t2,__t3,__t4);\
101 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __iA,__i7,__i4,__i1,__iD, __s5,__s6,__s7,__s8,__s9, __jA,__j7,__j4,__j1,__jD, __t5,__t6,__t7,__t8,__t9);\
102 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __i5,__i2,__iE,__iB,__i8, __sA,__sB,__sC,__sD,__sE, __j5,__j2,__jE,__jB,__j8, __tA,__tB,__tC,__tD,__tE);\
103 | /* Radix-3 stage: the __o* and __u* argument groups use the identical output permutation: */\
104 | SSE2_RADIX_03_DFT_X2(__cc0, __s0,__s5,__sA, __o0,__o1,__o2, __t0,__t5,__tA, __u0,__u1,__u2);\
105 | SSE2_RADIX_03_DFT_X2(__cc0, __s1,__s6,__sB, __oD,__oE,__oC, __t1,__t6,__tB, __uD,__uE,__uC);\
106 | SSE2_RADIX_03_DFT_X2(__cc0, __s2,__s7,__sC, __o9,__oA,__oB, __t2,__t7,__tC, __u9,__uA,__uB);\
107 | SSE2_RADIX_03_DFT_X2(__cc0, __s3,__s8,__sD, __o8,__o6,__o7, __t3,__t8,__tD, __u8,__u6,__u7);\
108 | SSE2_RADIX_03_DFT_X2(__cc0, __s4,__s9,__sE, __o4,__o5,__o3, __t4,__t9,__tE, __u4,__u5,__u3);\
109 | }
110 |
111 | #define SSE2_RADIX_15_DIT_X2(\
112 | __cc0, __cc1,__two,\
113 | __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7,__i8,__i9,__iA,__iB,__iC,__iD,__iE,\
114 | __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7,__s8,__s9,__sA,__sB,__sC,__sD,__sE,\
115 | __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7,__o8,__o9,__oA,__oB,__oC,__oD,__oE,\
116 | __j0,__j1,__j2,__j3,__j4,__j5,__j6,__j7,__j8,__j9,__jA,__jB,__jC,__jD,__jE,\
117 | __t0,__t1,__t2,__t3,__t4,__t5,__t6,__t7,__t8,__t9,__tA,__tB,__tC,__tD,__tE,\
118 | __u0,__u1,__u2,__u3,__u4,__u5,__u6,__u7,__u8,__u9,__uA,__uB,__uC,__uD,__uE)\
119 | {/* Two radix-15 DIT DFTs side by side, using the same index permutations as SSE2_RADIX_15_DIT. Presumably DFT #1 is (__i* -> __s* -> __o*) and DFT #2 is (__j* -> __t* -> __u*), judging by how the argument groups chain between the two stages - confirm against the _X2 helper macros. */\
120 | /* Swap the 2nd pair of each output triplet to effect iDFT (applied to both the __s* and the __t* scratch groups): */\
121 | SSE2_RADIX_03_DFT_X2(__cc0, __i0,__i2,__i1, __s0,__s2,__s1, __j0,__j2,__j1, __t0,__t2,__t1);\
122 | SSE2_RADIX_03_DFT_X2(__cc0, __i8,__i7,__i6, __s3,__s5,__s4, __j8,__j7,__j6, __t3,__t5,__t4);\
123 | SSE2_RADIX_03_DFT_X2(__cc0, __iD,__iC,__iE, __s6,__s8,__s7, __jD,__jC,__jE, __t6,__t8,__t7);\
124 | SSE2_RADIX_03_DFT_X2(__cc0, __i4,__i3,__i5, __s9,__sB,__sA, __j4,__j3,__j5, __t9,__tB,__tA);\
125 | SSE2_RADIX_03_DFT_X2(__cc0, __i9,__iB,__iA, __sC,__sE,__sD, __j9,__jB,__jA, __tC,__tE,__tD);\
126 | \
127 | /* Output perm here is 0123456789abcde --> 05a6b1c2738d9e4 (same permutation for the __o* and __u* groups): */\
128 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s0,__s3,__s6,__s9,__sC, __o0,__o6,__oC,__o3,__o9, __t0,__t3,__t6,__t9,__tC, __u0,__u6,__uC,__u3,__u9);\
129 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s1,__s4,__s7,__sA,__sD, __o5,__oB,__o2,__o8,__oE, __t1,__t4,__t7,__tA,__tD, __u5,__uB,__u2,__u8,__uE);\
130 | SSE2_RADIX_05_DFT_0TWIDDLE_X2(__cc1,__two, __s2,__s5,__s8,__sB,__sE, __oA,__o1,__o7,__oD,__o4, __t2,__t5,__t8,__tB,__tE, __uA,__u1,__u7,__uD,__u4);\
131 | }
132 |
133 | #endif /* radix15_sse_macro_h_included */
134 |
135 |
--------------------------------------------------------------------------------
/src/radix16.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix16_included
27 | #define radix16_included
28 |
29 | #define c16 ((double)0.92387953251128675613) /* cos(twopi/16) */
30 | #define s16 ((double)0.38268343236508977173) /* sin(twopi/16); together (c16,s16) = exp( I*twopi/16) */
31 |
32 | #endif /* #ifndef radix16_included */
33 |
--------------------------------------------------------------------------------
/src/radix16_wrapper_ini.c:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "Mlucas.h"
24 |
25 | /***************/
26 |
27 | /* Initialize the various arrays of indices used in radix16_wrapper_square, so we can execute
28 | the processing of the [radix0] disjoint data blocks by that routine in parallel, if desired.
29 | Each call snapshots the loop state (i, j1, j2, j2_start, k, m, blocklen, blocklen_sum) into the ws_*[] arrays: a call with iblock == 0 resets the state and stores it in slot [iblock]; every call then advances the index loop until the next block boundary (first j1 != 0 with j1*radix0 == 0 mod n) and stores the state there in slot [iblock_next], or returns without storing once j2_start == n-32. The loop state lives in function-static variables, and calls with iblock != 0 resume in mid-loop via 'goto jump_in' from wherever the previous call left off, so results depend on the calls made before. */
30 | void radix16_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int radix_prim[], int ws_i[], int ws_j1[], int ws_j2[], int ws_j2_start[], int ws_k[], int ws_m[], int ws_blocklen[], int ws_blocklen_sum[])
31 | {
32 | static int i,j1,j2,j2_start,k,m,blocklen,blocklen_sum; // Static on purpose: the values must survive from one call to the next (see jump_in below).
33 | int iblock_next; // Index of the ws_*[] slot that receives the state at the next block boundary.
34 |
35 | if(iblock <= 1 && !(radix0 & 1)) // Even radix0 and iblock = 0 or 1: the next slot is the adjacent one ...
36 | iblock_next = iblock + 1;
37 | else // ... otherwise every second slot is used, leaving the slots in between unwritten by this routine.
38 | iblock_next = iblock + 2;
39 |
40 | if(iblock == 0) // j1 = real-array index (double the complex-array index) of the 1st element of each floating pair.
41 | {
42 | // No need to init I and M here, since they are set by entry into the nested I/M loop in radix16_pairFFT_mul_square:
43 | j1 = 0;
44 | j2 = 32;
45 | j2_start = j2; // j2 = real-array index (double the complex-array index) of 2nd element of each floating pair.
46 | k = 0;
47 | blocklen = 16; // = half of complex blocklength, since process 2 complex data for each value of loop index L.
48 | blocklen_sum = 0;
49 |
50 | ws_i [iblock] = i ; // i and m are stored as-is, i.e. with whatever the statics currently hold (zero on the very first call).
51 | ws_j1 [iblock] = j1 ;
52 | ws_j2 [iblock] = j2 ;
53 | ws_j2_start [iblock] = j2_start ;
54 | ws_k [iblock] = k ;
55 | ws_m [iblock] = m ;
56 | ws_blocklen [iblock] = blocklen ;
57 | ws_blocklen_sum[iblock] = blocklen_sum;
58 | } else {
59 | goto jump_in; // Resume inside the nested loops below, skipping both loop initializers; i and m keep their values from the previous call.
60 | }
61 |
62 | for(i = nradices_prim-5; i >= 0; i-- ) // Main loop: lower bound = nradices_prim - radix_now.
63 | { // Remember, radices get processed in reverse order here as in forward FFT.
64 | for(m = 0; m < (blocklen-1)>>1; m += 8) // Do two 16-element sets per loop, so only execute loop half as many times as before.
65 | {
66 | // This tells us when we've reached the end of the current data block:
67 | // Apr 2014: Must store intermediate product j1*radix0 in a 64-bit int to prevent overflow!
68 | if(j1 && ((uint64)j1*radix0)%n == 0)
69 | {
70 | ws_i [iblock_next] = i ;
71 | ws_j1 [iblock_next] = j1 ;
72 | ws_j2 [iblock_next] = j2 ;
73 | ws_j2_start [iblock_next] = j2_start ;
74 | ws_k [iblock_next] = k ;
75 | ws_m [iblock_next] = m ;
76 | ws_blocklen [iblock_next] = blocklen ;
77 | ws_blocklen_sum[iblock_next] = blocklen_sum;
78 | // printf("%8" PRIu64 " %20" PRIu64 " %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
79 | return;
80 | }
81 | jump_in: // Entry point for all blocks but the first.
82 | k += 2; // increment sincos array index
83 | // And update the data (j1 and j2) array indices:
84 | j1 += 32;
85 | j2 -= 32;
86 | }
87 | /*
88 | !...Since the foregoing loop only gets executed half as many times as in the simple version, to properly position
89 | ! ourselves in the data array for the start of the next block, need to bump up j1 by as much as would occur in a
90 | ! second execution of the above loop. The exception is the first loop execution, where j1 needs to be doubled (32 x 2).
91 | */
92 | j1 += (blocklen << 1);
93 | if(j2_start == n-32) { // Returns without writing any ws_*[] entry; presumably this marks the end of the data array - confirm.
94 | // printf("(j2_start == n-32) return with j2_start = %d\n",j2_start);
95 | return;
96 | }
97 |
98 | /*...Reset half-complex-blocklength for next pass. If K >> 1 has a zero trailing bit,
99 | we multiply the blocklength by K >> 1 in preparation for the final block. */
100 |
101 | blocklen_sum += blocklen;
102 | blocklen = (blocklen_sum) * (radix_prim[i-1]-1); // NOTE(review): this would read radix_prim[-1] if reached with i == 0; presumably the j2_start == n-32 return above always fires first on that pass - confirm.
103 |
104 | /*...Next j2_start is previous one plus the (real) length of the current block = 4*(half-complex-blocklength) */
105 |
106 | j2_start += (blocklen<<2);
107 | j2 = j2_start; /* Reset j2 for start of the next block. */
108 | // printf("newblock: blocklen = %8d blocklen_sum = %8d j2 = %8d\n",blocklen,blocklen_sum,j2);
109 | } /* End of Main loop */
110 | }
111 |
112 | /*
113 | Jun 2014: Possible UMR bug? Note that aside from i=1, only even-idx elts of the ws-arrays get inited ...
114 | so how do the odd-index reads not hose the result?
115 |
116 | Using complex FFT radices 16 8 16 16 16
117 | init ws_k[ 0] = 0
118 | 65536 1048576 131040: init ws_k[ 1] = 2048
119 | 131072 2097152 262112: init ws_k[ 2] = 4096
120 | 262144 4194304 524256: init ws_k[ 4] = 8192
121 | 327680 5242880 458720: init ws_k[ 6] = 12288
122 | 524288 8388608 1048544: init ws_k[ 8] = 16384
123 | 589824 9437184 983008: init ws_k[ 10] = 20480
124 | 655360 10485760 917472: init ws_k[ 12] = 24576
125 | 720896 11534336 851936: init ws_k[ 14] = 28672
126 | Mers_mod_square: Init threadpool of 1 threads
127 | Setting CPU = 0 affinity of worker thread id 0, mach_id = 3843
128 | radix16_wrapper_square with ws[]-index = 0
129 | stride = 32
130 | On entry: i = 0, j1,j2,j2_start = 0, 32, 32, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 16
131 | radix16_wrapper_square with ws[]-index = 1
132 | stride = 32
133 | On entry: i = 3, j1,j2,j2_start = 65536, 131040, 131040, k,m = 2048, 0, nrad_prim = 19, blocklen,sum = 16384
134 | radix16_wrapper_square with ws[]-index = 2
135 | stride = 32
136 | On entry: i = 2, j1,j2,j2_start = 131072, 262112, 262112, k,m = 4096, 0, nrad_prim = 19, blocklen,sum = 32768
137 | radix16_wrapper_square with ws[]-index = 3 <*** inited where? ***
138 | stride = 32 vvvv <*** j1 = 0, so no "jump_in": ***
139 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
140 | ========================
141 | So when j1 = 0 on entry we exit immediately via:
142 | if(j1 && ((uint64)j1*radix0)%n == 0)
143 | {
144 | // fprintf(stderr,"(j1 && j1*radix0 == 0 (mod n)) check hit: returning\n");
145 | return;
146 | }
147 | ========================
148 | radix16_wrapper_square with ws[]-index = 4
149 | stride = 32
150 | On entry: i = 1, j1,j2,j2_start = 262144, 524256, 524256, k,m = 8192, 0, nrad_prim = 19, blocklen,sum = 65536
151 | radix16_wrapper_square with ws[]-index = 5 <*** j1 = 0
152 | stride = 32
153 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
154 | radix16_wrapper_square with ws[]-index = 6
155 | stride = 32
156 | On entry: i = 1, j1,j2,j2_start = 327680, 458720, 524256, k,m = 12288, 16384, nrad_prim = 19, blocklen,sum = 65536
157 | radix16_wrapper_square with ws[]-index = 7 <*** j1 = 0
158 | stride = 32
159 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
160 | radix16_wrapper_square with ws[]-index = 8
161 | stride = 32
162 | On entry: i = 0, j1,j2,j2_start = 524288, 1048544, 1048544, k,m = 16384, 0, nrad_prim = 19, blocklen,sum = 131072
163 | radix16_wrapper_square with ws[]-index = 9 <*** j1 = 0
164 | stride = 32
165 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
166 | radix16_wrapper_square with ws[]-index = 10
167 | stride = 32
168 | On entry: i = 0, j1,j2,j2_start = 589824, 983008, 1048544, k,m = 20480, 16384, nrad_prim = 19, blocklen,sum = 131072
169 | radix16_wrapper_square with ws[]-index = 11 <*** j1 = 0
170 | stride = 32
171 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
172 | radix16_wrapper_square with ws[]-index = 12
173 | stride = 32
174 | On entry: i = 0, j1,j2,j2_start = 655360, 917472, 1048544, k,m = 24576, 32768, nrad_prim = 19, blocklen,sum = 131072
175 | radix16_wrapper_square with ws[]-index = 13 <*** j1 = 0
176 | stride = 32
177 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
178 | radix16_wrapper_square with ws[]-index = 14
179 | stride = 32
180 | On entry: i = 0, j1,j2,j2_start = 720896, 851936, 1048544, k,m = 28672, 49152, nrad_prim = 19, blocklen,sum = 131072
181 | radix16_wrapper_square with ws[]-index = 15 <*** j1 = 0
182 | stride = 32
183 | On entry: i = 0, j1,j2,j2_start = 0, 0, 0, k,m = 0, 0, nrad_prim = 19, blocklen,sum = 0
184 |
185 | Thus, j1 = 0 is how the odd-idx uninit is handled in practice - BUT NEED TO ENSURE ALL THE J1-DATA ARE INITED = 0 AT OUTSET
186 |
187 | Thus, switch ws_* allocs in mers_mod_square from malloc to calloc.
188 |
189 | (Surprised this issue took so long to manifest...)
190 | */
191 |
--------------------------------------------------------------------------------
/src/radix17_ditN_cy_dif1.c:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "Mlucas.h"
24 | #include "radix17_dft.h"
25 |
26 | /***************/
27 |
28 | int radix17_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], double wt1[], int si[], double base[], double baseinv[], int iter, double *fracmax, uint64 p)
29 | { // Placeholder body: none of the arguments is read and no DFT or carry work is done.
30 | return 1; // Unconditional. NOTE(review): presumably a nonzero/error status signalling that the radix-17 carry step is not implemented - confirm against the callers.
31 | }
32 |
33 | /***************/
34 |
35 | void radix17_dif_pass1(double a[], int n)
36 | {
37 | /*
38 | !...Acronym: DIF = Decimation In Frequency
39 | !
40 | !...Subroutine to perform an initial radix-17 complex DIF FFT pass on the data in the length-N real vector A.
41 | !
42 | ! See the documentation in radix16_dif_pass for further details on storage and indexing.
43 | !
44 | ! Given complex inputs (x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF,xG), we need the following outputs
45 | ! (here cJ = cos(2*J*pi/17), sJ = sin(2*J*pi/17)):
46 | !
47 | ! X0 = C0, where C0 = x0+ (x1+xG)+ (x2+xF)+ (x3+xE)+ (x4+xD)+ (x5+xC)+ (x6+xB)+ (x7+xA)+ (x8+x9),
48 | ! the cosine terms below get massaged into the form of a length-8 cyclic convolution:
49 | ! X1 = C1 + I*S1
50 | ! X2 = C2 + I*S2
51 | ! X3 = C3 + I*S3
52 | ! X4 = C4 + I*S4
53 | ! X5 = C5 + I*S5
54 | ! X6 = C6 + I*S6
55 | ! X7 = C7 + I*S7
56 | ! X8 = C8 + I*S8
57 | ! and the sine terms get massaged into the form of a length-8 acyclic convolution:
58 | ! X9 = C8 - I*S8
59 | ! XA = C7 - I*S7
60 | ! XB = C6 - I*S6
61 | ! XC = C5 - I*S5
62 | ! XD = C4 - I*S4
63 | ! XE = C3 - I*S3
64 | ! XF = C2 - I*S2
65 | ! XG = C1 - I*S1
66 | !
67 | ! We refer to the terms C1-8 (which do not explicitly involve the imaginary constant I)
68 | ! as the "cosine part" of the output, and S1-8 (those multiplied by I) as the "sine part."
69 | ! Opcount for general odd-prime radix R:
70 | ! Totals : 100 FMUL, 140 FADD, (R-1)^2 fmul (R+3)*(R-1) fadd
71 | ! compared to 16 FMUL, 96 FADD for radix-12. (Ouch!)
72 | !
73 | ! Relative cost := #FADD/(radix*lg2(radix)) = 3.679 . NOTE(review): the numeric figures in this opcount paragraph match R = 11 ((R-1)^2 = 100, (R+3)*(R-1) = 140, 140/(11*lg2(11)) = 3.679) and look carried over from the radix-11 source; for R = 17 the same formulas give 256 fmul and 320 fadd. The actual cost of RADIX_17_DFT is not visible here.
74 | */
75 | int j,j1,j2;
76 | static int n17,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16, first_entry=TRUE; // Index offsets are cached across calls and rebuilt only when n/17 changes.
77 |
78 | if(!first_entry && (n/17) != n17) /* New runlength? */
79 | {
80 | first_entry=TRUE;
81 | }
82 |
83 | /*...initialize things upon first entry */
84 |
85 | if(first_entry)
86 | {
87 | first_entry=FALSE;
88 | n17 = n/17;
89 | // Constant index offsets for array load/stores are here: pK = K*n17 before padding.
90 | p1 = n17;
91 | p2 = p1 +p1;
92 | p3 = p2 +p1;
93 | p4 = p3 +p1;
94 | p5 = p4 +p1;
95 | p6 = p5 +p1;
96 | p7 = p6 +p1;
97 | p8 = p7 +p1;
98 | p9 = p8 +p1;
99 | p10 = p9 +p1;
100 | p11 = p10+p1;
101 | p12 = p11+p1;
102 | p13 = p12+p1;
103 | p14 = p13+p1;
104 | p15 = p14+p1;
105 | p16 = p15+p1;
106 | // Convert each offset to a padded-array index, using the same transformation as for j1 in the loop below:
107 | p1 += ( (p1 >> DAT_BITS) << PAD_BITS );
108 | p2 += ( (p2 >> DAT_BITS) << PAD_BITS );
109 | p3 += ( (p3 >> DAT_BITS) << PAD_BITS );
110 | p4 += ( (p4 >> DAT_BITS) << PAD_BITS );
111 | p5 += ( (p5 >> DAT_BITS) << PAD_BITS );
112 | p6 += ( (p6 >> DAT_BITS) << PAD_BITS );
113 | p7 += ( (p7 >> DAT_BITS) << PAD_BITS );
114 | p8 += ( (p8 >> DAT_BITS) << PAD_BITS );
115 | p9 += ( (p9 >> DAT_BITS) << PAD_BITS );
116 | p10 += ( (p10>> DAT_BITS) << PAD_BITS );
117 | p11 += ( (p11>> DAT_BITS) << PAD_BITS );
118 | p12 += ( (p12>> DAT_BITS) << PAD_BITS );
119 | p13 += ( (p13>> DAT_BITS) << PAD_BITS );
120 | p14 += ( (p14>> DAT_BITS) << PAD_BITS );
121 | p15 += ( (p15>> DAT_BITS) << PAD_BITS );
122 | p16 += ( (p16>> DAT_BITS) << PAD_BITS );
123 | }
124 |
125 | /*...The radix-17 pass is here. */
126 |
127 | for(j=0; j < n17; j += 2) // One 17-point DFT per iteration; j steps by 2 over the first n17 array slots.
128 | {
129 | #ifdef USE_SSE2
130 | j1 = (j & mask01) + br4[j&3]; // NOTE(review): mask01 and br4 are defined elsewhere; presumably they remap j for the SSE2 data layout - confirm.
131 | j1 =j1 + ( (j1>> DAT_BITS) << PAD_BITS );
132 | #else
133 | j1 = j + ( (j >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */
134 | #endif
135 | j2 = j1+RE_IM_STRIDE; // Index of the second member of the pair starting at j1 (the a[j1],a[j2] pairs below).
136 | /* Call the radix-17 DFT macro: first 34 arguments are the input values a[j1+pK],a[j2+pK] for K = 0..16, the last 34 are the output addresses, here in the same ascending K order (i.e. in-place, no output permutation): */
137 | RADIX_17_DFT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7],a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15],a[j1+p16],a[j2+p16]
138 | ,a+j1 ,a+j2 ,a+j1+p1 ,a+j2+p1 ,a+j1+p2 ,a+j2+p2 ,a+j1+p3 ,a+j2+p3 ,a+j1+p4 ,a+j2+p4 ,a+j1+p5 ,a+j2+p5 ,a+j1+p6 ,a+j2+p6 ,a+j1+p7 ,a+j2+p7 ,a+j1+p8 ,a+j2+p8 ,a+j1+p9 ,a+j2+p9 ,a+j1+p10 ,a+j2+p10 ,a+j1+p11 ,a+j2+p11 ,a+j1+p12 ,a+j2+p12 ,a+j1+p13 ,a+j2+p13 ,a+j1+p14 ,a+j2+p14 ,a+j1+p15 ,a+j2+p15 ,a+j1+p16 ,a+j2+p16 );
139 | }
140 | }
141 |
142 | /***************/
143 |
144 | void radix17_dit_pass1(double a[], int n)
145 | {
146 | /*
147 | !...Acronym: DIT = Decimation In Time
148 | !
149 | !...Subroutine to perform a final radix-17 complex DIT FFT pass on the data in the length-N real vector A.
150 | !
151 | ! See the documentation in radix16_dif_pass for further details on storage and indexing.
152 | ! The body is identical to radix17_dif_pass1 except for the order of the output addresses passed to RADIX_17_DFT.
153 | */ int j,j1,j2;
154 | static int n17,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16, first_entry=TRUE; // Index offsets are cached across calls and rebuilt only when n/17 changes.
155 |
156 | if(!first_entry && (n/17) != n17) /* New runlength? */
157 | {
158 | first_entry=TRUE;
159 | }
160 |
161 | /*...initialize things upon first entry */
162 |
163 | if(first_entry)
164 | {
165 | first_entry=FALSE;
166 | n17 = n/17;
167 | // Constant index offsets for array load/stores are here: pK = K*n17 before padding.
168 | p1 = n17;
169 | p2 = p1 +p1;
170 | p3 = p2 +p1;
171 | p4 = p3 +p1;
172 | p5 = p4 +p1;
173 | p6 = p5 +p1;
174 | p7 = p6 +p1;
175 | p8 = p7 +p1;
176 | p9 = p8 +p1;
177 | p10 = p9 +p1;
178 | p11 = p10+p1;
179 | p12 = p11+p1;
180 | p13 = p12+p1;
181 | p14 = p13+p1;
182 | p15 = p14+p1;
183 | p16 = p15+p1;
184 | // Convert each offset to a padded-array index, using the same transformation as for j1 in the loop below:
185 | p1 += ( (p1 >> DAT_BITS) << PAD_BITS );
186 | p2 += ( (p2 >> DAT_BITS) << PAD_BITS );
187 | p3 += ( (p3 >> DAT_BITS) << PAD_BITS );
188 | p4 += ( (p4 >> DAT_BITS) << PAD_BITS );
189 | p5 += ( (p5 >> DAT_BITS) << PAD_BITS );
190 | p6 += ( (p6 >> DAT_BITS) << PAD_BITS );
191 | p7 += ( (p7 >> DAT_BITS) << PAD_BITS );
192 | p8 += ( (p8 >> DAT_BITS) << PAD_BITS );
193 | p9 += ( (p9 >> DAT_BITS) << PAD_BITS );
194 | p10 += ( (p10>> DAT_BITS) << PAD_BITS );
195 | p11 += ( (p11>> DAT_BITS) << PAD_BITS );
196 | p12 += ( (p12>> DAT_BITS) << PAD_BITS );
197 | p13 += ( (p13>> DAT_BITS) << PAD_BITS );
198 | p14 += ( (p14>> DAT_BITS) << PAD_BITS );
199 | p15 += ( (p15>> DAT_BITS) << PAD_BITS );
200 | p16 += ( (p16>> DAT_BITS) << PAD_BITS );
201 | }
202 |
203 | /*...The radix-17 pass is here. */
204 |
205 | for(j=0; j < n17; j += 2) // One 17-point DFT per iteration; j steps by 2 over the first n17 array slots.
206 | {
207 | #ifdef USE_SSE2
208 | j1 = (j & mask01) + br4[j&3]; // NOTE(review): mask01 and br4 are defined elsewhere; presumably they remap j for the SSE2 data layout - confirm.
209 | j1 =j1 + ( (j1>> DAT_BITS) << PAD_BITS );
210 | #else
211 | j1 = j + ( (j >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */
212 | #endif
213 | j2 = j1+RE_IM_STRIDE; // Index of the second member of the pair starting at j1 (the a[j1],a[j2] pairs below).
214 | // Call same radix-17 DFT macro as for DIF, but replace indices j = 1-16 with j*16%17, i.e. run in reverse order: inputs are listed in ascending offset order p0..p16, output addresses as p0,p16,p15,...,p1.
215 | RADIX_17_DFT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7],a[j1+p8],a[j2+p8],a[j1+p9],a[j2+p9],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15],a[j1+p16],a[j2+p16]
216 | ,a+j1 ,a+j2 ,a+j1+p16 ,a+j2+p16 ,a+j1+p15 ,a+j2+p15 ,a+j1+p14 ,a+j2+p14 ,a+j1+p13 ,a+j2+p13 ,a+j1+p12 ,a+j2+p12 ,a+j1+p11 ,a+j2+p11 ,a+j1+p10 ,a+j2+p10 ,a+j1+p9 ,a+j2+p9 ,a+j1+p8 ,a+j2+p8 ,a+j1+p7 ,a+j2+p7 ,a+j1+p6 ,a+j2+p6 ,a+j1+p5 ,a+j2+p5 ,a+j1+p4 ,a+j2+p4 ,a+j1+p3 ,a+j2+p3 ,a+j1+p2 ,a+j2+p2 ,a+j1+p1 ,a+j2+p1 );
217 | }
218 | }
219 |
--------------------------------------------------------------------------------
/src/radix256.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix256_included
27 | #define radix256_included
28 |
29 | #include "radix128.h"
30 | /* cNNN_XX / sNNN_XX = cos / sin of 2*pi*XX/256, XX being a 2-digit HEX index; only odd XX from 01 to 1f are defined here. */
31 | #define c256_01 ((double)0.99969881869620422011)
32 | #define s256_01 ((double)0.02454122852291228802) /* exp(01*I*twopi/256) */
33 | #define c256_03 ((double)0.99729045667869021613)
34 | #define s256_03 ((double)0.07356456359966742351) /* exp(03*I*twopi/256) */
35 | #define c256_05 ((double)0.99247953459870999816)
36 | #define s256_05 ((double)0.12241067519921619847) /* exp(05*I*twopi/256) */
37 | #define c256_07 ((double)0.98527764238894124478)
38 | #define s256_07 ((double)0.17096188876030122632) /* exp(07*I*twopi/256) */
39 | #define c256_09 ((double)0.97570213003852854447)
40 | #define s256_09 ((double)0.21910124015686979717) /* exp(09*I*twopi/256) */
41 | #define c256_0b ((double)0.96377606579543986670)
42 | #define s256_0b ((double)0.26671275747489838626) /* exp(0b*I*twopi/256) */
43 | #define c256_0d ((double)0.94952818059303666721)
44 | #define s256_0d ((double)0.31368174039889147658) /* exp(0d*I*twopi/256) */
45 | #define c256_0f ((double)0.93299279883473888774)
46 | #define s256_0f ((double)0.35989503653498814869) /* exp(0f*I*twopi/256) */
47 | #define c256_11 ((double)0.91420975570353065467)
48 | #define s256_11 ((double)0.40524131400498987082) /* exp(11*I*twopi/256) */
49 | #define c256_13 ((double)0.89322430119551532038)
50 | #define s256_13 ((double)0.44961132965460659995) /* exp(13*I*twopi/256) */
51 | #define c256_15 ((double)0.87008699110871141870)
52 | #define s256_15 ((double)0.49289819222978403677) /* exp(15*I*twopi/256) */
53 | #define c256_17 ((double)0.84485356524970707332)
54 | #define s256_17 ((double)0.53499761988709721055) /* exp(17*I*twopi/256) */
55 | #define c256_19 ((double)0.81758481315158369658)
56 | #define s256_19 ((double)0.57580819141784530063) /* exp(19*I*twopi/256) */
57 | #define c256_1b ((double)0.78834642762660626210)
58 | #define s256_1b ((double)0.61523159058062684536) /* exp(1b*I*twopi/256) */
59 | #define c256_1d ((double)0.75720884650648454767)
60 | #define s256_1d ((double)0.65317284295377676396) /* exp(1d*I*twopi/256) */
61 | #define c256_1f ((double)0.72424708295146692105)
62 | #define s256_1f ((double)0.68954054473706692449) /* exp(1f*I*twopi/256) */
63 |
64 | #endif /* #ifndef radix256_included */
65 |
--------------------------------------------------------------------------------
/src/radix256_twiddles.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "radix256.h"
24 |
25 | // Skip the usual include-this-header-file-if-it-was-not-included-before #ifndef wrapper,
26 | // since this file is not for defines/typedefs and such but rather to store a lengthy const-array-declaration
27 | // and thus needs to be inline-able in multiple places in a source file making use of it.
28 | // Layout: 16 rows of 15 (cos,sin) pairs. Row r holds w^k for k = 8,4,c,2,a,6,e,1,9,5,d,3,b,7,f (hex), where w = exp(I*twopi*bitrev4(r)/256).
29 | const double DFT256_TWIDDLES[16][30] = {
30 | { 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0, 1,0 },
31 | { 0,1, ISRT2,ISRT2, -ISRT2,ISRT2, c16,s16, -s16,c16, s16,c16, -c16,s16, c32_1,s32_1, -s32_1,c32_1, s32_3,c32_3, -c32_3,s32_3, c32_3,s32_3, -s32_3,c32_3, s32_1,c32_1, -c32_1,s32_1 },
32 | { ISRT2,ISRT2, c16,s16, s16,c16, c32_1,s32_1, s32_3,c32_3, c32_3,s32_3, s32_1,c32_1, c64_1,s64_1, s64_7,c64_7, c64_5,s64_5, s64_3,c64_3, c64_3,s64_3, s64_5,c64_5, c64_7,s64_7, s64_1,c64_1 },
33 | { -ISRT2,ISRT2, s16,c16, -c16,-s16, c32_3,s32_3, -c32_1,s32_1, -s32_1,c32_1, -s32_3,-c32_3, c64_3,s64_3, -c64_5,s64_5, s64_1,c64_1, -c64_7,-s64_7, s64_7,c64_7, -c64_1,-s64_1, -s64_5,c64_5, -s64_3,-c64_3 },
34 | { c16,s16, c32_1,s32_1, c32_3,s32_3, c64_1,s64_1, c64_5,s64_5, c64_3,s64_3, c64_7,s64_7, c128_1,s128_1, c128_9,s128_9, c128_5,s128_5, c128_d,s128_d, c128_3,s128_3, c128_b,s128_b, c128_7,s128_7, c128_f,s128_f },
35 | { -s16,c16, s32_3,c32_3, -c32_1,s32_1, c64_5,s64_5, -c64_7,s64_7, s64_1,c64_1, -c64_3,-s64_3, c128_5,s128_5, -s128_d,c128_d, s128_7,c128_7, -c128_1,-s128_1, c128_f,s128_f, -c128_9,s128_9, -s128_3,c128_3, -c128_b,-s128_b },
36 | { s16,c16, c32_3,s32_3, -s32_1,c32_1, c64_3,s64_3, s64_1,c64_1, s64_7,c64_7, -s64_5,c64_5, c128_3,s128_3, s128_5,c128_5, c128_f,s128_f, -s128_7,c128_7, c128_9,s128_9, -s128_1,c128_1, s128_b,c128_b, -s128_d,c128_d },
37 | { -c16,s16, s32_1,c32_1, -s32_3,-c32_3, c64_7,s64_7, -c64_3,-s64_3, -s64_5,c64_5, s64_1,-c64_1, c128_7,s128_7, -c128_1,s128_1, -s128_3,c128_3, -s128_5,-c128_5, s128_b,c128_b, -c128_d,-s128_d, -c128_f,s128_f, s128_9,-c128_9 },
38 | { c32_1,s32_1, c64_1,s64_1, c64_3,s64_3, c128_1,s128_1, c128_5,s128_5, c128_3,s128_3, c128_7,s128_7, c256_01,s256_01, c256_09,s256_09, c256_05,s256_05, c256_0d,s256_0d, c256_03,s256_03, c256_0b,s256_0b, c256_07,s256_07, c256_0f,s256_0f },
39 | { -s32_1,c32_1, s64_7,c64_7, -c64_5,s64_5, c128_9,s128_9, -s128_d,c128_d, s128_5,c128_5, -c128_1,s128_1, c256_09,s256_09, -s256_11,c256_11, s256_13,c256_13, -c256_0b,s256_0b, c256_1b,s256_1b, -c256_1d,s256_1d, s256_01,c256_01, -c256_07,-s256_07 },
40 | { s32_3,c32_3, c64_5,s64_5, s64_1,c64_1, c128_5,s128_5, s128_7,c128_7, c128_f,s128_f, -s128_3,c128_3, c256_05,s256_05, s256_13,c256_13, c256_19,s256_19, -s256_01,c256_01, c256_0f,s256_0f, s256_09,c256_09, s256_1d,c256_1d, -s256_0b,c256_0b },
41 | { -c32_3,s32_3, s64_3,c64_3, -c64_7,-s64_7, c128_d,s128_d, -c128_1,-s128_1, -s128_7,c128_7, -s128_5,-c128_5, c256_0d,s256_0d, -c256_0b,s256_0b, -s256_01,c256_01, -s256_17,-c256_17, s256_19,c256_19, -c256_0f,-s256_0f, -s256_1b,c256_1b, s256_03,-c256_03 },
42 | { c32_3,s32_3, c64_3,s64_3, s64_7,c64_7, c128_3,s128_3, c128_f,s128_f, c128_9,s128_9, s128_b,c128_b, c256_03,s256_03, c256_1b,s256_1b, c256_0f,s256_0f, s256_19,c256_19, c256_09,s256_09, s256_1f,c256_1f, c256_15,s256_15, s256_13,c256_13 },
43 | { -s32_3,c32_3, s64_5,c64_5, -c64_1,-s64_1, c128_b,s128_b, -c128_9,s128_9, -s128_1,c128_1, -c128_d,-s128_d, c256_0b,s256_0b, -c256_1d,s256_1d, s256_09,c256_09, -c256_0f,-s256_0f, s256_1f,c256_1f, -c256_07,s256_07, -s256_0d,c256_0d, -s256_1b,-c256_1b },
44 | { s32_1,c32_1, c64_7,s64_7, -s64_5,c64_5, c128_7,s128_7, -s128_3,c128_3, s128_b,c128_b, -c128_f,s128_f, c256_07,s256_07, s256_01,c256_01, s256_1d,c256_1d, -s256_1b,c256_1b, c256_15,s256_15, -s256_0d,c256_0d, s256_0f,c256_0f, -c256_17,s256_17 },
45 | { -c32_1,s32_1, s64_1,c64_1, -s64_3,-c64_3, c128_f,s128_f, -c128_b,-s128_b, -s128_d,c128_d, s128_9,-c128_9, c256_0f,s256_0f, -c256_07,-s256_07, -s256_0b,c256_0b, s256_03,-c256_03, s256_13,c256_13, -s256_1b,-c256_1b, -c256_17,s256_17, c256_1f,-s256_1f }
46 | };
47 |
48 |
--------------------------------------------------------------------------------
/src/radix32.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix32_included
27 | #define radix32_included
28 |
29 | #include "radix16.h"
30 | /* c32_k / s32_k = cos / sin of 2*pi*k/32, defined here for k = 1 and 3 only. */
31 | #define c32_1 ((double)0.98078528040323044912)
32 | #define s32_1 ((double)0.19509032201612826784) /* exp(1*I*twopi/32) */
33 | #define c32_3 ((double)0.83146961230254523708)
34 | #define s32_3 ((double)0.55557023301960222473) /* exp(3*I*twopi/32) */
35 |
36 | #endif /* #ifndef radix32_included */
37 |
--------------------------------------------------------------------------------
/src/radix32_wrapper_ini.c:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "Mlucas.h"
24 |
25 | /***************/
26 |
27 | /* Initialize the various arrays of indices used in radix32_wrapper_square, so we can execute
28 | the processing of the [radix0] disjoint data blocks by that routine in parallel, if desired.
29 | Loop state is kept in static locals: a call with iblock == 0 resets it and fills slot 0; any other iblock resumes (via goto jump_in) from where the previous call returned. Each call stores the next block boundary in the ws_* arrays at index iblock_next. */
30 | void radix32_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int radix_prim[], int ws_i[], int ws_j1[], int ws_j2[], int ws_j2_start[], int ws_k[], int ws_m[], int ws_blocklen[], int ws_blocklen_sum[])
31 | {
32 | static int i,j1,j2,j2_start,k,m,blocklen,blocklen_sum;
33 | int iblock_next;
34 | // Slot filled on this call: iblock+1 for blocks 0 and 1 when radix0 is even, otherwise iblock+2.
35 | if(iblock <= 1 && !(radix0 & 1))
36 | iblock_next = iblock + 1;
37 | else
38 | iblock_next = iblock + 2;
39 |
40 | if(iblock == 0) // j1 = real-array index (double the complex-array index) of the 1st element of each floating pair.
41 | {
42 | // No need to init I and M here, since they are set by entry into the nested I/M loop in radix16_pairFFT_mul_square:
43 | j1 = 0;
44 | j2 = 64;
45 | j2_start = j2; // j2 = real-array index (double the complex-array index) of 2nd element of each floating pair.
46 | k = 0;
47 | blocklen = 32; // = half of complex blocklength, since process 2 complex data for each value of loop index L.
48 | blocklen_sum = 0;
49 |
50 | ws_i [iblock] = i ;
51 | ws_j1 [iblock] = j1 ;
52 | ws_j2 [iblock] = j2 ;
53 | ws_j2_start [iblock] = j2_start ;
54 | ws_k [iblock] = k ;
55 | ws_m [iblock] = m ;
56 | ws_blocklen [iblock] = blocklen ;
57 | ws_blocklen_sum[iblock] = blocklen_sum;
58 | } else {
59 | goto jump_in;
60 | }
61 |
62 | for(i = nradices_prim-6; i >= 0; i-- ) // Main loop: lower bound = nradices_prim - radix_now.
63 | { // Remember, radices get processed in reverse order here as in forward FFT.
64 | for(m = 0; m < (blocklen-1)>>1; m += 16) // Do two 32-element sets per loop, so only execute loop half as many times as before.
65 | {
66 | // This tells us when we've reached the end of the current data block:
67 | // Apr 2014: Must store intermediate product j1*radix0 in a 64-bit int to prevent overflow!
68 | if(j1 && ((uint64)j1*radix0)%n == 0)
69 | {
70 | ws_i [iblock_next] = i ;
71 | ws_j1 [iblock_next] = j1 ;
72 | ws_j2 [iblock_next] = j2 ;
73 | ws_j2_start [iblock_next] = j2_start ;
74 | ws_k [iblock_next] = k ;
75 | ws_m [iblock_next] = m ;
76 | ws_blocklen [iblock_next] = blocklen ;
77 | ws_blocklen_sum[iblock_next] = blocklen_sum;
78 | // printf("%8" PRIu64 " %20" PRIu64 " %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
79 | return;
80 | }
81 | jump_in: // Entry point for all blocks but the first.
82 | k += 2; // increment sincos array index
83 | // And update the data (j1 and j2) array indices:
84 | j1 += 64;
85 | j2 -= 64;
86 | }
87 | /*
88 | !...Since the foregoing loop only gets executed half as many times as in the simple version, to properly position
89 | ! ourselves in the data array for the start of the next block, need to bump up j1 by as much as would occur in a
90 | ! second execution of the above loop. The exception is the first loop execution, where j1 needs to be doubled (32 x 2).
91 | */
92 | j1 += (blocklen << 1);
93 |
94 | if(j2_start == n-64) {
95 | // printf("(j2_start == n-32) return with j2_start = %d\n",j2_start);
96 | return;
97 | }
98 |
99 | /*...Reset half-complex-blocklength for next pass. If K >> 1 has a zero trailing bit,
100 | we multiply the blocklength by K >> 1 in preparation for the final block. */
101 |
102 | blocklen_sum += blocklen;
103 | blocklen = (blocklen_sum) * (radix_prim[i-1]-1); // NOTE(review): reads radix_prim[-1] if reached with i == 0; presumably the (j2_start == n-64) return above always fires first - confirm.
104 |
105 | /*...Next j2_start is previous one plus the (real) length of the current block = 4*(half-complex-blocklength) */
106 |
107 | j2_start += (blocklen<<2);
108 | j2 = j2_start; /* Reset j2 for start of the next block. */
109 | // printf("newblock: blocklen = %8d blocklen_sum = %8d j2 = %8d\n",blocklen,blocklen_sum,j2);
110 | } /* End of Main loop */
111 | }
112 |
113 |
--------------------------------------------------------------------------------
/src/radix512.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix512_included
27 | #define radix512_included
28 |
29 | #include "radix256.h"
30 | /* c512_XX / s512_XX = cos / sin of 2*pi*XX/512, XX being a 2-digit HEX index; only odd XX from 01 to 3f are defined here. */
31 | #define c512_01 ((double)0.99992470183914454092)
32 | #define s512_01 ((double)0.01227153828571992607) /* exp(01*I*twopi/512) */
33 | #define c512_03 ((double)0.99932238458834950089)
34 | #define s512_03 ((double)0.03680722294135883230) /* exp(03*I*twopi/512) */
35 | #define c512_05 ((double)0.99811811290014920712)
36 | #define s512_05 ((double)0.06132073630220857774) /* exp(05*I*twopi/512) */
37 | #define c512_07 ((double)0.99631261218277801263)
38 | #define s512_07 ((double)0.08579731234443989040) /* exp(07*I*twopi/512) */
39 | #define c512_09 ((double)0.99390697000235604155)
40 | #define s512_09 ((double)0.11022220729388305873) /* exp(09*I*twopi/512) */
41 | #define c512_0b ((double)0.99090263542778002511)
42 | #define s512_0b ((double)0.13458070850712618623) /* exp(0b*I*twopi/512) */
43 | #define c512_0d ((double)0.98730141815785838241)
44 | #define s512_0d ((double)0.15885814333386144158) /* exp(0d*I*twopi/512) */
45 | #define c512_0f ((double)0.98310548743121632720)
46 | #define s512_0f ((double)0.18303988795514095840) /* exp(0f*I*twopi/512) */
47 | #define c512_11 ((double)0.97831737071962763313)
48 | #define s512_11 ((double)0.20711137619221854957) /* exp(11*I*twopi/512) */
49 | #define c512_13 ((double)0.97293995220556014550)
50 | #define s512_13 ((double)0.23105810828067111950) /* exp(13*I*twopi/512) */
51 | #define c512_15 ((double)0.96697647104485210912)
52 | #define s512_15 ((double)0.25486565960451457139) /* exp(15*I*twopi/512) */
53 | #define c512_17 ((double)0.96043051941556581124)
54 | #define s512_17 ((double)0.27851968938505310503) /* exp(17*I*twopi/512) */
55 | #define c512_19 ((double)0.95330604035419383697)
56 | #define s512_19 ((double)0.30200594931922806681) /* exp(19*I*twopi/512) */
57 | #define c512_1b ((double)0.94560732538052132579)
58 | #define s512_1b ((double)0.32531029216226293393) /* exp(1b*I*twopi/512) */
59 | #define c512_1d ((double)0.93733901191257492328)
60 | #define s512_1d ((double)0.34841868024943456820) /* exp(1d*I*twopi/512) */
61 | #define c512_1f ((double)0.92850608047321556602)
62 | #define s512_1f ((double)0.37131719395183754318) /* exp(1f*I*twopi/512) */
63 | #define c512_21 ((double)0.91911385169005774400)
64 | #define s512_21 ((double)0.39399204006104810836) /* exp(21*I*twopi/512) */
65 | #define c512_23 ((double)0.90916798309052237667)
66 | #define s512_23 ((double)0.41642956009763718231) /* exp(23*I*twopi/512) */
67 | #define c512_25 ((double)0.89867446569395384316)
68 | #define s512_25 ((double)0.43861623853852763738) /* exp(25*I*twopi/512) */
69 | #define c512_27 ((double)0.88763962040285394789)
70 | #define s512_27 ((double)0.46053871095824002336) /* exp(27*I*twopi/512) */
71 | #define c512_29 ((double)0.87607009419540660724)
72 | #define s512_29 ((double)0.48218377207912274823) /* exp(29*I*twopi/512) */
73 | #define c512_2b ((double)0.86397285612158673808)
74 | #define s512_2b ((double)0.50353838372571755840) /* exp(2b*I*twopi/512) */
75 | #define c512_2d ((double)0.85135519310526514244)
76 | #define s512_2d ((double)0.52458968267846890591) /* exp(2d*I*twopi/512) */
77 | #define c512_2f ((double)0.83822470555483804338)
78 | #define s512_2f ((double)0.54532498842204642200) /* exp(2f*I*twopi/512) */
79 | #define c512_31 ((double)0.82458930278502526468)
80 | #define s512_31 ((double)0.56573181078361319707) /* exp(31*I*twopi/512) */
81 | #define c512_33 ((double)0.81045719825259479195)
82 | #define s512_33 ((double)0.58579785745643886000) /* exp(33*I*twopi/512) */
83 | #define c512_35 ((double)0.79583690460888353651)
84 | #define s512_35 ((double)0.60551104140432551359) /* exp(35*I*twopi/512) */
85 | #define c512_37 ((double)0.78073722857209447856)
86 | #define s512_37 ((double)0.62485948814238637675) /* exp(37*I*twopi/512) */
87 | #define c512_39 ((double)0.76516726562245892617)
88 | #define s512_39 ((double)0.64383154288979146473) /* exp(39*I*twopi/512) */
89 | #define c512_3b ((double)0.74913639452345932577)
90 | #define s512_3b ((double)0.66241577759017176077) /* exp(3b*I*twopi/512) */
91 | #define c512_3d ((double)0.73265427167241283493)
92 | #define s512_3d ((double)0.68060099779545305024) /* exp(3d*I*twopi/512) */
93 | #define c512_3f ((double)0.71573082528381865446)
94 | #define s512_3f ((double)0.69837624940897285320) /* exp(3f*I*twopi/512) */
95 |
96 | #endif /* #ifndef radix512_included */
97 |
--------------------------------------------------------------------------------
/src/radix63_main_carry_loop.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | // This main loop is same for un-and-multithreaded, so stick into a header file
24 | // (can't use a macro because of the #if-enclosed stuff).
25 |
26 | for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */
27 | {
28 | for(j = jstart; j < jhi; j += stride)
29 | {
30 | j1 = j;
31 | j1 = j1 + ( (j1 >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */
32 | j2 = j1 + RE_IM_STRIDE;
33 | // Per-j sequence below: radix-63 DIT (7 x radix-9, then 9 x radix-7), 63 carry-macro calls, then radix-63 DIF (9 x radix-7, then 7 x radix-9).
34 | /*...The radix-63 DIT pass is here: */
35 |
36 | //...gather the needed data (63 64-bit complex, i.e. 126 64-bit reals) and do 7 radix-9 transforms:
37 | tptr = t; iptr = dit_iperm;
38 | for(l = 0; l < 7; l++) {
39 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; k7 = p[*(iptr+7)]; k8 = p[*(iptr+8)];
40 | RADIX_09_DIT(
41 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],a[j1+k7],a[j2+k7],a[j1+k8],a[j2+k8],
42 | tptr->re,tptr->im,(tptr+1)->re,(tptr+1)->im,(tptr+2)->re,(tptr+2)->im,(tptr+3)->re,(tptr+3)->im,(tptr+4)->re,(tptr+4)->im,(tptr+5)->re,(tptr+5)->im,(tptr+6)->re,(tptr+6)->im,(tptr+7)->re,(tptr+7)->im,(tptr+8)->re,(tptr+8)->im,
43 | rt,it,re
44 | ); tptr += 9; iptr += 9;
45 | }
46 | //...and now do 9 radix-7 transforms:
47 | tptr = t; iptr = dit_operm;
48 | for(l = 0; l < 9; l++) {
49 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)];
50 | RADIX_07_DFT(
51 | tptr->re,tptr->im,(tptr+9)->re,(tptr+9)->im,(tptr+18)->re,(tptr+18)->im,(tptr+27)->re,(tptr+27)->im,(tptr+36)->re,(tptr+36)->im,(tptr+45)->re,(tptr+45)->im,(tptr+54)->re,(tptr+54)->im,
52 | t00,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13,
53 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],
54 | uc1,us1,uc2,us2,uc3,us3, rt,it,re,im
55 | ); tptr++; iptr += 7;
56 | }
57 |
58 | /*...Now do the carries. Since the outputs would
59 | normally be getting dispatched to 63 separate blocks of the A-array, we need 63 separate carries. */
60 |
61 | if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
62 | {
63 | l= j & (nwt-1);
64 | n_minus_sil = n-si[l ];
65 | n_minus_silp1 = n-si[l+1];
66 | sinwt = si[nwt-l ];
67 | sinwtm1 = si[nwt-l-1];
68 |
69 | wtl =wt0[ l ];
70 | wtn =wt0[nwt-l ]*scale; /* Include 1/(n/2) scale factor of inverse transform here... */
71 | wtlp1 =wt0[ l+1];
72 | wtnm1 =wt0[nwt-l-1]*scale; /* ...and here. */
73 |
74 | /*...set0 is slightly different from others; divide work into blocks of RADIX/4 macro calls, 1st set of which gets pulled out of loop: */
75 | // Apr 2014: Fermat-mod works fine, but mers-mod barfs immediately with what looks like a bad a0 value,
76 | // div-by-n/2 should give 16, but instead see
77 | // iter 1, full = 1, a0in = 15.492078993055555
78 | // iter 1, full = 1, a0out = 13.000000000000000
79 | // Iter = 1, maxerr = 0.492078993055555
80 | //if(!j)printf("iter %d, full = %d, a0in = %20.15f\n",iter,full_pass,a[0]/(n>>1));
81 | l = 0; addr = cy_r; itmp = bjmodn;
82 | jt = j1; jp = j2;
83 | cmplx_carry_norm_errcheck0(a[j1 ],a[j2 ],*addr,*itmp,0,prp_mult); ++l; ++addr; ++itmp;
84 | // Next 15 quartets of macro calls done in loop (1 call above + 15*4 here + 2 cleanup calls below = 63 in total):
85 | for(ntmp = 1; ntmp < 16; ntmp++) {
86 | cmplx_carry_norm_errcheck(a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
87 | cmplx_carry_norm_errcheck(a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
88 | cmplx_carry_norm_errcheck(a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
89 | jt = j1 + p[ntmp<<2]; jp = j2 + p[ntmp<<2];
90 | cmplx_carry_norm_errcheck(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
91 | }
92 | // Cleanup of final 2 sets of carries:
93 | cmplx_carry_norm_errcheck(a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
94 | cmplx_carry_norm_errcheck(a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
95 | //if(!j)printf("iter %d, full = %d, a0out = %20.15f\n",iter,full_pass,a[0]);
96 | i =((uint32)(sw - bjmodn[0]) >> 31); /* get ready for the next set... */
97 | co2 = co3; /* For all data but the first set in each j-block, co2=co3. Thus, after the first block of data is done
98 | (and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1). */
99 | }
100 | else /* MODULUS_TYPE_FERMAT */
101 | {
102 | // Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2):
103 | ntmp = 0; addr = cy_r; addi = cy_i; ic = 0; // ic = idx into icycle mini-array, gets incremented (mod ODD_RADIX) between macro calls
104 | jt = j1; jp = j2;
105 | fermat_carry_norm_errcheckB(a[jt ],a[jp ],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
106 | fermat_carry_norm_errcheckB(a[jt+p1],a[jp+p1],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
107 | fermat_carry_norm_errcheckB(a[jt+p2],a[jp+p2],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
108 | for(m = 1; m < 16; m++) { // 3 calls above + 15*4 calls in this loop = 63 carries
109 | fermat_carry_norm_errcheckB(a[jt+p3],a[jp+p3],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
110 | jt = j1 + p[m<<2]; jp = j2 + p[m<<2];
111 | fermat_carry_norm_errcheckB(a[jt ],a[jp ],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
112 | fermat_carry_norm_errcheckB(a[jt+p1],a[jp+p1],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
113 | fermat_carry_norm_errcheckB(a[jt+p2],a[jp+p2],*addr,*addi,icycle[ic],ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; ++ic;
114 | }
115 | for(l = 0; l < ODD_RADIX; l++) {
116 | icycle[l] += wts_idx_incr; /* Inside the loop use this, as it is faster than general-mod '% nwt' */
117 | icycle[l] += ( (-(int)((uint32)icycle[l] >> 31)) & nwt); /* if the sum went negative (sign bit set), add nwt back */
118 | }
119 | } /* if(MODULUS_TYPE == ...) */
120 |
121 | /*...The radix-63 DIF pass is here: */
122 |
123 | //...gather the needed data (63 64-bit complex, i.e. 126 64-bit reals) and do 9 radix-7 transforms:
124 | tptr = t; iptr = dif_iperm;
125 | for(l = 0; l < 9; l++) {
126 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)];
127 | RADIX_07_DFT(
128 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],
129 | t00,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13,
130 | tptr->re,tptr->im,(tptr+9)->re,(tptr+9)->im,(tptr+18)->re,(tptr+18)->im,(tptr+27)->re,(tptr+27)->im,(tptr+36)->re,(tptr+36)->im,(tptr+45)->re,(tptr+45)->im,(tptr+54)->re,(tptr+54)->im,
131 | uc1,us1,uc2,us2,uc3,us3, rt,it,re,im
132 | ); tptr++; iptr += 7;
133 | }
134 | //...and now do 7 radix-9 transforms:
135 | tptr = t; iptr = dif_operm;
136 | for(l = 0; l < 7; l++) {
137 | k0 = p[*iptr]; k1 = p[*(iptr+1)]; k2 = p[*(iptr+2)]; k3 = p[*(iptr+3)]; k4 = p[*(iptr+4)]; k5 = p[*(iptr+5)]; k6 = p[*(iptr+6)]; k7 = p[*(iptr+7)]; k8 = p[*(iptr+8)];
138 | RADIX_09_DIF(
139 | tptr->re,tptr->im,(tptr+1)->re,(tptr+1)->im,(tptr+2)->re,(tptr+2)->im,(tptr+3)->re,(tptr+3)->im,(tptr+4)->re,(tptr+4)->im,(tptr+5)->re,(tptr+5)->im,(tptr+6)->re,(tptr+6)->im,(tptr+7)->re,(tptr+7)->im,(tptr+8)->re,(tptr+8)->im,
140 | a[j1+k0],a[j2+k0],a[j1+k1],a[j2+k1],a[j1+k2],a[j2+k2],a[j1+k3],a[j2+k3],a[j1+k4],a[j2+k4],a[j1+k5],a[j2+k5],a[j1+k6],a[j2+k6],a[j1+k7],a[j2+k7],a[j1+k8],a[j2+k8],
141 | rt,it,re
142 | ); tptr += 9; iptr += 9;
143 | }
144 | }
145 | // Mersenne-mod only: advance the j-range by nwt, and move col up / co3 down by RADIX, before the next k pass.
146 | if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
147 | {
148 | jstart += nwt;
149 | jhi += nwt;
150 |
151 | col += RADIX;
152 | co3 -= RADIX;
153 | }
154 | } /* end for(k=1; k <= khi; k++) */
155 |
156 |
--------------------------------------------------------------------------------
/src/radix64.h:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | /****************************************************************************
24 | * We now include this header file if it was not included before.
25 | ****************************************************************************/
26 | #ifndef radix64_included
27 | #define radix64_included
28 |
29 | #include "radix32.h"
30 | /* Radix-64 twiddles: c64_k = cos(k*twopi/64) and s64_k = sin(k*twopi/64) for odd k = 1,3,5,7. */
31 | #define c64_1 ((double)0.99518472667219688624)
32 | #define s64_1 ((double)0.09801714032956060199) /* exp(1*I*twopi/64) */
33 | #define c64_3 ((double)0.95694033573220886494)
34 | #define s64_3 ((double)0.29028467725446236764) /* exp(3*I*twopi/64) */
35 | #define c64_5 ((double)0.88192126434835502971)
36 | #define s64_5 ((double)0.47139673682599764856) /* exp(5*I*twopi/64) */
37 | #define c64_7 ((double)0.77301045336273696081)
38 | #define s64_7 ((double)0.63439328416364549822) /* exp(7*I*twopi/64) */
39 |
40 | #endif /* #ifndef radix64_included */
41 |
--------------------------------------------------------------------------------
/src/rng_isaac.c:
--------------------------------------------------------------------------------
1 | /*
2 | ------------------------------------------------------------------------------
3 | isaac64.c: My random number generator for 64-bit machines.
4 | By Bob Jenkins, 1996. Public Domain.
5 | ------------------------------------------------------------------------------
6 | */
7 |
8 | #include
9 | #include "rng_isaac.h"
10 |
11 | /* externs declared in rng_isaac.h: */
12 | ub8 randrsl[RANDSIZ], randcnt;
13 |
14 | static ub8 mm[RANDSIZ];
15 | static ub8 aa=0, bb=0, cc=0;
16 | /* ind(mm,x): ub8 read from mm at byte offset (x & ((RANDSIZ-1)<<3)). rngstep(): one generator step - loads x from *m, sets a = mix + *m2++, stores y = ind(mm,x)+a+b to *m++ and b = ind(mm,y>>RANDSIZL)+x to *r++. NOTE: 'y' is not a macro parameter; it must exist in the invoking scope. */
17 | #define ind(mm,x) (*(ub8 *)((ub1 *)(mm) + ((x) & ((RANDSIZ-1)<<3))))
18 | #define rngstep(mix,a,b,mm,m,m2,r,x) \
19 | { \
20 | x = *m; \
21 | a = (mix) + *(m2++); \
22 | *(m++) = y = ind(mm,x) + a + b; \
23 | *(r++) = b = ind(mm,y>>RANDSIZL) + x; \
24 | }
25 |
26 | void isaac64()
27 | {
28 | register ub8 a,b,x,y,*m,*m2,*r,*mend;
29 | r = randrsl; /* Need a variable address pointer to feed to rngstep */
30 | a = aa; b = bb + (++cc);
31 | for (m = mm, mend = m2 = m+(RANDSIZ/2); m>5) , a, b, mm, m, m2, r, x);
35 | rngstep( a^(a<<12) , a, b, mm, m, m2, r, x);
36 | rngstep( a^(a>>33) , a, b, mm, m, m2, r, x);
37 | }
38 | for (m2 = mm; m2>5) , a, b, mm, m, m2, r, x);
42 | rngstep( a^(a<<12) , a, b, mm, m, m2, r, x);
43 | rngstep( a^(a>>33) , a, b, mm, m, m2, r, x);
44 | }
45 | bb = b; aa = a;
46 | }
47 | /* mix(a..h): in-place scramble of eight words; each of the 8 rows does a subtract, an xor with a shifted word, and an add. All arguments are modified. Invoked by rng_isaac_init(). */
48 | #define mix(a,b,c,d,e,f,g,h) \
49 | { \
50 | a-=e; f^=h>>9; h+=a; \
51 | b-=f; g^=a<<9; a+=b; \
52 | c-=g; h^=b>>23; b+=c; \
53 | d-=h; a^=c<<15; c+=d; \
54 | e-=a; b^=d>>14; d+=e; \
55 | f-=b; c^=e<<20; e+=f; \
56 | g-=c; d^=f>>17; f+=g; \
57 | h-=d; e^=g<<14; g+=h; \
58 | }
59 |
60 | void rng_isaac_init(word flag)
61 | {
62 | word i;
63 | ub8 a,b,c,d,e,f,g,h;
64 | aa=bb=cc=(ub8)0;
65 | a=b=c=d=e=f=g=h=0x9E3779B97F4A7C13ull; /* the golden ratio */
66 |
67 | for (i=0; i<4; ++i) /* scramble it */
68 | {
69 | mix(a,b,c,d,e,f,g,h);
70 | }
71 |
72 | for (i=0; i>32),(ub4)randrsl[j]);
114 | }
115 | }
116 | #endif
117 |
118 | /*
119 | 11/25/05: EWM - modified to add 3 types of double-precision floating rand() calls:
120 |
121 | - rng_isaac_rand_double() returns a random double via a 64-bit field
122 | which is (within the limits of the generator) a random 64-bit int;
123 |
124 | - rng_isaac_rand_double_norm_pos() returns a random double with
125 | probability uniformly distributed in [0, 1), insofar as IEEE64 doubles
126 | are capable of distributing such values, excluding underflows;
127 |
128 | - rng_isaac_rand_double_norm_pm1() returns a random double with
129 | probability uniformly distributed in (-1, 1), insofar as IEEE64 doubles
130 | are capable of distributing such values, excluding underflows;
131 | */
132 | double rng_isaac_rand_double()
133 | {
134 | uint64 iran64;
135 | uint32 fexp;
136 |
137 | /* Make sure resulting float will not be denormal: */
138 | for(;;)
139 | {
140 | iran64 = rng_isaac_rand();
141 | fexp = (uint32)(iran64 >> 52) & 0x7ff;
142 | if(fexp != 0 && fexp < 0x7f0) break;
143 | }
144 | return *(double *)&iran64;
145 | }
146 |
147 | /* Assumes IEEE64-compliant: */
148 | double rng_isaac_rand_double_norm_pos()
149 | {
150 | /*
151 | Obtain a result in [0, 1) by merging a sign/exponent field = 0x3ff with
152 | random 52-bit mantissa (52-bit because the hidden bit is assumed 1 via the
153 | choice of exponent - we only randomly generate the non-hidden 52 bits),
154 | yielding a result in [1, 2), and subtracting 1:
155 | */
156 | uint64 iran64, itmp64;
157 | double retval;
158 |
159 | itmp64 = rng_isaac_rand();
160 | iran64 = 0x3FF0000000000000ull + (itmp64 & 0x000FFFFFFFFFFFFFull);
161 | retval=(*(double *)&iran64) - 1.0;
162 | /* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */
163 | if(retval < 0.0 || retval > 1.0)
164 | {
165 | sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
166 | ASSERT(0, cbuf);
167 | }
168 | return retval;
169 | }
170 |
171 |
172 | /* Assumes IEEE64-compliant: */
173 | double rng_isaac_rand_double_norm_pm1()
174 | {
175 | /*
176 | Obtain a result in (-1, 1) by following the same procedure used in
177 | rng_isaac_rand_double_norm_pos to get a value in [0, 1) and multiplying
178 | the result by a random choice of -1 or +1. Note that this doubles the
179 | odds of getting a zero result, but we assume that won't be fatal -
180 | in essence one can consider that as though -0.0 and +0.0 were separate
181 | possible outputs, each occurring with probability equal to that of any
182 | of the discrete nonzero outputs.
183 | */
184 | static double pm1[] = {-1.0, +1.0};
185 | double sign;
186 | uint64 itmp64, iran64;
187 | double retval;
188 |
189 | itmp64 = rng_isaac_rand();
190 | sign = pm1[itmp64 >> 63]; /* Use high bit of iran64 for sign */
191 | iran64 = 0x3FF0000000000000ull + (itmp64 & 0x000FFFFFFFFFFFFFull);
192 | retval=sign*((*(double *)&iran64) - 1.0);
193 | /* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */
194 | if(retval < -1.0 || retval > 1.0)
195 | {
196 | sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
197 | ASSERT(0, cbuf);
198 | }
199 | return retval;
200 | }
201 |
202 |
--------------------------------------------------------------------------------
/src/rng_isaac.h:
--------------------------------------------------------------------------------
1 | /*
2 | ------------------------------------------------------------------------------
3 | isaac64.h: definitions for a random number generator
4 | Bob Jenkins, 1996, Public Domain
5 | ------------------------------------------------------------------------------
6 | */
7 | /****************************************************************************
8 | * We now include this header file if it was not included before.
9 | ****************************************************************************/
10 | #ifndef rng_isaac_h_included
11 | #define rng_isaac_h_included
12 |
13 | /*
14 | 11/25/05: EWM - typedefs to use standard int types defined in types.h :
15 | */
16 | #include "Mdata.h"
17 |
18 | #ifdef __cplusplus
19 | extern "C" {
20 | #endif
21 |
22 | typedef uint64 ub8; /* unsigned 8-byte quantities */
23 | #define UB8MAXVAL 0xffffffffffffffffLL
24 | #define UB8BITS 64
25 | typedef sint64 sb8; /* signed 8-byte quantities */
26 | #define SB8MAXVAL 0x7fffffffffffffffLL
27 | typedef uint32 ub4; /* unsigned 4-byte quantities */
28 | #define UB4MAXVAL 0xffffffff
29 | typedef sint32 sb4; /* signed 4-byte quantities */
30 | #define UB4BITS 32
31 | #define SB4MAXVAL 0x7fffffff
32 | typedef uint16 ub2; /* unsigned 2-byte quantities */
33 | #define UB2MAXVAL 0xffff
34 | #define UB2BITS 16
35 | typedef sint16 sb2; /* signed 2-byte quantities */
36 | #define SB2MAXVAL 0x7fff
37 | typedef uint8 ub1; /* unsigned 1-byte quantities */
38 | #define UB1MAXVAL 0xff
39 | #define UB1BITS 8
40 | typedef sint8 sb1; /* signed 1-byte quantities */
41 | #define SB1MAXVAL 0x7f
42 | typedef int word; /* fastest type available */
43 |
44 |
45 | #ifndef ISAAC64
46 | #define ISAAC64
47 |
48 | #define RANDSIZL (8)
49 | #define RANDSIZ (1<
73 |
74 | #ifdef __cplusplus
75 | extern "C"
76 | {
77 | #endif
78 |
79 | /* mutexes ---------------------------------------------------------*/
80 |
81 | #ifdef OS_TYPE_WINDOWS
82 | typedef HANDLE mutex_t;
83 | #else
84 | typedef pthread_mutex_t mutex_t;
85 | #endif
86 | /*
87 | static void mutex_init(mutex_t *m)
88 | {
89 | #ifdef OS_TYPE_WINDOWS
90 | *m = CreateMutex(NULL, FALSE, NULL);
91 | #else
92 | pthread_mutex_init(m, NULL);
93 | #endif
94 | }
95 |
96 | static void mutex_free(mutex_t *m)
97 | {
98 | #ifdef OS_TYPE_WINDOWS
99 | CloseHandle(*m);
100 | #else
101 | pthread_mutex_destroy(m);
102 | #endif
103 | }
104 |
105 | static void mutex_lock(mutex_t *m)
106 | {
107 | #ifdef OS_TYPE_WINDOWS
108 | WaitForSingleObject(*m, INFINITE);
109 | #else
110 | pthread_mutex_lock(m);
111 | #endif
112 | }
113 |
114 | static void mutex_unlock(mutex_t *m)
115 | {
116 | #ifdef OS_TYPE_WINDOWS
117 | ReleaseMutex(*m);
118 | #else
119 | pthread_mutex_unlock(m);
120 | #endif
121 | }
122 | */
123 | /* a thread pool --------------------------------------------------*/
124 | /* Callback types: each takes an opaque user-data pointer plus a thread number. The two control structs bundle such callbacks with the data pointer handed to them. */
125 | typedef void (*init_func)(void *data, int thread_num);
126 | typedef void (*run_func)(void *data, int thread_num);
127 | typedef void (*shutdown_func)(void *data, int thread_num);
128 |
129 | typedef struct { /* per-thread hooks: no 'run' member */
130 | init_func init;
131 | shutdown_func shutdown;
132 | void *data; /* NOTE(review): presumably passed as 'data' to the callbacks - confirm in the .c file */
133 | } thread_control_t;
134 |
135 | typedef struct { /* per-task hooks: init, run, shutdown */
136 | init_func init;
137 | run_func run;
138 | shutdown_func shutdown;
139 | void *data; /* NOTE(review): presumably passed as 'data' to the callbacks - confirm in the .c file */
140 | } task_control_t;
141 | /* Queue of opaque task pointers. NOTE(review): the head/tail/count/capacity fields suggest a circular buffer - confirm against the implementation. */
142 | struct threadpool_queue
143 | {
144 | unsigned int head;
145 | unsigned int tail;
146 | unsigned int num_tasks; /* presumably the current number of queued entries - TODO confirm */
147 | unsigned int max_tasks; /* presumably the capacity of 'tasks' - TODO confirm */
148 | void **tasks; /* array of task pointers */
149 | };
150 | /* Per-thread startup record: a thread number, a pointer to the owning pool, and a by-value copy of the thread control callbacks. */
151 | struct thread_init
152 | {
153 | int thread_num;
154 | struct threadpool *pool;
155 | thread_control_t control;
156 | };
157 | /* Thread-pool state: two task queues, task storage, per-thread records, and the pthread mutexes/condition variables. Uses pthread types directly rather than the mutex_t typedef above. */
158 | struct threadpool
159 | {
160 | struct threadpool_queue tasks_queue; /* presumably tasks waiting to run - TODO confirm */
161 | struct threadpool_queue free_tasks_queue; /* presumably unused task slots - TODO confirm */
162 |
163 | task_control_t *tasks;
164 |
165 | struct thread_init *thr_init;
166 | pthread_t *thr_arr;
167 |
168 | unsigned short num_of_threads;
169 | unsigned short num_of_cores;
170 | volatile unsigned short stop_flag; /* NOTE(review): volatile alone does not synchronize threads - check it is only accessed under 'mutex' */
171 |
172 | pthread_mutex_t free_tasks_mutex;
173 | pthread_cond_t free_tasks_cond;
174 | pthread_cond_t tasks_done_cond;
175 |
176 | pthread_mutex_t mutex;
177 | pthread_cond_t new_tasks_cond;
178 | };
179 |
180 | struct threadpool* threadpool_init(
181 | int num_threads,
182 | int num_cores,
183 | int queue_size,
184 | thread_control_t *t);
185 |
186 | int threadpool_add_task(struct threadpool *pool,
187 | task_control_t *t,
188 | int blocking);
189 |
190 | void threadpool_free(struct threadpool *pool);
191 |
192 | /* returns zero if no pending tasks */
193 | int threadpool_drain(struct threadpool *pool,
194 | int blocking);
195 |
196 | /********************* utility macros: ********************/
197 |
198 | // Don't use any of these at present, but note MacOS has its own versions of these, in /usr/include/X11/Xthreads.h:
199 | #if 1
/*
malloc() wrapper that never returns NULL for a nonzero request: on failure
it prints a diagnostic and terminates the process via exit(-1).
A zero-byte request may legitimately yield NULL and is not treated as an error.
The size is printed with %zu, so requests of 4 GiB or more are reported
correctly instead of being truncated to 32 bits.
*/
static void * xmalloc(size_t len) {
	void *ptr = malloc(len);
	if (ptr == NULL && len != 0) {
		printf("failed to allocate %zu bytes\n", len);
		exit(-1);
	}
	return ptr;
}
208 |
/*
calloc() wrapper that never returns NULL for a nonzero request: on failure
it prints a diagnostic and terminates the process via exit(-1).
The element count and element size are reported separately, because their
product can overflow size_t - which is itself one of the reasons calloc fails.
A request with num == 0 or len == 0 may legitimately yield NULL.
*/
static void * xcalloc(size_t num, size_t len) {
	void *ptr = calloc(num, len);
	if (ptr == NULL && num != 0 && len != 0) {
		printf("failed to calloc %zu x %zu bytes\n", num, len);
		exit(-1);
	}
	return ptr;
}
217 |
/*
realloc() wrapper that never returns NULL for a nonzero request: on failure
it prints a diagnostic and terminates the process via exit(-1), so callers
may assign the result straight back to the original pointer.
A zero-byte request may legitimately yield NULL and is not treated as an error.
The size is printed with %zu to avoid truncating it to 32 bits.
*/
static void * xrealloc(void *iptr, size_t len) {
	void *ptr = realloc(iptr, len);
	if (ptr == NULL && len != 0) {
		printf("failed to reallocate %zu bytes\n", len);
		exit(-1);
	}
	return ptr;
}
226 | #endif
227 |
228 | #ifdef __cplusplus
229 | }
230 | #endif
231 |
232 | #endif /* !_THREAD_H_ */
233 |
234 |
--------------------------------------------------------------------------------
/src/types.c:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * *
3 | * (C) 1997-2021 by Ernst W. Mayer. *
4 | * *
5 | * This program is free software; you can redistribute it and/or modify it *
6 | * under the terms of the GNU General Public License as published by the *
7 | * Free Software Foundation; either version 2 of the License, or (at your *
8 | * option) any later version. *
9 | * *
10 | * This program is distributed in the hope that it will be useful, but WITHOUT *
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for *
13 | * more details. *
14 | * *
15 | * You should have received a copy of the GNU General Public License along *
16 | * with this program; see the file GPL.txt. If not, you may view one at *
17 | * http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the *
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA *
19 | * 02111-1307, USA. *
20 | * *
21 | *******************************************************************************/
22 |
23 | #include "types.h"
24 |
25 | /* Useful extern constants to export: */
26 |
27 | /* Multiword ints have word significance increasing from left to right: */
28 | /* For each width, NILxxx/ONExxx/TWOxxx hold the values 0/1/2: the first (least-significant) word carries the value and every higher word is zero. */
29 | /* 5/04/2005: uint96/160s are really uint128/192s with upper 32 bits zero: */
30 | const uint96 NIL96 = {(uint64)0, (uint32)0};
31 | const uint96 ONE96 = {(uint64)1, (uint32)0};
32 | const uint96 TWO96 = {(uint64)2, (uint32)0};
33 |
34 | const uint128 NIL128 = {(uint64)0, (uint64)0};
35 | const uint128 ONE128 = {(uint64)1, (uint64)0};
36 | const uint128 TWO128 = {(uint64)2, (uint64)0};
37 |
38 | const uint160 NIL160 = {(uint64)0, (uint64)0, (uint32)0};
39 | const uint160 ONE160 = {(uint64)1, (uint64)0, (uint32)0};
40 | const uint160 TWO160 = {(uint64)2, (uint64)0, (uint32)0};
41 |
42 | const uint192 NIL192 = {(uint64)0, (uint64)0, (uint64)0};
43 | const uint192 ONE192 = {(uint64)1, (uint64)0, (uint64)0};
44 | const uint192 TWO192 = {(uint64)2, (uint64)0, (uint64)0};
45 |
46 | const uint256 NIL256 = {(uint64)0, (uint64)0, (uint64)0, (uint64)0};
47 | const uint256 ONE256 = {(uint64)1, (uint64)0, (uint64)0, (uint64)0};
48 | const uint256 TWO256 = {(uint64)2, (uint64)0, (uint64)0, (uint64)0};
49 |
50 | const uint512 NIL512 = {(uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0};
51 | const uint512 ONE512 = {(uint64)1, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0};
52 | const uint512 TWO512 = {(uint64)2, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0, (uint64)0};
53 |
54 | /* Nov 2021: Case-insensitive analog of strstr - used the code posted by 'chux' here:
55 | https://stackoverflow.com/questions/27303062/strstr-function-like-that-ignores-upper-or-lower-case
56 | */
57 | #include // Needed for tolower ... this include is normally via masterdefs.h
/*
Case-insensitive strstr(): returns a pointer to the first position in
haystack at which needle matches when both are compared via tolower(),
or a null pointer if there is no such position.
An empty needle matches at the start of haystack.
A null haystack or needle yields a null pointer instead of being dereferenced.
*/
char* stristr(const char* haystack, const char* needle) {
	if (!haystack || !needle)
		return 0;

	for (;; haystack++) {
		const char* h = haystack;
		const char* n = needle;
		/* Cast to unsigned char first: passing a negative char to tolower() is undefined. */
		while (*n && tolower((unsigned char) *h) == tolower((unsigned char) *n)) {
			h++;
			n++;
		}
		if (*n == '\0')		/* consumed the whole needle: match at this offset */
			return (char *) haystack;
		if (*haystack == '\0')	/* haystack exhausted with no match */
			return 0;
	}
}
72 |
73 | /* Binary predicates for use of stdlib qsort(): */
/*
Default-int compare predicate for qsort(): returns -1, 0 or +1 according to
whether *a is less than, equal to or greater than *b.
Uses a three-way comparison rather than the difference *a - *b, because the
subtraction overflows (undefined behavior, wrong sign in practice) when the
operands are far apart, e.g. INT_MIN vs. 1.
*/
int ncmp_int(const void * a, const void * b)	// Default-int compare predicate
{
	const int x = *(const int*)a;
	const int y = *(const int*)b;
	return (x > y) - (x < y);
}
78 |
79 | int ncmp_uint32(const void * a, const void * b) // Mnemonic: "Numeric CoMPare of UINT32 data"
80 | {
81 | uint32 diff = *(uint32*)a - *(uint32*)b;
82 | uint32 borrow = 1 - ((diff > *(uint32*)a) << 1); // -1 if (a < b), +1 otherwise
83 | // If (diff > a) == 1, had a borrow, i.e. a < b, return -1.
84 | // Otherwise return 0 if diff == 0, +1 if diff != 0. Can roll all 3 possibilities into one expression:
85 | return ( borrow & -(diff != 0) );
86 | /*
87 | a < b: bw = -1, (diff != 0) = 1, -() = -1 ===> -1 & -1 = -1
88 | a = b: bw = +1, (diff != 0) = 0, -() = 0 ===> +1 & 0 = 0
89 | a > b: bw = +1, (diff != 0) = 1, -() = -1 ===> +1 & -1 = +1
90 | */
91 | }
92 |
93 | int ncmp_sint32(const void * a, const void * b)
94 | {
95 | return ( *(sint32*)a - *(sint32*)b );
96 | }
97 |
98 | int ncmp_uint64(const void * a, const void * b)
99 | {
100 | uint64 diff = *(uint64*)a - *(uint64*)b;
101 | uint64 borrow = 1 - ((diff > *(uint64*)a) << 1); // -1 if (a < b), +1 otherwise
102 | return ( borrow & -(diff != 0) );
103 | }
104 |
105 | int ncmp_sint64(const void * a, const void * b)
106 | {
107 | return ( *(sint64*)a - *(sint64*)b );
108 | }
109 |
110 |
--------------------------------------------------------------------------------