├── .gitmodules ├── sites ├── make.inc.olcf_summit ├── make.inc.nersc_cgpu ├── make.inc.CIMS ├── make.inc.nersc_cori ├── make.inc.nersc_perlmutter └── make.inc.FI ├── python └── cufinufft │ ├── requirements.txt │ ├── __init__.py │ ├── docs │ ├── index.rst │ ├── Makefile │ ├── make.bat │ └── conf.py │ ├── tests │ ├── test_examples.py │ ├── test_multi.py │ ├── utils.py │ ├── test_error_checks.py │ └── test_basic.py │ ├── README.md │ └── _cufinufft.py ├── docs ├── logo.png └── cufinufft_announce.png ├── MANIFEST.in ├── targets ├── make.inc.manylinux └── make.inc.power9 ├── contrib ├── legendre_rule_fast.h ├── legendre_rule_fast.license ├── dirft.h ├── README ├── common.h ├── dataTypes.h ├── utils.cpp ├── spreadinterp.h ├── utils.h ├── utils_fp.cpp ├── utils_fp.h ├── dirft2d.cpp ├── spreadinterp.cpp └── common.cpp ├── .gitignore ├── ci ├── docker │ ├── cuda10.1 │ │ ├── cuda.repo │ │ ├── README │ │ ├── CentOS-SCLo-scl-rh.repo │ │ ├── CentOS-SCLo-scl.repo │ │ ├── vault.repo │ │ └── Dockerfile-x86_64 │ ├── cuda11.0 │ │ ├── cuda.repo │ │ ├── README │ │ └── Dockerfile-x86_64 │ └── cuda10.1-manylinux2014 │ │ ├── cuda.repo │ │ ├── README │ │ └── Dockerfile-x86_64 ├── build-wheels.sh └── distribution_helper.sh ├── include ├── profile.h ├── cufinufft.h ├── utils.h ├── cufinufft_opts.h └── cufinufft_errors.h ├── src ├── common.h ├── 1d │ ├── README │ ├── interp1d_wrapper.cu │ └── cufinufft1d.cu ├── 2d │ ├── README │ └── cufinufft2d.cu ├── 3d │ ├── README │ └── cufinufft3d.cu ├── memtransfer.h ├── README ├── profile.cu ├── cudeconvolve.h ├── common.cu └── precision_independent.h ├── test ├── fseriesperf.sh ├── spreadperf.sh ├── cufinufft2d2api_test.cu ├── cufinufft2d2api_test_32.cu ├── interp1d_test.cu ├── spread1d_test.cu ├── interp2d_test.cu ├── spread2d_test.cu ├── cufinufft1d1_test.cu ├── fseries_kernel_test.cu ├── cufinufft1d2_test.cu ├── interp3d_test.cu ├── cufinufft2d1_test.cu ├── cufinufft2d2_test.cu ├── cufinufft3d1_test.cu ├── cufinufft2d1many_test.cu ├── spread3d_test.cu └── cufinufft3d2_test.cu ├── examples ├── README ├── example2d2many.py ├── example2d1many.py ├── example2d1many.cpp └── example2d2many.cpp ├── .bumpversion.cfg ├── Jenkinsfile ├── LICENSE ├── setup.py └── CHANGELOG /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sites/make.inc.olcf_summit: -------------------------------------------------------------------------------- 1 | target:=power9 2 | -------------------------------------------------------------------------------- /python/cufinufft/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pycuda 3 | six 4 | -------------------------------------------------------------------------------- /sites/make.inc.nersc_cgpu: -------------------------------------------------------------------------------- 1 | NVCC_STUBS := $(CUDA_ROOT)/lib64/stubs 2 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/cufinufft/HEAD/docs/logo.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include python/cufinufft/README.md 2 | include python/cufinufft/requirements.txt 3 | 
-------------------------------------------------------------------------------- /docs/cufinufft_announce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/cufinufft/HEAD/docs/cufinufft_announce.png -------------------------------------------------------------------------------- /targets/make.inc.manylinux: -------------------------------------------------------------------------------- 1 | CFLAGS = -fPIC -O3 -funroll-loops -march=x86-64 -mtune=generic -msse4 -fcx-limited-range 2 | -------------------------------------------------------------------------------- /python/cufinufft/__init__.py: -------------------------------------------------------------------------------- 1 | from cufinufft.cufinufft import cufinufft 2 | 3 | __all__ = ['cufinufft'] 4 | __version__ = '1.3' 5 | -------------------------------------------------------------------------------- /contrib/legendre_rule_fast.h: -------------------------------------------------------------------------------- 1 | #ifndef GAUSSQUAD_H 2 | #define GAUSSQUAD_H 3 | 4 | void legendre_compute_glr ( int n, double x[], double w[] ); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | __pycache__ 3 | bin 4 | lib 5 | lib-static 6 | 7 | python/cufinufft/docs/_build 8 | python/cufinufft/docs/_static 9 | python/cufinufft/docs/_templates 10 | -------------------------------------------------------------------------------- /sites/make.inc.CIMS: -------------------------------------------------------------------------------- 1 | CUDA_ROOT=/usr/local/stow/cuda-10.0 2 | INC=-I$(CUDA_ROOT)/include \ 3 | -I$(CUDA_ROOT)/samples/common/inc 4 | NVCC_LIBS_PATH=-L$(CUDA_ROOT)/lib64 5 | NVARCH=-arch=sm_70 6 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/cuda.repo: -------------------------------------------------------------------------------- 1 | [cuda] 2 | name=cuda 3 | baseurl=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA 7 | -------------------------------------------------------------------------------- /ci/docker/cuda11.0/cuda.repo: -------------------------------------------------------------------------------- 1 | [cuda] 2 | name=cuda 3 | baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA 7 | -------------------------------------------------------------------------------- /sites/make.inc.nersc_cori: -------------------------------------------------------------------------------- 1 | CC := $(shell which cc) 2 | CXX := $(shell which CC) 3 | 4 | $(info detected compiler wrappers:) 5 | $(info CC = $(CC)) 6 | $(info CXX = $(CXX)) 7 | 8 | NVCC_STUBS := $(CUDA_ROOT)/lib64/stubs 9 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1-manylinux2014/cuda.repo: -------------------------------------------------------------------------------- 1 | [cuda] 2 | name=cuda 3 | baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA 7 | -------------------------------------------------------------------------------- 
/ci/docker/cuda10.1/README: -------------------------------------------------------------------------------- 1 | This configuration is based off of manylinux2010, 2 | which is itself based off of centos6. 3 | 4 | I have extended manylinux with a compatible CUDA 5 | toolkit and runtime environment suitable for 6 | both building and running code inside docker. 7 | -------------------------------------------------------------------------------- /ci/docker/cuda11.0/README: -------------------------------------------------------------------------------- 1 | This configuration is based off of manylinux2014, 2 | which is itself based off of centos8. 3 | 4 | I have extended manylinux with a compatible CUDA 5 | toolkit and runtime environment suitable for 6 | both building and running code inside docker. 7 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1-manylinux2014/README: -------------------------------------------------------------------------------- 1 | This configuration is based off of manylinux2014, 2 | which is itself based off of centos8. 3 | 4 | I have extended manylinux with a compatible CUDA 5 | toolkit and runtime environment suitable for 6 | both building and running code inside docker. 7 | -------------------------------------------------------------------------------- /targets/make.inc.power9: -------------------------------------------------------------------------------- 1 | # -march is not always supported when compiling power9 targets, so we use `mcpu` and `mtune` instead. 2 | CFLAGS := -fPIC -O3 -funroll-loops -g 3 | CXXFLAGS := -fPIC -O3 -funroll-loops -mcpu=native -mtune=native -g -std=c++11 4 | 5 | # All power9 systems so far have had recent GPU hardware. 6 | NVARCH := -arch=sm_70 7 | -------------------------------------------------------------------------------- /contrib/legendre_rule_fast.license: -------------------------------------------------------------------------------- 1 | LICENSE info for legendre_rule_fast.c ONLY: 2 | 3 | According to 4 | https://people.sc.fsu.edu/~jburkardt/c_src/legendre_rule_fast/legendre_rule_fast.html 5 | 6 | The computer code and data files described and made available on this web page are distributed under the GNU LGPL license: 7 | 8 | https://www.gnu.org/licenses/lgpl-3.0.en.html 9 | -------------------------------------------------------------------------------- /python/cufinufft/docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to cufinufft's Python documentation! 2 | ============================================== 3 | 4 | .. automodule:: cufinufft 5 | :members: 6 | :member-order: bysource 7 | 8 | 9 | ..
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | -------------------------------------------------------------------------------- /include/profile.h: -------------------------------------------------------------------------------- 1 | #ifndef PROFILE_H 2 | #define PROFILE_H 3 | 4 | #include 5 | 6 | class CudaTracer { 7 | public: 8 | CudaTracer(const char* name, int cid = 0); 9 | ~CudaTracer(); 10 | }; 11 | 12 | 13 | 14 | #define PROFILE_CUDA(fname) CudaTracer uniq_name_using_macros__(fname); 15 | #define PROFILE_CUDA_GROUP(fname, groupid) CudaTracer uniq_name_using_macros__(fname, groupid); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | #include 4 | 5 | __global__ 6 | void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns); 7 | 8 | int CUFSERIESKERNELCOMPUTE(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns); 9 | #endif 10 | -------------------------------------------------------------------------------- /src/1d/README: -------------------------------------------------------------------------------- 1 | - cufinufft1d.cu 2 | This file contains the execution functions 1d type 1,2 that are called in ../cufinufft.cu 3 | 4 | - spreadinterp1d.cu 5 | This file contains all the GPU kernels for 1d spreading, interpolation. 6 | 7 | - interp1d_wrapper.cu 8 | Wrappers for 1d interpolations. One method is implemented: 9 | (1) nonuniform driven, 10 | 11 | - spread1d_wrapper.cu 12 | Wrappers for 1d spreading. Two methods are implemented: 13 | (1) nonuniform driven, 14 | (2) subproblem 15 | -------------------------------------------------------------------------------- /sites/make.inc.nersc_perlmutter: -------------------------------------------------------------------------------- 1 | CC := $(shell which cc) 2 | CXX := $(shell which CC) 3 | 4 | $(info detected compiler wrappers:) 5 | $(info CC = $(CC)) 6 | $(info CXX = $(CXX)) 7 | 8 | 9 | CUDA_ROOT := $(CUDATOOLKIT_HOME) 10 | NVARCH := -arch=sm_80 \ 11 | -gencode=arch=compute_70,code=sm_70 \ 12 | -gencode=arch=compute_75,code=sm_75 \ 13 | -gencode=arch=compute_80,code=sm_80 \ 14 | -gencode=arch=compute_80,code=sm_80 \ 15 | -gencode=arch=compute_86,code=compute_86 16 | 17 | NVCC_STUBS := $(CUDA_ROOT)/lib64/stubs 18 | -------------------------------------------------------------------------------- /src/2d/README: -------------------------------------------------------------------------------- 1 | - cufinufft2d.cu 2 | This file contains the execution functions 2d type 1,2 that are called in ../cufinufft.cu 3 | 4 | - spreadinterp2d.cu 5 | This file contains all the GPU kernels for 2d spreading, interpolation. 6 | 7 | - interp2d_wrapper.cu 8 | Wrappers for 2d interpolations. Two methods are implemented: 9 | (1) nonuniform driven, 10 | (2) subproblem 11 | 12 | - spread2d_wrapper.cu 13 | Wrappers for 2d spreading. 
Three methods are implemented: 14 | (1) nonuniform driven, 15 | (2) subproblem, 16 | (3) paul's idea 17 | -------------------------------------------------------------------------------- /src/3d/README: -------------------------------------------------------------------------------- 1 | - cufinufft3d.cu 2 | This file contains the execution functions for 3d type1,2 that are called in ../cufinufft.cu 3 | 4 | - spreadinterp3d.cu 5 | This file contains all the GPU kernels for 3d spreading, interpolation. 6 | 7 | - interp3d_wrapper.cu 8 | Wrappers for 3d interpolations. Two methods are implemented: 9 | (1) nonuniform driven, 10 | (2) subproblem 11 | 12 | - spread3d_wrapper.cu 13 | Wrappers for 3d spreading. Three methods are implemented: 14 | (1) nonuniform points driven, 15 | (2) subproblem, 16 | (4) block gather 17 | -------------------------------------------------------------------------------- /include/cufinufft.h: -------------------------------------------------------------------------------- 1 | // Defines the C++/C user interface to FINUFFT library. 2 | 3 | // It simply combines single and double precision headers, by flipping a flag 4 | // in the main macros which are in cufinufft_eitherprec.h 5 | // No usual #ifndef testing is needed; it's done in cufinufft_eitherprec.h 6 | // Internal cufinufft routines that are compiled separately for 7 | // each precision should include cufinufft_eitherprec.h directly, and not cufinufft.h. 8 | 9 | #undef SINGLE 10 | #include 11 | #define SINGLE 12 | #include 13 | #undef SINGLE 14 | -------------------------------------------------------------------------------- /src/memtransfer.h: -------------------------------------------------------------------------------- 1 | #ifndef __MEMTRANSFER_H__ 2 | #define __MEMTRANSFER_H__ 3 | 4 | #include 5 | 6 | int ALLOCGPUMEM1D_PLAN(CUFINUFFT_PLAN d_plan); 7 | int ALLOCGPUMEM1D_NUPTS(CUFINUFFT_PLAN d_plan); 8 | void FREEGPUMEMORY1D(CUFINUFFT_PLAN d_plan); 9 | 10 | int ALLOCGPUMEM2D_PLAN(CUFINUFFT_PLAN d_plan); 11 | int ALLOCGPUMEM2D_NUPTS(CUFINUFFT_PLAN d_plan); 12 | void FREEGPUMEMORY2D(CUFINUFFT_PLAN d_plan); 13 | 14 | int ALLOCGPUMEM3D_PLAN(CUFINUFFT_PLAN d_plan); 15 | int ALLOCGPUMEM3D_NUPTS(CUFINUFFT_PLAN d_plan); 16 | void FREEGPUMEMORY3D(CUFINUFFT_PLAN d_plan); 17 | #endif 18 | -------------------------------------------------------------------------------- /python/cufinufft/tests/test_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Discover and run Python example scripts as unit tests. 3 | """ 4 | 5 | import os 6 | import subprocess 7 | import sys 8 | from pathlib import Path 9 | 10 | import pytest 11 | 12 | examples_dir = os.path.join(Path(__file__).resolve().parents[3], "examples") 13 | 14 | scripts = [] 15 | for filename in os.listdir(examples_dir): 16 | if filename.endswith(".py"): 17 | scripts.append(os.path.join(examples_dir, filename)) 18 | 19 | @pytest.mark.parametrize("filename", scripts) 20 | def test_example(filename): 21 | subprocess.check_call([sys.executable, filename]) 22 | -------------------------------------------------------------------------------- /src/README: -------------------------------------------------------------------------------- 1 | This folder contains the main source files of the GPU implementations. 2 | - cufinufft.cu 3 | Four main stages of cufinufft API. 4 | (1) cufinufft_makeplan, (2) cufinufft_setpts, (3) cufinufft_execute, (4) cufinufft_destroy. 5 | Also, cufinufft_default_opts may precede stage 1. 
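For orientation, a minimal sketch of driving these four stages through the Python wrapper (see ../python/cufinufft/tests and ../examples for complete, tested versions); the grid sizes, tolerance, and variable names below are illustrative assumptions only, and in Python the destroy stage happens when the plan object is released:

    import numpy as np
    import pycuda.autoinit                       # creates a CUDA context
    from pycuda.gpuarray import GPUArray, to_gpu
    from cufinufft import cufinufft

    N1, N2, M = 64, 64, 1000                     # assumed sizes, for illustration
    kx = np.random.uniform(-np.pi, np.pi, size=M).astype(np.float32)
    ky = np.random.uniform(-np.pi, np.pi, size=M).astype(np.float32)
    c = (np.random.standard_normal(M)
         + 1j * np.random.standard_normal(M)).astype(np.complex64)
    fk_gpu = GPUArray((N1, N2), dtype=np.complex64)

    plan = cufinufft(1, (N1, N2), eps=1e-6, dtype=np.float32)  # (1) makeplan
    plan.set_pts(to_gpu(kx), to_gpu(ky))                       # (2) setpts
    plan.execute(to_gpu(c), fk_gpu)                            # (3) execute
    fk = fk_gpu.get()
    del plan                                                   # (4) destroy: GPU resources freed with the plan object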
6 | 7 | - memtransfer_wrapper.cu 8 | Wrappers for allocating and freeing GPU memory for different dimensions and methods. 9 | 10 | - deconvolve_wrapper.cu 11 | GPU kernels and wrappers that deconvolve or amplify the input/output coefficients by the correction factor. (Step 3 in Type 1; Step 1 in Type 2) 12 | 13 | - profile.cu 14 | Code for using NVProf 15 | -------------------------------------------------------------------------------- /test/fseriesperf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # basic perf test of compute fseries for 1d, single/double 3 | # Melody 02/20/22 4 | 5 | BIN=../bin/fseries_kernel_test 6 | DIM=1 7 | 8 | echo "Double.............................................." 9 | for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 10 | do 11 | for TOL in 1e-8 12 | do 13 | $BIN $N $DIM $TOL 0 14 | $BIN $N $DIM $TOL 1 15 | done 16 | done 17 | 18 | BIN=../bin/fseries_kernel_test_32 19 | echo "Single.............................................." 20 | for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 21 | do 22 | for TOL in 1e-6 23 | do 24 | $BIN $N $DIM $TOL 0 25 | $BIN $N $DIM $TOL 1 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef FINUFFT_UTILS_H 2 | #define FINUFFT_UTILS_H 3 | 4 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 5 | #else 6 | __inline__ __device__ double atomicAdd(double* address, double val) 7 | { 8 | unsigned long long int* address_as_ull = 9 | (unsigned long long int*)address; 10 | unsigned long long int old = *address_as_ull, assumed; 11 | 12 | do { 13 | assumed = old; 14 | old = atomicCAS(address_as_ull, assumed, 15 | __double_as_longlong(val + 16 | __longlong_as_double(assumed))); 17 | 18 | // Note: uses integer comparison to avoid hang in case of NaN 19 | // (since NaN != NaN) 20 | } while (assumed != old); 21 | 22 | return __longlong_as_double(old); 23 | } 24 | #endif 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /python/cufinufft/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/README: -------------------------------------------------------------------------------- 1 | Examples of cuFINUFFT usage in C++ and Python 2 | 3 | Here we show 2D transforms of type 1 and 2 being performed and tested, 4 | in C++ and in Python. In each case, a batch of transforms is done with 5 | new coefficients or weights, but the same set of nonuniform points; this 6 | explains the suffix "many" in the code names. You may set ntransf=1 to 7 | perform a single transform.
Default options are used. In each case the 8 | four steps (plan, setpts, execute, destroy) are used. A math test is also 9 | performed; see the FINUFFT documentation for the definitions of the 10 | transforms: https://finufft.readthedocs.io/en/latest/math.html 11 | 12 | For more usage examples see: 13 | 14 | ../test/cufinufft*.cu 15 | ../python/cufinufft/tests/*.py 16 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.3 3 | parse = (?P\d+)\.(?P\d+) 4 | serialize = 5 | {major}.{minor} 6 | commit = True 7 | tag = True 8 | 9 | [bumpversion:file:setup.py] 10 | search = version='{current_version}' 11 | replace = version='{new_version}' 12 | 13 | [bumpversion:file:README.md] 14 | search = v{current_version} 15 | replace = v{new_version} 16 | 17 | [bumpversion:file:python/cufinufft/README.md] 18 | search = v{current_version} 19 | replace = v{new_version} 20 | 21 | [bumpversion:file:python/cufinufft/docs/conf.py] 22 | search = release = '{current_version}' 23 | replace = release = '{new_version}' 24 | 25 | [bumpversion:file:python/cufinufft/__init__.py] 26 | search = __version__ = '{current_version}' 27 | replace = __version__ = '{new_version}' 28 | 29 | [bumpversion:file:ci/distribution_helper.sh] 30 | search = cufinufft_version={current_version} 31 | replace = cufinufft_version={new_version} 32 | -------------------------------------------------------------------------------- /contrib/dirft.h: -------------------------------------------------------------------------------- 1 | #ifndef DIRFT_H 2 | #define DIRFT_H 3 | 4 | #include "utils.h" 5 | #include "utils_fp.h" 6 | 7 | void dirft1d1(BIGINT nj,FLT* x,CPX* c,int isign,BIGINT ms, CPX* f); 8 | void dirft1d2(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f); 9 | void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f); 10 | 11 | void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); 12 | void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); 13 | void dirft2d3(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, CPX* f); 14 | 15 | void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); 16 | void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); 17 | void dirft3d3(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, FLT *u, CPX* f); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /python/cufinufft/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /ci/build-wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u -x 3 | 4 | function repair_wheel { 5 | wheel="$1" 6 | if ! "${PYBIN}/auditwheel" show "$wheel"; then 7 | echo "Skipping non-platform wheel $wheel" 8 | else 9 | "${PYBIN}/auditwheel" repair "$wheel" --plat "$PLAT" -w /io/wheelhouse/ 10 | fi 11 | } 12 | 13 | 14 | # Compile wheels 15 | for PYBIN in /opt/python/cp3*/bin; do 16 | "${PYBIN}/pip" install --upgrade pip 17 | "${PYBIN}/pip" install -r /io/python/cufinufft/requirements.txt 18 | "${PYBIN}/pip" install auditwheel pytest 19 | "${PYBIN}/pip" wheel /io/ --no-deps -w wheelhouse/ 20 | done 21 | 22 | 23 | # Bundle external shared libraries into the wheels 24 | for whl in wheelhouse/*.whl; do 25 | repair_wheel "$whl" 26 | done 27 | 28 | 29 | # Install packages and test 30 | for PYBIN in /opt/python/cp3*/bin/; do 31 | "${PYBIN}/pip" install cufinufft -f /io/wheelhouse 32 | "${PYBIN}/python" -m pytest /io/python/cufinufft/tests 33 | done 34 | -------------------------------------------------------------------------------- /include/cufinufft_opts.h: -------------------------------------------------------------------------------- 1 | #ifndef __CUFINUFFT_OPTS_H__ 2 | #define __CUFINUFFT_OPTS_H__ 3 | 4 | typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults 5 | double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented 6 | /* following options are for gpu */ 7 | int gpu_method; // 1: nonuniform-pts driven, 2: shared mem (SM) 8 | int gpu_sort; // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort) 9 | 10 | int gpu_binsizex; // used for 2D, 3D subproblem method 11 | int gpu_binsizey; 12 | int gpu_binsizez; 13 | 14 | int gpu_obinsizex; // used for 3D spread block gather method 15 | int gpu_obinsizey; 16 | int gpu_obinsizez; 17 | 18 | int gpu_maxsubprobsize; 19 | int gpu_nstreams; 20 | int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval 21 | 22 | int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only 23 | 24 | /* multi-gpu support */ 25 | int gpu_device_id; 26 | } cufinufft_opts; 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /contrib/README: -------------------------------------------------------------------------------- 1 | This folder contains functions/files from FINUFFT that cuFINUFFT used. Following describes details of dependencies of each file. 2 | - utils.h 3 | Definitions of CUCPX, CUFFT_TYPE, CUFFT_EX are added and are set depending on preprocessor SINGLE. 4 | Definition of BIGINT is changed to the normal 4 byte integer (See line 81) 5 | 6 | - utils.cpp 7 | This is required because of the use of computing norm, relative norm of vectors, e.g. relerrtwonorm, in the test codes. 8 | 9 | - common.h 10 | - common.cpp (hence legendre_rule_fast.c/.h are included) 11 | setup_spreader_for_nufft, set_nf_type12, onedim_fseries_kernel are called in cufinufft_makeplan. 
12 | 13 | - spreadinterp.h 14 | cufinufft plan contains the spread_opts struct where nspread, spread_direction, pirange, upsampfac, ES_beta, ES_c are used. 15 | 16 | - ker_horner_allw_loop.c 17 | - ker_lowupsampfac_horner_allw_loop.c 18 | These two files are included in the src/2,3d/spreadinterp2,3d.cu files (See device function eval_kernel_vec_Horner) 19 | -------------------------------------------------------------------------------- /src/profile.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 7 | 0x0000ffff, 0x00ff0000, 0x00ffffff }; 8 | const int num_colors = sizeof(colors)/sizeof(uint32_t); 9 | 10 | #define PUSH_RANGE(name,cid) { \ 11 | int color_id = cid; \ 12 | color_id = color_id%num_colors;\ 13 | nvtxEventAttributes_t eventAttrib = {0}; \ 14 | eventAttrib.version = NVTX_VERSION; \ 15 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 16 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 17 | eventAttrib.color = colors[color_id]; \ 18 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 19 | eventAttrib.message.ascii = name; \ 20 | nvtxRangePushEx(&eventAttrib); \ 21 | } 22 | #define POP_RANGE nvtxRangePop(); 23 | 24 | CudaTracer::CudaTracer(const char* name, int cid) 25 | { 26 | PUSH_RANGE(name,cid); 27 | } 28 | 29 | CudaTracer::~CudaTracer() { 30 | POP_RANGE; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/CentOS-SCLo-scl-rh.repo: -------------------------------------------------------------------------------- 1 | # CentOS-SCLo-rh.repo 2 | # 3 | # Please see http://wiki.centos.org/SpecialInterestGroup/SCLo for more 4 | # information 5 | 6 | [centos-sclo-rh] 7 | name=CentOS-6 - SCLo rh 8 | baseurl=http://vault.centos.org/centos/6/sclo/$basearch/rh/ 9 | gpgcheck=1 10 | enabled=1 11 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 12 | 13 | [centos-sclo-rh-testing] 14 | name=CentOS-6 - SCLo rh Testing 15 | baseurl=http://buildlogs.centos.org/centos/6/sclo/$basearch/rh/ 16 | gpgcheck=0 17 | enabled=0 18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 19 | 20 | [centos-sclo-rh-source] 21 | name=CentOS-6 - SCLo rh Sources 22 | baseurl=http://vault.centos.org/centos/6/sclo/Source/rh/ 23 | gpgcheck=1 24 | enabled=0 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 26 | 27 | [centos-sclo-rh-debuginfo] 28 | name=CentOS-6 - SCLo rh Debuginfo 29 | baseurl=http://debuginfo.centos.org/centos/6/sclo/$basearch/ 30 | gpgcheck=1 31 | enabled=0 32 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 33 | 34 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/CentOS-SCLo-scl.repo: -------------------------------------------------------------------------------- 1 | # CentOS-SCLo-sclo.repo 2 | # 3 | # Please see http://wiki.centos.org/SpecialInterestGroup/SCLo for more 4 | # information 5 | 6 | [centos-sclo-sclo] 7 | name=CentOS-6 - SCLo sclo 8 | baseurl=http://vault.centos.org/centos/6/sclo/$basearch/sclo/ 9 | gpgcheck=1 10 | enabled=1 11 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 12 | 13 | [centos-sclo-sclo-testing] 14 | name=CentOS-6 - SCLo sclo Testing 15 | baseurl=http://buildlogs.centos.org/centos/6/sclo/$basearch/sclo/ 16 | gpgcheck=0 17 | enabled=0 18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 19 | 20 | [centos-sclo-sclo-source] 21 
| name=CentOS-6 - SCLo sclo Sources 22 | baseurl=http://vault.centos.org/centos/6/sclo/Source/sclo/ 23 | gpgcheck=1 24 | enabled=0 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 26 | 27 | [centos-sclo-sclo-debuginfo] 28 | name=CentOS-6 - SCLo sclo Debuginfo 29 | baseurl=http://debuginfo.centos.org/centos/6/sclo/$basearch/ 30 | gpgcheck=1 31 | enabled=0 32 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 33 | 34 | -------------------------------------------------------------------------------- /contrib/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include "dataTypes.h" 5 | #include "utils.h" 6 | #include "utils_fp.h" 7 | #include "spreadinterp.h" 8 | 9 | // constants needed within common 10 | #define MAX_NQUAD 100 // max number of positive quadr nodes 11 | // increase this if you need >1TB RAM... 12 | #define MAX_NF (BIGINT)INT_MAX // In cufinufft we limit array sizes to 2^31 13 | // which is about 2 billion, since we set 14 | // BIGINT to int. (Differs from FINUFFT) 15 | 16 | struct cufinufft_opts; 17 | 18 | // common.cpp provides... 19 | int setup_spreader_for_nufft(SPREAD_OPTS &spopts, FLT eps, cufinufft_opts opts); 20 | void SET_NF_TYPE12(BIGINT ms, cufinufft_opts opts, SPREAD_OPTS spopts,BIGINT *nf, 21 | BIGINT b); 22 | void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, SPREAD_OPTS opts); 23 | void onedim_fseries_kernel_precomp(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts); 24 | void onedim_fseries_kernel_compute(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts); 25 | #endif // COMMON_H 26 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/vault.repo: -------------------------------------------------------------------------------- 1 | [base] 2 | name=CentOS-$releasever - Base 3 | # mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=os&infra=$infra 4 | # baseurl=http://mirror.centos.org/centos/$releasever/os/$basearch/ 5 | baseurl=https://vault.centos.org/6.10/os/$basearch/ 6 | gpgcheck=1 7 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6 8 | 9 | # released updates 10 | [updates] 11 | name=CentOS-$releasever - Updates 12 | # mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=updates&infra=$infra 13 | # baseurl=http://mirror.centos.org/centos/$releasever/updates/$basearch/ 14 | baseurl=https://vault.centos.org/6.10/updates/$basearch/ 15 | gpgcheck=1 16 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6 17 | 18 | # additional packages that may be useful 19 | [extras] 20 | name=CentOS-$releasever - Extras 21 | # mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=extras&infra=$infra 22 | # baseurl=http://mirror.centos.org/centos/$releasever/extras/$basearch/ 23 | baseurl=https://vault.centos.org/6.10/extras/$basearch/ 24 | gpgcheck=1 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6 26 | -------------------------------------------------------------------------------- /src/cudeconvolve.h: -------------------------------------------------------------------------------- 1 | #ifndef __CUDECONVOLVE_H__ 2 | #define __CUDECONVOLVE_H__ 3 | 4 | #include 5 | 6 | __global__ 7 | void Deconvolve_1d(int ms, int nf1, int fw_width, CUCPX* fw, CUCPX *fk, 8 | FLT *fwkerhalf1); 9 | __global__ 10 | void Amplify_1d(int ms, int nf1, int fw_width, CUCPX* fw, CUCPX *fk, 11 | FLT *fwkerhalf2); 12 | 13 | __global__ 
14 | void Deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, CUCPX* fw, 15 | CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2); 16 | __global__ 17 | void Amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, CUCPX* fw, 18 | CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2); 19 | 20 | __global__ 21 | void Deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, 22 | int fw_width, CUCPX* fw, CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2, 23 | FLT *fwkerhalf3); 24 | __global__ 25 | void Amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, 26 | CUCPX* fw, CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3); 27 | 28 | int CUDECONVOLVE1D(CUFINUFFT_PLAN d_mem, int blksize); 29 | int CUDECONVOLVE2D(CUFINUFFT_PLAN d_mem, int blksize); 30 | int CUDECONVOLVE3D(CUFINUFFT_PLAN d_mem, int blksize); 31 | #endif 32 | -------------------------------------------------------------------------------- /sites/make.inc.FI: -------------------------------------------------------------------------------- 1 | # FI: Flatiron Institute, rusty cluster, running on Cuda 11.4.2, up to A100 2 | # devices. The A100 seems to need SM80 arch code. 3 | # Barnett 12/2/21 4 | 5 | # Here's some cmds to run experiments on rusty: 6 | 7 | # log into rusty, some node, then... 8 | #module load slurm 9 | #srun -p gpu -N1 --gpus=1 -c 1 --constraint=a100 --exclusive --pty bash -i 10 | # to check the GPU... seems device has cuda 11.2 not 11.4 11 | #nvidia-smi 12 | #module load cuda/11.4.2 13 | #module load gcc/7.5.0 14 | # (cuda seems not to be able to use later gcc!) 15 | #make all -j 16 | # compile takes <1min with -j. 17 | #bin/cufinufft1d1_test 2 1e6 1e7 18 | #make check 19 | 20 | # see http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ 21 | NVARCH = -arch=sm_80 \ 22 | -gencode=arch=compute_50,code=sm_50 \ 23 | -gencode=arch=compute_52,code=sm_52 \ 24 | -gencode=arch=compute_60,code=sm_60 \ 25 | -gencode=arch=compute_61,code=sm_61 \ 26 | -gencode=arch=compute_70,code=sm_70 \ 27 | -gencode=arch=compute_75,code=sm_75 \ 28 | -gencode=arch=compute_80,code=sm_80 \ 29 | -gencode=arch=compute_86,code=sm_86 \ 30 | -gencode=arch=compute_86,code=compute_86 31 | -------------------------------------------------------------------------------- /contrib/dataTypes.h: -------------------------------------------------------------------------------- 1 | // ------------ FINUFFT data type definitions ---------------------------------- 2 | 3 | #if (!defined(DATATYPES_H) && !defined(SINGLE)) || (!defined(DATATYPESF_H) && defined(SINGLE)) 4 | // Make sure we only include once per precision (as in finufft_eitherprec.h). 5 | #ifndef SINGLE 6 | #define DATATYPES_H 7 | #else 8 | #define DATATYPESF_H 9 | #endif 10 | 11 | // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 12 | #include 13 | 14 | // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. 15 | // This includes all calling arguments (eg M,N) that could be huge someday... 16 | // Note: BIGINT is modified to have ``int'' data type for cufinufft. 17 | typedef int BIGINT; 18 | 19 | // decide which kind of complex numbers to use in interface... 20 | #ifdef __cplusplus 21 | #include // C++ type 22 | #define COMPLEXIFY(X) std::complex 23 | #else 24 | #include // C99 type 25 | #define COMPLEXIFY(X) X complex 26 | #endif 27 | 28 | #undef FLT 29 | #undef CPX 30 | 31 | // Precision-independent real and complex types for interfacing... 
32 | // (note these cannot be typedefs since we want dual-precision library) 33 | #ifdef SINGLE 34 | #define FLT float 35 | #else 36 | #define FLT double 37 | #endif 38 | 39 | #define CPX COMPLEXIFY(FLT) 40 | 41 | #endif // DATATYPES_H or DATATYPESF_H 42 | -------------------------------------------------------------------------------- /include/cufinufft_errors.h: -------------------------------------------------------------------------------- 1 | #ifndef __CUFINUFFT_ERRORS_H__ 2 | #define __CUFINUFFT_ERRORS_H__ 3 | 4 | // For error checking 5 | static const char* _cufftGetErrorEnum(cufftResult_t error) 6 | { 7 | switch(error) 8 | { 9 | case CUFFT_SUCCESS: 10 | return "cufft_success"; 11 | case CUFFT_INVALID_PLAN: 12 | return "cufft_invalid_plan"; 13 | case CUFFT_ALLOC_FAILED: 14 | return "cufft_alloc_failed"; 15 | case CUFFT_INVALID_TYPE: 16 | return "cufft_invalid_type"; 17 | case CUFFT_INVALID_VALUE: 18 | return "cufft_invalid_value"; 19 | case CUFFT_INTERNAL_ERROR: 20 | return "cufft_internal_error"; 21 | case CUFFT_EXEC_FAILED: 22 | return "cufft_exec_failed"; 23 | case CUFFT_SETUP_FAILED: 24 | return "cufft_setup_failed"; 25 | case CUFFT_INVALID_SIZE: 26 | return "cufft_invalid_size"; 27 | case CUFFT_UNALIGNED_DATA: 28 | return "cufft_unaligned data"; 29 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 30 | return "cufft_incomplete_parameter_list"; 31 | case CUFFT_INVALID_DEVICE: 32 | return "cufft_invalid_device"; 33 | case CUFFT_PARSE_ERROR: 34 | return "cufft_parse_error"; 35 | case CUFFT_NO_WORKSPACE: 36 | return "cufft_no_workspace"; 37 | case CUFFT_NOT_IMPLEMENTED: 38 | return "cufft_not_implemented"; 39 | case CUFFT_LICENSE_ERROR: 40 | return "cufft_license_error"; 41 | case CUFFT_NOT_SUPPORTED: 42 | return "cufft_not_supported"; 43 | } 44 | return ""; 45 | } 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /python/cufinufft/README.md: -------------------------------------------------------------------------------- 1 | # cuFINUFFT v1.3 Python package 2 | 3 | The cuFINUFFT library is an efficient GPU implementation of the 1-, 2- and 4 | 3-dimensional nonuniform fast Fourier transform (NUFFT). It includes both type 5 | 1 (nonuniform to uniform) and type 2 (uniform to nonuniform) transforms. 6 | It is based on the [FINUFFT](https://github.com/flatironinstitute/finufft) 7 | implementation for the CPU. This package provides a Python interface to the 8 | cuFINUFFT library, which is written in C++ and CUDA. 9 | 10 | For a mathematical description of the NUFFT and applications to signal 11 | processing, imaging, and scientific computing, see [the FINUFFT 12 | documentation](https://finufft.readthedocs.io). Usage examples can be found 13 | [here](https://github.com/flatironinstitute/cufinufft/tree/v1.3/examples). 14 | 15 | If you use this package, please cite our paper: 16 | 17 | Y. Shih, G. Wright, J. Andén, J. Blaschke, A. H. Barnett (2021). 18 | cuFINUFFT: a load-balanced GPU library for general-purpose nonuniform FFTs. 19 | arXiv preprint arXiv:2102.08463. 20 | [(paper)](https://arxiv.org/abs/2102.08463) 21 | [(bibtex)](https://arxiv.org/bibtex/2102.08463) 22 | 23 | **Note**: We are currently in the process of adapting the cuFINUFFT interface to 24 | closer match that of FINUFFT. This will likely break code depending on the 25 | current interface once the next release is published. At this point we will 26 | publish a migration guide that will detail the exact changes to the 27 | interfaces. 
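For a concrete sense of what these transforms compute, here is a naive direct-summation sketch in one dimension (illustration only, not part of the package API; it mirrors the reference checks used in `python/cufinufft/tests/utils.py` and the `examples/` scripts, including their default sign conventions):

```python
import numpy as np

def direct_type1(c, x, N):
    """Type 1 (nonuniform to uniform): f_k = sum_j c_j exp(+i k x_j)."""
    k = np.arange(-(N // 2), (N + 1) // 2)   # uniform output modes
    return np.array([np.sum(c * np.exp(1j * kk * x)) for kk in k])

def direct_type2(f, x, N):
    """Type 2 (uniform to nonuniform): c_j = sum_k f_k exp(-i k x_j)."""
    k = np.arange(-(N // 2), (N + 1) // 2)   # uniform input modes
    return np.array([np.sum(f * np.exp(-1j * k * xj)) for xj in x])
```

cuFINUFFT evaluates such sums to a requested tolerance at a cost that scales roughly like the FFT size plus the number of nonuniform points, rather than the O(NM) cost of the loops above.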
28 | -------------------------------------------------------------------------------- /ci/distribution_helper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | # Helper Script For Building Wheels 4 | 5 | cufinufft_version=1.3 6 | manylinux_version=manylinux2014 7 | cuda_version=11.0 8 | dockerhub=garrettwrong 9 | 10 | 11 | echo "# build the wheel" 12 | docker build -f ci/docker/cuda${cuda_version}/Dockerfile-x86_64 -t ${dockerhub}/cufinufft-${cufinufft_version}-${manylinux_version} . 13 | 14 | 15 | echo "# Run the container, invoking the build-wheels script to generate the wheels" 16 | docker run --gpus all -it -v `pwd`/wheelhouse:/io/wheelhouse -e PLAT=${manylinux_version}_x86_64 ${dockerhub}/cufinufft-${cufinufft_version}-${manylinux_version} /io/ci/build-wheels.sh 17 | 18 | echo "# Copy the wheels we care about to the dist folder" 19 | mkdir -p dist 20 | cp -v wheelhouse/cufinufft-${cufinufft_version}-cp3*${manylinux_version}* dist 21 | 22 | 23 | echo "The following steps should be performed manually for now.\n" 24 | 25 | 26 | echo "# Push to Test PyPI for review/testing" 27 | echo "#twine upload -r testpypi dist/*" 28 | echo 29 | 30 | 31 | echo "# Tag release." 32 | ## Can do in a repo and push or on manually on GH gui. 33 | echo 34 | 35 | 36 | echo "# Review wheels from test index" 37 | echo "#pip install -i https://test.pypi.org/simple/ --no-deps cufinufft" 38 | echo 39 | 40 | 41 | echo "# Push to live index" 42 | echo "## twine upload dist/*" 43 | echo 44 | 45 | 46 | echo "# optionally push it (might take a long time)." 47 | echo "#docker push ${dockerhub}/cufinufft-${cufinufft_version}-${manylinux_version}" 48 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent none 3 | options { 4 | disableConcurrentBuilds() 5 | buildDiscarder(logRotator(numToKeepStr: '8', daysToKeepStr: '20')) 6 | timeout(time: 1, unit: 'HOURS') 7 | } 8 | stages { 9 | stage('main') { 10 | agent { 11 | dockerfile { 12 | filename 'ci/docker/cuda10.1/Dockerfile-x86_64' 13 | args '--gpus 1' 14 | } 15 | } 16 | environment { 17 | HOME = "$WORKSPACE/build" 18 | PYBIN = "/opt/python/cp38-cp38/bin" 19 | } 20 | steps { 21 | sh '${PYBIN}/python3 -m venv $HOME' 22 | sh '''#!/bin/bash -ex 23 | source $HOME/bin/activate 24 | LIBRARY_PATH=/io/lib python3 -m pip install -e . 25 | python3 -m pip install --upgrade "numpy<1.22" 26 | python3 -m pip install pytest 27 | python3 -m pytest 28 | ''' 29 | sh 'make check' 30 | } 31 | } 32 | } 33 | post { 34 | failure { 35 | emailext subject: '$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS', 36 | body: '''$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS 37 | 38 | Check console output at $BUILD_URL to view full results. 
39 | 40 | Building $BRANCH_NAME for $CAUSE 41 | $JOB_DESCRIPTION 42 | 43 | Changes: 44 | $CHANGES 45 | 46 | End of build log: 47 | ${BUILD_LOG,maxLines=200} 48 | ''', 49 | recipientProviders: [ 50 | [$class: 'DevelopersRecipientProvider'], 51 | ], 52 | replyTo: '$DEFAULT_REPLYTO', 53 | to: 'janden@flatironinstitute.org' 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /contrib/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | BIGINT next235beven(BIGINT n, BIGINT b) 4 | // finds even integer not less than n, with prime factors no larger than 5 5 | // (ie, "smooth") and is a multiple of b (b is a number whose only prime 6 | // factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17 7 | // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. 8 | // added condition about b Melody 05/31/20 9 | { 10 | if (n<=2) return 2; 11 | if (n%2 == 1) n+=1; // even 12 | BIGINT nplus = n-2; // to cancel out the +=2 at start of loop 13 | BIGINT numdiv = 2; // a dummy that is >1 14 | while ((numdiv>1) || (nplus%b != 0)) { 15 | nplus += 2; // stays even 16 | numdiv = nplus; 17 | while (numdiv%2 == 0) numdiv /= 2; // remove all factors of 2,3,5... 18 | while (numdiv%3 == 0) numdiv /= 3; 19 | while (numdiv%5 == 0) numdiv /= 5; 20 | } 21 | return nplus; 22 | } 23 | 24 | // ----------------------- helpers for timing (always stay double prec)... 25 | using namespace std; 26 | 27 | void CNTime::start() 28 | { 29 | gettimeofday(&initial, 0); 30 | } 31 | 32 | double CNTime::restart() 33 | // Barnett changed to returning in sec 34 | { 35 | double delta = this->elapsedsec(); 36 | this->start(); 37 | return delta; 38 | } 39 | 40 | double CNTime::elapsedsec() 41 | // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18 42 | { 43 | struct timeval now; 44 | gettimeofday(&now, 0); 45 | double nowsec = (double)now.tv_sec + 1e-6*now.tv_usec; 46 | double initialsec = (double)initial.tv_sec + 1e-6*initial.tv_usec; 47 | return nowsec - initialsec; 48 | } 49 | -------------------------------------------------------------------------------- /contrib/spreadinterp.h: -------------------------------------------------------------------------------- 1 | #if (!defined(SPREADINTERP_H) && !defined(SINGLE)) || \ 2 | (!defined(SPREADINTERPF_H) && defined(SINGLE)) 3 | 4 | #include 5 | #include 6 | #include 7 | #include "utils.h" 8 | #include "utils_fp.h" 9 | 10 | #define MAX_NSPREAD 16 // upper bound on w, ie nspread, even when padded 11 | // (see evaluate_kernel_vector); also for common 12 | 13 | #undef SPREAD_OPTS 14 | 15 | #ifdef SINGLE 16 | #define SPREAD_OPTS spread_optsf 17 | #define SPREADINTERPF_H 18 | #else 19 | #define SPREAD_OPTS spread_opts 20 | #define SPREADINTERP_H 21 | #endif 22 | 23 | struct SPREAD_OPTS { // see cnufftspread:setup_spreader for defaults. 24 | int nspread; // w, the kernel width in grid pts 25 | int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU 26 | int pirange; // 0: coords in [0,N), 1 coords in [-pi,pi) 27 | FLT upsampfac; // sigma, upsampling factor, default 2.0 28 | // ES kernel specific... 29 | FLT ES_beta; 30 | FLT ES_halfwidth; 31 | FLT ES_c; 32 | }; 33 | 34 | // NU coord handling macro: if p is true, rescales from [-pi,pi] to [0,N], then 35 | // folds *only* one period below and above, ie [-N,2N], into the domain [0,N]... 36 | #define RESCALE(x,N,p) (p ? \ 37 | ((x*M_1_2PI + (x<-PI ?
1.5 : (x>=PI ? -0.5 : 0.5)))*N) : \ 38 | (x<0 ? x+N : (x>=N ? x-N : x))) 39 | // yuk! But this is *so* much faster than slow std::fmod that we stick to it. 40 | FLT evaluate_kernel(FLT x, const SPREAD_OPTS &opts); 41 | int setup_spreader(SPREAD_OPTS &opts, FLT eps, FLT upsampfac, int kerevalmeth); 42 | 43 | #endif // SPREADINTERP_H 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2018-2021 The Simons Foundation, Inc. - All Rights Reserved. 2 | 3 | Lead developer: Yu-Hsuan Melody Shih (New York University). 4 | 5 | Other developers: (see github site for full list) 6 | 7 | Garrett Wright (Princeton) 8 | Joakim Anden (KTH) 9 | Johannes Blaschke (LBNL) 10 | Alex Barnett (CCM, Flatiron Institute) 11 | 12 | This project came out of Melody's 2018 and 2019 summer internships at 13 | the Flatiron Institute, advised by Alex Barnett. 14 | 15 | ------ 16 | 17 | cuFINUFFT is licensed under the Apache License, Version 2.0 (the 18 | "License"); you may not use this file except in compliance with the 19 | License. You may obtain a copy of the License at 20 | 21 | http://www.apache.org/licenses/LICENSE-2.0 22 | 23 | Unless required by applicable law or agreed to in writing, software 24 | distributed under the License is distributed on an "AS IS" BASIS, 25 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | See the License for the specific language governing permissions and 27 | limitations under the License. 28 | 29 | ------ 30 | 31 | Certain parts of this repository are contributed by others. 32 | For their license info, see: 33 | 34 | contrib/legendre_rule_fast.license 35 | fortran/cmcl_license.txt 36 | 37 | ------ 38 | 39 | If you find this library useful, or it helps you in creating software 40 | or publications, please let us know, and acknowledge that fact by citing our 41 | repository: 42 | 43 | https://github.com/flatironinstitute/cufinufft 44 | 45 | and the publication: 46 | 47 | cuFINUFFT: a load-balanced GPU library for general-purpose nonuniform FFTs, Yu-hsuan Shih, Garrett Wright, Joakim Andén, Johannes Blaschke, Alex H. Barnett. PDSEC2021 conference (best paper prize). 
https://arxiv.org/abs/2102.08463 48 | -------------------------------------------------------------------------------- /python/cufinufft/tests/test_multi.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | import pycuda.driver as drv 6 | import pycuda.gpuarray as gpuarray 7 | 8 | from cufinufft import cufinufft 9 | 10 | import utils 11 | 12 | 13 | def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): 14 | complex_dtype = utils._complex_dtype(dtype) 15 | 16 | drv.init() 17 | 18 | dev_count = drv.Device.count() 19 | 20 | if dev_count == 1: 21 | pytest.skip() 22 | 23 | devs = [drv.Device(dev_id) for dev_id in range(dev_count)] 24 | 25 | dim = len(shape) 26 | 27 | errs = [] 28 | 29 | for dev_id, dev in enumerate(devs): 30 | ctx = dev.make_context() 31 | 32 | k = utils.gen_nu_pts(M, dim=dim).astype(dtype) 33 | c = utils.gen_nonuniform_data(M).astype(complex_dtype) 34 | 35 | k_gpu = gpuarray.to_gpu(k) 36 | c_gpu = gpuarray.to_gpu(c) 37 | fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) 38 | 39 | plan = cufinufft(1, shape, eps=tol, dtype=dtype, 40 | gpu_device_id=dev_id) 41 | 42 | plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2]) 43 | 44 | plan.execute(c_gpu, fk_gpu) 45 | 46 | fk = fk_gpu.get() 47 | 48 | ind = int(0.1789 * np.prod(shape)) 49 | 50 | fk_est = fk.ravel()[ind] 51 | fk_target = utils.direct_type1(c, k, shape, ind) 52 | 53 | type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) 54 | 55 | print(f'Type 1 relative error (GPU {dev_id}):', type1_rel_err) 56 | 57 | ctx.pop() 58 | 59 | errs.append(type1_rel_err) 60 | 61 | assert all(err < 0.01 for err in errs) 62 | 63 | 64 | def main(): 65 | test_multi_type1() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /src/common.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include "precision_independent.h" 8 | #include "common.h" 9 | 10 | using namespace std; 11 | 12 | /* Kernel for computing approximations of exact Fourier series coeffs of 13 | cnufftspread's real symmetric kernel. 
*/ 14 | // a , f are intermediate results from function onedim_fseries_kernel_precomp() 15 | // (see cufinufft/contrib/common.cpp for description) 16 | __global__ 17 | void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, 18 | FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns) 19 | { 20 | FLT J2 = ns/2.0; 21 | int q=(int)(2 + 3.0*J2); 22 | int nf; 23 | cuDoubleComplex *at = a + threadIdx.y*MAX_NQUAD; 24 | FLT *ft = f + threadIdx.y*MAX_NQUAD; 25 | FLT *oarr; 26 | if (threadIdx.y == 0){ 27 | oarr = fwkerhalf1; 28 | nf = nf1; 29 | }else if (threadIdx.y == 1){ 30 | oarr = fwkerhalf2; 31 | nf = nf2; 32 | }else{ 33 | oarr = fwkerhalf3; 34 | nf = nf3; 35 | } 36 | 37 | for(int i=blockDim.x*blockIdx.x+threadIdx.x; i>>(nf1, nf2, nf3, d_f, 63 | d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /python/cufinufft/tests/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def _complex_dtype(dtype): 5 | if dtype == np.float32: 6 | complex_dtype = np.complex64 7 | elif dtype == np.float64: 8 | complex_dtype = np.complex128 9 | else: 10 | raise TypeError("dtype should be np.float32 or np.float64.") 11 | 12 | return complex_dtype 13 | 14 | 15 | def _real_dtype(complex_dtype): 16 | if complex_dtype == np.complex64: 17 | real_dtype = np.float32 18 | elif complex_dtype == np.complex128: 19 | real_dtype = np.float64 20 | else: 21 | raise TypeError("dtype should be np.complex64 or np.complex128.") 22 | 23 | return real_dtype 24 | 25 | 26 | def gen_nu_pts(M, dim=3, seed=0): 27 | np.random.seed(seed) 28 | k = np.random.uniform(-np.pi, np.pi, (dim, M)) 29 | k = k.astype(np.float64) 30 | return k 31 | 32 | 33 | def gen_uniform_data(shape, seed=0): 34 | np.random.seed(seed) 35 | fk = np.random.standard_normal(shape + (2,)) 36 | fk = fk.astype(np.float64).view(np.complex128)[..., 0] 37 | return fk 38 | 39 | 40 | def gen_nonuniform_data(M, seed=0): 41 | np.random.seed(seed) 42 | c = np.random.standard_normal(2 * M) 43 | c = c.astype(np.float64).view(np.complex128) 44 | return c 45 | 46 | 47 | def make_grid(shape): 48 | dim = len(shape) 49 | shape = (1,) * (3 - dim) + shape 50 | 51 | grids = [np.arange(-(N // 2), (N + 1) // 2) for N in shape] 52 | x, y, z = np.meshgrid(*grids, indexing='ij') 53 | return np.stack((x, y, z)) 54 | 55 | 56 | def direct_type1(c, k, shape, ind): 57 | grid = make_grid(shape) 58 | 59 | phase = k.T @ grid.reshape((3, -1))[:, ind] 60 | fk = np.sum(c * np.exp(1j * phase)) 61 | 62 | return fk 63 | 64 | 65 | def direct_type2(fk, k): 66 | grid = make_grid(fk.shape) 67 | 68 | phase = k @ grid.reshape((3, -1)) 69 | c = np.sum(fk.ravel() * np.exp(-1j * phase)) 70 | 71 | return c 72 | -------------------------------------------------------------------------------- /test/spreadperf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # basic perf test of spread/interp for 2/3d, single/double 3 | # Barnett 1/29/21, some 1D added 12/2/21. 4 | 5 | BINDIR=../bin 6 | 7 | n=1000000 8 | M=1000000 9 | dist=0 # 0= random unif, 1 = clustered 10 | Msub=10000 # claimed default is 65536 11 | tols=1e-5 12 | told=1e-12 13 | 14 | echo "spread 1D.............................................." 
15 | $BINDIR/spread1d_test 1 $dist $n $Msub $M $told 16 | $BINDIR/spread1d_test 2 $dist $n $Msub $M $told 17 | $BINDIR/spread1d_test_32 1 $dist $n $Msub $M $tols 18 | $BINDIR/spread1d_test_32 2 $dist $n $Msub $M $tols 19 | 20 | echo "interp 1D.............................................." 21 | $BINDIR/interp1d_test 1 $dist $n $M $told 22 | $BINDIR/interp1d_test_32 1 $dist $n $M $tols 23 | # note there is no meth=2 in 1D interp 24 | 25 | # 2D params... (n is grid size per dim) 26 | n=1000 27 | M=1000000 28 | 29 | echo "spread 2D.............................................." 30 | $BINDIR/spread2d_test 1 $dist $n $n $Msub $M $told 31 | $BINDIR/spread2d_test 2 $dist $n $n $Msub $M $told 32 | $BINDIR/spread2d_test_32 1 $dist $n $n $Msub $M $tols 33 | $BINDIR/spread2d_test_32 2 $dist $n $n $Msub $M $tols 34 | 35 | echo "interp 2D.............................................." 36 | $BINDIR/interp2d_test 1 $dist $n $n $M $told 37 | $BINDIR/interp2d_test 2 $dist $n $n $M $told 38 | $BINDIR/interp2d_test_32 1 $dist $n $n $M $tols 39 | $BINDIR/interp2d_test_32 2 $dist $n $n $M $tols 40 | 41 | 42 | # 3D params... 43 | n=100 44 | M=1000000 45 | 46 | echo "spread 3D.............................................." 47 | $BINDIR/spread3d_test 1 $dist $n $n $n $Msub $M $told 48 | # note absence of meth=2 for 3D double 49 | $BINDIR/spread3d_test_32 1 $dist $n $n $n $Msub $M $tols 50 | $BINDIR/spread3d_test_32 2 $dist $n $n $n $Msub $M $tols 51 | 52 | echo "interp 3D.............................................." 53 | $BINDIR/interp3d_test 1 $dist $n $n $n $M $told 54 | # note absence of meth=2 for 3D double 55 | $BINDIR/interp3d_test_32 1 $dist $n $n $n $M $tols 56 | $BINDIR/interp3d_test_32 2 $dist $n $n $n $M $tols 57 | -------------------------------------------------------------------------------- /examples/example2d2many.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demonstrate the type 2 NUFFT using cuFINUFFT 3 | """ 4 | 5 | import numpy as np 6 | 7 | import pycuda.autoinit 8 | from pycuda.gpuarray import GPUArray, to_gpu 9 | 10 | from cufinufft import cufinufft 11 | 12 | # Set up parameters for problem. 13 | N1, N2 = 37, 41 # Size of uniform grid 14 | M = 17 # Number of nonuniform points 15 | n_transf = 2 # Number of input arrays 16 | eps = 1e-6 # Requested tolerance 17 | dtype = np.float32 # Datatype (real) 18 | complex_dtype = np.complex64 # Datatype (complex) 19 | 20 | # Generate coordinates of non-uniform points. 21 | kx = np.random.uniform(-np.pi, np.pi, size=M) 22 | ky = np.random.uniform(-np.pi, np.pi, size=M) 23 | 24 | # Generate grid values. 25 | fk = (np.random.standard_normal((n_transf, N1, N2)) 26 | + 1j * np.random.standard_normal((n_transf, N1, N2))) 27 | 28 | # Cast to desired datatype. 29 | kx = kx.astype(dtype) 30 | ky = ky.astype(dtype) 31 | fk = fk.astype(complex_dtype) 32 | 33 | # Allocate memory for the nonuniform coefficients on the GPU. 34 | c_gpu = GPUArray((n_transf, M), dtype=complex_dtype) 35 | 36 | # Initialize the plan and set the points. 37 | plan = cufinufft(2, (N1, N2), n_transf, eps=eps, dtype=dtype) 38 | plan.set_pts(to_gpu(kx), to_gpu(ky)) 39 | 40 | # Execute the plan, reading from the uniform grid fk and storing the result 41 | # in c_gpu. 42 | plan.execute(c_gpu, to_gpu(fk)) 43 | 44 | # Retrieve the result from the GPU. 45 | c = c_gpu.get() 46 | 47 | # Check accuracy of the transform at index jt.
48 | jt = M // 2 49 | 50 | for i in range(n_transf): 51 | # Calculate the true value of the type 2 transform at the index jt. 52 | x, y = np.mgrid[-(N1 // 2):(N1 + 1) // 2, -(N2 // 2):(N2 + 1) // 2] 53 | c_true = np.sum(fk[i] * np.exp(-1j * (x * kx[jt] + y * ky[jt]))) 54 | 55 | # Calculate the absolute and relative error. 56 | err = np.abs(c[i, jt] - c_true) 57 | rel_err = err / np.max(np.abs(c[i])) 58 | 59 | print(f"[{i}] Absolute error on point [{jt}] is {err:.3g}") 60 | print(f"[{i}] Relative error on point [{jt}] is {rel_err:.3g}") 61 | 62 | assert(rel_err < 10 * eps) 63 | -------------------------------------------------------------------------------- /examples/example2d1many.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demonstrate the type 1 NUFFT using cuFINUFFT 3 | """ 4 | 5 | import numpy as np 6 | 7 | import pycuda.autoinit 8 | from pycuda.gpuarray import GPUArray, to_gpu 9 | 10 | from cufinufft import cufinufft 11 | 12 | # Set up parameters for problem. 13 | N1, N2 = 59, 61 # Size of uniform grid 14 | M = 100 # Number of nonuniform points 15 | n_transf = 2 # Number of input arrays 16 | eps = 1e-6 # Requested tolerance 17 | dtype = np.float32 # Datatype (real) 18 | complex_dtype = np.complex64 # Datatype (complex) 19 | 20 | # Generate coordinates of non-uniform points. 21 | kx = np.random.uniform(-np.pi, np.pi, size=M) 22 | ky = np.random.uniform(-np.pi, np.pi, size=M) 23 | 24 | # Generate source strengths. 25 | c = (np.random.standard_normal((n_transf, M)) 26 | + 1j * np.random.standard_normal((n_transf, M))) 27 | 28 | # Cast to desired datatype. 29 | kx = kx.astype(dtype) 30 | ky = ky.astype(dtype) 31 | c = c.astype(complex_dtype) 32 | 33 | # Allocate memory for the uniform grid on the GPU. 34 | fk_gpu = GPUArray((n_transf, N1, N2), dtype=complex_dtype) 35 | 36 | # Initialize the plan and set the points. 37 | plan = cufinufft(1, (N1, N2), n_transf, eps=eps, dtype=dtype) 38 | plan.set_pts(to_gpu(kx), to_gpu(ky)) 39 | 40 | # Execute the plan, reading from the strengths array c and storing the 41 | # result in fk_gpu. 42 | plan.execute(to_gpu(c), fk_gpu) 43 | 44 | # Retreive the result from the GPU. 45 | fk = fk_gpu.get() 46 | 47 | # Check accuracy of the transform at position (nt1, nt2). 48 | nt1 = int(0.37 * N1) 49 | nt2 = int(0.26 * N2) 50 | 51 | for i in range(n_transf): 52 | # Calculate the true value of the type 1 transform at the uniform grid 53 | # point (nt1, nt2), which corresponds to the coordinate nt1 - N1 // 2 and 54 | # nt2 - N2 // 2. 55 | x, y = nt1 - N1 // 2, nt2 - N2 // 2 56 | fk_true = np.sum(c[i] * np.exp(1j * (x * kx + y * ky))) 57 | 58 | # Calculate the absolute and relative error. 59 | err = np.abs(fk[i, nt1, nt2] - fk_true) 60 | rel_err = err / np.max(np.abs(fk[i])) 61 | 62 | print(f"[{i}] Absolute error on mode [{nt1}, {nt2}] is {err:.3g}") 63 | print(f"[{i}] Relative error on mode [{nt1}, {nt2}] is {rel_err:.3g}") 64 | 65 | assert(rel_err < 10 * eps) 66 | -------------------------------------------------------------------------------- /python/cufinufft/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'cufinufft' 21 | copyright = ('2020 The Simons Foundation, ' 22 | 'Melody Shih, Joakim Anden, Garrett Wright.') 23 | author = 'Melody Shih, Joakim Anden, Garrett Wright' 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = '1.3' 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx_rtd_theme', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = [] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = 'sphinx_rtd_theme' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 58 | html_static_path = [] 59 | 60 | # Autodoc config 61 | autoclass_content = 'both' 62 | -------------------------------------------------------------------------------- /contrib/utils.h: -------------------------------------------------------------------------------- 1 | // This contains some library-wide definitions & precision/OMP switches, 2 | // as well as the interfaces to utilities in utils.cpp. Barnett 6/18/18. 3 | 4 | #ifndef UTILS_H 5 | #define UTILS_H 6 | 7 | // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 8 | #include 9 | 10 | #include // C++ type complex 11 | #include 12 | #include "dataTypes.h" 13 | 14 | // fraction growth cut-off in arraywidcen(), to decide if translate in type-3 15 | #define ARRAYWIDCEN_GROWFRAC 0.1 16 | 17 | // math consts not in math.h ... 18 | #define M_1_2PI 0.159154943091895336 19 | #define M_2PI 6.28318530717958648 20 | // to avoid mixed precision operators in eg i*pi... 21 | #define PI (FLT)M_PI 22 | 23 | using namespace std; // means std:: not needed for cout, max, etc 24 | 25 | typedef complex dcomplex; // slightly sneaky since duplicated by mwrap 26 | 27 | // Global error codes for the library... 
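// (code 1, WARN_EPS_TOO_SMALL, is only a warning: setup_spreader() in
//  contrib/spreadinterp.cpp clips the requested tolerance / kernel width and
//  continues; codes 2-9 are hard errors)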
28 | #define WARN_EPS_TOO_SMALL 1 29 | #define ERR_MAXNALLOC 2 30 | #define ERR_SPREAD_BOX_SMALL 3 31 | #define ERR_SPREAD_PTS_OUT_RANGE 4 32 | #define ERR_SPREAD_ALLOC 5 33 | #define ERR_SPREAD_DIR 6 34 | #define ERR_UPSAMPFAC_TOO_SMALL 7 35 | #define HORNER_WRONG_BETA 8 36 | #define ERR_NDATA_NOTVALID 9 37 | 38 | 39 | //#define MAX(a,b) (a>b) ? a : b // but we use std::max instead 40 | #define MIN(a,b) (a 47 | class CNTime { 48 | public: 49 | void start(); 50 | double restart(); 51 | double elapsedsec(); 52 | private: 53 | struct timeval initial; 54 | }; 55 | 56 | // allow compile-time switch off of openmp, so compilation without any openmp 57 | // is done (Note: _OPENMP is automatically set by -fopenmp compile flag) 58 | #ifdef _OPENMP 59 | #include 60 | // point to actual omp utils 61 | #define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() 62 | #define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() 63 | #define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() 64 | #define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x) 65 | #define MY_OMP_SET_NESTED(x) omp_set_nested(x) 66 | #else 67 | // non-omp safe dummy versions of omp utils 68 | #define MY_OMP_GET_NUM_THREADS() 1 69 | #define MY_OMP_GET_MAX_THREADS() 1 70 | #define MY_OMP_GET_THREAD_NUM() 0 71 | #define MY_OMP_SET_NUM_THREADS(x) 72 | #define MY_OMP_SET_NESTED(x) 73 | #endif 74 | 75 | #endif // UTILS_H 76 | -------------------------------------------------------------------------------- /contrib/utils_fp.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "utils_fp.h" 3 | 4 | 5 | // ------------ complex array utils --------------------------------- 6 | 7 | FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b) 8 | // ||a-b||_2 / ||a||_2 9 | { 10 | FLT err = 0.0, nrm = 0.0; 11 | for (BIGINT m=0; mnrm) nrm = aa; 46 | } 47 | return sqrt(nrm); 48 | } 49 | 50 | void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi) 51 | // With a a length-n array, writes out min(a) to lo and max(a) to hi, 52 | // so that all a values lie in [lo,hi]. 53 | // If n==0, lo and hi are not finite. 54 | { 55 | *lo = INFINITY; *hi = -INFINITY; 56 | for (BIGINT m=0; m*hi) *hi = a[m]; 59 | } 60 | } 61 | 62 | void indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi) 63 | // With i a list of n indices, and a an array of length max(i), writes out 64 | // min(a(i)) to lo and max(a(i)) to hi, so that all a(i) values lie in [lo,hi]. 65 | // This is not currently used in FINUFFT v1.2. 66 | { 67 | *lo = INFINITY; *hi = -INFINITY; 68 | for (BIGINT m=0; m*hi) *hi = A; 72 | } 73 | } 74 | 75 | void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c) 76 | // Writes out w = half-width and c = center of an interval enclosing all a[n]'s 77 | // Only chooses a nonzero center if this increases w by less than fraction 78 | // ARRAYWIDCEN_GROWFRAC defined in defs.h. 79 | // This prevents rephasings which don't grow nf by much. 6/8/17 80 | // If n==0, w and c are not finite. 
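// Example: n=2, a = {2.0, 3.0} gives lo=2, hi=3, hence w=0.5 and c=2.5.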
81 | { 82 | FLT lo,hi; 83 | arrayrange(n,a,&lo,&hi); 84 | *w = (hi-lo)/2; 85 | *c = (hi+lo)/2; 86 | if (std::abs(*c) /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ 8 | echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - 9 | 10 | COPY ci/docker/cuda10.1-manylinux2014/cuda.repo /etc/yum.repos.d/cuda.repo 11 | 12 | ENV CUDA_VERSION 10.1.243 13 | 14 | ENV CUDA_PKG_VERSION 10-1-$CUDA_VERSION-1 15 | # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a 16 | RUN yum install -y \ 17 | cuda-cudart-$CUDA_PKG_VERSION \ 18 | cuda-compat-10-1 \ 19 | && \ 20 | ln -s cuda-10.1 /usr/local/cuda && \ 21 | rm -rf /var/cache/yum/* 22 | 23 | # nvidia-docker 1.0 24 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 25 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 26 | 27 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 28 | ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} 29 | 30 | # nvidia-container-runtime 31 | ENV NVIDIA_VISIBLE_DEVICES all 32 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 33 | ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" 34 | 35 | #runtime 36 | RUN yum install -y \ 37 | cuda-libraries-$CUDA_PKG_VERSION \ 38 | cuda-nvtx-$CUDA_PKG_VERSION \ 39 | libcublas10-10.2.1.243-1 \ 40 | && \ 41 | rm -rf /var/cache/yum/* 42 | 43 | # devel 44 | RUN yum install -y \ 45 | cuda-nvml-dev-$CUDA_PKG_VERSION \ 46 | cuda-command-line-tools-$CUDA_PKG_VERSION \ 47 | cuda-libraries-dev-$CUDA_PKG_VERSION \ 48 | cuda-minimal-build-$CUDA_PKG_VERSION \ 49 | libcublas-devel-10.2.1.243-1 \ 50 | && \ 51 | rm -rf /var/cache/yum/* 52 | 53 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 54 | 55 | # /CUDA # 56 | 57 | RUN yum install -y devtoolset-8 58 | 59 | RUN scl enable devtoolset-8 -- g++ --version 60 | 61 | # Okay, so now we can begin cufinufft 62 | 63 | # We need to build the CUDA code now. 64 | # assume we are building container in the root of the git repo... 65 | COPY . /io 66 | WORKDIR /io 67 | RUN scl enable devtoolset-8 -- make target=manylinux 68 | # And we need to pack it in our LD path 69 | ENV LD_LIBRARY_PATH /io/lib:${LD_LIBRARY_PATH} 70 | 71 | 72 | CMD ["/bin/bash"] 73 | -------------------------------------------------------------------------------- /contrib/utils_fp.h: -------------------------------------------------------------------------------- 1 | // Header for utils_fp.cpp, a little library of low-level array stuff. 2 | // These are functions which depend on single/double precision. 3 | // (rest of finufft defs and types are now in defs.h) 4 | 5 | #if (!defined(UTILS_FP_H) && !defined(SINGLE)) || (!defined(UTILS_FPF_H) && defined(SINGLE)) 6 | // Make sure we only include once per precision (as in finufft_eitherprec.h). 7 | #ifndef SINGLE 8 | #define UTILS_FP_H 9 | #else 10 | #define UTILS_FPF_H 11 | #endif 12 | 13 | 14 | // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 
15 | #include 16 | 17 | #include // C++ type complex 18 | #include 19 | #include "dataTypes.h" 20 | 21 | 22 | #undef EPSILON 23 | #undef IMA 24 | #undef FABS 25 | #undef CUCPX 26 | #undef CUFFT_TYPE 27 | #undef CUFFT_EX 28 | #undef SET_NF_TYPE12 29 | 30 | // Compile-flag choice of single or double (default) precision: 31 | // (Note in the other codes, FLT is "double" or "float", CPX same but complex) 32 | #ifdef SINGLE 33 | // machine epsilon for rounding 34 | #define EPSILON (float)6e-08 35 | #define IMA complex(0.0,1.0) 36 | #define FABS(x) fabs(x) 37 | #define CUCPX cuFloatComplex 38 | #define CUFFT_TYPE CUFFT_C2C 39 | #define CUFFT_EX cufftExecC2C 40 | #define SET_NF_TYPE12 set_nf_type12f 41 | #else 42 | // machine epsilon for rounding 43 | #define EPSILON (double)1.1e-16 44 | #define IMA complex(0.0,1.0) 45 | #define FABS(x) fabsf(x) 46 | #define CUCPX cuDoubleComplex 47 | #define CUFFT_TYPE CUFFT_Z2Z 48 | #define CUFFT_EX cufftExecZ2Z 49 | #define SET_NF_TYPE12 set_nf_type12 50 | #endif 51 | 52 | 53 | // ahb's low-level array helpers 54 | FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b); 55 | FLT errtwonorm(BIGINT n, CPX* a, CPX* b); 56 | FLT twonorm(BIGINT n, CPX* a); 57 | FLT infnorm(BIGINT n, CPX* a); 58 | void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi); 59 | void indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi); 60 | void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c); 61 | 62 | // Random numbers: crappy unif random number generator in [0,1): 63 | //#define rand01() (((FLT)(rand()%RAND_MAX))/RAND_MAX) 64 | #define rand01() ((FLT)rand()/RAND_MAX) 65 | // unif[-1,1]: 66 | #define randm11() (2*rand01() - (FLT)1.0) 67 | // complex unif[-1,1] for Re and Im: 68 | #define crandm11() (randm11() + IMA*randm11()) 69 | 70 | // Thread-safe seed-carrying versions of above (x is ptr to seed)... 71 | #define rand01r(x) ((FLT)rand_r(x)/RAND_MAX) 72 | // unif[-1,1]: 73 | #define randm11r(x) (2*rand01r(x) - (FLT)1.0) 74 | // complex unif[-1,1] for Re and Im: 75 | #define crandm11r(x) (randm11r(x) + IMA*randm11r(x)) 76 | 77 | 78 | #endif // UTILS_FP_H 79 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/Dockerfile-x86_64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2010_x86_64 2 | LABEL maintainer "Garrett Wright" 3 | 4 | # ---- CentOS 6 has been deprecated. 
5 | # We'll need to patch the repo links to point to the CentOS 6 Vault 6 | COPY ci/docker/cuda10.1/vault.repo /etc/yum.repos.d/CentOS-Base.repo 7 | COPY ci/docker/cuda10.1/CentOS-SCLo-scl-rh.repo /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo 8 | COPY ci/docker/cuda10.1/CentOS-SCLo-scl.repo /etc/yum.repos.d/CentOS-SCLo-scl.repo 9 | 10 | # ---- The following block adds layers for CUDA --- # 11 | # base 12 | RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ 13 | curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64/7fa2af80.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ 14 | echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c - 15 | 16 | COPY ci/docker/cuda10.1/cuda.repo /etc/yum.repos.d/cuda.repo 17 | 18 | ENV CUDA_VERSION 10.1.243 19 | 20 | ENV CUDA_PKG_VERSION 10-1-$CUDA_VERSION-1 21 | # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a 22 | RUN yum install -y \ 23 | cuda-cudart-$CUDA_PKG_VERSION \ 24 | cuda-compat-10-1 \ 25 | && \ 26 | ln -s cuda-10.1 /usr/local/cuda && \ 27 | rm -rf /var/cache/yum/* 28 | 29 | # nvidia-docker 1.0 30 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 31 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 32 | 33 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 34 | ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} 35 | 36 | # nvidia-container-runtime 37 | ENV NVIDIA_VISIBLE_DEVICES all 38 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 39 | ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" 40 | 41 | #runtime 42 | RUN yum install -y \ 43 | cuda-libraries-$CUDA_PKG_VERSION \ 44 | cuda-nvtx-$CUDA_PKG_VERSION \ 45 | libcublas10-10.2.1.243-1 \ 46 | && \ 47 | rm -rf /var/cache/yum/* 48 | 49 | # devel 50 | RUN yum install -y \ 51 | cuda-nvml-dev-$CUDA_PKG_VERSION \ 52 | cuda-command-line-tools-$CUDA_PKG_VERSION \ 53 | cuda-libraries-dev-$CUDA_PKG_VERSION \ 54 | cuda-minimal-build-$CUDA_PKG_VERSION \ 55 | libcublas-devel-10.2.1.243-1 \ 56 | && \ 57 | rm -rf /var/cache/yum/* 58 | 59 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 60 | 61 | # /CUDA # 62 | 63 | 64 | # Okay, so now we can begin cufinufft 65 | 66 | # We need to build the CUDA code now. 67 | # assume we are building container in the root of the git repo... 68 | COPY . /io 69 | WORKDIR /io 70 | RUN make target=manylinux 71 | # And we need to pack it in our LD path 72 | ENV LD_LIBRARY_PATH /io/lib:${LD_LIBRARY_PATH} 73 | 74 | 75 | CMD ["/bin/bash"] 76 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # This defines the Python module installation. 2 | 3 | import os 4 | import ctypes 5 | 6 | from setuptools import setup, Extension 7 | 8 | # Description 9 | DESCRIPTION = "Non-uniform fast Fourier transforms on the GPU" 10 | 11 | with open(os.path.join('python', 'cufinufft', 'README.md'), encoding='utf8') as fh: 12 | LONG_DESCRIPTION = fh.read() 13 | 14 | # Parse the requirements 15 | with open(os.path.join('python', 'cufinufft', 'requirements.txt'), 'r') as fh: 16 | requirements = [item.strip() for item in fh.readlines()] 17 | 18 | # Sanity check that we can find the CUDA cufinufft libraries before we get too far. 
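# (If this load fails even though the C++/CUDA library has been built, the fix is
#  usually to make the built lib/ directory visible to the dynamic loader first,
#  e.g. something like
#      export LD_LIBRARY_PATH=/path/to/cufinufft/lib:$LD_LIBRARY_PATH
#  before running the install.)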
19 | try: 20 | lib = ctypes.cdll.LoadLibrary('libcufinufft.so') 21 | except Exception as e: 22 | print('CUDA shared libraries not found in library path.' 23 | ' Please refer to installation documentation at http://github.com/flatironinstitute/cufinufft' 24 | ' and ensure CUDA installation is successful first before attempting to install the Python wrappers.') 25 | raise(e) 26 | print('cufinufft CUDA shared libraries found, continuing...') 27 | 28 | 29 | # Python Package Setup 30 | setup( 31 | name='cufinufft', 32 | version='1.3', 33 | author='Yu-shuan Melody Shih, Garrett Wright, Joakim Anden, Johannes Blaschke, Alex Barnett', 34 | author_email='janden-vscholar@flatironinstitute.org', 35 | url='https://github.com/flatironinstitute/cufinufft', 36 | description=DESCRIPTION, 37 | long_description=LONG_DESCRIPTION, 38 | long_description_content_type="text/markdown", 39 | license="Apache 2", 40 | packages=['cufinufft'], 41 | package_dir={'': 'python'}, 42 | install_requires=requirements, 43 | # If you'd like to build or alter the docs you may additionally require these. 44 | extras_require={ 45 | 'docs': ['sphinx', 'sphinx_rtd_theme'] 46 | }, 47 | classifiers=['Intended Audience :: Science/Research', 48 | 'License :: OSI Approved :: Apache Software License', 49 | 'Programming Language :: Python :: 3', 50 | 'Programming Language :: C++', 51 | 'Operating System :: POSIX :: Linux', 52 | 'Environment :: GPU', 53 | 'Topic :: Scientific/Engineering :: Mathematics'], 54 | python_requires='>=3.6', 55 | zip_safe=False, 56 | # This explicitly tells the wheel systems that we're platform specific. 57 | # Addiitonally, will create a new cPython library with a decorated name 58 | # that is rpath linked to CUDA library, also decorated (by auditwheel). 59 | # Most importantly, pip will manage to install all this stuff in 60 | # in places Python can find it (with a little help). 61 | py_modules=['cufinufftc'], 62 | ext_modules=[ 63 | Extension(name='cufinufftc', 64 | sources=[], 65 | libraries=['cufinufft'], 66 | library_dirs=['lib']) 67 | ] 68 | ) 69 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | v 1.3 (06/10/23) 2 | 3 | * Move second half of onedim_fseries_kernel() to GPU (with a simple heuristic 4 | basing on nf1 to switch between the CPU and the GPU version). 5 | * Melody fixed bug in MAX_NF being 0 due to typecasting 1e11 to int (thanks 6 | Elliot Slaughter for catching that). 7 | * Melody fixed kernel eval so done w*d not w^d times, speeds up 2d a little, 3d 8 | quite a lot! (PR#130) 9 | * Melody added 1D support for both types 1 (GM-sort and SM methods) 2 (GM-sort), 10 | in C++/CUDA and their test executables (but not Python interface). 11 | * Various fixes to package config. 12 | * Miscellaneous bug fixes. 13 | 14 | v 1.2 (02/17/21) 15 | 16 | * Warning: Following are Python interface changes -- not backwards compatible 17 | with v 1.1 (See examples/example2d1,2many.py for updated usage) 18 | 19 | - Made opts a kwarg dict instead of an object: 20 | def __init__(self, ... , opts=None, dtype=np.float32) 21 | => def __init__(self, ... , dtype=np.float32, **kwargs) 22 | - Renamed arguments in plan creation `__init__`: 23 | ntransforms => n_trans, tol => eps 24 | - Changed order of arguments in plan creation `__init__`: 25 | def __init__(self, ... ,isign, eps, ntransforms, opts, dtype) 26 | => def __init__(self, ... 
,ntransforms, eps, isign, opts, dtype) 27 | - Removed M in `set_pts` arguments: 28 | def set_pts(self, M, kx, ky=None, kz=None) 29 | => def set_pts(self, kx, ky=None, kz=None) 30 | 31 | * Python: added multi-gpu support (in beta) 32 | * Python: added more unit tests (wrong input, kwarg args, multi-gpu) 33 | * Fixed various memory leaks 34 | * Added index bound check in 2D spread kernels (Spread_2d_Subprob(_Horner)) 35 | * Added spread/interp tests to `make check` 36 | * Fixed user request tolerance (eps) to kernel width (w) calculation 37 | * Default kernel evaluation method set to 0, ie exp(sqrt()), since faster 38 | * Removed outdated benchmark codes, cleaner spread/interp tests 39 | 40 | v 1.1 (09/22/20) 41 | 42 | * Python: extended the mode tuple to 3D and reorder from C/python 43 | ndarray.shape style input (nZ, nY, nX) to to the (F) order expected by the 44 | low level library (nX, nY, nZ). 45 | * Added bound checking on the bin size 46 | * Dual-precision support of spread/interp tests 47 | * Improved documentation of spread/interp tests 48 | * Added dummy call of cuFFTPlan1d to avoid timing the constant cost of cuFFT 49 | library. 50 | * Added heuristic decision of maximum batch size (number of vectors with the 51 | same nupts to transform at the same time) 52 | * Reported execution throughput in the test codes 53 | * Fixed timing in the tests code 54 | * Professionalized handling of too-small-eps (requested tolerance) 55 | * Rewrote README.md and added cuFINUFFT logo. 56 | * Support of advanced Makefile usage, e.g. make -site=olcf_summit 57 | * Removed FFTW dependency 58 | 59 | v 1.0 (07/29/20) 60 | -------------------------------------------------------------------------------- /src/precision_independent.h: -------------------------------------------------------------------------------- 1 | /* These are functions that do not rely on FLT. 2 | They are organized by originating file. 
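   (The rest of the library is compiled once per precision through the FLT/SINGLE
   switch in contrib/utils_fp.h; the declarations below use fixed double /
   cuDoubleComplex types, so one compiled copy can serve both builds.)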
3 | */ 4 | 5 | #ifndef PRECISION_INDEPENDENT_H 6 | #define PRECISION_INDEPENDENT_H 7 | 8 | /* Auxiliary var/func to compute power of complex number */ 9 | typedef double RT; 10 | typedef cuDoubleComplex CT; 11 | #define rpart(x) (cuCreal(x)) 12 | #define ipart(x) (cuCimag(x)) 13 | #define cmplx(x,y) (make_cuDoubleComplex(x,y)) 14 | 15 | __device__ RT carg(const CT& z); // polar angle 16 | __device__ RT cabs(const CT& z); 17 | __device__ CT cpow(const CT& z, const int &n); 18 | 19 | /* Common Kernels from spreadinterp3d */ 20 | __host__ __device__ 21 | int CalcGlobalIdx(int xidx, int yidx, int zidx, int onx, int ony, int onz, 22 | int bnx, int bny, int bnz); 23 | __device__ 24 | int CalcGlobalIdx_V2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz); 25 | 26 | /* spreadinterp 1d */ 27 | __global__ 28 | void CalcSubProb_1d(int* bin_size, int* num_subprob, int maxsubprobsize, int numbins); 29 | 30 | __global__ 31 | void MapBintoSubProb_1d(int* d_subprob_to_bin, int* d_subprobstartpts, 32 | int* d_numsubprob,int numbins); 33 | 34 | __global__ 35 | void TrivialGlobalSortIdx_1d(int M, int* index); 36 | 37 | /* spreadinterp 2d */ 38 | __global__ 39 | void CalcSubProb_2d(int* bin_size, int* num_subprob, int maxsubprobsize, int numbins); 40 | 41 | __global__ 42 | void MapBintoSubProb_2d(int* d_subprob_to_bin, int* d_subprobstartpts, 43 | int* d_numsubprob,int numbins); 44 | 45 | __global__ 46 | void CalcSubProb_2d_Paul(int* finegridsize, int* num_subprob, 47 | int maxsubprobsize, int bin_size_x, int bin_size_y); 48 | 49 | __global__ 50 | void TrivialGlobalSortIdx_2d(int M, int* index); 51 | 52 | /* spreadinterp3d */ 53 | __global__ 54 | void CalcSubProb_3d_v2(int* bin_size, int* num_subprob, int maxsubprobsize, 55 | int numbins); 56 | 57 | __global__ 58 | void MapBintoSubProb_3d_v2(int* d_subprob_to_bin,int* d_subprobstartpts, 59 | int* d_numsubprob,int numbins); 60 | 61 | __global__ 62 | void CalcSubProb_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, 63 | int* bin_size, int* num_subprob, int maxsubprobsize, int numbins); 64 | 65 | __global__ 66 | void MapBintoSubProb_3d_v1(int* d_subprob_to_obin, int* d_subprobstartpts, 67 | int* d_numsubprob,int numbins); 68 | 69 | __global__ 70 | void TrivialGlobalSortIdx_3d(int M, int* index); 71 | 72 | __global__ 73 | void FillGhostBins(int binsperobinx, int binsperobiny, int binsperobinz, 74 | int nobinx, int nobiny, int nobinz, int* binsize); 75 | 76 | __global__ 77 | void Temp(int binsperobinx, int binsperobiny, int binsperobinz, 78 | int nobinx, int nobiny, int nobinz, int* binsize); 79 | 80 | __global__ 81 | void GhostBinPtsIdx(int binsperobinx, int binsperobiny, int binsperobinz, 82 | int nobinx, int nobiny, int nobinz, int* binsize, int* index, 83 | int* binstartpts, int M); 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /python/cufinufft/tests/test_error_checks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import pycuda.autoinit # NOQA:401 5 | import pycuda.gpuarray as gpuarray 6 | 7 | from cufinufft import cufinufft 8 | 9 | import utils 10 | 11 | 12 | def test_set_nu_raises_on_dtype(): 13 | dtype = np.float32 14 | 15 | M = 4096 16 | tol = 1e-3 17 | shape = (16, 16, 16) 18 | dim = len(shape) 19 | 20 | kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) 21 | 22 | kxyz_gpu = gpuarray.to_gpu(kxyz) 23 | 24 | # Here we'll intentionally contruct an incorrect array dtype. 
25 | kxyz_gpu_wrong_type = gpuarray.to_gpu(kxyz.astype(np.float64)) 26 | 27 | plan = cufinufft(1, shape, eps=tol, dtype=dtype) 28 | 29 | with pytest.raises(TypeError): 30 | plan.set_pts(kxyz_gpu_wrong_type[0], 31 | kxyz_gpu[1], kxyz_gpu[2]) 32 | with pytest.raises(TypeError): 33 | plan.set_pts(kxyz_gpu[0], 34 | kxyz_gpu_wrong_type[1], kxyz_gpu[2]) 35 | with pytest.raises(TypeError): 36 | plan.set_pts(kxyz_gpu[0], 37 | kxyz_gpu[1], kxyz_gpu_wrong_type[2]) 38 | with pytest.raises(TypeError): 39 | plan.set_pts(kxyz_gpu_wrong_type[0], 40 | kxyz_gpu_wrong_type[1], kxyz_gpu_wrong_type[2]) 41 | 42 | 43 | def test_set_pts_raises_on_size(): 44 | dtype = np.float32 45 | 46 | M = 8 47 | tol = 1e-3 48 | shape = (16, 16, 16) 49 | dim = len(shape) 50 | 51 | kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) 52 | 53 | kxyz_gpu = gpuarray.to_gpu(kxyz) 54 | 55 | plan = cufinufft(1, shape, eps=tol, dtype=dtype) 56 | 57 | with pytest.raises(TypeError) as err: 58 | plan.set_pts(kxyz_gpu[0], kxyz_gpu[1][:4]) 59 | assert 'kx and ky must be equal' in err.value.args[0] 60 | 61 | with pytest.raises(TypeError) as err: 62 | plan.set_pts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2][:4]) 63 | assert 'kx and kz must be equal' in err.value.args[0] 64 | 65 | 66 | def test_wrong_field_names(): 67 | with pytest.raises(TypeError) as err: 68 | plan = cufinufft(1, (8, 8), foo="bar") 69 | assert "Invalid option 'foo'" in err.value.args[0] 70 | 71 | 72 | def test_exec_raises_on_dtype(): 73 | dtype = np.float32 74 | complex_dtype = np.complex64 75 | 76 | M = 4096 77 | tol = 1e-3 78 | shape = (16, 16, 16) 79 | dim = len(shape) 80 | 81 | kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) 82 | c = utils.gen_nonuniform_data(M).astype(complex_dtype) 83 | c_gpu = gpuarray.to_gpu(c) 84 | # Using c.real gives us wrong dtype here... 85 | c_gpu_wrong_dtype = gpuarray.to_gpu(c.real) 86 | 87 | kxyz_gpu = gpuarray.to_gpu(kxyz) 88 | fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) 89 | # Here we'll intentionally contruct an incorrect array dtype. 90 | fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128) 91 | 92 | plan = cufinufft(1, shape, eps=tol, dtype=dtype) 93 | 94 | plan.set_pts(kxyz_gpu[0], 95 | kxyz_gpu[1], kxyz_gpu[2]) 96 | 97 | with pytest.raises(TypeError): 98 | plan.execute(c_gpu, fk_gpu_wrong_dtype) 99 | 100 | with pytest.raises(TypeError): 101 | plan.execute(c_gpu_wrong_dtype, fk_gpu) 102 | -------------------------------------------------------------------------------- /examples/example2d1many.cpp: -------------------------------------------------------------------------------- 1 | /* This is an example of performing 2d1many 2 | in single precision. 3 | */ 4 | 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | int main(int argc, char* argv[]) 16 | /* 17 | * example code for 2D Type 1 transformation. 
18 | * 19 | * To compile the code: 20 | * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include /loc/to/cufinufft/lib-static/libcufinufft.a -lcudart -lcufft -lnvToolsExt 21 | * 22 | * or 23 | * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib 24 | * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include -L/loc/to/cufinufft/lib/ -lcufinufft 25 | * 26 | * 27 | */ 28 | { 29 | cout< *c, *fk; 42 | cudaMallocHost(&x, M*sizeof(float)); 43 | cudaMallocHost(&y, M*sizeof(float)); 44 | cudaMallocHost(&c, M*ntransf*sizeof(complex)); 45 | cudaMallocHost(&fk,N1*N2*ntransf*sizeof(complex)); 46 | 47 | float *d_x, *d_y; 48 | cuFloatComplex *d_c, *d_fk; 49 | cudaMalloc(&d_x,M*sizeof(float)); 50 | cudaMalloc(&d_y,M*sizeof(float)); 51 | cudaMalloc(&d_c,M*ntransf*sizeof(cuFloatComplex)); 52 | cudaMalloc(&d_fk,N1*N2*ntransf*sizeof(cuFloatComplex)); 53 | 54 | for (int i=0; i Ft = complex(0,0), J = complex(0,1)*(float)iflag; 93 | for (BIGINT j=0; j 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | /* 16 | * example code for 2D Type 1 transformation. 17 | * 18 | * To compile the code: 19 | * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a -I/loc/to/cufinufft/include -lcudart -lcufft -lnvToolsExt 20 | * 21 | * or 22 | * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib 23 | * nvcc -DSINGLE example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o example2d1 -lcufinufft 24 | * 25 | * 26 | */ 27 | { 28 | cout< *c, *fk; 41 | cudaMallocHost(&x, M*sizeof(double)); 42 | cudaMallocHost(&y, M*sizeof(double)); 43 | cudaMallocHost(&c, M*ntransf*sizeof(complex)); 44 | cudaMallocHost(&fk,N1*N2*ntransf*sizeof(complex)); 45 | 46 | double *d_x, *d_y; 47 | cuDoubleComplex *d_c, *d_fk; 48 | cudaMalloc(&d_x,M*sizeof(double)); 49 | cudaMalloc(&d_y,M*sizeof(double)); 50 | cudaMalloc(&d_c,M*ntransf*sizeof(cuDoubleComplex)); 51 | cudaMalloc(&d_fk,N1*N2*ntransf*sizeof(cuDoubleComplex)); 52 | 53 | for (int i=0; i* fkstart; 89 | complex* cstart; 90 | for(int t=0; t J(0,iflag*1); 95 | complex ct(0,0); 96 | int m=0; 97 | for (int m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F 98 | for (int m1=-(N1/2); m1<=(N1-1)/2; ++m1) 99 | ct += fkstart[m++] * exp(J*(m1*x[jt] + m2*y[jt])); // crude direct 100 | 101 | printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n",t,jt,abs(cstart[jt]-ct)/infnorm(M,c)); 102 | } 103 | 104 | cudaFreeHost(x); 105 | cudaFreeHost(y); 106 | cudaFreeHost(c); 107 | cudaFreeHost(fk); 108 | 109 | cudaFree(d_x); 110 | cudaFree(d_y); 111 | cudaFree(d_c); 112 | cudaFree(d_fk); 113 | return 0; 114 | } 115 | -------------------------------------------------------------------------------- /contrib/dirft2d.cpp: -------------------------------------------------------------------------------- 1 | #include "dirft.h" 2 | #include 3 | 4 | // This is basically a port of dirft2d.f from CMCL package, except with 5 | // the 1/nj prefactors for type-1 removed. 6 | 7 | void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f) 8 | /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1. 9 | c nj-1 10 | c f[k1,k2] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j])) 11 | c j=0 12 | c 13 | c for -ms/2 <= k1 <= (ms-1)/2, -mt/2 <= k2 <= (mt-1)/2. 14 | c The output array is in increasing k1 ordering (fast), then increasing 15 | k2 ordering (slow). 
If iflag>0 the + sign is 16 | c used, otherwise the - sign is used, in the exponential. 17 | * Uses C++ complex type and winding trick. Barnett 1/26/17 18 | */ 19 | { 20 | BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide 21 | BIGINT N = ms*mt; // total # output modes 22 | for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); 25 | CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); 26 | CPX sp1 = pow(a1,(FLT)k1min); // starting phase for most neg k1 freq 27 | CPX p2 = pow(a2,(FLT)k2min); 28 | CPX cc = c[j]; // no 1/nj norm 29 | BIGINT m=0; // output pointer 30 | for (BIGINT m2=0;m20 the + sign is used, otherwise the - sign is used, in the 52 | exponential. 53 | Uses C++ complex type and winding trick. Barnett 1/26/17 54 | */ 55 | { 56 | BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide 57 | for (BIGINT j=0;j0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); 59 | CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); 60 | CPX sp1 = pow(a1,(FLT)k1min); 61 | CPX p2 = pow(a2,(FLT)k2min); 62 | CPX cc = CPX(0,0); 63 | BIGINT m=0; // input pointer 64 | for (BIGINT m2=0;m20 the + sign is used, otherwise the - sign is used, in the 83 | c exponential. Uses C++ complex type. Simple brute force. Barnett 1/26/17 84 | */ 85 | { 86 | for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; 88 | CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k]; 89 | f[k] = CPX(0,0); 90 | for (BIGINT j=0;j /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ 8 | echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - 9 | 10 | COPY ci/docker/cuda11.0/cuda.repo /etc/yum.repos.d/cuda.repo 11 | 12 | # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a 13 | RUN yum install -y \ 14 | cuda-cudart-11-0-11.0.171-1 \ 15 | cuda-compat-11-0 \ 16 | && ln -s cuda-11.0 /usr/local/cuda && \ 17 | rm -rf /var/cache/yum/* 18 | 19 | # nvidia-docker 1.0 20 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 21 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 22 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 23 | ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 24 | 25 | # nvidia-container-runtime 26 | ENV NVIDIA_VISIBLE_DEVICES all 27 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 28 | ENV NVIDIA_REQUIRE_CUDA "cuda>=11.0 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441" 29 | 30 | 31 | # runtime 32 | RUN yum install -y \ 33 | cuda-libraries-11-0-11.0.1-1 \ 34 | cuda-nvtx-11-0-11.0.167-1 \ 35 | && rm -rf /var/cache/yum/* 36 | 37 | RUN yum install -y xz && NCCL_DOWNLOAD_SUM=d112b722bf557cff96d571ac3386e4f539be7b3e9412561bde59b0ad6e59263d && \ 38 | curl -fsSL https://developer.download.nvidia.com/compute/redist/nccl/v2.7/nccl_2.7.3-1+cuda11.0_x86_64.txz -O && \ 39 | echo "$NCCL_DOWNLOAD_SUM nccl_2.7.3-1+cuda11.0_x86_64.txz" | sha256sum -c - && \ 40 | unxz nccl_2.7.3-1+cuda11.0_x86_64.txz && \ 41 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/local/cuda/lib64/ --strip-components=2 --wildcards '*/lib/libnccl.so.*' && \ 42 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/lib64/pkgconfig/ --strip-components=3 --wildcards '*/lib/pkgconfig/*' && \ 43 | rm -f nccl_2.7.3-1+cuda11.0_x86_64.tar && \ 44 | ldconfig 45 | 46 | 47 | # devel 48 | RUN yum install -y \ 49 | cuda-nvml-devel-11-0-11.0.167-1 \ 50 | cuda-command-line-tools-11-0-11.0.1-1 \ 51 | cuda-cudart-devel-11-0-11.0.171-1 \ 
52 | cuda-libraries-devel-11-0-11.0.1-1 \ 53 | cuda-minimal-build-11-0-11.0.1-1 \ 54 | libcublas-devel-11-0-11.0.0.191-1 \ 55 | && rm -rf /var/cache/yum/* 56 | 57 | RUN yum install -y xz && NCCL_DOWNLOAD_SUM=d112b722bf557cff96d571ac3386e4f539be7b3e9412561bde59b0ad6e59263d && \ 58 | curl -fsSL https://developer.download.nvidia.com/compute/redist/nccl/v2.7/nccl_2.7.3-1+cuda11.0_x86_64.txz -O && \ 59 | echo "$NCCL_DOWNLOAD_SUM nccl_2.7.3-1+cuda11.0_x86_64.txz" | sha256sum -c - && \ 60 | unxz nccl_2.7.3-1+cuda11.0_x86_64.txz && \ 61 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/local/cuda/include/ --strip-components=2 --wildcards '*/include/*' && \ 62 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/local/cuda/lib64/ --strip-components=2 --wildcards '*/lib/libnccl.so' && \ 63 | rm -f nccl_2.7.3-1+cuda11.0_x86_64.tar && \ 64 | ldconfig 65 | 66 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 67 | 68 | 69 | # /CUDA # 70 | 71 | 72 | # Okay, so now we can begin cufinufft 73 | 74 | # We need to build the CUDA code now. 75 | # assume we are building container in the root of the git repo... 76 | COPY . /io 77 | WORKDIR /io 78 | RUN make target=manylinux 79 | # And we need to pack it in our LD path 80 | ENV LD_LIBRARY_PATH /io/lib:${LD_LIBRARY_PATH} 81 | 82 | 83 | CMD ["/bin/bash"] 84 | -------------------------------------------------------------------------------- /test/cufinufft2d2api_test.cu: -------------------------------------------------------------------------------- 1 | /* This test should excercise the API 2 | close to how a user might use the code */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1 = 256; 17 | int N2 = 256; 18 | int M = N1*N2; 19 | 20 | double tol=1e-6; 21 | 22 | int iflag=1; 23 | 24 | cout< *c, *fk; 30 | checkCudaErrors(cudaMallocHost(&x, M*sizeof(double))); 31 | checkCudaErrors(cudaMallocHost(&y, M*sizeof(double))); 32 | checkCudaErrors(cudaMallocHost(&c, M*sizeof(complex))); 33 | checkCudaErrors(cudaMallocHost(&fk,N1*N2*sizeof(complex))); 34 | 35 | // malloc device arrays 36 | double *d_x, *d_y; 37 | cuDoubleComplex *d_c, *d_fk; 38 | checkCudaErrors(cudaMalloc(&d_x,M*sizeof(double))); 39 | checkCudaErrors(cudaMalloc(&d_y,M*sizeof(double))); 40 | checkCudaErrors(cudaMalloc(&d_c,M*sizeof(cuDoubleComplex))); 41 | checkCudaErrors(cudaMalloc(&d_fk,N1*N2*sizeof(cuDoubleComplex))); 42 | 43 | // Making data 44 | for (int i = 0; i < M; i++) { 45 | x[i] = M_PI*randm11(); // x in [-pi,pi) 46 | y[i] = M_PI*randm11(); 47 | } 48 | for(int i=0; i), 57 | cudaMemcpyHostToDevice)); 58 | 59 | 60 | // construct plan 61 | cufinufft_plan dplan; 62 | int dim = 2; 63 | int type = 2; 64 | 65 | int nmodes[3]; 66 | int ntransf = 1; 67 | int maxbatchsize = 1; 68 | nmodes[0] = N1; 69 | nmodes[1] = N2; 70 | nmodes[2] = 1; 71 | 72 | ier=cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, 73 | maxbatchsize, &dplan, NULL); 74 | if (ier!=0){ 75 | printf("err: cufinufft2d_plan\n"); 76 | return ier; 77 | } 78 | 79 | 80 | // Set Non uniform points 81 | ier=cufinufft_setpts(M, d_x, d_y, NULL, 0, NULL, NULL, NULL, dplan); 82 | if (ier!=0){ 83 | printf("err: cufinufft_setpts\n"); 84 | return ier; 85 | } 86 | 87 | // Execute the plan on the data 88 | ier=cufinufft_execute(d_c, d_fk, dplan); 89 | if (ier!=0){ 90 | printf("err: cufinufft2d2_exec\n"); 91 | return ier; 
92 | } 93 | 94 | // Destroy the plan when done processing 95 | ier=cufinufft_destroy(dplan); 96 | if (ier!=0){ 97 | printf("err: cufinufft_destroyc\n"); 98 | return ier; 99 | } 100 | 101 | // Copy test data back to host and compare 102 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost)); 103 | int jt = M/2; // check arbitrary choice of one targ pt 104 | complex J = complex(0,1)*(double)iflag; 105 | complex ct = complex(0,0); 106 | int m=0; 107 | for (int m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F 108 | for (int m1=-(N1/2); m1<=(N1-1)/2; ++m1) 109 | ct += fk[m++] * exp(J*(m1*x[jt] + m2*y[jt])); // crude direct 110 | printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n",(int64_t)jt,abs(c[jt]-ct)/infnorm(M,c)); 111 | 112 | 113 | // Cleanup 114 | checkCudaErrors(cudaFreeHost(x)); 115 | checkCudaErrors(cudaFreeHost(y)); 116 | checkCudaErrors(cudaFreeHost(c)); 117 | checkCudaErrors(cudaFreeHost(fk)); 118 | checkCudaErrors(cudaFree(d_x)); 119 | checkCudaErrors(cudaFree(d_y)); 120 | checkCudaErrors(cudaFree(d_c)); 121 | checkCudaErrors(cudaFree(d_fk)); 122 | 123 | return 0; 124 | } 125 | -------------------------------------------------------------------------------- /contrib/spreadinterp.cpp: -------------------------------------------------------------------------------- 1 | #include "spreadinterp.h" 2 | #include 3 | #include 4 | #include 5 | 6 | int setup_spreader(SPREAD_OPTS &opts,FLT eps, FLT upsampfac, int kerevalmeth) 7 | // Initializes spreader kernel parameters given desired NUFFT tolerance eps, 8 | // upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and ker eval meth 9 | // (etiher 0:exp(sqrt()), 1: Horner ppval). 10 | // Also sets all default options in SPREAD_OPTS. See cnufftspread.h for opts. 11 | // Must call before any kernel evals done. 12 | // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) 13 | { 14 | if (upsampfac!=2.0) { // nonstandard sigma 15 | if (kerevalmeth==1) { 16 | fprintf(stderr,"setup_spreader: nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n",(double)upsampfac); 17 | return HORNER_WRONG_BETA; 18 | } 19 | if (upsampfac<=1.0) { 20 | fprintf(stderr,"setup_spreader: error, upsampfac=%.3g is <=1.0\n",(double)upsampfac); 21 | return ERR_UPSAMPFAC_TOO_SMALL; 22 | } 23 | // calling routine must abort on above errors, since opts is garbage! 24 | if (upsampfac>4.0) 25 | fprintf(stderr,"setup_spreader: warning, upsampfac=%.3g is too large to be beneficial!\n",(double)upsampfac); 26 | } 27 | 28 | // defaults... (user can change after this function called) 29 | opts.spread_direction = 1; // user should always set to 1 or 2 as desired 30 | opts.pirange = 1; // user also should always set this 31 | opts.upsampfac = upsampfac; 32 | 33 | // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... 34 | int ier = 0; 35 | if (epsMAX_NSPREAD) { // clip to match allocated arrays 47 | fprintf(stderr,"%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n",__func__, 48 | upsampfac,(double)eps,ns,MAX_NSPREAD); 49 | ns = MAX_NSPREAD; 50 | ier = WARN_EPS_TOO_SMALL; 51 | } 52 | opts.nspread = ns; 53 | opts.ES_halfwidth=(FLT)ns/2; // constants to help ker eval (except Horner) 54 | opts.ES_c = 4.0/(FLT)(ns*ns); 55 | 56 | FLT betaoverns = 2.30; // gives decent betas for default sigma=2.0 57 | if (ns==2) betaoverns = 2.20; // some small-width tweaks... 
58 | if (ns==3) betaoverns = 2.26; 59 | if (ns==4) betaoverns = 2.38; 60 | if (upsampfac!=2.0) { // again, override beta for custom sigma 61 | FLT gamma=0.97; // must match devel/gen_all_horner_C_code.m 62 | betaoverns = gamma*PI*(1-1/(2*upsampfac)); // formula based on cutoff 63 | } 64 | opts.ES_beta = betaoverns * (FLT)ns; // set the kernel beta parameter 65 | //fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta); // user hasn't set debug yet 66 | return ier; 67 | } 68 | 69 | FLT evaluate_kernel(FLT x, const SPREAD_OPTS &opts) 70 | /* ES ("exp sqrt") kernel evaluation at single real argument: 71 | phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 72 | related to an asymptotic approximation to the Kaiser--Bessel, itself an 73 | approximation to prolate spheroidal wavefunction (PSWF) of order 0. 74 | This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ 75 | { 76 | if (abs(x)>=opts.ES_halfwidth) 77 | // if spreading/FT careful, shouldn't need this if, but causes no speed hit 78 | return 0.0; 79 | else 80 | return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c*x*x)); 81 | } 82 | 83 | -------------------------------------------------------------------------------- /test/cufinufft2d2api_test_32.cu: -------------------------------------------------------------------------------- 1 | /* This test should excercise the API 2 | close to how a user might use the code 3 | 4 | Note this single precision version changes 5 | doubles ~~> float and 6 | cufinufft_* ~~> cufinufftf_* function names. 7 | 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include 19 | #include "../contrib/utils.h" 20 | 21 | using namespace std; 22 | 23 | typedef std::complex CPX; 24 | 25 | int main(int argc, char* argv[]) 26 | { 27 | int N1 = 256; 28 | int N2 = 256; 29 | int M = N1*N2; 30 | 31 | float tol=1e-6; 32 | 33 | int iflag=1; 34 | 35 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1; 14 | FLT upsampfac=2.0; 15 | int N1, M; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: interp1d method nupts_distr nf1 [M [tol [kerevalmeth [sort]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven\n" 22 | " nupts_distr: The distribution of the points; one of\n" 23 | " 0: uniform, or\n" 24 | " 1: concentrated in a small region.\n" 25 | " nf1: The size of the 2D array.\n" 26 | " M: The number of non-uniform points (default nf1 / 2).\n" 27 | " tol: NUFFT tolerance (default 1e-6).\n" 28 | " kerevalmeth: Kernel evaluation method; one of\n" 29 | " 0: Exponential of square root (default), or\n" 30 | " 1: Horner evaluation.\n" 31 | " sort: One of\n" 32 | " 0: do not sort the points, or\n" 33 | " 1: sort the points (default).\n"); 34 | return 1; 35 | } 36 | double w; 37 | int method; 38 | sscanf(argv[1],"%d",&method); 39 | int nupts_distribute; 40 | sscanf(argv[2],"%d",&nupts_distribute); 41 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 42 | 43 | N1 = (int) nf1/upsampfac; 44 | M = N1;// let density always be 1 45 | if(argc>4){ 46 | sscanf(argv[4],"%lf",&w); M = (int)w; // so can read 1e6 right! 47 | if(M == 0) M=N1; 48 | } 49 | 50 | FLT tol=1e-6; 51 | if(argc>5){ 52 | sscanf(argv[5],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
53 | } 54 | 55 | int kerevalmeth=0; 56 | if(argc>6){ 57 | sscanf(argv[6],"%d",&kerevalmeth); 58 | } 59 | 60 | int sort=1; 61 | if(argc>7){ 62 | sscanf(argv[7],"%d",&sort); 63 | } 64 | 65 | int ier; 66 | cout<opts)); 85 | dplan->opts.gpu_method = method; 86 | dplan->opts.gpu_maxsubprobsize = 1024; 87 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 88 | dplan->opts.gpu_sort = sort; 89 | dplan->opts.gpu_spreadinterponly = 1; 90 | dplan->opts.gpu_binsizex = 1024; //binsize needs to be set here, since 91 | //SETUP_BINSIZE() is not called in 92 | //spread, interp only wrappers. 93 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 94 | 95 | switch(nupts_distribute){ 96 | case 0: //uniform 97 | { 98 | for (int i = 0; i < M; i++) { 99 | x[i] = M_PI*randm11();// x in [-pi,pi) 100 | } 101 | } 102 | break; 103 | case 1: // concentrate on a small region 104 | { 105 | for (int i = 0; i < M; i++) { 106 | x[i] = M_PI*rand01()/(nf1*2/32);// x in [-pi,pi) 107 | } 108 | } 109 | break; 110 | } 111 | for(int i=0; iopts.gpu_method,nf1,M,t,M/t); 129 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(CUCPX),cudaMemcpyDeviceToHost)); 130 | #ifdef RESULT 131 | cout<<"[result-input]"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../src/cuspreadinterp.h" 8 | #include "../contrib/utils.h" 9 | 10 | using namespace std; 11 | 12 | int main(int argc, char* argv[]) 13 | { 14 | int nf1, N1, M; 15 | FLT upsampfac=2.0; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: spread1d_test method nupts_distr nf1 [maxsubprobsize [M [tol [kerevalmeth]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem\n" 23 | " nupts_distr: The distribution of the points; one of\n" 24 | " 0: uniform, or\n" 25 | " 1: concentrated in a small region.\n" 26 | " nf1: The size of the 1D array.\n" 27 | " maxsubprobsize: Maximum size of subproblems (default 65536).\n" 28 | " M: The number of non-uniform points (default nf1 / 2).\n" 29 | " tol: NUFFT tolerance (default 1e-6).\n" 30 | " kerevalmeth: Kernel evaluation method; one of\n" 31 | " 0: Exponential of square root (default), or\n" 32 | " 1: Horner evaluation.\n"); 33 | return 1; 34 | } 35 | double w; 36 | int method; 37 | sscanf(argv[1],"%d",&method); 38 | 39 | int nupts_distribute; 40 | sscanf(argv[2],"%d",&nupts_distribute); 41 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 42 | 43 | int maxsubprobsize=65536; 44 | if(argc>4){ 45 | sscanf(argv[4],"%d",&maxsubprobsize); 46 | } 47 | 48 | N1 = (int) nf1/upsampfac; 49 | M = N1; 50 | if(argc>5){ 51 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 52 | } 53 | 54 | FLT tol=1e-6; 55 | if(argc>6){ 56 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 57 | } 58 | 59 | int kerevalmeth=0; 60 | if(argc>7){ 61 | sscanf(argv[7],"%d",&kerevalmeth); 62 | } 63 | 64 | int ier; 65 | int dim=1; 66 | 67 | CUFINUFFT_PLAN dplan = new CUFINUFFT_PLAN_S; 68 | // Zero out your struct, (sets all pointers to NULL, crucial) 69 | memset(dplan, 0, sizeof(*dplan)); 70 | ier = CUFINUFFT_DEFAULT_OPTS(1 /*type*/, dim, &(dplan->opts)); 71 | 72 | dplan->opts.gpu_method = method; 73 | dplan->opts.gpu_maxsubprobsize = maxsubprobsize; 74 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 75 | dplan->opts.gpu_sort = 1; // ahb changed from 0 76 | dplan->opts.gpu_spreadinterponly = 1; 77 | dplan->opts.gpu_binsizex = 1024; //binsize needs to be set here, since 78 | //SETUP_BINSIZE() is not called in 79 | //spread, interp only wrappers. 
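	// setup_spreader_for_nufft fills dplan->spopts from the requested tolerance;
	// cf. setup_spreader() in contrib/spreadinterp.cpp, which documents how tol
	// and upsampfac map to the kernel width ns and the ES beta parameter.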
80 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 81 | 82 | cout<opts.gpu_method,M,nf1,t,M/t); 133 | 134 | checkCudaErrors(cudaMemcpy(fw,d_fw,nf1*sizeof(CUCPX), 135 | cudaMemcpyDeviceToHost)); 136 | #ifdef RESULT 137 | cout<<"[result-input]"<opts.gpu_binsizex == 0 && i!=0) 140 | printf(" |"); 141 | printf(" (%2.3g,%2.3g)",fw[i].real(),fw[i].imag() ); 142 | } 143 | #endif 144 | 145 | cudaFreeHost(x); 146 | cudaFreeHost(c); 147 | cudaFreeHost(fw); 148 | cudaFree(d_x); 149 | cudaFree(d_c); 150 | cudaFree(d_fw); 151 | return 0; 152 | } 153 | -------------------------------------------------------------------------------- /python/cufinufft/_cufinufft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This file contains low level python bindings for the cufinufft CUDA libraries. 4 | Seperate bindings are provided for single and double precision libraries, 5 | differentiated by 'f' suffix. 6 | """ 7 | 8 | import ctypes 9 | import os 10 | import warnings 11 | 12 | # While imp is deprecated, it is currently the inspection solution 13 | # that works for all versions of Python 2 and 3. 14 | # One day if that changes, can be replaced 15 | # with importlib.find_spec. 16 | with warnings.catch_warnings(): 17 | warnings.filterwarnings("ignore", category=DeprecationWarning) 18 | import imp 19 | 20 | import numpy as np 21 | 22 | from ctypes import c_double 23 | from ctypes import c_int 24 | from ctypes import c_float 25 | from ctypes import c_void_p 26 | 27 | c_int_p = ctypes.POINTER(c_int) 28 | c_float_p = ctypes.POINTER(c_float) 29 | c_double_p = ctypes.POINTER(c_double) 30 | 31 | # TODO: See if there is a way to improve this so it is less hacky. 32 | lib = None 33 | # Try to load a local library directly. 34 | try: 35 | lib = ctypes.cdll.LoadLibrary('libcufinufft.so') 36 | except OSError: 37 | pass 38 | 39 | # Should that not work, try to find the full path of a packaged lib. 40 | # The packaged lib should have a py/platform decorated name, 41 | # and be rpath'ed the true CUDA C cufinufft library through the 42 | # Extension and wheel systems. 43 | try: 44 | if lib is None: 45 | # Find the library. 46 | fh = imp.find_module('cufinufftc')[0] 47 | # Get the full path for the ctypes loader. 48 | full_lib_path = os.path.realpath(fh.name) 49 | fh.close() # Be nice and close the open file handle. 50 | 51 | # Load the library, 52 | # which rpaths the libraries we care about. 53 | lib = ctypes.cdll.LoadLibrary(full_lib_path) 54 | 55 | except Exception: 56 | raise RuntimeError('Failed to find a suitable cufinufft library') 57 | 58 | 59 | def _get_ctypes(dtype): 60 | """ 61 | Checks dtype is float32 or float64. 62 | Returns floating point and floating point pointer. 
63 | """ 64 | 65 | if dtype == np.float64: 66 | REAL_t = c_double 67 | elif dtype == np.float32: 68 | REAL_t = c_float 69 | else: 70 | raise TypeError("Expected np.float32 or np.float64.") 71 | 72 | REAL_ptr = ctypes.POINTER(REAL_t) 73 | 74 | return REAL_t, REAL_ptr 75 | 76 | 77 | def _get_NufftOpts(): 78 | fields = [ 79 | ('upsampfac', c_double), 80 | ('gpu_method', c_int), 81 | ('gpu_sort', c_int), 82 | ('gpu_binsizex', c_int), 83 | ('gpu_binsizey', c_int), 84 | ('gpu_binsizez', c_int), 85 | ('gpu_obinsizex', c_int), 86 | ('gpu_obinsizey', c_int), 87 | ('gpu_obinsizez', c_int), 88 | ('gpu_maxsubprobsize', c_int), 89 | ('gpu_nstreams', c_int), 90 | ('gpu_kerevalmeth', c_int), 91 | ('gpu_spreadinterponly', c_int), 92 | ('gpu_device_id', c_int)] 93 | return fields 94 | 95 | 96 | class NufftOpts(ctypes.Structure): 97 | pass 98 | 99 | 100 | NufftOpts._fields_ = _get_NufftOpts() 101 | 102 | 103 | CufinufftPlan = c_void_p 104 | CufinufftPlanf = c_void_p 105 | 106 | CufinufftPlan_p = ctypes.POINTER(CufinufftPlan) 107 | CufinufftPlanf_p = ctypes.POINTER(CufinufftPlanf) 108 | 109 | NufftOpts_p = ctypes.POINTER(NufftOpts) 110 | 111 | _default_opts = lib.cufinufft_default_opts 112 | _default_opts.argtypes = [c_int, c_int, NufftOpts_p] 113 | _default_opts.restype = c_int 114 | 115 | _make_plan = lib.cufinufft_makeplan 116 | _make_plan.argtypes = [ 117 | c_int, c_int, c_int_p, c_int, 118 | c_int, c_double, c_int, CufinufftPlan_p, NufftOpts_p] 119 | _make_plan.restypes = c_int 120 | 121 | _make_planf = lib.cufinufftf_makeplan 122 | _make_planf.argtypes = [ 123 | c_int, c_int, c_int_p, c_int, 124 | c_int, c_float, c_int, CufinufftPlanf_p, NufftOpts_p] 125 | _make_planf.restypes = c_int 126 | 127 | _set_pts = lib.cufinufft_setpts 128 | _set_pts.argtypes = [ 129 | c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p, 130 | c_double_p, c_double_p, c_void_p] 131 | _set_pts.restype = c_int 132 | 133 | _set_ptsf = lib.cufinufftf_setpts 134 | _set_ptsf.argtypes = [ 135 | c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p, 136 | c_float_p, c_float_p, c_void_p] 137 | _set_ptsf.restype = c_int 138 | 139 | _exec_plan = lib.cufinufft_execute 140 | _exec_plan.argtypes = [c_void_p, c_void_p, c_void_p] 141 | _exec_plan.restype = c_int 142 | 143 | _exec_planf = lib.cufinufftf_execute 144 | _exec_planf.argtypes = [c_void_p, c_void_p, c_void_p] 145 | _exec_planf.restype = c_int 146 | 147 | _destroy_plan = lib.cufinufft_destroy 148 | _destroy_plan.argtypes = [c_void_p] 149 | _destroy_plan.restype = c_int 150 | 151 | _destroy_planf = lib.cufinufftf_destroy 152 | _destroy_planf.argtypes = [c_void_p] 153 | _destroy_planf.restype = c_int 154 | -------------------------------------------------------------------------------- /src/1d/interp1d_wrapper.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include "../cuspreadinterp.h" 7 | #include "../memtransfer.h" 8 | #include 9 | 10 | using namespace std; 11 | 12 | int CUFINUFFT_INTERP1D(int nf1, CUCPX* d_fw, int M, FLT *d_kx, CUCPX *d_c, 13 | CUFINUFFT_PLAN d_plan) 14 | /* 15 | This c function is written for only doing 1D interpolation. See 16 | test/interp1d_test.cu for usage. 17 | 18 | note: not allocate,transfer and free memories on gpu. 
19 | Melody Shih 11/21/21 20 | */ 21 | { 22 | cudaEvent_t start, stop; 23 | cudaEventCreate(&start); 24 | cudaEventCreate(&stop); 25 | 26 | d_plan->nf1 = nf1; 27 | d_plan->M = M; 28 | d_plan->maxbatchsize = 1; 29 | 30 | d_plan->kx = d_kx; 31 | d_plan->c = d_c; 32 | d_plan->fw = d_fw; 33 | 34 | int ier; 35 | cudaEventRecord(start); 36 | ier = ALLOCGPUMEM1D_PLAN(d_plan); 37 | ier = ALLOCGPUMEM1D_NUPTS(d_plan); 38 | if(d_plan->opts.gpu_method == 1){ 39 | ier = CUSPREAD1D_NUPTSDRIVEN_PROP(nf1,M,d_plan); 40 | if(ier != 0 ){ 41 | printf("error: cuspread1d_subprob_prop, method(%d)\n", 42 | d_plan->opts.gpu_method); 43 | return ier; 44 | } 45 | } 46 | if(d_plan->opts.gpu_method == 2){ 47 | ier = CUSPREAD1D_SUBPROB_PROP(nf1,M,d_plan); 48 | if(ier != 0 ){ 49 | printf("error: cuspread1d_subprob_prop, method(%d)\n", 50 | d_plan->opts.gpu_method); 51 | return ier; 52 | } 53 | } 54 | #ifdef TIME 55 | float milliseconds = 0; 56 | cudaEventRecord(stop); 57 | cudaEventSynchronize(stop); 58 | cudaEventElapsedTime(&milliseconds, start, stop); 59 | printf("[time ] Obtain Interp Prop\t %.3g ms\n", milliseconds); 60 | #endif 61 | cudaEventRecord(start); 62 | ier = CUINTERP1D(d_plan,1); 63 | #ifdef TIME 64 | cudaEventRecord(stop); 65 | cudaEventSynchronize(stop); 66 | cudaEventElapsedTime(&milliseconds, start, stop); 67 | printf("[time ] Interp (%d)\t\t %.3g ms\n", d_plan->opts.gpu_method, 68 | milliseconds); 69 | #endif 70 | cudaEventRecord(start); 71 | FREEGPUMEMORY1D(d_plan); 72 | #ifdef TIME 73 | cudaEventRecord(stop); 74 | cudaEventSynchronize(stop); 75 | cudaEventElapsedTime(&milliseconds, start, stop); 76 | printf("[time ] Free GPU memory\t %.3g ms\n", milliseconds); 77 | #endif 78 | return ier; 79 | } 80 | 81 | int CUINTERP1D(CUFINUFFT_PLAN d_plan, int blksize) 82 | /* 83 | A wrapper for different interpolation methods. 
84 | 85 | Methods available: 86 | (1) Non-uniform points driven 87 | (2) Subproblem 88 | 89 | Melody Shih 11/21/21 90 | */ 91 | { 92 | int nf1 = d_plan->nf1; 93 | int M = d_plan->M; 94 | 95 | cudaEvent_t start, stop; 96 | cudaEventCreate(&start); 97 | cudaEventCreate(&stop); 98 | 99 | int ier; 100 | switch(d_plan->opts.gpu_method) 101 | { 102 | case 1: 103 | { 104 | cudaEventRecord(start); 105 | { 106 | ier = CUINTERP1D_NUPTSDRIVEN(nf1, M, d_plan, blksize); 107 | if(ier != 0 ){ 108 | cout<<"error: cnufftspread1d_gpu_nuptsdriven"<spopts.nspread; // psi's support in terms of number of cells 138 | FLT es_c=d_plan->spopts.ES_c; 139 | FLT es_beta=d_plan->spopts.ES_beta; 140 | FLT sigma=d_plan->opts.upsampfac; 141 | int pirange=d_plan->spopts.pirange; 142 | int *d_idxnupts=d_plan->idxnupts; 143 | 144 | FLT* d_kx = d_plan->kx; 145 | CUCPX* d_c = d_plan->c; 146 | CUCPX* d_fw = d_plan->fw; 147 | 148 | threadsPerBlock.x = 32; 149 | threadsPerBlock.y = 1; 150 | blocks.x = (M + threadsPerBlock.x - 1)/threadsPerBlock.x; 151 | blocks.y = 1; 152 | 153 | cudaEventRecord(start); 154 | if(d_plan->opts.gpu_kerevalmeth){ 155 | for(int t=0; t>>(d_kx, 157 | d_c+t*M, d_fw+t*nf1, M, ns, nf1, sigma, d_idxnupts, pirange); 158 | } 159 | }else{ 160 | for(int t=0; t>>(d_kx, 162 | d_c+t*M, d_fw+t*nf1, M, ns, nf1, es_c, es_beta, d_idxnupts, pirange); 163 | } 164 | } 165 | #ifdef SPREADTIME 166 | float milliseconds = 0; 167 | cudaEventRecord(stop); 168 | cudaEventSynchronize(stop); 169 | cudaEventElapsedTime(&milliseconds, start, stop); 170 | printf("[time ] \tKernel Interp_1d_NUptsdriven (%d)\t%.3g ms\n", 171 | milliseconds, d_plan->opts.gpu_kerevalmeth); 172 | #endif 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /test/interp2d_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1, nf2; 14 | FLT upsampfac=2.0; 15 | int N1, N2, M; 16 | if (argc<5) { 17 | fprintf(stderr, 18 | "Usage: interp2d method nupts_distr nf1 nf2 [M [tol [kerevalmeth [sort]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem.\n" 23 | " nupts_distr: The distribution of the points; one of\n" 24 | " 0: uniform, or\n" 25 | " 1: concentrated in a small region.\n" 26 | " nf1, nf2: The size of the 2D array.\n" 27 | " M: The number of non-uniform points (default nf1 * nf2 / 4).\n" 28 | " tol: NUFFT tolerance (default 1e-6).\n" 29 | " kerevalmeth: Kernel evaluation method; one of\n" 30 | " 0: Exponential of square root (default), or\n" 31 | " 1: Horner evaluation.\n" 32 | " sort: One of\n" 33 | " 0: do not sort the points, or\n" 34 | " 1: sort the points (default).\n"); 35 | return 1; 36 | } 37 | double w; 38 | int method; 39 | sscanf(argv[1],"%d",&method); 40 | int nupts_distribute; 41 | sscanf(argv[2],"%d",&nupts_distribute); 42 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 43 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 44 | 45 | N1 = (int) nf1/upsampfac; 46 | N2 = (int) nf2/upsampfac; 47 | M = N1*N2;// let density always be 1 48 | if(argc>5){ 49 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 
50 | if(M == 0) M=N1*N2; 51 | } 52 | 53 | FLT tol=1e-6; 54 | if(argc>6){ 55 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 56 | } 57 | 58 | int kerevalmeth=0; 59 | if(argc>7){ 60 | sscanf(argv[7],"%d",&kerevalmeth); 61 | } 62 | 63 | int sort=1; 64 | if(argc>8){ 65 | sscanf(argv[8],"%d",&sort); 66 | } 67 | 68 | int ier; 69 | cout<opts)); 90 | dplan->opts.gpu_method = method; 91 | dplan->opts.gpu_maxsubprobsize = 1024; 92 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 93 | dplan->opts.gpu_sort = sort; 94 | dplan->opts.gpu_spreadinterponly = 1; 95 | dplan->opts.gpu_binsizex = 32; //binsize needs to be set here, since 96 | //SETUP_BINSIZE() is not called in 97 | //spread, interp only wrappers. 98 | dplan->opts.gpu_binsizey = 32; 99 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 100 | 101 | switch(nupts_distribute){ 102 | case 0: //uniform 103 | { 104 | for (int i = 0; i < M; i++) { 105 | x[i] = M_PI*randm11();// x in [-pi,pi) 106 | y[i] = M_PI*randm11(); 107 | } 108 | } 109 | break; 110 | case 1: // concentrate on a small region 111 | { 112 | for (int i = 0; i < M; i++) { 113 | x[i] = M_PI*rand01()/(nf1*2/32);// x in [-pi,pi) 114 | y[i] = M_PI*rand01()/(nf2*2/32); 115 | } 116 | } 117 | break; 118 | } 119 | for(int i=0; iopts.gpu_method,nf1*nf2,M,t,M/t); 138 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(CUCPX),cudaMemcpyDeviceToHost)); 139 | #ifdef RESULT 140 | cout<<"[result-input]"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "../cuspreadinterp.h" 10 | #include "../cudeconvolve.h" 11 | #include "../memtransfer.h" 12 | 13 | using namespace std; 14 | 15 | int CUFINUFFT3D1_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 16 | /* 17 | 3D Type-1 NUFFT 18 | 19 | This function is called in "exec" stage (See ../cufinufft.cu). 20 | It includes (copied from doc in finufft library) 21 | Step 1: spread data to oversampled regular mesh using kernel 22 | Step 2: compute FFT on uniform mesh 23 | Step 3: deconvolve by division of each Fourier mode independently by the 24 | Fourier series coefficient of the kernel. 
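	As a rough orientation (a sketch assembled from the ctypes declarations in
	python/cufinufft/_cufinufft.py, not a verbatim copy of the public header),
	the double-precision call sequence that reaches this routine for a 3D
	type-1 transform, with d_x, d_y, d_z, d_c, d_fk already on the device, is
	approximately:

	    cufinufft_opts opts;  cufinufft_plan plan;        // opaque handle
	    int nmodes[3] = {ms, mt, mu};
	    cufinufft_default_opts(1, 3, &opts);               // type, dim
	    cufinufft_makeplan(1, 3, nmodes, iflag, ntransf, tol,
	                       maxbatchsize, &plan, &opts);
	    cufinufft_setpts(M, d_x, d_y, d_z, 0, NULL, NULL, NULL, plan);
	    cufinufft_execute(d_c, d_fk, plan);                // lands here
	    cufinufft_destroy(plan);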
25 | 26 | Melody Shih 07/25/19 27 | */ 28 | { 29 | cudaEvent_t start, stop; 30 | cudaEventCreate(&start); 31 | cudaEventCreate(&stop); 32 | 33 | cudaEventRecord(start); 34 | int blksize; 35 | int ier; 36 | CUCPX* d_fkstart; 37 | CUCPX* d_cstart; 38 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 39 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 40 | d_plan->maxbatchsize); 41 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 42 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt* 43 | d_plan->mu; 44 | 45 | d_plan->c = d_cstart; 46 | d_plan->fk = d_fkstart; 47 | 48 | checkCudaErrors(cudaMemset(d_plan->fw,0,d_plan->maxbatchsize* 49 | d_plan->nf1*d_plan->nf2*d_plan->nf3*sizeof(CUCPX))); 50 | #ifdef TIME 51 | float milliseconds = 0; 52 | cudaEventRecord(stop); 53 | cudaEventSynchronize(stop); 54 | cudaEventElapsedTime(&milliseconds, start, stop); 55 | printf("[time ] \tInitialize fw\t\t %.3g s\n", milliseconds/1000); 56 | #endif 57 | // Step 1: Spread 58 | cudaEventRecord(start); 59 | ier = CUSPREAD3D(d_plan, blksize); 60 | if(ier != 0 ){ 61 | printf("error: cuspread3d, method(%d)\n", d_plan->opts.gpu_method); 62 | return ier; 63 | } 64 | #ifdef TIME 65 | cudaEventRecord(stop); 66 | cudaEventSynchronize(stop); 67 | cudaEventElapsedTime(&milliseconds, start, stop); 68 | printf("[time ] \tSpread (%d)\t\t %.3g s\n", milliseconds/1000, 69 | d_plan->opts.gpu_method); 70 | #endif 71 | // Step 2: FFT 72 | cudaEventRecord(start); 73 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 74 | #ifdef TIME 75 | cudaEventRecord(stop); 76 | cudaEventSynchronize(stop); 77 | cudaEventElapsedTime(&milliseconds, start, stop); 78 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 79 | #endif 80 | 81 | // Step 3: deconvolve and shuffle 82 | cudaEventRecord(start); 83 | CUDECONVOLVE3D(d_plan, blksize); 84 | #ifdef TIME 85 | cudaEventRecord(stop); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&milliseconds, start, stop); 88 | printf("[time ] \tDeconvolve\t\t %.3g s\n", milliseconds/1000); 89 | #endif 90 | } 91 | return ier; 92 | } 93 | 94 | int CUFINUFFT3D2_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 95 | /* 96 | 3D Type-2 NUFFT 97 | 98 | This function is called in "exec" stage (See ../cufinufft.cu). 
99 | It includes (copied from doc in finufft library) 100 | Step 1: deconvolve (amplify) each Fourier mode, dividing by kernel 101 | Fourier coeff 102 | Step 2: compute FFT on uniform mesh 103 | Step 3: interpolate data to regular mesh 104 | 105 | Melody Shih 07/25/19 106 | */ 107 | { 108 | cudaEvent_t start, stop; 109 | cudaEventCreate(&start); 110 | cudaEventCreate(&stop); 111 | 112 | int blksize; 113 | int ier; 114 | CUCPX* d_fkstart; 115 | CUCPX* d_cstart; 116 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 117 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 118 | d_plan->maxbatchsize); 119 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 120 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt* 121 | d_plan->mu; 122 | 123 | d_plan->c = d_cstart; 124 | d_plan->fk = d_fkstart; 125 | 126 | // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw 127 | cudaEventRecord(start); 128 | CUDECONVOLVE3D(d_plan, blksize); 129 | #ifdef TIME 130 | float milliseconds = 0; 131 | cudaEventRecord(stop); 132 | cudaEventSynchronize(stop); 133 | cudaEventElapsedTime(&milliseconds, start, stop); 134 | printf("[time ] \tAmplify & Copy fktofw\t %.3g s\n", milliseconds/1000); 135 | #endif 136 | // Step 2: FFT 137 | cudaEventRecord(start); 138 | cudaDeviceSynchronize(); 139 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 140 | #ifdef TIME 141 | cudaEventRecord(stop); 142 | cudaEventSynchronize(stop); 143 | cudaEventElapsedTime(&milliseconds, start, stop); 144 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 145 | #endif 146 | 147 | // Step 3: deconvolve and shuffle 148 | cudaEventRecord(start); 149 | ier = CUINTERP3D(d_plan, blksize); 150 | if(ier != 0 ){ 151 | printf("error: cuinterp3d, method(%d)\n", d_plan->opts.gpu_method); 152 | return ier; 153 | } 154 | #ifdef TIME 155 | cudaEventRecord(stop); 156 | cudaEventSynchronize(stop); 157 | cudaEventElapsedTime(&milliseconds, start, stop); 158 | printf("[time ] \tUnspread (%d)\t\t %.3g s\n", milliseconds/1000, 159 | d_plan->opts.gpu_method); 160 | #endif 161 | } 162 | 163 | return ier; 164 | } 165 | -------------------------------------------------------------------------------- /src/1d/cufinufft1d.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "../cuspreadinterp.h" 10 | #include "../cudeconvolve.h" 11 | #include "../memtransfer.h" 12 | 13 | using namespace std; 14 | 15 | int CUFINUFFT1D1_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 16 | /* 17 | 1D Type-1 NUFFT 18 | 19 | This function is called in "exec" stage (See ../cufinufft.cu). 20 | It includes (copied from doc in finufft library) 21 | Step 1: spread data to oversampled regular mesh using kernel 22 | Step 2: compute FFT on uniform mesh 23 | Step 3: deconvolve by division of each Fourier mode independently by the 24 | Fourier series coefficient of the kernel. 
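	The ntransf input vectors are processed in batches of at most
	maxbatchsize per pass of the loop below, i.e.

	    blksize = min(ntransf - i*maxbatchsize, maxbatchsize),

	with the strengths and modes for pass i read at offsets i*maxbatchsize*M
	and i*maxbatchsize*ms.  As a purely illustrative worked example,
	ntransf = 5 with maxbatchsize = 2 gives three passes with blksize
	2, 2 and 1.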
25 | 26 | Melody Shih 11/21/21 27 | */ 28 | { 29 | assert(d_plan->spopts.spread_direction == 1); 30 | cudaEvent_t start, stop; 31 | cudaEventCreate(&start); 32 | cudaEventCreate(&stop); 33 | 34 | cudaEventRecord(start); 35 | int blksize; 36 | int ier; 37 | CUCPX* d_fkstart; 38 | CUCPX* d_cstart; 39 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 40 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 41 | d_plan->maxbatchsize); 42 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 43 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms; 44 | d_plan->c = d_cstart; 45 | d_plan->fk = d_fkstart; 46 | 47 | checkCudaErrors(cudaMemset(d_plan->fw,0,d_plan->maxbatchsize* 48 | d_plan->nf1*sizeof(CUCPX)));// this is needed 49 | #ifdef TIME 50 | float milliseconds = 0; 51 | cudaEventRecord(stop); 52 | cudaEventSynchronize(stop); 53 | cudaEventElapsedTime(&milliseconds, start, stop); 54 | printf("[time ] \tInitialize fw to 0\t %.3g s\n", 55 | milliseconds/1000); 56 | #endif 57 | // Step 1: Spread 58 | cudaEventRecord(start); 59 | ier = CUSPREAD1D(d_plan,blksize); 60 | if(ier != 0 ){ 61 | printf("error: cuspread1d, method(%d)\n", d_plan->opts.gpu_method); 62 | return ier; 63 | } 64 | #ifdef TIME 65 | cudaEventRecord(stop); 66 | cudaEventSynchronize(stop); 67 | cudaEventElapsedTime(&milliseconds, start, stop); 68 | printf("[time ] \tSpread (%d)\t\t %.3g s\n", milliseconds/1000, 69 | d_plan->opts.gpu_method); 70 | #endif 71 | // Step 2: FFT 72 | cudaEventRecord(start); 73 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 74 | #ifdef TIME 75 | cudaEventRecord(stop); 76 | cudaEventSynchronize(stop); 77 | cudaEventElapsedTime(&milliseconds, start, stop); 78 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 79 | #endif 80 | 81 | // Step 3: deconvolve and shuffle 82 | cudaEventRecord(start); 83 | CUDECONVOLVE1D(d_plan,blksize); 84 | #ifdef TIME 85 | cudaEventRecord(stop); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&milliseconds, start, stop); 88 | printf("[time ] \tDeconvolve\t\t %.3g s\n", milliseconds/1000); 89 | #endif 90 | } 91 | return ier; 92 | } 93 | 94 | int CUFINUFFT1D2_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 95 | /* 96 | 1D Type-2 NUFFT 97 | 98 | This function is called in "exec" stage (See ../cufinufft.cu). 
99 | It includes (copied from doc in finufft library) 100 | Step 1: deconvolve (amplify) each Fourier mode, dividing by kernel 101 | Fourier coeff 102 | Step 2: compute FFT on uniform mesh 103 | Step 3: interpolate data to regular mesh 104 | 105 | Melody Shih 11/21/21 106 | */ 107 | { 108 | assert(d_plan->spopts.spread_direction == 2); 109 | 110 | cudaEvent_t start, stop; 111 | cudaEventCreate(&start); 112 | cudaEventCreate(&stop); 113 | 114 | cudaEventRecord(start); 115 | int blksize; 116 | int ier; 117 | CUCPX* d_fkstart; 118 | CUCPX* d_cstart; 119 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 120 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 121 | d_plan->maxbatchsize); 122 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 123 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms; 124 | 125 | d_plan->c = d_cstart; 126 | d_plan->fk = d_fkstart; 127 | 128 | // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw 129 | cudaEventRecord(start); 130 | CUDECONVOLVE1D(d_plan,blksize); 131 | #ifdef TIME 132 | float milliseconds = 0; 133 | cudaEventRecord(stop); 134 | cudaEventSynchronize(stop); 135 | cudaEventElapsedTime(&milliseconds, start, stop); 136 | printf("[time ] \tAmplify & Copy fktofw\t %.3g s\n", milliseconds/1000); 137 | #endif 138 | // Step 2: FFT 139 | cudaDeviceSynchronize(); 140 | cudaEventRecord(start); 141 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 142 | #ifdef TIME 143 | cudaEventRecord(stop); 144 | cudaEventSynchronize(stop); 145 | cudaEventElapsedTime(&milliseconds, start, stop); 146 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 147 | #endif 148 | 149 | // Step 3: deconvolve and shuffle 150 | cudaEventRecord(start); 151 | ier = CUINTERP1D(d_plan, blksize); 152 | if(ier != 0 ){ 153 | printf("error: cuinterp1d, method(%d)\n", d_plan->opts.gpu_method); 154 | return ier; 155 | } 156 | #ifdef TIME 157 | cudaEventRecord(stop); 158 | cudaEventSynchronize(stop); 159 | cudaEventElapsedTime(&milliseconds, start, stop); 160 | printf("[time ] \tUnspread (%d)\t\t %.3g s\n", milliseconds/1000, 161 | d_plan->opts.gpu_method); 162 | #endif 163 | } 164 | return ier; 165 | } 166 | 167 | -------------------------------------------------------------------------------- /contrib/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #include "legendre_rule_fast.h" 10 | } 11 | #else 12 | #include "legendre_rule_fast.h" 13 | #endif 14 | 15 | int setup_spreader_for_nufft(SPREAD_OPTS &spopts, FLT eps, cufinufft_opts opts) 16 | // Set up the spreader parameters given eps, and pass across various nufft 17 | // options. Report status of setup_spreader. Barnett 10/30/17 18 | { 19 | int ier=setup_spreader(spopts, eps, opts.upsampfac, opts.gpu_kerevalmeth); 20 | spopts.pirange = 1; // could allow user control? 21 | return ier; 22 | } 23 | 24 | void SET_NF_TYPE12(BIGINT ms, cufinufft_opts opts, SPREAD_OPTS spopts, 25 | BIGINT *nf, BIGINT bs) 26 | // type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts 27 | // and requested number of Fourier modes ms. 
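// Worked example of this recipe (illustrative numbers only): with
// opts.upsampfac = 2.0 and ms = 100 requested modes the first guess is
// nf = 2.0*100 = 200; for a typical spreader width nspread = 7 (about what a
// 1e-6 tolerance produces) the lower bound 2*nspread = 14 does not bind, so
// nf keeps the value 200.  Only for very small ms does the bound kick in and
// push nf up to 2*nspread, so that the kernel still fits on the grid.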
28 | { 29 | *nf = (BIGINT)(opts.upsampfac*ms); 30 | if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; // otherwise spread fails 31 | if (*nf brk(nt+1); // start indices for each thread 105 | for (int t=0; t<=nt; ++t) // split nout mode indices btw threads 106 | brk[t] = (BIGINT)(0.5 + nout*t/(double)nt); 107 | #pragma omp parallel 108 | { 109 | int t = MY_OMP_GET_THREAD_NUM(); 110 | if (t 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "../cuspreadinterp.h" 10 | #include "../cudeconvolve.h" 11 | #include "../memtransfer.h" 12 | 13 | using namespace std; 14 | 15 | int CUFINUFFT2D1_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 16 | /* 17 | 2D Type-1 NUFFT 18 | 19 | This function is called in "exec" stage (See ../cufinufft.cu). 20 | It includes (copied from doc in finufft library) 21 | Step 1: spread data to oversampled regular mesh using kernel 22 | Step 2: compute FFT on uniform mesh 23 | Step 3: deconvolve by division of each Fourier mode independently by the 24 | Fourier series coefficient of the kernel. 25 | 26 | Melody Shih 07/25/19 27 | */ 28 | { 29 | assert(d_plan->spopts.spread_direction == 1); 30 | cudaEvent_t start, stop; 31 | cudaEventCreate(&start); 32 | cudaEventCreate(&stop); 33 | 34 | cudaEventRecord(start); 35 | int blksize; 36 | int ier; 37 | CUCPX* d_fkstart; 38 | CUCPX* d_cstart; 39 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 40 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 41 | d_plan->maxbatchsize); 42 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 43 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt; 44 | d_plan->c = d_cstart; 45 | d_plan->fk = d_fkstart; 46 | 47 | checkCudaErrors(cudaMemset(d_plan->fw,0,d_plan->maxbatchsize* 48 | d_plan->nf1*d_plan->nf2*sizeof(CUCPX)));// this is needed 49 | #ifdef TIME 50 | float milliseconds = 0; 51 | cudaEventRecord(stop); 52 | cudaEventSynchronize(stop); 53 | cudaEventElapsedTime(&milliseconds, start, stop); 54 | printf("[time ] \tInitialize fw to 0\t %.3g s\n", 55 | milliseconds/1000); 56 | #endif 57 | // Step 1: Spread 58 | cudaEventRecord(start); 59 | ier = CUSPREAD2D(d_plan,blksize); 60 | if(ier != 0 ){ 61 | printf("error: cuspread2d, method(%d)\n", d_plan->opts.gpu_method); 62 | return ier; 63 | } 64 | #ifdef TIME 65 | cudaEventRecord(stop); 66 | cudaEventSynchronize(stop); 67 | cudaEventElapsedTime(&milliseconds, start, stop); 68 | printf("[time ] \tSpread (%d)\t\t %.3g s\n", milliseconds/1000, 69 | d_plan->opts.gpu_method); 70 | #endif 71 | // Step 2: FFT 72 | cudaEventRecord(start); 73 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 74 | #ifdef TIME 75 | cudaEventRecord(stop); 76 | cudaEventSynchronize(stop); 77 | cudaEventElapsedTime(&milliseconds, start, stop); 78 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 79 | #endif 80 | 81 | // Step 3: deconvolve and shuffle 82 | cudaEventRecord(start); 83 | CUDECONVOLVE2D(d_plan,blksize); 84 | #ifdef TIME 85 | cudaEventRecord(stop); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&milliseconds, start, stop); 88 | printf("[time ] \tDeconvolve\t\t %.3g s\n", milliseconds/1000); 89 | #endif 90 | } 91 | return ier; 92 | } 93 | 94 | int CUFINUFFT2D2_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 95 | /* 96 | 2D Type-2 NUFFT 97 | 98 | This function is called in "exec" stage (See ../cufinufft.cu). 
99 | It includes (copied from doc in finufft library) 100 | Step 1: deconvolve (amplify) each Fourier mode, dividing by kernel 101 | Fourier coeff 102 | Step 2: compute FFT on uniform mesh 103 | Step 3: interpolate data to regular mesh 104 | 105 | Melody Shih 07/25/19 106 | */ 107 | { 108 | assert(d_plan->spopts.spread_direction == 2); 109 | 110 | cudaEvent_t start, stop; 111 | cudaEventCreate(&start); 112 | cudaEventCreate(&stop); 113 | 114 | cudaEventRecord(start); 115 | int blksize; 116 | int ier; 117 | CUCPX* d_fkstart; 118 | CUCPX* d_cstart; 119 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 120 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 121 | d_plan->maxbatchsize); 122 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 123 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt; 124 | 125 | d_plan->c = d_cstart; 126 | d_plan->fk = d_fkstart; 127 | 128 | // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw 129 | cudaEventRecord(start); 130 | CUDECONVOLVE2D(d_plan,blksize); 131 | #ifdef TIME 132 | float milliseconds = 0; 133 | cudaEventRecord(stop); 134 | cudaEventSynchronize(stop); 135 | cudaEventElapsedTime(&milliseconds, start, stop); 136 | printf("[time ] \tAmplify & Copy fktofw\t %.3g s\n", milliseconds/1000); 137 | #endif 138 | // Step 2: FFT 139 | cudaDeviceSynchronize(); 140 | cudaEventRecord(start); 141 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 142 | #ifdef TIME 143 | cudaEventRecord(stop); 144 | cudaEventSynchronize(stop); 145 | cudaEventElapsedTime(&milliseconds, start, stop); 146 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 147 | #endif 148 | 149 | // Step 3: deconvolve and shuffle 150 | cudaEventRecord(start); 151 | ier = CUINTERP2D(d_plan, blksize); 152 | if(ier != 0 ){ 153 | printf("error: cuinterp2d, method(%d)\n", d_plan->opts.gpu_method); 154 | return ier; 155 | } 156 | #ifdef TIME 157 | cudaEventRecord(stop); 158 | cudaEventSynchronize(stop); 159 | cudaEventElapsedTime(&milliseconds, start, stop); 160 | printf("[time ] \tUnspread (%d)\t\t %.3g s\n", milliseconds/1000, 161 | d_plan->opts.gpu_method); 162 | #endif 163 | } 164 | return ier; 165 | } 166 | 167 | -------------------------------------------------------------------------------- /test/spread2d_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../src/cuspreadinterp.h" 8 | #include "../contrib/utils.h" 9 | 10 | using namespace std; 11 | 12 | int main(int argc, char* argv[]) 13 | { 14 | int nf1, nf2; 15 | FLT upsampfac=2.0; 16 | int N1, N2, M; 17 | if (argc<5) { 18 | fprintf(stderr, 19 | "Usage: spread2d_test method nupts_distr nf1 nf2 [maxsubprobsize [M [tol [kerevalmeth]]]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven,\n" 23 | " 2: sub-problem, or\n" 24 | " 3: sub-problem with Paul's idea.\n" 25 | " nupts_distr: The distribution of the points; one of\n" 26 | " 0: uniform, or\n" 27 | " 1: concentrated in a small region.\n" 28 | " nf1, nf2: The size of the 2D array.\n" 29 | " maxsubprobsize: Maximum size of subproblems (default 65536).\n" 30 | " M: The number of non-uniform points (default nf1 * nf2 / 4).\n" 31 | " tol: NUFFT tolerance (default 1e-6).\n" 32 | " kerevalmeth: Kernel evaluation method; one of\n" 33 | " 0: Exponential of square root (default), or\n" 34 | " 1: Horner evaluation.\n"); 35 | return 1; 36 | } 37 | double w; 38 | int method; 39 
| sscanf(argv[1],"%d",&method); 40 | 41 | int nupts_distribute; 42 | sscanf(argv[2],"%d",&nupts_distribute); 43 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 44 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 45 | 46 | int maxsubprobsize=65536; 47 | if(argc>5){ 48 | sscanf(argv[5],"%d",&maxsubprobsize); 49 | } 50 | 51 | N1 = (int) nf1/upsampfac; 52 | N2 = (int) nf2/upsampfac; 53 | M = N1*N2; 54 | if(argc>6){ 55 | sscanf(argv[6],"%lf",&w); M = (int)w; // so can read 1e6 right! 56 | } 57 | 58 | FLT tol=1e-6; 59 | if(argc>7){ 60 | sscanf(argv[7],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 61 | } 62 | 63 | int kerevalmeth=0; 64 | if(argc>8){ 65 | sscanf(argv[8],"%d",&kerevalmeth); 66 | } 67 | 68 | int ier; 69 | int dim=2; 70 | 71 | CUFINUFFT_PLAN dplan = new CUFINUFFT_PLAN_S; 72 | // Zero out your struct, (sets all pointers to NULL, crucial) 73 | memset(dplan, 0, sizeof(*dplan)); 74 | ier = CUFINUFFT_DEFAULT_OPTS(1, dim, &(dplan->opts)); 75 | 76 | dplan->opts.gpu_method = method; 77 | dplan->opts.gpu_maxsubprobsize = maxsubprobsize; 78 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 79 | dplan->opts.gpu_sort = 1; // ahb changed from 0 80 | dplan->opts.gpu_spreadinterponly = 1; 81 | dplan->opts.gpu_binsizex = 32; //binsize needs to be set here, since 82 | //SETUP_BINSIZE() is not called in 83 | //spread, interp only wrappers. 84 | dplan->opts.gpu_binsizey = 32; 85 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 86 | 87 | cout<opts.gpu_method,M,nf1*nf2,t,M/t); 144 | 145 | checkCudaErrors(cudaMemcpy(fw,d_fw,nf1*nf2*sizeof(CUCPX), 146 | cudaMemcpyDeviceToHost)); 147 | #ifdef RESULT 148 | cout<<"[result-input]"<opts.gpu_binsizey == 0) 151 | printf("\n"); 152 | for (int i=0; iopts.gpu_binsizex == 0 && i!=0) 154 | printf(" |"); 155 | printf(" (%2.3g,%2.3g)",fw[i+j*nf1].real(),fw[i+j*nf1].imag() ); 156 | } 157 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, M, N; 16 | if (argc<3) { 17 | fprintf(stderr, 18 | "Usage: cufinufft1d1_test method N1 [M [tol]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem\n" 23 | " N1: The size of the 1D array.\n" 24 | " M: The number of non-uniform points (default N1).\n" 25 | " tol: NUFFT tolerance (default 1e-6).\n"); 26 | return 1; 27 | } 28 | double w; 29 | int method; 30 | sscanf(argv[1],"%d",&method); 31 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 32 | N = N1; 33 | M = N1;// let density always be 1 34 | if(argc>3){ 35 | sscanf(argv[3],"%lf",&w); M = (int)w; // so can read 1e6 right! 36 | } 37 | 38 | FLT tol=1e-6; 39 | if(argc>4){ 40 | sscanf(argv[4],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
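	// Reading into a double with %lf and casting, as above, is what lets
	// these testers accept scientific notation: sscanf("1e6", "%d", &n)
	// stops at the 'e' and leaves n = 1, whereas "%lf" parses the full
	// 1000000.0 and the cast recovers the intended integer.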
41 | } 42 | int iflag=1; 43 | 44 | 45 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | #include "../src/common.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int nf1; 17 | if (argc<2) { 18 | fprintf(stderr, 19 | "Usage: onedim_fseries_kernel_test nf1 [dim [tol [gpuversion [nf2 [nf3]]]]]\n" 20 | "Arguments:\n" 21 | " nf1: The size of the upsampled fine grid size in x.\n" 22 | " dim: Dimension of the nuFFT.\n" 23 | " tol: NUFFT tolerance (default 1e-6).\n" 24 | " gpuversion: Use gpu version or not (default True).\n" 25 | " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" 26 | " nf3: The size of the upsampled fine grid size in z. (default nf3)\n" 27 | ); 28 | return 1; 29 | } 30 | double w; 31 | sscanf(argv[1],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 32 | int dim = 1; 33 | if (argc > 2) 34 | sscanf(argv[2],"%d",&dim); 35 | FLT eps = 1e-6; 36 | if (argc > 3) 37 | sscanf(argv[3],"%lf",&w); eps = (FLT)w; 38 | int gpu = 1; 39 | if (argc > 4) 40 | sscanf(argv[4],"%d",&gpu); 41 | 42 | int nf2=nf1; 43 | if (argc > 5) 44 | sscanf(argv[5],"%lf",&w); nf2 = (int)w; 45 | int nf3=nf1; 46 | if (argc > 6) 47 | sscanf(argv[6],"%lf",&w); nf3 = (int)w; 48 | 49 | SPREAD_OPTS opts; 50 | FLT *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; 51 | FLT *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; 52 | checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(FLT)*(nf1/2+1))); 53 | if(dim > 1) 54 | checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(FLT)*(nf2/2+1))); 55 | if(dim > 2) 56 | checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(FLT)*(nf3/2+1))); 57 | 58 | int ier = setup_spreader(opts, eps, 2.0, 0); 59 | 60 | cudaEvent_t start, stop; 61 | cudaEventCreate(&start); 62 | cudaEventCreate(&stop); 63 | 64 | float milliseconds = 0; 65 | float gputime = 0; 66 | float cputime = 0; 67 | 68 | CNTime timer; 69 | if( !gpu ) { 70 | timer.start(); 71 | fwkerhalf1 = (FLT*)malloc(sizeof(FLT)*(nf1/2+1)); 72 | if(dim > 1) 73 | fwkerhalf2 = (FLT*)malloc(sizeof(FLT)*(nf2/2+1)); 74 | if(dim > 2) 75 | fwkerhalf3 = (FLT*)malloc(sizeof(FLT)*(nf3/2+1)); 76 | 77 | onedim_fseries_kernel(nf1, fwkerhalf1, opts); 78 | if(dim > 1) 79 | onedim_fseries_kernel(nf2, fwkerhalf2, opts); 80 | if(dim > 2) 81 | onedim_fseries_kernel(nf3, fwkerhalf3, opts); 82 | cputime = timer.elapsedsec(); 83 | cudaEventRecord(start); 84 | { 85 | checkCudaErrors(cudaMemcpy(d_fwkerhalf1,fwkerhalf1, 86 | sizeof(FLT)*(nf1/2+1),cudaMemcpyHostToDevice)); 87 | if(dim > 1) 88 | checkCudaErrors(cudaMemcpy(d_fwkerhalf2,fwkerhalf2, 89 | sizeof(FLT)*(nf2/2+1),cudaMemcpyHostToDevice)); 90 | if(dim > 2) 91 | checkCudaErrors(cudaMemcpy(d_fwkerhalf3,fwkerhalf3, 92 | sizeof(FLT)*(nf3/2+1),cudaMemcpyHostToDevice)); 93 | } 94 | cudaEventRecord(stop); 95 | cudaEventSynchronize(stop); 96 | cudaEventElapsedTime(&milliseconds, start, stop); 97 | gputime = milliseconds; 98 | printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", 99 | dim, nf1, opts.nspread, gputime+cputime*1000); 100 | free(fwkerhalf1); 101 | if(dim > 1) 102 | free(fwkerhalf2); 103 | if(dim > 2) 104 | free(fwkerhalf3); 105 | } else { 106 | timer.start(); 107 | complex a[dim*MAX_NQUAD]; 108 | FLT f[dim*MAX_NQUAD]; 109 | onedim_fseries_kernel_precomp(nf1, f, a, opts); 110 | if(dim > 1) 111 | onedim_fseries_kernel_precomp(nf2, f+MAX_NQUAD, a+MAX_NQUAD, opts); 112 | if(dim > 2) 113 | onedim_fseries_kernel_precomp(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, opts); 114 | cputime = timer.elapsedsec(); 115 | 116 | 
cuDoubleComplex *d_a; 117 | FLT *d_f; 118 | cudaEventRecord(start); 119 | { 120 | checkCudaErrors(cudaMalloc(&d_a, dim*MAX_NQUAD*sizeof(cuDoubleComplex))); 121 | checkCudaErrors(cudaMalloc(&d_f, dim*MAX_NQUAD*sizeof(FLT))); 122 | checkCudaErrors(cudaMemcpy(d_a,a, 123 | dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); 124 | checkCudaErrors(cudaMemcpy(d_f,f, 125 | dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice)); 126 | ier = CUFSERIESKERNELCOMPUTE(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, 127 | d_fwkerhalf2, d_fwkerhalf3, opts.nspread); 128 | } 129 | cudaEventRecord(stop); 130 | cudaEventSynchronize(stop); 131 | cudaEventElapsedTime(&milliseconds, start, stop); 132 | gputime = milliseconds; 133 | printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", 134 | dim, nf1, opts.nspread, gputime+cputime*1000); 135 | cudaFree(d_a); 136 | cudaFree(d_f); 137 | } 138 | 139 | #ifdef RESULT 140 | fwkerhalf1 = (FLT*)malloc(sizeof(FLT)*(nf1/2+1)); 141 | if(dim > 1) 142 | fwkerhalf2 = (FLT*)malloc(sizeof(FLT)*(nf2/2+1)); 143 | if(dim > 2) 144 | fwkerhalf3 = (FLT*)malloc(sizeof(FLT)*(nf3/2+1)); 145 | 146 | checkCudaErrors(cudaMemcpy(fwkerhalf1,d_fwkerhalf1,sizeof(FLT)*(nf1/2+1),cudaMemcpyDeviceToHost)); 147 | if(dim > 1) 148 | checkCudaErrors(cudaMemcpy(fwkerhalf2,d_fwkerhalf2,sizeof(FLT)*(nf2/2+1),cudaMemcpyDeviceToHost)); 149 | if(dim > 2) 150 | checkCudaErrors(cudaMemcpy(fwkerhalf3,d_fwkerhalf3,sizeof(FLT)*(nf3/2+1),cudaMemcpyDeviceToHost)); 151 | for(int i=0; i 1) 155 | for(int i=0; i 2) 159 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | #include "../contrib/utils.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1, M; 17 | if (argc<3) { 18 | fprintf(stderr, 19 | "Usage: cufinufft2d2_test method N1 [M [tol]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven\n" 23 | " N1: The size of the 1D array.\n" 24 | " M: The number of non-uniform points (default N1).\n" 25 | " tol: NUFFT tolerance (default 1e-6).\n"); 26 | return 1; 27 | } 28 | double w; 29 | int method; 30 | sscanf(argv[1],"%d",&method); 31 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 32 | M = N1;// let density always be 1 33 | if(argc>3){ 34 | sscanf(argv[3],"%lf",&w); M = (int)w; // so can read 1e6 right! 35 | } 36 | 37 | FLT tol=1e-6; 38 | if(argc>4){ 39 | sscanf(argv[4],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
40 | } 41 | int iflag=1; 42 | 43 | 44 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1, nf2, nf3; 14 | FLT sigma = 2.0; 15 | int N1, N2, N3, M; 16 | if (argc<5) { 17 | fprintf(stderr, 18 | "Usage: interp3d method nupts_distr nf1 nf2 nf3 [M [tol [sort]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem.\n" 23 | " nupts_distr: The distribution of the points; one of\n" 24 | " 0: uniform, or\n" 25 | " 1: concentrated in a small region.\n" 26 | " nf1, nf2, nf3: The size of the 3D array.\n" 27 | " M: The number of non-uniform points (default nf1 * nf2 * nf3 / 8).\n" 28 | " tol: NUFFT tolerance (default 1e-6).\n" 29 | " sort: One of\n" 30 | " 0: do not sort the points, or\n" 31 | " 1: sort the points (default).\n"); 32 | return 1; 33 | } 34 | double w; 35 | int method; 36 | sscanf(argv[1],"%d",&method); 37 | int nupts_distribute; 38 | sscanf(argv[2],"%d",&nupts_distribute); 39 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 40 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 41 | sscanf(argv[5],"%lf",&w); nf3 = (int)w; // so can read 1e6 right! 42 | 43 | N1 = (int) nf1/sigma; 44 | N2 = (int) nf2/sigma; 45 | N3 = (int) nf3/sigma; 46 | M = N1*N2*N3;// let density always be 1 47 | if(argc>6){ 48 | sscanf(argv[6],"%lf",&w); M = (int)w; // so can read 1e6 right! 49 | if(M == 0) M=N1*N2*N3; 50 | } 51 | 52 | FLT tol=1e-6; 53 | if(argc>7){ 54 | sscanf(argv[7],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 55 | } 56 | 57 | int sort=1; 58 | if(argc>8){ 59 | sscanf(argv[8],"%d",&sort); 60 | } 61 | int ier; 62 | 63 | int ns=std::ceil(-log10(tol/10.0)); 64 | 65 | cout<opts)); 90 | dplan->opts.gpu_method = method; 91 | dplan->opts.gpu_maxsubprobsize = 1024; 92 | dplan->opts.gpu_kerevalmeth = 0; // not in cmd-line args 93 | dplan->opts.gpu_sort = sort; 94 | dplan->opts.gpu_spreadinterponly = 1; 95 | 96 | //binsize needs to be set here, since SETUP_BINSIZE() is not called in spread, 97 | //interp only wrappers. 
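	//As a rough picture (an inference from these settings, not taken from
	//SETUP_BINSIZE itself): the fine grid is tiled by bins of the sizes set
	//below, so a 128 x 128 x 64 grid with the 16 x 16 x 2 bins used here
	//splits into ceil(128/16)*ceil(128/16)*ceil(64/2) = 8*8*32 = 2048 bins
	//for the point-sorting / sub-problem bookkeeping.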
98 | if(dplan->opts.gpu_method == 1) 99 | { 100 | dplan->opts.gpu_binsizex=16; 101 | dplan->opts.gpu_binsizey=16; 102 | dplan->opts.gpu_binsizez=2; 103 | } 104 | if(dplan->opts.gpu_method == 2) 105 | { 106 | dplan->opts.gpu_binsizex=16; 107 | dplan->opts.gpu_binsizey=16; 108 | dplan->opts.gpu_binsizez=2; 109 | } 110 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 111 | 112 | switch(nupts_distribute){ 113 | // Making data 114 | case 0: //uniform 115 | { 116 | for (int i = 0; i < M; i++) { 117 | x[i] = M_PI*randm11();// x in [-pi,pi) 118 | y[i] = M_PI*randm11(); 119 | z[i] = M_PI*randm11(); 120 | //cout << x[i] << "," << y[i] << "," << z[i] << endl; 121 | } 122 | } 123 | break; 124 | case 1: // concentrate on a small region 125 | { 126 | for (int i = 0; i < M; i++) { 127 | x[i] = M_PI*rand01()/(nf1*2/32);// x in [-pi,pi) 128 | y[i] = M_PI*rand01()/(nf2*2/32); 129 | z[i] = M_PI*rand01()/(nf3*2/32); 130 | } 131 | } 132 | break; 133 | default: 134 | cerr<<"error: nupts distr should be 0,1" << endl; 135 | return 1; 136 | } 137 | for(int i=0; iopts.gpu_method,nf1*nf2*nf3,M,t,M/t); 157 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(CUCPX),cudaMemcpyDeviceToHost)); 158 | #ifdef RESULT 159 | cout<<"[result-input]"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, N2, M, N; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: cufinufft2d1_test method N1 N2 [M [tol]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 3: sub-problem with Paul's idea.\n" 24 | " N1, N2: The size of the 2D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | N = N1*N2; 35 | M = N1*N2;// let density always be 1 36 | if(argc>4){ 37 | sscanf(argv[4],"%lf",&w); M = (int)w; // so can read 1e6 right! 38 | } 39 | 40 | FLT tol=1e-6; 41 | if(argc>5){ 42 | sscanf(argv[5],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 43 | } 44 | int iflag=1; 45 | 46 | 47 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | #include "../contrib/utils.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1, N2, M; 17 | if (argc<4) { 18 | fprintf(stderr, 19 | "Usage: cufinufft2d2_test method N1 N2 [M [tol]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven, or\n" 23 | " 2: sub-problem.\n" 24 | " N1, N2: The size of the 2D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | M = N1*N2;// let density always be 1 35 | if(argc>4){ 36 | sscanf(argv[4],"%lf",&w); M = (int)w; // so can read 1e6 right! 37 | } 38 | 39 | FLT tol=1e-6; 40 | if(argc>5){ 41 | sscanf(argv[5],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
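	// The tolerance set here is what setup_spreader_for_nufft eventually
	// turns into a kernel width, roughly ns = ceil(-log10(tol/10)) as used
	// in test/interp3d_test.cu; so tol = 1e-6 gives a width-7 kernel and
	// tol = 1e-3 a width-4 one, trading accuracy against spreading cost.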
42 | } 43 | int iflag=1; 44 | 45 | 46 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, N2, N3, M, N; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: cufinufft3d1_test method N1 N2 N3 [M [tol]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 4: block gather.\n" 24 | " N1, N2, N3: The size of the 3D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2 * N3).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | sscanf(argv[4],"%lf",&w); N3 = (int)w; // so can read 1e6 right! 35 | 36 | M = N1*N2*N3;// let density always be 1 37 | if(argc>5){ 38 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 39 | } 40 | 41 | FLT tol=1e-6; 42 | if(argc>6){ 43 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 44 | } 45 | int iflag=1; 46 | 47 | 48 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, N2, M, N, ntransf, maxbatchsize; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: cufinufft2d1many_test method N1 N2 [ntransf [maxbatchsize [M [tol]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 3: sub-problem with Paul's idea.\n" 24 | " N1, N2: The size of the 2D array.\n" 25 | " ntransf: Number of inputs (default 2 ^ 27 / (N1 * N2)).\n" 26 | " maxbatchsize: Number of simultaneous transforms (or 0 for default).\n" 27 | " M: The number of non-uniform points (default N1 * N2).\n" 28 | " tol: NUFFT tolerance (default 1e-6).\n"); 29 | return 1; 30 | } 31 | double w; 32 | int method; 33 | sscanf(argv[1],"%d",&method); 34 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 35 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 36 | N = N1*N2; 37 | M = N1*N2*2;// let density always be 2 38 | ntransf = pow(2,28)/M; 39 | if(argc>4){ 40 | sscanf(argv[4],"%d",&ntransf); 41 | } 42 | maxbatchsize = 0; // default (cufinufft chooses) 43 | if(argc>5){ 44 | sscanf(argv[5],"%d",&maxbatchsize); 45 | } 46 | 47 | if(argc>6){ 48 | sscanf(argv[6],"%lf",&w); M = (int)w; // so can read 1e6 right! 49 | } 50 | 51 | FLT tol=1e-6; 52 | if(argc>7){ 53 | sscanf(argv[7],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
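	// Sanity check on the defaults above: with density 2 the code sets
	// M = 2*N1*N2 and ntransf = 2^28/M, which matches the 2^27/(N1*N2)
	// quoted in the usage string; e.g. N1 = N2 = 256 gives M = 131072
	// nonuniform points and ntransf = 2048 stacked transforms.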
54 | } 55 | int iflag=1; 56 | 57 | 58 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1, nf2, nf3; 14 | FLT sigma = 2.0; 15 | int N1, N2, N3, M; 16 | if (argc<6) { 17 | fprintf(stderr, 18 | "Usage: spread3d_test method nupts_distr nf1 nf2 nf3 [maxsubprobsize [M [tol [kerevalmeth [sort]]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 4: block gather (each nf must be multiple of 8).\n" 24 | " nupts_distr: The distribution of the points; one of\n" 25 | " 0: uniform, or\n" 26 | " 1: concentrated in a small region.\n" 27 | " nf1, nf2, nf3: The size of the 3D array.\n" 28 | " maxsubprobsize: Maximum size of subproblems (default 65536).\n" 29 | " M: The number of non-uniform points (default nf1 * nf2 * nf3 / 8).\n" 30 | " tol: NUFFT tolerance (default 1e-6).\n" 31 | " kerevalmeth: Kernel evaluation method; one of\n" 32 | " 0: Exponential of square root (default), or\n" 33 | " 1: Horner evaluation.\n" 34 | " sort: One of\n" 35 | " 0: do not sort the points, or\n" 36 | " 1: sort the points (default).\n"); 37 | return 1; 38 | } 39 | double w; 40 | int method; 41 | sscanf(argv[1],"%d",&method); 42 | int nupts_distribute; 43 | sscanf(argv[2],"%d",&nupts_distribute); 44 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 45 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 46 | sscanf(argv[5],"%lf",&w); nf3 = (int)w; // so can read 1e6 right! 47 | 48 | int maxsubprobsize=1024; 49 | if(argc>6){ 50 | sscanf(argv[6],"%d",&maxsubprobsize); 51 | } 52 | N1 = (int) nf1/sigma; 53 | N2 = (int) nf2/sigma; 54 | N3 = (int) nf3/sigma; 55 | M = N1*N2*N3;// let density always be 1 56 | if(argc>7){ 57 | sscanf(argv[7],"%lf",&w); M = (int)w; // so can read 1e6 right! 58 | //if(M == 0) M=N1*N2; 59 | } 60 | 61 | FLT tol=1e-6; 62 | if(argc>8){ 63 | sscanf(argv[8],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 64 | } 65 | 66 | int kerevalmeth=0; 67 | if(argc>9){ 68 | sscanf(argv[9],"%d",&kerevalmeth); 69 | } 70 | 71 | int sort=1; 72 | if(argc>10){ 73 | sscanf(argv[10],"%d",&sort); 74 | } 75 | 76 | int ier; 77 | FLT *x, *y, *z; 78 | CPX *c, *fw; 79 | cudaMallocHost(&x, M*sizeof(FLT)); 80 | cudaMallocHost(&y, M*sizeof(FLT)); 81 | cudaMallocHost(&z, M*sizeof(FLT)); 82 | cudaMallocHost(&c, M*sizeof(CPX)); 83 | cudaMallocHost(&fw,nf1*nf2*nf3*sizeof(CPX)); 84 | 85 | FLT *d_x, *d_y, *d_z; 86 | CUCPX *d_c, *d_fw; 87 | checkCudaErrors(cudaMalloc(&d_x,M*sizeof(FLT))); 88 | checkCudaErrors(cudaMalloc(&d_y,M*sizeof(FLT))); 89 | checkCudaErrors(cudaMalloc(&d_z,M*sizeof(FLT))); 90 | checkCudaErrors(cudaMalloc(&d_c,M*sizeof(CUCPX))); 91 | checkCudaErrors(cudaMalloc(&d_fw,nf1*nf2*nf3*sizeof(CUCPX))); 92 | 93 | int dim=3; 94 | CUFINUFFT_PLAN dplan = new CUFINUFFT_PLAN_S; 95 | // Zero out your struct, (sets all pointers to NULL, crucial) 96 | memset(dplan, 0, sizeof(*dplan)); 97 | ier = CUFINUFFT_DEFAULT_OPTS(1, dim, &(dplan->opts)); 98 | 99 | dplan->opts.gpu_method =method; 100 | dplan->opts.gpu_maxsubprobsize =maxsubprobsize; 101 | dplan->opts.gpu_kerevalmeth =kerevalmeth; 102 | dplan->opts.gpu_sort =sort; 103 | dplan->opts.gpu_spreadinterponly=1; 104 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 105 | 106 | //binsize, obinsize need to be set here, since SETUP_BINSIZE() is not 107 | //called in spread, interp only wrappers. 
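	//A reading of the method-4 (block gather) parameters below, inferred
	//from this setup rather than from the spreader source: the grid is first
	//cut into outer bins of gpu_obinsize (8x8x8), each subdivided into
	//gpu_binsize (4x4x4) cells, giving 2x2x2 = 8 inner bins per outer bin;
	//this is also why the usage text asks for every nf to be a multiple of 8.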
108 | if(dplan->opts.gpu_method == 4) 109 | { 110 | dplan->opts.gpu_binsizex=4; 111 | dplan->opts.gpu_binsizey=4; 112 | dplan->opts.gpu_binsizez=4; 113 | dplan->opts.gpu_obinsizex=8; 114 | dplan->opts.gpu_obinsizey=8; 115 | dplan->opts.gpu_obinsizez=8; 116 | dplan->opts.gpu_maxsubprobsize=maxsubprobsize; 117 | } 118 | if(dplan->opts.gpu_method == 2) 119 | { 120 | dplan->opts.gpu_binsizex=16; 121 | dplan->opts.gpu_binsizey=16; 122 | dplan->opts.gpu_binsizez=2; 123 | dplan->opts.gpu_maxsubprobsize=maxsubprobsize; 124 | } 125 | if(dplan->opts.gpu_method == 1) 126 | { 127 | dplan->opts.gpu_binsizex=16; 128 | dplan->opts.gpu_binsizey=16; 129 | dplan->opts.gpu_binsizez=2; 130 | } 131 | 132 | cout<opts.gpu_method,M,nf1*nf2*nf3,t,M/t); 187 | #ifdef RESULT 188 | cout<<"[result-input]"<opts.gpu_binsizex == 0 && i!=0) 193 | printf(" |"); 194 | printf(" (%2.3g,%2.3g)",fw[i+j*nf1+k*nf2*nf1].real(), 195 | fw[i+j*nf1+k*nf2*nf1].imag() ); 196 | } 197 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "../contrib/utils.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1, N2, N3, M; 17 | if (argc<4) { 18 | fprintf(stderr, 19 | "Usage: cufinufft3d2_test method N1 N2 N3 [M [tol]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven, or\n" 23 | " 2: sub-problem.\n" 24 | " N1, N2, N3: The size of the 3D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2 * N3).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | sscanf(argv[4],"%lf",&w); N3 = (int)w; // so can read 1e6 right! 35 | M = N1*N2*N3;// let density always be 1 36 | if(argc>5){ 37 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 38 | } 39 | 40 | FLT tol=1e-6; 41 | if(argc>6){ 42 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 43 | } 44 | int iflag=1; 45 | 46 | 47 | cout<
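The 3D type-2 tester above drives the plan/exec interface; the quantity it
approximates at each nonuniform point is the defining sum
c[j] = sum_{k1,k2,k3} fk[k1,k2,k3] * exp(i*iflag*(k1*x[j] + k2*y[j] + k3*z[j])).
A minimal, self-contained reference evaluation of that sum, handy for
spot-checking a few outputs, is sketched below in plain C++ (std::complex
rather than the repo's FLT/CPX macros; the mode ordering and any scaling
convention should be confirmed against the library documentation before using
it as a comparison):

// Naive O(N1*N2*N3*M) evaluation of the 3D type-2 sum, tiny sizes only.
#include <cmath>
#include <complex>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const int N1 = 4, N2 = 4, N3 = 4, M = 10, iflag = 1;
    std::mt19937 rng(0);
    std::uniform_real_distribution<double> uni(-M_PI, M_PI);

    std::vector<double> x(M), y(M), z(M);           // points in [-pi, pi)
    for (int j = 0; j < M; ++j) { x[j] = uni(rng); y[j] = uni(rng); z[j] = uni(rng); }

    // Fourier coefficients, k1 fastest, frequencies -N/2 .. N/2-1 per axis.
    std::vector<std::complex<double>> fk(N1 * N2 * N3);
    for (auto &f : fk) f = {uni(rng) / M_PI, uni(rng) / M_PI};

    std::vector<std::complex<double>> c(M);
    for (int j = 0; j < M; ++j) {
        std::complex<double> acc(0.0, 0.0);
        for (int k3 = -N3/2; k3 < N3/2; ++k3)
            for (int k2 = -N2/2; k2 < N2/2; ++k2)
                for (int k1 = -N1/2; k1 < N1/2; ++k1) {
                    double phase = iflag * (k1*x[j] + k2*y[j] + k3*z[j]);
                    int idx = (k1 + N1/2) + N1*((k2 + N2/2) + N2*(k3 + N3/2));
                    acc += fk[idx] * std::exp(std::complex<double>(0.0, phase));
                }
        c[j] = acc;
    }
    printf("c[0] = (%.6g, %.6g)\n", c[0].real(), c[0].imag());
    return 0;
}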