├── .gitmodules ├── sites ├── make.inc.olcf_summit ├── make.inc.nersc_cgpu ├── make.inc.CIMS ├── make.inc.nersc_cori ├── make.inc.nersc_perlmutter └── make.inc.FI ├── python └── cufinufft │ ├── requirements.txt │ ├── __init__.py │ ├── docs │ ├── index.rst │ ├── Makefile │ ├── make.bat │ └── conf.py │ ├── tests │ ├── test_examples.py │ ├── test_multi.py │ ├── utils.py │ ├── test_error_checks.py │ └── test_basic.py │ ├── README.md │ └── _cufinufft.py ├── docs ├── logo.png └── cufinufft_announce.png ├── MANIFEST.in ├── targets ├── make.inc.manylinux └── make.inc.power9 ├── contrib ├── legendre_rule_fast.h ├── legendre_rule_fast.license ├── dirft.h ├── README ├── common.h ├── dataTypes.h ├── utils.cpp ├── spreadinterp.h ├── utils.h ├── utils_fp.cpp ├── utils_fp.h ├── dirft2d.cpp ├── spreadinterp.cpp └── common.cpp ├── .gitignore ├── ci ├── docker │ ├── cuda10.1 │ │ ├── cuda.repo │ │ ├── README │ │ ├── CentOS-SCLo-scl-rh.repo │ │ ├── CentOS-SCLo-scl.repo │ │ ├── vault.repo │ │ └── Dockerfile-x86_64 │ ├── cuda11.0 │ │ ├── cuda.repo │ │ ├── README │ │ └── Dockerfile-x86_64 │ └── cuda10.1-manylinux2014 │ │ ├── cuda.repo │ │ ├── README │ │ └── Dockerfile-x86_64 ├── build-wheels.sh └── distribution_helper.sh ├── include ├── profile.h ├── cufinufft.h ├── utils.h ├── cufinufft_opts.h └── cufinufft_errors.h ├── src ├── common.h ├── 1d │ ├── README │ ├── interp1d_wrapper.cu │ └── cufinufft1d.cu ├── 2d │ ├── README │ └── cufinufft2d.cu ├── 3d │ ├── README │ └── cufinufft3d.cu ├── memtransfer.h ├── README ├── profile.cu ├── cudeconvolve.h ├── common.cu └── precision_independent.h ├── test ├── fseriesperf.sh ├── spreadperf.sh ├── cufinufft2d2api_test.cu ├── cufinufft2d2api_test_32.cu ├── interp1d_test.cu ├── spread1d_test.cu ├── interp2d_test.cu ├── spread2d_test.cu ├── cufinufft1d1_test.cu ├── fseries_kernel_test.cu ├── cufinufft1d2_test.cu ├── interp3d_test.cu ├── cufinufft2d1_test.cu ├── cufinufft2d2_test.cu ├── cufinufft3d1_test.cu ├── cufinufft2d1many_test.cu ├── spread3d_test.cu └── cufinufft3d2_test.cu ├── examples ├── README ├── example2d2many.py ├── example2d1many.py ├── example2d1many.cpp └── example2d2many.cpp ├── .bumpversion.cfg ├── Jenkinsfile ├── LICENSE ├── setup.py └── CHANGELOG /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sites/make.inc.olcf_summit: -------------------------------------------------------------------------------- 1 | target:=power9 2 | -------------------------------------------------------------------------------- /python/cufinufft/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pycuda 3 | six 4 | -------------------------------------------------------------------------------- /sites/make.inc.nersc_cgpu: -------------------------------------------------------------------------------- 1 | NVCC_STUBS := $(CUDA_ROOT)/lib64/stubs 2 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/cufinufft/HEAD/docs/logo.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include python/cufinufft/README.md 2 | include python/cufinufft/requirements.txt 3 | 
-------------------------------------------------------------------------------- /docs/cufinufft_announce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatironinstitute/cufinufft/HEAD/docs/cufinufft_announce.png -------------------------------------------------------------------------------- /targets/make.inc.manylinux: -------------------------------------------------------------------------------- 1 | CFLAGS = -fPIC -O3 -funroll-loops -march=x86-64 -mtune=generic -msse4 -fcx-limited-range 2 | -------------------------------------------------------------------------------- /python/cufinufft/__init__.py: -------------------------------------------------------------------------------- 1 | from cufinufft.cufinufft import cufinufft 2 | 3 | __all__ = ['cufinufft'] 4 | __version__ = '1.3' 5 | -------------------------------------------------------------------------------- /contrib/legendre_rule_fast.h: -------------------------------------------------------------------------------- 1 | #ifndef GAUSSQUAD_H 2 | #define GAUSSQUAD_H 3 | 4 | void legendre_compute_glr ( int n, double x[], double w[] ); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | __pycache__ 3 | bin 4 | lib 5 | lib-static 6 | 7 | python/cufinufft/docs/_build 8 | python/cufinufft/docs/_static 9 | python/cufinufft/docs/_templates 10 | -------------------------------------------------------------------------------- /sites/make.inc.CIMS: -------------------------------------------------------------------------------- 1 | CUDA_ROOT=/usr/local/stow/cuda-10.0 2 | INC=-I$(CUDA_ROOT)/include \ 3 | -I$(CUDA_ROOT)/samples/common/inc 4 | NVCC_LIBS_PATH=-L$(CUDA_ROOT)/lib64 5 | NVARCH=-arch=sm_70 6 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/cuda.repo: -------------------------------------------------------------------------------- 1 | [cuda] 2 | name=cuda 3 | baseurl=http://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA 7 | -------------------------------------------------------------------------------- /ci/docker/cuda11.0/cuda.repo: -------------------------------------------------------------------------------- 1 | [cuda] 2 | name=cuda 3 | baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA 7 | -------------------------------------------------------------------------------- /sites/make.inc.nersc_cori: -------------------------------------------------------------------------------- 1 | CC := $(shell which cc) 2 | CXX := $(shell which CC) 3 | 4 | $(info detected compiler wrappers:) 5 | $(info CC = $(CC)) 6 | $(info CXX = $(CXX)) 7 | 8 | NVCC_STUBS := $(CUDA_ROOT)/lib64/stubs 9 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1-manylinux2014/cuda.repo: -------------------------------------------------------------------------------- 1 | [cuda] 2 | name=cuda 3 | baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64 4 | enabled=1 5 | gpgcheck=1 6 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA 7 | -------------------------------------------------------------------------------- 
/ci/docker/cuda10.1/README: -------------------------------------------------------------------------------- 1 | This configuration is based off of manylinux2010, 2 | which is itself based off of centos6. 3 | 4 | I have extended manylinux with a compatible CUDA 5 | toolkit and runtime environment suitable for 6 | both building and running code inside docker. 7 | -------------------------------------------------------------------------------- /ci/docker/cuda11.0/README: -------------------------------------------------------------------------------- 1 | This configuration is based off of manylinux2014, 2 | which is itself based off of centos8. 3 | 4 | I have extended manylinux with a compatible CUDA 5 | toolkit and runtime environment suitable for 6 | both building and running code inside docker. 7 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1-manylinux2014/README: -------------------------------------------------------------------------------- 1 | This configuration is based off of manylinux2014, 2 | which is itself based off of centos8. 3 | 4 | I have extended manylinux with a compatible CUDA 5 | toolkit and runtime environment suitable for 6 | both building and running code inside docker. 7 | -------------------------------------------------------------------------------- /targets/make.inc.power9: -------------------------------------------------------------------------------- 1 | # -march is not always supported when compiling power9 targets, so we use `mcpu` and `mtune` instead. 2 | CFLAGS := -fPIC -O3 -funroll-loops -g 3 | CXXFLAGS := -fPIC -O3 -funroll-loops -mcpu=native -mtune=native -g -std=c++11 4 | 5 | # All power9 systems so far have had recent GPU hardware. 6 | NVARCH := -arch=sm_70 7 | -------------------------------------------------------------------------------- /contrib/legendre_rule_fast.license: -------------------------------------------------------------------------------- 1 | LICENSE info for legendre_rule_fast.c ONLY: 2 | 3 | According to 4 | https://people.sc.fsu.edu/~jburkardt/c_src/legendre_rule_fast/legendre_rule_fast.html 5 | 6 | The computer code and data files described and made available on this web page are distributed under the GNU LGPL license: 7 | 8 | https://www.gnu.org/licenses/lgpl-3.0.en.html 9 | -------------------------------------------------------------------------------- /python/cufinufft/docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to cufinufft's Python documentation! 2 | ============================================== 3 | 4 | .. automodule:: cufinufft 5 | :members: 6 | :member-order: bysource 7 | 8 | 9 | ..
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | -------------------------------------------------------------------------------- /include/profile.h: -------------------------------------------------------------------------------- 1 | #ifndef PROFILE_H 2 | #define PROFILE_H 3 | 4 | #include 5 | 6 | class CudaTracer { 7 | public: 8 | CudaTracer(const char* name, int cid = 0); 9 | ~CudaTracer(); 10 | }; 11 | 12 | 13 | 14 | #define PROFILE_CUDA(fname) CudaTracer uniq_name_using_macros__(fname); 15 | #define PROFILE_CUDA_GROUP(fname, groupid) CudaTracer uniq_name_using_macros__(fname, groupid); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | #include 4 | 5 | __global__ 6 | void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns); 7 | 8 | int CUFSERIESKERNELCOMPUTE(int dim, int nf1, int nf2, int nf3, FLT *d_f, cuDoubleComplex *d_a, FLT *d_fwkerhalf1, FLT *d_fwkerhalf2, FLT *d_fwkerhalf3, int ns); 9 | #endif 10 | -------------------------------------------------------------------------------- /src/1d/README: -------------------------------------------------------------------------------- 1 | - cufinufft1d.cu 2 | This file contains the execution functions 1d type 1,2 that are called in ../cufinufft.cu 3 | 4 | - spreadinterp1d.cu 5 | This file contains all the GPU kernels for 1d spreading, interpolation. 6 | 7 | - interp1d_wrapper.cu 8 | Wrappers for 1d interpolations. One method is implemented: 9 | (1) nonuniform driven, 10 | 11 | - spread1d_wrapper.cu 12 | Wrappers for 1d spreading. Two methods are implemented: 13 | (1) nonuniform driven, 14 | (2) subproblem 15 | -------------------------------------------------------------------------------- /sites/make.inc.nersc_perlmutter: -------------------------------------------------------------------------------- 1 | CC := $(shell which cc) 2 | CXX := $(shell which CC) 3 | 4 | $(info detected compiler wrappers:) 5 | $(info CC = $(CC)) 6 | $(info CXX = $(CXX)) 7 | 8 | 9 | CUDA_ROOT := $(CUDATOOLKIT_HOME) 10 | NVARCH := -arch=sm_80 \ 11 | -gencode=arch=compute_70,code=sm_70 \ 12 | -gencode=arch=compute_75,code=sm_75 \ 13 | -gencode=arch=compute_80,code=sm_80 \ 14 | -gencode=arch=compute_80,code=sm_80 \ 15 | -gencode=arch=compute_86,code=compute_86 16 | 17 | NVCC_STUBS := $(CUDA_ROOT)/lib64/stubs 18 | -------------------------------------------------------------------------------- /src/2d/README: -------------------------------------------------------------------------------- 1 | - cufinufft2d.cu 2 | This file contains the execution functions 2d type 1,2 that are called in ../cufinufft.cu 3 | 4 | - spreadinterp2d.cu 5 | This file contains all the GPU kernels for 2d spreading, interpolation. 6 | 7 | - interp2d_wrapper.cu 8 | Wrappers for 2d interpolations. Two methods are implemented: 9 | (1) nonuniform driven, 10 | (2) subproblem 11 | 12 | - spread2d_wrapper.cu 13 | Wrappers for 2d spreading. 
Three methods are implemented: 14 | (1) nonuniform driven, 15 | (2) subproblem, 16 | (3) paul's idea 17 | -------------------------------------------------------------------------------- /src/3d/README: -------------------------------------------------------------------------------- 1 | - cufinufft3d.cu 2 | This file contains the execution functions for 3d type1,2 that are called in ../cufinufft.cu 3 | 4 | - spreadinterp3d.cu 5 | This file contains all the GPU kernels for 3d spreading, interpolation. 6 | 7 | - interp3d_wrapper.cu 8 | Wrappers for 3d interpolations. Two methods are implemented: 9 | (1) nonuniform driven, 10 | (2) subproblem 11 | 12 | - spread3d_wrapper.cu 13 | Wrappers for 3d spreading. Three methods are implemented: 14 | (1) nonuniform points driven, 15 | (2) subproblem, 16 | (4) block gather 17 | -------------------------------------------------------------------------------- /include/cufinufft.h: -------------------------------------------------------------------------------- 1 | // Defines the C++/C user interface to FINUFFT library. 2 | 3 | // It simply combines single and double precision headers, by flipping a flag 4 | // in the main macros which are in cufinufft_eitherprec.h 5 | // No usual #ifndef testing is needed; it's done in cufinufft_eitherprec.h 6 | // Internal cufinufft routines that are compiled separately for 7 | // each precision should include cufinufft_eitherprec.h directly, and not cufinufft.h. 8 | 9 | #undef SINGLE 10 | #include 11 | #define SINGLE 12 | #include 13 | #undef SINGLE 14 | -------------------------------------------------------------------------------- /src/memtransfer.h: -------------------------------------------------------------------------------- 1 | #ifndef __MEMTRANSFER_H__ 2 | #define __MEMTRANSFER_H__ 3 | 4 | #include 5 | 6 | int ALLOCGPUMEM1D_PLAN(CUFINUFFT_PLAN d_plan); 7 | int ALLOCGPUMEM1D_NUPTS(CUFINUFFT_PLAN d_plan); 8 | void FREEGPUMEMORY1D(CUFINUFFT_PLAN d_plan); 9 | 10 | int ALLOCGPUMEM2D_PLAN(CUFINUFFT_PLAN d_plan); 11 | int ALLOCGPUMEM2D_NUPTS(CUFINUFFT_PLAN d_plan); 12 | void FREEGPUMEMORY2D(CUFINUFFT_PLAN d_plan); 13 | 14 | int ALLOCGPUMEM3D_PLAN(CUFINUFFT_PLAN d_plan); 15 | int ALLOCGPUMEM3D_NUPTS(CUFINUFFT_PLAN d_plan); 16 | void FREEGPUMEMORY3D(CUFINUFFT_PLAN d_plan); 17 | #endif 18 | -------------------------------------------------------------------------------- /python/cufinufft/tests/test_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Discover and run Python example scripts as unit tests. 3 | """ 4 | 5 | import os 6 | import subprocess 7 | import sys 8 | from pathlib import Path 9 | 10 | import pytest 11 | 12 | examples_dir = os.path.join(Path(__file__).resolve().parents[3], "examples") 13 | 14 | scripts = [] 15 | for filename in os.listdir(examples_dir): 16 | if filename.endswith(".py"): 17 | scripts.append(os.path.join(examples_dir, filename)) 18 | 19 | @pytest.mark.parametrize("filename", scripts) 20 | def test_example(filename): 21 | subprocess.check_call([sys.executable, filename]) 22 | -------------------------------------------------------------------------------- /src/README: -------------------------------------------------------------------------------- 1 | This folder contains the main source files of the GPU implementations. 2 | - cufinufft.cu 3 | Four main stages of cufinufft API. 4 | (1) cufinufft_makeplan, (2) cufinufft_setpts, (3) cufinufft_execute, (4) cufinufft_destroy. 5 | Also, cufinufft_default_opts may precede stage 1. 
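For orientation, a minimal sketch of driving these four stages through the Python wrapper (see ../python/cufinufft/tests and ../examples for complete, tested versions); the grid sizes, tolerance, and variable names below are illustrative assumptions only, and in Python the destroy stage happens when the plan object is released:

    import numpy as np
    import pycuda.autoinit                       # creates a CUDA context
    from pycuda.gpuarray import GPUArray, to_gpu
    from cufinufft import cufinufft

    N1, N2, M = 64, 64, 1000                     # assumed sizes, for illustration
    kx = np.random.uniform(-np.pi, np.pi, size=M).astype(np.float32)
    ky = np.random.uniform(-np.pi, np.pi, size=M).astype(np.float32)
    c = (np.random.standard_normal(M)
         + 1j * np.random.standard_normal(M)).astype(np.complex64)
    fk_gpu = GPUArray((N1, N2), dtype=np.complex64)

    plan = cufinufft(1, (N1, N2), eps=1e-6, dtype=np.float32)  # (1) makeplan
    plan.set_pts(to_gpu(kx), to_gpu(ky))                       # (2) setpts
    plan.execute(to_gpu(c), fk_gpu)                            # (3) execute
    fk = fk_gpu.get()
    del plan                                                   # (4) destroy: GPU resources freed with the plan object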
6 | 7 | - memtransfer_wrapper.cu 8 | Wrappers for allocating and freeing GPU memory for different dimensions and methods. 9 | 10 | - deconvolve_wrapper.cu 11 | GPU kernels and wrappers that deconvolve or amplify the input/output coefficients by the correction factor. (Step 3 in Type 1; Step 1 in Type 2) 12 | 13 | - profile.cu 14 | Code for using NVProf 15 | -------------------------------------------------------------------------------- /test/fseriesperf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # basic perf test of compute fseries for 1d, single/double 3 | # Melody 02/20/22 4 | 5 | BIN=../bin/fseries_kernel_test 6 | DIM=1 7 | 8 | echo "Double.............................................." 9 | for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 10 | do 11 | for TOL in 1e-8 12 | do 13 | $BIN $N $DIM $TOL 0 14 | $BIN $N $DIM $TOL 1 15 | done 16 | done 17 | 18 | BIN=../bin/fseries_kernel_test_32 19 | echo "Single.............................................." 20 | for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 21 | do 22 | for TOL in 1e-6 23 | do 24 | $BIN $N $DIM $TOL 0 25 | $BIN $N $DIM $TOL 1 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef FINUFFT_UTILS_H 2 | #define FINUFFT_UTILS_H 3 | 4 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 5 | #else 6 | __inline__ __device__ double atomicAdd(double* address, double val) 7 | { 8 | unsigned long long int* address_as_ull = 9 | (unsigned long long int*)address; 10 | unsigned long long int old = *address_as_ull, assumed; 11 | 12 | do { 13 | assumed = old; 14 | old = atomicCAS(address_as_ull, assumed, 15 | __double_as_longlong(val + 16 | __longlong_as_double(assumed))); 17 | 18 | // Note: uses integer comparison to avoid hang in case of NaN 19 | // (since NaN != NaN) 20 | } while (assumed != old); 21 | 22 | return __longlong_as_double(old); 23 | } 24 | #endif 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /python/cufinufft/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/README: -------------------------------------------------------------------------------- 1 | Examples of cuFINUFFT usage in C++ and Python 2 | 3 | Here we show 2D transforms of type 1 and 2 being performed and tested, 4 | in C++ and in Python. In each case, a batch of transforms is done with 5 | new coefficients or weights, but the same set of nonuniform points; this 6 | explains the suffix "many" in the code names. You may set ntransf=1 to 7 | perform a single transform.
Default options are used. In each case the 8 | four steps (plan, setpts, execute, destroy) are used. A math test is also 9 | performed; see the FINUFFT documentation for the definitions of the 10 | transforms: https://finufft.readthedocs.io/en/latest/math.html 11 | 12 | For more usage examples see: 13 | 14 | ../test/cufinufft*.cu 15 | ../python/cufinufft/tests/*.py 16 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.3 3 | parse = (?P\d+)\.(?P\d+) 4 | serialize = 5 | {major}.{minor} 6 | commit = True 7 | tag = True 8 | 9 | [bumpversion:file:setup.py] 10 | search = version='{current_version}' 11 | replace = version='{new_version}' 12 | 13 | [bumpversion:file:README.md] 14 | search = v{current_version} 15 | replace = v{new_version} 16 | 17 | [bumpversion:file:python/cufinufft/README.md] 18 | search = v{current_version} 19 | replace = v{new_version} 20 | 21 | [bumpversion:file:python/cufinufft/docs/conf.py] 22 | search = release = '{current_version}' 23 | replace = release = '{new_version}' 24 | 25 | [bumpversion:file:python/cufinufft/__init__.py] 26 | search = __version__ = '{current_version}' 27 | replace = __version__ = '{new_version}' 28 | 29 | [bumpversion:file:ci/distribution_helper.sh] 30 | search = cufinufft_version={current_version} 31 | replace = cufinufft_version={new_version} 32 | -------------------------------------------------------------------------------- /contrib/dirft.h: -------------------------------------------------------------------------------- 1 | #ifndef DIRFT_H 2 | #define DIRFT_H 3 | 4 | #include "utils.h" 5 | #include "utils_fp.h" 6 | 7 | void dirft1d1(BIGINT nj,FLT* x,CPX* c,int isign,BIGINT ms, CPX* f); 8 | void dirft1d2(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f); 9 | void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f); 10 | 11 | void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); 12 | void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); 13 | void dirft2d3(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, CPX* f); 14 | 15 | void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); 16 | void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); 17 | void dirft3d3(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, FLT *u, CPX* f); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /python/cufinufft/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /ci/build-wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u -x 3 | 4 | function repair_wheel { 5 | wheel="$1" 6 | if ! "${PYBIN}/auditwheel" show "$wheel"; then 7 | echo "Skipping non-platform wheel $wheel" 8 | else 9 | "${PYBIN}/auditwheel" repair "$wheel" --plat "$PLAT" -w /io/wheelhouse/ 10 | fi 11 | } 12 | 13 | 14 | # Compile wheels 15 | for PYBIN in /opt/python/cp3*/bin; do 16 | "${PYBIN}/pip" install --upgrade pip 17 | "${PYBIN}/pip" install -r /io/python/cufinufft/requirements.txt 18 | "${PYBIN}/pip" install auditwheel pytest 19 | "${PYBIN}/pip" wheel /io/ --no-deps -w wheelhouse/ 20 | done 21 | 22 | 23 | # Bundle external shared libraries into the wheels 24 | for whl in wheelhouse/*.whl; do 25 | repair_wheel "$whl" 26 | done 27 | 28 | 29 | # Install packages and test 30 | for PYBIN in /opt/python/cp3*/bin/; do 31 | "${PYBIN}/pip" install cufinufft -f /io/wheelhouse 32 | "${PYBIN}/python" -m pytest /io/python/cufinufft/tests 33 | done 34 | -------------------------------------------------------------------------------- /include/cufinufft_opts.h: -------------------------------------------------------------------------------- 1 | #ifndef __CUFINUFFT_OPTS_H__ 2 | #define __CUFINUFFT_OPTS_H__ 3 | 4 | typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults 5 | double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented 6 | /* following options are for gpu */ 7 | int gpu_method; // 1: nonuniform-pts driven, 2: shared mem (SM) 8 | int gpu_sort; // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort) 9 | 10 | int gpu_binsizex; // used for 2D, 3D subproblem method 11 | int gpu_binsizey; 12 | int gpu_binsizez; 13 | 14 | int gpu_obinsizex; // used for 3D spread block gather method 15 | int gpu_obinsizey; 16 | int gpu_obinsizez; 17 | 18 | int gpu_maxsubprobsize; 19 | int gpu_nstreams; 20 | int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval 21 | 22 | int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only 23 | 24 | /* multi-gpu support */ 25 | int gpu_device_id; 26 | } cufinufft_opts; 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /contrib/README: -------------------------------------------------------------------------------- 1 | This folder contains functions/files from FINUFFT that cuFINUFFT used. Following describes details of dependencies of each file. 2 | - utils.h 3 | Definitions of CUCPX, CUFFT_TYPE, CUFFT_EX are added and are set depending on preprocessor SINGLE. 4 | Definition of BIGINT is changed to the normal 4 byte integer (See line 81) 5 | 6 | - utils.cpp 7 | This is required because of the use of computing norm, relative norm of vectors, e.g. relerrtwonorm, in the test codes. 8 | 9 | - common.h 10 | - common.cpp (hence legendre_rule_fast.c/.h are included) 11 | setup_spreader_for_nufft, set_nf_type12, onedim_fseries_kernel are called in cufinufft_makeplan. 
12 | 13 | - spreadinterp.h 14 | cufinufft plan contains the spread_opts struct where nspread, spread_direction, pirange, upsampfac, ES_beta, ES_c are used. 15 | 16 | - ker_horner_allw_loop.c 17 | - ker_lowupsampfac_horner_allw_loop.c 18 | These two files are included in the src/2,3d/spreadinterp2,3d.cu files (See device function eval_kernel_vec_Horner) 19 | -------------------------------------------------------------------------------- /src/profile.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | const uint32_t colors[] = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 7 | 0x0000ffff, 0x00ff0000, 0x00ffffff }; 8 | const int num_colors = sizeof(colors)/sizeof(uint32_t); 9 | 10 | #define PUSH_RANGE(name,cid) { \ 11 | int color_id = cid; \ 12 | color_id = color_id%num_colors;\ 13 | nvtxEventAttributes_t eventAttrib = {0}; \ 14 | eventAttrib.version = NVTX_VERSION; \ 15 | eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ 16 | eventAttrib.colorType = NVTX_COLOR_ARGB; \ 17 | eventAttrib.color = colors[color_id]; \ 18 | eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ 19 | eventAttrib.message.ascii = name; \ 20 | nvtxRangePushEx(&eventAttrib); \ 21 | } 22 | #define POP_RANGE nvtxRangePop(); 23 | 24 | CudaTracer::CudaTracer(const char* name, int cid) 25 | { 26 | PUSH_RANGE(name,cid); 27 | } 28 | 29 | CudaTracer::~CudaTracer() { 30 | POP_RANGE; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/CentOS-SCLo-scl-rh.repo: -------------------------------------------------------------------------------- 1 | # CentOS-SCLo-rh.repo 2 | # 3 | # Please see http://wiki.centos.org/SpecialInterestGroup/SCLo for more 4 | # information 5 | 6 | [centos-sclo-rh] 7 | name=CentOS-6 - SCLo rh 8 | baseurl=http://vault.centos.org/centos/6/sclo/$basearch/rh/ 9 | gpgcheck=1 10 | enabled=1 11 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 12 | 13 | [centos-sclo-rh-testing] 14 | name=CentOS-6 - SCLo rh Testing 15 | baseurl=http://buildlogs.centos.org/centos/6/sclo/$basearch/rh/ 16 | gpgcheck=0 17 | enabled=0 18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 19 | 20 | [centos-sclo-rh-source] 21 | name=CentOS-6 - SCLo rh Sources 22 | baseurl=http://vault.centos.org/centos/6/sclo/Source/rh/ 23 | gpgcheck=1 24 | enabled=0 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 26 | 27 | [centos-sclo-rh-debuginfo] 28 | name=CentOS-6 - SCLo rh Debuginfo 29 | baseurl=http://debuginfo.centos.org/centos/6/sclo/$basearch/ 30 | gpgcheck=1 31 | enabled=0 32 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 33 | 34 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/CentOS-SCLo-scl.repo: -------------------------------------------------------------------------------- 1 | # CentOS-SCLo-sclo.repo 2 | # 3 | # Please see http://wiki.centos.org/SpecialInterestGroup/SCLo for more 4 | # information 5 | 6 | [centos-sclo-sclo] 7 | name=CentOS-6 - SCLo sclo 8 | baseurl=http://vault.centos.org/centos/6/sclo/$basearch/sclo/ 9 | gpgcheck=1 10 | enabled=1 11 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 12 | 13 | [centos-sclo-sclo-testing] 14 | name=CentOS-6 - SCLo sclo Testing 15 | baseurl=http://buildlogs.centos.org/centos/6/sclo/$basearch/sclo/ 16 | gpgcheck=0 17 | enabled=0 18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 19 | 20 | [centos-sclo-sclo-source] 21 
| name=CentOS-6 - SCLo sclo Sources 22 | baseurl=http://vault.centos.org/centos/6/sclo/Source/sclo/ 23 | gpgcheck=1 24 | enabled=0 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 26 | 27 | [centos-sclo-sclo-debuginfo] 28 | name=CentOS-6 - SCLo sclo Debuginfo 29 | baseurl=http://debuginfo.centos.org/centos/6/sclo/$basearch/ 30 | gpgcheck=1 31 | enabled=0 32 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo 33 | 34 | -------------------------------------------------------------------------------- /contrib/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include "dataTypes.h" 5 | #include "utils.h" 6 | #include "utils_fp.h" 7 | #include "spreadinterp.h" 8 | 9 | // constants needed within common 10 | #define MAX_NQUAD 100 // max number of positive quadr nodes 11 | // increase this if you need >1TB RAM... 12 | #define MAX_NF (BIGINT)INT_MAX // In cufinufft we limit array sizes to 2^31 13 | // which is about 2 billion, since we set 14 | // BIGINT to int. (Differs from FINUFFT) 15 | 16 | struct cufinufft_opts; 17 | 18 | // common.cpp provides... 19 | int setup_spreader_for_nufft(SPREAD_OPTS &spopts, FLT eps, cufinufft_opts opts); 20 | void SET_NF_TYPE12(BIGINT ms, cufinufft_opts opts, SPREAD_OPTS spopts,BIGINT *nf, 21 | BIGINT b); 22 | void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, SPREAD_OPTS opts); 23 | void onedim_fseries_kernel_precomp(BIGINT nf, FLT *f, dcomplex *a, SPREAD_OPTS opts); 24 | void onedim_fseries_kernel_compute(BIGINT nf, FLT *f, dcomplex *a, FLT *fwkerhalf, SPREAD_OPTS opts); 25 | #endif // COMMON_H 26 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/vault.repo: -------------------------------------------------------------------------------- 1 | [base] 2 | name=CentOS-$releasever - Base 3 | # mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=os&infra=$infra 4 | # baseurl=http://mirror.centos.org/centos/$releasever/os/$basearch/ 5 | baseurl=https://vault.centos.org/6.10/os/$basearch/ 6 | gpgcheck=1 7 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6 8 | 9 | # released updates 10 | [updates] 11 | name=CentOS-$releasever - Updates 12 | # mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=updates&infra=$infra 13 | # baseurl=http://mirror.centos.org/centos/$releasever/updates/$basearch/ 14 | baseurl=https://vault.centos.org/6.10/updates/$basearch/ 15 | gpgcheck=1 16 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6 17 | 18 | # additional packages that may be useful 19 | [extras] 20 | name=CentOS-$releasever - Extras 21 | # mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=extras&infra=$infra 22 | # baseurl=http://mirror.centos.org/centos/$releasever/extras/$basearch/ 23 | baseurl=https://vault.centos.org/6.10/extras/$basearch/ 24 | gpgcheck=1 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6 26 | -------------------------------------------------------------------------------- /src/cudeconvolve.h: -------------------------------------------------------------------------------- 1 | #ifndef __CUDECONVOLVE_H__ 2 | #define __CUDECONVOLVE_H__ 3 | 4 | #include 5 | 6 | __global__ 7 | void Deconvolve_1d(int ms, int nf1, int fw_width, CUCPX* fw, CUCPX *fk, 8 | FLT *fwkerhalf1); 9 | __global__ 10 | void Amplify_1d(int ms, int nf1, int fw_width, CUCPX* fw, CUCPX *fk, 11 | FLT *fwkerhalf2); 12 | 13 | __global__ 
14 | void Deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, CUCPX* fw, 15 | CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2); 16 | __global__ 17 | void Amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, CUCPX* fw, 18 | CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2); 19 | 20 | __global__ 21 | void Deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, 22 | int fw_width, CUCPX* fw, CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2, 23 | FLT *fwkerhalf3); 24 | __global__ 25 | void Amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, 26 | CUCPX* fw, CUCPX *fk, FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3); 27 | 28 | int CUDECONVOLVE1D(CUFINUFFT_PLAN d_mem, int blksize); 29 | int CUDECONVOLVE2D(CUFINUFFT_PLAN d_mem, int blksize); 30 | int CUDECONVOLVE3D(CUFINUFFT_PLAN d_mem, int blksize); 31 | #endif 32 | -------------------------------------------------------------------------------- /sites/make.inc.FI: -------------------------------------------------------------------------------- 1 | # FI: Flatiron Institute, rusty cluster, running on Cuda 11.4.2, up to A100 2 | # devices. The A100 seems to need SM80 arch code. 3 | # Barnett 12/2/21 4 | 5 | # Here's some cmds to run experiments on rusty: 6 | 7 | # log into rusty, some node, then... 8 | #module load slurm 9 | #srun -p gpu -N1 --gpus=1 -c 1 --constraint=a100 --exclusive --pty bash -i 10 | # to check the GPU... seems device has cuda 11.2 not 11.4 11 | #nvidia-smi 12 | #module load cuda/11.4.2 13 | #module load gcc/7.5.0 14 | # (cuda seems not to be able to use later gcc!) 15 | #make all -j 16 | # compile takes <1min with -j. 17 | #bin/cufinufft1d1_test 2 1e6 1e7 18 | #make check 19 | 20 | # see http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ 21 | NVARCH = -arch=sm_80 \ 22 | -gencode=arch=compute_50,code=sm_50 \ 23 | -gencode=arch=compute_52,code=sm_52 \ 24 | -gencode=arch=compute_60,code=sm_60 \ 25 | -gencode=arch=compute_61,code=sm_61 \ 26 | -gencode=arch=compute_70,code=sm_70 \ 27 | -gencode=arch=compute_75,code=sm_75 \ 28 | -gencode=arch=compute_80,code=sm_80 \ 29 | -gencode=arch=compute_86,code=sm_86 \ 30 | -gencode=arch=compute_86,code=compute_86 31 | -------------------------------------------------------------------------------- /contrib/dataTypes.h: -------------------------------------------------------------------------------- 1 | // ------------ FINUFFT data type definitions ---------------------------------- 2 | 3 | #if (!defined(DATATYPES_H) && !defined(SINGLE)) || (!defined(DATATYPESF_H) && defined(SINGLE)) 4 | // Make sure we only include once per precision (as in finufft_eitherprec.h). 5 | #ifndef SINGLE 6 | #define DATATYPES_H 7 | #else 8 | #define DATATYPESF_H 9 | #endif 10 | 11 | // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 12 | #include 13 | 14 | // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. 15 | // This includes all calling arguments (eg M,N) that could be huge someday... 16 | // Note: BIGINT is modified to have ``int'' data type for cufinufft. 17 | typedef int BIGINT; 18 | 19 | // decide which kind of complex numbers to use in interface... 20 | #ifdef __cplusplus 21 | #include // C++ type 22 | #define COMPLEXIFY(X) std::complex 23 | #else 24 | #include // C99 type 25 | #define COMPLEXIFY(X) X complex 26 | #endif 27 | 28 | #undef FLT 29 | #undef CPX 30 | 31 | // Precision-independent real and complex types for interfacing... 
32 | // (note these cannot be typedefs since we want dual-precision library) 33 | #ifdef SINGLE 34 | #define FLT float 35 | #else 36 | #define FLT double 37 | #endif 38 | 39 | #define CPX COMPLEXIFY(FLT) 40 | 41 | #endif // DATATYPES_H or DATATYPESF_H 42 | -------------------------------------------------------------------------------- /include/cufinufft_errors.h: -------------------------------------------------------------------------------- 1 | #ifndef __CUFINUFFT_ERRORS_H__ 2 | #define __CUFINUFFT_ERRORS_H__ 3 | 4 | // For error checking 5 | static const char* _cufftGetErrorEnum(cufftResult_t error) 6 | { 7 | switch(error) 8 | { 9 | case CUFFT_SUCCESS: 10 | return "cufft_success"; 11 | case CUFFT_INVALID_PLAN: 12 | return "cufft_invalid_plan"; 13 | case CUFFT_ALLOC_FAILED: 14 | return "cufft_alloc_failed"; 15 | case CUFFT_INVALID_TYPE: 16 | return "cufft_invalid_type"; 17 | case CUFFT_INVALID_VALUE: 18 | return "cufft_invalid_value"; 19 | case CUFFT_INTERNAL_ERROR: 20 | return "cufft_internal_error"; 21 | case CUFFT_EXEC_FAILED: 22 | return "cufft_exec_failed"; 23 | case CUFFT_SETUP_FAILED: 24 | return "cufft_setup_failed"; 25 | case CUFFT_INVALID_SIZE: 26 | return "cufft_invalid_size"; 27 | case CUFFT_UNALIGNED_DATA: 28 | return "cufft_unaligned data"; 29 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 30 | return "cufft_incomplete_parameter_list"; 31 | case CUFFT_INVALID_DEVICE: 32 | return "cufft_invalid_device"; 33 | case CUFFT_PARSE_ERROR: 34 | return "cufft_parse_error"; 35 | case CUFFT_NO_WORKSPACE: 36 | return "cufft_no_workspace"; 37 | case CUFFT_NOT_IMPLEMENTED: 38 | return "cufft_not_implemented"; 39 | case CUFFT_LICENSE_ERROR: 40 | return "cufft_license_error"; 41 | case CUFFT_NOT_SUPPORTED: 42 | return "cufft_not_supported"; 43 | } 44 | return ""; 45 | } 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /python/cufinufft/README.md: -------------------------------------------------------------------------------- 1 | # cuFINUFFT v1.3 Python package 2 | 3 | The cuFINUFFT library is an efficient GPU implementation of the 1-, 2- and 4 | 3-dimensional nonuniform fast Fourier transform (NUFFT). It includes both type 5 | 1 (nonuniform to uniform) and type 2 (uniform to nonuniform) transforms. 6 | It is based on the [FINUFFT](https://github.com/flatironinstitute/finufft) 7 | implementation for the CPU. This package provides a Python interface to the 8 | cuFINUFFT library, which is written in C++ and CUDA. 9 | 10 | For a mathematical description of the NUFFT and applications to signal 11 | processing, imaging, and scientific computing, see [the FINUFFT 12 | documentation](https://finufft.readthedocs.io). Usage examples can be found 13 | [here](https://github.com/flatironinstitute/cufinufft/tree/v1.3/examples). 14 | 15 | If you use this package, please cite our paper: 16 | 17 | Y. Shih, G. Wright, J. Andén, J. Blaschke, A. H. Barnett (2021). 18 | cuFINUFFT: a load-balanced GPU library for general-purpose nonuniform FFTs. 19 | arXiv preprint arXiv:2102.08463. 20 | [(paper)](https://arxiv.org/abs/2102.08463) 21 | [(bibtex)](https://arxiv.org/bibtex/2102.08463) 22 | 23 | **Note**: We are currently in the process of adapting the cuFINUFFT interface to 24 | closer match that of FINUFFT. This will likely break code depending on the 25 | current interface once the next release is published. At this point we will 26 | publish a migration guide that will detail the exact changes to the 27 | interfaces. 
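For a concrete sense of what these transforms compute, here is a naive direct-summation sketch in one dimension (illustration only, not part of the package API; it mirrors the reference checks used in `python/cufinufft/tests/utils.py` and the `examples/` scripts, including their default sign conventions):

```python
import numpy as np

def direct_type1(c, x, N):
    """Type 1 (nonuniform to uniform): f_k = sum_j c_j exp(+i k x_j)."""
    k = np.arange(-(N // 2), (N + 1) // 2)   # uniform output modes
    return np.array([np.sum(c * np.exp(1j * kk * x)) for kk in k])

def direct_type2(f, x, N):
    """Type 2 (uniform to nonuniform): c_j = sum_k f_k exp(-i k x_j)."""
    k = np.arange(-(N // 2), (N + 1) // 2)   # uniform input modes
    return np.array([np.sum(f * np.exp(-1j * k * xj)) for xj in x])
```

cuFINUFFT evaluates such sums to a requested tolerance at a cost that scales roughly like the FFT size plus the number of nonuniform points, rather than the O(NM) cost of the loops above.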
28 | -------------------------------------------------------------------------------- /ci/distribution_helper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | # Helper Script For Building Wheels 4 | 5 | cufinufft_version=1.3 6 | manylinux_version=manylinux2014 7 | cuda_version=11.0 8 | dockerhub=garrettwrong 9 | 10 | 11 | echo "# build the wheel" 12 | docker build -f ci/docker/cuda${cuda_version}/Dockerfile-x86_64 -t ${dockerhub}/cufinufft-${cufinufft_version}-${manylinux_version} . 13 | 14 | 15 | echo "# Run the container, invoking the build-wheels script to generate the wheels" 16 | docker run --gpus all -it -v `pwd`/wheelhouse:/io/wheelhouse -e PLAT=${manylinux_version}_x86_64 ${dockerhub}/cufinufft-${cufinufft_version}-${manylinux_version} /io/ci/build-wheels.sh 17 | 18 | echo "# Copy the wheels we care about to the dist folder" 19 | mkdir -p dist 20 | cp -v wheelhouse/cufinufft-${cufinufft_version}-cp3*${manylinux_version}* dist 21 | 22 | 23 | echo "The following steps should be performed manually for now.\n" 24 | 25 | 26 | echo "# Push to Test PyPI for review/testing" 27 | echo "#twine upload -r testpypi dist/*" 28 | echo 29 | 30 | 31 | echo "# Tag release." 32 | ## Can do in a repo and push or on manually on GH gui. 33 | echo 34 | 35 | 36 | echo "# Review wheels from test index" 37 | echo "#pip install -i https://test.pypi.org/simple/ --no-deps cufinufft" 38 | echo 39 | 40 | 41 | echo "# Push to live index" 42 | echo "## twine upload dist/*" 43 | echo 44 | 45 | 46 | echo "# optionally push it (might take a long time)." 47 | echo "#docker push ${dockerhub}/cufinufft-${cufinufft_version}-${manylinux_version}" 48 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent none 3 | options { 4 | disableConcurrentBuilds() 5 | buildDiscarder(logRotator(numToKeepStr: '8', daysToKeepStr: '20')) 6 | timeout(time: 1, unit: 'HOURS') 7 | } 8 | stages { 9 | stage('main') { 10 | agent { 11 | dockerfile { 12 | filename 'ci/docker/cuda10.1/Dockerfile-x86_64' 13 | args '--gpus 1' 14 | } 15 | } 16 | environment { 17 | HOME = "$WORKSPACE/build" 18 | PYBIN = "/opt/python/cp38-cp38/bin" 19 | } 20 | steps { 21 | sh '${PYBIN}/python3 -m venv $HOME' 22 | sh '''#!/bin/bash -ex 23 | source $HOME/bin/activate 24 | LIBRARY_PATH=/io/lib python3 -m pip install -e . 25 | python3 -m pip install --upgrade "numpy<1.22" 26 | python3 -m pip install pytest 27 | python3 -m pytest 28 | ''' 29 | sh 'make check' 30 | } 31 | } 32 | } 33 | post { 34 | failure { 35 | emailext subject: '$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS', 36 | body: '''$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS 37 | 38 | Check console output at $BUILD_URL to view full results. 
39 | 40 | Building $BRANCH_NAME for $CAUSE 41 | $JOB_DESCRIPTION 42 | 43 | Changes: 44 | $CHANGES 45 | 46 | End of build log: 47 | ${BUILD_LOG,maxLines=200} 48 | ''', 49 | recipientProviders: [ 50 | [$class: 'DevelopersRecipientProvider'], 51 | ], 52 | replyTo: '$DEFAULT_REPLYTO', 53 | to: 'janden@flatironinstitute.org' 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /contrib/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | BIGINT next235beven(BIGINT n, BIGINT b) 4 | // finds even integer not less than n, with prime factors no larger than 5 5 | // (ie, "smooth") and is a multiple of b (b is a number whose only prime 6 | // factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17 7 | // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. 8 | // added condition about b Melody 05/31/20 9 | { 10 | if (n<=2) return 2; 11 | if (n%2 == 1) n+=1; // even 12 | BIGINT nplus = n-2; // to cancel out the +=2 at start of loop 13 | BIGINT numdiv = 2; // a dummy that is >1 14 | while ((numdiv>1) || (nplus%b != 0)) { 15 | nplus += 2; // stays even 16 | numdiv = nplus; 17 | while (numdiv%2 == 0) numdiv /= 2; // remove all factors of 2,3,5... 18 | while (numdiv%3 == 0) numdiv /= 3; 19 | while (numdiv%5 == 0) numdiv /= 5; 20 | } 21 | return nplus; 22 | } 23 | 24 | // ----------------------- helpers for timing (always stay double prec)... 25 | using namespace std; 26 | 27 | void CNTime::start() 28 | { 29 | gettimeofday(&initial, 0); 30 | } 31 | 32 | double CNTime::restart() 33 | // Barnett changed to returning in sec 34 | { 35 | double delta = this->elapsedsec(); 36 | this->start(); 37 | return delta; 38 | } 39 | 40 | double CNTime::elapsedsec() 41 | // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18 42 | { 43 | struct timeval now; 44 | gettimeofday(&now, 0); 45 | double nowsec = (double)now.tv_sec + 1e-6*now.tv_usec; 46 | double initialsec = (double)initial.tv_sec + 1e-6*initial.tv_usec; 47 | return nowsec - initialsec; 48 | } 49 | -------------------------------------------------------------------------------- /contrib/spreadinterp.h: -------------------------------------------------------------------------------- 1 | #if (!defined(SPREADINTERP_H) && !defined(SINGLE)) || \ 2 | (!defined(SPREADINTERPF_H) && defined(SINGLE)) 3 | 4 | #include 5 | #include 6 | #include 7 | #include "utils.h" 8 | #include "utils_fp.h" 9 | 10 | #define MAX_NSPREAD 16 // upper bound on w, ie nspread, even when padded 11 | // (see evaluate_kernel_vector); also for common 12 | 13 | #undef SPREAD_OPTS 14 | 15 | #ifdef SINGLE 16 | #define SPREAD_OPTS spread_optsf 17 | #define SPREADINTERPF_H 18 | #else 19 | #define SPREAD_OPTS spread_opts 20 | #define SPREADINTERP_H 21 | #endif 22 | 23 | struct SPREAD_OPTS { // see cnufftspread:setup_spreader for defaults. 24 | int nspread; // w, the kernel width in grid pts 25 | int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU 26 | int pirange; // 0: coords in [0,N), 1 coords in [-pi,pi) 27 | FLT upsampfac; // sigma, upsampling factor, default 2.0 28 | // ES kernel specific... 29 | FLT ES_beta; 30 | FLT ES_halfwidth; 31 | FLT ES_c; 32 | }; 33 | 34 | // NU coord handling macro: if p is true, rescales from [-pi,pi] to [0,N], then 35 | // folds *only* one period below and above, ie [-N,2N], into the domain [0,N]... 36 | #define RESCALE(x,N,p) (p ? \ 37 | ((x*M_1_2PI + (x<-PI ?
1.5 : (x>=PI ? -0.5 : 0.5)))*N) : \ 38 | (x<0 ? x+N : (x>=N ? x-N : x))) 39 | // yuk! But this is *so* much faster than slow std::fmod that we stick to it. 40 | FLT evaluate_kernel(FLT x, const SPREAD_OPTS &opts); 41 | int setup_spreader(SPREAD_OPTS &opts, FLT eps, FLT upsampfac, int kerevalmeth); 42 | 43 | #endif // SPREADINTERP_H 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2018-2021 The Simons Foundation, Inc. - All Rights Reserved. 2 | 3 | Lead developer: Yu-Hsuan Melody Shih (New York University). 4 | 5 | Other developers: (see github site for full list) 6 | 7 | Garrett Wright (Princeton) 8 | Joakim Anden (KTH) 9 | Johannes Blaschke (LBNL) 10 | Alex Barnett (CCM, Flatiron Institute) 11 | 12 | This project came out of Melody's 2018 and 2019 summer internships at 13 | the Flatiron Institute, advised by Alex Barnett. 14 | 15 | ------ 16 | 17 | cuFINUFFT is licensed under the Apache License, Version 2.0 (the 18 | "License"); you may not use this file except in compliance with the 19 | License. You may obtain a copy of the License at 20 | 21 | http://www.apache.org/licenses/LICENSE-2.0 22 | 23 | Unless required by applicable law or agreed to in writing, software 24 | distributed under the License is distributed on an "AS IS" BASIS, 25 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | See the License for the specific language governing permissions and 27 | limitations under the License. 28 | 29 | ------ 30 | 31 | Certain parts of this repository are contributed by others. 32 | For their license info, see: 33 | 34 | contrib/legendre_rule_fast.license 35 | fortran/cmcl_license.txt 36 | 37 | ------ 38 | 39 | If you find this library useful, or it helps you in creating software 40 | or publications, please let us know, and acknowledge that fact by citing our 41 | repository: 42 | 43 | https://github.com/flatironinstitute/cufinufft 44 | 45 | and the publication: 46 | 47 | cuFINUFFT: a load-balanced GPU library for general-purpose nonuniform FFTs, Yu-hsuan Shih, Garrett Wright, Joakim Andén, Johannes Blaschke, Alex H. Barnett. PDSEC2021 conference (best paper prize). 
https://arxiv.org/abs/2102.08463 48 | -------------------------------------------------------------------------------- /python/cufinufft/tests/test_multi.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | 5 | import pycuda.driver as drv 6 | import pycuda.gpuarray as gpuarray 7 | 8 | from cufinufft import cufinufft 9 | 10 | import utils 11 | 12 | 13 | def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): 14 | complex_dtype = utils._complex_dtype(dtype) 15 | 16 | drv.init() 17 | 18 | dev_count = drv.Device.count() 19 | 20 | if dev_count == 1: 21 | pytest.skip() 22 | 23 | devs = [drv.Device(dev_id) for dev_id in range(dev_count)] 24 | 25 | dim = len(shape) 26 | 27 | errs = [] 28 | 29 | for dev_id, dev in enumerate(devs): 30 | ctx = dev.make_context() 31 | 32 | k = utils.gen_nu_pts(M, dim=dim).astype(dtype) 33 | c = utils.gen_nonuniform_data(M).astype(complex_dtype) 34 | 35 | k_gpu = gpuarray.to_gpu(k) 36 | c_gpu = gpuarray.to_gpu(c) 37 | fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) 38 | 39 | plan = cufinufft(1, shape, eps=tol, dtype=dtype, 40 | gpu_device_id=dev_id) 41 | 42 | plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2]) 43 | 44 | plan.execute(c_gpu, fk_gpu) 45 | 46 | fk = fk_gpu.get() 47 | 48 | ind = int(0.1789 * np.prod(shape)) 49 | 50 | fk_est = fk.ravel()[ind] 51 | fk_target = utils.direct_type1(c, k, shape, ind) 52 | 53 | type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) 54 | 55 | print(f'Type 1 relative error (GPU {dev_id}):', type1_rel_err) 56 | 57 | ctx.pop() 58 | 59 | errs.append(type1_rel_err) 60 | 61 | assert all(err < 0.01 for err in errs) 62 | 63 | 64 | def main(): 65 | test_multi_type1() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /src/common.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include "precision_independent.h" 8 | #include "common.h" 9 | 10 | using namespace std; 11 | 12 | /* Kernel for computing approximations of exact Fourier series coeffs of 13 | cnufftspread's real symmetric kernel. 
*/ 14 | // a , f are intermediate results from function onedim_fseries_kernel_precomp() 15 | // (see cufinufft/contrib/common.cpp for description) 16 | __global__ 17 | void FseriesKernelCompute(int nf1, int nf2, int nf3, FLT *f, cuDoubleComplex *a, 18 | FLT *fwkerhalf1, FLT *fwkerhalf2, FLT *fwkerhalf3, int ns) 19 | { 20 | FLT J2 = ns/2.0; 21 | int q=(int)(2 + 3.0*J2); 22 | int nf; 23 | cuDoubleComplex *at = a + threadIdx.y*MAX_NQUAD; 24 | FLT *ft = f + threadIdx.y*MAX_NQUAD; 25 | FLT *oarr; 26 | if (threadIdx.y == 0){ 27 | oarr = fwkerhalf1; 28 | nf = nf1; 29 | }else if (threadIdx.y == 1){ 30 | oarr = fwkerhalf2; 31 | nf = nf2; 32 | }else{ 33 | oarr = fwkerhalf3; 34 | nf = nf3; 35 | } 36 | 37 | for(int i=blockDim.x*blockIdx.x+threadIdx.x; i>>(nf1, nf2, nf3, d_f, 63 | d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /python/cufinufft/tests/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def _complex_dtype(dtype): 5 | if dtype == np.float32: 6 | complex_dtype = np.complex64 7 | elif dtype == np.float64: 8 | complex_dtype = np.complex128 9 | else: 10 | raise TypeError("dtype should be np.float32 or np.float64.") 11 | 12 | return complex_dtype 13 | 14 | 15 | def _real_dtype(complex_dtype): 16 | if complex_dtype == np.complex64: 17 | real_dtype = np.float32 18 | elif complex_dtype == np.complex128: 19 | real_dtype = np.float64 20 | else: 21 | raise TypeError("dtype should be np.complex64 or np.complex128.") 22 | 23 | return real_dtype 24 | 25 | 26 | def gen_nu_pts(M, dim=3, seed=0): 27 | np.random.seed(seed) 28 | k = np.random.uniform(-np.pi, np.pi, (dim, M)) 29 | k = k.astype(np.float64) 30 | return k 31 | 32 | 33 | def gen_uniform_data(shape, seed=0): 34 | np.random.seed(seed) 35 | fk = np.random.standard_normal(shape + (2,)) 36 | fk = fk.astype(np.float64).view(np.complex128)[..., 0] 37 | return fk 38 | 39 | 40 | def gen_nonuniform_data(M, seed=0): 41 | np.random.seed(seed) 42 | c = np.random.standard_normal(2 * M) 43 | c = c.astype(np.float64).view(np.complex128) 44 | return c 45 | 46 | 47 | def make_grid(shape): 48 | dim = len(shape) 49 | shape = (1,) * (3 - dim) + shape 50 | 51 | grids = [np.arange(-(N // 2), (N + 1) // 2) for N in shape] 52 | x, y, z = np.meshgrid(*grids, indexing='ij') 53 | return np.stack((x, y, z)) 54 | 55 | 56 | def direct_type1(c, k, shape, ind): 57 | grid = make_grid(shape) 58 | 59 | phase = k.T @ grid.reshape((3, -1))[:, ind] 60 | fk = np.sum(c * np.exp(1j * phase)) 61 | 62 | return fk 63 | 64 | 65 | def direct_type2(fk, k): 66 | grid = make_grid(fk.shape) 67 | 68 | phase = k @ grid.reshape((3, -1)) 69 | c = np.sum(fk.ravel() * np.exp(-1j * phase)) 70 | 71 | return c 72 | -------------------------------------------------------------------------------- /test/spreadperf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # basic perf test of spread/interp for 2/3d, single/double 3 | # Barnett 1/29/21, some 1D added 12/2/21. 4 | 5 | BINDIR=../bin 6 | 7 | n=1000000 8 | M=1000000 9 | dist=0 # 0= random unif, 1 = clustered 10 | Msub=10000 # claimed default is 65536 11 | tols=1e-5 12 | told=1e-12 13 | 14 | echo "spread 1D.............................................." 
15 | $BINDIR/spread1d_test 1 $dist $n $Msub $M $told 16 | $BINDIR/spread1d_test 2 $dist $n $Msub $M $told 17 | $BINDIR/spread1d_test_32 1 $dist $n $Msub $M $tols 18 | $BINDIR/spread1d_test_32 2 $dist $n $Msub $M $tols 19 | 20 | echo "interp 1D.............................................." 21 | $BINDIR/interp1d_test 1 $dist $n $M $told 22 | $BINDIR/interp1d_test_32 1 $dist $n $M $tols 23 | # note there is no meth=2 in 1D interp 24 | 25 | # 2D params... (n is grid size per dim) 26 | n=1000 27 | M=1000000 28 | 29 | echo "spread 2D.............................................." 30 | $BINDIR/spread2d_test 1 $dist $n $n $Msub $M $told 31 | $BINDIR/spread2d_test 2 $dist $n $n $Msub $M $told 32 | $BINDIR/spread2d_test_32 1 $dist $n $n $Msub $M $tols 33 | $BINDIR/spread2d_test_32 2 $dist $n $n $Msub $M $tols 34 | 35 | echo "interp 2D.............................................." 36 | $BINDIR/interp2d_test 1 $dist $n $n $M $told 37 | $BINDIR/interp2d_test 2 $dist $n $n $M $told 38 | $BINDIR/interp2d_test_32 1 $dist $n $n $M $tols 39 | $BINDIR/interp2d_test_32 2 $dist $n $n $M $tols 40 | 41 | 42 | # 3D params... 43 | n=100 44 | M=1000000 45 | 46 | echo "spread 3D.............................................." 47 | $BINDIR/spread3d_test 1 $dist $n $n $n $Msub $M $told 48 | # note absence of meth=2 for 3D double 49 | $BINDIR/spread3d_test_32 1 $dist $n $n $n $Msub $M $tols 50 | $BINDIR/spread3d_test_32 2 $dist $n $n $n $Msub $M $tols 51 | 52 | echo "interp 3D.............................................." 53 | $BINDIR/interp3d_test 1 $dist $n $n $n $M $told 54 | # note absence of meth=2 for 3D double 55 | $BINDIR/interp3d_test_32 1 $dist $n $n $n $M $tols 56 | $BINDIR/interp3d_test_32 2 $dist $n $n $n $M $tols 57 | -------------------------------------------------------------------------------- /examples/example2d2many.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demonstrate the type 2 NUFFT using cuFINUFFT 3 | """ 4 | 5 | import numpy as np 6 | 7 | import pycuda.autoinit 8 | from pycuda.gpuarray import GPUArray, to_gpu 9 | 10 | from cufinufft import cufinufft 11 | 12 | # Set up parameters for problem. 13 | N1, N2 = 37, 41 # Size of uniform grid 14 | M = 17 # Number of nonuniform points 15 | n_transf = 2 # Number of input arrays 16 | eps = 1e-6 # Requested tolerance 17 | dtype = np.float32 # Datatype (real) 18 | complex_dtype = np.complex64 # Datatype (complex) 19 | 20 | # Generate coordinates of non-uniform points. 21 | kx = np.random.uniform(-np.pi, np.pi, size=M) 22 | ky = np.random.uniform(-np.pi, np.pi, size=M) 23 | 24 | # Generate grid values. 25 | fk = (np.random.standard_normal((n_transf, N1, N2)) 26 | + 1j * np.random.standard_normal((n_transf, N1, N2))) 27 | 28 | # Cast to desired datatype. 29 | kx = kx.astype(dtype) 30 | ky = ky.astype(dtype) 31 | fk = fk.astype(complex_dtype) 32 | 33 | # Allocate memory for the nonuniform coefficients on the GPU. 34 | c_gpu = GPUArray((n_transf, M), dtype=complex_dtype) 35 | 36 | # Initialize the plan and set the points. 37 | plan = cufinufft(2, (N1, N2), n_transf, eps=eps, dtype=dtype) 38 | plan.set_pts(to_gpu(kx), to_gpu(ky)) 39 | 40 | # Execute the plan, reading from the uniform grid fk and storing the result 41 | # in c_gpu. 42 | plan.execute(c_gpu, to_gpu(fk)) 43 | 44 | # Retrieve the result from the GPU. 45 | c = c_gpu.get() 46 | 47 | # Check accuracy of the transform at index jt.
48 | jt = M // 2 49 | 50 | for i in range(n_transf): 51 | # Calculate the true value of the type 2 transform at the index jt. 52 | x, y = np.mgrid[-(N1 // 2):(N1 + 1) // 2, -(N2 // 2):(N2 + 1) // 2] 53 | c_true = np.sum(fk[i] * np.exp(-1j * (x * kx[jt] + y * ky[jt]))) 54 | 55 | # Calculate the absolute and relative error. 56 | err = np.abs(c[i, jt] - c_true) 57 | rel_err = err / np.max(np.abs(c[i])) 58 | 59 | print(f"[{i}] Absolute error on point [{jt}] is {err:.3g}") 60 | print(f"[{i}] Relative error on point [{jt}] is {rel_err:.3g}") 61 | 62 | assert(rel_err < 10 * eps) 63 | -------------------------------------------------------------------------------- /examples/example2d1many.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demonstrate the type 1 NUFFT using cuFINUFFT 3 | """ 4 | 5 | import numpy as np 6 | 7 | import pycuda.autoinit 8 | from pycuda.gpuarray import GPUArray, to_gpu 9 | 10 | from cufinufft import cufinufft 11 | 12 | # Set up parameters for problem. 13 | N1, N2 = 59, 61 # Size of uniform grid 14 | M = 100 # Number of nonuniform points 15 | n_transf = 2 # Number of input arrays 16 | eps = 1e-6 # Requested tolerance 17 | dtype = np.float32 # Datatype (real) 18 | complex_dtype = np.complex64 # Datatype (complex) 19 | 20 | # Generate coordinates of non-uniform points. 21 | kx = np.random.uniform(-np.pi, np.pi, size=M) 22 | ky = np.random.uniform(-np.pi, np.pi, size=M) 23 | 24 | # Generate source strengths. 25 | c = (np.random.standard_normal((n_transf, M)) 26 | + 1j * np.random.standard_normal((n_transf, M))) 27 | 28 | # Cast to desired datatype. 29 | kx = kx.astype(dtype) 30 | ky = ky.astype(dtype) 31 | c = c.astype(complex_dtype) 32 | 33 | # Allocate memory for the uniform grid on the GPU. 34 | fk_gpu = GPUArray((n_transf, N1, N2), dtype=complex_dtype) 35 | 36 | # Initialize the plan and set the points. 37 | plan = cufinufft(1, (N1, N2), n_transf, eps=eps, dtype=dtype) 38 | plan.set_pts(to_gpu(kx), to_gpu(ky)) 39 | 40 | # Execute the plan, reading from the strengths array c and storing the 41 | # result in fk_gpu. 42 | plan.execute(to_gpu(c), fk_gpu) 43 | 44 | # Retreive the result from the GPU. 45 | fk = fk_gpu.get() 46 | 47 | # Check accuracy of the transform at position (nt1, nt2). 48 | nt1 = int(0.37 * N1) 49 | nt2 = int(0.26 * N2) 50 | 51 | for i in range(n_transf): 52 | # Calculate the true value of the type 1 transform at the uniform grid 53 | # point (nt1, nt2), which corresponds to the coordinate nt1 - N1 // 2 and 54 | # nt2 - N2 // 2. 55 | x, y = nt1 - N1 // 2, nt2 - N2 // 2 56 | fk_true = np.sum(c[i] * np.exp(1j * (x * kx + y * ky))) 57 | 58 | # Calculate the absolute and relative error. 59 | err = np.abs(fk[i, nt1, nt2] - fk_true) 60 | rel_err = err / np.max(np.abs(fk[i])) 61 | 62 | print(f"[{i}] Absolute error on mode [{nt1}, {nt2}] is {err:.3g}") 63 | print(f"[{i}] Relative error on mode [{nt1}, {nt2}] is {rel_err:.3g}") 64 | 65 | assert(rel_err < 10 * eps) 66 | -------------------------------------------------------------------------------- /python/cufinufft/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'cufinufft' 21 | copyright = ('2020 The Simons Foundation, ' 22 | 'Melody Shih, Joakim Anden, Garrett Wright.') 23 | author = 'Melody Shih, Joakim Anden, Garrett Wright' 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = '1.3' 27 | 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx_rtd_theme', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = [] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = 'sphinx_rtd_theme' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 58 | html_static_path = [] 59 | 60 | # Autodoc config 61 | autoclass_content = 'both' 62 | -------------------------------------------------------------------------------- /contrib/utils.h: -------------------------------------------------------------------------------- 1 | // This contains some library-wide definitions & precision/OMP switches, 2 | // as well as the interfaces to utilities in utils.cpp. Barnett 6/18/18. 3 | 4 | #ifndef UTILS_H 5 | #define UTILS_H 6 | 7 | // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 8 | #include 9 | 10 | #include // C++ type complex 11 | #include 12 | #include "dataTypes.h" 13 | 14 | // fraction growth cut-off in arraywidcen(), to decide if translate in type-3 15 | #define ARRAYWIDCEN_GROWFRAC 0.1 16 | 17 | // math consts not in math.h ... 18 | #define M_1_2PI 0.159154943091895336 19 | #define M_2PI 6.28318530717958648 20 | // to avoid mixed precision operators in eg i*pi... 21 | #define PI (FLT)M_PI 22 | 23 | using namespace std; // means std:: not needed for cout, max, etc 24 | 25 | typedef complex dcomplex; // slightly sneaky since duplicated by mwrap 26 | 27 | // Global error codes for the library... 
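// (code 1, WARN_EPS_TOO_SMALL, is only a warning: setup_spreader() in
//  contrib/spreadinterp.cpp clips the requested tolerance / kernel width and
//  continues; codes 2-9 are hard errors)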
28 | #define WARN_EPS_TOO_SMALL 1 29 | #define ERR_MAXNALLOC 2 30 | #define ERR_SPREAD_BOX_SMALL 3 31 | #define ERR_SPREAD_PTS_OUT_RANGE 4 32 | #define ERR_SPREAD_ALLOC 5 33 | #define ERR_SPREAD_DIR 6 34 | #define ERR_UPSAMPFAC_TOO_SMALL 7 35 | #define HORNER_WRONG_BETA 8 36 | #define ERR_NDATA_NOTVALID 9 37 | 38 | 39 | //#define MAX(a,b) (a>b) ? a : b // but we use std::max instead 40 | #define MIN(a,b) (a 47 | class CNTime { 48 | public: 49 | void start(); 50 | double restart(); 51 | double elapsedsec(); 52 | private: 53 | struct timeval initial; 54 | }; 55 | 56 | // allow compile-time switch off of openmp, so compilation without any openmp 57 | // is done (Note: _OPENMP is automatically set by -fopenmp compile flag) 58 | #ifdef _OPENMP 59 | #include 60 | // point to actual omp utils 61 | #define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() 62 | #define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() 63 | #define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() 64 | #define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x) 65 | #define MY_OMP_SET_NESTED(x) omp_set_nested(x) 66 | #else 67 | // non-omp safe dummy versions of omp utils 68 | #define MY_OMP_GET_NUM_THREADS() 1 69 | #define MY_OMP_GET_MAX_THREADS() 1 70 | #define MY_OMP_GET_THREAD_NUM() 0 71 | #define MY_OMP_SET_NUM_THREADS(x) 72 | #define MY_OMP_SET_NESTED(x) 73 | #endif 74 | 75 | #endif // UTILS_H 76 | -------------------------------------------------------------------------------- /contrib/utils_fp.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "utils_fp.h" 3 | 4 | 5 | // ------------ complex array utils --------------------------------- 6 | 7 | FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b) 8 | // ||a-b||_2 / ||a||_2 9 | { 10 | FLT err = 0.0, nrm = 0.0; 11 | for (BIGINT m=0; mnrm) nrm = aa; 46 | } 47 | return sqrt(nrm); 48 | } 49 | 50 | void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi) 51 | // With a a length-n array, writes out min(a) to lo and max(a) to hi, 52 | // so that all a values lie in [lo,hi]. 53 | // If n==0, lo and hi are not finite. 54 | { 55 | *lo = INFINITY; *hi = -INFINITY; 56 | for (BIGINT m=0; m*hi) *hi = a[m]; 59 | } 60 | } 61 | 62 | void indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi) 63 | // With i a list of n indices, and a an array of length max(i), writes out 64 | // min(a(i)) to lo and max(a(i)) to hi, so that all a(i) values lie in [lo,hi]. 65 | // This is not currently used in FINUFFT v1.2. 66 | { 67 | *lo = INFINITY; *hi = -INFINITY; 68 | for (BIGINT m=0; m*hi) *hi = A; 72 | } 73 | } 74 | 75 | void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c) 76 | // Writes out w = half-width and c = center of an interval enclosing all a[n]'s 77 | // Only chooses a nonzero center if this increases w by less than fraction 78 | // ARRAYWIDCEN_GROWFRAC defined in defs.h. 79 | // This prevents rephasings which don't grow nf by much. 6/8/17 80 | // If n==0, w and c are not finite. 
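// Example: n=2, a = {2.0, 3.0} gives lo=2, hi=3, hence w=0.5 and c=2.5.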
81 | { 82 | FLT lo,hi; 83 | arrayrange(n,a,&lo,&hi); 84 | *w = (hi-lo)/2; 85 | *c = (hi+lo)/2; 86 | if (std::abs(*c) /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ 8 | echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - 9 | 10 | COPY ci/docker/cuda10.1-manylinux2014/cuda.repo /etc/yum.repos.d/cuda.repo 11 | 12 | ENV CUDA_VERSION 10.1.243 13 | 14 | ENV CUDA_PKG_VERSION 10-1-$CUDA_VERSION-1 15 | # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a 16 | RUN yum install -y \ 17 | cuda-cudart-$CUDA_PKG_VERSION \ 18 | cuda-compat-10-1 \ 19 | && \ 20 | ln -s cuda-10.1 /usr/local/cuda && \ 21 | rm -rf /var/cache/yum/* 22 | 23 | # nvidia-docker 1.0 24 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 25 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 26 | 27 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 28 | ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} 29 | 30 | # nvidia-container-runtime 31 | ENV NVIDIA_VISIBLE_DEVICES all 32 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 33 | ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" 34 | 35 | #runtime 36 | RUN yum install -y \ 37 | cuda-libraries-$CUDA_PKG_VERSION \ 38 | cuda-nvtx-$CUDA_PKG_VERSION \ 39 | libcublas10-10.2.1.243-1 \ 40 | && \ 41 | rm -rf /var/cache/yum/* 42 | 43 | # devel 44 | RUN yum install -y \ 45 | cuda-nvml-dev-$CUDA_PKG_VERSION \ 46 | cuda-command-line-tools-$CUDA_PKG_VERSION \ 47 | cuda-libraries-dev-$CUDA_PKG_VERSION \ 48 | cuda-minimal-build-$CUDA_PKG_VERSION \ 49 | libcublas-devel-10.2.1.243-1 \ 50 | && \ 51 | rm -rf /var/cache/yum/* 52 | 53 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 54 | 55 | # /CUDA # 56 | 57 | RUN yum install -y devtoolset-8 58 | 59 | RUN scl enable devtoolset-8 -- g++ --version 60 | 61 | # Okay, so now we can begin cufinufft 62 | 63 | # We need to build the CUDA code now. 64 | # assume we are building container in the root of the git repo... 65 | COPY . /io 66 | WORKDIR /io 67 | RUN scl enable devtoolset-8 -- make target=manylinux 68 | # And we need to pack it in our LD path 69 | ENV LD_LIBRARY_PATH /io/lib:${LD_LIBRARY_PATH} 70 | 71 | 72 | CMD ["/bin/bash"] 73 | -------------------------------------------------------------------------------- /contrib/utils_fp.h: -------------------------------------------------------------------------------- 1 | // Header for utils_fp.cpp, a little library of low-level array stuff. 2 | // These are functions which depend on single/double precision. 3 | // (rest of finufft defs and types are now in defs.h) 4 | 5 | #if (!defined(UTILS_FP_H) && !defined(SINGLE)) || (!defined(UTILS_FPF_H) && defined(SINGLE)) 6 | // Make sure we only include once per precision (as in finufft_eitherprec.h). 7 | #ifndef SINGLE 8 | #define UTILS_FP_H 9 | #else 10 | #define UTILS_FPF_H 11 | #endif 12 | 13 | 14 | // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! 
15 | #include 16 | 17 | #include // C++ type complex 18 | #include 19 | #include "dataTypes.h" 20 | 21 | 22 | #undef EPSILON 23 | #undef IMA 24 | #undef FABS 25 | #undef CUCPX 26 | #undef CUFFT_TYPE 27 | #undef CUFFT_EX 28 | #undef SET_NF_TYPE12 29 | 30 | // Compile-flag choice of single or double (default) precision: 31 | // (Note in the other codes, FLT is "double" or "float", CPX same but complex) 32 | #ifdef SINGLE 33 | // machine epsilon for rounding 34 | #define EPSILON (float)6e-08 35 | #define IMA complex(0.0,1.0) 36 | #define FABS(x) fabs(x) 37 | #define CUCPX cuFloatComplex 38 | #define CUFFT_TYPE CUFFT_C2C 39 | #define CUFFT_EX cufftExecC2C 40 | #define SET_NF_TYPE12 set_nf_type12f 41 | #else 42 | // machine epsilon for rounding 43 | #define EPSILON (double)1.1e-16 44 | #define IMA complex(0.0,1.0) 45 | #define FABS(x) fabsf(x) 46 | #define CUCPX cuDoubleComplex 47 | #define CUFFT_TYPE CUFFT_Z2Z 48 | #define CUFFT_EX cufftExecZ2Z 49 | #define SET_NF_TYPE12 set_nf_type12 50 | #endif 51 | 52 | 53 | // ahb's low-level array helpers 54 | FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b); 55 | FLT errtwonorm(BIGINT n, CPX* a, CPX* b); 56 | FLT twonorm(BIGINT n, CPX* a); 57 | FLT infnorm(BIGINT n, CPX* a); 58 | void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi); 59 | void indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi); 60 | void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c); 61 | 62 | // Random numbers: crappy unif random number generator in [0,1): 63 | //#define rand01() (((FLT)(rand()%RAND_MAX))/RAND_MAX) 64 | #define rand01() ((FLT)rand()/RAND_MAX) 65 | // unif[-1,1]: 66 | #define randm11() (2*rand01() - (FLT)1.0) 67 | // complex unif[-1,1] for Re and Im: 68 | #define crandm11() (randm11() + IMA*randm11()) 69 | 70 | // Thread-safe seed-carrying versions of above (x is ptr to seed)... 71 | #define rand01r(x) ((FLT)rand_r(x)/RAND_MAX) 72 | // unif[-1,1]: 73 | #define randm11r(x) (2*rand01r(x) - (FLT)1.0) 74 | // complex unif[-1,1] for Re and Im: 75 | #define crandm11r(x) (randm11r(x) + IMA*randm11r(x)) 76 | 77 | 78 | #endif // UTILS_FP_H 79 | -------------------------------------------------------------------------------- /ci/docker/cuda10.1/Dockerfile-x86_64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2010_x86_64 2 | LABEL maintainer "Garrett Wright" 3 | 4 | # ---- CentOS 6 has been deprecated. 
5 | # We'll need to patch the repo links to point to the CentOS 6 Vault 6 | COPY ci/docker/cuda10.1/vault.repo /etc/yum.repos.d/CentOS-Base.repo 7 | COPY ci/docker/cuda10.1/CentOS-SCLo-scl-rh.repo /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo 8 | COPY ci/docker/cuda10.1/CentOS-SCLo-scl.repo /etc/yum.repos.d/CentOS-SCLo-scl.repo 9 | 10 | # ---- The following block adds layers for CUDA --- # 11 | # base 12 | RUN NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ 13 | curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel6/x86_64/7fa2af80.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ 14 | echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c - 15 | 16 | COPY ci/docker/cuda10.1/cuda.repo /etc/yum.repos.d/cuda.repo 17 | 18 | ENV CUDA_VERSION 10.1.243 19 | 20 | ENV CUDA_PKG_VERSION 10-1-$CUDA_VERSION-1 21 | # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a 22 | RUN yum install -y \ 23 | cuda-cudart-$CUDA_PKG_VERSION \ 24 | cuda-compat-10-1 \ 25 | && \ 26 | ln -s cuda-10.1 /usr/local/cuda && \ 27 | rm -rf /var/cache/yum/* 28 | 29 | # nvidia-docker 1.0 30 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 31 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 32 | 33 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 34 | ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} 35 | 36 | # nvidia-container-runtime 37 | ENV NVIDIA_VISIBLE_DEVICES all 38 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 39 | ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" 40 | 41 | #runtime 42 | RUN yum install -y \ 43 | cuda-libraries-$CUDA_PKG_VERSION \ 44 | cuda-nvtx-$CUDA_PKG_VERSION \ 45 | libcublas10-10.2.1.243-1 \ 46 | && \ 47 | rm -rf /var/cache/yum/* 48 | 49 | # devel 50 | RUN yum install -y \ 51 | cuda-nvml-dev-$CUDA_PKG_VERSION \ 52 | cuda-command-line-tools-$CUDA_PKG_VERSION \ 53 | cuda-libraries-dev-$CUDA_PKG_VERSION \ 54 | cuda-minimal-build-$CUDA_PKG_VERSION \ 55 | libcublas-devel-10.2.1.243-1 \ 56 | && \ 57 | rm -rf /var/cache/yum/* 58 | 59 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 60 | 61 | # /CUDA # 62 | 63 | 64 | # Okay, so now we can begin cufinufft 65 | 66 | # We need to build the CUDA code now. 67 | # assume we are building container in the root of the git repo... 68 | COPY . /io 69 | WORKDIR /io 70 | RUN make target=manylinux 71 | # And we need to pack it in our LD path 72 | ENV LD_LIBRARY_PATH /io/lib:${LD_LIBRARY_PATH} 73 | 74 | 75 | CMD ["/bin/bash"] 76 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # This defines the Python module installation. 2 | 3 | import os 4 | import ctypes 5 | 6 | from setuptools import setup, Extension 7 | 8 | # Description 9 | DESCRIPTION = "Non-uniform fast Fourier transforms on the GPU" 10 | 11 | with open(os.path.join('python', 'cufinufft', 'README.md'), encoding='utf8') as fh: 12 | LONG_DESCRIPTION = fh.read() 13 | 14 | # Parse the requirements 15 | with open(os.path.join('python', 'cufinufft', 'requirements.txt'), 'r') as fh: 16 | requirements = [item.strip() for item in fh.readlines()] 17 | 18 | # Sanity check that we can find the CUDA cufinufft libraries before we get too far. 
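# (If this load fails even though the C++/CUDA library has been built, the fix is
#  usually to make the built lib/ directory visible to the dynamic loader first,
#  e.g. something like
#      export LD_LIBRARY_PATH=/path/to/cufinufft/lib:$LD_LIBRARY_PATH
#  before running the install.)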
19 | try: 20 | lib = ctypes.cdll.LoadLibrary('libcufinufft.so') 21 | except Exception as e: 22 | print('CUDA shared libraries not found in library path.' 23 | ' Please refer to installation documentation at http://github.com/flatironinstitute/cufinufft' 24 | ' and ensure CUDA installation is successful first before attempting to install the Python wrappers.') 25 | raise(e) 26 | print('cufinufft CUDA shared libraries found, continuing...') 27 | 28 | 29 | # Python Package Setup 30 | setup( 31 | name='cufinufft', 32 | version='1.3', 33 | author='Yu-shuan Melody Shih, Garrett Wright, Joakim Anden, Johannes Blaschke, Alex Barnett', 34 | author_email='janden-vscholar@flatironinstitute.org', 35 | url='https://github.com/flatironinstitute/cufinufft', 36 | description=DESCRIPTION, 37 | long_description=LONG_DESCRIPTION, 38 | long_description_content_type="text/markdown", 39 | license="Apache 2", 40 | packages=['cufinufft'], 41 | package_dir={'': 'python'}, 42 | install_requires=requirements, 43 | # If you'd like to build or alter the docs you may additionally require these. 44 | extras_require={ 45 | 'docs': ['sphinx', 'sphinx_rtd_theme'] 46 | }, 47 | classifiers=['Intended Audience :: Science/Research', 48 | 'License :: OSI Approved :: Apache Software License', 49 | 'Programming Language :: Python :: 3', 50 | 'Programming Language :: C++', 51 | 'Operating System :: POSIX :: Linux', 52 | 'Environment :: GPU', 53 | 'Topic :: Scientific/Engineering :: Mathematics'], 54 | python_requires='>=3.6', 55 | zip_safe=False, 56 | # This explicitly tells the wheel systems that we're platform specific. 57 | # Addiitonally, will create a new cPython library with a decorated name 58 | # that is rpath linked to CUDA library, also decorated (by auditwheel). 59 | # Most importantly, pip will manage to install all this stuff in 60 | # in places Python can find it (with a little help). 61 | py_modules=['cufinufftc'], 62 | ext_modules=[ 63 | Extension(name='cufinufftc', 64 | sources=[], 65 | libraries=['cufinufft'], 66 | library_dirs=['lib']) 67 | ] 68 | ) 69 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | v 1.3 (06/10/23) 2 | 3 | * Move second half of onedim_fseries_kernel() to GPU (with a simple heuristic 4 | basing on nf1 to switch between the CPU and the GPU version). 5 | * Melody fixed bug in MAX_NF being 0 due to typecasting 1e11 to int (thanks 6 | Elliot Slaughter for catching that). 7 | * Melody fixed kernel eval so done w*d not w^d times, speeds up 2d a little, 3d 8 | quite a lot! (PR#130) 9 | * Melody added 1D support for both types 1 (GM-sort and SM methods) 2 (GM-sort), 10 | in C++/CUDA and their test executables (but not Python interface). 11 | * Various fixes to package config. 12 | * Miscellaneous bug fixes. 13 | 14 | v 1.2 (02/17/21) 15 | 16 | * Warning: Following are Python interface changes -- not backwards compatible 17 | with v 1.1 (See examples/example2d1,2many.py for updated usage) 18 | 19 | - Made opts a kwarg dict instead of an object: 20 | def __init__(self, ... , opts=None, dtype=np.float32) 21 | => def __init__(self, ... , dtype=np.float32, **kwargs) 22 | - Renamed arguments in plan creation `__init__`: 23 | ntransforms => n_trans, tol => eps 24 | - Changed order of arguments in plan creation `__init__`: 25 | def __init__(self, ... ,isign, eps, ntransforms, opts, dtype) 26 | => def __init__(self, ... 
,ntransforms, eps, isign, opts, dtype) 27 | - Removed M in `set_pts` arguments: 28 | def set_pts(self, M, kx, ky=None, kz=None) 29 | => def set_pts(self, kx, ky=None, kz=None) 30 | 31 | * Python: added multi-gpu support (in beta) 32 | * Python: added more unit tests (wrong input, kwarg args, multi-gpu) 33 | * Fixed various memory leaks 34 | * Added index bound check in 2D spread kernels (Spread_2d_Subprob(_Horner)) 35 | * Added spread/interp tests to `make check` 36 | * Fixed user request tolerance (eps) to kernel width (w) calculation 37 | * Default kernel evaluation method set to 0, ie exp(sqrt()), since faster 38 | * Removed outdated benchmark codes, cleaner spread/interp tests 39 | 40 | v 1.1 (09/22/20) 41 | 42 | * Python: extended the mode tuple to 3D and reorder from C/python 43 | ndarray.shape style input (nZ, nY, nX) to to the (F) order expected by the 44 | low level library (nX, nY, nZ). 45 | * Added bound checking on the bin size 46 | * Dual-precision support of spread/interp tests 47 | * Improved documentation of spread/interp tests 48 | * Added dummy call of cuFFTPlan1d to avoid timing the constant cost of cuFFT 49 | library. 50 | * Added heuristic decision of maximum batch size (number of vectors with the 51 | same nupts to transform at the same time) 52 | * Reported execution throughput in the test codes 53 | * Fixed timing in the tests code 54 | * Professionalized handling of too-small-eps (requested tolerance) 55 | * Rewrote README.md and added cuFINUFFT logo. 56 | * Support of advanced Makefile usage, e.g. make -site=olcf_summit 57 | * Removed FFTW dependency 58 | 59 | v 1.0 (07/29/20) 60 | -------------------------------------------------------------------------------- /src/precision_independent.h: -------------------------------------------------------------------------------- 1 | /* These are functions that do not rely on FLT. 2 | They are organized by originating file. 
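   (The rest of the library is compiled once per precision through the FLT/SINGLE
   switch in contrib/utils_fp.h; the declarations below use fixed double /
   cuDoubleComplex types, so one compiled copy can serve both builds.)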
3 | */ 4 | 5 | #ifndef PRECISION_INDEPENDENT_H 6 | #define PRECISION_INDEPENDENT_H 7 | 8 | /* Auxiliary var/func to compute power of complex number */ 9 | typedef double RT; 10 | typedef cuDoubleComplex CT; 11 | #define rpart(x) (cuCreal(x)) 12 | #define ipart(x) (cuCimag(x)) 13 | #define cmplx(x,y) (make_cuDoubleComplex(x,y)) 14 | 15 | __device__ RT carg(const CT& z); // polar angle 16 | __device__ RT cabs(const CT& z); 17 | __device__ CT cpow(const CT& z, const int &n); 18 | 19 | /* Common Kernels from spreadinterp3d */ 20 | __host__ __device__ 21 | int CalcGlobalIdx(int xidx, int yidx, int zidx, int onx, int ony, int onz, 22 | int bnx, int bny, int bnz); 23 | __device__ 24 | int CalcGlobalIdx_V2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz); 25 | 26 | /* spreadinterp 1d */ 27 | __global__ 28 | void CalcSubProb_1d(int* bin_size, int* num_subprob, int maxsubprobsize, int numbins); 29 | 30 | __global__ 31 | void MapBintoSubProb_1d(int* d_subprob_to_bin, int* d_subprobstartpts, 32 | int* d_numsubprob,int numbins); 33 | 34 | __global__ 35 | void TrivialGlobalSortIdx_1d(int M, int* index); 36 | 37 | /* spreadinterp 2d */ 38 | __global__ 39 | void CalcSubProb_2d(int* bin_size, int* num_subprob, int maxsubprobsize, int numbins); 40 | 41 | __global__ 42 | void MapBintoSubProb_2d(int* d_subprob_to_bin, int* d_subprobstartpts, 43 | int* d_numsubprob,int numbins); 44 | 45 | __global__ 46 | void CalcSubProb_2d_Paul(int* finegridsize, int* num_subprob, 47 | int maxsubprobsize, int bin_size_x, int bin_size_y); 48 | 49 | __global__ 50 | void TrivialGlobalSortIdx_2d(int M, int* index); 51 | 52 | /* spreadinterp3d */ 53 | __global__ 54 | void CalcSubProb_3d_v2(int* bin_size, int* num_subprob, int maxsubprobsize, 55 | int numbins); 56 | 57 | __global__ 58 | void MapBintoSubProb_3d_v2(int* d_subprob_to_bin,int* d_subprobstartpts, 59 | int* d_numsubprob,int numbins); 60 | 61 | __global__ 62 | void CalcSubProb_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, 63 | int* bin_size, int* num_subprob, int maxsubprobsize, int numbins); 64 | 65 | __global__ 66 | void MapBintoSubProb_3d_v1(int* d_subprob_to_obin, int* d_subprobstartpts, 67 | int* d_numsubprob,int numbins); 68 | 69 | __global__ 70 | void TrivialGlobalSortIdx_3d(int M, int* index); 71 | 72 | __global__ 73 | void FillGhostBins(int binsperobinx, int binsperobiny, int binsperobinz, 74 | int nobinx, int nobiny, int nobinz, int* binsize); 75 | 76 | __global__ 77 | void Temp(int binsperobinx, int binsperobiny, int binsperobinz, 78 | int nobinx, int nobiny, int nobinz, int* binsize); 79 | 80 | __global__ 81 | void GhostBinPtsIdx(int binsperobinx, int binsperobiny, int binsperobinz, 82 | int nobinx, int nobiny, int nobinz, int* binsize, int* index, 83 | int* binstartpts, int M); 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /python/cufinufft/tests/test_error_checks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import pycuda.autoinit # NOQA:401 5 | import pycuda.gpuarray as gpuarray 6 | 7 | from cufinufft import cufinufft 8 | 9 | import utils 10 | 11 | 12 | def test_set_nu_raises_on_dtype(): 13 | dtype = np.float32 14 | 15 | M = 4096 16 | tol = 1e-3 17 | shape = (16, 16, 16) 18 | dim = len(shape) 19 | 20 | kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) 21 | 22 | kxyz_gpu = gpuarray.to_gpu(kxyz) 23 | 24 | # Here we'll intentionally contruct an incorrect array dtype. 
25 | kxyz_gpu_wrong_type = gpuarray.to_gpu(kxyz.astype(np.float64)) 26 | 27 | plan = cufinufft(1, shape, eps=tol, dtype=dtype) 28 | 29 | with pytest.raises(TypeError): 30 | plan.set_pts(kxyz_gpu_wrong_type[0], 31 | kxyz_gpu[1], kxyz_gpu[2]) 32 | with pytest.raises(TypeError): 33 | plan.set_pts(kxyz_gpu[0], 34 | kxyz_gpu_wrong_type[1], kxyz_gpu[2]) 35 | with pytest.raises(TypeError): 36 | plan.set_pts(kxyz_gpu[0], 37 | kxyz_gpu[1], kxyz_gpu_wrong_type[2]) 38 | with pytest.raises(TypeError): 39 | plan.set_pts(kxyz_gpu_wrong_type[0], 40 | kxyz_gpu_wrong_type[1], kxyz_gpu_wrong_type[2]) 41 | 42 | 43 | def test_set_pts_raises_on_size(): 44 | dtype = np.float32 45 | 46 | M = 8 47 | tol = 1e-3 48 | shape = (16, 16, 16) 49 | dim = len(shape) 50 | 51 | kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) 52 | 53 | kxyz_gpu = gpuarray.to_gpu(kxyz) 54 | 55 | plan = cufinufft(1, shape, eps=tol, dtype=dtype) 56 | 57 | with pytest.raises(TypeError) as err: 58 | plan.set_pts(kxyz_gpu[0], kxyz_gpu[1][:4]) 59 | assert 'kx and ky must be equal' in err.value.args[0] 60 | 61 | with pytest.raises(TypeError) as err: 62 | plan.set_pts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2][:4]) 63 | assert 'kx and kz must be equal' in err.value.args[0] 64 | 65 | 66 | def test_wrong_field_names(): 67 | with pytest.raises(TypeError) as err: 68 | plan = cufinufft(1, (8, 8), foo="bar") 69 | assert "Invalid option 'foo'" in err.value.args[0] 70 | 71 | 72 | def test_exec_raises_on_dtype(): 73 | dtype = np.float32 74 | complex_dtype = np.complex64 75 | 76 | M = 4096 77 | tol = 1e-3 78 | shape = (16, 16, 16) 79 | dim = len(shape) 80 | 81 | kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) 82 | c = utils.gen_nonuniform_data(M).astype(complex_dtype) 83 | c_gpu = gpuarray.to_gpu(c) 84 | # Using c.real gives us wrong dtype here... 85 | c_gpu_wrong_dtype = gpuarray.to_gpu(c.real) 86 | 87 | kxyz_gpu = gpuarray.to_gpu(kxyz) 88 | fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) 89 | # Here we'll intentionally contruct an incorrect array dtype. 90 | fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128) 91 | 92 | plan = cufinufft(1, shape, eps=tol, dtype=dtype) 93 | 94 | plan.set_pts(kxyz_gpu[0], 95 | kxyz_gpu[1], kxyz_gpu[2]) 96 | 97 | with pytest.raises(TypeError): 98 | plan.execute(c_gpu, fk_gpu_wrong_dtype) 99 | 100 | with pytest.raises(TypeError): 101 | plan.execute(c_gpu_wrong_dtype, fk_gpu) 102 | -------------------------------------------------------------------------------- /examples/example2d1many.cpp: -------------------------------------------------------------------------------- 1 | /* This is an example of performing 2d1many 2 | in single precision. 3 | */ 4 | 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | int main(int argc, char* argv[]) 16 | /* 17 | * example code for 2D Type 1 transformation. 
18 | * 19 | * To compile the code: 20 | * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include /loc/to/cufinufft/lib-static/libcufinufft.a -lcudart -lcufft -lnvToolsExt 21 | * 22 | * or 23 | * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib 24 | * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include -L/loc/to/cufinufft/lib/ -lcufinufft 25 | * 26 | * 27 | */ 28 | { 29 | cout< *c, *fk; 42 | cudaMallocHost(&x, M*sizeof(float)); 43 | cudaMallocHost(&y, M*sizeof(float)); 44 | cudaMallocHost(&c, M*ntransf*sizeof(complex)); 45 | cudaMallocHost(&fk,N1*N2*ntransf*sizeof(complex)); 46 | 47 | float *d_x, *d_y; 48 | cuFloatComplex *d_c, *d_fk; 49 | cudaMalloc(&d_x,M*sizeof(float)); 50 | cudaMalloc(&d_y,M*sizeof(float)); 51 | cudaMalloc(&d_c,M*ntransf*sizeof(cuFloatComplex)); 52 | cudaMalloc(&d_fk,N1*N2*ntransf*sizeof(cuFloatComplex)); 53 | 54 | for (int i=0; i Ft = complex(0,0), J = complex(0,1)*(float)iflag; 93 | for (BIGINT j=0; j 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | /* 16 | * example code for 2D Type 1 transformation. 17 | * 18 | * To compile the code: 19 | * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a -I/loc/to/cufinufft/include -lcudart -lcufft -lnvToolsExt 20 | * 21 | * or 22 | * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib 23 | * nvcc -DSINGLE example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o example2d1 -lcufinufft 24 | * 25 | * 26 | */ 27 | { 28 | cout< *c, *fk; 41 | cudaMallocHost(&x, M*sizeof(double)); 42 | cudaMallocHost(&y, M*sizeof(double)); 43 | cudaMallocHost(&c, M*ntransf*sizeof(complex)); 44 | cudaMallocHost(&fk,N1*N2*ntransf*sizeof(complex)); 45 | 46 | double *d_x, *d_y; 47 | cuDoubleComplex *d_c, *d_fk; 48 | cudaMalloc(&d_x,M*sizeof(double)); 49 | cudaMalloc(&d_y,M*sizeof(double)); 50 | cudaMalloc(&d_c,M*ntransf*sizeof(cuDoubleComplex)); 51 | cudaMalloc(&d_fk,N1*N2*ntransf*sizeof(cuDoubleComplex)); 52 | 53 | for (int i=0; i* fkstart; 89 | complex* cstart; 90 | for(int t=0; t J(0,iflag*1); 95 | complex ct(0,0); 96 | int m=0; 97 | for (int m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F 98 | for (int m1=-(N1/2); m1<=(N1-1)/2; ++m1) 99 | ct += fkstart[m++] * exp(J*(m1*x[jt] + m2*y[jt])); // crude direct 100 | 101 | printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n",t,jt,abs(cstart[jt]-ct)/infnorm(M,c)); 102 | } 103 | 104 | cudaFreeHost(x); 105 | cudaFreeHost(y); 106 | cudaFreeHost(c); 107 | cudaFreeHost(fk); 108 | 109 | cudaFree(d_x); 110 | cudaFree(d_y); 111 | cudaFree(d_c); 112 | cudaFree(d_fk); 113 | return 0; 114 | } 115 | -------------------------------------------------------------------------------- /contrib/dirft2d.cpp: -------------------------------------------------------------------------------- 1 | #include "dirft.h" 2 | #include 3 | 4 | // This is basically a port of dirft2d.f from CMCL package, except with 5 | // the 1/nj prefactors for type-1 removed. 6 | 7 | void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f) 8 | /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1. 9 | c nj-1 10 | c f[k1,k2] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j])) 11 | c j=0 12 | c 13 | c for -ms/2 <= k1 <= (ms-1)/2, -mt/2 <= k2 <= (mt-1)/2. 14 | c The output array is in increasing k1 ordering (fast), then increasing 15 | k2 ordering (slow). 
If iflag>0 the + sign is 16 | c used, otherwise the - sign is used, in the exponential. 17 | * Uses C++ complex type and winding trick. Barnett 1/26/17 18 | */ 19 | { 20 | BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide 21 | BIGINT N = ms*mt; // total # output modes 22 | for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); 25 | CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); 26 | CPX sp1 = pow(a1,(FLT)k1min); // starting phase for most neg k1 freq 27 | CPX p2 = pow(a2,(FLT)k2min); 28 | CPX cc = c[j]; // no 1/nj norm 29 | BIGINT m=0; // output pointer 30 | for (BIGINT m2=0;m20 the + sign is used, otherwise the - sign is used, in the 52 | exponential. 53 | Uses C++ complex type and winding trick. Barnett 1/26/17 54 | */ 55 | { 56 | BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide 57 | for (BIGINT j=0;j0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); 59 | CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); 60 | CPX sp1 = pow(a1,(FLT)k1min); 61 | CPX p2 = pow(a2,(FLT)k2min); 62 | CPX cc = CPX(0,0); 63 | BIGINT m=0; // input pointer 64 | for (BIGINT m2=0;m20 the + sign is used, otherwise the - sign is used, in the 83 | c exponential. Uses C++ complex type. Simple brute force. Barnett 1/26/17 84 | */ 85 | { 86 | for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; 88 | CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k]; 89 | f[k] = CPX(0,0); 90 | for (BIGINT j=0;j /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \ 8 | echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict - 9 | 10 | COPY ci/docker/cuda11.0/cuda.repo /etc/yum.repos.d/cuda.repo 11 | 12 | # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a 13 | RUN yum install -y \ 14 | cuda-cudart-11-0-11.0.171-1 \ 15 | cuda-compat-11-0 \ 16 | && ln -s cuda-11.0 /usr/local/cuda && \ 17 | rm -rf /var/cache/yum/* 18 | 19 | # nvidia-docker 1.0 20 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 21 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 22 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 23 | ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 24 | 25 | # nvidia-container-runtime 26 | ENV NVIDIA_VISIBLE_DEVICES all 27 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 28 | ENV NVIDIA_REQUIRE_CUDA "cuda>=11.0 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441" 29 | 30 | 31 | # runtime 32 | RUN yum install -y \ 33 | cuda-libraries-11-0-11.0.1-1 \ 34 | cuda-nvtx-11-0-11.0.167-1 \ 35 | && rm -rf /var/cache/yum/* 36 | 37 | RUN yum install -y xz && NCCL_DOWNLOAD_SUM=d112b722bf557cff96d571ac3386e4f539be7b3e9412561bde59b0ad6e59263d && \ 38 | curl -fsSL https://developer.download.nvidia.com/compute/redist/nccl/v2.7/nccl_2.7.3-1+cuda11.0_x86_64.txz -O && \ 39 | echo "$NCCL_DOWNLOAD_SUM nccl_2.7.3-1+cuda11.0_x86_64.txz" | sha256sum -c - && \ 40 | unxz nccl_2.7.3-1+cuda11.0_x86_64.txz && \ 41 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/local/cuda/lib64/ --strip-components=2 --wildcards '*/lib/libnccl.so.*' && \ 42 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/lib64/pkgconfig/ --strip-components=3 --wildcards '*/lib/pkgconfig/*' && \ 43 | rm -f nccl_2.7.3-1+cuda11.0_x86_64.tar && \ 44 | ldconfig 45 | 46 | 47 | # devel 48 | RUN yum install -y \ 49 | cuda-nvml-devel-11-0-11.0.167-1 \ 50 | cuda-command-line-tools-11-0-11.0.1-1 \ 51 | cuda-cudart-devel-11-0-11.0.171-1 \ 
52 | cuda-libraries-devel-11-0-11.0.1-1 \ 53 | cuda-minimal-build-11-0-11.0.1-1 \ 54 | libcublas-devel-11-0-11.0.0.191-1 \ 55 | && rm -rf /var/cache/yum/* 56 | 57 | RUN yum install -y xz && NCCL_DOWNLOAD_SUM=d112b722bf557cff96d571ac3386e4f539be7b3e9412561bde59b0ad6e59263d && \ 58 | curl -fsSL https://developer.download.nvidia.com/compute/redist/nccl/v2.7/nccl_2.7.3-1+cuda11.0_x86_64.txz -O && \ 59 | echo "$NCCL_DOWNLOAD_SUM nccl_2.7.3-1+cuda11.0_x86_64.txz" | sha256sum -c - && \ 60 | unxz nccl_2.7.3-1+cuda11.0_x86_64.txz && \ 61 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/local/cuda/include/ --strip-components=2 --wildcards '*/include/*' && \ 62 | tar --no-same-owner --keep-old-files --no-overwrite-dir -xvf nccl_2.7.3-1+cuda11.0_x86_64.tar -C /usr/local/cuda/lib64/ --strip-components=2 --wildcards '*/lib/libnccl.so' && \ 63 | rm -f nccl_2.7.3-1+cuda11.0_x86_64.tar && \ 64 | ldconfig 65 | 66 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 67 | 68 | 69 | # /CUDA # 70 | 71 | 72 | # Okay, so now we can begin cufinufft 73 | 74 | # We need to build the CUDA code now. 75 | # assume we are building container in the root of the git repo... 76 | COPY . /io 77 | WORKDIR /io 78 | RUN make target=manylinux 79 | # And we need to pack it in our LD path 80 | ENV LD_LIBRARY_PATH /io/lib:${LD_LIBRARY_PATH} 81 | 82 | 83 | CMD ["/bin/bash"] 84 | -------------------------------------------------------------------------------- /test/cufinufft2d2api_test.cu: -------------------------------------------------------------------------------- 1 | /* This test should excercise the API 2 | close to how a user might use the code */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1 = 256; 17 | int N2 = 256; 18 | int M = N1*N2; 19 | 20 | double tol=1e-6; 21 | 22 | int iflag=1; 23 | 24 | cout< *c, *fk; 30 | checkCudaErrors(cudaMallocHost(&x, M*sizeof(double))); 31 | checkCudaErrors(cudaMallocHost(&y, M*sizeof(double))); 32 | checkCudaErrors(cudaMallocHost(&c, M*sizeof(complex))); 33 | checkCudaErrors(cudaMallocHost(&fk,N1*N2*sizeof(complex))); 34 | 35 | // malloc device arrays 36 | double *d_x, *d_y; 37 | cuDoubleComplex *d_c, *d_fk; 38 | checkCudaErrors(cudaMalloc(&d_x,M*sizeof(double))); 39 | checkCudaErrors(cudaMalloc(&d_y,M*sizeof(double))); 40 | checkCudaErrors(cudaMalloc(&d_c,M*sizeof(cuDoubleComplex))); 41 | checkCudaErrors(cudaMalloc(&d_fk,N1*N2*sizeof(cuDoubleComplex))); 42 | 43 | // Making data 44 | for (int i = 0; i < M; i++) { 45 | x[i] = M_PI*randm11(); // x in [-pi,pi) 46 | y[i] = M_PI*randm11(); 47 | } 48 | for(int i=0; i), 57 | cudaMemcpyHostToDevice)); 58 | 59 | 60 | // construct plan 61 | cufinufft_plan dplan; 62 | int dim = 2; 63 | int type = 2; 64 | 65 | int nmodes[3]; 66 | int ntransf = 1; 67 | int maxbatchsize = 1; 68 | nmodes[0] = N1; 69 | nmodes[1] = N2; 70 | nmodes[2] = 1; 71 | 72 | ier=cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, 73 | maxbatchsize, &dplan, NULL); 74 | if (ier!=0){ 75 | printf("err: cufinufft2d_plan\n"); 76 | return ier; 77 | } 78 | 79 | 80 | // Set Non uniform points 81 | ier=cufinufft_setpts(M, d_x, d_y, NULL, 0, NULL, NULL, NULL, dplan); 82 | if (ier!=0){ 83 | printf("err: cufinufft_setpts\n"); 84 | return ier; 85 | } 86 | 87 | // Execute the plan on the data 88 | ier=cufinufft_execute(d_c, d_fk, dplan); 89 | if (ier!=0){ 90 | printf("err: cufinufft2d2_exec\n"); 91 | return ier; 
92 | } 93 | 94 | // Destroy the plan when done processing 95 | ier=cufinufft_destroy(dplan); 96 | if (ier!=0){ 97 | printf("err: cufinufft_destroyc\n"); 98 | return ier; 99 | } 100 | 101 | // Copy test data back to host and compare 102 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost)); 103 | int jt = M/2; // check arbitrary choice of one targ pt 104 | complex J = complex(0,1)*(double)iflag; 105 | complex ct = complex(0,0); 106 | int m=0; 107 | for (int m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F 108 | for (int m1=-(N1/2); m1<=(N1-1)/2; ++m1) 109 | ct += fk[m++] * exp(J*(m1*x[jt] + m2*y[jt])); // crude direct 110 | printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n",(int64_t)jt,abs(c[jt]-ct)/infnorm(M,c)); 111 | 112 | 113 | // Cleanup 114 | checkCudaErrors(cudaFreeHost(x)); 115 | checkCudaErrors(cudaFreeHost(y)); 116 | checkCudaErrors(cudaFreeHost(c)); 117 | checkCudaErrors(cudaFreeHost(fk)); 118 | checkCudaErrors(cudaFree(d_x)); 119 | checkCudaErrors(cudaFree(d_y)); 120 | checkCudaErrors(cudaFree(d_c)); 121 | checkCudaErrors(cudaFree(d_fk)); 122 | 123 | return 0; 124 | } 125 | -------------------------------------------------------------------------------- /contrib/spreadinterp.cpp: -------------------------------------------------------------------------------- 1 | #include "spreadinterp.h" 2 | #include 3 | #include 4 | #include 5 | 6 | int setup_spreader(SPREAD_OPTS &opts,FLT eps, FLT upsampfac, int kerevalmeth) 7 | // Initializes spreader kernel parameters given desired NUFFT tolerance eps, 8 | // upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and ker eval meth 9 | // (etiher 0:exp(sqrt()), 1: Horner ppval). 10 | // Also sets all default options in SPREAD_OPTS. See cnufftspread.h for opts. 11 | // Must call before any kernel evals done. 12 | // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) 13 | { 14 | if (upsampfac!=2.0) { // nonstandard sigma 15 | if (kerevalmeth==1) { 16 | fprintf(stderr,"setup_spreader: nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n",(double)upsampfac); 17 | return HORNER_WRONG_BETA; 18 | } 19 | if (upsampfac<=1.0) { 20 | fprintf(stderr,"setup_spreader: error, upsampfac=%.3g is <=1.0\n",(double)upsampfac); 21 | return ERR_UPSAMPFAC_TOO_SMALL; 22 | } 23 | // calling routine must abort on above errors, since opts is garbage! 24 | if (upsampfac>4.0) 25 | fprintf(stderr,"setup_spreader: warning, upsampfac=%.3g is too large to be beneficial!\n",(double)upsampfac); 26 | } 27 | 28 | // defaults... (user can change after this function called) 29 | opts.spread_direction = 1; // user should always set to 1 or 2 as desired 30 | opts.pirange = 1; // user also should always set this 31 | opts.upsampfac = upsampfac; 32 | 33 | // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... 34 | int ier = 0; 35 | if (epsMAX_NSPREAD) { // clip to match allocated arrays 47 | fprintf(stderr,"%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n",__func__, 48 | upsampfac,(double)eps,ns,MAX_NSPREAD); 49 | ns = MAX_NSPREAD; 50 | ier = WARN_EPS_TOO_SMALL; 51 | } 52 | opts.nspread = ns; 53 | opts.ES_halfwidth=(FLT)ns/2; // constants to help ker eval (except Horner) 54 | opts.ES_c = 4.0/(FLT)(ns*ns); 55 | 56 | FLT betaoverns = 2.30; // gives decent betas for default sigma=2.0 57 | if (ns==2) betaoverns = 2.20; // some small-width tweaks... 
58 | if (ns==3) betaoverns = 2.26; 59 | if (ns==4) betaoverns = 2.38; 60 | if (upsampfac!=2.0) { // again, override beta for custom sigma 61 | FLT gamma=0.97; // must match devel/gen_all_horner_C_code.m 62 | betaoverns = gamma*PI*(1-1/(2*upsampfac)); // formula based on cutoff 63 | } 64 | opts.ES_beta = betaoverns * (FLT)ns; // set the kernel beta parameter 65 | //fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta); // user hasn't set debug yet 66 | return ier; 67 | } 68 | 69 | FLT evaluate_kernel(FLT x, const SPREAD_OPTS &opts) 70 | /* ES ("exp sqrt") kernel evaluation at single real argument: 71 | phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 72 | related to an asymptotic approximation to the Kaiser--Bessel, itself an 73 | approximation to prolate spheroidal wavefunction (PSWF) of order 0. 74 | This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ 75 | { 76 | if (abs(x)>=opts.ES_halfwidth) 77 | // if spreading/FT careful, shouldn't need this if, but causes no speed hit 78 | return 0.0; 79 | else 80 | return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c*x*x)); 81 | } 82 | 83 | -------------------------------------------------------------------------------- /test/cufinufft2d2api_test_32.cu: -------------------------------------------------------------------------------- 1 | /* This test should excercise the API 2 | close to how a user might use the code 3 | 4 | Note this single precision version changes 5 | doubles ~~> float and 6 | cufinufft_* ~~> cufinufftf_* function names. 7 | 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include 19 | #include "../contrib/utils.h" 20 | 21 | using namespace std; 22 | 23 | typedef std::complex CPX; 24 | 25 | int main(int argc, char* argv[]) 26 | { 27 | int N1 = 256; 28 | int N2 = 256; 29 | int M = N1*N2; 30 | 31 | float tol=1e-6; 32 | 33 | int iflag=1; 34 | 35 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1; 14 | FLT upsampfac=2.0; 15 | int N1, M; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: interp1d method nupts_distr nf1 [M [tol [kerevalmeth [sort]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven\n" 22 | " nupts_distr: The distribution of the points; one of\n" 23 | " 0: uniform, or\n" 24 | " 1: concentrated in a small region.\n" 25 | " nf1: The size of the 2D array.\n" 26 | " M: The number of non-uniform points (default nf1 / 2).\n" 27 | " tol: NUFFT tolerance (default 1e-6).\n" 28 | " kerevalmeth: Kernel evaluation method; one of\n" 29 | " 0: Exponential of square root (default), or\n" 30 | " 1: Horner evaluation.\n" 31 | " sort: One of\n" 32 | " 0: do not sort the points, or\n" 33 | " 1: sort the points (default).\n"); 34 | return 1; 35 | } 36 | double w; 37 | int method; 38 | sscanf(argv[1],"%d",&method); 39 | int nupts_distribute; 40 | sscanf(argv[2],"%d",&nupts_distribute); 41 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 42 | 43 | N1 = (int) nf1/upsampfac; 44 | M = N1;// let density always be 1 45 | if(argc>4){ 46 | sscanf(argv[4],"%lf",&w); M = (int)w; // so can read 1e6 right! 47 | if(M == 0) M=N1; 48 | } 49 | 50 | FLT tol=1e-6; 51 | if(argc>5){ 52 | sscanf(argv[5],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
53 | } 54 | 55 | int kerevalmeth=0; 56 | if(argc>6){ 57 | sscanf(argv[6],"%d",&kerevalmeth); 58 | } 59 | 60 | int sort=1; 61 | if(argc>7){ 62 | sscanf(argv[7],"%d",&sort); 63 | } 64 | 65 | int ier; 66 | cout<opts)); 85 | dplan->opts.gpu_method = method; 86 | dplan->opts.gpu_maxsubprobsize = 1024; 87 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 88 | dplan->opts.gpu_sort = sort; 89 | dplan->opts.gpu_spreadinterponly = 1; 90 | dplan->opts.gpu_binsizex = 1024; //binsize needs to be set here, since 91 | //SETUP_BINSIZE() is not called in 92 | //spread, interp only wrappers. 93 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 94 | 95 | switch(nupts_distribute){ 96 | case 0: //uniform 97 | { 98 | for (int i = 0; i < M; i++) { 99 | x[i] = M_PI*randm11();// x in [-pi,pi) 100 | } 101 | } 102 | break; 103 | case 1: // concentrate on a small region 104 | { 105 | for (int i = 0; i < M; i++) { 106 | x[i] = M_PI*rand01()/(nf1*2/32);// x in [-pi,pi) 107 | } 108 | } 109 | break; 110 | } 111 | for(int i=0; iopts.gpu_method,nf1,M,t,M/t); 129 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(CUCPX),cudaMemcpyDeviceToHost)); 130 | #ifdef RESULT 131 | cout<<"[result-input]"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../src/cuspreadinterp.h" 8 | #include "../contrib/utils.h" 9 | 10 | using namespace std; 11 | 12 | int main(int argc, char* argv[]) 13 | { 14 | int nf1, N1, M; 15 | FLT upsampfac=2.0; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: spread1d_test method nupts_distr nf1 [maxsubprobsize [M [tol [kerevalmeth]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem\n" 23 | " nupts_distr: The distribution of the points; one of\n" 24 | " 0: uniform, or\n" 25 | " 1: concentrated in a small region.\n" 26 | " nf1: The size of the 1D array.\n" 27 | " maxsubprobsize: Maximum size of subproblems (default 65536).\n" 28 | " M: The number of non-uniform points (default nf1 / 2).\n" 29 | " tol: NUFFT tolerance (default 1e-6).\n" 30 | " kerevalmeth: Kernel evaluation method; one of\n" 31 | " 0: Exponential of square root (default), or\n" 32 | " 1: Horner evaluation.\n"); 33 | return 1; 34 | } 35 | double w; 36 | int method; 37 | sscanf(argv[1],"%d",&method); 38 | 39 | int nupts_distribute; 40 | sscanf(argv[2],"%d",&nupts_distribute); 41 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 42 | 43 | int maxsubprobsize=65536; 44 | if(argc>4){ 45 | sscanf(argv[4],"%d",&maxsubprobsize); 46 | } 47 | 48 | N1 = (int) nf1/upsampfac; 49 | M = N1; 50 | if(argc>5){ 51 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 52 | } 53 | 54 | FLT tol=1e-6; 55 | if(argc>6){ 56 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 57 | } 58 | 59 | int kerevalmeth=0; 60 | if(argc>7){ 61 | sscanf(argv[7],"%d",&kerevalmeth); 62 | } 63 | 64 | int ier; 65 | int dim=1; 66 | 67 | CUFINUFFT_PLAN dplan = new CUFINUFFT_PLAN_S; 68 | // Zero out your struct, (sets all pointers to NULL, crucial) 69 | memset(dplan, 0, sizeof(*dplan)); 70 | ier = CUFINUFFT_DEFAULT_OPTS(1 /*type*/, dim, &(dplan->opts)); 71 | 72 | dplan->opts.gpu_method = method; 73 | dplan->opts.gpu_maxsubprobsize = maxsubprobsize; 74 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 75 | dplan->opts.gpu_sort = 1; // ahb changed from 0 76 | dplan->opts.gpu_spreadinterponly = 1; 77 | dplan->opts.gpu_binsizex = 1024; //binsize needs to be set here, since 78 | //SETUP_BINSIZE() is not called in 79 | //spread, interp only wrappers. 
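	// setup_spreader_for_nufft fills dplan->spopts from the requested tolerance;
	// cf. setup_spreader() in contrib/spreadinterp.cpp, which documents how tol
	// and upsampfac map to the kernel width ns and the ES beta parameter.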
80 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 81 | 82 | cout<opts.gpu_method,M,nf1,t,M/t); 133 | 134 | checkCudaErrors(cudaMemcpy(fw,d_fw,nf1*sizeof(CUCPX), 135 | cudaMemcpyDeviceToHost)); 136 | #ifdef RESULT 137 | cout<<"[result-input]"<opts.gpu_binsizex == 0 && i!=0) 140 | printf(" |"); 141 | printf(" (%2.3g,%2.3g)",fw[i].real(),fw[i].imag() ); 142 | } 143 | #endif 144 | 145 | cudaFreeHost(x); 146 | cudaFreeHost(c); 147 | cudaFreeHost(fw); 148 | cudaFree(d_x); 149 | cudaFree(d_c); 150 | cudaFree(d_fw); 151 | return 0; 152 | } 153 | -------------------------------------------------------------------------------- /python/cufinufft/_cufinufft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This file contains low level python bindings for the cufinufft CUDA libraries. 4 | Seperate bindings are provided for single and double precision libraries, 5 | differentiated by 'f' suffix. 6 | """ 7 | 8 | import ctypes 9 | import os 10 | import warnings 11 | 12 | # While imp is deprecated, it is currently the inspection solution 13 | # that works for all versions of Python 2 and 3. 14 | # One day if that changes, can be replaced 15 | # with importlib.find_spec. 16 | with warnings.catch_warnings(): 17 | warnings.filterwarnings("ignore", category=DeprecationWarning) 18 | import imp 19 | 20 | import numpy as np 21 | 22 | from ctypes import c_double 23 | from ctypes import c_int 24 | from ctypes import c_float 25 | from ctypes import c_void_p 26 | 27 | c_int_p = ctypes.POINTER(c_int) 28 | c_float_p = ctypes.POINTER(c_float) 29 | c_double_p = ctypes.POINTER(c_double) 30 | 31 | # TODO: See if there is a way to improve this so it is less hacky. 32 | lib = None 33 | # Try to load a local library directly. 34 | try: 35 | lib = ctypes.cdll.LoadLibrary('libcufinufft.so') 36 | except OSError: 37 | pass 38 | 39 | # Should that not work, try to find the full path of a packaged lib. 40 | # The packaged lib should have a py/platform decorated name, 41 | # and be rpath'ed the true CUDA C cufinufft library through the 42 | # Extension and wheel systems. 43 | try: 44 | if lib is None: 45 | # Find the library. 46 | fh = imp.find_module('cufinufftc')[0] 47 | # Get the full path for the ctypes loader. 48 | full_lib_path = os.path.realpath(fh.name) 49 | fh.close() # Be nice and close the open file handle. 50 | 51 | # Load the library, 52 | # which rpaths the libraries we care about. 53 | lib = ctypes.cdll.LoadLibrary(full_lib_path) 54 | 55 | except Exception: 56 | raise RuntimeError('Failed to find a suitable cufinufft library') 57 | 58 | 59 | def _get_ctypes(dtype): 60 | """ 61 | Checks dtype is float32 or float64. 62 | Returns floating point and floating point pointer. 
63 | """ 64 | 65 | if dtype == np.float64: 66 | REAL_t = c_double 67 | elif dtype == np.float32: 68 | REAL_t = c_float 69 | else: 70 | raise TypeError("Expected np.float32 or np.float64.") 71 | 72 | REAL_ptr = ctypes.POINTER(REAL_t) 73 | 74 | return REAL_t, REAL_ptr 75 | 76 | 77 | def _get_NufftOpts(): 78 | fields = [ 79 | ('upsampfac', c_double), 80 | ('gpu_method', c_int), 81 | ('gpu_sort', c_int), 82 | ('gpu_binsizex', c_int), 83 | ('gpu_binsizey', c_int), 84 | ('gpu_binsizez', c_int), 85 | ('gpu_obinsizex', c_int), 86 | ('gpu_obinsizey', c_int), 87 | ('gpu_obinsizez', c_int), 88 | ('gpu_maxsubprobsize', c_int), 89 | ('gpu_nstreams', c_int), 90 | ('gpu_kerevalmeth', c_int), 91 | ('gpu_spreadinterponly', c_int), 92 | ('gpu_device_id', c_int)] 93 | return fields 94 | 95 | 96 | class NufftOpts(ctypes.Structure): 97 | pass 98 | 99 | 100 | NufftOpts._fields_ = _get_NufftOpts() 101 | 102 | 103 | CufinufftPlan = c_void_p 104 | CufinufftPlanf = c_void_p 105 | 106 | CufinufftPlan_p = ctypes.POINTER(CufinufftPlan) 107 | CufinufftPlanf_p = ctypes.POINTER(CufinufftPlanf) 108 | 109 | NufftOpts_p = ctypes.POINTER(NufftOpts) 110 | 111 | _default_opts = lib.cufinufft_default_opts 112 | _default_opts.argtypes = [c_int, c_int, NufftOpts_p] 113 | _default_opts.restype = c_int 114 | 115 | _make_plan = lib.cufinufft_makeplan 116 | _make_plan.argtypes = [ 117 | c_int, c_int, c_int_p, c_int, 118 | c_int, c_double, c_int, CufinufftPlan_p, NufftOpts_p] 119 | _make_plan.restypes = c_int 120 | 121 | _make_planf = lib.cufinufftf_makeplan 122 | _make_planf.argtypes = [ 123 | c_int, c_int, c_int_p, c_int, 124 | c_int, c_float, c_int, CufinufftPlanf_p, NufftOpts_p] 125 | _make_planf.restypes = c_int 126 | 127 | _set_pts = lib.cufinufft_setpts 128 | _set_pts.argtypes = [ 129 | c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p, 130 | c_double_p, c_double_p, c_void_p] 131 | _set_pts.restype = c_int 132 | 133 | _set_ptsf = lib.cufinufftf_setpts 134 | _set_ptsf.argtypes = [ 135 | c_int, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p, 136 | c_float_p, c_float_p, c_void_p] 137 | _set_ptsf.restype = c_int 138 | 139 | _exec_plan = lib.cufinufft_execute 140 | _exec_plan.argtypes = [c_void_p, c_void_p, c_void_p] 141 | _exec_plan.restype = c_int 142 | 143 | _exec_planf = lib.cufinufftf_execute 144 | _exec_planf.argtypes = [c_void_p, c_void_p, c_void_p] 145 | _exec_planf.restype = c_int 146 | 147 | _destroy_plan = lib.cufinufft_destroy 148 | _destroy_plan.argtypes = [c_void_p] 149 | _destroy_plan.restype = c_int 150 | 151 | _destroy_planf = lib.cufinufftf_destroy 152 | _destroy_planf.argtypes = [c_void_p] 153 | _destroy_planf.restype = c_int 154 | -------------------------------------------------------------------------------- /src/1d/interp1d_wrapper.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include "../cuspreadinterp.h" 7 | #include "../memtransfer.h" 8 | #include 9 | 10 | using namespace std; 11 | 12 | int CUFINUFFT_INTERP1D(int nf1, CUCPX* d_fw, int M, FLT *d_kx, CUCPX *d_c, 13 | CUFINUFFT_PLAN d_plan) 14 | /* 15 | This c function is written for only doing 1D interpolation. See 16 | test/interp1d_test.cu for usage. 17 | 18 | note: not allocate,transfer and free memories on gpu. 
19 | Melody Shih 11/21/21 20 | */ 21 | { 22 | cudaEvent_t start, stop; 23 | cudaEventCreate(&start); 24 | cudaEventCreate(&stop); 25 | 26 | d_plan->nf1 = nf1; 27 | d_plan->M = M; 28 | d_plan->maxbatchsize = 1; 29 | 30 | d_plan->kx = d_kx; 31 | d_plan->c = d_c; 32 | d_plan->fw = d_fw; 33 | 34 | int ier; 35 | cudaEventRecord(start); 36 | ier = ALLOCGPUMEM1D_PLAN(d_plan); 37 | ier = ALLOCGPUMEM1D_NUPTS(d_plan); 38 | if(d_plan->opts.gpu_method == 1){ 39 | ier = CUSPREAD1D_NUPTSDRIVEN_PROP(nf1,M,d_plan); 40 | if(ier != 0 ){ 41 | printf("error: cuspread1d_subprob_prop, method(%d)\n", 42 | d_plan->opts.gpu_method); 43 | return ier; 44 | } 45 | } 46 | if(d_plan->opts.gpu_method == 2){ 47 | ier = CUSPREAD1D_SUBPROB_PROP(nf1,M,d_plan); 48 | if(ier != 0 ){ 49 | printf("error: cuspread1d_subprob_prop, method(%d)\n", 50 | d_plan->opts.gpu_method); 51 | return ier; 52 | } 53 | } 54 | #ifdef TIME 55 | float milliseconds = 0; 56 | cudaEventRecord(stop); 57 | cudaEventSynchronize(stop); 58 | cudaEventElapsedTime(&milliseconds, start, stop); 59 | printf("[time ] Obtain Interp Prop\t %.3g ms\n", milliseconds); 60 | #endif 61 | cudaEventRecord(start); 62 | ier = CUINTERP1D(d_plan,1); 63 | #ifdef TIME 64 | cudaEventRecord(stop); 65 | cudaEventSynchronize(stop); 66 | cudaEventElapsedTime(&milliseconds, start, stop); 67 | printf("[time ] Interp (%d)\t\t %.3g ms\n", d_plan->opts.gpu_method, 68 | milliseconds); 69 | #endif 70 | cudaEventRecord(start); 71 | FREEGPUMEMORY1D(d_plan); 72 | #ifdef TIME 73 | cudaEventRecord(stop); 74 | cudaEventSynchronize(stop); 75 | cudaEventElapsedTime(&milliseconds, start, stop); 76 | printf("[time ] Free GPU memory\t %.3g ms\n", milliseconds); 77 | #endif 78 | return ier; 79 | } 80 | 81 | int CUINTERP1D(CUFINUFFT_PLAN d_plan, int blksize) 82 | /* 83 | A wrapper for different interpolation methods. 
84 | 85 | Methods available: 86 | (1) Non-uniform points driven 87 | (2) Subproblem 88 | 89 | Melody Shih 11/21/21 90 | */ 91 | { 92 | int nf1 = d_plan->nf1; 93 | int M = d_plan->M; 94 | 95 | cudaEvent_t start, stop; 96 | cudaEventCreate(&start); 97 | cudaEventCreate(&stop); 98 | 99 | int ier; 100 | switch(d_plan->opts.gpu_method) 101 | { 102 | case 1: 103 | { 104 | cudaEventRecord(start); 105 | { 106 | ier = CUINTERP1D_NUPTSDRIVEN(nf1, M, d_plan, blksize); 107 | if(ier != 0 ){ 108 | cout<<"error: cnufftspread1d_gpu_nuptsdriven"<spopts.nspread; // psi's support in terms of number of cells 138 | FLT es_c=d_plan->spopts.ES_c; 139 | FLT es_beta=d_plan->spopts.ES_beta; 140 | FLT sigma=d_plan->opts.upsampfac; 141 | int pirange=d_plan->spopts.pirange; 142 | int *d_idxnupts=d_plan->idxnupts; 143 | 144 | FLT* d_kx = d_plan->kx; 145 | CUCPX* d_c = d_plan->c; 146 | CUCPX* d_fw = d_plan->fw; 147 | 148 | threadsPerBlock.x = 32; 149 | threadsPerBlock.y = 1; 150 | blocks.x = (M + threadsPerBlock.x - 1)/threadsPerBlock.x; 151 | blocks.y = 1; 152 | 153 | cudaEventRecord(start); 154 | if(d_plan->opts.gpu_kerevalmeth){ 155 | for(int t=0; t>>(d_kx, 157 | d_c+t*M, d_fw+t*nf1, M, ns, nf1, sigma, d_idxnupts, pirange); 158 | } 159 | }else{ 160 | for(int t=0; t>>(d_kx, 162 | d_c+t*M, d_fw+t*nf1, M, ns, nf1, es_c, es_beta, d_idxnupts, pirange); 163 | } 164 | } 165 | #ifdef SPREADTIME 166 | float milliseconds = 0; 167 | cudaEventRecord(stop); 168 | cudaEventSynchronize(stop); 169 | cudaEventElapsedTime(&milliseconds, start, stop); 170 | printf("[time ] \tKernel Interp_1d_NUptsdriven (%d)\t%.3g ms\n", 171 | milliseconds, d_plan->opts.gpu_kerevalmeth); 172 | #endif 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /test/interp2d_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1, nf2; 14 | FLT upsampfac=2.0; 15 | int N1, N2, M; 16 | if (argc<5) { 17 | fprintf(stderr, 18 | "Usage: interp2d method nupts_distr nf1 nf2 [M [tol [kerevalmeth [sort]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem.\n" 23 | " nupts_distr: The distribution of the points; one of\n" 24 | " 0: uniform, or\n" 25 | " 1: concentrated in a small region.\n" 26 | " nf1, nf2: The size of the 2D array.\n" 27 | " M: The number of non-uniform points (default nf1 * nf2 / 4).\n" 28 | " tol: NUFFT tolerance (default 1e-6).\n" 29 | " kerevalmeth: Kernel evaluation method; one of\n" 30 | " 0: Exponential of square root (default), or\n" 31 | " 1: Horner evaluation.\n" 32 | " sort: One of\n" 33 | " 0: do not sort the points, or\n" 34 | " 1: sort the points (default).\n"); 35 | return 1; 36 | } 37 | double w; 38 | int method; 39 | sscanf(argv[1],"%d",&method); 40 | int nupts_distribute; 41 | sscanf(argv[2],"%d",&nupts_distribute); 42 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 43 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 44 | 45 | N1 = (int) nf1/upsampfac; 46 | N2 = (int) nf2/upsampfac; 47 | M = N1*N2;// let density always be 1 48 | if(argc>5){ 49 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 
50 | if(M == 0) M=N1*N2; 51 | } 52 | 53 | FLT tol=1e-6; 54 | if(argc>6){ 55 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 56 | } 57 | 58 | int kerevalmeth=0; 59 | if(argc>7){ 60 | sscanf(argv[7],"%d",&kerevalmeth); 61 | } 62 | 63 | int sort=1; 64 | if(argc>8){ 65 | sscanf(argv[8],"%d",&sort); 66 | } 67 | 68 | int ier; 69 | cout<opts)); 90 | dplan->opts.gpu_method = method; 91 | dplan->opts.gpu_maxsubprobsize = 1024; 92 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 93 | dplan->opts.gpu_sort = sort; 94 | dplan->opts.gpu_spreadinterponly = 1; 95 | dplan->opts.gpu_binsizex = 32; //binsize needs to be set here, since 96 | //SETUP_BINSIZE() is not called in 97 | //spread, interp only wrappers. 98 | dplan->opts.gpu_binsizey = 32; 99 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 100 | 101 | switch(nupts_distribute){ 102 | case 0: //uniform 103 | { 104 | for (int i = 0; i < M; i++) { 105 | x[i] = M_PI*randm11();// x in [-pi,pi) 106 | y[i] = M_PI*randm11(); 107 | } 108 | } 109 | break; 110 | case 1: // concentrate on a small region 111 | { 112 | for (int i = 0; i < M; i++) { 113 | x[i] = M_PI*rand01()/(nf1*2/32);// x in [-pi,pi) 114 | y[i] = M_PI*rand01()/(nf2*2/32); 115 | } 116 | } 117 | break; 118 | } 119 | for(int i=0; iopts.gpu_method,nf1*nf2,M,t,M/t); 138 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(CUCPX),cudaMemcpyDeviceToHost)); 139 | #ifdef RESULT 140 | cout<<"[result-input]"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "../cuspreadinterp.h" 10 | #include "../cudeconvolve.h" 11 | #include "../memtransfer.h" 12 | 13 | using namespace std; 14 | 15 | int CUFINUFFT3D1_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 16 | /* 17 | 3D Type-1 NUFFT 18 | 19 | This function is called in "exec" stage (See ../cufinufft.cu). 20 | It includes (copied from doc in finufft library) 21 | Step 1: spread data to oversampled regular mesh using kernel 22 | Step 2: compute FFT on uniform mesh 23 | Step 3: deconvolve by division of each Fourier mode independently by the 24 | Fourier series coefficient of the kernel. 
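	As a rough orientation (a sketch assembled from the ctypes declarations in
	python/cufinufft/_cufinufft.py, not a verbatim copy of the public header),
	the double-precision call sequence that reaches this routine for a 3D
	type-1 transform, with d_x, d_y, d_z, d_c, d_fk already on the device, is
	approximately:

	    cufinufft_opts opts;  cufinufft_plan plan;        // opaque handle
	    int nmodes[3] = {ms, mt, mu};
	    cufinufft_default_opts(1, 3, &opts);               // type, dim
	    cufinufft_makeplan(1, 3, nmodes, iflag, ntransf, tol,
	                       maxbatchsize, &plan, &opts);
	    cufinufft_setpts(M, d_x, d_y, d_z, 0, NULL, NULL, NULL, plan);
	    cufinufft_execute(d_c, d_fk, plan);                // lands here
	    cufinufft_destroy(plan);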
25 | 26 | Melody Shih 07/25/19 27 | */ 28 | { 29 | cudaEvent_t start, stop; 30 | cudaEventCreate(&start); 31 | cudaEventCreate(&stop); 32 | 33 | cudaEventRecord(start); 34 | int blksize; 35 | int ier; 36 | CUCPX* d_fkstart; 37 | CUCPX* d_cstart; 38 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 39 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 40 | d_plan->maxbatchsize); 41 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 42 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt* 43 | d_plan->mu; 44 | 45 | d_plan->c = d_cstart; 46 | d_plan->fk = d_fkstart; 47 | 48 | checkCudaErrors(cudaMemset(d_plan->fw,0,d_plan->maxbatchsize* 49 | d_plan->nf1*d_plan->nf2*d_plan->nf3*sizeof(CUCPX))); 50 | #ifdef TIME 51 | float milliseconds = 0; 52 | cudaEventRecord(stop); 53 | cudaEventSynchronize(stop); 54 | cudaEventElapsedTime(&milliseconds, start, stop); 55 | printf("[time ] \tInitialize fw\t\t %.3g s\n", milliseconds/1000); 56 | #endif 57 | // Step 1: Spread 58 | cudaEventRecord(start); 59 | ier = CUSPREAD3D(d_plan, blksize); 60 | if(ier != 0 ){ 61 | printf("error: cuspread3d, method(%d)\n", d_plan->opts.gpu_method); 62 | return ier; 63 | } 64 | #ifdef TIME 65 | cudaEventRecord(stop); 66 | cudaEventSynchronize(stop); 67 | cudaEventElapsedTime(&milliseconds, start, stop); 68 | printf("[time ] \tSpread (%d)\t\t %.3g s\n", milliseconds/1000, 69 | d_plan->opts.gpu_method); 70 | #endif 71 | // Step 2: FFT 72 | cudaEventRecord(start); 73 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 74 | #ifdef TIME 75 | cudaEventRecord(stop); 76 | cudaEventSynchronize(stop); 77 | cudaEventElapsedTime(&milliseconds, start, stop); 78 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 79 | #endif 80 | 81 | // Step 3: deconvolve and shuffle 82 | cudaEventRecord(start); 83 | CUDECONVOLVE3D(d_plan, blksize); 84 | #ifdef TIME 85 | cudaEventRecord(stop); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&milliseconds, start, stop); 88 | printf("[time ] \tDeconvolve\t\t %.3g s\n", milliseconds/1000); 89 | #endif 90 | } 91 | return ier; 92 | } 93 | 94 | int CUFINUFFT3D2_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 95 | /* 96 | 3D Type-2 NUFFT 97 | 98 | This function is called in "exec" stage (See ../cufinufft.cu). 
99 | It includes (copied from doc in finufft library) 100 | Step 1: deconvolve (amplify) each Fourier mode, dividing by kernel 101 | Fourier coeff 102 | Step 2: compute FFT on uniform mesh 103 | Step 3: interpolate data to regular mesh 104 | 105 | Melody Shih 07/25/19 106 | */ 107 | { 108 | cudaEvent_t start, stop; 109 | cudaEventCreate(&start); 110 | cudaEventCreate(&stop); 111 | 112 | int blksize; 113 | int ier; 114 | CUCPX* d_fkstart; 115 | CUCPX* d_cstart; 116 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 117 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 118 | d_plan->maxbatchsize); 119 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 120 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt* 121 | d_plan->mu; 122 | 123 | d_plan->c = d_cstart; 124 | d_plan->fk = d_fkstart; 125 | 126 | // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw 127 | cudaEventRecord(start); 128 | CUDECONVOLVE3D(d_plan, blksize); 129 | #ifdef TIME 130 | float milliseconds = 0; 131 | cudaEventRecord(stop); 132 | cudaEventSynchronize(stop); 133 | cudaEventElapsedTime(&milliseconds, start, stop); 134 | printf("[time ] \tAmplify & Copy fktofw\t %.3g s\n", milliseconds/1000); 135 | #endif 136 | // Step 2: FFT 137 | cudaEventRecord(start); 138 | cudaDeviceSynchronize(); 139 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 140 | #ifdef TIME 141 | cudaEventRecord(stop); 142 | cudaEventSynchronize(stop); 143 | cudaEventElapsedTime(&milliseconds, start, stop); 144 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 145 | #endif 146 | 147 | // Step 3: deconvolve and shuffle 148 | cudaEventRecord(start); 149 | ier = CUINTERP3D(d_plan, blksize); 150 | if(ier != 0 ){ 151 | printf("error: cuinterp3d, method(%d)\n", d_plan->opts.gpu_method); 152 | return ier; 153 | } 154 | #ifdef TIME 155 | cudaEventRecord(stop); 156 | cudaEventSynchronize(stop); 157 | cudaEventElapsedTime(&milliseconds, start, stop); 158 | printf("[time ] \tUnspread (%d)\t\t %.3g s\n", milliseconds/1000, 159 | d_plan->opts.gpu_method); 160 | #endif 161 | } 162 | 163 | return ier; 164 | } 165 | -------------------------------------------------------------------------------- /src/1d/cufinufft1d.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "../cuspreadinterp.h" 10 | #include "../cudeconvolve.h" 11 | #include "../memtransfer.h" 12 | 13 | using namespace std; 14 | 15 | int CUFINUFFT1D1_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 16 | /* 17 | 1D Type-1 NUFFT 18 | 19 | This function is called in "exec" stage (See ../cufinufft.cu). 20 | It includes (copied from doc in finufft library) 21 | Step 1: spread data to oversampled regular mesh using kernel 22 | Step 2: compute FFT on uniform mesh 23 | Step 3: deconvolve by division of each Fourier mode independently by the 24 | Fourier series coefficient of the kernel. 
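	The ntransf input vectors are processed in batches of at most
	maxbatchsize per pass of the loop below, i.e.

	    blksize = min(ntransf - i*maxbatchsize, maxbatchsize),

	with the strengths and modes for pass i read at offsets i*maxbatchsize*M
	and i*maxbatchsize*ms.  As a purely illustrative worked example,
	ntransf = 5 with maxbatchsize = 2 gives three passes with blksize
	2, 2 and 1.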
25 | 26 | Melody Shih 11/21/21 27 | */ 28 | { 29 | assert(d_plan->spopts.spread_direction == 1); 30 | cudaEvent_t start, stop; 31 | cudaEventCreate(&start); 32 | cudaEventCreate(&stop); 33 | 34 | cudaEventRecord(start); 35 | int blksize; 36 | int ier; 37 | CUCPX* d_fkstart; 38 | CUCPX* d_cstart; 39 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 40 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 41 | d_plan->maxbatchsize); 42 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 43 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms; 44 | d_plan->c = d_cstart; 45 | d_plan->fk = d_fkstart; 46 | 47 | checkCudaErrors(cudaMemset(d_plan->fw,0,d_plan->maxbatchsize* 48 | d_plan->nf1*sizeof(CUCPX)));// this is needed 49 | #ifdef TIME 50 | float milliseconds = 0; 51 | cudaEventRecord(stop); 52 | cudaEventSynchronize(stop); 53 | cudaEventElapsedTime(&milliseconds, start, stop); 54 | printf("[time ] \tInitialize fw to 0\t %.3g s\n", 55 | milliseconds/1000); 56 | #endif 57 | // Step 1: Spread 58 | cudaEventRecord(start); 59 | ier = CUSPREAD1D(d_plan,blksize); 60 | if(ier != 0 ){ 61 | printf("error: cuspread1d, method(%d)\n", d_plan->opts.gpu_method); 62 | return ier; 63 | } 64 | #ifdef TIME 65 | cudaEventRecord(stop); 66 | cudaEventSynchronize(stop); 67 | cudaEventElapsedTime(&milliseconds, start, stop); 68 | printf("[time ] \tSpread (%d)\t\t %.3g s\n", milliseconds/1000, 69 | d_plan->opts.gpu_method); 70 | #endif 71 | // Step 2: FFT 72 | cudaEventRecord(start); 73 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 74 | #ifdef TIME 75 | cudaEventRecord(stop); 76 | cudaEventSynchronize(stop); 77 | cudaEventElapsedTime(&milliseconds, start, stop); 78 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 79 | #endif 80 | 81 | // Step 3: deconvolve and shuffle 82 | cudaEventRecord(start); 83 | CUDECONVOLVE1D(d_plan,blksize); 84 | #ifdef TIME 85 | cudaEventRecord(stop); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&milliseconds, start, stop); 88 | printf("[time ] \tDeconvolve\t\t %.3g s\n", milliseconds/1000); 89 | #endif 90 | } 91 | return ier; 92 | } 93 | 94 | int CUFINUFFT1D2_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 95 | /* 96 | 1D Type-2 NUFFT 97 | 98 | This function is called in "exec" stage (See ../cufinufft.cu). 
99 | It includes (copied from doc in finufft library) 100 | Step 1: deconvolve (amplify) each Fourier mode, dividing by kernel 101 | Fourier coeff 102 | Step 2: compute FFT on uniform mesh 103 | Step 3: interpolate data to regular mesh 104 | 105 | Melody Shih 11/21/21 106 | */ 107 | { 108 | assert(d_plan->spopts.spread_direction == 2); 109 | 110 | cudaEvent_t start, stop; 111 | cudaEventCreate(&start); 112 | cudaEventCreate(&stop); 113 | 114 | cudaEventRecord(start); 115 | int blksize; 116 | int ier; 117 | CUCPX* d_fkstart; 118 | CUCPX* d_cstart; 119 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 120 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 121 | d_plan->maxbatchsize); 122 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 123 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms; 124 | 125 | d_plan->c = d_cstart; 126 | d_plan->fk = d_fkstart; 127 | 128 | // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw 129 | cudaEventRecord(start); 130 | CUDECONVOLVE1D(d_plan,blksize); 131 | #ifdef TIME 132 | float milliseconds = 0; 133 | cudaEventRecord(stop); 134 | cudaEventSynchronize(stop); 135 | cudaEventElapsedTime(&milliseconds, start, stop); 136 | printf("[time ] \tAmplify & Copy fktofw\t %.3g s\n", milliseconds/1000); 137 | #endif 138 | // Step 2: FFT 139 | cudaDeviceSynchronize(); 140 | cudaEventRecord(start); 141 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 142 | #ifdef TIME 143 | cudaEventRecord(stop); 144 | cudaEventSynchronize(stop); 145 | cudaEventElapsedTime(&milliseconds, start, stop); 146 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 147 | #endif 148 | 149 | // Step 3: deconvolve and shuffle 150 | cudaEventRecord(start); 151 | ier = CUINTERP1D(d_plan, blksize); 152 | if(ier != 0 ){ 153 | printf("error: cuinterp1d, method(%d)\n", d_plan->opts.gpu_method); 154 | return ier; 155 | } 156 | #ifdef TIME 157 | cudaEventRecord(stop); 158 | cudaEventSynchronize(stop); 159 | cudaEventElapsedTime(&milliseconds, start, stop); 160 | printf("[time ] \tUnspread (%d)\t\t %.3g s\n", milliseconds/1000, 161 | d_plan->opts.gpu_method); 162 | #endif 163 | } 164 | return ier; 165 | } 166 | 167 | -------------------------------------------------------------------------------- /contrib/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #include "legendre_rule_fast.h" 10 | } 11 | #else 12 | #include "legendre_rule_fast.h" 13 | #endif 14 | 15 | int setup_spreader_for_nufft(SPREAD_OPTS &spopts, FLT eps, cufinufft_opts opts) 16 | // Set up the spreader parameters given eps, and pass across various nufft 17 | // options. Report status of setup_spreader. Barnett 10/30/17 18 | { 19 | int ier=setup_spreader(spopts, eps, opts.upsampfac, opts.gpu_kerevalmeth); 20 | spopts.pirange = 1; // could allow user control? 21 | return ier; 22 | } 23 | 24 | void SET_NF_TYPE12(BIGINT ms, cufinufft_opts opts, SPREAD_OPTS spopts, 25 | BIGINT *nf, BIGINT bs) 26 | // type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts 27 | // and requested number of Fourier modes ms. 
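// Worked example of this recipe (illustrative numbers only): with
// opts.upsampfac = 2.0 and ms = 100 requested modes the first guess is
// nf = 2.0*100 = 200; for a typical spreader width nspread = 7 (about what a
// 1e-6 tolerance produces) the lower bound 2*nspread = 14 does not bind, so
// nf keeps the value 200.  Only for very small ms does the bound kick in and
// push nf up to 2*nspread, so that the kernel still fits on the grid.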
28 | { 29 | *nf = (BIGINT)(opts.upsampfac*ms); 30 | if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; // otherwise spread fails 31 | if (*nf brk(nt+1); // start indices for each thread 105 | for (int t=0; t<=nt; ++t) // split nout mode indices btw threads 106 | brk[t] = (BIGINT)(0.5 + nout*t/(double)nt); 107 | #pragma omp parallel 108 | { 109 | int t = MY_OMP_GET_THREAD_NUM(); 110 | if (t 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include "../cuspreadinterp.h" 10 | #include "../cudeconvolve.h" 11 | #include "../memtransfer.h" 12 | 13 | using namespace std; 14 | 15 | int CUFINUFFT2D1_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 16 | /* 17 | 2D Type-1 NUFFT 18 | 19 | This function is called in "exec" stage (See ../cufinufft.cu). 20 | It includes (copied from doc in finufft library) 21 | Step 1: spread data to oversampled regular mesh using kernel 22 | Step 2: compute FFT on uniform mesh 23 | Step 3: deconvolve by division of each Fourier mode independently by the 24 | Fourier series coefficient of the kernel. 25 | 26 | Melody Shih 07/25/19 27 | */ 28 | { 29 | assert(d_plan->spopts.spread_direction == 1); 30 | cudaEvent_t start, stop; 31 | cudaEventCreate(&start); 32 | cudaEventCreate(&stop); 33 | 34 | cudaEventRecord(start); 35 | int blksize; 36 | int ier; 37 | CUCPX* d_fkstart; 38 | CUCPX* d_cstart; 39 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 40 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 41 | d_plan->maxbatchsize); 42 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 43 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt; 44 | d_plan->c = d_cstart; 45 | d_plan->fk = d_fkstart; 46 | 47 | checkCudaErrors(cudaMemset(d_plan->fw,0,d_plan->maxbatchsize* 48 | d_plan->nf1*d_plan->nf2*sizeof(CUCPX)));// this is needed 49 | #ifdef TIME 50 | float milliseconds = 0; 51 | cudaEventRecord(stop); 52 | cudaEventSynchronize(stop); 53 | cudaEventElapsedTime(&milliseconds, start, stop); 54 | printf("[time ] \tInitialize fw to 0\t %.3g s\n", 55 | milliseconds/1000); 56 | #endif 57 | // Step 1: Spread 58 | cudaEventRecord(start); 59 | ier = CUSPREAD2D(d_plan,blksize); 60 | if(ier != 0 ){ 61 | printf("error: cuspread2d, method(%d)\n", d_plan->opts.gpu_method); 62 | return ier; 63 | } 64 | #ifdef TIME 65 | cudaEventRecord(stop); 66 | cudaEventSynchronize(stop); 67 | cudaEventElapsedTime(&milliseconds, start, stop); 68 | printf("[time ] \tSpread (%d)\t\t %.3g s\n", milliseconds/1000, 69 | d_plan->opts.gpu_method); 70 | #endif 71 | // Step 2: FFT 72 | cudaEventRecord(start); 73 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 74 | #ifdef TIME 75 | cudaEventRecord(stop); 76 | cudaEventSynchronize(stop); 77 | cudaEventElapsedTime(&milliseconds, start, stop); 78 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 79 | #endif 80 | 81 | // Step 3: deconvolve and shuffle 82 | cudaEventRecord(start); 83 | CUDECONVOLVE2D(d_plan,blksize); 84 | #ifdef TIME 85 | cudaEventRecord(stop); 86 | cudaEventSynchronize(stop); 87 | cudaEventElapsedTime(&milliseconds, start, stop); 88 | printf("[time ] \tDeconvolve\t\t %.3g s\n", milliseconds/1000); 89 | #endif 90 | } 91 | return ier; 92 | } 93 | 94 | int CUFINUFFT2D2_EXEC(CUCPX* d_c, CUCPX* d_fk, CUFINUFFT_PLAN d_plan) 95 | /* 96 | 2D Type-2 NUFFT 97 | 98 | This function is called in "exec" stage (See ../cufinufft.cu). 
99 | It includes (copied from doc in finufft library) 100 | Step 1: deconvolve (amplify) each Fourier mode, dividing by kernel 101 | Fourier coeff 102 | Step 2: compute FFT on uniform mesh 103 | Step 3: interpolate data to regular mesh 104 | 105 | Melody Shih 07/25/19 106 | */ 107 | { 108 | assert(d_plan->spopts.spread_direction == 2); 109 | 110 | cudaEvent_t start, stop; 111 | cudaEventCreate(&start); 112 | cudaEventCreate(&stop); 113 | 114 | cudaEventRecord(start); 115 | int blksize; 116 | int ier; 117 | CUCPX* d_fkstart; 118 | CUCPX* d_cstart; 119 | for(int i=0; i*d_plan->maxbatchsize < d_plan->ntransf; i++){ 120 | blksize = min(d_plan->ntransf - i*d_plan->maxbatchsize, 121 | d_plan->maxbatchsize); 122 | d_cstart = d_c + i*d_plan->maxbatchsize*d_plan->M; 123 | d_fkstart = d_fk + i*d_plan->maxbatchsize*d_plan->ms*d_plan->mt; 124 | 125 | d_plan->c = d_cstart; 126 | d_plan->fk = d_fkstart; 127 | 128 | // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw 129 | cudaEventRecord(start); 130 | CUDECONVOLVE2D(d_plan,blksize); 131 | #ifdef TIME 132 | float milliseconds = 0; 133 | cudaEventRecord(stop); 134 | cudaEventSynchronize(stop); 135 | cudaEventElapsedTime(&milliseconds, start, stop); 136 | printf("[time ] \tAmplify & Copy fktofw\t %.3g s\n", milliseconds/1000); 137 | #endif 138 | // Step 2: FFT 139 | cudaDeviceSynchronize(); 140 | cudaEventRecord(start); 141 | CUFFT_EX(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); 142 | #ifdef TIME 143 | cudaEventRecord(stop); 144 | cudaEventSynchronize(stop); 145 | cudaEventElapsedTime(&milliseconds, start, stop); 146 | printf("[time ] \tCUFFT Exec\t\t %.3g s\n", milliseconds/1000); 147 | #endif 148 | 149 | // Step 3: deconvolve and shuffle 150 | cudaEventRecord(start); 151 | ier = CUINTERP2D(d_plan, blksize); 152 | if(ier != 0 ){ 153 | printf("error: cuinterp2d, method(%d)\n", d_plan->opts.gpu_method); 154 | return ier; 155 | } 156 | #ifdef TIME 157 | cudaEventRecord(stop); 158 | cudaEventSynchronize(stop); 159 | cudaEventElapsedTime(&milliseconds, start, stop); 160 | printf("[time ] \tUnspread (%d)\t\t %.3g s\n", milliseconds/1000, 161 | d_plan->opts.gpu_method); 162 | #endif 163 | } 164 | return ier; 165 | } 166 | 167 | -------------------------------------------------------------------------------- /test/spread2d_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../src/cuspreadinterp.h" 8 | #include "../contrib/utils.h" 9 | 10 | using namespace std; 11 | 12 | int main(int argc, char* argv[]) 13 | { 14 | int nf1, nf2; 15 | FLT upsampfac=2.0; 16 | int N1, N2, M; 17 | if (argc<5) { 18 | fprintf(stderr, 19 | "Usage: spread2d_test method nupts_distr nf1 nf2 [maxsubprobsize [M [tol [kerevalmeth]]]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven,\n" 23 | " 2: sub-problem, or\n" 24 | " 3: sub-problem with Paul's idea.\n" 25 | " nupts_distr: The distribution of the points; one of\n" 26 | " 0: uniform, or\n" 27 | " 1: concentrated in a small region.\n" 28 | " nf1, nf2: The size of the 2D array.\n" 29 | " maxsubprobsize: Maximum size of subproblems (default 65536).\n" 30 | " M: The number of non-uniform points (default nf1 * nf2 / 4).\n" 31 | " tol: NUFFT tolerance (default 1e-6).\n" 32 | " kerevalmeth: Kernel evaluation method; one of\n" 33 | " 0: Exponential of square root (default), or\n" 34 | " 1: Horner evaluation.\n"); 35 | return 1; 36 | } 37 | double w; 38 | int method; 39 
| sscanf(argv[1],"%d",&method); 40 | 41 | int nupts_distribute; 42 | sscanf(argv[2],"%d",&nupts_distribute); 43 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 44 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 45 | 46 | int maxsubprobsize=65536; 47 | if(argc>5){ 48 | sscanf(argv[5],"%d",&maxsubprobsize); 49 | } 50 | 51 | N1 = (int) nf1/upsampfac; 52 | N2 = (int) nf2/upsampfac; 53 | M = N1*N2; 54 | if(argc>6){ 55 | sscanf(argv[6],"%lf",&w); M = (int)w; // so can read 1e6 right! 56 | } 57 | 58 | FLT tol=1e-6; 59 | if(argc>7){ 60 | sscanf(argv[7],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 61 | } 62 | 63 | int kerevalmeth=0; 64 | if(argc>8){ 65 | sscanf(argv[8],"%d",&kerevalmeth); 66 | } 67 | 68 | int ier; 69 | int dim=2; 70 | 71 | CUFINUFFT_PLAN dplan = new CUFINUFFT_PLAN_S; 72 | // Zero out your struct, (sets all pointers to NULL, crucial) 73 | memset(dplan, 0, sizeof(*dplan)); 74 | ier = CUFINUFFT_DEFAULT_OPTS(1, dim, &(dplan->opts)); 75 | 76 | dplan->opts.gpu_method = method; 77 | dplan->opts.gpu_maxsubprobsize = maxsubprobsize; 78 | dplan->opts.gpu_kerevalmeth = kerevalmeth; 79 | dplan->opts.gpu_sort = 1; // ahb changed from 0 80 | dplan->opts.gpu_spreadinterponly = 1; 81 | dplan->opts.gpu_binsizex = 32; //binsize needs to be set here, since 82 | //SETUP_BINSIZE() is not called in 83 | //spread, interp only wrappers. 84 | dplan->opts.gpu_binsizey = 32; 85 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 86 | 87 | cout<opts.gpu_method,M,nf1*nf2,t,M/t); 144 | 145 | checkCudaErrors(cudaMemcpy(fw,d_fw,nf1*nf2*sizeof(CUCPX), 146 | cudaMemcpyDeviceToHost)); 147 | #ifdef RESULT 148 | cout<<"[result-input]"<opts.gpu_binsizey == 0) 151 | printf("\n"); 152 | for (int i=0; iopts.gpu_binsizex == 0 && i!=0) 154 | printf(" |"); 155 | printf(" (%2.3g,%2.3g)",fw[i+j*nf1].real(),fw[i+j*nf1].imag() ); 156 | } 157 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, M, N; 16 | if (argc<3) { 17 | fprintf(stderr, 18 | "Usage: cufinufft1d1_test method N1 [M [tol]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem\n" 23 | " N1: The size of the 1D array.\n" 24 | " M: The number of non-uniform points (default N1).\n" 25 | " tol: NUFFT tolerance (default 1e-6).\n"); 26 | return 1; 27 | } 28 | double w; 29 | int method; 30 | sscanf(argv[1],"%d",&method); 31 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 32 | N = N1; 33 | M = N1;// let density always be 1 34 | if(argc>3){ 35 | sscanf(argv[3],"%lf",&w); M = (int)w; // so can read 1e6 right! 36 | } 37 | 38 | FLT tol=1e-6; 39 | if(argc>4){ 40 | sscanf(argv[4],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
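	// Reading into a double with %lf and casting, as above, is what lets
	// these testers accept scientific notation: sscanf("1e6", "%d", &n)
	// stops at the 'e' and leaves n = 1, whereas "%lf" parses the full
	// 1000000.0 and the cast recovers the intended integer.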
41 | } 42 | int iflag=1; 43 | 44 | 45 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | #include "../src/common.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int nf1; 17 | if (argc<2) { 18 | fprintf(stderr, 19 | "Usage: onedim_fseries_kernel_test nf1 [dim [tol [gpuversion [nf2 [nf3]]]]]\n" 20 | "Arguments:\n" 21 | " nf1: The size of the upsampled fine grid size in x.\n" 22 | " dim: Dimension of the nuFFT.\n" 23 | " tol: NUFFT tolerance (default 1e-6).\n" 24 | " gpuversion: Use gpu version or not (default True).\n" 25 | " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" 26 | " nf3: The size of the upsampled fine grid size in z. (default nf3)\n" 27 | ); 28 | return 1; 29 | } 30 | double w; 31 | sscanf(argv[1],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 32 | int dim = 1; 33 | if (argc > 2) 34 | sscanf(argv[2],"%d",&dim); 35 | FLT eps = 1e-6; 36 | if (argc > 3) 37 | sscanf(argv[3],"%lf",&w); eps = (FLT)w; 38 | int gpu = 1; 39 | if (argc > 4) 40 | sscanf(argv[4],"%d",&gpu); 41 | 42 | int nf2=nf1; 43 | if (argc > 5) 44 | sscanf(argv[5],"%lf",&w); nf2 = (int)w; 45 | int nf3=nf1; 46 | if (argc > 6) 47 | sscanf(argv[6],"%lf",&w); nf3 = (int)w; 48 | 49 | SPREAD_OPTS opts; 50 | FLT *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; 51 | FLT *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; 52 | checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(FLT)*(nf1/2+1))); 53 | if(dim > 1) 54 | checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(FLT)*(nf2/2+1))); 55 | if(dim > 2) 56 | checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(FLT)*(nf3/2+1))); 57 | 58 | int ier = setup_spreader(opts, eps, 2.0, 0); 59 | 60 | cudaEvent_t start, stop; 61 | cudaEventCreate(&start); 62 | cudaEventCreate(&stop); 63 | 64 | float milliseconds = 0; 65 | float gputime = 0; 66 | float cputime = 0; 67 | 68 | CNTime timer; 69 | if( !gpu ) { 70 | timer.start(); 71 | fwkerhalf1 = (FLT*)malloc(sizeof(FLT)*(nf1/2+1)); 72 | if(dim > 1) 73 | fwkerhalf2 = (FLT*)malloc(sizeof(FLT)*(nf2/2+1)); 74 | if(dim > 2) 75 | fwkerhalf3 = (FLT*)malloc(sizeof(FLT)*(nf3/2+1)); 76 | 77 | onedim_fseries_kernel(nf1, fwkerhalf1, opts); 78 | if(dim > 1) 79 | onedim_fseries_kernel(nf2, fwkerhalf2, opts); 80 | if(dim > 2) 81 | onedim_fseries_kernel(nf3, fwkerhalf3, opts); 82 | cputime = timer.elapsedsec(); 83 | cudaEventRecord(start); 84 | { 85 | checkCudaErrors(cudaMemcpy(d_fwkerhalf1,fwkerhalf1, 86 | sizeof(FLT)*(nf1/2+1),cudaMemcpyHostToDevice)); 87 | if(dim > 1) 88 | checkCudaErrors(cudaMemcpy(d_fwkerhalf2,fwkerhalf2, 89 | sizeof(FLT)*(nf2/2+1),cudaMemcpyHostToDevice)); 90 | if(dim > 2) 91 | checkCudaErrors(cudaMemcpy(d_fwkerhalf3,fwkerhalf3, 92 | sizeof(FLT)*(nf3/2+1),cudaMemcpyHostToDevice)); 93 | } 94 | cudaEventRecord(stop); 95 | cudaEventSynchronize(stop); 96 | cudaEventElapsedTime(&milliseconds, start, stop); 97 | gputime = milliseconds; 98 | printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", 99 | dim, nf1, opts.nspread, gputime+cputime*1000); 100 | free(fwkerhalf1); 101 | if(dim > 1) 102 | free(fwkerhalf2); 103 | if(dim > 2) 104 | free(fwkerhalf3); 105 | } else { 106 | timer.start(); 107 | complex a[dim*MAX_NQUAD]; 108 | FLT f[dim*MAX_NQUAD]; 109 | onedim_fseries_kernel_precomp(nf1, f, a, opts); 110 | if(dim > 1) 111 | onedim_fseries_kernel_precomp(nf2, f+MAX_NQUAD, a+MAX_NQUAD, opts); 112 | if(dim > 2) 113 | onedim_fseries_kernel_precomp(nf3, f+2*MAX_NQUAD, a+2*MAX_NQUAD, opts); 114 | cputime = timer.elapsedsec(); 115 | 116 | 
cuDoubleComplex *d_a; 117 | FLT *d_f; 118 | cudaEventRecord(start); 119 | { 120 | checkCudaErrors(cudaMalloc(&d_a, dim*MAX_NQUAD*sizeof(cuDoubleComplex))); 121 | checkCudaErrors(cudaMalloc(&d_f, dim*MAX_NQUAD*sizeof(FLT))); 122 | checkCudaErrors(cudaMemcpy(d_a,a, 123 | dim*MAX_NQUAD*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); 124 | checkCudaErrors(cudaMemcpy(d_f,f, 125 | dim*MAX_NQUAD*sizeof(FLT),cudaMemcpyHostToDevice)); 126 | ier = CUFSERIESKERNELCOMPUTE(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, 127 | d_fwkerhalf2, d_fwkerhalf3, opts.nspread); 128 | } 129 | cudaEventRecord(stop); 130 | cudaEventSynchronize(stop); 131 | cudaEventElapsedTime(&milliseconds, start, stop); 132 | gputime = milliseconds; 133 | printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", 134 | dim, nf1, opts.nspread, gputime+cputime*1000); 135 | cudaFree(d_a); 136 | cudaFree(d_f); 137 | } 138 | 139 | #ifdef RESULT 140 | fwkerhalf1 = (FLT*)malloc(sizeof(FLT)*(nf1/2+1)); 141 | if(dim > 1) 142 | fwkerhalf2 = (FLT*)malloc(sizeof(FLT)*(nf2/2+1)); 143 | if(dim > 2) 144 | fwkerhalf3 = (FLT*)malloc(sizeof(FLT)*(nf3/2+1)); 145 | 146 | checkCudaErrors(cudaMemcpy(fwkerhalf1,d_fwkerhalf1,sizeof(FLT)*(nf1/2+1),cudaMemcpyDeviceToHost)); 147 | if(dim > 1) 148 | checkCudaErrors(cudaMemcpy(fwkerhalf2,d_fwkerhalf2,sizeof(FLT)*(nf2/2+1),cudaMemcpyDeviceToHost)); 149 | if(dim > 2) 150 | checkCudaErrors(cudaMemcpy(fwkerhalf3,d_fwkerhalf3,sizeof(FLT)*(nf3/2+1),cudaMemcpyDeviceToHost)); 151 | for(int i=0; i 1) 155 | for(int i=0; i 2) 159 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | #include "../contrib/utils.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1, M; 17 | if (argc<3) { 18 | fprintf(stderr, 19 | "Usage: cufinufft2d2_test method N1 [M [tol]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven\n" 23 | " N1: The size of the 1D array.\n" 24 | " M: The number of non-uniform points (default N1).\n" 25 | " tol: NUFFT tolerance (default 1e-6).\n"); 26 | return 1; 27 | } 28 | double w; 29 | int method; 30 | sscanf(argv[1],"%d",&method); 31 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 32 | M = N1;// let density always be 1 33 | if(argc>3){ 34 | sscanf(argv[3],"%lf",&w); M = (int)w; // so can read 1e6 right! 35 | } 36 | 37 | FLT tol=1e-6; 38 | if(argc>4){ 39 | sscanf(argv[4],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
40 | } 41 | int iflag=1; 42 | 43 | 44 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1, nf2, nf3; 14 | FLT sigma = 2.0; 15 | int N1, N2, N3, M; 16 | if (argc<5) { 17 | fprintf(stderr, 18 | "Usage: interp3d method nupts_distr nf1 nf2 nf3 [M [tol [sort]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven, or\n" 22 | " 2: sub-problem.\n" 23 | " nupts_distr: The distribution of the points; one of\n" 24 | " 0: uniform, or\n" 25 | " 1: concentrated in a small region.\n" 26 | " nf1, nf2, nf3: The size of the 3D array.\n" 27 | " M: The number of non-uniform points (default nf1 * nf2 * nf3 / 8).\n" 28 | " tol: NUFFT tolerance (default 1e-6).\n" 29 | " sort: One of\n" 30 | " 0: do not sort the points, or\n" 31 | " 1: sort the points (default).\n"); 32 | return 1; 33 | } 34 | double w; 35 | int method; 36 | sscanf(argv[1],"%d",&method); 37 | int nupts_distribute; 38 | sscanf(argv[2],"%d",&nupts_distribute); 39 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 40 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 41 | sscanf(argv[5],"%lf",&w); nf3 = (int)w; // so can read 1e6 right! 42 | 43 | N1 = (int) nf1/sigma; 44 | N2 = (int) nf2/sigma; 45 | N3 = (int) nf3/sigma; 46 | M = N1*N2*N3;// let density always be 1 47 | if(argc>6){ 48 | sscanf(argv[6],"%lf",&w); M = (int)w; // so can read 1e6 right! 49 | if(M == 0) M=N1*N2*N3; 50 | } 51 | 52 | FLT tol=1e-6; 53 | if(argc>7){ 54 | sscanf(argv[7],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 55 | } 56 | 57 | int sort=1; 58 | if(argc>8){ 59 | sscanf(argv[8],"%d",&sort); 60 | } 61 | int ier; 62 | 63 | int ns=std::ceil(-log10(tol/10.0)); 64 | 65 | cout<opts)); 90 | dplan->opts.gpu_method = method; 91 | dplan->opts.gpu_maxsubprobsize = 1024; 92 | dplan->opts.gpu_kerevalmeth = 0; // not in cmd-line args 93 | dplan->opts.gpu_sort = sort; 94 | dplan->opts.gpu_spreadinterponly = 1; 95 | 96 | //binsize needs to be set here, since SETUP_BINSIZE() is not called in spread, 97 | //interp only wrappers. 
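	//As a rough picture (an inference from these settings, not taken from
	//SETUP_BINSIZE itself): the fine grid is tiled by bins of the sizes set
	//below, so a 128 x 128 x 64 grid with the 16 x 16 x 2 bins used here
	//splits into ceil(128/16)*ceil(128/16)*ceil(64/2) = 8*8*32 = 2048 bins
	//for the point-sorting / sub-problem bookkeeping.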
98 | if(dplan->opts.gpu_method == 1) 99 | { 100 | dplan->opts.gpu_binsizex=16; 101 | dplan->opts.gpu_binsizey=16; 102 | dplan->opts.gpu_binsizez=2; 103 | } 104 | if(dplan->opts.gpu_method == 2) 105 | { 106 | dplan->opts.gpu_binsizex=16; 107 | dplan->opts.gpu_binsizey=16; 108 | dplan->opts.gpu_binsizez=2; 109 | } 110 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 111 | 112 | switch(nupts_distribute){ 113 | // Making data 114 | case 0: //uniform 115 | { 116 | for (int i = 0; i < M; i++) { 117 | x[i] = M_PI*randm11();// x in [-pi,pi) 118 | y[i] = M_PI*randm11(); 119 | z[i] = M_PI*randm11(); 120 | //cout << x[i] << "," << y[i] << "," << z[i] << endl; 121 | } 122 | } 123 | break; 124 | case 1: // concentrate on a small region 125 | { 126 | for (int i = 0; i < M; i++) { 127 | x[i] = M_PI*rand01()/(nf1*2/32);// x in [-pi,pi) 128 | y[i] = M_PI*rand01()/(nf2*2/32); 129 | z[i] = M_PI*rand01()/(nf3*2/32); 130 | } 131 | } 132 | break; 133 | default: 134 | cerr<<"error: nupts distr should be 0,1" << endl; 135 | return 1; 136 | } 137 | for(int i=0; iopts.gpu_method,nf1*nf2*nf3,M,t,M/t); 157 | checkCudaErrors(cudaMemcpy(c,d_c,M*sizeof(CUCPX),cudaMemcpyDeviceToHost)); 158 | #ifdef RESULT 159 | cout<<"[result-input]"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, N2, M, N; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: cufinufft2d1_test method N1 N2 [M [tol]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 3: sub-problem with Paul's idea.\n" 24 | " N1, N2: The size of the 2D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | N = N1*N2; 35 | M = N1*N2;// let density always be 1 36 | if(argc>4){ 37 | sscanf(argv[4],"%lf",&w); M = (int)w; // so can read 1e6 right! 38 | } 39 | 40 | FLT tol=1e-6; 41 | if(argc>5){ 42 | sscanf(argv[5],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 43 | } 44 | int iflag=1; 45 | 46 | 47 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | #include "../contrib/utils.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1, N2, M; 17 | if (argc<4) { 18 | fprintf(stderr, 19 | "Usage: cufinufft2d2_test method N1 N2 [M [tol]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven, or\n" 23 | " 2: sub-problem.\n" 24 | " N1, N2: The size of the 2D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | M = N1*N2;// let density always be 1 35 | if(argc>4){ 36 | sscanf(argv[4],"%lf",&w); M = (int)w; // so can read 1e6 right! 37 | } 38 | 39 | FLT tol=1e-6; 40 | if(argc>5){ 41 | sscanf(argv[5],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
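	// The tolerance set here is what setup_spreader_for_nufft eventually
	// turns into a kernel width, roughly ns = ceil(-log10(tol/10)) as used
	// in test/interp3d_test.cu; so tol = 1e-6 gives a width-7 kernel and
	// tol = 1e-3 a width-4 one, trading accuracy against spreading cost.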
42 | } 43 | int iflag=1; 44 | 45 | 46 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, N2, N3, M, N; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: cufinufft3d1_test method N1 N2 N3 [M [tol]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 4: block gather.\n" 24 | " N1, N2, N3: The size of the 3D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2 * N3).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | sscanf(argv[4],"%lf",&w); N3 = (int)w; // so can read 1e6 right! 35 | 36 | M = N1*N2*N3;// let density always be 1 37 | if(argc>5){ 38 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 39 | } 40 | 41 | FLT tol=1e-6; 42 | if(argc>6){ 43 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 44 | } 45 | int iflag=1; 46 | 47 | 48 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include "../contrib/utils.h" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | int N1, N2, M, N, ntransf, maxbatchsize; 16 | if (argc<4) { 17 | fprintf(stderr, 18 | "Usage: cufinufft2d1many_test method N1 N2 [ntransf [maxbatchsize [M [tol]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 3: sub-problem with Paul's idea.\n" 24 | " N1, N2: The size of the 2D array.\n" 25 | " ntransf: Number of inputs (default 2 ^ 27 / (N1 * N2)).\n" 26 | " maxbatchsize: Number of simultaneous transforms (or 0 for default).\n" 27 | " M: The number of non-uniform points (default N1 * N2).\n" 28 | " tol: NUFFT tolerance (default 1e-6).\n"); 29 | return 1; 30 | } 31 | double w; 32 | int method; 33 | sscanf(argv[1],"%d",&method); 34 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 35 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 36 | N = N1*N2; 37 | M = N1*N2*2;// let density always be 2 38 | ntransf = pow(2,28)/M; 39 | if(argc>4){ 40 | sscanf(argv[4],"%d",&ntransf); 41 | } 42 | maxbatchsize = 0; // default (cufinufft chooses) 43 | if(argc>5){ 44 | sscanf(argv[5],"%d",&maxbatchsize); 45 | } 46 | 47 | if(argc>6){ 48 | sscanf(argv[6],"%lf",&w); M = (int)w; // so can read 1e6 right! 49 | } 50 | 51 | FLT tol=1e-6; 52 | if(argc>7){ 53 | sscanf(argv[7],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 
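	// Sanity check on the defaults above: with density 2 the code sets
	// M = 2*N1*N2 and ntransf = 2^28/M, which matches the 2^27/(N1*N2)
	// quoted in the usage string; e.g. N1 = N2 = 256 gives M = 131072
	// nonuniform points and ntransf = 2048 stacked transforms.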
54 | } 55 | int iflag=1; 56 | 57 | 58 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/cuspreadinterp.h" 7 | #include "../contrib/utils.h" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | int nf1, nf2, nf3; 14 | FLT sigma = 2.0; 15 | int N1, N2, N3, M; 16 | if (argc<6) { 17 | fprintf(stderr, 18 | "Usage: spread3d_test method nupts_distr nf1 nf2 nf3 [maxsubprobsize [M [tol [kerevalmeth [sort]]]]]\n" 19 | "Arguments:\n" 20 | " method: One of\n" 21 | " 1: nupts driven,\n" 22 | " 2: sub-problem, or\n" 23 | " 4: block gather (each nf must be multiple of 8).\n" 24 | " nupts_distr: The distribution of the points; one of\n" 25 | " 0: uniform, or\n" 26 | " 1: concentrated in a small region.\n" 27 | " nf1, nf2, nf3: The size of the 3D array.\n" 28 | " maxsubprobsize: Maximum size of subproblems (default 65536).\n" 29 | " M: The number of non-uniform points (default nf1 * nf2 * nf3 / 8).\n" 30 | " tol: NUFFT tolerance (default 1e-6).\n" 31 | " kerevalmeth: Kernel evaluation method; one of\n" 32 | " 0: Exponential of square root (default), or\n" 33 | " 1: Horner evaluation.\n" 34 | " sort: One of\n" 35 | " 0: do not sort the points, or\n" 36 | " 1: sort the points (default).\n"); 37 | return 1; 38 | } 39 | double w; 40 | int method; 41 | sscanf(argv[1],"%d",&method); 42 | int nupts_distribute; 43 | sscanf(argv[2],"%d",&nupts_distribute); 44 | sscanf(argv[3],"%lf",&w); nf1 = (int)w; // so can read 1e6 right! 45 | sscanf(argv[4],"%lf",&w); nf2 = (int)w; // so can read 1e6 right! 46 | sscanf(argv[5],"%lf",&w); nf3 = (int)w; // so can read 1e6 right! 47 | 48 | int maxsubprobsize=1024; 49 | if(argc>6){ 50 | sscanf(argv[6],"%d",&maxsubprobsize); 51 | } 52 | N1 = (int) nf1/sigma; 53 | N2 = (int) nf2/sigma; 54 | N3 = (int) nf3/sigma; 55 | M = N1*N2*N3;// let density always be 1 56 | if(argc>7){ 57 | sscanf(argv[7],"%lf",&w); M = (int)w; // so can read 1e6 right! 58 | //if(M == 0) M=N1*N2; 59 | } 60 | 61 | FLT tol=1e-6; 62 | if(argc>8){ 63 | sscanf(argv[8],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 64 | } 65 | 66 | int kerevalmeth=0; 67 | if(argc>9){ 68 | sscanf(argv[9],"%d",&kerevalmeth); 69 | } 70 | 71 | int sort=1; 72 | if(argc>10){ 73 | sscanf(argv[10],"%d",&sort); 74 | } 75 | 76 | int ier; 77 | FLT *x, *y, *z; 78 | CPX *c, *fw; 79 | cudaMallocHost(&x, M*sizeof(FLT)); 80 | cudaMallocHost(&y, M*sizeof(FLT)); 81 | cudaMallocHost(&z, M*sizeof(FLT)); 82 | cudaMallocHost(&c, M*sizeof(CPX)); 83 | cudaMallocHost(&fw,nf1*nf2*nf3*sizeof(CPX)); 84 | 85 | FLT *d_x, *d_y, *d_z; 86 | CUCPX *d_c, *d_fw; 87 | checkCudaErrors(cudaMalloc(&d_x,M*sizeof(FLT))); 88 | checkCudaErrors(cudaMalloc(&d_y,M*sizeof(FLT))); 89 | checkCudaErrors(cudaMalloc(&d_z,M*sizeof(FLT))); 90 | checkCudaErrors(cudaMalloc(&d_c,M*sizeof(CUCPX))); 91 | checkCudaErrors(cudaMalloc(&d_fw,nf1*nf2*nf3*sizeof(CUCPX))); 92 | 93 | int dim=3; 94 | CUFINUFFT_PLAN dplan = new CUFINUFFT_PLAN_S; 95 | // Zero out your struct, (sets all pointers to NULL, crucial) 96 | memset(dplan, 0, sizeof(*dplan)); 97 | ier = CUFINUFFT_DEFAULT_OPTS(1, dim, &(dplan->opts)); 98 | 99 | dplan->opts.gpu_method =method; 100 | dplan->opts.gpu_maxsubprobsize =maxsubprobsize; 101 | dplan->opts.gpu_kerevalmeth =kerevalmeth; 102 | dplan->opts.gpu_sort =sort; 103 | dplan->opts.gpu_spreadinterponly=1; 104 | ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); 105 | 106 | //binsize, obinsize need to be set here, since SETUP_BINSIZE() is not 107 | //called in spread, interp only wrappers. 
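	//A reading of the method-4 (block gather) parameters below, inferred
	//from this setup rather than from the spreader source: the grid is first
	//cut into outer bins of gpu_obinsize (8x8x8), each subdivided into
	//gpu_binsize (4x4x4) cells, giving 2x2x2 = 8 inner bins per outer bin;
	//this is also why the usage text asks for every nf to be a multiple of 8.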
108 | if(dplan->opts.gpu_method == 4) 109 | { 110 | dplan->opts.gpu_binsizex=4; 111 | dplan->opts.gpu_binsizey=4; 112 | dplan->opts.gpu_binsizez=4; 113 | dplan->opts.gpu_obinsizex=8; 114 | dplan->opts.gpu_obinsizey=8; 115 | dplan->opts.gpu_obinsizez=8; 116 | dplan->opts.gpu_maxsubprobsize=maxsubprobsize; 117 | } 118 | if(dplan->opts.gpu_method == 2) 119 | { 120 | dplan->opts.gpu_binsizex=16; 121 | dplan->opts.gpu_binsizey=16; 122 | dplan->opts.gpu_binsizez=2; 123 | dplan->opts.gpu_maxsubprobsize=maxsubprobsize; 124 | } 125 | if(dplan->opts.gpu_method == 1) 126 | { 127 | dplan->opts.gpu_binsizex=16; 128 | dplan->opts.gpu_binsizey=16; 129 | dplan->opts.gpu_binsizez=2; 130 | } 131 | 132 | cout<opts.gpu_method,M,nf1*nf2*nf3,t,M/t); 187 | #ifdef RESULT 188 | cout<<"[result-input]"<opts.gpu_binsizex == 0 && i!=0) 193 | printf(" |"); 194 | printf(" (%2.3g,%2.3g)",fw[i+j*nf1+k*nf2*nf1].real(), 195 | fw[i+j*nf1+k*nf2*nf1].imag() ); 196 | } 197 | cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "../contrib/utils.h" 11 | 12 | using namespace std; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int N1, N2, N3, M; 17 | if (argc<4) { 18 | fprintf(stderr, 19 | "Usage: cufinufft3d2_test method N1 N2 N3 [M [tol]]\n" 20 | "Arguments:\n" 21 | " method: One of\n" 22 | " 1: nupts driven, or\n" 23 | " 2: sub-problem.\n" 24 | " N1, N2, N3: The size of the 3D array.\n" 25 | " M: The number of non-uniform points (default N1 * N2 * N3).\n" 26 | " tol: NUFFT tolerance (default 1e-6).\n"); 27 | return 1; 28 | } 29 | double w; 30 | int method; 31 | sscanf(argv[1],"%d",&method); 32 | sscanf(argv[2],"%lf",&w); N1 = (int)w; // so can read 1e6 right! 33 | sscanf(argv[3],"%lf",&w); N2 = (int)w; // so can read 1e6 right! 34 | sscanf(argv[4],"%lf",&w); N3 = (int)w; // so can read 1e6 right! 35 | M = N1*N2*N3;// let density always be 1 36 | if(argc>5){ 37 | sscanf(argv[5],"%lf",&w); M = (int)w; // so can read 1e6 right! 38 | } 39 | 40 | FLT tol=1e-6; 41 | if(argc>6){ 42 | sscanf(argv[6],"%lf",&w); tol = (FLT)w; // so can read 1e6 right! 43 | } 44 | int iflag=1; 45 | 46 | 47 | cout<
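The 3D type-2 tester above drives the plan/exec interface; the quantity it
approximates at each nonuniform point is the defining sum
c[j] = sum_{k1,k2,k3} fk[k1,k2,k3] * exp(i*iflag*(k1*x[j] + k2*y[j] + k3*z[j])).
A minimal, self-contained reference evaluation of that sum, handy for
spot-checking a few outputs, is sketched below in plain C++ (std::complex
rather than the repo's FLT/CPX macros; the mode ordering and any scaling
convention should be confirmed against the library documentation before using
it as a comparison):

// Naive O(N1*N2*N3*M) evaluation of the 3D type-2 sum, tiny sizes only.
#include <cmath>
#include <complex>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const int N1 = 4, N2 = 4, N3 = 4, M = 10, iflag = 1;
    std::mt19937 rng(0);
    std::uniform_real_distribution<double> uni(-M_PI, M_PI);

    std::vector<double> x(M), y(M), z(M);           // points in [-pi, pi)
    for (int j = 0; j < M; ++j) { x[j] = uni(rng); y[j] = uni(rng); z[j] = uni(rng); }

    // Fourier coefficients, k1 fastest, frequencies -N/2 .. N/2-1 per axis.
    std::vector<std::complex<double>> fk(N1 * N2 * N3);
    for (auto &f : fk) f = {uni(rng) / M_PI, uni(rng) / M_PI};

    std::vector<std::complex<double>> c(M);
    for (int j = 0; j < M; ++j) {
        std::complex<double> acc(0.0, 0.0);
        for (int k3 = -N3/2; k3 < N3/2; ++k3)
            for (int k2 = -N2/2; k2 < N2/2; ++k2)
                for (int k1 = -N1/2; k1 < N1/2; ++k1) {
                    double phase = iflag * (k1*x[j] + k2*y[j] + k3*z[j]);
                    int idx = (k1 + N1/2) + N1*((k2 + N2/2) + N2*(k3 + N3/2));
                    acc += fk[idx] * std::exp(std::complex<double>(0.0, phase));
                }
        c[j] = acc;
    }
    printf("c[0] = (%.6g, %.6g)\n", c[0].real(), c[0].imag());
    return 0;
}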