├── docs ├── benchmark │ ├── sample.jpg │ ├── result_10.jpg │ ├── result_100.jpg │ ├── result_1000.jpg │ ├── baseline │ │ ├── iter10_mid.jpg │ │ ├── iter1000_mid.jpg │ │ └── iter100_mid.jpg │ ├── basic_rl_lineprofile.png │ └── accelerated │ │ ├── iter1000_mid.jpg │ │ ├── iter100_mid.jpg │ │ └── iter10_mid.jpg ├── style.css └── index.html ├── util ├── CMakeLists.txt └── demo.cpp ├── .gitignore ├── src ├── DumpData.cuh ├── Helper.cuh ├── DumpData.cu ├── DeconvRLImpl.cuh ├── Core.cu ├── DeconvLRDriver.cpp ├── PSF.cu └── HelperMath.cuh ├── include ├── DeconvRLDriver.hpp └── ImageStack.hpp ├── CMakeLists.txt ├── README.md └── LICENSE /docs/benchmark/sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/sample.jpg -------------------------------------------------------------------------------- /docs/benchmark/result_10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/result_10.jpg -------------------------------------------------------------------------------- /docs/benchmark/result_100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/result_100.jpg -------------------------------------------------------------------------------- /docs/benchmark/result_1000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/result_1000.jpg -------------------------------------------------------------------------------- /docs/benchmark/baseline/iter10_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/baseline/iter10_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/baseline/iter1000_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/baseline/iter1000_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/baseline/iter100_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/baseline/iter100_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/basic_rl_lineprofile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/basic_rl_lineprofile.png -------------------------------------------------------------------------------- /docs/benchmark/accelerated/iter1000_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/accelerated/iter1000_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/accelerated/iter100_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/accelerated/iter100_mid.jpg 
-------------------------------------------------------------------------------- /docs/benchmark/accelerated/iter10_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/accelerated/iter10_mid.jpg -------------------------------------------------------------------------------- /docs/style.css: -------------------------------------------------------------------------------- 1 | 2 | @media (min-width: 1200px) { 3 | .container { 4 | max-width: 60%; 5 | } 6 | } 7 | 8 | header { 9 | padding: 80px; 10 | /* padding-top: 20px; */ 11 | /* box-shadow: 5px 5px 5px 5px gray; */ 12 | text-align: center; 13 | } 14 | 15 | .container { 16 | padding: 20px; 17 | } 18 | 19 | 20 | .button-wrapper { 21 | padding-top: 50px; 22 | } 23 | 24 | 25 | .button-wrapper a { 26 | font-size: 18px; 27 | } 28 | -------------------------------------------------------------------------------- /util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file (GLOB SOURCES "${PROJECT_SOURCE_DIR}/util/*.cpp") 2 | 3 | # configure compiler 4 | add_compile_options (-Wall) 5 | add_compile_options (-std=c++11) 6 | include_directories ("${PROJECT_SOURCE_DIR}/include" "/opt/X11/include") 7 | 8 | # generate the executable 9 | add_executable (deconvlr_util ${SOURCES}) 10 | target_link_libraries (deconvlr_util LINK_PUBLIC deconvlr) 11 | 12 | # drop the suffix 13 | set_target_properties(deconvlr_util PROPERTIES 14 | OUTPUT_NAME "deconvrl" 15 | ) 16 | 17 | # add custom target to run the binary 18 | add_custom_target (run 19 | COMMAND deconvlr_util 20 | DEPENDS deconvlr_util 21 | WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/bin" 22 | ) 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | bin/ 31 | 32 | # CMake 33 | CMakeCache.txt 34 | CMakeFiles 35 | CMakeScripts 36 | Testing 37 | Makefile 38 | cmake_install.cmake 39 | install_manifest.txt 40 | compile_commands.json 41 | CTestTestfile.cmake 42 | build/ 43 | 44 | # Atom Linter 45 | .gcc-flags.json 46 | 47 | # macOS cache 48 | .DS_Store 49 | 50 | # Data set 51 | data/ 52 | -------------------------------------------------------------------------------- /src/DumpData.cuh: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_DUMP_CUH 2 | #define HELPER_DUMP_CUH 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | // 3rd party libraries headers 7 | #include 8 | // standard libraries headers 9 | #include 10 | // system headers 11 | 12 | namespace DumpData { 13 | 14 | namespace Device { 15 | 16 | void real( 17 | std::string fname, 18 | const cufftReal *d_idata, 19 | const size_t nx, const size_t ny, const size_t nz 20 | ); 21 | 22 | void complex( 23 | std::string fname, 24 | const cufftComplex *d_idata, 25 | const size_t nx, const size_t ny, const size_t nz 26 | ); 27 | 28 | } 29 | 30 | namespace Host { 31 | 32 | void real( 33 | std::string fname, 34 | cufftReal *h_idata, 35 | const 
size_t nx, const size_t ny, const size_t nz 36 | ); 37 | 38 | } 39 | 40 | } 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /include/DeconvRLDriver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DECONV_RL_DRIVER_HPP 2 | #define DECONV_RL_DRIVER_HPP 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | #include "ImageStack.hpp" 7 | // 3rd party libraries headers 8 | // standard libraries headers 9 | #include 10 | // system headers 11 | 12 | namespace DeconvRL { 13 | 14 | class DeconvRL { 15 | public: 16 | DeconvRL(); 17 | ~DeconvRL(); 18 | 19 | void setResolution( 20 | const float dx, const float dy, const float dz, 21 | const float dpx = 1.0f, const float dpy = 1.0f, const float dpz = 1.0f 22 | ); 23 | void setVolumeSize(const size_t nx, const size_t ny, const size_t nz); 24 | void setPSF(const ImageStack &psf); 25 | 26 | // allocate host and device resources 27 | void initialize(); 28 | // set iterations 29 | void setIterations(const int iterations); 30 | // start the RL core routines 31 | void process( 32 | ImageStack &output, 33 | const ImageStack &input 34 | ); 35 | 36 | private: 37 | struct Impl; 38 | std::unique_ptr pimpl; 39 | }; 40 | 41 | } 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /util/demo.cpp: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | // necessary project headers 3 | #include "ImageStack.hpp" 4 | #include "DeconvRLDriver.hpp" 5 | // 3rd party libraries headers 6 | // standard libraries headers 7 | #include 8 | #include 9 | // system headers 10 | 11 | int main(void) 12 | { 13 | TIFFSetWarningHandler(NULL); 14 | 15 | std::string origImgFile = "data/bigradient/sample.tif"; 16 | std::string psfFile = "data/bigradient/psf.tif"; 17 | 18 | // scan the folder 19 | // search and load the otf 20 | ImageStack psf(psfFile); 21 | // init the deconvlr 22 | DeconvRL::DeconvRL deconvWorker; 23 | deconvWorker.setResolution(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); 24 | // iterate through the images 25 | // open the image 26 | const ImageStack input(origImgFile); 27 | ImageStack output(input, 0); 28 | // use the first image to init the resources 29 | deconvWorker.setVolumeSize(input.nx(), input.ny(), input.nz()); 30 | deconvWorker.setPSF(psf); 31 | deconvWorker.initialize(); 32 | deconvWorker.setIterations(10); 33 | // run the deconv 34 | deconvWorker.process(output, input); 35 | // save the image 36 | output.saveAs("result.tif"); 37 | // save the log 38 | // release the resources 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.6) 2 | 3 | project (deconvlr) 4 | set (CMAKE_BUILD_TYPE Debug) 5 | 6 | # set output directories 7 | set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/lib") 8 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bin") 9 | 10 | # find source files 11 | file (GLOB SOURCES "${PROJECT_SOURCE_DIR}/src/*") 12 | 13 | # boost 14 | find_package (Boost 1.59.0 COMPONENTS filesystem iostreams REQUIRED) 15 | if (NOT Boost_FOUND) 16 | message (FATAL_ERROR "Fatal error: Boost (version >= 1.59) required.") 17 | else () 18 | message (STATUS "Setting up Boost") 19 | message (STATUS " Includes - ${Boost_INCLUDE_DIRS}") 20 | 
message (STATUS " Library - ${Boost_LIBRARY_DIRS}") 21 | endif (NOT Boost_FOUND) 22 | # configure boost 23 | set (Boost_USE_STATIC_LIBS OFF) 24 | include_directories (${Boost_INCLUDE_DIRS}) 25 | link_directories (${Boost_LIBRARY_DIRS}) 26 | 27 | # cuda 28 | find_package (CUDA REQUIRED 7.5) 29 | if (NOT CUDA_FOUND) 30 | message (FATAL_ERROR "Fatal error: CUDA SDK (version >= 7.5) required.") 31 | endif (NOT CUDA_FOUND) 32 | # configure cuda 33 | # compile for baseline of CC 2.0, and archtecture 2.0/3.0 34 | list (APPEND CUDA_NVCC_FLAGS "-arch=compute_20; -code=sm_20,sm_30,sm_52; -std=c++11") 35 | cuda_include_directories ("${PROJECT_SOURCE_DIR}/include") 36 | 37 | # x11 (debug) 38 | find_package(X11 REQUIRED) 39 | include_directories(${X11_INCLUDE_DIR}) 40 | if (NOT CUDA_FOUND) 41 | message (FATAL_ERROR "Fatal error: X11 library required.") 42 | endif (NOT CUDA_FOUND) 43 | 44 | # configure compiler 45 | add_compile_options (-Wall) 46 | add_compile_options (-std=c++11) 47 | include_directories ("${PROJECT_SOURCE_DIR}/include") 48 | 49 | # generate the shared library 50 | cuda_add_library (deconvlr SHARED ${SOURCES}) 51 | target_link_libraries (deconvlr ${Boost_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} tiff ${X11_LIBRARIES}) 52 | 53 | # build demo program 54 | add_subdirectory ("util") 55 | -------------------------------------------------------------------------------- /include/ImageStack.hpp: -------------------------------------------------------------------------------- 1 | #ifndef IMAGE_STACK_HPP 2 | #define IMAGE_STACK_HPP 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | // 3rd party libraries headers 7 | #define BOOST_FILESYSTEM_NO_DEPRECATED 8 | #define BOOST_FILESYSTEM_VERSION 3 9 | #include 10 | 11 | #define cimg_use_tiff 12 | #include "CImg.h" 13 | using namespace cimg_library; 14 | // standard libraries headers 15 | #include 16 | #include 17 | #include 18 | // system headers 19 | 20 | namespace fs = boost::filesystem; 21 | 22 | template 23 | class ImageStack { 24 | public: 25 | ImageStack(const fs::path path_) 26 | : path(path_) { 27 | try { 28 | image.assign(path.c_str()); 29 | } catch(CImgIOException &err) { 30 | throw std::runtime_error("unable to open image"); 31 | } 32 | } 33 | 34 | // type conversion 35 | template 36 | ImageStack(const ImageStack &tpl) { 37 | image.assign(tpl.object()); 38 | } 39 | 40 | // init image of the same dimension with the default value 41 | template 42 | ImageStack(const ImageStack &tpl, const T value) { 43 | image.assign(tpl.object(), "xyzc", value); 44 | } 45 | 46 | void debug() { 47 | std::cout << "file: " << path << std::endl; 48 | image.display(); 49 | } 50 | 51 | T * data() const { 52 | return image._data; 53 | } 54 | 55 | const CImg & object() const { 56 | return image; 57 | } 58 | 59 | void save() { 60 | saveAs(path); 61 | } 62 | 63 | void saveAs(const fs::path p) { 64 | image.save_tiff(p.c_str()); 65 | } 66 | 67 | /* 68 | * Volume size 69 | */ 70 | size_t nx() const { 71 | return image._width; 72 | } 73 | 74 | size_t ny() const { 75 | return image._height; 76 | } 77 | 78 | size_t nz() const { 79 | return image._depth; 80 | } 81 | 82 | private: 83 | const fs::path path; 84 | CImg image; 85 | }; 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /src/Helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_CUH 2 | #define HELPER_CUH 3 | 4 | // corresponded header file 5 | // necessary project headers 6 
| #include "HelperMath.cuh" 7 | // 3rd party libraries headers 8 | #include 9 | #include 10 | // standard libraries headers 11 | #include 12 | #include 13 | // system headers 14 | 15 | /* 16 | * Kernel thread size computation. 17 | */ 18 | #define DIVUP(x, y) ((x+y-1)/y) 19 | 20 | /* 21 | * CUDA Runtime 22 | */ 23 | #define cudaErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); } 24 | 25 | inline void cudaAssert( 26 | cudaError_t code, 27 | const char *file, int line, 28 | bool abort=true 29 | ) { 30 | if (code != cudaSuccess) { 31 | fprintf(stderr,"CUDA Runtime: %s\n.. %s ln%d\n", cudaGetErrorString(code), file, line); 32 | if (abort) { 33 | exit(code); 34 | } 35 | } 36 | } 37 | 38 | /* 39 | * cuFFT 40 | */ 41 | static const char * cufftGetErrorString(cufftResult error) { 42 | switch (error) { 43 | case CUFFT_SUCCESS: 44 | return "the cuFFT operation was successful"; 45 | 46 | case CUFFT_INVALID_PLAN: 47 | return "cuFFT was passed an invalid plan handle"; 48 | 49 | case CUFFT_ALLOC_FAILED: 50 | return "cuFFT failed to allocate GPU or CPU memory"; 51 | 52 | case CUFFT_INVALID_VALUE: 53 | return "user specified an invalid pointer or parameter"; 54 | 55 | case CUFFT_INTERNAL_ERROR: 56 | return "driver or internal cuFFT library error"; 57 | 58 | case CUFFT_EXEC_FAILED: 59 | return "failed to execute an FFT on the GPU"; 60 | 61 | case CUFFT_SETUP_FAILED: 62 | return "the cuFFT library failed to initialize"; 63 | 64 | case CUFFT_INVALID_SIZE: 65 | return "user specified an invalid transform size"; 66 | 67 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 68 | return "missing parameters in call"; 69 | 70 | case CUFFT_INVALID_DEVICE: 71 | return "execution of a plan was on different GPU than plan creation"; 72 | 73 | case CUFFT_PARSE_ERROR: 74 | return "internal plan database error"; 75 | 76 | case CUFFT_NO_WORKSPACE: 77 | return "no workspace has been provided prior to plan execution"; 78 | 79 | case CUFFT_NOT_IMPLEMENTED: 80 | return "function does not implement functionality for parameters given"; 81 | 82 | case CUFFT_NOT_SUPPORTED: 83 | return "operation is not supported for parameters given"; 84 | 85 | default: 86 | return ""; 87 | } 88 | } 89 | 90 | inline void cudaAssert( 91 | cufftResult_t code, 92 | const char *file, int line, 93 | bool abort=true 94 | ) { 95 | if (code != CUFFT_SUCCESS) { 96 | fprintf(stderr,"cuFFT: %s\n.. %s ln%d\n", cufftGetErrorString(code), file, line); 97 | if (abort) { 98 | exit(code); 99 | } 100 | } 101 | } 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeconvLR 2 | DeconvLR is a open source CUDA implementation of accelerated Richard-Lucy Deconvolution algorithm regularized with total variation loss. This library is developed to recovered blurred image due to the spreading of point source in optical system. As far as we know, there is no other fully functional open source GPU accelerated implementation. This project is aim to develope an open source, high efficient library to process high resolution images of high quality. 3 | 4 | ## Getting Started 5 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system. 6 | 7 | ### Prerequisites 8 | You need the following packages to get started. 
9 | 10 | ***nix** 11 | ``` 12 | make 13 | g++ <= 5 14 | CMake >= 3.6 15 | Boost >= 1.59 16 | CUDA >= 8.0 17 | ``` 18 | 19 | **Windows** 20 | 21 | **TODO** I haven't exactly tested this on Windows. DLL export symbols are needed in the public header. 22 | 23 | ### Build 24 | 1. Please clone this repository 25 | ```bash 26 | git clone https://github.com/liuyenting/DeconvLR.git 27 | ``` 28 | or download and extract the tarball from [release page](https://github.com/liuyenting/DeconvLR/releases). 29 | ```bash 30 | tar zxvf DeconRL.tar.gz 31 | ``` 32 | 2. Go to source directory and create a new build output directory. 33 | ```bash 34 | cd DeconvLR 35 | mkdir build 36 | ``` 37 | 3. We use `cmake` to do the heavy lifting. 38 | ```bash 39 | cd build 40 | cmake .. 41 | ``` 42 | if everything runs smoothly, we can proceed with 43 | ```bash 44 | make 45 | ``` 46 | 47 | ## Running the demo 48 | **TODO** Explain how to run the demo. 49 | 50 | Asides from the demo, this library is intended to use as 51 | ```c++ 52 | std::string origImgFile = "data/bigradient/sample.tif"; 53 | std::string psfFile = "data/bigradient/psf_n15_z5.tif"; 54 | 55 | // load psf 56 | ImageStack psf(psfFile); 57 | 58 | // init the deconvlr 59 | DeconvLR deconvWorker; 60 | deconvWorker.setResolution(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); 61 | 62 | // open the image 63 | const ImageStack input(origImgFile); 64 | ImageStack output(input, 0); 65 | 66 | // use the first image to init the resources 67 | deconvWorker.setVolumeSize(input.nx(), input.ny(), input.nz()); 68 | deconvWorker.setPSF(psf); 69 | 70 | // run the deconv 71 | deconvWorker.process(output, input); 72 | ``` 73 | 74 | ## Benchmark 75 | **TODO** move benchmakr images from gh-page (in docs folder) to here. 76 | 77 | ## Authors 78 | * **Liu, Yen-Ting** - *Initial work* - [liuyenting](https://github.com/liuyenting/) 79 | * **Chiang, Tin-Ray** - *Initial work* - [CTinRay](https://github.com/CTinRay) 80 | 81 | ## License 82 | This project is licensed under the Apache License - see the [LICENSE](LICENSE) file for details 83 | 84 | ## References 85 | * William Hadley Richardson (1972), "Bayesian-Based Iterative Method of Image Restoration*," J. Opt. Soc. Am. 62, 55-59. 86 | * Lucy, L. B. (1974). "An iterative technique for the rectification of observed distributions". Astronomical Journal. 79 (6): 745–754. 87 | * Biggs, D. S., & Andrews, M. (1997). Acceleration of iterative image restoration algorithms. Applied optics, 36(8), 1766-1775. 88 | * Dey, N., Blanc-Féraud, L., Zimmer, C., Roux, P., Kam, Z., Olivo-Marin, J. C., & Zerubia, J. (2004). 3D microscopy deconvolution using Richardson-Lucy algorithm with total variation regularization (Doctoral dissertation, INRIA). 
89 | -------------------------------------------------------------------------------- /src/DumpData.cu: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | #include "DumpData.cuh" 3 | // necessary project headers 4 | #include "Helper.cuh" 5 | // 3rd party libraries headers 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define cimg_use_tiff 15 | #include "CImg.h" 16 | using namespace cimg_library; 17 | // standard libraries headers 18 | #include 19 | #include 20 | // system headers 21 | 22 | namespace DumpData { 23 | 24 | namespace { 25 | 26 | __global__ 27 | void abs_kernel( 28 | cufftReal *odata, 29 | const cufftComplex *idata, 30 | const size_t nx, const size_t ny, const size_t nz 31 | ) { 32 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 33 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 34 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 35 | 36 | // skip out-of-bound threads 37 | if (ix >= nx or iy >= ny or iz >= nz) { 38 | return; 39 | } 40 | 41 | int idx = iz * (nx*ny) + iy * nx + ix; 42 | odata[idx] = cuCabsf(idata[idx]); 43 | } 44 | 45 | } 46 | 47 | namespace Device { 48 | 49 | void real( 50 | std::string fname, 51 | const cufftReal *d_idata, 52 | const size_t nx, const size_t ny, const size_t nz 53 | ) { 54 | fprintf(stderr, 55 | "[DBG] dump %ldx%ldx%ld from device to \"%s\"\n", 56 | nx, ny, nz, fname.c_str() 57 | ); 58 | 59 | CImg data(nx, ny, nz); 60 | const size_t size = data.size() * sizeof(float); 61 | 62 | // pinned down the host memory region 63 | float *d_odata; 64 | cudaErrChk(cudaHostRegister(data.data(), size, cudaHostRegisterMapped)); 65 | cudaErrChk(cudaHostGetDevicePointer(&d_odata, data.data(), 0)); 66 | 67 | // copy from device to host 68 | cudaErrChk(cudaMemcpy(d_odata, d_idata, size, cudaMemcpyDeviceToHost)); 69 | 70 | // release the resources 71 | cudaErrChk(cudaHostUnregister(data.data())); 72 | 73 | // save the result to file 74 | data.save_tiff(fname.c_str()); 75 | } 76 | 77 | void complex( 78 | std::string fname, 79 | const cufftComplex *d_idata, 80 | const size_t nx, const size_t ny, const size_t nz 81 | ) { 82 | fprintf(stderr, 83 | "[DBG] dump %ldx%ldx%ld from device to \"%s\"\n", 84 | nx, ny, nz, fname.c_str() 85 | ); 86 | 87 | CImg data(nx, ny, nz); 88 | const size_t size = data.size() * sizeof(float); 89 | 90 | // pinned down the host memory region 91 | float *d_odata; 92 | cudaErrChk(cudaHostRegister(data.data(), size, cudaHostRegisterMapped)); 93 | cudaErrChk(cudaHostGetDevicePointer(&d_odata, data.data(), 0)); 94 | 95 | dim3 nthreads(16, 16, 4); 96 | dim3 nblocks( 97 | DIVUP(nx, nthreads.x), DIVUP(ny, nthreads.y), DIVUP(nz, nthreads.z) 98 | ); 99 | abs_kernel<<>>( 100 | d_odata, 101 | d_idata, 102 | nx, ny, nz 103 | ); 104 | cudaErrChk(cudaPeekAtLastError()); 105 | 106 | // release the resources 107 | cudaErrChk(cudaHostUnregister(data.data())); 108 | 109 | // save the result to file 110 | data.save_tiff(fname.c_str()); 111 | } 112 | 113 | } 114 | 115 | namespace Host { 116 | 117 | void real( 118 | std::string fname, 119 | cufftReal *h_idata, 120 | const size_t nx, const size_t ny, const size_t nz 121 | ) { 122 | fprintf(stderr, 123 | "[DBG] dump %ldx%ldx%ld from host to \"%s\"\n", 124 | nx, ny, nz, fname.c_str() 125 | ); 126 | 127 | CImg data(nx, ny, nz); 128 | const size_t size = data.size() * sizeof(float); 129 | 130 | // copy to image data region 131 | std::memcpy(data.data(), h_idata, size); 132 | 133 | // save the result 
to file 134 | data.save_tiff(fname.c_str()); 135 | } 136 | 137 | } 138 | 139 | } 140 | -------------------------------------------------------------------------------- /src/DeconvRLImpl.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DECONV_LR_CORE_CUH 2 | #define DECONV_LR_CORE_CUH 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | // 3rd party libraries headers 7 | #include 8 | #include 9 | // standard libraries headers 10 | #include 11 | // system headers 12 | 13 | namespace DeconvRL { 14 | 15 | namespace PSF { 16 | 17 | class PSF { 18 | public: 19 | PSF( 20 | float *h_psf, 21 | const size_t npx, const size_t npy, const size_t npz = 1 22 | ); 23 | ~PSF(); 24 | 25 | /** 26 | * @brief Center the centroid of the provided PSF. 27 | * 28 | * The method calls upon the the estimateBackground method to estimate and 29 | * remove the potential bacground noises by its mean. Later, centroid of the 30 | * PSF is calculated and used to circular shift the original PSF to its 31 | * align with its center. 32 | * 33 | * @see findCentroid, estimateBackground 34 | */ 35 | void alignCenter(const size_t nx, const size_t ny, const size_t nz = 1); 36 | 37 | /** 38 | * @brief Convert the PSF to OTF. 39 | * 40 | * Convert the PSF to an OTF by a FFT. Caller has to allocate the OTF 41 | * pointer with a proper memory space, (nx/2+1)*ny*nz*sizeof(cufftComplex). 42 | * 43 | * @param d_otf The converted OTF. 44 | * @param nx Number of elements in the X dimension (fastest variation). 45 | * @param ny Number of elements in the Y dimension. 46 | * @param nz Number of elements in the Z dimension (slowest variation). 47 | * 48 | * @see 49 | */ 50 | void createOTF(cufftComplex *d_otf); 51 | 52 | private: 53 | float3 findCentroid(); 54 | float estimateBackground(); 55 | void padPSF(const size_t nx, const size_t ny, const size_t nz); 56 | 57 | // PSF memory, host side and mirrored device address 58 | float *d_psf; 59 | 60 | // size of the PSF 61 | size_t npx, npy, npz; 62 | size_t nelem; 63 | }; 64 | 65 | } 66 | 67 | namespace Core { 68 | 69 | /** 70 | * @brief Brief introduction to the function. 71 | * 72 | * Description of what the function does 73 | * @param PARAM1 Description of the first parameter of the function. 74 | * @return Describe what the function returns. 75 | * @see FUNCTION 76 | */ 77 | 78 | /** 79 | * Parameter class that holds all constant and temporary variables during the 80 | * Richardson-Lucy iteration steps. 81 | */ 82 | struct Parameters { 83 | //TODO destructor to free the memory region 84 | //TODO destructor to free the FFT handles 85 | 86 | /** 87 | * Dimension of the image in real space. 88 | */ 89 | size_t nx, ny, nz; 90 | // product of nx, ny and nz 91 | size_t nelem; 92 | 93 | // original image 94 | float *raw; 95 | 96 | // OTF 97 | cufftComplex *otf; 98 | 99 | /** 100 | * cuFFT handles for forward (R2C) and reverse (C2R) FFT operations. 101 | */ 102 | struct { 103 | cufftHandle forward; 104 | cufftHandle reverse; 105 | } fftHandle; 106 | 107 | /** 108 | * I/O buffer to interface with the host. 109 | */ 110 | struct { 111 | cufftReal *input; 112 | cufftReal *output; 113 | } ioBuffer; 114 | 115 | /** 116 | * Intermediate buffers, maximum size is used, aka padded input data size. 
117 | */ 118 | struct { 119 | cufftComplex *complexA; 120 | } filterBuffer; 121 | 122 | struct { 123 | cufftReal *realA; 124 | } RLBuffer; 125 | 126 | /** 127 | * Prediction buffers 128 | */ 129 | struct { 130 | float *prevIter; 131 | float *prevPredChg; 132 | } predBuffer; 133 | }; 134 | 135 | namespace RL { 136 | 137 | /** 138 | * @brief One iteration in the Richardson-Lucy algorithm. 139 | * 140 | * DESCRIPTION 141 | * @param odata Result from current iteration. 142 | * @param idata Result of previous iteration. 143 | * @param parm Algorithm related parameters. 144 | * @return 145 | * @see 146 | */ 147 | void step( 148 | float *odata, const float *idata, 149 | Core::Parameters &parms 150 | ); 151 | 152 | } 153 | 154 | namespace Biggs { 155 | 156 | /** 157 | * @brief One iteration in the accelerated Richardson-Lucy algorithm. 158 | * 159 | * DESCRIPTION 160 | * @param odata Result from current iteration. 161 | * @param idata Result of previous iteration. 162 | * @param parm Algorithm related parameters. 163 | * @return 164 | * @see 165 | */ 166 | void step( 167 | float *odata, const float *idata, 168 | Core::Parameters &parms 169 | ); 170 | 171 | } 172 | 173 | } 174 | 175 | namespace Common { 176 | 177 | void ushort2float(float *odata, const uint16_t *idata, const size_t nelem); 178 | 179 | } 180 | 181 | } 182 | 183 | #endif 184 | -------------------------------------------------------------------------------- /src/Core.cu: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | // necessary project headers 3 | #include "DeconvRLImpl.cuh" 4 | #include "Helper.cuh" 5 | // 3rd party libraries headers 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | // standard libraries headers 16 | #include 17 | // system headers 18 | 19 | namespace DeconvRL { 20 | 21 | namespace Core { 22 | 23 | namespace RL { 24 | 25 | enum class ConvType { 26 | PLAIN = 1, CONJUGATE 27 | }; 28 | 29 | namespace { 30 | // generic complex number operation 31 | struct MultiplyAndScale 32 | : public thrust::binary_function { 33 | MultiplyAndScale(const float c_) 34 | : c(c_) { 35 | } 36 | 37 | __host__ __device__ 38 | cuComplex operator()(const cuComplex &a, const cuComplex &b) const { 39 | return cuCmulf(a, b)/c; 40 | } 41 | 42 | private: 43 | const float c; 44 | }; 45 | 46 | void filter( 47 | cufftReal *odata, const cufftReal *idata, const cufftComplex *otf, 48 | Core::Parameters &parm 49 | ) { 50 | const size_t nelem = (parm.nx/2+1) * parm.ny * parm.nz; 51 | cufftComplex *buffer = (cufftComplex *)parm.filterBuffer.complexA; 52 | 53 | // convert to frequency space 54 | cudaErrChk(cufftExecR2C( 55 | parm.fftHandle.forward, 56 | const_cast(idata), 57 | buffer 58 | )); 59 | // element-wise multiplication and scale down 60 | thrust::transform( 61 | thrust::device, 62 | buffer, buffer+nelem, // first input sequence 63 | otf, // second input sequence 64 | buffer, // output sequence 65 | MultiplyAndScale(1.0f/parm.nelem) 66 | ); 67 | // convert back to real space 68 | cudaErrChk(cufftExecC2R( 69 | parm.fftHandle.reverse, 70 | buffer, 71 | odata 72 | )); 73 | } 74 | 75 | thrust::divides DivfOp; 76 | thrust::multiplies MulfOp; 77 | 78 | } 79 | 80 | void step( 81 | float *odata, const float *idata, 82 | Core::Parameters &parms 83 | ) { 84 | fprintf(stderr, "[DBG] +++ ENTER RL::step() +++\n"); 85 | 86 | const size_t nelem = parms.nelem; 87 | cufftReal *buffer = parms.RLBuffer.realA; 88 | 89 | 
cufftComplex *otf = parms.otf; 90 | 91 | /* 92 | * \hat{f_{k+1}} = 93 | * \hat{f_k} \left( 94 | * h \ast \frac{g}{h \otimes \hat{f_k}} 95 | * \right) 96 | */ 97 | 98 | // reblur the image 99 | filter(buffer, idata, otf, parms); 100 | // error 101 | thrust::transform( 102 | thrust::device, 103 | parms.raw, parms.raw+nelem, 104 | buffer, 105 | buffer, // output 106 | DivfOp 107 | ); 108 | filter(buffer, buffer, otf, parms); 109 | // latent image 110 | thrust::transform( 111 | thrust::device, 112 | idata, idata+nelem, 113 | buffer, 114 | odata, // output 115 | MulfOp 116 | ); 117 | 118 | fprintf(stderr, "[DBG] +++ EXIT RL::step() +++\n"); 119 | } 120 | 121 | } 122 | 123 | namespace Biggs { 124 | 125 | namespace { 126 | 127 | struct ScaleAndAdd 128 | : public thrust::binary_function { 129 | ScaleAndAdd(const float alpha_) 130 | : alpha(alpha_) { 131 | } 132 | 133 | __host__ __device__ 134 | float operator()(const float &a, const float &b) const { 135 | // apply positivity constraint after SAXPY 136 | //return fmaxf(a + alpha*b, 0.0f); 137 | return a + alpha*b; 138 | } 139 | 140 | private: 141 | const float alpha; 142 | }; 143 | 144 | } 145 | 146 | void step( 147 | float *odata, const float *idata, 148 | Core::Parameters &parm 149 | ) { 150 | // borrow space from odata, rename to avoid confusion 151 | float* iter = odata; 152 | // calcualte x_k 153 | RL::step(iter, idata, parm); 154 | 155 | // extract the definition 156 | float *prevIter = parm.predBuffer.prevIter; 157 | float *prevPredChg = parm.predBuffer.prevPredChg; 158 | 159 | // updateDir borrow buffer from prevIter 160 | float* updateDir = prevIter; 161 | // h_k in the paper 162 | // update_direction = prev_iter - iter; 163 | thrust::transform( 164 | thrust::device, 165 | iter, iter+parm.nelem, 166 | prevIter, 167 | updateDir, 168 | thrust::minus() 169 | ); 170 | 171 | // reuse space of idata 172 | float *predChg = const_cast(idata); 173 | // calculate g_{k - 1} = x_k - y_{k - 1}. 174 | // pred_change = iter - prev_pred; 175 | thrust::transform( 176 | thrust::device, 177 | iter, iter+parm.nelem, 178 | idata, 179 | predChg, 180 | thrust::minus() 181 | ); 182 | 183 | // calculate alpha (acceleration factor). 
184 | float den = thrust::inner_product( 185 | thrust::device, 186 | predChg, predChg+parm.nelem, 187 | prevPredChg, 188 | 0.0f 189 | ); 190 | float nom = ( 191 | thrust::inner_product( 192 | thrust::device, 193 | prevPredChg, prevPredChg+parm.nelem, 194 | prevPredChg, 195 | 0.0f 196 | ) + std::numeric_limits::epsilon() 197 | ); 198 | float alpha = den / nom; 199 | fprintf(stderr, "[DBG] fraction [%f/%f = %f]\n", den, nom, alpha); 200 | 201 | // stability enforcement 202 | alpha = std::max(std::min(alpha, 1.0f), 0.0f); 203 | fprintf(stderr, "[INF] alpha = %f\n", alpha); 204 | 205 | // save current predictions 206 | cudaErrChk(cudaMemcpy( 207 | prevIter, 208 | iter, 209 | parm.nelem * sizeof(float), 210 | cudaMemcpyDeviceToDevice 211 | )); 212 | cudaErrChk(cudaMemcpy( 213 | prevPredChg, 214 | predChg, 215 | parm.nelem * sizeof(float), 216 | cudaMemcpyDeviceToDevice 217 | )); 218 | 219 | // calculate y_k 220 | // odata = iter + alpha * update_direction; 221 | thrust::transform( 222 | thrust::device, 223 | iter, iter+parm.nelem, 224 | updateDir, 225 | odata, 226 | ScaleAndAdd(alpha) 227 | ); 228 | } 229 | 230 | } 231 | 232 | } 233 | 234 | namespace Common { 235 | 236 | namespace { 237 | 238 | template 239 | struct ToFloat 240 | : public thrust::unary_function { 241 | __host__ __device__ 242 | float operator()(const T &v) const { 243 | return (float)v; 244 | } 245 | }; 246 | 247 | } 248 | 249 | void ushort2float(float *odata, const uint16_t *idata, const size_t nelem) { 250 | thrust::transform( 251 | thrust::device, 252 | idata, idata + nelem, // input 253 | odata, // output 254 | ToFloat() 255 | ); 256 | } 257 | 258 | } 259 | 260 | } 261 | -------------------------------------------------------------------------------- /src/DeconvLRDriver.cpp: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | #include "DeconvRLDriver.hpp" 3 | // necessary project headers 4 | #include "DeconvRLImpl.cuh" 5 | #include "Helper.cuh" 6 | #include "DumpData.cuh" 7 | // 3rd party libraries headers 8 | #include 9 | // standard libraries headers 10 | #include 11 | #include 12 | // system headers 13 | 14 | namespace DeconvRL { 15 | 16 | struct DeconvRL::Impl { 17 | Impl() { 18 | } 19 | 20 | ~Impl() { 21 | // TODO free iterParms 22 | } 23 | 24 | // volume size 25 | dim3 volumeSize; 26 | // voxel size 27 | struct { 28 | float3 raw; 29 | float3 psf; 30 | } voxelSize; 31 | 32 | /* 33 | * Algorithm configurations. 34 | */ 35 | int iterations; 36 | Core::Parameters iterParms; 37 | }; 38 | 39 | // C++14 feature 40 | template 41 | std::unique_ptr make_unique(Args&& ... 
args) { 42 | return std::unique_ptr(new T(std::forward(args) ...)); 43 | } 44 | 45 | DeconvRL::DeconvRL() 46 | : pimpl(make_unique()) { 47 | } 48 | 49 | DeconvRL::~DeconvRL() { 50 | 51 | } 52 | 53 | void DeconvRL::setResolution( 54 | const float dx, const float dy, const float dz, 55 | const float dpx, const float dpy, const float dpz 56 | ) { 57 | /* 58 | * Spatial frequency ratio (along one dimension) 59 | * 60 | * 1/(NS * DS) NP DP NP 61 | * R = ----------- = -- * -- = -- * r 62 | * 1/(NP * DP) NS DS NS 63 | * 64 | * NS, sample size 65 | * DS, sample voxel size 66 | * NP, PSF size 67 | * DP, PSF voxel size 68 | * r, voxel ratio 69 | */ 70 | pimpl->voxelSize.raw = make_float3(dx, dy, dz); 71 | pimpl->voxelSize.psf = make_float3(dpx, dpy, dpz); 72 | } 73 | 74 | void DeconvRL::setVolumeSize( 75 | const size_t nx, const size_t ny, const size_t nz 76 | ) { 77 | //TODO probe for device specification 78 | if (nx > 2048 or ny > 2048 or nz > 2048) { 79 | throw std::range_error("volume size exceeds maximum constraints"); 80 | } 81 | pimpl->volumeSize.x = nx; 82 | pimpl->volumeSize.y = ny; 83 | pimpl->volumeSize.z = nz; 84 | 85 | fprintf( 86 | stderr, 87 | "[INF] volume size = %ux%ux%u\n", 88 | pimpl->volumeSize.x, pimpl->volumeSize.y, pimpl->volumeSize.z 89 | ); 90 | } 91 | 92 | //TODO remove ImageStack dependency 93 | void DeconvRL::setPSF(const ImageStack &psf_u16) { 94 | /* 95 | * Ensure we are working with floating points. 96 | */ 97 | ImageStack psf(psf_u16); 98 | fprintf( 99 | stderr, 100 | "[INF] PSF size = %ldx%ldx%ld\n", 101 | psf.nx(), psf.ny(), psf.nz() 102 | ); 103 | 104 | /* 105 | * Generate the OTF. 106 | */ 107 | PSF::PSF psfProc(psf.data(), psf.nx(), psf.ny(), psf.nz()); 108 | psfProc.alignCenter( 109 | pimpl->volumeSize.x, pimpl->volumeSize.y, pimpl->volumeSize.z 110 | ); 111 | 112 | // allocate memory space for OTF 113 | cudaErrChk(cudaMalloc( 114 | &pimpl->iterParms.otf, 115 | (pimpl->volumeSize.x/2+1) * pimpl->volumeSize.y * pimpl->volumeSize.z * sizeof(cufftComplex) 116 | )); 117 | // create the OTF 118 | psfProc.createOTF(pimpl->iterParms.otf); 119 | fprintf(stderr, "[INF] OTF established\n"); 120 | } 121 | 122 | void DeconvRL::initialize() { 123 | const dim3 volumeSize = pimpl->volumeSize; 124 | Core::Parameters &iterParms = pimpl->iterParms; 125 | 126 | /* 127 | * Load dimension information into the iteration parameter. 128 | */ 129 | iterParms.nx = volumeSize.x; 130 | iterParms.ny = volumeSize.y; 131 | iterParms.nz = volumeSize.z; 132 | iterParms.nelem = volumeSize.x * volumeSize.y * volumeSize.z; 133 | 134 | /* 135 | * Create FFT plans. 136 | */ 137 | // FFT plans for estimation 138 | cudaErrChk(cufftPlan3d( 139 | &iterParms.fftHandle.forward, 140 | volumeSize.z, volumeSize.y, volumeSize.x, 141 | CUFFT_R2C 142 | )); 143 | cudaErrChk(cufftPlan3d( 144 | &iterParms.fftHandle.reverse, 145 | volumeSize.z, volumeSize.y, volumeSize.x, 146 | CUFFT_C2R 147 | )); 148 | 149 | //TODO attach callback device functions 150 | 151 | /* 152 | * Estimate memory usage from FFT procedures. 153 | */ 154 | 155 | /* 156 | * Allocate device staging area. 
157 | */ 158 | size_t realSize = 159 | volumeSize.x * volumeSize.y * volumeSize.z * sizeof(cufftReal); 160 | size_t complexSize = 161 | (volumeSize.x/2+1) * volumeSize.y * volumeSize.z * sizeof(cufftComplex); 162 | 163 | // template 164 | cudaErrChk(cudaMalloc((void **)&iterParms.raw, realSize)); 165 | 166 | // IO buffer 167 | cudaErrChk(cudaMalloc((void **)&iterParms.ioBuffer.input, realSize)); 168 | cudaErrChk(cudaMalloc((void **)&iterParms.ioBuffer.output, realSize)); 169 | 170 | // FFT Buffer 171 | cudaErrChk(cudaMalloc((void **)&iterParms.filterBuffer.complexA, complexSize)); 172 | 173 | // RL Buffer 174 | cudaErrChk(cudaMalloc((void **)&iterParms.RLBuffer.realA, realSize)); 175 | 176 | // prediction buffer 177 | cudaErrChk(cudaMalloc((void **)&iterParms.predBuffer.prevIter, realSize)); 178 | cudaErrChk(cudaMalloc((void **)&iterParms.predBuffer.prevPredChg, realSize)); 179 | } 180 | 181 | void DeconvRL::setIterations(const int i) { 182 | if (i < 1) { 183 | throw std::range_error("iteration cycle has to be at least 1"); 184 | } 185 | pimpl->iterations = i; 186 | } 187 | 188 | //TODO scale output from float to uint16 189 | void DeconvRL::process( 190 | ImageStack &odata, 191 | const ImageStack &idata 192 | ) { 193 | Core::Parameters &iterParms = pimpl->iterParms; 194 | const size_t nelem = iterParms.nelem; 195 | 196 | // register the input data memory region on host as pinned 197 | cudaErrChk(cudaHostRegister( 198 | idata.data(), 199 | nelem * sizeof(uint16_t), 200 | cudaHostRegisterMapped 201 | )); 202 | 203 | // retrieve the host pointer 204 | uint16_t *d_idata = nullptr; 205 | cudaErrChk(cudaHostGetDevicePointer(&d_idata, idata.data(), 0)); 206 | 207 | /* 208 | * Copy the data to buffer area along with type casts. 209 | */ 210 | fprintf(stderr, "[DBG] %ld elements to type cast\n", nelem); 211 | Common::ushort2float( 212 | iterParms.ioBuffer.input, // output 213 | d_idata, // input 214 | nelem 215 | ); 216 | 217 | // duplicate the to store a copy of raw data 218 | cudaErrChk(cudaMemcpy( 219 | iterParms.raw, 220 | iterParms.ioBuffer.input, 221 | nelem * sizeof(float), 222 | cudaMemcpyDeviceToDevice 223 | )); 224 | 225 | /* 226 | * Release the pinned memory region. 227 | */ 228 | cudaErrChk(cudaHostUnregister(idata.data())); 229 | 230 | // preset the iteration 231 | cudaErrChk(cudaMemcpy( 232 | iterParms.predBuffer.prevIter, 233 | iterParms.ioBuffer.input, 234 | nelem * sizeof(float), 235 | cudaMemcpyDeviceToDevice 236 | )); 237 | cudaErrChk(cudaMemset( 238 | iterParms.predBuffer.prevPredChg, 239 | 0, 240 | nelem * sizeof(float) 241 | )); 242 | 243 | /* 244 | * Execute the core functions. 
245 | */ 246 | const int nIter = pimpl->iterations; 247 | for (int iIter = 1; iIter <= nIter; iIter++) { 248 | //Core::RL::step( 249 | Core::Biggs::step( 250 | iterParms.ioBuffer.output, // output 251 | iterParms.ioBuffer.input, // input 252 | iterParms 253 | ); 254 | // swap A, B buffer 255 | std::swap(iterParms.ioBuffer.input, iterParms.ioBuffer.output); 256 | 257 | fprintf(stderr, "[INF] %d/%d\n", iIter, nIter); 258 | } 259 | 260 | // swap back to avoid confusion 261 | std::swap(iterParms.ioBuffer.input, iterParms.ioBuffer.output); 262 | 263 | // noarmalize the result to [0, 65535] 264 | 265 | 266 | // copy back to host 267 | cudaErrChk(cudaMemcpy( 268 | odata.data(), 269 | iterParms.ioBuffer.output, 270 | nelem * sizeof(cufftReal), 271 | cudaMemcpyDeviceToHost 272 | )); 273 | } 274 | 275 | } 276 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Liu, Yen-Ting 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 10 | 11 | 12 | 13 |
DeconvRL

A GPU-Accelerated Richardson-Lucy Deconvolution Library

What is DeconvRL

DeconvRL is an open-source CUDA implementation of the accelerated Richardson-Lucy deconvolution algorithm, regularized with a total variation loss. The library is developed to recover images that are blurred by the spreading of point sources in an optical system. As far as we know, there is no other fully functional, open-source, GPU-accelerated implementation; this project aims to provide an efficient library for processing large, high-resolution images.

Quick Start

Requirements

  • make
  • g++ <= 5
  • CMake >= 3.6
  • Boost >= 1.59
  • CUDA >= 8.0

Build

  • First download the source code and untar it.

        tar zxvf DeconvRL.tar.gz

  • Then cd into the source code directory and create a directory called build.

        cd DeconvRL
        mkdir build

  • Then cd into the build directory and run cmake, followed by make, to build the project.

        cd build
        cmake ..
        make

Usage

This library can be used as follows:
    std::string origImgFile = "data/bigradient/sample.tif";
    std::string psfFile = "data/bigradient/psf_n15_z5.tif";

    // load psf
    ImageStack<uint16_t> psf(psfFile);

    // init the deconvlr
    DeconvLR deconvWorker;
    deconvWorker.setResolution(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f);

    // open the image
    const ImageStack<uint16_t> input(origImgFile);
    ImageStack<uint16_t> output(input, 0);

    // use the first image to init the resources
    deconvWorker.setVolumeSize(input.nx(), input.ny(), input.nz());
    deconvWorker.setPSF(psf);

    // run the deconv
    deconvWorker.process(output, input);
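The snippet above stops at process(); the bundled util/demo.cpp additionally allocates the device resources and sets the iteration count before processing, and saves the result afterwards. A minimal completion of the example, following the demo (the iteration count of 10 is simply the demo's choice), looks like:

    // allocate host and device resources before the first run
    deconvWorker.initialize();
    // number of Richardson-Lucy iterations to execute
    deconvWorker.setIterations(10);

    // run the deconv and write the result
    deconvWorker.process(output, input);
    output.saveAs("result.tif");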

Algorithm

Richardson-Lucy Deconvolution

An image can be seen as a combination of point sources. Point sources are never ideal, however, so each one spreads out when it is recorded as a photo. In the Richardson-Lucy deconvolution algorithm, pixel \(d_i\) of a photo is modeled as the expected sum of light from every real point source, that is

\[d_i = \sum_j p_{ij} u_j\]

where \(p_{ij}\) is the probability that light from the point source at position \(j\) is recorded in pixel \(i\) of the photo, and \(u_j\) is the amount of light from the point source at position \(j\). After some derivation, and generalized to multiple dimensions, the maximum-likelihood distribution of the real point sources can be reconstructed from a given photo \(d\) with an expectation-maximization iteration:

\[u_{t + 1} = u_{t} \cdot \left( \frac{d}{u_{t} * p} * \hat{p} \right)\]

where \(\hat{p}_{ij} = p_{(n-i)(m-j)}\) for \(d\) of shape \(n \times m\), and \(*\) is the convolution operator, implemented as an element-wise multiplication in the frequency domain.
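To make the update concrete, below is a self-contained 1-D CPU sketch of the iteration. It is purely illustrative (naive spatial convolution, hypothetical helper names); the library itself applies the same arithmetic to 3-D volumes on the GPU, using cuFFT-based convolutions in src/Core.cu.

```c++
#include <cstdio>
#include <vector>

// naive 1-D convolution; flip=true correlates with the mirrored kernel (p-hat)
static std::vector<float> convolve(const std::vector<float> &f,
                                   const std::vector<float> &k, bool flip) {
    const int n = (int)f.size(), m = (int)k.size(), half = m / 2;
    std::vector<float> out(n, 0.0f);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            const int src = i + (flip ? (j - half) : (half - j));
            if (src >= 0 && src < n) {
                out[i] += f[src] * k[j];
            }
        }
    }
    return out;
}

// one Richardson-Lucy update: u_{t+1} = u_t * ((d / (u_t * p)) * p-hat)
static std::vector<float> rlStep(const std::vector<float> &u,
                                 const std::vector<float> &d,
                                 const std::vector<float> &p) {
    const std::vector<float> reblur = convolve(u, p, false);
    std::vector<float> ratio(d.size());
    for (size_t i = 0; i < d.size(); i++) {
        ratio[i] = d[i] / (reblur[i] + 1e-6f);  // small epsilon avoids 0/0
    }
    const std::vector<float> corr = convolve(ratio, p, true);
    std::vector<float> next(u.size());
    for (size_t i = 0; i < u.size(); i++) {
        next[i] = u[i] * corr[i];
    }
    return next;
}

int main() {
    const std::vector<float> psf = {0.25f, 0.5f, 0.25f};           // blur kernel
    const std::vector<float> d   = {0.f, 0.25f, 0.5f, 0.25f, 0.f}; // blurred point
    std::vector<float> u = d;                                      // initial estimate
    for (int it = 0; it < 100; it++) {
        u = rlStep(u, d, psf);
    }
    for (float v : u) {
        printf("%.3f ", v);  // the estimate sharpens back toward the central pixel
    }
    printf("\n");
    return 0;
}
```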

Biggs-Andrews Acceleration

92 | This is an acceleration of the Richardson-Lucy algorithm proposed by Biggs and Andrews. Let the original Richardson-Lucy iteration be \(\phi\), namely 93 | 94 | \[u_{t + 1} = \phi(u_t) = u_t \cdot \left( \frac{d}{u_t * p} * \hat{p} \right)\] 95 | 96 | If an image is viewed as a point in a high-dimensional space, the optimization process iteratively searches for the point with maximum likelihood. The original Richardson-Lucy algorithm raises the likelihood by applying \(\phi\) once per iteration. To accelerate it, Biggs and Andrews proposed stepping along a direction \(h\), scaled by an acceleration factor \(\alpha\), between two consecutive applications of \(\phi\). That is, after an iteration we obtain 97 | 98 | \[ x_k = \phi(y_{k-1}) \] 99 | 100 | The direction \(h\) is computed as 101 | 102 | \[ h_k = x_k - x_{k-1} \] 103 | 104 | and the acceleration factor \(\alpha\) as 105 | 106 | \[ \alpha_k = \frac{(x_k - y_{k-1}) \cdot (x_{k-1} - y_{k-2})}{\lVert x_{k-1} - y_{k-2} \rVert_2^2} \] 107 | 108 | Then, between the iterates \(x_k\) and \(x_{k+1}\), an intermediate prediction \(y_k\) is computed: 109 | 110 | \[ y_k = x_k + \alpha_k h_k \] 111 | 112 | This is the first-order acceleration. It can be extended to second- or higher-order acceleration, but only the first-order acceleration is implemented for now. 113 | 114 |
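The prediction step above is small enough to spell out in code. The following is a hedged C++ sketch of the first-order Biggs-Andrews update, not DeconvRL's CUDA kernel; the function name accelerate and the clamping of \(\alpha\) into \([0, 1)\), a common stabilization in practice, are assumptions of this example.

    // Minimal sketch (not DeconvRL's CUDA code): one first-order Biggs-Andrews
    // prediction step; phi (a single Richardson-Lucy update) is applied elsewhere.
    #include <cstddef>
    #include <vector>

    std::vector<double> accelerate(const std::vector<double> &x_k,    // x_k = phi(y_{k-1})
                                   const std::vector<double> &x_km1,  // previous iterate x_{k-1}
                                   const std::vector<double> &y_km1,  // previous prediction y_{k-1}
                                   const std::vector<double> &y_km2)  // prediction y_{k-2}
    {
        const std::size_t n = x_k.size();

        // g_{k-1} = x_k - y_{k-1} and g_{k-2} = x_{k-1} - y_{k-2}
        double num = 0.0, den = 0.0;
        for (std::size_t i = 0; i < n; ++i) {
            const double g1 = x_k[i] - y_km1[i];
            const double g2 = x_km1[i] - y_km2[i];
            num += g1 * g2;
            den += g2 * g2;
        }

        // alpha_k, clamped into [0, 1) to keep the extrapolation stable (assumption)
        double alpha = (den > 0.0) ? num / den : 0.0;
        if (alpha < 0.0) alpha = 0.0;
        if (alpha > 0.99) alpha = 0.99;

        // y_k = x_k + alpha_k * h_k with h_k = x_k - x_{k-1}
        std::vector<double> y_k(n);
        for (std::size_t i = 0; i < n; ++i)
            y_k[i] = x_k[i] + alpha * (x_k[i] - x_km1[i]);
        return y_k;
    }

Each outer iteration then applies \(\phi\) to the predicted point \(y_k\) instead of to \(x_k\), which is where the speed-up comes from.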

Total Variation Regularization

115 | 116 | The original Richardson-Lucy algorithm converges toward a maximum-likelihood image, but that image can be noisy. A total-variation loss can therefore be added as regularization. After some further derivation, it turns out that only a small modification of the original iteration is required: 117 | 118 | \[u_{t + 1} = \phi(u_t) = \frac{u_{t}}{1 - \lambda \, \mathrm{div}\!\left(\frac{\nabla u_t}{|\nabla u_t|}\right)} \cdot \left( \frac{d}{u_{t} * p} * \hat{p} \right)\] 119 | 120 |
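As an illustration of the extra factor, the sketch below computes \(1 / (1 - \lambda\,\mathrm{div}(\nabla u_t / |\nabla u_t|))\) for a 2-D image with a standard forward/backward finite-difference discretization. This is an assumption-laden example (the helper name tv_factor, the eps guard, and the choice of differences are ours), not the library's CUDA implementation.

    // Minimal sketch (not DeconvRL's CUDA code): total-variation factor
    // 1 / (1 - lambda * div(grad(u) / |grad(u)|)) on a 2-D image, using forward
    // differences for the gradient and backward differences for the divergence.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<double> tv_factor(const std::vector<double> &u,
                                  std::size_t nx, std::size_t ny,
                                  double lambda, double eps = 1e-8) {
        auto at = [&](std::size_t x, std::size_t y) { return u[y * nx + x]; };

        // normalized forward-difference gradient
        std::vector<double> gx(u.size(), 0.0), gy(u.size(), 0.0);
        for (std::size_t y = 0; y < ny; ++y)
            for (std::size_t x = 0; x < nx; ++x) {
                const double dx = (x + 1 < nx) ? at(x + 1, y) - at(x, y) : 0.0;
                const double dy = (y + 1 < ny) ? at(x, y + 1) - at(x, y) : 0.0;
                const double norm = std::sqrt(dx * dx + dy * dy) + eps;
                gx[y * nx + x] = dx / norm;
                gy[y * nx + x] = dy / norm;
            }

        // backward-difference divergence and the multiplicative factor
        // (assumes lambda is small enough that the denominator stays positive)
        std::vector<double> factor(u.size(), 1.0);
        for (std::size_t y = 0; y < ny; ++y)
            for (std::size_t x = 0; x < nx; ++x) {
                const double div =
                    (x > 0 ? gx[y * nx + x] - gx[y * nx + x - 1] : gx[y * nx + x]) +
                    (y > 0 ? gy[y * nx + x] - gy[(y - 1) * nx + x] : gy[y * nx + x]);
                factor[y * nx + x] = 1.0 / (1.0 - lambda * div);
            }
        return factor;
    }

In the regularized iteration, the Richardson-Lucy update is simply multiplied element-wise by this factor.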

Benchmark

121 | 122 |
123 |
124 | 126 |
Original Image
127 |
128 |
129 | 130 |
131 |
132 | 134 |
After 10 updates
135 |
136 |
137 | 139 |
After 100 updates
140 |
141 |
142 | 144 |
After 1000 updates
145 |
146 |
147 | 148 | 150 | 151 |

Original Richardson-Lucy Algorithm

152 | 153 |
154 |
155 | 157 |
After 10 updates
158 |
159 |
160 | 162 |
After 100 updates
163 |
164 |
165 | 167 |
After 1000 updates
168 |
169 |
170 | 171 |

Biggs-Andrews Acceleration

172 | 173 |
174 |
175 | 177 |
After 10 updates
178 |
179 |
180 | 182 |
After 100 updates
183 |
184 |
185 | 187 |
After 1000 updates
188 |
189 |
190 | 191 | 192 | 193 |

References

194 |
    195 |
  • William Hadley Richardson (1972), "Bayesian-Based Iterative Method of Image Restoration*," J. Opt. Soc. Am. 62, 55-59.
  • 196 |
  • Lucy, L. B. (1974). "An iterative technique for the rectification of observed distributions". Astronomical Journal. 79 (6): 745–754.
  • 197 |
  • Biggs, D. S., & Andrews, M. (1997). Acceleration of iterative image restoration algorithms. Applied optics, 36(8), 1766-1775.
  • 198 |
  • Dey, N., Blanc-Féraud, L., Zimmer, C., Roux, P., Kam, Z., Olivo-Marin, J. C., & Zerubia, J. (2004). 3D microscopy deconvolution using Richardson-Lucy algorithm with total variation regularization (Doctoral dissertation, INRIA).
  • 199 |
200 |
201 | 202 | Fork me on GitHub 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /src/PSF.cu: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | // necessary project headers 3 | #include "DeconvRLImpl.cuh" 4 | #include "Helper.cuh" 5 | #include "DumpData.cuh" 6 | // 3rd party libraries headers 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define cimg_use_tiff 17 | #include "CImg.h" 18 | using namespace cimg_library; 19 | // standard libraries headers 20 | #include 21 | #include 22 | // system headers 23 | 24 | namespace DeconvRL { 25 | 26 | namespace PSF { 27 | 28 | namespace { 29 | 30 | cudaArray_t psfRes = nullptr; 31 | texture psfTexRef; 32 | 33 | struct SubConstant 34 | : public thrust::unary_function { 35 | SubConstant(const float c_) 36 | : c(c_) { 37 | } 38 | 39 | __host__ __device__ 40 | float operator()(const float &p) const { 41 | float o = p-c; 42 | return (o < 0) ? 0 : o; 43 | } 44 | 45 | private: 46 | const float c; 47 | }; 48 | 49 | __global__ 50 | void createGrid_kernel( 51 | int3 *d_grid, 52 | const size_t nx, const size_t ny, const size_t nz 53 | ) { 54 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 55 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 56 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 57 | 58 | // skip out-of-bound threads 59 | if (ix >= nx or iy >= ny or iz >= nz) { 60 | return; 61 | } 62 | 63 | int idx = iz * (nx*ny) + iy * nx + ix; 64 | d_grid[idx] = make_int3(ix, iy, iz); 65 | } 66 | 67 | struct MultiplyWeighting 68 | : public thrust::unary_function { 69 | MultiplyWeighting( 70 | const float *data, 71 | const size_t nx_, const size_t ny_, const size_t nz_ 72 | ) 73 | : d_weight(data), nx(nx_), ny(ny_), nz(nz_) { 74 | } 75 | 76 | __host__ __device__ 77 | float4 operator()(const int3 &p) const { 78 | const int idx = p.z * (nx*ny) + p.y * nx + p.x; 79 | const float w = d_weight[idx]; 80 | return make_float4(p.x*w, p.y*w, p.z*w, w); 81 | } 82 | 83 | private: 84 | const float *d_weight; 85 | size_t nx, ny, nz; 86 | }; 87 | 88 | __global__ 89 | void simpleClone_kenel( 90 | float *d_tmp, 91 | const size_t nx, const size_t ny, const size_t nz, 92 | const float *d_psf, 93 | const size_t npx, const size_t npy, const size_t npz 94 | ) { 95 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 96 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 97 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 98 | 99 | // skip out-of-bound threads 100 | if (ix >= nx or iy >= ny or iz >= nz) { 101 | return; 102 | } 103 | 104 | int oidx = iz * (nx*ny) + iy * nx + ix; 105 | // rest of the space is filled with zero 106 | if (ix >= npx or iy >= npy or iz >= npz) { 107 | d_tmp[oidx] = 0.0f; 108 | } else { 109 | int iidx = iz * (npx*npy) + iy * npx + ix; 110 | d_tmp[oidx] = d_psf[iidx]; 111 | } 112 | } 113 | 114 | __global__ 115 | void alignCenter_kernel( 116 | float *odata, 117 | const size_t nx, const size_t ny, const size_t nz, 118 | const float ox, const float oy, const float oz 119 | ) { 120 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 121 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 122 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 123 | 124 | // skip out-of-bound threads 125 | if (ix >= nx or iy >= ny or iz >= nz) { 126 | return; 127 | } 128 | 129 | // normalized coordinate 130 | float fx = (ix+ox+0.5f) / nx; 131 | float fy = (iy+oy+0.5f) / ny; 132 | float fz = 
(iz+oz+0.5f) / nz; 133 | 134 | // sampling from the texture 135 | // (coordinates are backtracked to the deviated ones) 136 | int idx = iz * (nx*ny) + iy * nx + ix; 137 | odata[idx] = tex3D(psfTexRef, fx, fy, fz); 138 | } 139 | 140 | __global__ 141 | void fftshift3_kernel( 142 | float *odata, 143 | const float *idata, 144 | const size_t nx, const size_t ny, const size_t nz 145 | ) { 146 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 147 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 148 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 149 | 150 | // skip out-of-bound threads 151 | if (ix >= nx or iy >= ny or iz >= nz) { 152 | return; 153 | } 154 | 155 | // input linear index 156 | const int iidx = iz * (nx*ny) + iy * nx + ix; 157 | 158 | // calculate wrap-around (x, y, z) 159 | // ... advance half cycle 160 | ix += nx/2, iy += ny/2, iz += nz/2; 161 | // ... wrap-around 162 | ix %= nx, iy %= ny, iz %= nz; 163 | // output linear index 164 | const int oidx = iz * (nx*ny) + iy * nx + ix; 165 | 166 | odata[oidx] = idata[iidx]; 167 | } 168 | 169 | } 170 | 171 | PSF::PSF( 172 | float *h_psf, 173 | const size_t npx_, const size_t npy_, const size_t npz_ 174 | ) : npx(npx_), npy(npy_), npz(npz_) { 175 | nelem = npx * npy * npz; 176 | 177 | // create PSF memory space on device 178 | const size_t size = nelem * sizeof(float); 179 | cudaErrChk(cudaMalloc(&d_psf, size)); 180 | // copy from host to device 181 | cudaErrChk(cudaMemcpy(d_psf, h_psf, size, cudaMemcpyHostToDevice)); 182 | } 183 | 184 | PSF::~PSF() { 185 | cudaErrChk(cudaFree(d_psf)); 186 | } 187 | 188 | void PSF::alignCenter(const size_t nx, const size_t ny, const size_t nz) { 189 | float3 centroid = findCentroid(); 190 | fprintf( 191 | stderr, 192 | "[INF] centroid = (%.2f, %.2f, %.2f)\n", 193 | centroid.x, centroid.y, centroid.z 194 | ); 195 | 196 | // simple clone is performed, so we can reuse the centroid location 197 | if (nx != npx or ny != npy or nz != npz) { 198 | padPSF(nx, ny, nz); 199 | } 200 | 201 | /* 202 | * Bind the data source to the texture. 203 | */ 204 | // create cudaArray for the texture. 205 | cudaChannelFormatDesc desc = cudaCreateChannelDesc( 206 | 32, 0, 0, 0, cudaChannelFormatKindFloat 207 | ); 208 | cudaExtent extent = make_cudaExtent(npx, npy, npz); 209 | cudaErrChk(cudaMalloc3DArray( 210 | &psfRes, 211 | &desc, // pixel channel description 212 | extent, // array dimension 213 | cudaArrayDefault 214 | )); 215 | 216 | // copy the data to cudaArray_t 217 | cudaMemcpy3DParms parms = {0}; 218 | parms.srcPtr = make_cudaPitchedPtr(d_psf, npx * sizeof(float), npx, npy); 219 | parms.dstArray = psfRes; 220 | parms.extent = extent; 221 | parms.kind = cudaMemcpyDeviceToDevice; 222 | cudaErrChk(cudaMemcpy3D(&parms)); 223 | 224 | // reconfigure the texture 225 | psfTexRef.normalized = true; 226 | // sampled data is interpolated 227 | psfTexRef.filterMode = cudaFilterModeLinear; 228 | // wrap around the texture if exceeds border limit 229 | psfTexRef.addressMode[0] = cudaAddressModeWrap; 230 | psfTexRef.addressMode[1] = cudaAddressModeWrap; 231 | psfTexRef.addressMode[2] = cudaAddressModeWrap; 232 | 233 | // start the binding 234 | cudaErrChk(cudaBindTextureToArray(psfTexRef, psfRes)); 235 | 236 | /* 237 | * Execute the alignment kernel. 
238 | */ 239 | // coordinate of the center of the volume 240 | const float3 center = make_float3( 241 | (npx-1)/2.0f, (npy-1)/2.0f, (npz-1)/2.0f 242 | ); 243 | // offset 244 | const float3 offset = centroid - center; 245 | fprintf(stderr, "[DBG] offset = (%.2f, %.2f, %.2f)\n", offset.x, offset.y, offset.z); 246 | 247 | // begin resample the kernel 248 | dim3 nthreads(16, 16, 4); 249 | dim3 nblocks( 250 | DIVUP(npx, nthreads.x), DIVUP(npy, nthreads.y), DIVUP(npz, nthreads.z) 251 | ); 252 | alignCenter_kernel<<>>( 253 | d_psf, 254 | npx, npy, npz, 255 | offset.x, offset.y, offset.z 256 | ); 257 | cudaErrChk(cudaPeekAtLastError()); 258 | 259 | /* 260 | * Release the resources. 261 | */ 262 | cudaErrChk(cudaUnbindTexture(psfTexRef)); 263 | cudaErrChk(cudaFreeArray(psfRes)); 264 | 265 | DumpData::Device::real("psf_aligned.tif", d_psf, npx, npy, npz); 266 | } 267 | 268 | void PSF::createOTF(cufftComplex *d_otf) { 269 | /* 270 | * Prepare FFT environment. 271 | */ 272 | cufftHandle otfHdl; 273 | cudaErrChk(cufftPlan3d(&otfHdl, npz, npy, npx, CUFFT_R2C)); 274 | // estimate resource requirements 275 | size_t size; 276 | cudaErrChk(cufftGetSize3d(otfHdl, npz, npy, npx, CUFFT_R2C, &size)); 277 | fprintf(stderr, "[DBG] require %ld bytes to generate an OTF\n", size); 278 | 279 | /* 280 | * Execute the conversion. 281 | */ 282 | // temporary memory space to hold the shifted PSF 283 | float *d_tmp; 284 | cudaErrChk(cudaMalloc(&d_tmp, nelem * sizeof(float))); 285 | cudaErrChk(cudaMemcpy(d_tmp, d_psf, nelem * sizeof(float), cudaMemcpyDeviceToDevice)); 286 | 287 | // fftshift 288 | dim3 nthreads(16, 16, 4); 289 | dim3 nblocks( 290 | DIVUP(npx, nthreads.x), DIVUP(npy, nthreads.y), DIVUP(npz, nthreads.z) 291 | ); 292 | fftshift3_kernel<<>>(d_tmp, d_psf, npx, npy, npz); 293 | 294 | cudaErrChk(cufftExecR2C(otfHdl, d_tmp, d_otf)); 295 | 296 | // release FFT resource 297 | cudaErrChk(cudaFree(d_tmp)); 298 | cudaErrChk(cufftDestroy(otfHdl)); 299 | 300 | DumpData::Device::complex("otf_dump.tif", d_otf, npx/2+1, npy, npz); 301 | } 302 | 303 | // center the PSF to its potential centroid 304 | float3 PSF::findCentroid() { 305 | /* 306 | * Create temporary PSF to find the centroid. 307 | */ 308 | float *d_tmp; 309 | const size_t size = nelem * sizeof(float); 310 | cudaErrChk(cudaMalloc(&d_tmp, size)); 311 | // copy the raw PSF to temporary PSF 312 | cudaErrChk(cudaMemcpy(d_tmp, d_psf, size, cudaMemcpyDeviceToDevice)); 313 | 314 | // background value is clamped in [0, +inf) 315 | const float bkgVal = estimateBackground(); 316 | fprintf(stderr, "[INF] PSF background value is %.2f\n", bkgVal); 317 | thrust::transform( 318 | thrust::device, 319 | d_tmp, d_tmp+nelem, 320 | d_tmp, 321 | SubConstant(bkgVal) 322 | ); 323 | 324 | /* 325 | * Generate 3-D grid for weighting. 326 | */ 327 | int3 *d_grid; 328 | cudaErrChk(cudaMalloc(&d_grid, nelem * sizeof(int3))); 329 | dim3 nthreads(16, 16, 4); 330 | dim3 nblocks( 331 | DIVUP(npx, nthreads.x), DIVUP(npy, nthreads.y), DIVUP(npz, nthreads.z) 332 | ); 333 | createGrid_kernel<<>>(d_grid, npx, npy, npz); 334 | cudaErrChk(cudaPeekAtLastError()); 335 | 336 | /* 337 | * Calculate the centroid along weighted grid points using cleaned PSF. 
338 | */ 339 | float4 result = thrust::transform_reduce( 340 | thrust::device, 341 | d_grid, d_grid+nelem, 342 | MultiplyWeighting(d_tmp, npx, npy, npz), 343 | make_float4(0), 344 | thrust::plus() 345 | ); 346 | 347 | float3 centroid = make_float3( 348 | result.x/result.w, result.y/result.w, result.z/result.w 349 | ); 350 | 351 | // free the weight computation resources 352 | cudaErrChk(cudaFree(d_grid)); 353 | cudaErrChk(cudaFree(d_tmp)); 354 | 355 | return centroid; 356 | } 357 | 358 | float PSF::estimateBackground() { 359 | float sum = thrust::reduce( 360 | thrust::device, 361 | d_psf, d_psf+nelem, 362 | 0, 363 | thrust::plus() 364 | ); 365 | return sum/nelem; 366 | } 367 | 368 | void PSF::padPSF(const size_t nx, const size_t ny, const size_t nz) { 369 | // verify the size requirement 370 | if (nx < npx or ny < npy or nz < npz) { 371 | throw std::range_error( 372 | "volume has to be greater or equal than the original PSF" 373 | ); 374 | } 375 | 376 | // create new PSF memory space with the larger size 377 | float *d_tmp; 378 | cudaErrChk(cudaMalloc( 379 | &d_tmp, 380 | nx * ny * nz * sizeof(float) 381 | )); 382 | 383 | // copy the original PSF to the larger space at (0, 0) 384 | dim3 nthreads(16, 16, 4); 385 | dim3 nblocks( 386 | DIVUP(nx, nthreads.x), DIVUP(ny, nthreads.y), DIVUP(nz, nthreads.z) 387 | ); 388 | simpleClone_kenel<<>>( 389 | d_tmp, 390 | nx, ny, nz, 391 | d_psf, 392 | npx, npy, npz 393 | ); 394 | // swap the pointer and free the smaller space 395 | std::swap(d_tmp, d_psf); 396 | cudaErrChk(cudaFree(d_tmp)); 397 | 398 | // update PSF size 399 | npx = nx; 400 | npy = ny; 401 | npz = nz; 402 | nelem = npx * npy * npz; 403 | fprintf(stderr, "[DBG] update PSF size is %ldx%ldx%ld\n", npx, npy, npz); 404 | } 405 | 406 | } 407 | 408 | } 409 | -------------------------------------------------------------------------------- /src/HelperMath.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | * This file implements common mathematical operations on vector types 14 | * (float3, float4 etc.) since these are not provided as standard by CUDA. 15 | * 16 | * The syntax is modeled on the Cg standard library. 17 | * 18 | * This is part of the Helper library includes 19 | * 20 | * Thanks to Linh Hah for additions and fixes. 21 | */ 22 | 23 | #ifndef HELPER_MATH_H 24 | #define HELPER_MATH_H 25 | 26 | #include 27 | 28 | typedef unsigned int uint; 29 | typedef unsigned short ushort; 30 | 31 | #ifndef __CUDACC__ 32 | #include 33 | 34 | //////////////////////////////////////////////////////////////////////////////// 35 | // host implementations of CUDA functions 36 | //////////////////////////////////////////////////////////////////////////////// 37 | 38 | inline float fminf(float a, float b) 39 | { 40 | return a < b ? a : b; 41 | } 42 | 43 | inline float fmaxf(float a, float b) 44 | { 45 | return a > b ? a : b; 46 | } 47 | 48 | inline int max(int a, int b) 49 | { 50 | return a > b ? a : b; 51 | } 52 | 53 | inline int min(int a, int b) 54 | { 55 | return a < b ? 
a : b; 56 | } 57 | 58 | inline float rsqrtf(float x) 59 | { 60 | return 1.0f / sqrtf(x); 61 | } 62 | #endif 63 | 64 | //////////////////////////////////////////////////////////////////////////////// 65 | // constructors 66 | //////////////////////////////////////////////////////////////////////////////// 67 | 68 | inline __host__ __device__ float2 make_float2(float s) 69 | { 70 | return make_float2(s, s); 71 | } 72 | inline __host__ __device__ float2 make_float2(float3 a) 73 | { 74 | return make_float2(a.x, a.y); 75 | } 76 | inline __host__ __device__ float2 make_float2(int2 a) 77 | { 78 | return make_float2(float(a.x), float(a.y)); 79 | } 80 | inline __host__ __device__ float2 make_float2(uint2 a) 81 | { 82 | return make_float2(float(a.x), float(a.y)); 83 | } 84 | 85 | inline __host__ __device__ int2 make_int2(int s) 86 | { 87 | return make_int2(s, s); 88 | } 89 | inline __host__ __device__ int2 make_int2(int3 a) 90 | { 91 | return make_int2(a.x, a.y); 92 | } 93 | inline __host__ __device__ int2 make_int2(uint2 a) 94 | { 95 | return make_int2(int(a.x), int(a.y)); 96 | } 97 | inline __host__ __device__ int2 make_int2(float2 a) 98 | { 99 | return make_int2(int(a.x), int(a.y)); 100 | } 101 | 102 | inline __host__ __device__ uint2 make_uint2(uint s) 103 | { 104 | return make_uint2(s, s); 105 | } 106 | inline __host__ __device__ uint2 make_uint2(uint3 a) 107 | { 108 | return make_uint2(a.x, a.y); 109 | } 110 | inline __host__ __device__ uint2 make_uint2(int2 a) 111 | { 112 | return make_uint2(uint(a.x), uint(a.y)); 113 | } 114 | 115 | inline __host__ __device__ float3 make_float3(float s) 116 | { 117 | return make_float3(s, s, s); 118 | } 119 | inline __host__ __device__ float3 make_float3(float2 a) 120 | { 121 | return make_float3(a.x, a.y, 0.0f); 122 | } 123 | inline __host__ __device__ float3 make_float3(float2 a, float s) 124 | { 125 | return make_float3(a.x, a.y, s); 126 | } 127 | inline __host__ __device__ float3 make_float3(float4 a) 128 | { 129 | return make_float3(a.x, a.y, a.z); 130 | } 131 | inline __host__ __device__ float3 make_float3(int3 a) 132 | { 133 | return make_float3(float(a.x), float(a.y), float(a.z)); 134 | } 135 | inline __host__ __device__ float3 make_float3(uint3 a) 136 | { 137 | return make_float3(float(a.x), float(a.y), float(a.z)); 138 | } 139 | 140 | inline __host__ __device__ int3 make_int3(int s) 141 | { 142 | return make_int3(s, s, s); 143 | } 144 | inline __host__ __device__ int3 make_int3(int2 a) 145 | { 146 | return make_int3(a.x, a.y, 0); 147 | } 148 | inline __host__ __device__ int3 make_int3(int2 a, int s) 149 | { 150 | return make_int3(a.x, a.y, s); 151 | } 152 | inline __host__ __device__ int3 make_int3(uint3 a) 153 | { 154 | return make_int3(int(a.x), int(a.y), int(a.z)); 155 | } 156 | inline __host__ __device__ int3 make_int3(float3 a) 157 | { 158 | return make_int3(int(a.x), int(a.y), int(a.z)); 159 | } 160 | 161 | inline __host__ __device__ uint3 make_uint3(uint s) 162 | { 163 | return make_uint3(s, s, s); 164 | } 165 | inline __host__ __device__ uint3 make_uint3(uint2 a) 166 | { 167 | return make_uint3(a.x, a.y, 0); 168 | } 169 | inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) 170 | { 171 | return make_uint3(a.x, a.y, s); 172 | } 173 | inline __host__ __device__ uint3 make_uint3(uint4 a) 174 | { 175 | return make_uint3(a.x, a.y, a.z); 176 | } 177 | inline __host__ __device__ uint3 make_uint3(int3 a) 178 | { 179 | return make_uint3(uint(a.x), uint(a.y), uint(a.z)); 180 | } 181 | 182 | inline __host__ __device__ float4 make_float4(float 
s) 183 | { 184 | return make_float4(s, s, s, s); 185 | } 186 | inline __host__ __device__ float4 make_float4(float3 a) 187 | { 188 | return make_float4(a.x, a.y, a.z, 0.0f); 189 | } 190 | inline __host__ __device__ float4 make_float4(float3 a, float w) 191 | { 192 | return make_float4(a.x, a.y, a.z, w); 193 | } 194 | inline __host__ __device__ float4 make_float4(int4 a) 195 | { 196 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 197 | } 198 | inline __host__ __device__ float4 make_float4(uint4 a) 199 | { 200 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 201 | } 202 | 203 | inline __host__ __device__ int4 make_int4(int s) 204 | { 205 | return make_int4(s, s, s, s); 206 | } 207 | inline __host__ __device__ int4 make_int4(int3 a) 208 | { 209 | return make_int4(a.x, a.y, a.z, 0); 210 | } 211 | inline __host__ __device__ int4 make_int4(int3 a, int w) 212 | { 213 | return make_int4(a.x, a.y, a.z, w); 214 | } 215 | inline __host__ __device__ int4 make_int4(uint4 a) 216 | { 217 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 218 | } 219 | inline __host__ __device__ int4 make_int4(float4 a) 220 | { 221 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 222 | } 223 | 224 | 225 | inline __host__ __device__ uint4 make_uint4(uint s) 226 | { 227 | return make_uint4(s, s, s, s); 228 | } 229 | inline __host__ __device__ uint4 make_uint4(uint3 a) 230 | { 231 | return make_uint4(a.x, a.y, a.z, 0); 232 | } 233 | inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) 234 | { 235 | return make_uint4(a.x, a.y, a.z, w); 236 | } 237 | inline __host__ __device__ uint4 make_uint4(int4 a) 238 | { 239 | return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); 240 | } 241 | 242 | //////////////////////////////////////////////////////////////////////////////// 243 | // negate 244 | //////////////////////////////////////////////////////////////////////////////// 245 | 246 | inline __host__ __device__ float2 operator-(float2 &a) 247 | { 248 | return make_float2(-a.x, -a.y); 249 | } 250 | inline __host__ __device__ int2 operator-(int2 &a) 251 | { 252 | return make_int2(-a.x, -a.y); 253 | } 254 | inline __host__ __device__ float3 operator-(float3 &a) 255 | { 256 | return make_float3(-a.x, -a.y, -a.z); 257 | } 258 | inline __host__ __device__ int3 operator-(int3 &a) 259 | { 260 | return make_int3(-a.x, -a.y, -a.z); 261 | } 262 | inline __host__ __device__ float4 operator-(float4 &a) 263 | { 264 | return make_float4(-a.x, -a.y, -a.z, -a.w); 265 | } 266 | inline __host__ __device__ int4 operator-(int4 &a) 267 | { 268 | return make_int4(-a.x, -a.y, -a.z, -a.w); 269 | } 270 | 271 | //////////////////////////////////////////////////////////////////////////////// 272 | // addition 273 | //////////////////////////////////////////////////////////////////////////////// 274 | 275 | inline __host__ __device__ float2 operator+(float2 a, float2 b) 276 | { 277 | return make_float2(a.x + b.x, a.y + b.y); 278 | } 279 | inline __host__ __device__ void operator+=(float2 &a, float2 b) 280 | { 281 | a.x += b.x; 282 | a.y += b.y; 283 | } 284 | inline __host__ __device__ float2 operator+(float2 a, float b) 285 | { 286 | return make_float2(a.x + b, a.y + b); 287 | } 288 | inline __host__ __device__ float2 operator+(float b, float2 a) 289 | { 290 | return make_float2(a.x + b, a.y + b); 291 | } 292 | inline __host__ __device__ void operator+=(float2 &a, float b) 293 | { 294 | a.x += b; 295 | a.y += b; 296 | } 297 | 298 | inline __host__ __device__ int2 operator+(int2 a, 
int2 b) 299 | { 300 | return make_int2(a.x + b.x, a.y + b.y); 301 | } 302 | inline __host__ __device__ void operator+=(int2 &a, int2 b) 303 | { 304 | a.x += b.x; 305 | a.y += b.y; 306 | } 307 | inline __host__ __device__ int2 operator+(int2 a, int b) 308 | { 309 | return make_int2(a.x + b, a.y + b); 310 | } 311 | inline __host__ __device__ int2 operator+(int b, int2 a) 312 | { 313 | return make_int2(a.x + b, a.y + b); 314 | } 315 | inline __host__ __device__ void operator+=(int2 &a, int b) 316 | { 317 | a.x += b; 318 | a.y += b; 319 | } 320 | 321 | inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) 322 | { 323 | return make_uint2(a.x + b.x, a.y + b.y); 324 | } 325 | inline __host__ __device__ void operator+=(uint2 &a, uint2 b) 326 | { 327 | a.x += b.x; 328 | a.y += b.y; 329 | } 330 | inline __host__ __device__ uint2 operator+(uint2 a, uint b) 331 | { 332 | return make_uint2(a.x + b, a.y + b); 333 | } 334 | inline __host__ __device__ uint2 operator+(uint b, uint2 a) 335 | { 336 | return make_uint2(a.x + b, a.y + b); 337 | } 338 | inline __host__ __device__ void operator+=(uint2 &a, uint b) 339 | { 340 | a.x += b; 341 | a.y += b; 342 | } 343 | 344 | 345 | inline __host__ __device__ float3 operator+(float3 a, float3 b) 346 | { 347 | return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); 348 | } 349 | inline __host__ __device__ void operator+=(float3 &a, float3 b) 350 | { 351 | a.x += b.x; 352 | a.y += b.y; 353 | a.z += b.z; 354 | } 355 | inline __host__ __device__ float3 operator+(float3 a, float b) 356 | { 357 | return make_float3(a.x + b, a.y + b, a.z + b); 358 | } 359 | inline __host__ __device__ void operator+=(float3 &a, float b) 360 | { 361 | a.x += b; 362 | a.y += b; 363 | a.z += b; 364 | } 365 | 366 | inline __host__ __device__ int3 operator+(int3 a, int3 b) 367 | { 368 | return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); 369 | } 370 | inline __host__ __device__ void operator+=(int3 &a, int3 b) 371 | { 372 | a.x += b.x; 373 | a.y += b.y; 374 | a.z += b.z; 375 | } 376 | inline __host__ __device__ int3 operator+(int3 a, int b) 377 | { 378 | return make_int3(a.x + b, a.y + b, a.z + b); 379 | } 380 | inline __host__ __device__ void operator+=(int3 &a, int b) 381 | { 382 | a.x += b; 383 | a.y += b; 384 | a.z += b; 385 | } 386 | 387 | inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) 388 | { 389 | return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); 390 | } 391 | inline __host__ __device__ void operator+=(uint3 &a, uint3 b) 392 | { 393 | a.x += b.x; 394 | a.y += b.y; 395 | a.z += b.z; 396 | } 397 | inline __host__ __device__ uint3 operator+(uint3 a, uint b) 398 | { 399 | return make_uint3(a.x + b, a.y + b, a.z + b); 400 | } 401 | inline __host__ __device__ void operator+=(uint3 &a, uint b) 402 | { 403 | a.x += b; 404 | a.y += b; 405 | a.z += b; 406 | } 407 | 408 | inline __host__ __device__ int3 operator+(int b, int3 a) 409 | { 410 | return make_int3(a.x + b, a.y + b, a.z + b); 411 | } 412 | inline __host__ __device__ uint3 operator+(uint b, uint3 a) 413 | { 414 | return make_uint3(a.x + b, a.y + b, a.z + b); 415 | } 416 | inline __host__ __device__ float3 operator+(float b, float3 a) 417 | { 418 | return make_float3(a.x + b, a.y + b, a.z + b); 419 | } 420 | 421 | inline __host__ __device__ float4 operator+(float4 a, float4 b) 422 | { 423 | return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 424 | } 425 | inline __host__ __device__ void operator+=(float4 &a, float4 b) 426 | { 427 | a.x += b.x; 428 | a.y += b.y; 429 | a.z += b.z; 430 | a.w += b.w; 431 | } 432 | 
inline __host__ __device__ float4 operator+(float4 a, float b) 433 | { 434 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 435 | } 436 | inline __host__ __device__ float4 operator+(float b, float4 a) 437 | { 438 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 439 | } 440 | inline __host__ __device__ void operator+=(float4 &a, float b) 441 | { 442 | a.x += b; 443 | a.y += b; 444 | a.z += b; 445 | a.w += b; 446 | } 447 | 448 | inline __host__ __device__ int4 operator+(int4 a, int4 b) 449 | { 450 | return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 451 | } 452 | inline __host__ __device__ void operator+=(int4 &a, int4 b) 453 | { 454 | a.x += b.x; 455 | a.y += b.y; 456 | a.z += b.z; 457 | a.w += b.w; 458 | } 459 | inline __host__ __device__ int4 operator+(int4 a, int b) 460 | { 461 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 462 | } 463 | inline __host__ __device__ int4 operator+(int b, int4 a) 464 | { 465 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 466 | } 467 | inline __host__ __device__ void operator+=(int4 &a, int b) 468 | { 469 | a.x += b; 470 | a.y += b; 471 | a.z += b; 472 | a.w += b; 473 | } 474 | 475 | inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) 476 | { 477 | return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 478 | } 479 | inline __host__ __device__ void operator+=(uint4 &a, uint4 b) 480 | { 481 | a.x += b.x; 482 | a.y += b.y; 483 | a.z += b.z; 484 | a.w += b.w; 485 | } 486 | inline __host__ __device__ uint4 operator+(uint4 a, uint b) 487 | { 488 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 489 | } 490 | inline __host__ __device__ uint4 operator+(uint b, uint4 a) 491 | { 492 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 493 | } 494 | inline __host__ __device__ void operator+=(uint4 &a, uint b) 495 | { 496 | a.x += b; 497 | a.y += b; 498 | a.z += b; 499 | a.w += b; 500 | } 501 | 502 | //////////////////////////////////////////////////////////////////////////////// 503 | // subtract 504 | //////////////////////////////////////////////////////////////////////////////// 505 | 506 | inline __host__ __device__ float2 operator-(float2 a, float2 b) 507 | { 508 | return make_float2(a.x - b.x, a.y - b.y); 509 | } 510 | inline __host__ __device__ void operator-=(float2 &a, float2 b) 511 | { 512 | a.x -= b.x; 513 | a.y -= b.y; 514 | } 515 | inline __host__ __device__ float2 operator-(float2 a, float b) 516 | { 517 | return make_float2(a.x - b, a.y - b); 518 | } 519 | inline __host__ __device__ float2 operator-(float b, float2 a) 520 | { 521 | return make_float2(b - a.x, b - a.y); 522 | } 523 | inline __host__ __device__ void operator-=(float2 &a, float b) 524 | { 525 | a.x -= b; 526 | a.y -= b; 527 | } 528 | 529 | inline __host__ __device__ int2 operator-(int2 a, int2 b) 530 | { 531 | return make_int2(a.x - b.x, a.y - b.y); 532 | } 533 | inline __host__ __device__ void operator-=(int2 &a, int2 b) 534 | { 535 | a.x -= b.x; 536 | a.y -= b.y; 537 | } 538 | inline __host__ __device__ int2 operator-(int2 a, int b) 539 | { 540 | return make_int2(a.x - b, a.y - b); 541 | } 542 | inline __host__ __device__ int2 operator-(int b, int2 a) 543 | { 544 | return make_int2(b - a.x, b - a.y); 545 | } 546 | inline __host__ __device__ void operator-=(int2 &a, int b) 547 | { 548 | a.x -= b; 549 | a.y -= b; 550 | } 551 | 552 | inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) 553 | { 554 | return make_uint2(a.x - b.x, a.y - b.y); 555 | } 556 | inline __host__ __device__ void operator-=(uint2 &a, 
uint2 b) 557 | { 558 | a.x -= b.x; 559 | a.y -= b.y; 560 | } 561 | inline __host__ __device__ uint2 operator-(uint2 a, uint b) 562 | { 563 | return make_uint2(a.x - b, a.y - b); 564 | } 565 | inline __host__ __device__ uint2 operator-(uint b, uint2 a) 566 | { 567 | return make_uint2(b - a.x, b - a.y); 568 | } 569 | inline __host__ __device__ void operator-=(uint2 &a, uint b) 570 | { 571 | a.x -= b; 572 | a.y -= b; 573 | } 574 | 575 | inline __host__ __device__ float3 operator-(float3 a, float3 b) 576 | { 577 | return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); 578 | } 579 | inline __host__ __device__ void operator-=(float3 &a, float3 b) 580 | { 581 | a.x -= b.x; 582 | a.y -= b.y; 583 | a.z -= b.z; 584 | } 585 | inline __host__ __device__ float3 operator-(float3 a, float b) 586 | { 587 | return make_float3(a.x - b, a.y - b, a.z - b); 588 | } 589 | inline __host__ __device__ float3 operator-(float b, float3 a) 590 | { 591 | return make_float3(b - a.x, b - a.y, b - a.z); 592 | } 593 | inline __host__ __device__ void operator-=(float3 &a, float b) 594 | { 595 | a.x -= b; 596 | a.y -= b; 597 | a.z -= b; 598 | } 599 | 600 | inline __host__ __device__ int3 operator-(int3 a, int3 b) 601 | { 602 | return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); 603 | } 604 | inline __host__ __device__ void operator-=(int3 &a, int3 b) 605 | { 606 | a.x -= b.x; 607 | a.y -= b.y; 608 | a.z -= b.z; 609 | } 610 | inline __host__ __device__ int3 operator-(int3 a, int b) 611 | { 612 | return make_int3(a.x - b, a.y - b, a.z - b); 613 | } 614 | inline __host__ __device__ int3 operator-(int b, int3 a) 615 | { 616 | return make_int3(b - a.x, b - a.y, b - a.z); 617 | } 618 | inline __host__ __device__ void operator-=(int3 &a, int b) 619 | { 620 | a.x -= b; 621 | a.y -= b; 622 | a.z -= b; 623 | } 624 | 625 | inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) 626 | { 627 | return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); 628 | } 629 | inline __host__ __device__ void operator-=(uint3 &a, uint3 b) 630 | { 631 | a.x -= b.x; 632 | a.y -= b.y; 633 | a.z -= b.z; 634 | } 635 | inline __host__ __device__ uint3 operator-(uint3 a, uint b) 636 | { 637 | return make_uint3(a.x - b, a.y - b, a.z - b); 638 | } 639 | inline __host__ __device__ uint3 operator-(uint b, uint3 a) 640 | { 641 | return make_uint3(b - a.x, b - a.y, b - a.z); 642 | } 643 | inline __host__ __device__ void operator-=(uint3 &a, uint b) 644 | { 645 | a.x -= b; 646 | a.y -= b; 647 | a.z -= b; 648 | } 649 | 650 | inline __host__ __device__ float4 operator-(float4 a, float4 b) 651 | { 652 | return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 653 | } 654 | inline __host__ __device__ void operator-=(float4 &a, float4 b) 655 | { 656 | a.x -= b.x; 657 | a.y -= b.y; 658 | a.z -= b.z; 659 | a.w -= b.w; 660 | } 661 | inline __host__ __device__ float4 operator-(float4 a, float b) 662 | { 663 | return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); 664 | } 665 | inline __host__ __device__ void operator-=(float4 &a, float b) 666 | { 667 | a.x -= b; 668 | a.y -= b; 669 | a.z -= b; 670 | a.w -= b; 671 | } 672 | 673 | inline __host__ __device__ int4 operator-(int4 a, int4 b) 674 | { 675 | return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 676 | } 677 | inline __host__ __device__ void operator-=(int4 &a, int4 b) 678 | { 679 | a.x -= b.x; 680 | a.y -= b.y; 681 | a.z -= b.z; 682 | a.w -= b.w; 683 | } 684 | inline __host__ __device__ int4 operator-(int4 a, int b) 685 | { 686 | return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); 687 | } 688 | inline 
__host__ __device__ int4 operator-(int b, int4 a) 689 | { 690 | return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); 691 | } 692 | inline __host__ __device__ void operator-=(int4 &a, int b) 693 | { 694 | a.x -= b; 695 | a.y -= b; 696 | a.z -= b; 697 | a.w -= b; 698 | } 699 | 700 | inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) 701 | { 702 | return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 703 | } 704 | inline __host__ __device__ void operator-=(uint4 &a, uint4 b) 705 | { 706 | a.x -= b.x; 707 | a.y -= b.y; 708 | a.z -= b.z; 709 | a.w -= b.w; 710 | } 711 | inline __host__ __device__ uint4 operator-(uint4 a, uint b) 712 | { 713 | return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); 714 | } 715 | inline __host__ __device__ uint4 operator-(uint b, uint4 a) 716 | { 717 | return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); 718 | } 719 | inline __host__ __device__ void operator-=(uint4 &a, uint b) 720 | { 721 | a.x -= b; 722 | a.y -= b; 723 | a.z -= b; 724 | a.w -= b; 725 | } 726 | 727 | //////////////////////////////////////////////////////////////////////////////// 728 | // multiply 729 | //////////////////////////////////////////////////////////////////////////////// 730 | 731 | inline __host__ __device__ float2 operator*(float2 a, float2 b) 732 | { 733 | return make_float2(a.x * b.x, a.y * b.y); 734 | } 735 | inline __host__ __device__ void operator*=(float2 &a, float2 b) 736 | { 737 | a.x *= b.x; 738 | a.y *= b.y; 739 | } 740 | inline __host__ __device__ float2 operator*(float2 a, float b) 741 | { 742 | return make_float2(a.x * b, a.y * b); 743 | } 744 | inline __host__ __device__ float2 operator*(float b, float2 a) 745 | { 746 | return make_float2(b * a.x, b * a.y); 747 | } 748 | inline __host__ __device__ void operator*=(float2 &a, float b) 749 | { 750 | a.x *= b; 751 | a.y *= b; 752 | } 753 | 754 | inline __host__ __device__ int2 operator*(int2 a, int2 b) 755 | { 756 | return make_int2(a.x * b.x, a.y * b.y); 757 | } 758 | inline __host__ __device__ void operator*=(int2 &a, int2 b) 759 | { 760 | a.x *= b.x; 761 | a.y *= b.y; 762 | } 763 | inline __host__ __device__ int2 operator*(int2 a, int b) 764 | { 765 | return make_int2(a.x * b, a.y * b); 766 | } 767 | inline __host__ __device__ int2 operator*(int b, int2 a) 768 | { 769 | return make_int2(b * a.x, b * a.y); 770 | } 771 | inline __host__ __device__ void operator*=(int2 &a, int b) 772 | { 773 | a.x *= b; 774 | a.y *= b; 775 | } 776 | 777 | inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) 778 | { 779 | return make_uint2(a.x * b.x, a.y * b.y); 780 | } 781 | inline __host__ __device__ void operator*=(uint2 &a, uint2 b) 782 | { 783 | a.x *= b.x; 784 | a.y *= b.y; 785 | } 786 | inline __host__ __device__ uint2 operator*(uint2 a, uint b) 787 | { 788 | return make_uint2(a.x * b, a.y * b); 789 | } 790 | inline __host__ __device__ uint2 operator*(uint b, uint2 a) 791 | { 792 | return make_uint2(b * a.x, b * a.y); 793 | } 794 | inline __host__ __device__ void operator*=(uint2 &a, uint b) 795 | { 796 | a.x *= b; 797 | a.y *= b; 798 | } 799 | 800 | inline __host__ __device__ float3 operator*(float3 a, float3 b) 801 | { 802 | return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); 803 | } 804 | inline __host__ __device__ void operator*=(float3 &a, float3 b) 805 | { 806 | a.x *= b.x; 807 | a.y *= b.y; 808 | a.z *= b.z; 809 | } 810 | inline __host__ __device__ float3 operator*(float3 a, float b) 811 | { 812 | return make_float3(a.x * b, a.y * b, a.z * b); 813 | } 814 | inline __host__ __device__ float3 
operator*(float b, float3 a) 815 | { 816 | return make_float3(b * a.x, b * a.y, b * a.z); 817 | } 818 | inline __host__ __device__ void operator*=(float3 &a, float b) 819 | { 820 | a.x *= b; 821 | a.y *= b; 822 | a.z *= b; 823 | } 824 | 825 | inline __host__ __device__ int3 operator*(int3 a, int3 b) 826 | { 827 | return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); 828 | } 829 | inline __host__ __device__ void operator*=(int3 &a, int3 b) 830 | { 831 | a.x *= b.x; 832 | a.y *= b.y; 833 | a.z *= b.z; 834 | } 835 | inline __host__ __device__ int3 operator*(int3 a, int b) 836 | { 837 | return make_int3(a.x * b, a.y * b, a.z * b); 838 | } 839 | inline __host__ __device__ int3 operator*(int b, int3 a) 840 | { 841 | return make_int3(b * a.x, b * a.y, b * a.z); 842 | } 843 | inline __host__ __device__ void operator*=(int3 &a, int b) 844 | { 845 | a.x *= b; 846 | a.y *= b; 847 | a.z *= b; 848 | } 849 | 850 | inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) 851 | { 852 | return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); 853 | } 854 | inline __host__ __device__ void operator*=(uint3 &a, uint3 b) 855 | { 856 | a.x *= b.x; 857 | a.y *= b.y; 858 | a.z *= b.z; 859 | } 860 | inline __host__ __device__ uint3 operator*(uint3 a, uint b) 861 | { 862 | return make_uint3(a.x * b, a.y * b, a.z * b); 863 | } 864 | inline __host__ __device__ uint3 operator*(uint b, uint3 a) 865 | { 866 | return make_uint3(b * a.x, b * a.y, b * a.z); 867 | } 868 | inline __host__ __device__ void operator*=(uint3 &a, uint b) 869 | { 870 | a.x *= b; 871 | a.y *= b; 872 | a.z *= b; 873 | } 874 | 875 | inline __host__ __device__ float4 operator*(float4 a, float4 b) 876 | { 877 | return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 878 | } 879 | inline __host__ __device__ void operator*=(float4 &a, float4 b) 880 | { 881 | a.x *= b.x; 882 | a.y *= b.y; 883 | a.z *= b.z; 884 | a.w *= b.w; 885 | } 886 | inline __host__ __device__ float4 operator*(float4 a, float b) 887 | { 888 | return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); 889 | } 890 | inline __host__ __device__ float4 operator*(float b, float4 a) 891 | { 892 | return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); 893 | } 894 | inline __host__ __device__ void operator*=(float4 &a, float b) 895 | { 896 | a.x *= b; 897 | a.y *= b; 898 | a.z *= b; 899 | a.w *= b; 900 | } 901 | 902 | inline __host__ __device__ int4 operator*(int4 a, int4 b) 903 | { 904 | return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 905 | } 906 | inline __host__ __device__ void operator*=(int4 &a, int4 b) 907 | { 908 | a.x *= b.x; 909 | a.y *= b.y; 910 | a.z *= b.z; 911 | a.w *= b.w; 912 | } 913 | inline __host__ __device__ int4 operator*(int4 a, int b) 914 | { 915 | return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); 916 | } 917 | inline __host__ __device__ int4 operator*(int b, int4 a) 918 | { 919 | return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); 920 | } 921 | inline __host__ __device__ void operator*=(int4 &a, int b) 922 | { 923 | a.x *= b; 924 | a.y *= b; 925 | a.z *= b; 926 | a.w *= b; 927 | } 928 | 929 | inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) 930 | { 931 | return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 932 | } 933 | inline __host__ __device__ void operator*=(uint4 &a, uint4 b) 934 | { 935 | a.x *= b.x; 936 | a.y *= b.y; 937 | a.z *= b.z; 938 | a.w *= b.w; 939 | } 940 | inline __host__ __device__ uint4 operator*(uint4 a, uint b) 941 | { 942 | return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); 943 | } 944 | inline 
__host__ __device__ uint4 operator*(uint b, uint4 a) 945 | { 946 | return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); 947 | } 948 | inline __host__ __device__ void operator*=(uint4 &a, uint b) 949 | { 950 | a.x *= b; 951 | a.y *= b; 952 | a.z *= b; 953 | a.w *= b; 954 | } 955 | 956 | //////////////////////////////////////////////////////////////////////////////// 957 | // divide 958 | //////////////////////////////////////////////////////////////////////////////// 959 | 960 | inline __host__ __device__ float2 operator/(float2 a, float2 b) 961 | { 962 | return make_float2(a.x / b.x, a.y / b.y); 963 | } 964 | inline __host__ __device__ void operator/=(float2 &a, float2 b) 965 | { 966 | a.x /= b.x; 967 | a.y /= b.y; 968 | } 969 | inline __host__ __device__ float2 operator/(float2 a, float b) 970 | { 971 | return make_float2(a.x / b, a.y / b); 972 | } 973 | inline __host__ __device__ void operator/=(float2 &a, float b) 974 | { 975 | a.x /= b; 976 | a.y /= b; 977 | } 978 | inline __host__ __device__ float2 operator/(float b, float2 a) 979 | { 980 | return make_float2(b / a.x, b / a.y); 981 | } 982 | 983 | inline __host__ __device__ float3 operator/(float3 a, float3 b) 984 | { 985 | return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); 986 | } 987 | inline __host__ __device__ void operator/=(float3 &a, float3 b) 988 | { 989 | a.x /= b.x; 990 | a.y /= b.y; 991 | a.z /= b.z; 992 | } 993 | inline __host__ __device__ float3 operator/(float3 a, float b) 994 | { 995 | return make_float3(a.x / b, a.y / b, a.z / b); 996 | } 997 | inline __host__ __device__ void operator/=(float3 &a, float b) 998 | { 999 | a.x /= b; 1000 | a.y /= b; 1001 | a.z /= b; 1002 | } 1003 | inline __host__ __device__ float3 operator/(float b, float3 a) 1004 | { 1005 | return make_float3(b / a.x, b / a.y, b / a.z); 1006 | } 1007 | 1008 | inline __host__ __device__ float4 operator/(float4 a, float4 b) 1009 | { 1010 | return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); 1011 | } 1012 | inline __host__ __device__ void operator/=(float4 &a, float4 b) 1013 | { 1014 | a.x /= b.x; 1015 | a.y /= b.y; 1016 | a.z /= b.z; 1017 | a.w /= b.w; 1018 | } 1019 | inline __host__ __device__ float4 operator/(float4 a, float b) 1020 | { 1021 | return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); 1022 | } 1023 | inline __host__ __device__ void operator/=(float4 &a, float b) 1024 | { 1025 | a.x /= b; 1026 | a.y /= b; 1027 | a.z /= b; 1028 | a.w /= b; 1029 | } 1030 | inline __host__ __device__ float4 operator/(float b, float4 a) 1031 | { 1032 | return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); 1033 | } 1034 | 1035 | //////////////////////////////////////////////////////////////////////////////// 1036 | // min 1037 | //////////////////////////////////////////////////////////////////////////////// 1038 | 1039 | inline __host__ __device__ float2 fminf(float2 a, float2 b) 1040 | { 1041 | return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); 1042 | } 1043 | inline __host__ __device__ float3 fminf(float3 a, float3 b) 1044 | { 1045 | return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); 1046 | } 1047 | inline __host__ __device__ float4 fminf(float4 a, float4 b) 1048 | { 1049 | return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); 1050 | } 1051 | 1052 | inline __host__ __device__ int2 min(int2 a, int2 b) 1053 | { 1054 | return make_int2(min(a.x,b.x), min(a.y,b.y)); 1055 | } 1056 | inline __host__ __device__ int3 min(int3 a, int3 b) 1057 | { 1058 | return make_int3(min(a.x,b.x), min(a.y,b.y), 
min(a.z,b.z)); 1059 | } 1060 | inline __host__ __device__ int4 min(int4 a, int4 b) 1061 | { 1062 | return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); 1063 | } 1064 | 1065 | inline __host__ __device__ uint2 min(uint2 a, uint2 b) 1066 | { 1067 | return make_uint2(min(a.x,b.x), min(a.y,b.y)); 1068 | } 1069 | inline __host__ __device__ uint3 min(uint3 a, uint3 b) 1070 | { 1071 | return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); 1072 | } 1073 | inline __host__ __device__ uint4 min(uint4 a, uint4 b) 1074 | { 1075 | return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); 1076 | } 1077 | 1078 | //////////////////////////////////////////////////////////////////////////////// 1079 | // max 1080 | //////////////////////////////////////////////////////////////////////////////// 1081 | 1082 | inline __host__ __device__ float2 fmaxf(float2 a, float2 b) 1083 | { 1084 | return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); 1085 | } 1086 | inline __host__ __device__ float3 fmaxf(float3 a, float3 b) 1087 | { 1088 | return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); 1089 | } 1090 | inline __host__ __device__ float4 fmaxf(float4 a, float4 b) 1091 | { 1092 | return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); 1093 | } 1094 | 1095 | inline __host__ __device__ int2 max(int2 a, int2 b) 1096 | { 1097 | return make_int2(max(a.x,b.x), max(a.y,b.y)); 1098 | } 1099 | inline __host__ __device__ int3 max(int3 a, int3 b) 1100 | { 1101 | return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); 1102 | } 1103 | inline __host__ __device__ int4 max(int4 a, int4 b) 1104 | { 1105 | return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); 1106 | } 1107 | 1108 | inline __host__ __device__ uint2 max(uint2 a, uint2 b) 1109 | { 1110 | return make_uint2(max(a.x,b.x), max(a.y,b.y)); 1111 | } 1112 | inline __host__ __device__ uint3 max(uint3 a, uint3 b) 1113 | { 1114 | return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); 1115 | } 1116 | inline __host__ __device__ uint4 max(uint4 a, uint4 b) 1117 | { 1118 | return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); 1119 | } 1120 | 1121 | //////////////////////////////////////////////////////////////////////////////// 1122 | // lerp 1123 | // - linear interpolation between a and b, based on value t in [0, 1] range 1124 | //////////////////////////////////////////////////////////////////////////////// 1125 | 1126 | inline __device__ __host__ float lerp(float a, float b, float t) 1127 | { 1128 | return a + t*(b-a); 1129 | } 1130 | inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) 1131 | { 1132 | return a + t*(b-a); 1133 | } 1134 | inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) 1135 | { 1136 | return a + t*(b-a); 1137 | } 1138 | inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) 1139 | { 1140 | return a + t*(b-a); 1141 | } 1142 | 1143 | //////////////////////////////////////////////////////////////////////////////// 1144 | // clamp 1145 | // - clamp the value v to be in the range [a, b] 1146 | //////////////////////////////////////////////////////////////////////////////// 1147 | 1148 | inline __device__ __host__ float clamp(float f, float a, float b) 1149 | { 1150 | return fmaxf(a, fminf(f, b)); 1151 | } 1152 | inline __device__ __host__ int clamp(int f, int a, int b) 1153 | { 1154 | return max(a, min(f, b)); 1155 | } 1156 | inline __device__ __host__ uint clamp(uint f, uint a, uint b) 1157 | 
{ 1158 | return max(a, min(f, b)); 1159 | } 1160 | 1161 | inline __device__ __host__ float2 clamp(float2 v, float a, float b) 1162 | { 1163 | return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); 1164 | } 1165 | inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) 1166 | { 1167 | return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 1168 | } 1169 | inline __device__ __host__ float3 clamp(float3 v, float a, float b) 1170 | { 1171 | return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 1172 | } 1173 | inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) 1174 | { 1175 | return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 1176 | } 1177 | inline __device__ __host__ float4 clamp(float4 v, float a, float b) 1178 | { 1179 | return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 1180 | } 1181 | inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) 1182 | { 1183 | return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); 1184 | } 1185 | 1186 | inline __device__ __host__ int2 clamp(int2 v, int a, int b) 1187 | { 1188 | return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); 1189 | } 1190 | inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) 1191 | { 1192 | return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 1193 | } 1194 | inline __device__ __host__ int3 clamp(int3 v, int a, int b) 1195 | { 1196 | return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 1197 | } 1198 | inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) 1199 | { 1200 | return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 1201 | } 1202 | inline __device__ __host__ int4 clamp(int4 v, int a, int b) 1203 | { 1204 | return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 1205 | } 1206 | inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) 1207 | { 1208 | return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); 1209 | } 1210 | 1211 | inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) 1212 | { 1213 | return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); 1214 | } 1215 | inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) 1216 | { 1217 | return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 1218 | } 1219 | inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) 1220 | { 1221 | return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 1222 | } 1223 | inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) 1224 | { 1225 | return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 1226 | } 1227 | inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) 1228 | { 1229 | return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 1230 | } 1231 | inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) 1232 | { 1233 | return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); 1234 | } 1235 | 1236 | //////////////////////////////////////////////////////////////////////////////// 1237 | // dot product 1238 | //////////////////////////////////////////////////////////////////////////////// 1239 | 1240 | inline __host__ __device__ float dot(float2 a, float2 b) 1241 | { 1242 | return a.x * b.x + a.y * b.y; 
1243 | } 1244 | inline __host__ __device__ float dot(float3 a, float3 b) 1245 | { 1246 | return a.x * b.x + a.y * b.y + a.z * b.z; 1247 | } 1248 | inline __host__ __device__ float dot(float4 a, float4 b) 1249 | { 1250 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 1251 | } 1252 | 1253 | inline __host__ __device__ int dot(int2 a, int2 b) 1254 | { 1255 | return a.x * b.x + a.y * b.y; 1256 | } 1257 | inline __host__ __device__ int dot(int3 a, int3 b) 1258 | { 1259 | return a.x * b.x + a.y * b.y + a.z * b.z; 1260 | } 1261 | inline __host__ __device__ int dot(int4 a, int4 b) 1262 | { 1263 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 1264 | } 1265 | 1266 | inline __host__ __device__ uint dot(uint2 a, uint2 b) 1267 | { 1268 | return a.x * b.x + a.y * b.y; 1269 | } 1270 | inline __host__ __device__ uint dot(uint3 a, uint3 b) 1271 | { 1272 | return a.x * b.x + a.y * b.y + a.z * b.z; 1273 | } 1274 | inline __host__ __device__ uint dot(uint4 a, uint4 b) 1275 | { 1276 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 1277 | } 1278 | 1279 | //////////////////////////////////////////////////////////////////////////////// 1280 | // length 1281 | //////////////////////////////////////////////////////////////////////////////// 1282 | 1283 | inline __host__ __device__ float length(float2 v) 1284 | { 1285 | return sqrtf(dot(v, v)); 1286 | } 1287 | inline __host__ __device__ float length(float3 v) 1288 | { 1289 | return sqrtf(dot(v, v)); 1290 | } 1291 | inline __host__ __device__ float length(float4 v) 1292 | { 1293 | return sqrtf(dot(v, v)); 1294 | } 1295 | 1296 | //////////////////////////////////////////////////////////////////////////////// 1297 | // normalize 1298 | //////////////////////////////////////////////////////////////////////////////// 1299 | 1300 | inline __host__ __device__ float2 normalize(float2 v) 1301 | { 1302 | float invLen = rsqrtf(dot(v, v)); 1303 | return v * invLen; 1304 | } 1305 | inline __host__ __device__ float3 normalize(float3 v) 1306 | { 1307 | float invLen = rsqrtf(dot(v, v)); 1308 | return v * invLen; 1309 | } 1310 | inline __host__ __device__ float4 normalize(float4 v) 1311 | { 1312 | float invLen = rsqrtf(dot(v, v)); 1313 | return v * invLen; 1314 | } 1315 | 1316 | //////////////////////////////////////////////////////////////////////////////// 1317 | // floor 1318 | //////////////////////////////////////////////////////////////////////////////// 1319 | 1320 | inline __host__ __device__ float2 floorf(float2 v) 1321 | { 1322 | return make_float2(floorf(v.x), floorf(v.y)); 1323 | } 1324 | inline __host__ __device__ float3 floorf(float3 v) 1325 | { 1326 | return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); 1327 | } 1328 | inline __host__ __device__ float4 floorf(float4 v) 1329 | { 1330 | return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); 1331 | } 1332 | 1333 | //////////////////////////////////////////////////////////////////////////////// 1334 | // frac - returns the fractional portion of a scalar or each vector component 1335 | //////////////////////////////////////////////////////////////////////////////// 1336 | 1337 | inline __host__ __device__ float fracf(float v) 1338 | { 1339 | return v - floorf(v); 1340 | } 1341 | inline __host__ __device__ float2 fracf(float2 v) 1342 | { 1343 | return make_float2(fracf(v.x), fracf(v.y)); 1344 | } 1345 | inline __host__ __device__ float3 fracf(float3 v) 1346 | { 1347 | return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); 1348 | } 1349 | inline __host__ __device__ float4 
fracf(float4 v) 1350 | { 1351 | return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); 1352 | } 1353 | 1354 | //////////////////////////////////////////////////////////////////////////////// 1355 | // fmod 1356 | //////////////////////////////////////////////////////////////////////////////// 1357 | 1358 | inline __host__ __device__ float2 fmodf(float2 a, float2 b) 1359 | { 1360 | return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); 1361 | } 1362 | inline __host__ __device__ float3 fmodf(float3 a, float3 b) 1363 | { 1364 | return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); 1365 | } 1366 | inline __host__ __device__ float4 fmodf(float4 a, float4 b) 1367 | { 1368 | return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); 1369 | } 1370 | 1371 | //////////////////////////////////////////////////////////////////////////////// 1372 | // absolute value 1373 | //////////////////////////////////////////////////////////////////////////////// 1374 | 1375 | inline __host__ __device__ float2 fabs(float2 v) 1376 | { 1377 | return make_float2(fabs(v.x), fabs(v.y)); 1378 | } 1379 | inline __host__ __device__ float3 fabs(float3 v) 1380 | { 1381 | return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); 1382 | } 1383 | inline __host__ __device__ float4 fabs(float4 v) 1384 | { 1385 | return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); 1386 | } 1387 | 1388 | inline __host__ __device__ int2 abs(int2 v) 1389 | { 1390 | return make_int2(abs(v.x), abs(v.y)); 1391 | } 1392 | inline __host__ __device__ int3 abs(int3 v) 1393 | { 1394 | return make_int3(abs(v.x), abs(v.y), abs(v.z)); 1395 | } 1396 | inline __host__ __device__ int4 abs(int4 v) 1397 | { 1398 | return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); 1399 | } 1400 | 1401 | //////////////////////////////////////////////////////////////////////////////// 1402 | // reflect 1403 | // - returns reflection of incident ray I around surface normal N 1404 | // - N should be normalized, reflected vector's length is equal to length of I 1405 | //////////////////////////////////////////////////////////////////////////////// 1406 | 1407 | inline __host__ __device__ float3 reflect(float3 i, float3 n) 1408 | { 1409 | return i - 2.0f * n * dot(n,i); 1410 | } 1411 | 1412 | //////////////////////////////////////////////////////////////////////////////// 1413 | // cross product 1414 | //////////////////////////////////////////////////////////////////////////////// 1415 | 1416 | inline __host__ __device__ float3 cross(float3 a, float3 b) 1417 | { 1418 | return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 1419 | } 1420 | 1421 | //////////////////////////////////////////////////////////////////////////////// 1422 | // smoothstep 1423 | // - returns 0 if x < a 1424 | // - returns 1 if x > b 1425 | // - otherwise returns smooth interpolation between 0 and 1 based on x 1426 | //////////////////////////////////////////////////////////////////////////////// 1427 | 1428 | inline __device__ __host__ float smoothstep(float a, float b, float x) 1429 | { 1430 | float y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1431 | return (y*y*(3.0f - (2.0f*y))); 1432 | } 1433 | inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) 1434 | { 1435 | float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1436 | return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); 1437 | } 1438 | inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) 1439 | { 1440 | float3 y = clamp((x 
- a) / (b - a), 0.0f, 1.0f); 1441 | return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); 1442 | } 1443 | inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) 1444 | { 1445 | float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1446 | return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); 1447 | } 1448 | 1449 | #endif 1450 | --------------------------------------------------------------------------------