├── docs ├── benchmark │ ├── sample.jpg │ ├── result_10.jpg │ ├── result_100.jpg │ ├── result_1000.jpg │ ├── baseline │ │ ├── iter10_mid.jpg │ │ ├── iter1000_mid.jpg │ │ └── iter100_mid.jpg │ ├── basic_rl_lineprofile.png │ └── accelerated │ │ ├── iter1000_mid.jpg │ │ ├── iter100_mid.jpg │ │ └── iter10_mid.jpg ├── style.css └── index.html ├── util ├── CMakeLists.txt └── demo.cpp ├── .gitignore ├── src ├── DumpData.cuh ├── Helper.cuh ├── DumpData.cu ├── DeconvRLImpl.cuh ├── Core.cu ├── DeconvLRDriver.cpp ├── PSF.cu └── HelperMath.cuh ├── include ├── DeconvRLDriver.hpp └── ImageStack.hpp ├── CMakeLists.txt ├── README.md └── LICENSE /docs/benchmark/sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/sample.jpg -------------------------------------------------------------------------------- /docs/benchmark/result_10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/result_10.jpg -------------------------------------------------------------------------------- /docs/benchmark/result_100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/result_100.jpg -------------------------------------------------------------------------------- /docs/benchmark/result_1000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/result_1000.jpg -------------------------------------------------------------------------------- /docs/benchmark/baseline/iter10_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/baseline/iter10_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/baseline/iter1000_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/baseline/iter1000_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/baseline/iter100_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/baseline/iter100_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/basic_rl_lineprofile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/basic_rl_lineprofile.png -------------------------------------------------------------------------------- /docs/benchmark/accelerated/iter1000_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/accelerated/iter1000_mid.jpg -------------------------------------------------------------------------------- /docs/benchmark/accelerated/iter100_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/accelerated/iter100_mid.jpg 
-------------------------------------------------------------------------------- /docs/benchmark/accelerated/iter10_mid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/y3nr1ng/DeconvLR/HEAD/docs/benchmark/accelerated/iter10_mid.jpg -------------------------------------------------------------------------------- /docs/style.css: -------------------------------------------------------------------------------- 1 | 2 | @media (min-width: 1200px) { 3 | .container { 4 | max-width: 60%; 5 | } 6 | } 7 | 8 | header { 9 | padding: 80px; 10 | /* padding-top: 20px; */ 11 | /* box-shadow: 5px 5px 5px 5px gray; */ 12 | text-align: center; 13 | } 14 | 15 | .container { 16 | padding: 20px; 17 | } 18 | 19 | 20 | .button-wrapper { 21 | padding-top: 50px; 22 | } 23 | 24 | 25 | .button-wrapper a { 26 | font-size: 18px; 27 | } 28 | -------------------------------------------------------------------------------- /util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file (GLOB SOURCES "${PROJECT_SOURCE_DIR}/util/*.cpp") 2 | 3 | # configure compiler 4 | add_compile_options (-Wall) 5 | add_compile_options (-std=c++11) 6 | include_directories ("${PROJECT_SOURCE_DIR}/include" "/opt/X11/include") 7 | 8 | # generate the executable 9 | add_executable (deconvlr_util ${SOURCES}) 10 | target_link_libraries (deconvlr_util LINK_PUBLIC deconvlr) 11 | 12 | # drop the suffix 13 | set_target_properties(deconvlr_util PROPERTIES 14 | OUTPUT_NAME "deconvrl" 15 | ) 16 | 17 | # add custom target to run the binary 18 | add_custom_target (run 19 | COMMAND deconvlr_util 20 | DEPENDS deconvlr_util 21 | WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/bin" 22 | ) 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | bin/ 31 | 32 | # CMake 33 | CMakeCache.txt 34 | CMakeFiles 35 | CMakeScripts 36 | Testing 37 | Makefile 38 | cmake_install.cmake 39 | install_manifest.txt 40 | compile_commands.json 41 | CTestTestfile.cmake 42 | build/ 43 | 44 | # Atom Linter 45 | .gcc-flags.json 46 | 47 | # macOS cache 48 | .DS_Store 49 | 50 | # Data set 51 | data/ 52 | -------------------------------------------------------------------------------- /src/DumpData.cuh: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_DUMP_CUH 2 | #define HELPER_DUMP_CUH 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | // 3rd party libraries headers 7 | #include 8 | // standard libraries headers 9 | #include 10 | // system headers 11 | 12 | namespace DumpData { 13 | 14 | namespace Device { 15 | 16 | void real( 17 | std::string fname, 18 | const cufftReal *d_idata, 19 | const size_t nx, const size_t ny, const size_t nz 20 | ); 21 | 22 | void complex( 23 | std::string fname, 24 | const cufftComplex *d_idata, 25 | const size_t nx, const size_t ny, const size_t nz 26 | ); 27 | 28 | } 29 | 30 | namespace Host { 31 | 32 | void real( 33 | std::string fname, 34 | cufftReal *h_idata, 35 | const 
size_t nx, const size_t ny, const size_t nz 36 | ); 37 | 38 | } 39 | 40 | } 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /include/DeconvRLDriver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DECONV_RL_DRIVER_HPP 2 | #define DECONV_RL_DRIVER_HPP 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | #include "ImageStack.hpp" 7 | // 3rd party libraries headers 8 | // standard libraries headers 9 | #include 10 | // system headers 11 | 12 | namespace DeconvRL { 13 | 14 | class DeconvRL { 15 | public: 16 | DeconvRL(); 17 | ~DeconvRL(); 18 | 19 | void setResolution( 20 | const float dx, const float dy, const float dz, 21 | const float dpx = 1.0f, const float dpy = 1.0f, const float dpz = 1.0f 22 | ); 23 | void setVolumeSize(const size_t nx, const size_t ny, const size_t nz); 24 | void setPSF(const ImageStack &psf); 25 | 26 | // allocate host and device resources 27 | void initialize(); 28 | // set iterations 29 | void setIterations(const int iterations); 30 | // start the RL core routines 31 | void process( 32 | ImageStack &output, 33 | const ImageStack &input 34 | ); 35 | 36 | private: 37 | struct Impl; 38 | std::unique_ptr pimpl; 39 | }; 40 | 41 | } 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /util/demo.cpp: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | // necessary project headers 3 | #include "ImageStack.hpp" 4 | #include "DeconvRLDriver.hpp" 5 | // 3rd party libraries headers 6 | // standard libraries headers 7 | #include 8 | #include 9 | // system headers 10 | 11 | int main(void) 12 | { 13 | TIFFSetWarningHandler(NULL); 14 | 15 | std::string origImgFile = "data/bigradient/sample.tif"; 16 | std::string psfFile = "data/bigradient/psf.tif"; 17 | 18 | // scan the folder 19 | // search and load the otf 20 | ImageStack psf(psfFile); 21 | // init the deconvlr 22 | DeconvRL::DeconvRL deconvWorker; 23 | deconvWorker.setResolution(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); 24 | // iterate through the images 25 | // open the image 26 | const ImageStack input(origImgFile); 27 | ImageStack output(input, 0); 28 | // use the first image to init the resources 29 | deconvWorker.setVolumeSize(input.nx(), input.ny(), input.nz()); 30 | deconvWorker.setPSF(psf); 31 | deconvWorker.initialize(); 32 | deconvWorker.setIterations(10); 33 | // run the deconv 34 | deconvWorker.process(output, input); 35 | // save the image 36 | output.saveAs("result.tif"); 37 | // save the log 38 | // release the resources 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.6) 2 | 3 | project (deconvlr) 4 | set (CMAKE_BUILD_TYPE Debug) 5 | 6 | # set output directories 7 | set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/lib") 8 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bin") 9 | 10 | # find source files 11 | file (GLOB SOURCES "${PROJECT_SOURCE_DIR}/src/*") 12 | 13 | # boost 14 | find_package (Boost 1.59.0 COMPONENTS filesystem iostreams REQUIRED) 15 | if (NOT Boost_FOUND) 16 | message (FATAL_ERROR "Fatal error: Boost (version >= 1.59) required.") 17 | else () 18 | message (STATUS "Setting up Boost") 19 | message (STATUS " Includes - ${Boost_INCLUDE_DIRS}") 20 | 
message (STATUS " Library - ${Boost_LIBRARY_DIRS}") 21 | endif (NOT Boost_FOUND) 22 | # configure boost 23 | set (Boost_USE_STATIC_LIBS OFF) 24 | include_directories (${Boost_INCLUDE_DIRS}) 25 | link_directories (${Boost_LIBRARY_DIRS}) 26 | 27 | # cuda 28 | find_package (CUDA REQUIRED 7.5) 29 | if (NOT CUDA_FOUND) 30 | message (FATAL_ERROR "Fatal error: CUDA SDK (version >= 7.5) required.") 31 | endif (NOT CUDA_FOUND) 32 | # configure cuda 33 | # compile for baseline of CC 2.0, and archtecture 2.0/3.0 34 | list (APPEND CUDA_NVCC_FLAGS "-arch=compute_20; -code=sm_20,sm_30,sm_52; -std=c++11") 35 | cuda_include_directories ("${PROJECT_SOURCE_DIR}/include") 36 | 37 | # x11 (debug) 38 | find_package(X11 REQUIRED) 39 | include_directories(${X11_INCLUDE_DIR}) 40 | if (NOT CUDA_FOUND) 41 | message (FATAL_ERROR "Fatal error: X11 library required.") 42 | endif (NOT CUDA_FOUND) 43 | 44 | # configure compiler 45 | add_compile_options (-Wall) 46 | add_compile_options (-std=c++11) 47 | include_directories ("${PROJECT_SOURCE_DIR}/include") 48 | 49 | # generate the shared library 50 | cuda_add_library (deconvlr SHARED ${SOURCES}) 51 | target_link_libraries (deconvlr ${Boost_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} tiff ${X11_LIBRARIES}) 52 | 53 | # build demo program 54 | add_subdirectory ("util") 55 | -------------------------------------------------------------------------------- /include/ImageStack.hpp: -------------------------------------------------------------------------------- 1 | #ifndef IMAGE_STACK_HPP 2 | #define IMAGE_STACK_HPP 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | // 3rd party libraries headers 7 | #define BOOST_FILESYSTEM_NO_DEPRECATED 8 | #define BOOST_FILESYSTEM_VERSION 3 9 | #include 10 | 11 | #define cimg_use_tiff 12 | #include "CImg.h" 13 | using namespace cimg_library; 14 | // standard libraries headers 15 | #include 16 | #include 17 | #include 18 | // system headers 19 | 20 | namespace fs = boost::filesystem; 21 | 22 | template 23 | class ImageStack { 24 | public: 25 | ImageStack(const fs::path path_) 26 | : path(path_) { 27 | try { 28 | image.assign(path.c_str()); 29 | } catch(CImgIOException &err) { 30 | throw std::runtime_error("unable to open image"); 31 | } 32 | } 33 | 34 | // type conversion 35 | template 36 | ImageStack(const ImageStack &tpl) { 37 | image.assign(tpl.object()); 38 | } 39 | 40 | // init image of the same dimension with the default value 41 | template 42 | ImageStack(const ImageStack &tpl, const T value) { 43 | image.assign(tpl.object(), "xyzc", value); 44 | } 45 | 46 | void debug() { 47 | std::cout << "file: " << path << std::endl; 48 | image.display(); 49 | } 50 | 51 | T * data() const { 52 | return image._data; 53 | } 54 | 55 | const CImg & object() const { 56 | return image; 57 | } 58 | 59 | void save() { 60 | saveAs(path); 61 | } 62 | 63 | void saveAs(const fs::path p) { 64 | image.save_tiff(p.c_str()); 65 | } 66 | 67 | /* 68 | * Volume size 69 | */ 70 | size_t nx() const { 71 | return image._width; 72 | } 73 | 74 | size_t ny() const { 75 | return image._height; 76 | } 77 | 78 | size_t nz() const { 79 | return image._depth; 80 | } 81 | 82 | private: 83 | const fs::path path; 84 | CImg image; 85 | }; 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /src/Helper.cuh: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_CUH 2 | #define HELPER_CUH 3 | 4 | // corresponded header file 5 | // necessary project headers 6 
| #include "HelperMath.cuh" 7 | // 3rd party libraries headers 8 | #include 9 | #include 10 | // standard libraries headers 11 | #include 12 | #include 13 | // system headers 14 | 15 | /* 16 | * Kernel thread size computation. 17 | */ 18 | #define DIVUP(x, y) ((x+y-1)/y) 19 | 20 | /* 21 | * CUDA Runtime 22 | */ 23 | #define cudaErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); } 24 | 25 | inline void cudaAssert( 26 | cudaError_t code, 27 | const char *file, int line, 28 | bool abort=true 29 | ) { 30 | if (code != cudaSuccess) { 31 | fprintf(stderr,"CUDA Runtime: %s\n.. %s ln%d\n", cudaGetErrorString(code), file, line); 32 | if (abort) { 33 | exit(code); 34 | } 35 | } 36 | } 37 | 38 | /* 39 | * cuFFT 40 | */ 41 | static const char * cufftGetErrorString(cufftResult error) { 42 | switch (error) { 43 | case CUFFT_SUCCESS: 44 | return "the cuFFT operation was successful"; 45 | 46 | case CUFFT_INVALID_PLAN: 47 | return "cuFFT was passed an invalid plan handle"; 48 | 49 | case CUFFT_ALLOC_FAILED: 50 | return "cuFFT failed to allocate GPU or CPU memory"; 51 | 52 | case CUFFT_INVALID_VALUE: 53 | return "user specified an invalid pointer or parameter"; 54 | 55 | case CUFFT_INTERNAL_ERROR: 56 | return "driver or internal cuFFT library error"; 57 | 58 | case CUFFT_EXEC_FAILED: 59 | return "failed to execute an FFT on the GPU"; 60 | 61 | case CUFFT_SETUP_FAILED: 62 | return "the cuFFT library failed to initialize"; 63 | 64 | case CUFFT_INVALID_SIZE: 65 | return "user specified an invalid transform size"; 66 | 67 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 68 | return "missing parameters in call"; 69 | 70 | case CUFFT_INVALID_DEVICE: 71 | return "execution of a plan was on different GPU than plan creation"; 72 | 73 | case CUFFT_PARSE_ERROR: 74 | return "internal plan database error"; 75 | 76 | case CUFFT_NO_WORKSPACE: 77 | return "no workspace has been provided prior to plan execution"; 78 | 79 | case CUFFT_NOT_IMPLEMENTED: 80 | return "function does not implement functionality for parameters given"; 81 | 82 | case CUFFT_NOT_SUPPORTED: 83 | return "operation is not supported for parameters given"; 84 | 85 | default: 86 | return ""; 87 | } 88 | } 89 | 90 | inline void cudaAssert( 91 | cufftResult_t code, 92 | const char *file, int line, 93 | bool abort=true 94 | ) { 95 | if (code != CUFFT_SUCCESS) { 96 | fprintf(stderr,"cuFFT: %s\n.. %s ln%d\n", cufftGetErrorString(code), file, line); 97 | if (abort) { 98 | exit(code); 99 | } 100 | } 101 | } 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeconvLR 2 | DeconvLR is a open source CUDA implementation of accelerated Richard-Lucy Deconvolution algorithm regularized with total variation loss. This library is developed to recovered blurred image due to the spreading of point source in optical system. As far as we know, there is no other fully functional open source GPU accelerated implementation. This project is aim to develope an open source, high efficient library to process high resolution images of high quality. 3 | 4 | ## Getting Started 5 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system. 6 | 7 | ### Prerequisites 8 | You need the following packages to get started. 
9 | 10 | ***nix** 11 | ``` 12 | make 13 | g++ <= 5 14 | CMake >= 3.6 15 | Boost >= 1.59 16 | CUDA >= 8.0 17 | ``` 18 | 19 | **Windows** 20 | 21 | **TODO** I haven't exactly tested this on Windows. DLL export symbols are needed in the public header. 22 | 23 | ### Build 24 | 1. Please clone this repository 25 | ```bash 26 | git clone https://github.com/liuyenting/DeconvLR.git 27 | ``` 28 | or download and extract the tarball from [release page](https://github.com/liuyenting/DeconvLR/releases). 29 | ```bash 30 | tar zxvf DeconRL.tar.gz 31 | ``` 32 | 2. Go to source directory and create a new build output directory. 33 | ```bash 34 | cd DeconvLR 35 | mkdir build 36 | ``` 37 | 3. We use `cmake` to do the heavy lifting. 38 | ```bash 39 | cd build 40 | cmake .. 41 | ``` 42 | if everything runs smoothly, we can proceed with 43 | ```bash 44 | make 45 | ``` 46 | 47 | ## Running the demo 48 | **TODO** Explain how to run the demo. 49 | 50 | Asides from the demo, this library is intended to use as 51 | ```c++ 52 | std::string origImgFile = "data/bigradient/sample.tif"; 53 | std::string psfFile = "data/bigradient/psf_n15_z5.tif"; 54 | 55 | // load psf 56 | ImageStack psf(psfFile); 57 | 58 | // init the deconvlr 59 | DeconvLR deconvWorker; 60 | deconvWorker.setResolution(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); 61 | 62 | // open the image 63 | const ImageStack input(origImgFile); 64 | ImageStack output(input, 0); 65 | 66 | // use the first image to init the resources 67 | deconvWorker.setVolumeSize(input.nx(), input.ny(), input.nz()); 68 | deconvWorker.setPSF(psf); 69 | 70 | // run the deconv 71 | deconvWorker.process(output, input); 72 | ``` 73 | 74 | ## Benchmark 75 | **TODO** move benchmakr images from gh-page (in docs folder) to here. 76 | 77 | ## Authors 78 | * **Liu, Yen-Ting** - *Initial work* - [liuyenting](https://github.com/liuyenting/) 79 | * **Chiang, Tin-Ray** - *Initial work* - [CTinRay](https://github.com/CTinRay) 80 | 81 | ## License 82 | This project is licensed under the Apache License - see the [LICENSE](LICENSE) file for details 83 | 84 | ## References 85 | * William Hadley Richardson (1972), "Bayesian-Based Iterative Method of Image Restoration*," J. Opt. Soc. Am. 62, 55-59. 86 | * Lucy, L. B. (1974). "An iterative technique for the rectification of observed distributions". Astronomical Journal. 79 (6): 745–754. 87 | * Biggs, D. S., & Andrews, M. (1997). Acceleration of iterative image restoration algorithms. Applied optics, 36(8), 1766-1775. 88 | * Dey, N., Blanc-Féraud, L., Zimmer, C., Roux, P., Kam, Z., Olivo-Marin, J. C., & Zerubia, J. (2004). 3D microscopy deconvolution using Richardson-Lucy algorithm with total variation regularization (Doctoral dissertation, INRIA). 
89 | -------------------------------------------------------------------------------- /src/DumpData.cu: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | #include "DumpData.cuh" 3 | // necessary project headers 4 | #include "Helper.cuh" 5 | // 3rd party libraries headers 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define cimg_use_tiff 15 | #include "CImg.h" 16 | using namespace cimg_library; 17 | // standard libraries headers 18 | #include 19 | #include 20 | // system headers 21 | 22 | namespace DumpData { 23 | 24 | namespace { 25 | 26 | __global__ 27 | void abs_kernel( 28 | cufftReal *odata, 29 | const cufftComplex *idata, 30 | const size_t nx, const size_t ny, const size_t nz 31 | ) { 32 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 33 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 34 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 35 | 36 | // skip out-of-bound threads 37 | if (ix >= nx or iy >= ny or iz >= nz) { 38 | return; 39 | } 40 | 41 | int idx = iz * (nx*ny) + iy * nx + ix; 42 | odata[idx] = cuCabsf(idata[idx]); 43 | } 44 | 45 | } 46 | 47 | namespace Device { 48 | 49 | void real( 50 | std::string fname, 51 | const cufftReal *d_idata, 52 | const size_t nx, const size_t ny, const size_t nz 53 | ) { 54 | fprintf(stderr, 55 | "[DBG] dump %ldx%ldx%ld from device to \"%s\"\n", 56 | nx, ny, nz, fname.c_str() 57 | ); 58 | 59 | CImg data(nx, ny, nz); 60 | const size_t size = data.size() * sizeof(float); 61 | 62 | // pinned down the host memory region 63 | float *d_odata; 64 | cudaErrChk(cudaHostRegister(data.data(), size, cudaHostRegisterMapped)); 65 | cudaErrChk(cudaHostGetDevicePointer(&d_odata, data.data(), 0)); 66 | 67 | // copy from device to host 68 | cudaErrChk(cudaMemcpy(d_odata, d_idata, size, cudaMemcpyDeviceToHost)); 69 | 70 | // release the resources 71 | cudaErrChk(cudaHostUnregister(data.data())); 72 | 73 | // save the result to file 74 | data.save_tiff(fname.c_str()); 75 | } 76 | 77 | void complex( 78 | std::string fname, 79 | const cufftComplex *d_idata, 80 | const size_t nx, const size_t ny, const size_t nz 81 | ) { 82 | fprintf(stderr, 83 | "[DBG] dump %ldx%ldx%ld from device to \"%s\"\n", 84 | nx, ny, nz, fname.c_str() 85 | ); 86 | 87 | CImg data(nx, ny, nz); 88 | const size_t size = data.size() * sizeof(float); 89 | 90 | // pinned down the host memory region 91 | float *d_odata; 92 | cudaErrChk(cudaHostRegister(data.data(), size, cudaHostRegisterMapped)); 93 | cudaErrChk(cudaHostGetDevicePointer(&d_odata, data.data(), 0)); 94 | 95 | dim3 nthreads(16, 16, 4); 96 | dim3 nblocks( 97 | DIVUP(nx, nthreads.x), DIVUP(ny, nthreads.y), DIVUP(nz, nthreads.z) 98 | ); 99 | abs_kernel<<>>( 100 | d_odata, 101 | d_idata, 102 | nx, ny, nz 103 | ); 104 | cudaErrChk(cudaPeekAtLastError()); 105 | 106 | // release the resources 107 | cudaErrChk(cudaHostUnregister(data.data())); 108 | 109 | // save the result to file 110 | data.save_tiff(fname.c_str()); 111 | } 112 | 113 | } 114 | 115 | namespace Host { 116 | 117 | void real( 118 | std::string fname, 119 | cufftReal *h_idata, 120 | const size_t nx, const size_t ny, const size_t nz 121 | ) { 122 | fprintf(stderr, 123 | "[DBG] dump %ldx%ldx%ld from host to \"%s\"\n", 124 | nx, ny, nz, fname.c_str() 125 | ); 126 | 127 | CImg data(nx, ny, nz); 128 | const size_t size = data.size() * sizeof(float); 129 | 130 | // copy to image data region 131 | std::memcpy(data.data(), h_idata, size); 132 | 133 | // save the result 
to file 134 | data.save_tiff(fname.c_str()); 135 | } 136 | 137 | } 138 | 139 | } 140 | -------------------------------------------------------------------------------- /src/DeconvRLImpl.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DECONV_LR_CORE_CUH 2 | #define DECONV_LR_CORE_CUH 3 | 4 | // corresponded header file 5 | // necessary project headers 6 | // 3rd party libraries headers 7 | #include 8 | #include 9 | // standard libraries headers 10 | #include 11 | // system headers 12 | 13 | namespace DeconvRL { 14 | 15 | namespace PSF { 16 | 17 | class PSF { 18 | public: 19 | PSF( 20 | float *h_psf, 21 | const size_t npx, const size_t npy, const size_t npz = 1 22 | ); 23 | ~PSF(); 24 | 25 | /** 26 | * @brief Center the centroid of the provided PSF. 27 | * 28 | * The method calls upon the the estimateBackground method to estimate and 29 | * remove the potential bacground noises by its mean. Later, centroid of the 30 | * PSF is calculated and used to circular shift the original PSF to its 31 | * align with its center. 32 | * 33 | * @see findCentroid, estimateBackground 34 | */ 35 | void alignCenter(const size_t nx, const size_t ny, const size_t nz = 1); 36 | 37 | /** 38 | * @brief Convert the PSF to OTF. 39 | * 40 | * Convert the PSF to an OTF by a FFT. Caller has to allocate the OTF 41 | * pointer with a proper memory space, (nx/2+1)*ny*nz*sizeof(cufftComplex). 42 | * 43 | * @param d_otf The converted OTF. 44 | * @param nx Number of elements in the X dimension (fastest variation). 45 | * @param ny Number of elements in the Y dimension. 46 | * @param nz Number of elements in the Z dimension (slowest variation). 47 | * 48 | * @see 49 | */ 50 | void createOTF(cufftComplex *d_otf); 51 | 52 | private: 53 | float3 findCentroid(); 54 | float estimateBackground(); 55 | void padPSF(const size_t nx, const size_t ny, const size_t nz); 56 | 57 | // PSF memory, host side and mirrored device address 58 | float *d_psf; 59 | 60 | // size of the PSF 61 | size_t npx, npy, npz; 62 | size_t nelem; 63 | }; 64 | 65 | } 66 | 67 | namespace Core { 68 | 69 | /** 70 | * @brief Brief introduction to the function. 71 | * 72 | * Description of what the function does 73 | * @param PARAM1 Description of the first parameter of the function. 74 | * @return Describe what the function returns. 75 | * @see FUNCTION 76 | */ 77 | 78 | /** 79 | * Parameter class that holds all constant and temporary variables during the 80 | * Richardson-Lucy iteration steps. 81 | */ 82 | struct Parameters { 83 | //TODO destructor to free the memory region 84 | //TODO destructor to free the FFT handles 85 | 86 | /** 87 | * Dimension of the image in real space. 88 | */ 89 | size_t nx, ny, nz; 90 | // product of nx, ny and nz 91 | size_t nelem; 92 | 93 | // original image 94 | float *raw; 95 | 96 | // OTF 97 | cufftComplex *otf; 98 | 99 | /** 100 | * cuFFT handles for forward (R2C) and reverse (C2R) FFT operations. 101 | */ 102 | struct { 103 | cufftHandle forward; 104 | cufftHandle reverse; 105 | } fftHandle; 106 | 107 | /** 108 | * I/O buffer to interface with the host. 109 | */ 110 | struct { 111 | cufftReal *input; 112 | cufftReal *output; 113 | } ioBuffer; 114 | 115 | /** 116 | * Intermediate buffers, maximum size is used, aka padded input data size. 
117 | */ 118 | struct { 119 | cufftComplex *complexA; 120 | } filterBuffer; 121 | 122 | struct { 123 | cufftReal *realA; 124 | } RLBuffer; 125 | 126 | /** 127 | * Prediction buffers 128 | */ 129 | struct { 130 | float *prevIter; 131 | float *prevPredChg; 132 | } predBuffer; 133 | }; 134 | 135 | namespace RL { 136 | 137 | /** 138 | * @brief One iteration in the Richardson-Lucy algorithm. 139 | * 140 | * DESCRIPTION 141 | * @param odata Result from current iteration. 142 | * @param idata Result of previous iteration. 143 | * @param parm Algorithm related parameters. 144 | * @return 145 | * @see 146 | */ 147 | void step( 148 | float *odata, const float *idata, 149 | Core::Parameters &parms 150 | ); 151 | 152 | } 153 | 154 | namespace Biggs { 155 | 156 | /** 157 | * @brief One iteration in the accelerated Richardson-Lucy algorithm. 158 | * 159 | * DESCRIPTION 160 | * @param odata Result from current iteration. 161 | * @param idata Result of previous iteration. 162 | * @param parm Algorithm related parameters. 163 | * @return 164 | * @see 165 | */ 166 | void step( 167 | float *odata, const float *idata, 168 | Core::Parameters &parms 169 | ); 170 | 171 | } 172 | 173 | } 174 | 175 | namespace Common { 176 | 177 | void ushort2float(float *odata, const uint16_t *idata, const size_t nelem); 178 | 179 | } 180 | 181 | } 182 | 183 | #endif 184 | -------------------------------------------------------------------------------- /src/Core.cu: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | // necessary project headers 3 | #include "DeconvRLImpl.cuh" 4 | #include "Helper.cuh" 5 | // 3rd party libraries headers 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | // standard libraries headers 16 | #include 17 | // system headers 18 | 19 | namespace DeconvRL { 20 | 21 | namespace Core { 22 | 23 | namespace RL { 24 | 25 | enum class ConvType { 26 | PLAIN = 1, CONJUGATE 27 | }; 28 | 29 | namespace { 30 | // generic complex number operation 31 | struct MultiplyAndScale 32 | : public thrust::binary_function { 33 | MultiplyAndScale(const float c_) 34 | : c(c_) { 35 | } 36 | 37 | __host__ __device__ 38 | cuComplex operator()(const cuComplex &a, const cuComplex &b) const { 39 | return cuCmulf(a, b)/c; 40 | } 41 | 42 | private: 43 | const float c; 44 | }; 45 | 46 | void filter( 47 | cufftReal *odata, const cufftReal *idata, const cufftComplex *otf, 48 | Core::Parameters &parm 49 | ) { 50 | const size_t nelem = (parm.nx/2+1) * parm.ny * parm.nz; 51 | cufftComplex *buffer = (cufftComplex *)parm.filterBuffer.complexA; 52 | 53 | // convert to frequency space 54 | cudaErrChk(cufftExecR2C( 55 | parm.fftHandle.forward, 56 | const_cast(idata), 57 | buffer 58 | )); 59 | // element-wise multiplication and scale down 60 | thrust::transform( 61 | thrust::device, 62 | buffer, buffer+nelem, // first input sequence 63 | otf, // second input sequence 64 | buffer, // output sequence 65 | MultiplyAndScale(1.0f/parm.nelem) 66 | ); 67 | // convert back to real space 68 | cudaErrChk(cufftExecC2R( 69 | parm.fftHandle.reverse, 70 | buffer, 71 | odata 72 | )); 73 | } 74 | 75 | thrust::divides DivfOp; 76 | thrust::multiplies MulfOp; 77 | 78 | } 79 | 80 | void step( 81 | float *odata, const float *idata, 82 | Core::Parameters &parms 83 | ) { 84 | fprintf(stderr, "[DBG] +++ ENTER RL::step() +++\n"); 85 | 86 | const size_t nelem = parms.nelem; 87 | cufftReal *buffer = parms.RLBuffer.realA; 88 | 89 | 
cufftComplex *otf = parms.otf; 90 | 91 | /* 92 | * \hat{f_{k+1}} = 93 | * \hat{f_k} \left( 94 | * h \ast \frac{g}{h \otimes \hat{f_k}} 95 | * \right) 96 | */ 97 | 98 | // reblur the image 99 | filter(buffer, idata, otf, parms); 100 | // error 101 | thrust::transform( 102 | thrust::device, 103 | parms.raw, parms.raw+nelem, 104 | buffer, 105 | buffer, // output 106 | DivfOp 107 | ); 108 | filter(buffer, buffer, otf, parms); 109 | // latent image 110 | thrust::transform( 111 | thrust::device, 112 | idata, idata+nelem, 113 | buffer, 114 | odata, // output 115 | MulfOp 116 | ); 117 | 118 | fprintf(stderr, "[DBG] +++ EXIT RL::step() +++\n"); 119 | } 120 | 121 | } 122 | 123 | namespace Biggs { 124 | 125 | namespace { 126 | 127 | struct ScaleAndAdd 128 | : public thrust::binary_function { 129 | ScaleAndAdd(const float alpha_) 130 | : alpha(alpha_) { 131 | } 132 | 133 | __host__ __device__ 134 | float operator()(const float &a, const float &b) const { 135 | // apply positivity constraint after SAXPY 136 | //return fmaxf(a + alpha*b, 0.0f); 137 | return a + alpha*b; 138 | } 139 | 140 | private: 141 | const float alpha; 142 | }; 143 | 144 | } 145 | 146 | void step( 147 | float *odata, const float *idata, 148 | Core::Parameters &parm 149 | ) { 150 | // borrow space from odata, rename to avoid confusion 151 | float* iter = odata; 152 | // calcualte x_k 153 | RL::step(iter, idata, parm); 154 | 155 | // extract the definition 156 | float *prevIter = parm.predBuffer.prevIter; 157 | float *prevPredChg = parm.predBuffer.prevPredChg; 158 | 159 | // updateDir borrow buffer from prevIter 160 | float* updateDir = prevIter; 161 | // h_k in the paper 162 | // update_direction = prev_iter - iter; 163 | thrust::transform( 164 | thrust::device, 165 | iter, iter+parm.nelem, 166 | prevIter, 167 | updateDir, 168 | thrust::minus() 169 | ); 170 | 171 | // reuse space of idata 172 | float *predChg = const_cast(idata); 173 | // calculate g_{k - 1} = x_k - y_{k - 1}. 174 | // pred_change = iter - prev_pred; 175 | thrust::transform( 176 | thrust::device, 177 | iter, iter+parm.nelem, 178 | idata, 179 | predChg, 180 | thrust::minus() 181 | ); 182 | 183 | // calculate alpha (acceleration factor). 
184 | float den = thrust::inner_product( 185 | thrust::device, 186 | predChg, predChg+parm.nelem, 187 | prevPredChg, 188 | 0.0f 189 | ); 190 | float nom = ( 191 | thrust::inner_product( 192 | thrust::device, 193 | prevPredChg, prevPredChg+parm.nelem, 194 | prevPredChg, 195 | 0.0f 196 | ) + std::numeric_limits::epsilon() 197 | ); 198 | float alpha = den / nom; 199 | fprintf(stderr, "[DBG] fraction [%f/%f = %f]\n", den, nom, alpha); 200 | 201 | // stability enforcement 202 | alpha = std::max(std::min(alpha, 1.0f), 0.0f); 203 | fprintf(stderr, "[INF] alpha = %f\n", alpha); 204 | 205 | // save current predictions 206 | cudaErrChk(cudaMemcpy( 207 | prevIter, 208 | iter, 209 | parm.nelem * sizeof(float), 210 | cudaMemcpyDeviceToDevice 211 | )); 212 | cudaErrChk(cudaMemcpy( 213 | prevPredChg, 214 | predChg, 215 | parm.nelem * sizeof(float), 216 | cudaMemcpyDeviceToDevice 217 | )); 218 | 219 | // calculate y_k 220 | // odata = iter + alpha * update_direction; 221 | thrust::transform( 222 | thrust::device, 223 | iter, iter+parm.nelem, 224 | updateDir, 225 | odata, 226 | ScaleAndAdd(alpha) 227 | ); 228 | } 229 | 230 | } 231 | 232 | } 233 | 234 | namespace Common { 235 | 236 | namespace { 237 | 238 | template 239 | struct ToFloat 240 | : public thrust::unary_function { 241 | __host__ __device__ 242 | float operator()(const T &v) const { 243 | return (float)v; 244 | } 245 | }; 246 | 247 | } 248 | 249 | void ushort2float(float *odata, const uint16_t *idata, const size_t nelem) { 250 | thrust::transform( 251 | thrust::device, 252 | idata, idata + nelem, // input 253 | odata, // output 254 | ToFloat() 255 | ); 256 | } 257 | 258 | } 259 | 260 | } 261 | -------------------------------------------------------------------------------- /src/DeconvLRDriver.cpp: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | #include "DeconvRLDriver.hpp" 3 | // necessary project headers 4 | #include "DeconvRLImpl.cuh" 5 | #include "Helper.cuh" 6 | #include "DumpData.cuh" 7 | // 3rd party libraries headers 8 | #include 9 | // standard libraries headers 10 | #include 11 | #include 12 | // system headers 13 | 14 | namespace DeconvRL { 15 | 16 | struct DeconvRL::Impl { 17 | Impl() { 18 | } 19 | 20 | ~Impl() { 21 | // TODO free iterParms 22 | } 23 | 24 | // volume size 25 | dim3 volumeSize; 26 | // voxel size 27 | struct { 28 | float3 raw; 29 | float3 psf; 30 | } voxelSize; 31 | 32 | /* 33 | * Algorithm configurations. 34 | */ 35 | int iterations; 36 | Core::Parameters iterParms; 37 | }; 38 | 39 | // C++14 feature 40 | template 41 | std::unique_ptr make_unique(Args&& ... 
args) { 42 | return std::unique_ptr(new T(std::forward(args) ...)); 43 | } 44 | 45 | DeconvRL::DeconvRL() 46 | : pimpl(make_unique()) { 47 | } 48 | 49 | DeconvRL::~DeconvRL() { 50 | 51 | } 52 | 53 | void DeconvRL::setResolution( 54 | const float dx, const float dy, const float dz, 55 | const float dpx, const float dpy, const float dpz 56 | ) { 57 | /* 58 | * Spatial frequency ratio (along one dimension) 59 | * 60 | * 1/(NS * DS) NP DP NP 61 | * R = ----------- = -- * -- = -- * r 62 | * 1/(NP * DP) NS DS NS 63 | * 64 | * NS, sample size 65 | * DS, sample voxel size 66 | * NP, PSF size 67 | * DP, PSF voxel size 68 | * r, voxel ratio 69 | */ 70 | pimpl->voxelSize.raw = make_float3(dx, dy, dz); 71 | pimpl->voxelSize.psf = make_float3(dpx, dpy, dpz); 72 | } 73 | 74 | void DeconvRL::setVolumeSize( 75 | const size_t nx, const size_t ny, const size_t nz 76 | ) { 77 | //TODO probe for device specification 78 | if (nx > 2048 or ny > 2048 or nz > 2048) { 79 | throw std::range_error("volume size exceeds maximum constraints"); 80 | } 81 | pimpl->volumeSize.x = nx; 82 | pimpl->volumeSize.y = ny; 83 | pimpl->volumeSize.z = nz; 84 | 85 | fprintf( 86 | stderr, 87 | "[INF] volume size = %ux%ux%u\n", 88 | pimpl->volumeSize.x, pimpl->volumeSize.y, pimpl->volumeSize.z 89 | ); 90 | } 91 | 92 | //TODO remove ImageStack dependency 93 | void DeconvRL::setPSF(const ImageStack &psf_u16) { 94 | /* 95 | * Ensure we are working with floating points. 96 | */ 97 | ImageStack psf(psf_u16); 98 | fprintf( 99 | stderr, 100 | "[INF] PSF size = %ldx%ldx%ld\n", 101 | psf.nx(), psf.ny(), psf.nz() 102 | ); 103 | 104 | /* 105 | * Generate the OTF. 106 | */ 107 | PSF::PSF psfProc(psf.data(), psf.nx(), psf.ny(), psf.nz()); 108 | psfProc.alignCenter( 109 | pimpl->volumeSize.x, pimpl->volumeSize.y, pimpl->volumeSize.z 110 | ); 111 | 112 | // allocate memory space for OTF 113 | cudaErrChk(cudaMalloc( 114 | &pimpl->iterParms.otf, 115 | (pimpl->volumeSize.x/2+1) * pimpl->volumeSize.y * pimpl->volumeSize.z * sizeof(cufftComplex) 116 | )); 117 | // create the OTF 118 | psfProc.createOTF(pimpl->iterParms.otf); 119 | fprintf(stderr, "[INF] OTF established\n"); 120 | } 121 | 122 | void DeconvRL::initialize() { 123 | const dim3 volumeSize = pimpl->volumeSize; 124 | Core::Parameters &iterParms = pimpl->iterParms; 125 | 126 | /* 127 | * Load dimension information into the iteration parameter. 128 | */ 129 | iterParms.nx = volumeSize.x; 130 | iterParms.ny = volumeSize.y; 131 | iterParms.nz = volumeSize.z; 132 | iterParms.nelem = volumeSize.x * volumeSize.y * volumeSize.z; 133 | 134 | /* 135 | * Create FFT plans. 136 | */ 137 | // FFT plans for estimation 138 | cudaErrChk(cufftPlan3d( 139 | &iterParms.fftHandle.forward, 140 | volumeSize.z, volumeSize.y, volumeSize.x, 141 | CUFFT_R2C 142 | )); 143 | cudaErrChk(cufftPlan3d( 144 | &iterParms.fftHandle.reverse, 145 | volumeSize.z, volumeSize.y, volumeSize.x, 146 | CUFFT_C2R 147 | )); 148 | 149 | //TODO attach callback device functions 150 | 151 | /* 152 | * Estimate memory usage from FFT procedures. 153 | */ 154 | 155 | /* 156 | * Allocate device staging area. 
157 | */ 158 | size_t realSize = 159 | volumeSize.x * volumeSize.y * volumeSize.z * sizeof(cufftReal); 160 | size_t complexSize = 161 | (volumeSize.x/2+1) * volumeSize.y * volumeSize.z * sizeof(cufftComplex); 162 | 163 | // template 164 | cudaErrChk(cudaMalloc((void **)&iterParms.raw, realSize)); 165 | 166 | // IO buffer 167 | cudaErrChk(cudaMalloc((void **)&iterParms.ioBuffer.input, realSize)); 168 | cudaErrChk(cudaMalloc((void **)&iterParms.ioBuffer.output, realSize)); 169 | 170 | // FFT Buffer 171 | cudaErrChk(cudaMalloc((void **)&iterParms.filterBuffer.complexA, complexSize)); 172 | 173 | // RL Buffer 174 | cudaErrChk(cudaMalloc((void **)&iterParms.RLBuffer.realA, realSize)); 175 | 176 | // prediction buffer 177 | cudaErrChk(cudaMalloc((void **)&iterParms.predBuffer.prevIter, realSize)); 178 | cudaErrChk(cudaMalloc((void **)&iterParms.predBuffer.prevPredChg, realSize)); 179 | } 180 | 181 | void DeconvRL::setIterations(const int i) { 182 | if (i < 1) { 183 | throw std::range_error("iteration cycle has to be at least 1"); 184 | } 185 | pimpl->iterations = i; 186 | } 187 | 188 | //TODO scale output from float to uint16 189 | void DeconvRL::process( 190 | ImageStack &odata, 191 | const ImageStack &idata 192 | ) { 193 | Core::Parameters &iterParms = pimpl->iterParms; 194 | const size_t nelem = iterParms.nelem; 195 | 196 | // register the input data memory region on host as pinned 197 | cudaErrChk(cudaHostRegister( 198 | idata.data(), 199 | nelem * sizeof(uint16_t), 200 | cudaHostRegisterMapped 201 | )); 202 | 203 | // retrieve the host pointer 204 | uint16_t *d_idata = nullptr; 205 | cudaErrChk(cudaHostGetDevicePointer(&d_idata, idata.data(), 0)); 206 | 207 | /* 208 | * Copy the data to buffer area along with type casts. 209 | */ 210 | fprintf(stderr, "[DBG] %ld elements to type cast\n", nelem); 211 | Common::ushort2float( 212 | iterParms.ioBuffer.input, // output 213 | d_idata, // input 214 | nelem 215 | ); 216 | 217 | // duplicate the to store a copy of raw data 218 | cudaErrChk(cudaMemcpy( 219 | iterParms.raw, 220 | iterParms.ioBuffer.input, 221 | nelem * sizeof(float), 222 | cudaMemcpyDeviceToDevice 223 | )); 224 | 225 | /* 226 | * Release the pinned memory region. 227 | */ 228 | cudaErrChk(cudaHostUnregister(idata.data())); 229 | 230 | // preset the iteration 231 | cudaErrChk(cudaMemcpy( 232 | iterParms.predBuffer.prevIter, 233 | iterParms.ioBuffer.input, 234 | nelem * sizeof(float), 235 | cudaMemcpyDeviceToDevice 236 | )); 237 | cudaErrChk(cudaMemset( 238 | iterParms.predBuffer.prevPredChg, 239 | 0, 240 | nelem * sizeof(float) 241 | )); 242 | 243 | /* 244 | * Execute the core functions. 
245 | */ 246 | const int nIter = pimpl->iterations; 247 | for (int iIter = 1; iIter <= nIter; iIter++) { 248 | //Core::RL::step( 249 | Core::Biggs::step( 250 | iterParms.ioBuffer.output, // output 251 | iterParms.ioBuffer.input, // input 252 | iterParms 253 | ); 254 | // swap A, B buffer 255 | std::swap(iterParms.ioBuffer.input, iterParms.ioBuffer.output); 256 | 257 | fprintf(stderr, "[INF] %d/%d\n", iIter, nIter); 258 | } 259 | 260 | // swap back to avoid confusion 261 | std::swap(iterParms.ioBuffer.input, iterParms.ioBuffer.output); 262 | 263 | // noarmalize the result to [0, 65535] 264 | 265 | 266 | // copy back to host 267 | cudaErrChk(cudaMemcpy( 268 | odata.data(), 269 | iterParms.ioBuffer.output, 270 | nelem * sizeof(cufftReal), 271 | cudaMemcpyDeviceToHost 272 | )); 273 | } 274 | 275 | } 276 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Liu, Yen-Ting 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 10 | 11 | 12 | 13 |
DeconvRL

A GPU-Accelerated Richardson-Lucy Deconvolution Library

What is DeconvRL

DeconvRL is an open-source CUDA implementation of the accelerated Richardson-Lucy deconvolution algorithm, regularized with a total variation loss. The library is developed to recover images that are blurred by the spreading of point sources in an optical system. As far as we know, there is no other fully functional, open-source, GPU-accelerated implementation; this project aims to provide an efficient library for processing large, high-resolution images.

Quick Start

Requirements

  • make
  • g++ <= 5
  • CMake >= 3.6
  • Boost >= 1.59
  • CUDA >= 8.0

Build

  • First download the source code and untar it.

        tar zxvf DeconvRL.tar.gz

  • Then cd into the source code directory and create a directory called build.

        cd DeconvRL
        mkdir build

  • Then cd into the build directory and run cmake, followed by make, to build the project.

        cd build
        cmake ..
        make

Usage

This library can be used as follows:
    std::string origImgFile = "data/bigradient/sample.tif";
    std::string psfFile = "data/bigradient/psf_n15_z5.tif";

    // load psf
    ImageStack<uint16_t> psf(psfFile);

    // init the deconvlr
    DeconvLR deconvWorker;
    deconvWorker.setResolution(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f);

    // open the image
    const ImageStack<uint16_t> input(origImgFile);
    ImageStack<uint16_t> output(input, 0);

    // use the first image to init the resources
    deconvWorker.setVolumeSize(input.nx(), input.ny(), input.nz());
    deconvWorker.setPSF(psf);

    // run the deconv
    deconvWorker.process(output, input);
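The snippet above stops at process(); the bundled util/demo.cpp additionally allocates the device resources and sets the iteration count before processing, and saves the result afterwards. A minimal completion of the example, following the demo (the iteration count of 10 is simply the demo's choice), looks like:

    // allocate host and device resources before the first run
    deconvWorker.initialize();
    // number of Richardson-Lucy iterations to execute
    deconvWorker.setIterations(10);

    // run the deconv and write the result
    deconvWorker.process(output, input);
    output.saveAs("result.tif");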

Algorithm

Richardson-Lucy Deconvolution

An image can be seen as a combination of point sources. Point sources are never ideal, however, so each one spreads out when it is recorded as a photo. In the Richardson-Lucy deconvolution algorithm, pixel \(d_i\) of a photo is modeled as the expected sum of light from every real point source, that is

\[d_i = \sum_j p_{ij} u_j\]

where \(p_{ij}\) is the probability that light from the point source at position \(j\) is recorded in pixel \(i\) of the photo, and \(u_j\) is the amount of light from the point source at position \(j\). After some derivation, and generalized to multiple dimensions, the maximum-likelihood distribution of the real point sources can be reconstructed from a given photo \(d\) with an expectation-maximization iteration:

\[u_{t + 1} = u_{t} \cdot \left( \frac{d}{u_{t} * p} * \hat{p} \right)\]

where \(\hat{p}_{ij} = p_{(n-i)(m-j)}\) for \(d\) of shape \(n \times m\), and \(*\) is the convolution operator, implemented as an element-wise multiplication in the frequency domain.
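To make the update concrete, below is a self-contained 1-D CPU sketch of the iteration. It is purely illustrative (naive spatial convolution, hypothetical helper names); the library itself applies the same arithmetic to 3-D volumes on the GPU, using cuFFT-based convolutions in src/Core.cu.

```c++
#include <cstdio>
#include <vector>

// naive 1-D convolution; flip=true correlates with the mirrored kernel (p-hat)
static std::vector<float> convolve(const std::vector<float> &f,
                                   const std::vector<float> &k, bool flip) {
    const int n = (int)f.size(), m = (int)k.size(), half = m / 2;
    std::vector<float> out(n, 0.0f);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            const int src = i + (flip ? (j - half) : (half - j));
            if (src >= 0 && src < n) {
                out[i] += f[src] * k[j];
            }
        }
    }
    return out;
}

// one Richardson-Lucy update: u_{t+1} = u_t * ((d / (u_t * p)) * p-hat)
static std::vector<float> rlStep(const std::vector<float> &u,
                                 const std::vector<float> &d,
                                 const std::vector<float> &p) {
    const std::vector<float> reblur = convolve(u, p, false);
    std::vector<float> ratio(d.size());
    for (size_t i = 0; i < d.size(); i++) {
        ratio[i] = d[i] / (reblur[i] + 1e-6f);  // small epsilon avoids 0/0
    }
    const std::vector<float> corr = convolve(ratio, p, true);
    std::vector<float> next(u.size());
    for (size_t i = 0; i < u.size(); i++) {
        next[i] = u[i] * corr[i];
    }
    return next;
}

int main() {
    const std::vector<float> psf = {0.25f, 0.5f, 0.25f};           // blur kernel
    const std::vector<float> d   = {0.f, 0.25f, 0.5f, 0.25f, 0.f}; // blurred point
    std::vector<float> u = d;                                      // initial estimate
    for (int it = 0; it < 100; it++) {
        u = rlStep(u, d, psf);
    }
    for (float v : u) {
        printf("%.3f ", v);  // the estimate sharpens back toward the central pixel
    }
    printf("\n");
    return 0;
}
```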

Biggs-Andrews Acceleration

92 | This is an acceleration of the Richardson-Lucy algorithm proposed by Biggs and Andrews. Let the original Richardson-Lucy iteration be \(\phi\), namely 93 | 94 | \[u_{t + 1} = \phi(u_t) = u_t \cdot \left( \frac{d}{u_t * p} * \hat{p} \right)\] 95 | 96 | If an image is viewed as a point in a high-dimensional space, the optimization process iteratively searches for the point with maximum likelihood. The original Richardson-Lucy algorithm raises the likelihood by applying \(\phi\) once per iteration. To accelerate it, Biggs and Andrews proposed stepping along a direction \(h\), scaled by an acceleration factor \(\alpha\), between two consecutive applications of \(\phi\). That is, after an iteration we obtain 97 | 98 | \[ x_k = \phi(y_{k-1}) \] 99 | 100 | The direction \(h\) is computed as 101 | 102 | \[ h_k = x_k - x_{k-1} \] 103 | 104 | and the acceleration factor \(\alpha\) as 105 | 106 | \[ \alpha_k = \frac{(x_k - y_{k-1}) \cdot (x_{k-1} - y_{k-2})}{\lVert x_{k-1} - y_{k-2} \rVert_2^2} \] 107 | 108 | Then, between the iterates \(x_k\) and \(x_{k+1}\), an intermediate prediction \(y_k\) is computed: 109 | 110 | \[ y_k = x_k + \alpha_k h_k \] 111 | 112 | This is the first-order acceleration. It can be extended to second- or higher-order acceleration, but only the first-order acceleration is implemented for now. 113 | 114 |
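The prediction step above is small enough to spell out in code. The following is a hedged C++ sketch of the first-order Biggs-Andrews update, not DeconvRL's CUDA kernel; the function name accelerate and the clamping of \(\alpha\) into \([0, 1)\), a common stabilization in practice, are assumptions of this example.

    // Minimal sketch (not DeconvRL's CUDA code): one first-order Biggs-Andrews
    // prediction step; phi (a single Richardson-Lucy update) is applied elsewhere.
    #include <cstddef>
    #include <vector>

    std::vector<double> accelerate(const std::vector<double> &x_k,    // x_k = phi(y_{k-1})
                                   const std::vector<double> &x_km1,  // previous iterate x_{k-1}
                                   const std::vector<double> &y_km1,  // previous prediction y_{k-1}
                                   const std::vector<double> &y_km2)  // prediction y_{k-2}
    {
        const std::size_t n = x_k.size();

        // g_{k-1} = x_k - y_{k-1} and g_{k-2} = x_{k-1} - y_{k-2}
        double num = 0.0, den = 0.0;
        for (std::size_t i = 0; i < n; ++i) {
            const double g1 = x_k[i] - y_km1[i];
            const double g2 = x_km1[i] - y_km2[i];
            num += g1 * g2;
            den += g2 * g2;
        }

        // alpha_k, clamped into [0, 1) to keep the extrapolation stable (assumption)
        double alpha = (den > 0.0) ? num / den : 0.0;
        if (alpha < 0.0) alpha = 0.0;
        if (alpha > 0.99) alpha = 0.99;

        // y_k = x_k + alpha_k * h_k with h_k = x_k - x_{k-1}
        std::vector<double> y_k(n);
        for (std::size_t i = 0; i < n; ++i)
            y_k[i] = x_k[i] + alpha * (x_k[i] - x_km1[i]);
        return y_k;
    }

Each outer iteration then applies \(\phi\) to the predicted point \(y_k\) instead of to \(x_k\), which is where the speed-up comes from.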

Total Variation Regularization

115 | 116 | The original Richardson-Lucy algorithm converges toward a maximum-likelihood image, but that image can be noisy. A total-variation loss can therefore be added as regularization. After some further derivation, it turns out that only a small modification of the original iteration is required: 117 | 118 | \[u_{t + 1} = \phi(u_t) = \frac{u_{t}}{1 - \lambda \, \mathrm{div}\!\left(\frac{\nabla u_t}{|\nabla u_t|}\right)} \cdot \left( \frac{d}{u_{t} * p} * \hat{p} \right)\] 119 | 120 |
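As an illustration of the extra factor, the sketch below computes \(1 / (1 - \lambda\,\mathrm{div}(\nabla u_t / |\nabla u_t|))\) for a 2-D image with a standard forward/backward finite-difference discretization. This is an assumption-laden example (the helper name tv_factor, the eps guard, and the choice of differences are ours), not the library's CUDA implementation.

    // Minimal sketch (not DeconvRL's CUDA code): total-variation factor
    // 1 / (1 - lambda * div(grad(u) / |grad(u)|)) on a 2-D image, using forward
    // differences for the gradient and backward differences for the divergence.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<double> tv_factor(const std::vector<double> &u,
                                  std::size_t nx, std::size_t ny,
                                  double lambda, double eps = 1e-8) {
        auto at = [&](std::size_t x, std::size_t y) { return u[y * nx + x]; };

        // normalized forward-difference gradient
        std::vector<double> gx(u.size(), 0.0), gy(u.size(), 0.0);
        for (std::size_t y = 0; y < ny; ++y)
            for (std::size_t x = 0; x < nx; ++x) {
                const double dx = (x + 1 < nx) ? at(x + 1, y) - at(x, y) : 0.0;
                const double dy = (y + 1 < ny) ? at(x, y + 1) - at(x, y) : 0.0;
                const double norm = std::sqrt(dx * dx + dy * dy) + eps;
                gx[y * nx + x] = dx / norm;
                gy[y * nx + x] = dy / norm;
            }

        // backward-difference divergence and the multiplicative factor
        // (assumes lambda is small enough that the denominator stays positive)
        std::vector<double> factor(u.size(), 1.0);
        for (std::size_t y = 0; y < ny; ++y)
            for (std::size_t x = 0; x < nx; ++x) {
                const double div =
                    (x > 0 ? gx[y * nx + x] - gx[y * nx + x - 1] : gx[y * nx + x]) +
                    (y > 0 ? gy[y * nx + x] - gy[(y - 1) * nx + x] : gy[y * nx + x]);
                factor[y * nx + x] = 1.0 / (1.0 - lambda * div);
            }
        return factor;
    }

In the regularized iteration, the Richardson-Lucy update is simply multiplied element-wise by this factor.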

Benchmark

121 | 122 |
123 |
124 | 126 |
Original Image
127 |
128 |
129 | 130 |
131 |
132 | 134 |
After 10 updates
135 |
136 |
137 | 139 |
After 100 updates
140 |
141 |
142 | 144 |
After 1000 updates
145 |
146 |
147 | 148 | 150 | 151 |

Original Richardson-Lucy Algorithm

152 | 153 |
154 |
155 | 157 |
After 10 updates
158 |
159 |
160 | 162 |
After 100 updates
163 |
164 |
165 | 167 |
After 1000 updates
168 |
169 |
170 | 171 |

Biggs-Andrews Acceleration

172 | 173 |
174 |
175 | 177 |
After 10 updates
178 |
179 |
180 | 182 |
After 100 updates
183 |
184 |
185 | 187 |
After 1000 updates
188 |
189 |
190 | 191 | 192 | 193 |

References

194 |
    195 |
  • William Hadley Richardson (1972), "Bayesian-Based Iterative Method of Image Restoration*," J. Opt. Soc. Am. 62, 55-59.
  • 196 |
  • Lucy, L. B. (1974). "An iterative technique for the rectification of observed distributions". Astronomical Journal. 79 (6): 745–754.
  • 197 |
  • Biggs, D. S., & Andrews, M. (1997). Acceleration of iterative image restoration algorithms. Applied optics, 36(8), 1766-1775.
  • 198 |
  • Dey, N., Blanc-Féraud, L., Zimmer, C., Roux, P., Kam, Z., Olivo-Marin, J. C., & Zerubia, J. (2004). 3D microscopy deconvolution using Richardson-Lucy algorithm with total variation regularization (Doctoral dissertation, INRIA).
  • 199 |
200 |
201 | 202 | Fork me on GitHub 205 | 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /src/PSF.cu: -------------------------------------------------------------------------------- 1 | // corresponded header file 2 | // necessary project headers 3 | #include "DeconvRLImpl.cuh" 4 | #include "Helper.cuh" 5 | #include "DumpData.cuh" 6 | // 3rd party libraries headers 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define cimg_use_tiff 17 | #include "CImg.h" 18 | using namespace cimg_library; 19 | // standard libraries headers 20 | #include 21 | #include 22 | // system headers 23 | 24 | namespace DeconvRL { 25 | 26 | namespace PSF { 27 | 28 | namespace { 29 | 30 | cudaArray_t psfRes = nullptr; 31 | texture psfTexRef; 32 | 33 | struct SubConstant 34 | : public thrust::unary_function { 35 | SubConstant(const float c_) 36 | : c(c_) { 37 | } 38 | 39 | __host__ __device__ 40 | float operator()(const float &p) const { 41 | float o = p-c; 42 | return (o < 0) ? 0 : o; 43 | } 44 | 45 | private: 46 | const float c; 47 | }; 48 | 49 | __global__ 50 | void createGrid_kernel( 51 | int3 *d_grid, 52 | const size_t nx, const size_t ny, const size_t nz 53 | ) { 54 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 55 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 56 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 57 | 58 | // skip out-of-bound threads 59 | if (ix >= nx or iy >= ny or iz >= nz) { 60 | return; 61 | } 62 | 63 | int idx = iz * (nx*ny) + iy * nx + ix; 64 | d_grid[idx] = make_int3(ix, iy, iz); 65 | } 66 | 67 | struct MultiplyWeighting 68 | : public thrust::unary_function { 69 | MultiplyWeighting( 70 | const float *data, 71 | const size_t nx_, const size_t ny_, const size_t nz_ 72 | ) 73 | : d_weight(data), nx(nx_), ny(ny_), nz(nz_) { 74 | } 75 | 76 | __host__ __device__ 77 | float4 operator()(const int3 &p) const { 78 | const int idx = p.z * (nx*ny) + p.y * nx + p.x; 79 | const float w = d_weight[idx]; 80 | return make_float4(p.x*w, p.y*w, p.z*w, w); 81 | } 82 | 83 | private: 84 | const float *d_weight; 85 | size_t nx, ny, nz; 86 | }; 87 | 88 | __global__ 89 | void simpleClone_kenel( 90 | float *d_tmp, 91 | const size_t nx, const size_t ny, const size_t nz, 92 | const float *d_psf, 93 | const size_t npx, const size_t npy, const size_t npz 94 | ) { 95 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 96 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 97 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 98 | 99 | // skip out-of-bound threads 100 | if (ix >= nx or iy >= ny or iz >= nz) { 101 | return; 102 | } 103 | 104 | int oidx = iz * (nx*ny) + iy * nx + ix; 105 | // rest of the space is filled with zero 106 | if (ix >= npx or iy >= npy or iz >= npz) { 107 | d_tmp[oidx] = 0.0f; 108 | } else { 109 | int iidx = iz * (npx*npy) + iy * npx + ix; 110 | d_tmp[oidx] = d_psf[iidx]; 111 | } 112 | } 113 | 114 | __global__ 115 | void alignCenter_kernel( 116 | float *odata, 117 | const size_t nx, const size_t ny, const size_t nz, 118 | const float ox, const float oy, const float oz 119 | ) { 120 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 121 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 122 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 123 | 124 | // skip out-of-bound threads 125 | if (ix >= nx or iy >= ny or iz >= nz) { 126 | return; 127 | } 128 | 129 | // normalized coordinate 130 | float fx = (ix+ox+0.5f) / nx; 131 | float fy = (iy+oy+0.5f) / ny; 132 | float fz = 
(iz+oz+0.5f) / nz; 133 | 134 | // sampling from the texture 135 | // (coordinates are backtracked to the deviated ones) 136 | int idx = iz * (nx*ny) + iy * nx + ix; 137 | odata[idx] = tex3D(psfTexRef, fx, fy, fz); 138 | } 139 | 140 | __global__ 141 | void fftshift3_kernel( 142 | float *odata, 143 | const float *idata, 144 | const size_t nx, const size_t ny, const size_t nz 145 | ) { 146 | int ix = blockIdx.x*blockDim.x + threadIdx.x; 147 | int iy = blockIdx.y*blockDim.y + threadIdx.y; 148 | int iz = blockIdx.z*blockDim.z + threadIdx.z; 149 | 150 | // skip out-of-bound threads 151 | if (ix >= nx or iy >= ny or iz >= nz) { 152 | return; 153 | } 154 | 155 | // input linear index 156 | const int iidx = iz * (nx*ny) + iy * nx + ix; 157 | 158 | // calculate wrap-around (x, y, z) 159 | // ... advance half cycle 160 | ix += nx/2, iy += ny/2, iz += nz/2; 161 | // ... wrap-around 162 | ix %= nx, iy %= ny, iz %= nz; 163 | // output linear index 164 | const int oidx = iz * (nx*ny) + iy * nx + ix; 165 | 166 | odata[oidx] = idata[iidx]; 167 | } 168 | 169 | } 170 | 171 | PSF::PSF( 172 | float *h_psf, 173 | const size_t npx_, const size_t npy_, const size_t npz_ 174 | ) : npx(npx_), npy(npy_), npz(npz_) { 175 | nelem = npx * npy * npz; 176 | 177 | // create PSF memory space on device 178 | const size_t size = nelem * sizeof(float); 179 | cudaErrChk(cudaMalloc(&d_psf, size)); 180 | // copy from host to device 181 | cudaErrChk(cudaMemcpy(d_psf, h_psf, size, cudaMemcpyHostToDevice)); 182 | } 183 | 184 | PSF::~PSF() { 185 | cudaErrChk(cudaFree(d_psf)); 186 | } 187 | 188 | void PSF::alignCenter(const size_t nx, const size_t ny, const size_t nz) { 189 | float3 centroid = findCentroid(); 190 | fprintf( 191 | stderr, 192 | "[INF] centroid = (%.2f, %.2f, %.2f)\n", 193 | centroid.x, centroid.y, centroid.z 194 | ); 195 | 196 | // simple clone is performed, so we can reuse the centroid location 197 | if (nx != npx or ny != npy or nz != npz) { 198 | padPSF(nx, ny, nz); 199 | } 200 | 201 | /* 202 | * Bind the data source to the texture. 203 | */ 204 | // create cudaArray for the texture. 205 | cudaChannelFormatDesc desc = cudaCreateChannelDesc( 206 | 32, 0, 0, 0, cudaChannelFormatKindFloat 207 | ); 208 | cudaExtent extent = make_cudaExtent(npx, npy, npz); 209 | cudaErrChk(cudaMalloc3DArray( 210 | &psfRes, 211 | &desc, // pixel channel description 212 | extent, // array dimension 213 | cudaArrayDefault 214 | )); 215 | 216 | // copy the data to cudaArray_t 217 | cudaMemcpy3DParms parms = {0}; 218 | parms.srcPtr = make_cudaPitchedPtr(d_psf, npx * sizeof(float), npx, npy); 219 | parms.dstArray = psfRes; 220 | parms.extent = extent; 221 | parms.kind = cudaMemcpyDeviceToDevice; 222 | cudaErrChk(cudaMemcpy3D(&parms)); 223 | 224 | // reconfigure the texture 225 | psfTexRef.normalized = true; 226 | // sampled data is interpolated 227 | psfTexRef.filterMode = cudaFilterModeLinear; 228 | // wrap around the texture if exceeds border limit 229 | psfTexRef.addressMode[0] = cudaAddressModeWrap; 230 | psfTexRef.addressMode[1] = cudaAddressModeWrap; 231 | psfTexRef.addressMode[2] = cudaAddressModeWrap; 232 | 233 | // start the binding 234 | cudaErrChk(cudaBindTextureToArray(psfTexRef, psfRes)); 235 | 236 | /* 237 | * Execute the alignment kernel. 
238 | */ 239 | // coordinate of the center of the volume 240 | const float3 center = make_float3( 241 | (npx-1)/2.0f, (npy-1)/2.0f, (npz-1)/2.0f 242 | ); 243 | // offset 244 | const float3 offset = centroid - center; 245 | fprintf(stderr, "[DBG] offset = (%.2f, %.2f, %.2f)\n", offset.x, offset.y, offset.z); 246 | 247 | // begin resample the kernel 248 | dim3 nthreads(16, 16, 4); 249 | dim3 nblocks( 250 | DIVUP(npx, nthreads.x), DIVUP(npy, nthreads.y), DIVUP(npz, nthreads.z) 251 | ); 252 | alignCenter_kernel<<>>( 253 | d_psf, 254 | npx, npy, npz, 255 | offset.x, offset.y, offset.z 256 | ); 257 | cudaErrChk(cudaPeekAtLastError()); 258 | 259 | /* 260 | * Release the resources. 261 | */ 262 | cudaErrChk(cudaUnbindTexture(psfTexRef)); 263 | cudaErrChk(cudaFreeArray(psfRes)); 264 | 265 | DumpData::Device::real("psf_aligned.tif", d_psf, npx, npy, npz); 266 | } 267 | 268 | void PSF::createOTF(cufftComplex *d_otf) { 269 | /* 270 | * Prepare FFT environment. 271 | */ 272 | cufftHandle otfHdl; 273 | cudaErrChk(cufftPlan3d(&otfHdl, npz, npy, npx, CUFFT_R2C)); 274 | // estimate resource requirements 275 | size_t size; 276 | cudaErrChk(cufftGetSize3d(otfHdl, npz, npy, npx, CUFFT_R2C, &size)); 277 | fprintf(stderr, "[DBG] require %ld bytes to generate an OTF\n", size); 278 | 279 | /* 280 | * Execute the conversion. 281 | */ 282 | // temporary memory space to hold the shifted PSF 283 | float *d_tmp; 284 | cudaErrChk(cudaMalloc(&d_tmp, nelem * sizeof(float))); 285 | cudaErrChk(cudaMemcpy(d_tmp, d_psf, nelem * sizeof(float), cudaMemcpyDeviceToDevice)); 286 | 287 | // fftshift 288 | dim3 nthreads(16, 16, 4); 289 | dim3 nblocks( 290 | DIVUP(npx, nthreads.x), DIVUP(npy, nthreads.y), DIVUP(npz, nthreads.z) 291 | ); 292 | fftshift3_kernel<<>>(d_tmp, d_psf, npx, npy, npz); 293 | 294 | cudaErrChk(cufftExecR2C(otfHdl, d_tmp, d_otf)); 295 | 296 | // release FFT resource 297 | cudaErrChk(cudaFree(d_tmp)); 298 | cudaErrChk(cufftDestroy(otfHdl)); 299 | 300 | DumpData::Device::complex("otf_dump.tif", d_otf, npx/2+1, npy, npz); 301 | } 302 | 303 | // center the PSF to its potential centroid 304 | float3 PSF::findCentroid() { 305 | /* 306 | * Create temporary PSF to find the centroid. 307 | */ 308 | float *d_tmp; 309 | const size_t size = nelem * sizeof(float); 310 | cudaErrChk(cudaMalloc(&d_tmp, size)); 311 | // copy the raw PSF to temporary PSF 312 | cudaErrChk(cudaMemcpy(d_tmp, d_psf, size, cudaMemcpyDeviceToDevice)); 313 | 314 | // background value is clamped in [0, +inf) 315 | const float bkgVal = estimateBackground(); 316 | fprintf(stderr, "[INF] PSF background value is %.2f\n", bkgVal); 317 | thrust::transform( 318 | thrust::device, 319 | d_tmp, d_tmp+nelem, 320 | d_tmp, 321 | SubConstant(bkgVal) 322 | ); 323 | 324 | /* 325 | * Generate 3-D grid for weighting. 326 | */ 327 | int3 *d_grid; 328 | cudaErrChk(cudaMalloc(&d_grid, nelem * sizeof(int3))); 329 | dim3 nthreads(16, 16, 4); 330 | dim3 nblocks( 331 | DIVUP(npx, nthreads.x), DIVUP(npy, nthreads.y), DIVUP(npz, nthreads.z) 332 | ); 333 | createGrid_kernel<<>>(d_grid, npx, npy, npz); 334 | cudaErrChk(cudaPeekAtLastError()); 335 | 336 | /* 337 | * Calculate the centroid along weighted grid points using cleaned PSF. 
338 | */ 339 | float4 result = thrust::transform_reduce( 340 | thrust::device, 341 | d_grid, d_grid+nelem, 342 | MultiplyWeighting(d_tmp, npx, npy, npz), 343 | make_float4(0), 344 | thrust::plus() 345 | ); 346 | 347 | float3 centroid = make_float3( 348 | result.x/result.w, result.y/result.w, result.z/result.w 349 | ); 350 | 351 | // free the weight computation resources 352 | cudaErrChk(cudaFree(d_grid)); 353 | cudaErrChk(cudaFree(d_tmp)); 354 | 355 | return centroid; 356 | } 357 | 358 | float PSF::estimateBackground() { 359 | float sum = thrust::reduce( 360 | thrust::device, 361 | d_psf, d_psf+nelem, 362 | 0, 363 | thrust::plus() 364 | ); 365 | return sum/nelem; 366 | } 367 | 368 | void PSF::padPSF(const size_t nx, const size_t ny, const size_t nz) { 369 | // verify the size requirement 370 | if (nx < npx or ny < npy or nz < npz) { 371 | throw std::range_error( 372 | "volume has to be greater or equal than the original PSF" 373 | ); 374 | } 375 | 376 | // create new PSF memory space with the larger size 377 | float *d_tmp; 378 | cudaErrChk(cudaMalloc( 379 | &d_tmp, 380 | nx * ny * nz * sizeof(float) 381 | )); 382 | 383 | // copy the original PSF to the larger space at (0, 0) 384 | dim3 nthreads(16, 16, 4); 385 | dim3 nblocks( 386 | DIVUP(nx, nthreads.x), DIVUP(ny, nthreads.y), DIVUP(nz, nthreads.z) 387 | ); 388 | simpleClone_kenel<<>>( 389 | d_tmp, 390 | nx, ny, nz, 391 | d_psf, 392 | npx, npy, npz 393 | ); 394 | // swap the pointer and free the smaller space 395 | std::swap(d_tmp, d_psf); 396 | cudaErrChk(cudaFree(d_tmp)); 397 | 398 | // update PSF size 399 | npx = nx; 400 | npy = ny; 401 | npz = nz; 402 | nelem = npx * npy * npz; 403 | fprintf(stderr, "[DBG] update PSF size is %ldx%ldx%ld\n", npx, npy, npz); 404 | } 405 | 406 | } 407 | 408 | } 409 | -------------------------------------------------------------------------------- /src/HelperMath.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | * This file implements common mathematical operations on vector types 14 | * (float3, float4 etc.) since these are not provided as standard by CUDA. 15 | * 16 | * The syntax is modeled on the Cg standard library. 17 | * 18 | * This is part of the Helper library includes 19 | * 20 | * Thanks to Linh Hah for additions and fixes. 21 | */ 22 | 23 | #ifndef HELPER_MATH_H 24 | #define HELPER_MATH_H 25 | 26 | #include 27 | 28 | typedef unsigned int uint; 29 | typedef unsigned short ushort; 30 | 31 | #ifndef __CUDACC__ 32 | #include 33 | 34 | //////////////////////////////////////////////////////////////////////////////// 35 | // host implementations of CUDA functions 36 | //////////////////////////////////////////////////////////////////////////////// 37 | 38 | inline float fminf(float a, float b) 39 | { 40 | return a < b ? a : b; 41 | } 42 | 43 | inline float fmaxf(float a, float b) 44 | { 45 | return a > b ? a : b; 46 | } 47 | 48 | inline int max(int a, int b) 49 | { 50 | return a > b ? a : b; 51 | } 52 | 53 | inline int min(int a, int b) 54 | { 55 | return a < b ? 
a : b; 56 | } 57 | 58 | inline float rsqrtf(float x) 59 | { 60 | return 1.0f / sqrtf(x); 61 | } 62 | #endif 63 | 64 | //////////////////////////////////////////////////////////////////////////////// 65 | // constructors 66 | //////////////////////////////////////////////////////////////////////////////// 67 | 68 | inline __host__ __device__ float2 make_float2(float s) 69 | { 70 | return make_float2(s, s); 71 | } 72 | inline __host__ __device__ float2 make_float2(float3 a) 73 | { 74 | return make_float2(a.x, a.y); 75 | } 76 | inline __host__ __device__ float2 make_float2(int2 a) 77 | { 78 | return make_float2(float(a.x), float(a.y)); 79 | } 80 | inline __host__ __device__ float2 make_float2(uint2 a) 81 | { 82 | return make_float2(float(a.x), float(a.y)); 83 | } 84 | 85 | inline __host__ __device__ int2 make_int2(int s) 86 | { 87 | return make_int2(s, s); 88 | } 89 | inline __host__ __device__ int2 make_int2(int3 a) 90 | { 91 | return make_int2(a.x, a.y); 92 | } 93 | inline __host__ __device__ int2 make_int2(uint2 a) 94 | { 95 | return make_int2(int(a.x), int(a.y)); 96 | } 97 | inline __host__ __device__ int2 make_int2(float2 a) 98 | { 99 | return make_int2(int(a.x), int(a.y)); 100 | } 101 | 102 | inline __host__ __device__ uint2 make_uint2(uint s) 103 | { 104 | return make_uint2(s, s); 105 | } 106 | inline __host__ __device__ uint2 make_uint2(uint3 a) 107 | { 108 | return make_uint2(a.x, a.y); 109 | } 110 | inline __host__ __device__ uint2 make_uint2(int2 a) 111 | { 112 | return make_uint2(uint(a.x), uint(a.y)); 113 | } 114 | 115 | inline __host__ __device__ float3 make_float3(float s) 116 | { 117 | return make_float3(s, s, s); 118 | } 119 | inline __host__ __device__ float3 make_float3(float2 a) 120 | { 121 | return make_float3(a.x, a.y, 0.0f); 122 | } 123 | inline __host__ __device__ float3 make_float3(float2 a, float s) 124 | { 125 | return make_float3(a.x, a.y, s); 126 | } 127 | inline __host__ __device__ float3 make_float3(float4 a) 128 | { 129 | return make_float3(a.x, a.y, a.z); 130 | } 131 | inline __host__ __device__ float3 make_float3(int3 a) 132 | { 133 | return make_float3(float(a.x), float(a.y), float(a.z)); 134 | } 135 | inline __host__ __device__ float3 make_float3(uint3 a) 136 | { 137 | return make_float3(float(a.x), float(a.y), float(a.z)); 138 | } 139 | 140 | inline __host__ __device__ int3 make_int3(int s) 141 | { 142 | return make_int3(s, s, s); 143 | } 144 | inline __host__ __device__ int3 make_int3(int2 a) 145 | { 146 | return make_int3(a.x, a.y, 0); 147 | } 148 | inline __host__ __device__ int3 make_int3(int2 a, int s) 149 | { 150 | return make_int3(a.x, a.y, s); 151 | } 152 | inline __host__ __device__ int3 make_int3(uint3 a) 153 | { 154 | return make_int3(int(a.x), int(a.y), int(a.z)); 155 | } 156 | inline __host__ __device__ int3 make_int3(float3 a) 157 | { 158 | return make_int3(int(a.x), int(a.y), int(a.z)); 159 | } 160 | 161 | inline __host__ __device__ uint3 make_uint3(uint s) 162 | { 163 | return make_uint3(s, s, s); 164 | } 165 | inline __host__ __device__ uint3 make_uint3(uint2 a) 166 | { 167 | return make_uint3(a.x, a.y, 0); 168 | } 169 | inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) 170 | { 171 | return make_uint3(a.x, a.y, s); 172 | } 173 | inline __host__ __device__ uint3 make_uint3(uint4 a) 174 | { 175 | return make_uint3(a.x, a.y, a.z); 176 | } 177 | inline __host__ __device__ uint3 make_uint3(int3 a) 178 | { 179 | return make_uint3(uint(a.x), uint(a.y), uint(a.z)); 180 | } 181 | 182 | inline __host__ __device__ float4 make_float4(float 
s) 183 | { 184 | return make_float4(s, s, s, s); 185 | } 186 | inline __host__ __device__ float4 make_float4(float3 a) 187 | { 188 | return make_float4(a.x, a.y, a.z, 0.0f); 189 | } 190 | inline __host__ __device__ float4 make_float4(float3 a, float w) 191 | { 192 | return make_float4(a.x, a.y, a.z, w); 193 | } 194 | inline __host__ __device__ float4 make_float4(int4 a) 195 | { 196 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 197 | } 198 | inline __host__ __device__ float4 make_float4(uint4 a) 199 | { 200 | return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); 201 | } 202 | 203 | inline __host__ __device__ int4 make_int4(int s) 204 | { 205 | return make_int4(s, s, s, s); 206 | } 207 | inline __host__ __device__ int4 make_int4(int3 a) 208 | { 209 | return make_int4(a.x, a.y, a.z, 0); 210 | } 211 | inline __host__ __device__ int4 make_int4(int3 a, int w) 212 | { 213 | return make_int4(a.x, a.y, a.z, w); 214 | } 215 | inline __host__ __device__ int4 make_int4(uint4 a) 216 | { 217 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 218 | } 219 | inline __host__ __device__ int4 make_int4(float4 a) 220 | { 221 | return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); 222 | } 223 | 224 | 225 | inline __host__ __device__ uint4 make_uint4(uint s) 226 | { 227 | return make_uint4(s, s, s, s); 228 | } 229 | inline __host__ __device__ uint4 make_uint4(uint3 a) 230 | { 231 | return make_uint4(a.x, a.y, a.z, 0); 232 | } 233 | inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) 234 | { 235 | return make_uint4(a.x, a.y, a.z, w); 236 | } 237 | inline __host__ __device__ uint4 make_uint4(int4 a) 238 | { 239 | return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); 240 | } 241 | 242 | //////////////////////////////////////////////////////////////////////////////// 243 | // negate 244 | //////////////////////////////////////////////////////////////////////////////// 245 | 246 | inline __host__ __device__ float2 operator-(float2 &a) 247 | { 248 | return make_float2(-a.x, -a.y); 249 | } 250 | inline __host__ __device__ int2 operator-(int2 &a) 251 | { 252 | return make_int2(-a.x, -a.y); 253 | } 254 | inline __host__ __device__ float3 operator-(float3 &a) 255 | { 256 | return make_float3(-a.x, -a.y, -a.z); 257 | } 258 | inline __host__ __device__ int3 operator-(int3 &a) 259 | { 260 | return make_int3(-a.x, -a.y, -a.z); 261 | } 262 | inline __host__ __device__ float4 operator-(float4 &a) 263 | { 264 | return make_float4(-a.x, -a.y, -a.z, -a.w); 265 | } 266 | inline __host__ __device__ int4 operator-(int4 &a) 267 | { 268 | return make_int4(-a.x, -a.y, -a.z, -a.w); 269 | } 270 | 271 | //////////////////////////////////////////////////////////////////////////////// 272 | // addition 273 | //////////////////////////////////////////////////////////////////////////////// 274 | 275 | inline __host__ __device__ float2 operator+(float2 a, float2 b) 276 | { 277 | return make_float2(a.x + b.x, a.y + b.y); 278 | } 279 | inline __host__ __device__ void operator+=(float2 &a, float2 b) 280 | { 281 | a.x += b.x; 282 | a.y += b.y; 283 | } 284 | inline __host__ __device__ float2 operator+(float2 a, float b) 285 | { 286 | return make_float2(a.x + b, a.y + b); 287 | } 288 | inline __host__ __device__ float2 operator+(float b, float2 a) 289 | { 290 | return make_float2(a.x + b, a.y + b); 291 | } 292 | inline __host__ __device__ void operator+=(float2 &a, float b) 293 | { 294 | a.x += b; 295 | a.y += b; 296 | } 297 | 298 | inline __host__ __device__ int2 operator+(int2 a, 
int2 b) 299 | { 300 | return make_int2(a.x + b.x, a.y + b.y); 301 | } 302 | inline __host__ __device__ void operator+=(int2 &a, int2 b) 303 | { 304 | a.x += b.x; 305 | a.y += b.y; 306 | } 307 | inline __host__ __device__ int2 operator+(int2 a, int b) 308 | { 309 | return make_int2(a.x + b, a.y + b); 310 | } 311 | inline __host__ __device__ int2 operator+(int b, int2 a) 312 | { 313 | return make_int2(a.x + b, a.y + b); 314 | } 315 | inline __host__ __device__ void operator+=(int2 &a, int b) 316 | { 317 | a.x += b; 318 | a.y += b; 319 | } 320 | 321 | inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) 322 | { 323 | return make_uint2(a.x + b.x, a.y + b.y); 324 | } 325 | inline __host__ __device__ void operator+=(uint2 &a, uint2 b) 326 | { 327 | a.x += b.x; 328 | a.y += b.y; 329 | } 330 | inline __host__ __device__ uint2 operator+(uint2 a, uint b) 331 | { 332 | return make_uint2(a.x + b, a.y + b); 333 | } 334 | inline __host__ __device__ uint2 operator+(uint b, uint2 a) 335 | { 336 | return make_uint2(a.x + b, a.y + b); 337 | } 338 | inline __host__ __device__ void operator+=(uint2 &a, uint b) 339 | { 340 | a.x += b; 341 | a.y += b; 342 | } 343 | 344 | 345 | inline __host__ __device__ float3 operator+(float3 a, float3 b) 346 | { 347 | return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); 348 | } 349 | inline __host__ __device__ void operator+=(float3 &a, float3 b) 350 | { 351 | a.x += b.x; 352 | a.y += b.y; 353 | a.z += b.z; 354 | } 355 | inline __host__ __device__ float3 operator+(float3 a, float b) 356 | { 357 | return make_float3(a.x + b, a.y + b, a.z + b); 358 | } 359 | inline __host__ __device__ void operator+=(float3 &a, float b) 360 | { 361 | a.x += b; 362 | a.y += b; 363 | a.z += b; 364 | } 365 | 366 | inline __host__ __device__ int3 operator+(int3 a, int3 b) 367 | { 368 | return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); 369 | } 370 | inline __host__ __device__ void operator+=(int3 &a, int3 b) 371 | { 372 | a.x += b.x; 373 | a.y += b.y; 374 | a.z += b.z; 375 | } 376 | inline __host__ __device__ int3 operator+(int3 a, int b) 377 | { 378 | return make_int3(a.x + b, a.y + b, a.z + b); 379 | } 380 | inline __host__ __device__ void operator+=(int3 &a, int b) 381 | { 382 | a.x += b; 383 | a.y += b; 384 | a.z += b; 385 | } 386 | 387 | inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) 388 | { 389 | return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); 390 | } 391 | inline __host__ __device__ void operator+=(uint3 &a, uint3 b) 392 | { 393 | a.x += b.x; 394 | a.y += b.y; 395 | a.z += b.z; 396 | } 397 | inline __host__ __device__ uint3 operator+(uint3 a, uint b) 398 | { 399 | return make_uint3(a.x + b, a.y + b, a.z + b); 400 | } 401 | inline __host__ __device__ void operator+=(uint3 &a, uint b) 402 | { 403 | a.x += b; 404 | a.y += b; 405 | a.z += b; 406 | } 407 | 408 | inline __host__ __device__ int3 operator+(int b, int3 a) 409 | { 410 | return make_int3(a.x + b, a.y + b, a.z + b); 411 | } 412 | inline __host__ __device__ uint3 operator+(uint b, uint3 a) 413 | { 414 | return make_uint3(a.x + b, a.y + b, a.z + b); 415 | } 416 | inline __host__ __device__ float3 operator+(float b, float3 a) 417 | { 418 | return make_float3(a.x + b, a.y + b, a.z + b); 419 | } 420 | 421 | inline __host__ __device__ float4 operator+(float4 a, float4 b) 422 | { 423 | return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 424 | } 425 | inline __host__ __device__ void operator+=(float4 &a, float4 b) 426 | { 427 | a.x += b.x; 428 | a.y += b.y; 429 | a.z += b.z; 430 | a.w += b.w; 431 | } 432 | 
inline __host__ __device__ float4 operator+(float4 a, float b) 433 | { 434 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 435 | } 436 | inline __host__ __device__ float4 operator+(float b, float4 a) 437 | { 438 | return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 439 | } 440 | inline __host__ __device__ void operator+=(float4 &a, float b) 441 | { 442 | a.x += b; 443 | a.y += b; 444 | a.z += b; 445 | a.w += b; 446 | } 447 | 448 | inline __host__ __device__ int4 operator+(int4 a, int4 b) 449 | { 450 | return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 451 | } 452 | inline __host__ __device__ void operator+=(int4 &a, int4 b) 453 | { 454 | a.x += b.x; 455 | a.y += b.y; 456 | a.z += b.z; 457 | a.w += b.w; 458 | } 459 | inline __host__ __device__ int4 operator+(int4 a, int b) 460 | { 461 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 462 | } 463 | inline __host__ __device__ int4 operator+(int b, int4 a) 464 | { 465 | return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); 466 | } 467 | inline __host__ __device__ void operator+=(int4 &a, int b) 468 | { 469 | a.x += b; 470 | a.y += b; 471 | a.z += b; 472 | a.w += b; 473 | } 474 | 475 | inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) 476 | { 477 | return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); 478 | } 479 | inline __host__ __device__ void operator+=(uint4 &a, uint4 b) 480 | { 481 | a.x += b.x; 482 | a.y += b.y; 483 | a.z += b.z; 484 | a.w += b.w; 485 | } 486 | inline __host__ __device__ uint4 operator+(uint4 a, uint b) 487 | { 488 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 489 | } 490 | inline __host__ __device__ uint4 operator+(uint b, uint4 a) 491 | { 492 | return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); 493 | } 494 | inline __host__ __device__ void operator+=(uint4 &a, uint b) 495 | { 496 | a.x += b; 497 | a.y += b; 498 | a.z += b; 499 | a.w += b; 500 | } 501 | 502 | //////////////////////////////////////////////////////////////////////////////// 503 | // subtract 504 | //////////////////////////////////////////////////////////////////////////////// 505 | 506 | inline __host__ __device__ float2 operator-(float2 a, float2 b) 507 | { 508 | return make_float2(a.x - b.x, a.y - b.y); 509 | } 510 | inline __host__ __device__ void operator-=(float2 &a, float2 b) 511 | { 512 | a.x -= b.x; 513 | a.y -= b.y; 514 | } 515 | inline __host__ __device__ float2 operator-(float2 a, float b) 516 | { 517 | return make_float2(a.x - b, a.y - b); 518 | } 519 | inline __host__ __device__ float2 operator-(float b, float2 a) 520 | { 521 | return make_float2(b - a.x, b - a.y); 522 | } 523 | inline __host__ __device__ void operator-=(float2 &a, float b) 524 | { 525 | a.x -= b; 526 | a.y -= b; 527 | } 528 | 529 | inline __host__ __device__ int2 operator-(int2 a, int2 b) 530 | { 531 | return make_int2(a.x - b.x, a.y - b.y); 532 | } 533 | inline __host__ __device__ void operator-=(int2 &a, int2 b) 534 | { 535 | a.x -= b.x; 536 | a.y -= b.y; 537 | } 538 | inline __host__ __device__ int2 operator-(int2 a, int b) 539 | { 540 | return make_int2(a.x - b, a.y - b); 541 | } 542 | inline __host__ __device__ int2 operator-(int b, int2 a) 543 | { 544 | return make_int2(b - a.x, b - a.y); 545 | } 546 | inline __host__ __device__ void operator-=(int2 &a, int b) 547 | { 548 | a.x -= b; 549 | a.y -= b; 550 | } 551 | 552 | inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) 553 | { 554 | return make_uint2(a.x - b.x, a.y - b.y); 555 | } 556 | inline __host__ __device__ void operator-=(uint2 &a, 
uint2 b) 557 | { 558 | a.x -= b.x; 559 | a.y -= b.y; 560 | } 561 | inline __host__ __device__ uint2 operator-(uint2 a, uint b) 562 | { 563 | return make_uint2(a.x - b, a.y - b); 564 | } 565 | inline __host__ __device__ uint2 operator-(uint b, uint2 a) 566 | { 567 | return make_uint2(b - a.x, b - a.y); 568 | } 569 | inline __host__ __device__ void operator-=(uint2 &a, uint b) 570 | { 571 | a.x -= b; 572 | a.y -= b; 573 | } 574 | 575 | inline __host__ __device__ float3 operator-(float3 a, float3 b) 576 | { 577 | return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); 578 | } 579 | inline __host__ __device__ void operator-=(float3 &a, float3 b) 580 | { 581 | a.x -= b.x; 582 | a.y -= b.y; 583 | a.z -= b.z; 584 | } 585 | inline __host__ __device__ float3 operator-(float3 a, float b) 586 | { 587 | return make_float3(a.x - b, a.y - b, a.z - b); 588 | } 589 | inline __host__ __device__ float3 operator-(float b, float3 a) 590 | { 591 | return make_float3(b - a.x, b - a.y, b - a.z); 592 | } 593 | inline __host__ __device__ void operator-=(float3 &a, float b) 594 | { 595 | a.x -= b; 596 | a.y -= b; 597 | a.z -= b; 598 | } 599 | 600 | inline __host__ __device__ int3 operator-(int3 a, int3 b) 601 | { 602 | return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); 603 | } 604 | inline __host__ __device__ void operator-=(int3 &a, int3 b) 605 | { 606 | a.x -= b.x; 607 | a.y -= b.y; 608 | a.z -= b.z; 609 | } 610 | inline __host__ __device__ int3 operator-(int3 a, int b) 611 | { 612 | return make_int3(a.x - b, a.y - b, a.z - b); 613 | } 614 | inline __host__ __device__ int3 operator-(int b, int3 a) 615 | { 616 | return make_int3(b - a.x, b - a.y, b - a.z); 617 | } 618 | inline __host__ __device__ void operator-=(int3 &a, int b) 619 | { 620 | a.x -= b; 621 | a.y -= b; 622 | a.z -= b; 623 | } 624 | 625 | inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) 626 | { 627 | return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); 628 | } 629 | inline __host__ __device__ void operator-=(uint3 &a, uint3 b) 630 | { 631 | a.x -= b.x; 632 | a.y -= b.y; 633 | a.z -= b.z; 634 | } 635 | inline __host__ __device__ uint3 operator-(uint3 a, uint b) 636 | { 637 | return make_uint3(a.x - b, a.y - b, a.z - b); 638 | } 639 | inline __host__ __device__ uint3 operator-(uint b, uint3 a) 640 | { 641 | return make_uint3(b - a.x, b - a.y, b - a.z); 642 | } 643 | inline __host__ __device__ void operator-=(uint3 &a, uint b) 644 | { 645 | a.x -= b; 646 | a.y -= b; 647 | a.z -= b; 648 | } 649 | 650 | inline __host__ __device__ float4 operator-(float4 a, float4 b) 651 | { 652 | return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 653 | } 654 | inline __host__ __device__ void operator-=(float4 &a, float4 b) 655 | { 656 | a.x -= b.x; 657 | a.y -= b.y; 658 | a.z -= b.z; 659 | a.w -= b.w; 660 | } 661 | inline __host__ __device__ float4 operator-(float4 a, float b) 662 | { 663 | return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); 664 | } 665 | inline __host__ __device__ void operator-=(float4 &a, float b) 666 | { 667 | a.x -= b; 668 | a.y -= b; 669 | a.z -= b; 670 | a.w -= b; 671 | } 672 | 673 | inline __host__ __device__ int4 operator-(int4 a, int4 b) 674 | { 675 | return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 676 | } 677 | inline __host__ __device__ void operator-=(int4 &a, int4 b) 678 | { 679 | a.x -= b.x; 680 | a.y -= b.y; 681 | a.z -= b.z; 682 | a.w -= b.w; 683 | } 684 | inline __host__ __device__ int4 operator-(int4 a, int b) 685 | { 686 | return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); 687 | } 688 | inline 
__host__ __device__ int4 operator-(int b, int4 a) 689 | { 690 | return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); 691 | } 692 | inline __host__ __device__ void operator-=(int4 &a, int b) 693 | { 694 | a.x -= b; 695 | a.y -= b; 696 | a.z -= b; 697 | a.w -= b; 698 | } 699 | 700 | inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) 701 | { 702 | return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); 703 | } 704 | inline __host__ __device__ void operator-=(uint4 &a, uint4 b) 705 | { 706 | a.x -= b.x; 707 | a.y -= b.y; 708 | a.z -= b.z; 709 | a.w -= b.w; 710 | } 711 | inline __host__ __device__ uint4 operator-(uint4 a, uint b) 712 | { 713 | return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); 714 | } 715 | inline __host__ __device__ uint4 operator-(uint b, uint4 a) 716 | { 717 | return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); 718 | } 719 | inline __host__ __device__ void operator-=(uint4 &a, uint b) 720 | { 721 | a.x -= b; 722 | a.y -= b; 723 | a.z -= b; 724 | a.w -= b; 725 | } 726 | 727 | //////////////////////////////////////////////////////////////////////////////// 728 | // multiply 729 | //////////////////////////////////////////////////////////////////////////////// 730 | 731 | inline __host__ __device__ float2 operator*(float2 a, float2 b) 732 | { 733 | return make_float2(a.x * b.x, a.y * b.y); 734 | } 735 | inline __host__ __device__ void operator*=(float2 &a, float2 b) 736 | { 737 | a.x *= b.x; 738 | a.y *= b.y; 739 | } 740 | inline __host__ __device__ float2 operator*(float2 a, float b) 741 | { 742 | return make_float2(a.x * b, a.y * b); 743 | } 744 | inline __host__ __device__ float2 operator*(float b, float2 a) 745 | { 746 | return make_float2(b * a.x, b * a.y); 747 | } 748 | inline __host__ __device__ void operator*=(float2 &a, float b) 749 | { 750 | a.x *= b; 751 | a.y *= b; 752 | } 753 | 754 | inline __host__ __device__ int2 operator*(int2 a, int2 b) 755 | { 756 | return make_int2(a.x * b.x, a.y * b.y); 757 | } 758 | inline __host__ __device__ void operator*=(int2 &a, int2 b) 759 | { 760 | a.x *= b.x; 761 | a.y *= b.y; 762 | } 763 | inline __host__ __device__ int2 operator*(int2 a, int b) 764 | { 765 | return make_int2(a.x * b, a.y * b); 766 | } 767 | inline __host__ __device__ int2 operator*(int b, int2 a) 768 | { 769 | return make_int2(b * a.x, b * a.y); 770 | } 771 | inline __host__ __device__ void operator*=(int2 &a, int b) 772 | { 773 | a.x *= b; 774 | a.y *= b; 775 | } 776 | 777 | inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) 778 | { 779 | return make_uint2(a.x * b.x, a.y * b.y); 780 | } 781 | inline __host__ __device__ void operator*=(uint2 &a, uint2 b) 782 | { 783 | a.x *= b.x; 784 | a.y *= b.y; 785 | } 786 | inline __host__ __device__ uint2 operator*(uint2 a, uint b) 787 | { 788 | return make_uint2(a.x * b, a.y * b); 789 | } 790 | inline __host__ __device__ uint2 operator*(uint b, uint2 a) 791 | { 792 | return make_uint2(b * a.x, b * a.y); 793 | } 794 | inline __host__ __device__ void operator*=(uint2 &a, uint b) 795 | { 796 | a.x *= b; 797 | a.y *= b; 798 | } 799 | 800 | inline __host__ __device__ float3 operator*(float3 a, float3 b) 801 | { 802 | return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); 803 | } 804 | inline __host__ __device__ void operator*=(float3 &a, float3 b) 805 | { 806 | a.x *= b.x; 807 | a.y *= b.y; 808 | a.z *= b.z; 809 | } 810 | inline __host__ __device__ float3 operator*(float3 a, float b) 811 | { 812 | return make_float3(a.x * b, a.y * b, a.z * b); 813 | } 814 | inline __host__ __device__ float3 
operator*(float b, float3 a) 815 | { 816 | return make_float3(b * a.x, b * a.y, b * a.z); 817 | } 818 | inline __host__ __device__ void operator*=(float3 &a, float b) 819 | { 820 | a.x *= b; 821 | a.y *= b; 822 | a.z *= b; 823 | } 824 | 825 | inline __host__ __device__ int3 operator*(int3 a, int3 b) 826 | { 827 | return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); 828 | } 829 | inline __host__ __device__ void operator*=(int3 &a, int3 b) 830 | { 831 | a.x *= b.x; 832 | a.y *= b.y; 833 | a.z *= b.z; 834 | } 835 | inline __host__ __device__ int3 operator*(int3 a, int b) 836 | { 837 | return make_int3(a.x * b, a.y * b, a.z * b); 838 | } 839 | inline __host__ __device__ int3 operator*(int b, int3 a) 840 | { 841 | return make_int3(b * a.x, b * a.y, b * a.z); 842 | } 843 | inline __host__ __device__ void operator*=(int3 &a, int b) 844 | { 845 | a.x *= b; 846 | a.y *= b; 847 | a.z *= b; 848 | } 849 | 850 | inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) 851 | { 852 | return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); 853 | } 854 | inline __host__ __device__ void operator*=(uint3 &a, uint3 b) 855 | { 856 | a.x *= b.x; 857 | a.y *= b.y; 858 | a.z *= b.z; 859 | } 860 | inline __host__ __device__ uint3 operator*(uint3 a, uint b) 861 | { 862 | return make_uint3(a.x * b, a.y * b, a.z * b); 863 | } 864 | inline __host__ __device__ uint3 operator*(uint b, uint3 a) 865 | { 866 | return make_uint3(b * a.x, b * a.y, b * a.z); 867 | } 868 | inline __host__ __device__ void operator*=(uint3 &a, uint b) 869 | { 870 | a.x *= b; 871 | a.y *= b; 872 | a.z *= b; 873 | } 874 | 875 | inline __host__ __device__ float4 operator*(float4 a, float4 b) 876 | { 877 | return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 878 | } 879 | inline __host__ __device__ void operator*=(float4 &a, float4 b) 880 | { 881 | a.x *= b.x; 882 | a.y *= b.y; 883 | a.z *= b.z; 884 | a.w *= b.w; 885 | } 886 | inline __host__ __device__ float4 operator*(float4 a, float b) 887 | { 888 | return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); 889 | } 890 | inline __host__ __device__ float4 operator*(float b, float4 a) 891 | { 892 | return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); 893 | } 894 | inline __host__ __device__ void operator*=(float4 &a, float b) 895 | { 896 | a.x *= b; 897 | a.y *= b; 898 | a.z *= b; 899 | a.w *= b; 900 | } 901 | 902 | inline __host__ __device__ int4 operator*(int4 a, int4 b) 903 | { 904 | return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 905 | } 906 | inline __host__ __device__ void operator*=(int4 &a, int4 b) 907 | { 908 | a.x *= b.x; 909 | a.y *= b.y; 910 | a.z *= b.z; 911 | a.w *= b.w; 912 | } 913 | inline __host__ __device__ int4 operator*(int4 a, int b) 914 | { 915 | return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); 916 | } 917 | inline __host__ __device__ int4 operator*(int b, int4 a) 918 | { 919 | return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); 920 | } 921 | inline __host__ __device__ void operator*=(int4 &a, int b) 922 | { 923 | a.x *= b; 924 | a.y *= b; 925 | a.z *= b; 926 | a.w *= b; 927 | } 928 | 929 | inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) 930 | { 931 | return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); 932 | } 933 | inline __host__ __device__ void operator*=(uint4 &a, uint4 b) 934 | { 935 | a.x *= b.x; 936 | a.y *= b.y; 937 | a.z *= b.z; 938 | a.w *= b.w; 939 | } 940 | inline __host__ __device__ uint4 operator*(uint4 a, uint b) 941 | { 942 | return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); 943 | } 944 | inline 
__host__ __device__ uint4 operator*(uint b, uint4 a) 945 | { 946 | return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); 947 | } 948 | inline __host__ __device__ void operator*=(uint4 &a, uint b) 949 | { 950 | a.x *= b; 951 | a.y *= b; 952 | a.z *= b; 953 | a.w *= b; 954 | } 955 | 956 | //////////////////////////////////////////////////////////////////////////////// 957 | // divide 958 | //////////////////////////////////////////////////////////////////////////////// 959 | 960 | inline __host__ __device__ float2 operator/(float2 a, float2 b) 961 | { 962 | return make_float2(a.x / b.x, a.y / b.y); 963 | } 964 | inline __host__ __device__ void operator/=(float2 &a, float2 b) 965 | { 966 | a.x /= b.x; 967 | a.y /= b.y; 968 | } 969 | inline __host__ __device__ float2 operator/(float2 a, float b) 970 | { 971 | return make_float2(a.x / b, a.y / b); 972 | } 973 | inline __host__ __device__ void operator/=(float2 &a, float b) 974 | { 975 | a.x /= b; 976 | a.y /= b; 977 | } 978 | inline __host__ __device__ float2 operator/(float b, float2 a) 979 | { 980 | return make_float2(b / a.x, b / a.y); 981 | } 982 | 983 | inline __host__ __device__ float3 operator/(float3 a, float3 b) 984 | { 985 | return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); 986 | } 987 | inline __host__ __device__ void operator/=(float3 &a, float3 b) 988 | { 989 | a.x /= b.x; 990 | a.y /= b.y; 991 | a.z /= b.z; 992 | } 993 | inline __host__ __device__ float3 operator/(float3 a, float b) 994 | { 995 | return make_float3(a.x / b, a.y / b, a.z / b); 996 | } 997 | inline __host__ __device__ void operator/=(float3 &a, float b) 998 | { 999 | a.x /= b; 1000 | a.y /= b; 1001 | a.z /= b; 1002 | } 1003 | inline __host__ __device__ float3 operator/(float b, float3 a) 1004 | { 1005 | return make_float3(b / a.x, b / a.y, b / a.z); 1006 | } 1007 | 1008 | inline __host__ __device__ float4 operator/(float4 a, float4 b) 1009 | { 1010 | return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); 1011 | } 1012 | inline __host__ __device__ void operator/=(float4 &a, float4 b) 1013 | { 1014 | a.x /= b.x; 1015 | a.y /= b.y; 1016 | a.z /= b.z; 1017 | a.w /= b.w; 1018 | } 1019 | inline __host__ __device__ float4 operator/(float4 a, float b) 1020 | { 1021 | return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); 1022 | } 1023 | inline __host__ __device__ void operator/=(float4 &a, float b) 1024 | { 1025 | a.x /= b; 1026 | a.y /= b; 1027 | a.z /= b; 1028 | a.w /= b; 1029 | } 1030 | inline __host__ __device__ float4 operator/(float b, float4 a) 1031 | { 1032 | return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); 1033 | } 1034 | 1035 | //////////////////////////////////////////////////////////////////////////////// 1036 | // min 1037 | //////////////////////////////////////////////////////////////////////////////// 1038 | 1039 | inline __host__ __device__ float2 fminf(float2 a, float2 b) 1040 | { 1041 | return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); 1042 | } 1043 | inline __host__ __device__ float3 fminf(float3 a, float3 b) 1044 | { 1045 | return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); 1046 | } 1047 | inline __host__ __device__ float4 fminf(float4 a, float4 b) 1048 | { 1049 | return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); 1050 | } 1051 | 1052 | inline __host__ __device__ int2 min(int2 a, int2 b) 1053 | { 1054 | return make_int2(min(a.x,b.x), min(a.y,b.y)); 1055 | } 1056 | inline __host__ __device__ int3 min(int3 a, int3 b) 1057 | { 1058 | return make_int3(min(a.x,b.x), min(a.y,b.y), 
min(a.z,b.z)); 1059 | } 1060 | inline __host__ __device__ int4 min(int4 a, int4 b) 1061 | { 1062 | return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); 1063 | } 1064 | 1065 | inline __host__ __device__ uint2 min(uint2 a, uint2 b) 1066 | { 1067 | return make_uint2(min(a.x,b.x), min(a.y,b.y)); 1068 | } 1069 | inline __host__ __device__ uint3 min(uint3 a, uint3 b) 1070 | { 1071 | return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); 1072 | } 1073 | inline __host__ __device__ uint4 min(uint4 a, uint4 b) 1074 | { 1075 | return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); 1076 | } 1077 | 1078 | //////////////////////////////////////////////////////////////////////////////// 1079 | // max 1080 | //////////////////////////////////////////////////////////////////////////////// 1081 | 1082 | inline __host__ __device__ float2 fmaxf(float2 a, float2 b) 1083 | { 1084 | return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); 1085 | } 1086 | inline __host__ __device__ float3 fmaxf(float3 a, float3 b) 1087 | { 1088 | return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); 1089 | } 1090 | inline __host__ __device__ float4 fmaxf(float4 a, float4 b) 1091 | { 1092 | return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); 1093 | } 1094 | 1095 | inline __host__ __device__ int2 max(int2 a, int2 b) 1096 | { 1097 | return make_int2(max(a.x,b.x), max(a.y,b.y)); 1098 | } 1099 | inline __host__ __device__ int3 max(int3 a, int3 b) 1100 | { 1101 | return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); 1102 | } 1103 | inline __host__ __device__ int4 max(int4 a, int4 b) 1104 | { 1105 | return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); 1106 | } 1107 | 1108 | inline __host__ __device__ uint2 max(uint2 a, uint2 b) 1109 | { 1110 | return make_uint2(max(a.x,b.x), max(a.y,b.y)); 1111 | } 1112 | inline __host__ __device__ uint3 max(uint3 a, uint3 b) 1113 | { 1114 | return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); 1115 | } 1116 | inline __host__ __device__ uint4 max(uint4 a, uint4 b) 1117 | { 1118 | return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); 1119 | } 1120 | 1121 | //////////////////////////////////////////////////////////////////////////////// 1122 | // lerp 1123 | // - linear interpolation between a and b, based on value t in [0, 1] range 1124 | //////////////////////////////////////////////////////////////////////////////// 1125 | 1126 | inline __device__ __host__ float lerp(float a, float b, float t) 1127 | { 1128 | return a + t*(b-a); 1129 | } 1130 | inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) 1131 | { 1132 | return a + t*(b-a); 1133 | } 1134 | inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) 1135 | { 1136 | return a + t*(b-a); 1137 | } 1138 | inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) 1139 | { 1140 | return a + t*(b-a); 1141 | } 1142 | 1143 | //////////////////////////////////////////////////////////////////////////////// 1144 | // clamp 1145 | // - clamp the value v to be in the range [a, b] 1146 | //////////////////////////////////////////////////////////////////////////////// 1147 | 1148 | inline __device__ __host__ float clamp(float f, float a, float b) 1149 | { 1150 | return fmaxf(a, fminf(f, b)); 1151 | } 1152 | inline __device__ __host__ int clamp(int f, int a, int b) 1153 | { 1154 | return max(a, min(f, b)); 1155 | } 1156 | inline __device__ __host__ uint clamp(uint f, uint a, uint b) 1157 | 
{ 1158 | return max(a, min(f, b)); 1159 | } 1160 | 1161 | inline __device__ __host__ float2 clamp(float2 v, float a, float b) 1162 | { 1163 | return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); 1164 | } 1165 | inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) 1166 | { 1167 | return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 1168 | } 1169 | inline __device__ __host__ float3 clamp(float3 v, float a, float b) 1170 | { 1171 | return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 1172 | } 1173 | inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) 1174 | { 1175 | return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 1176 | } 1177 | inline __device__ __host__ float4 clamp(float4 v, float a, float b) 1178 | { 1179 | return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 1180 | } 1181 | inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) 1182 | { 1183 | return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); 1184 | } 1185 | 1186 | inline __device__ __host__ int2 clamp(int2 v, int a, int b) 1187 | { 1188 | return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); 1189 | } 1190 | inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) 1191 | { 1192 | return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 1193 | } 1194 | inline __device__ __host__ int3 clamp(int3 v, int a, int b) 1195 | { 1196 | return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 1197 | } 1198 | inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) 1199 | { 1200 | return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 1201 | } 1202 | inline __device__ __host__ int4 clamp(int4 v, int a, int b) 1203 | { 1204 | return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 1205 | } 1206 | inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) 1207 | { 1208 | return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); 1209 | } 1210 | 1211 | inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) 1212 | { 1213 | return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); 1214 | } 1215 | inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) 1216 | { 1217 | return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); 1218 | } 1219 | inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) 1220 | { 1221 | return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); 1222 | } 1223 | inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) 1224 | { 1225 | return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); 1226 | } 1227 | inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) 1228 | { 1229 | return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); 1230 | } 1231 | inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) 1232 | { 1233 | return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); 1234 | } 1235 | 1236 | //////////////////////////////////////////////////////////////////////////////// 1237 | // dot product 1238 | //////////////////////////////////////////////////////////////////////////////// 1239 | 1240 | inline __host__ __device__ float dot(float2 a, float2 b) 1241 | { 1242 | return a.x * b.x + a.y * b.y; 
1243 | } 1244 | inline __host__ __device__ float dot(float3 a, float3 b) 1245 | { 1246 | return a.x * b.x + a.y * b.y + a.z * b.z; 1247 | } 1248 | inline __host__ __device__ float dot(float4 a, float4 b) 1249 | { 1250 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 1251 | } 1252 | 1253 | inline __host__ __device__ int dot(int2 a, int2 b) 1254 | { 1255 | return a.x * b.x + a.y * b.y; 1256 | } 1257 | inline __host__ __device__ int dot(int3 a, int3 b) 1258 | { 1259 | return a.x * b.x + a.y * b.y + a.z * b.z; 1260 | } 1261 | inline __host__ __device__ int dot(int4 a, int4 b) 1262 | { 1263 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 1264 | } 1265 | 1266 | inline __host__ __device__ uint dot(uint2 a, uint2 b) 1267 | { 1268 | return a.x * b.x + a.y * b.y; 1269 | } 1270 | inline __host__ __device__ uint dot(uint3 a, uint3 b) 1271 | { 1272 | return a.x * b.x + a.y * b.y + a.z * b.z; 1273 | } 1274 | inline __host__ __device__ uint dot(uint4 a, uint4 b) 1275 | { 1276 | return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; 1277 | } 1278 | 1279 | //////////////////////////////////////////////////////////////////////////////// 1280 | // length 1281 | //////////////////////////////////////////////////////////////////////////////// 1282 | 1283 | inline __host__ __device__ float length(float2 v) 1284 | { 1285 | return sqrtf(dot(v, v)); 1286 | } 1287 | inline __host__ __device__ float length(float3 v) 1288 | { 1289 | return sqrtf(dot(v, v)); 1290 | } 1291 | inline __host__ __device__ float length(float4 v) 1292 | { 1293 | return sqrtf(dot(v, v)); 1294 | } 1295 | 1296 | //////////////////////////////////////////////////////////////////////////////// 1297 | // normalize 1298 | //////////////////////////////////////////////////////////////////////////////// 1299 | 1300 | inline __host__ __device__ float2 normalize(float2 v) 1301 | { 1302 | float invLen = rsqrtf(dot(v, v)); 1303 | return v * invLen; 1304 | } 1305 | inline __host__ __device__ float3 normalize(float3 v) 1306 | { 1307 | float invLen = rsqrtf(dot(v, v)); 1308 | return v * invLen; 1309 | } 1310 | inline __host__ __device__ float4 normalize(float4 v) 1311 | { 1312 | float invLen = rsqrtf(dot(v, v)); 1313 | return v * invLen; 1314 | } 1315 | 1316 | //////////////////////////////////////////////////////////////////////////////// 1317 | // floor 1318 | //////////////////////////////////////////////////////////////////////////////// 1319 | 1320 | inline __host__ __device__ float2 floorf(float2 v) 1321 | { 1322 | return make_float2(floorf(v.x), floorf(v.y)); 1323 | } 1324 | inline __host__ __device__ float3 floorf(float3 v) 1325 | { 1326 | return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); 1327 | } 1328 | inline __host__ __device__ float4 floorf(float4 v) 1329 | { 1330 | return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); 1331 | } 1332 | 1333 | //////////////////////////////////////////////////////////////////////////////// 1334 | // frac - returns the fractional portion of a scalar or each vector component 1335 | //////////////////////////////////////////////////////////////////////////////// 1336 | 1337 | inline __host__ __device__ float fracf(float v) 1338 | { 1339 | return v - floorf(v); 1340 | } 1341 | inline __host__ __device__ float2 fracf(float2 v) 1342 | { 1343 | return make_float2(fracf(v.x), fracf(v.y)); 1344 | } 1345 | inline __host__ __device__ float3 fracf(float3 v) 1346 | { 1347 | return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); 1348 | } 1349 | inline __host__ __device__ float4 
fracf(float4 v) 1350 | { 1351 | return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); 1352 | } 1353 | 1354 | //////////////////////////////////////////////////////////////////////////////// 1355 | // fmod 1356 | //////////////////////////////////////////////////////////////////////////////// 1357 | 1358 | inline __host__ __device__ float2 fmodf(float2 a, float2 b) 1359 | { 1360 | return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); 1361 | } 1362 | inline __host__ __device__ float3 fmodf(float3 a, float3 b) 1363 | { 1364 | return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); 1365 | } 1366 | inline __host__ __device__ float4 fmodf(float4 a, float4 b) 1367 | { 1368 | return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); 1369 | } 1370 | 1371 | //////////////////////////////////////////////////////////////////////////////// 1372 | // absolute value 1373 | //////////////////////////////////////////////////////////////////////////////// 1374 | 1375 | inline __host__ __device__ float2 fabs(float2 v) 1376 | { 1377 | return make_float2(fabs(v.x), fabs(v.y)); 1378 | } 1379 | inline __host__ __device__ float3 fabs(float3 v) 1380 | { 1381 | return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); 1382 | } 1383 | inline __host__ __device__ float4 fabs(float4 v) 1384 | { 1385 | return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); 1386 | } 1387 | 1388 | inline __host__ __device__ int2 abs(int2 v) 1389 | { 1390 | return make_int2(abs(v.x), abs(v.y)); 1391 | } 1392 | inline __host__ __device__ int3 abs(int3 v) 1393 | { 1394 | return make_int3(abs(v.x), abs(v.y), abs(v.z)); 1395 | } 1396 | inline __host__ __device__ int4 abs(int4 v) 1397 | { 1398 | return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); 1399 | } 1400 | 1401 | //////////////////////////////////////////////////////////////////////////////// 1402 | // reflect 1403 | // - returns reflection of incident ray I around surface normal N 1404 | // - N should be normalized, reflected vector's length is equal to length of I 1405 | //////////////////////////////////////////////////////////////////////////////// 1406 | 1407 | inline __host__ __device__ float3 reflect(float3 i, float3 n) 1408 | { 1409 | return i - 2.0f * n * dot(n,i); 1410 | } 1411 | 1412 | //////////////////////////////////////////////////////////////////////////////// 1413 | // cross product 1414 | //////////////////////////////////////////////////////////////////////////////// 1415 | 1416 | inline __host__ __device__ float3 cross(float3 a, float3 b) 1417 | { 1418 | return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 1419 | } 1420 | 1421 | //////////////////////////////////////////////////////////////////////////////// 1422 | // smoothstep 1423 | // - returns 0 if x < a 1424 | // - returns 1 if x > b 1425 | // - otherwise returns smooth interpolation between 0 and 1 based on x 1426 | //////////////////////////////////////////////////////////////////////////////// 1427 | 1428 | inline __device__ __host__ float smoothstep(float a, float b, float x) 1429 | { 1430 | float y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1431 | return (y*y*(3.0f - (2.0f*y))); 1432 | } 1433 | inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) 1434 | { 1435 | float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1436 | return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); 1437 | } 1438 | inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) 1439 | { 1440 | float3 y = clamp((x 
- a) / (b - a), 0.0f, 1.0f); 1441 | return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); 1442 | } 1443 | inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) 1444 | { 1445 | float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); 1446 | return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); 1447 | } 1448 | 1449 | #endif 1450 | --------------------------------------------------------------------------------