├── .gitignore
├── LICENSE
├── README.md
├── common
    ├── helper_cuda.h
    ├── helper_cuda_drvapi.h
    ├── helper_cuda_gl.h
    ├── helper_functions.h
    ├── helper_image.h
    ├── helper_math.h
    ├── helper_string.h
    └── helper_timer.h
├── compile.m
├── cuda_compile.m
├── demoCudaConvolutionFFT.m
└── src
    ├── convolutionFFTkernel.cu
    ├── cudaConvFFTData.cu
    ├── cudaConvFFTData.cuh
    ├── cudaConvFFTData.h
    ├── cudaConvFFTDataStreams.cu
    ├── cudaConvolutionFFT.cu
    ├── cudaFFTData.cu
    └── cutil.h


/.gitignore:
--------------------------------------------------------------------------------
1 | *.*~
2 | *.o
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     {description}
294 |     Copyright (C) {year}  {fullname}
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   {signature of Ty Coon}, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | CUDA-FFT-Convolution
 2 | ==============
 3 | 
 4 | Using a standard multi-threaded CPU convolution for very large kernels is very inefficient and slow. This package provides GPU convolution using Fast Fourier Transformation implementation using CUDA.
 5 | 
 6 | Standard convolution in time domain takes O(nm) time whereas convolution in frequency domain takes O((n+m) log (n+m)) time where n is the data length and k is the kernel length.
 7 | 
 8 | ## cudaConvolutionFFT.cu
 9 | 
10 | The main file takes data, max kernel height, width, convolution kernels (multiple kernels in cell format) and returns convolution results that corresponds to the convolution kernels.
11 | 
12 | ## Usage and Instructions
13 | 
14 | 1. Download the repo.
15 | 
16 |     ```
17 |     git clone http://github.com/chrischoy/MatlabCUDAConv
18 |     ```
19 | 
20 | 2. Go to the repo. Open MATLAB and type
21 | 
22 |     ```
23 |     compile
24 |     ```
25 | 
26 | 3. Run demo. the demo file `demoCudaConvolutionFFT.m` contains a detailed instruction and demo usage
27 | 
28 | 
29 |     ```
30 |     demoCudaConvolutionFFT
31 |     ```
32 | 
33 | ## Output
34 | 
35 | ![](https://dl.dropboxusercontent.com/u/57360783/cudafft_matlabfft_conv.png)
36 | 
37 | ### More resource
38 | 
39 | [http://chrischoy.org/projects/cuda-fft-convolution](http://chrischoy.org/projects/cuda-fft-convolution)
40 | 


--------------------------------------------------------------------------------
/common/helper_cuda.h:
--------------------------------------------------------------------------------
   1 | /**
   2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
   3 |  *
   4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5 |  * with this source code for terms and conditions that govern your use of
   6 |  * this software. Any use, reproduction, disclosure, or distribution of
   7 |  * this software and related documentation outside the terms of the EULA
   8 |  * is strictly prohibited.
   9 |  *
  10 |  */
  11 | 
  12 | ////////////////////////////////////////////////////////////////////////////////
  13 | // These are CUDA Helper functions for initialization and error checking
  14 | 
  15 | #ifndef HELPER_CUDA_H
  16 | #define HELPER_CUDA_H
  17 | 
  18 | #pragma once
  19 | 
  20 | #include <stdlib.h>
  21 | #include <stdio.h>
  22 | #include <string.h>
  23 | 
  24 | #include <helper_string.h>
  25 | 
  26 | /*
  27 | inline void __ExitInTime(int seconds)
  28 | {
  29 |     fprintf(stdout, "> exiting in %d seconds: ", seconds);
  30 |     fflush(stdout);
  31 |     time_t t;
  32 |     int count;
  33 | 
  34 |     for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
  35 |         fprintf(stdout, "%d...", count);
  36 | #if defined(WIN32)
  37 |         Sleep(1000);
  38 | #else
  39 |         sleep(1);
  40 | #endif
  41 |     }
  42 | 
  43 |     fprintf(stdout,"done!\n\n");
  44 |     fflush(stdout);
  45 | }
  46 | 
  47 | #define EXIT_TIME_DELAY 2
  48 | 
  49 | inline void EXIT_DELAY(int return_code)
  50 | {
  51 |     __ExitInTime(EXIT_TIME_DELAY);
  52 |     exit(return_code);
  53 | }
  54 | */
  55 | 
  56 | #ifndef EXIT_WAIVED
  57 | #define EXIT_WAIVED 2
  58 | #endif
  59 | 
  60 | // Note, it is required that your SDK sample to include the proper header files, please
  61 | // refer the CUDA examples for examples of the needed CUDA headers, which may change depending
  62 | // on which CUDA functions are used.
  63 | 
  64 | // CUDA Runtime error messages
  65 | #ifdef __DRIVER_TYPES_H__
  66 | static const char *_cudaGetErrorEnum(cudaError_t error)
  67 | {
  68 |     switch (error)
  69 |     {
  70 |         case cudaSuccess:
  71 |             return "cudaSuccess";
  72 | 
  73 |         case cudaErrorMissingConfiguration:
  74 |             return "cudaErrorMissingConfiguration";
  75 | 
  76 |         case cudaErrorMemoryAllocation:
  77 |             return "cudaErrorMemoryAllocation";
  78 | 
  79 |         case cudaErrorInitializationError:
  80 |             return "cudaErrorInitializationError";
  81 | 
  82 |         case cudaErrorLaunchFailure:
  83 |             return "cudaErrorLaunchFailure";
  84 | 
  85 |         case cudaErrorPriorLaunchFailure:
  86 |             return "cudaErrorPriorLaunchFailure";
  87 | 
  88 |         case cudaErrorLaunchTimeout:
  89 |             return "cudaErrorLaunchTimeout";
  90 | 
  91 |         case cudaErrorLaunchOutOfResources:
  92 |             return "cudaErrorLaunchOutOfResources";
  93 | 
  94 |         case cudaErrorInvalidDeviceFunction:
  95 |             return "cudaErrorInvalidDeviceFunction";
  96 | 
  97 |         case cudaErrorInvalidConfiguration:
  98 |             return "cudaErrorInvalidConfiguration";
  99 | 
 100 |         case cudaErrorInvalidDevice:
 101 |             return "cudaErrorInvalidDevice";
 102 | 
 103 |         case cudaErrorInvalidValue:
 104 |             return "cudaErrorInvalidValue";
 105 | 
 106 |         case cudaErrorInvalidPitchValue:
 107 |             return "cudaErrorInvalidPitchValue";
 108 | 
 109 |         case cudaErrorInvalidSymbol:
 110 |             return "cudaErrorInvalidSymbol";
 111 | 
 112 |         case cudaErrorMapBufferObjectFailed:
 113 |             return "cudaErrorMapBufferObjectFailed";
 114 | 
 115 |         case cudaErrorUnmapBufferObjectFailed:
 116 |             return "cudaErrorUnmapBufferObjectFailed";
 117 | 
 118 |         case cudaErrorInvalidHostPointer:
 119 |             return "cudaErrorInvalidHostPointer";
 120 | 
 121 |         case cudaErrorInvalidDevicePointer:
 122 |             return "cudaErrorInvalidDevicePointer";
 123 | 
 124 |         case cudaErrorInvalidTexture:
 125 |             return "cudaErrorInvalidTexture";
 126 | 
 127 |         case cudaErrorInvalidTextureBinding:
 128 |             return "cudaErrorInvalidTextureBinding";
 129 | 
 130 |         case cudaErrorInvalidChannelDescriptor:
 131 |             return "cudaErrorInvalidChannelDescriptor";
 132 | 
 133 |         case cudaErrorInvalidMemcpyDirection:
 134 |             return "cudaErrorInvalidMemcpyDirection";
 135 | 
 136 |         case cudaErrorAddressOfConstant:
 137 |             return "cudaErrorAddressOfConstant";
 138 | 
 139 |         case cudaErrorTextureFetchFailed:
 140 |             return "cudaErrorTextureFetchFailed";
 141 | 
 142 |         case cudaErrorTextureNotBound:
 143 |             return "cudaErrorTextureNotBound";
 144 | 
 145 |         case cudaErrorSynchronizationError:
 146 |             return "cudaErrorSynchronizationError";
 147 | 
 148 |         case cudaErrorInvalidFilterSetting:
 149 |             return "cudaErrorInvalidFilterSetting";
 150 | 
 151 |         case cudaErrorInvalidNormSetting:
 152 |             return "cudaErrorInvalidNormSetting";
 153 | 
 154 |         case cudaErrorMixedDeviceExecution:
 155 |             return "cudaErrorMixedDeviceExecution";
 156 | 
 157 |         case cudaErrorCudartUnloading:
 158 |             return "cudaErrorCudartUnloading";
 159 | 
 160 |         case cudaErrorUnknown:
 161 |             return "cudaErrorUnknown";
 162 | 
 163 |         case cudaErrorNotYetImplemented:
 164 |             return "cudaErrorNotYetImplemented";
 165 | 
 166 |         case cudaErrorMemoryValueTooLarge:
 167 |             return "cudaErrorMemoryValueTooLarge";
 168 | 
 169 |         case cudaErrorInvalidResourceHandle:
 170 |             return "cudaErrorInvalidResourceHandle";
 171 | 
 172 |         case cudaErrorNotReady:
 173 |             return "cudaErrorNotReady";
 174 | 
 175 |         case cudaErrorInsufficientDriver:
 176 |             return "cudaErrorInsufficientDriver";
 177 | 
 178 |         case cudaErrorSetOnActiveProcess:
 179 |             return "cudaErrorSetOnActiveProcess";
 180 | 
 181 |         case cudaErrorInvalidSurface:
 182 |             return "cudaErrorInvalidSurface";
 183 | 
 184 |         case cudaErrorNoDevice:
 185 |             return "cudaErrorNoDevice";
 186 | 
 187 |         case cudaErrorECCUncorrectable:
 188 |             return "cudaErrorECCUncorrectable";
 189 | 
 190 |         case cudaErrorSharedObjectSymbolNotFound:
 191 |             return "cudaErrorSharedObjectSymbolNotFound";
 192 | 
 193 |         case cudaErrorSharedObjectInitFailed:
 194 |             return "cudaErrorSharedObjectInitFailed";
 195 | 
 196 |         case cudaErrorUnsupportedLimit:
 197 |             return "cudaErrorUnsupportedLimit";
 198 | 
 199 |         case cudaErrorDuplicateVariableName:
 200 |             return "cudaErrorDuplicateVariableName";
 201 | 
 202 |         case cudaErrorDuplicateTextureName:
 203 |             return "cudaErrorDuplicateTextureName";
 204 | 
 205 |         case cudaErrorDuplicateSurfaceName:
 206 |             return "cudaErrorDuplicateSurfaceName";
 207 | 
 208 |         case cudaErrorDevicesUnavailable:
 209 |             return "cudaErrorDevicesUnavailable";
 210 | 
 211 |         case cudaErrorInvalidKernelImage:
 212 |             return "cudaErrorInvalidKernelImage";
 213 | 
 214 |         case cudaErrorNoKernelImageForDevice:
 215 |             return "cudaErrorNoKernelImageForDevice";
 216 | 
 217 |         case cudaErrorIncompatibleDriverContext:
 218 |             return "cudaErrorIncompatibleDriverContext";
 219 | 
 220 |         case cudaErrorPeerAccessAlreadyEnabled:
 221 |             return "cudaErrorPeerAccessAlreadyEnabled";
 222 | 
 223 |         case cudaErrorPeerAccessNotEnabled:
 224 |             return "cudaErrorPeerAccessNotEnabled";
 225 | 
 226 |         case cudaErrorDeviceAlreadyInUse:
 227 |             return "cudaErrorDeviceAlreadyInUse";
 228 | 
 229 |         case cudaErrorProfilerDisabled:
 230 |             return "cudaErrorProfilerDisabled";
 231 | 
 232 |         case cudaErrorProfilerNotInitialized:
 233 |             return "cudaErrorProfilerNotInitialized";
 234 | 
 235 |         case cudaErrorProfilerAlreadyStarted:
 236 |             return "cudaErrorProfilerAlreadyStarted";
 237 | 
 238 |         case cudaErrorProfilerAlreadyStopped:
 239 |             return "cudaErrorProfilerAlreadyStopped";
 240 | 
 241 | #if __CUDA_API_VERSION >= 0x4000
 242 | 
 243 |         case cudaErrorAssert:
 244 |             return "cudaErrorAssert";
 245 | 
 246 |         case cudaErrorTooManyPeers:
 247 |             return "cudaErrorTooManyPeers";
 248 | 
 249 |         case cudaErrorHostMemoryAlreadyRegistered:
 250 |             return "cudaErrorHostMemoryAlreadyRegistered";
 251 | 
 252 |         case cudaErrorHostMemoryNotRegistered:
 253 |             return "cudaErrorHostMemoryNotRegistered";
 254 | #endif
 255 | 
 256 |         case cudaErrorStartupFailure:
 257 |             return "cudaErrorStartupFailure";
 258 | 
 259 |         case cudaErrorApiFailureBase:
 260 |             return "cudaErrorApiFailureBase";
 261 |     }
 262 | 
 263 |     return "<unknown>";
 264 | }
 265 | #endif
 266 | 
 267 | #ifdef __cuda_cuda_h__
 268 | // CUDA Driver API errors
 269 | static const char *_cudaGetErrorEnum(CUresult error)
 270 | {
 271 |     switch (error)
 272 |     {
 273 |         case CUDA_SUCCESS:
 274 |             return "CUDA_SUCCESS";
 275 | 
 276 |         case CUDA_ERROR_INVALID_VALUE:
 277 |             return "CUDA_ERROR_INVALID_VALUE";
 278 | 
 279 |         case CUDA_ERROR_OUT_OF_MEMORY:
 280 |             return "CUDA_ERROR_OUT_OF_MEMORY";
 281 | 
 282 |         case CUDA_ERROR_NOT_INITIALIZED:
 283 |             return "CUDA_ERROR_NOT_INITIALIZED";
 284 | 
 285 |         case CUDA_ERROR_DEINITIALIZED:
 286 |             return "CUDA_ERROR_DEINITIALIZED";
 287 | 
 288 |         case CUDA_ERROR_PROFILER_DISABLED:
 289 |             return "CUDA_ERROR_PROFILER_DISABLED";
 290 | 
 291 |         case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
 292 |             return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
 293 | 
 294 |         case CUDA_ERROR_PROFILER_ALREADY_STARTED:
 295 |             return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
 296 | 
 297 |         case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
 298 |             return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
 299 | 
 300 |         case CUDA_ERROR_NO_DEVICE:
 301 |             return "CUDA_ERROR_NO_DEVICE";
 302 | 
 303 |         case CUDA_ERROR_INVALID_DEVICE:
 304 |             return "CUDA_ERROR_INVALID_DEVICE";
 305 | 
 306 |         case CUDA_ERROR_INVALID_IMAGE:
 307 |             return "CUDA_ERROR_INVALID_IMAGE";
 308 | 
 309 |         case CUDA_ERROR_INVALID_CONTEXT:
 310 |             return "CUDA_ERROR_INVALID_CONTEXT";
 311 | 
 312 |         case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
 313 |             return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
 314 | 
 315 |         case CUDA_ERROR_MAP_FAILED:
 316 |             return "CUDA_ERROR_MAP_FAILED";
 317 | 
 318 |         case CUDA_ERROR_UNMAP_FAILED:
 319 |             return "CUDA_ERROR_UNMAP_FAILED";
 320 | 
 321 |         case CUDA_ERROR_ARRAY_IS_MAPPED:
 322 |             return "CUDA_ERROR_ARRAY_IS_MAPPED";
 323 | 
 324 |         case CUDA_ERROR_ALREADY_MAPPED:
 325 |             return "CUDA_ERROR_ALREADY_MAPPED";
 326 | 
 327 |         case CUDA_ERROR_NO_BINARY_FOR_GPU:
 328 |             return "CUDA_ERROR_NO_BINARY_FOR_GPU";
 329 | 
 330 |         case CUDA_ERROR_ALREADY_ACQUIRED:
 331 |             return "CUDA_ERROR_ALREADY_ACQUIRED";
 332 | 
 333 |         case CUDA_ERROR_NOT_MAPPED:
 334 |             return "CUDA_ERROR_NOT_MAPPED";
 335 | 
 336 |         case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
 337 |             return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
 338 | 
 339 |         case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
 340 |             return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
 341 | 
 342 |         case CUDA_ERROR_ECC_UNCORRECTABLE:
 343 |             return "CUDA_ERROR_ECC_UNCORRECTABLE";
 344 | 
 345 |         case CUDA_ERROR_UNSUPPORTED_LIMIT:
 346 |             return "CUDA_ERROR_UNSUPPORTED_LIMIT";
 347 | 
 348 |         case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
 349 |             return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
 350 | 
 351 |         case CUDA_ERROR_INVALID_SOURCE:
 352 |             return "CUDA_ERROR_INVALID_SOURCE";
 353 | 
 354 |         case CUDA_ERROR_FILE_NOT_FOUND:
 355 |             return "CUDA_ERROR_FILE_NOT_FOUND";
 356 | 
 357 |         case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
 358 |             return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
 359 | 
 360 |         case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
 361 |             return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
 362 | 
 363 |         case CUDA_ERROR_OPERATING_SYSTEM:
 364 |             return "CUDA_ERROR_OPERATING_SYSTEM";
 365 | 
 366 |         case CUDA_ERROR_INVALID_HANDLE:
 367 |             return "CUDA_ERROR_INVALID_HANDLE";
 368 | 
 369 |         case CUDA_ERROR_NOT_FOUND:
 370 |             return "CUDA_ERROR_NOT_FOUND";
 371 | 
 372 |         case CUDA_ERROR_NOT_READY:
 373 |             return "CUDA_ERROR_NOT_READY";
 374 | 
 375 |         case CUDA_ERROR_LAUNCH_FAILED:
 376 |             return "CUDA_ERROR_LAUNCH_FAILED";
 377 | 
 378 |         case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
 379 |             return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
 380 | 
 381 |         case CUDA_ERROR_LAUNCH_TIMEOUT:
 382 |             return "CUDA_ERROR_LAUNCH_TIMEOUT";
 383 | 
 384 |         case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
 385 |             return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
 386 | 
 387 |         case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
 388 |             return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
 389 | 
 390 |         case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
 391 |             return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
 392 | 
 393 |         case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
 394 |             return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
 395 | 
 396 |         case CUDA_ERROR_CONTEXT_IS_DESTROYED:
 397 |             return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
 398 | 
 399 |         case CUDA_ERROR_ASSERT:
 400 |             return "CUDA_ERROR_ASSERT";
 401 | 
 402 |         case CUDA_ERROR_TOO_MANY_PEERS:
 403 |             return "CUDA_ERROR_TOO_MANY_PEERS";
 404 | 
 405 |         case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
 406 |             return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
 407 | 
 408 |         case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
 409 |             return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
 410 | 
 411 |         case CUDA_ERROR_UNKNOWN:
 412 |             return "CUDA_ERROR_UNKNOWN";
 413 |     }
 414 | 
 415 |     return "<unknown>";
 416 | }
 417 | #endif
 418 | 
 419 | #ifdef CUBLAS_API_H_
 420 | // cuBLAS API errors
 421 | static const char *_cudaGetErrorEnum(cublasStatus_t error)
 422 | {
 423 |     switch (error)
 424 |     {
 425 |         case CUBLAS_STATUS_SUCCESS:
 426 |             return "CUBLAS_STATUS_SUCCESS";
 427 | 
 428 |         case CUBLAS_STATUS_NOT_INITIALIZED:
 429 |             return "CUBLAS_STATUS_NOT_INITIALIZED";
 430 | 
 431 |         case CUBLAS_STATUS_ALLOC_FAILED:
 432 |             return "CUBLAS_STATUS_ALLOC_FAILED";
 433 | 
 434 |         case CUBLAS_STATUS_INVALID_VALUE:
 435 |             return "CUBLAS_STATUS_INVALID_VALUE";
 436 | 
 437 |         case CUBLAS_STATUS_ARCH_MISMATCH:
 438 |             return "CUBLAS_STATUS_ARCH_MISMATCH";
 439 | 
 440 |         case CUBLAS_STATUS_MAPPING_ERROR:
 441 |             return "CUBLAS_STATUS_MAPPING_ERROR";
 442 | 
 443 |         case CUBLAS_STATUS_EXECUTION_FAILED:
 444 |             return "CUBLAS_STATUS_EXECUTION_FAILED";
 445 | 
 446 |         case CUBLAS_STATUS_INTERNAL_ERROR:
 447 |             return "CUBLAS_STATUS_INTERNAL_ERROR";
 448 |     }
 449 | 
 450 |     return "<unknown>";
 451 | }
 452 | #endif
 453 | 
 454 | #ifdef _CUFFT_H_
 455 | // cuFFT API errors
 456 | static const char *_cudaGetErrorEnum(cufftResult error)
 457 | {
 458 |     switch (error)
 459 |     {
 460 |         case CUFFT_SUCCESS:
 461 |             return "CUFFT_SUCCESS";
 462 | 
 463 |         case CUFFT_INVALID_PLAN:
 464 |             return "CUFFT_INVALID_PLAN";
 465 | 
 466 |         case CUFFT_ALLOC_FAILED:
 467 |             return "CUFFT_ALLOC_FAILED";
 468 | 
 469 |         case CUFFT_INVALID_TYPE:
 470 |             return "CUFFT_INVALID_TYPE";
 471 | 
 472 |         case CUFFT_INVALID_VALUE:
 473 |             return "CUFFT_INVALID_VALUE";
 474 | 
 475 |         case CUFFT_INTERNAL_ERROR:
 476 |             return "CUFFT_INTERNAL_ERROR";
 477 | 
 478 |         case CUFFT_EXEC_FAILED:
 479 |             return "CUFFT_EXEC_FAILED";
 480 | 
 481 |         case CUFFT_SETUP_FAILED:
 482 |             return "CUFFT_SETUP_FAILED";
 483 | 
 484 |         case CUFFT_INVALID_SIZE:
 485 |             return "CUFFT_INVALID_SIZE";
 486 | 
 487 |         case CUFFT_UNALIGNED_DATA:
 488 |             return "CUFFT_UNALIGNED_DATA";
 489 |     }
 490 | 
 491 |     return "<unknown>";
 492 | }
 493 | #endif
 494 | 
 495 | 
 496 | #ifdef CUSPARSEAPI
 497 | // cuSPARSE API errors
 498 | static const char *_cudaGetErrorEnum(cusparseStatus_t error)
 499 | {
 500 |     switch (error)
 501 |     {
 502 |         case CUSPARSE_STATUS_SUCCESS:
 503 |             return "CUSPARSE_STATUS_SUCCESS";
 504 | 
 505 |         case CUSPARSE_STATUS_NOT_INITIALIZED:
 506 |             return "CUSPARSE_STATUS_NOT_INITIALIZED";
 507 | 
 508 |         case CUSPARSE_STATUS_ALLOC_FAILED:
 509 |             return "CUSPARSE_STATUS_ALLOC_FAILED";
 510 | 
 511 |         case CUSPARSE_STATUS_INVALID_VALUE:
 512 |             return "CUSPARSE_STATUS_INVALID_VALUE";
 513 | 
 514 |         case CUSPARSE_STATUS_ARCH_MISMATCH:
 515 |             return "CUSPARSE_STATUS_ARCH_MISMATCH";
 516 | 
 517 |         case CUSPARSE_STATUS_MAPPING_ERROR:
 518 |             return "CUSPARSE_STATUS_MAPPING_ERROR";
 519 | 
 520 |         case CUSPARSE_STATUS_EXECUTION_FAILED:
 521 |             return "CUSPARSE_STATUS_EXECUTION_FAILED";
 522 | 
 523 |         case CUSPARSE_STATUS_INTERNAL_ERROR:
 524 |             return "CUSPARSE_STATUS_INTERNAL_ERROR";
 525 | 
 526 |         case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
 527 |             return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
 528 |     }
 529 | 
 530 |     return "<unknown>";
 531 | }
 532 | #endif
 533 | 
 534 | #ifdef CURAND_H_
 535 | // cuRAND API errors
 536 | static const char *_cudaGetErrorEnum(curandStatus_t error)
 537 | {
 538 |     switch (error)
 539 |     {
 540 |         case CURAND_STATUS_SUCCESS:
 541 |             return "CURAND_STATUS_SUCCESS";
 542 | 
 543 |         case CURAND_STATUS_VERSION_MISMATCH:
 544 |             return "CURAND_STATUS_VERSION_MISMATCH";
 545 | 
 546 |         case CURAND_STATUS_NOT_INITIALIZED:
 547 |             return "CURAND_STATUS_NOT_INITIALIZED";
 548 | 
 549 |         case CURAND_STATUS_ALLOCATION_FAILED:
 550 |             return "CURAND_STATUS_ALLOCATION_FAILED";
 551 | 
 552 |         case CURAND_STATUS_TYPE_ERROR:
 553 |             return "CURAND_STATUS_TYPE_ERROR";
 554 | 
 555 |         case CURAND_STATUS_OUT_OF_RANGE:
 556 |             return "CURAND_STATUS_OUT_OF_RANGE";
 557 | 
 558 |         case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
 559 |             return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
 560 | 
 561 |         case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
 562 |             return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
 563 | 
 564 |         case CURAND_STATUS_LAUNCH_FAILURE:
 565 |             return "CURAND_STATUS_LAUNCH_FAILURE";
 566 | 
 567 |         case CURAND_STATUS_PREEXISTING_FAILURE:
 568 |             return "CURAND_STATUS_PREEXISTING_FAILURE";
 569 | 
 570 |         case CURAND_STATUS_INITIALIZATION_FAILED:
 571 |             return "CURAND_STATUS_INITIALIZATION_FAILED";
 572 | 
 573 |         case CURAND_STATUS_ARCH_MISMATCH:
 574 |             return "CURAND_STATUS_ARCH_MISMATCH";
 575 | 
 576 |         case CURAND_STATUS_INTERNAL_ERROR:
 577 |             return "CURAND_STATUS_INTERNAL_ERROR";
 578 |     }
 579 | 
 580 |     return "<unknown>";
 581 | }
 582 | #endif
 583 | 
 584 | #ifdef NV_NPPIDEFS_H
 585 | // NPP API errors
 586 | static const char *_cudaGetErrorEnum(NppStatus error)
 587 | {
 588 |     switch (error)
 589 |     {
 590 |         case NPP_NOT_SUPPORTED_MODE_ERROR:
 591 |             return "NPP_NOT_SUPPORTED_MODE_ERROR";
 592 | 
 593 |         case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
 594 |             return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
 595 | 
 596 |         case NPP_RESIZE_NO_OPERATION_ERROR:
 597 |             return "NPP_RESIZE_NO_OPERATION_ERROR";
 598 | 
 599 |         case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
 600 |             return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
 601 | 
 602 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 603 | 
 604 |         case NPP_BAD_ARG_ERROR:
 605 |             return "NPP_BAD_ARGUMENT_ERROR";
 606 | 
 607 |         case NPP_COEFF_ERROR:
 608 |             return "NPP_COEFFICIENT_ERROR";
 609 | 
 610 |         case NPP_RECT_ERROR:
 611 |             return "NPP_RECTANGLE_ERROR";
 612 | 
 613 |         case NPP_QUAD_ERROR:
 614 |             return "NPP_QUADRANGLE_ERROR";
 615 | 
 616 |         case NPP_MEM_ALLOC_ERR:
 617 |             return "NPP_MEMORY_ALLOCATION_ERROR";
 618 | 
 619 |         case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
 620 |             return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
 621 | 
 622 |         case NPP_INVALID_INPUT:
 623 |             return "NPP_INVALID_INPUT";
 624 | 
 625 |         case NPP_POINTER_ERROR:
 626 |             return "NPP_POINTER_ERROR";
 627 | 
 628 |         case NPP_WARNING:
 629 |             return "NPP_WARNING";
 630 | 
 631 |         case NPP_ODD_ROI_WARNING:
 632 |             return "NPP_ODD_ROI_WARNING";
 633 | #else
 634 | 
 635 |             // These are for CUDA 5.5 or higher
 636 |         case NPP_BAD_ARGUMENT_ERROR:
 637 |             return "NPP_BAD_ARGUMENT_ERROR";
 638 | 
 639 |         case NPP_COEFFICIENT_ERROR:
 640 |             return "NPP_COEFFICIENT_ERROR";
 641 | 
 642 |         case NPP_RECTANGLE_ERROR:
 643 |             return "NPP_RECTANGLE_ERROR";
 644 | 
 645 |         case NPP_QUADRANGLE_ERROR:
 646 |             return "NPP_QUADRANGLE_ERROR";
 647 | 
 648 |         case NPP_MEMORY_ALLOCATION_ERR:
 649 |             return "NPP_MEMORY_ALLOCATION_ERROR";
 650 | 
 651 |         case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
 652 |             return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
 653 | 
 654 |         case NPP_INVALID_HOST_POINTER_ERROR:
 655 |             return "NPP_INVALID_HOST_POINTER_ERROR";
 656 | 
 657 |         case NPP_INVALID_DEVICE_POINTER_ERROR:
 658 |             return "NPP_INVALID_DEVICE_POINTER_ERROR";
 659 | #endif
 660 | 
 661 |         case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
 662 |             return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
 663 | 
 664 |         case NPP_TEXTURE_BIND_ERROR:
 665 |             return "NPP_TEXTURE_BIND_ERROR";
 666 | 
 667 |         case NPP_WRONG_INTERSECTION_ROI_ERROR:
 668 |             return "NPP_WRONG_INTERSECTION_ROI_ERROR";
 669 | 
 670 |         case NPP_NOT_EVEN_STEP_ERROR:
 671 |             return "NPP_NOT_EVEN_STEP_ERROR";
 672 | 
 673 |         case NPP_INTERPOLATION_ERROR:
 674 |             return "NPP_INTERPOLATION_ERROR";
 675 | 
 676 |         case NPP_RESIZE_FACTOR_ERROR:
 677 |             return "NPP_RESIZE_FACTOR_ERROR";
 678 | 
 679 |         case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
 680 |             return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
 681 | 
 682 | 
 683 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 684 | 
 685 |         case NPP_MEMFREE_ERR:
 686 |             return "NPP_MEMFREE_ERR";
 687 | 
 688 |         case NPP_MEMSET_ERR:
 689 |             return "NPP_MEMSET_ERR";
 690 | 
 691 |         case NPP_MEMCPY_ERR:
 692 |             return "NPP_MEMCPY_ERROR";
 693 | 
 694 |         case NPP_MIRROR_FLIP_ERR:
 695 |             return "NPP_MIRROR_FLIP_ERR";
 696 | #else
 697 | 
 698 |         case NPP_MEMFREE_ERROR:
 699 |             return "NPP_MEMFREE_ERROR";
 700 | 
 701 |         case NPP_MEMSET_ERROR:
 702 |             return "NPP_MEMSET_ERROR";
 703 | 
 704 |         case NPP_MEMCPY_ERROR:
 705 |             return "NPP_MEMCPY_ERROR";
 706 | 
 707 |         case NPP_MIRROR_FLIP_ERROR:
 708 |             return "NPP_MIRROR_FLIP_ERROR";
 709 | #endif
 710 | 
 711 |         case NPP_ALIGNMENT_ERROR:
 712 |             return "NPP_ALIGNMENT_ERROR";
 713 | 
 714 |         case NPP_STEP_ERROR:
 715 |             return "NPP_STEP_ERROR";
 716 | 
 717 |         case NPP_SIZE_ERROR:
 718 |             return "NPP_SIZE_ERROR";
 719 | 
 720 |         case NPP_NULL_POINTER_ERROR:
 721 |             return "NPP_NULL_POINTER_ERROR";
 722 | 
 723 |         case NPP_CUDA_KERNEL_EXECUTION_ERROR:
 724 |             return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
 725 | 
 726 |         case NPP_NOT_IMPLEMENTED_ERROR:
 727 |             return "NPP_NOT_IMPLEMENTED_ERROR";
 728 | 
 729 |         case NPP_ERROR:
 730 |             return "NPP_ERROR";
 731 | 
 732 |         case NPP_SUCCESS:
 733 |             return "NPP_SUCCESS";
 734 | 
 735 |         case NPP_WRONG_INTERSECTION_QUAD_WARNING:
 736 |             return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
 737 | 
 738 |         case NPP_MISALIGNED_DST_ROI_WARNING:
 739 |             return "NPP_MISALIGNED_DST_ROI_WARNING";
 740 | 
 741 |         case NPP_AFFINE_QUAD_INCORRECT_WARNING:
 742 |             return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
 743 | 
 744 |         case NPP_DOUBLE_SIZE_WARNING:
 745 |             return "NPP_DOUBLE_SIZE_WARNING";
 746 | 
 747 |         case NPP_WRONG_INTERSECTION_ROI_WARNING:
 748 |             return "NPP_WRONG_INTERSECTION_ROI_WARNING";
 749 |     }
 750 | 
 751 |     return "<unknown>";
 752 | }
 753 | #endif
 754 | 
 755 | #ifdef __DRIVER_TYPES_H__
 756 | #ifndef DEVICE_RESET
 757 | #define DEVICE_RESET cudaDeviceReset();
 758 | #endif
 759 | #else
 760 | #ifndef DEVICE_RESET
 761 | #define DEVICE_RESET
 762 | #endif
 763 | #endif
 764 | 
 765 | template< typename T >
 766 | void check(T result, char const *const func, const char *const file, int const line)
 767 | {
 768 |     if (result)
 769 |     {
 770 |         fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
 771 |                 file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
 772 |         DEVICE_RESET
 773 |         // Make sure we call CUDA Device Reset before exiting
 774 |         exit(EXIT_FAILURE);
 775 |     }
 776 | }
 777 | 
 778 | #ifdef __DRIVER_TYPES_H__
 779 | // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
 780 | #define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
 781 | 
 782 | // This will output the proper error string when calling cudaGetLastError
 783 | #define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)
 784 | 
 785 | inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
 786 | {
 787 |     cudaError_t err = cudaGetLastError();
 788 | 
 789 |     if (cudaSuccess != err)
 790 |     {
 791 |         fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
 792 |                 file, line, errorMessage, (int)err, cudaGetErrorString(err));
 793 |         DEVICE_RESET
 794 |         exit(EXIT_FAILURE);
 795 |     }
 796 | }
 797 | #endif
 798 | 
 799 | #ifndef MAX
 800 | #define MAX(a,b) (a > b ? a : b)
 801 | #endif
 802 | 
 803 | // Beginning of GPU Architecture definitions
 804 | inline int _ConvertSMVer2Cores(int major, int minor)
 805 | {
 806 |     // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
 807 |     typedef struct
 808 |     {
 809 |         int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
 810 |         int Cores;
 811 |     } sSMtoCores;
 812 | 
 813 |     sSMtoCores nGpuArchCoresPerSM[] =
 814 |     {
 815 |         { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
 816 |         { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
 817 |         { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
 818 |         { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
 819 |         { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
 820 |         { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
 821 |         { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
 822 |         { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
 823 |         { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
 824 |         { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
 825 |         {   -1, -1 }
 826 |     };
 827 | 
 828 |     int index = 0;
 829 | 
 830 |     while (nGpuArchCoresPerSM[index].SM != -1)
 831 |     {
 832 |         if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
 833 |         {
 834 |             return nGpuArchCoresPerSM[index].Cores;
 835 |         }
 836 | 
 837 |         index++;
 838 |     }
 839 | 
 840 |     // If we don't find the values, we default use the previous one to run properly
 841 |     printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
 842 |     return nGpuArchCoresPerSM[7].Cores;
 843 | }
 844 | // end of GPU Architecture definitions
 845 | 
 846 | #ifdef __CUDA_RUNTIME_H__
 847 | // General GPU Device CUDA Initialization
 848 | inline int gpuDeviceInit(int devID)
 849 | {
 850 |     int device_count;
 851 |     checkCudaErrors(cudaGetDeviceCount(&device_count));
 852 | 
 853 |     if (device_count == 0)
 854 |     {
 855 |         fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
 856 |         exit(EXIT_FAILURE);
 857 |     }
 858 | 
 859 |     if (devID < 0)
 860 |     {
 861 |         devID = 0;
 862 |     }
 863 | 
 864 |     if (devID > device_count-1)
 865 |     {
 866 |         fprintf(stderr, "\n");
 867 |         fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
 868 |         fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
 869 |         fprintf(stderr, "\n");
 870 |         return -devID;
 871 |     }
 872 | 
 873 |     cudaDeviceProp deviceProp;
 874 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
 875 | 
 876 |     if (deviceProp.computeMode == cudaComputeModeProhibited)
 877 |     {
 878 |         fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
 879 |         return -1;
 880 |     }
 881 | 
 882 |     if (deviceProp.major < 1)
 883 |     {
 884 |         fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
 885 |         exit(EXIT_FAILURE);
 886 |     }
 887 | 
 888 |     checkCudaErrors(cudaSetDevice(devID));
 889 |     printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
 890 | 
 891 |     return devID;
 892 | }
 893 | 
 894 | // This function returns the best GPU (with maximum GFLOPS)
 895 | inline int gpuGetMaxGflopsDeviceId()
 896 | {
 897 |     int current_device     = 0, sm_per_multiproc  = 0;
 898 |     int max_perf_device    = 0;
 899 |     int device_count       = 0, best_SM_arch      = 0;
 900 |     
 901 |     unsigned long long max_compute_perf = 0;
 902 |     cudaDeviceProp deviceProp;
 903 |     cudaGetDeviceCount(&device_count);
 904 |     
 905 |     checkCudaErrors(cudaGetDeviceCount(&device_count));
 906 | 
 907 |     if (device_count == 0)
 908 |     {
 909 |         fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
 910 |         exit(EXIT_FAILURE);
 911 |     }
 912 | 
 913 |     // Find the best major SM Architecture GPU device
 914 |     while (current_device < device_count)
 915 |     {
 916 |         cudaGetDeviceProperties(&deviceProp, current_device);
 917 | 
 918 |         // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
 919 |         if (deviceProp.computeMode != cudaComputeModeProhibited)
 920 |         {
 921 |             if (deviceProp.major > 0 && deviceProp.major < 9999)
 922 |             {
 923 |                 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
 924 |             }
 925 |         }
 926 | 
 927 |         current_device++;
 928 |     }
 929 | 
 930 |     // Find the best CUDA capable GPU device
 931 |     current_device = 0;
 932 | 
 933 |     while (current_device < device_count)
 934 |     {
 935 |         cudaGetDeviceProperties(&deviceProp, current_device);
 936 | 
 937 |         // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
 938 |         if (deviceProp.computeMode != cudaComputeModeProhibited)
 939 |         {
 940 |             if (deviceProp.major == 9999 && deviceProp.minor == 9999)
 941 |             {
 942 |                 sm_per_multiproc = 1;
 943 |             }
 944 |             else
 945 |             {
 946 |                 sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
 947 |             }
 948 | 
 949 |             unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
 950 | 
 951 |             if (compute_perf  > max_compute_perf)
 952 |             {
 953 |                 // If we find GPU with SM major > 2, search only these
 954 |                 if (best_SM_arch > 2)
 955 |                 {
 956 |                     // If our device==dest_SM_arch, choose this, or else pass
 957 |                     if (deviceProp.major == best_SM_arch)
 958 |                     {
 959 |                         max_compute_perf  = compute_perf;
 960 |                         max_perf_device   = current_device;
 961 |                     }
 962 |                 }
 963 |                 else
 964 |                 {
 965 |                     max_compute_perf  = compute_perf;
 966 |                     max_perf_device   = current_device;
 967 |                 }
 968 |             }
 969 |         }
 970 | 
 971 |         ++current_device;
 972 |     }
 973 | 
 974 |     return max_perf_device;
 975 | }
 976 | 
 977 | 
 978 | // Initialization code to find the best CUDA Device
 979 | inline int findCudaDevice(int argc, const char **argv)
 980 | {
 981 |     cudaDeviceProp deviceProp;
 982 |     int devID = 0;
 983 | 
 984 |     // If the command-line has a device number specified, use it
 985 |     if (checkCmdLineFlag(argc, argv, "device"))
 986 |     {
 987 |         devID = getCmdLineArgumentInt(argc, argv, "device=");
 988 | 
 989 |         if (devID < 0)
 990 |         {
 991 |             printf("Invalid command line parameter\n ");
 992 |             exit(EXIT_FAILURE);
 993 |         }
 994 |         else
 995 |         {
 996 |             devID = gpuDeviceInit(devID);
 997 | 
 998 |             if (devID < 0)
 999 |             {
1000 |                 printf("exiting...\n");
1001 |                 exit(EXIT_FAILURE);
1002 |             }
1003 |         }
1004 |     }
1005 |     else
1006 |     {
1007 |         // Otherwise pick the device with highest Gflops/s
1008 |         devID = gpuGetMaxGflopsDeviceId();
1009 |         checkCudaErrors(cudaSetDevice(devID));
1010 |         checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
1011 |         printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
1012 |     }
1013 | 
1014 |     return devID;
1015 | }
1016 | 
1017 | // General check for CUDA GPU SM Capabilities
1018 | inline bool checkCudaCapabilities(int major_version, int minor_version)
1019 | {
1020 |     cudaDeviceProp deviceProp;
1021 |     deviceProp.major = 0;
1022 |     deviceProp.minor = 0;
1023 |     int dev;
1024 | 
1025 |     checkCudaErrors(cudaGetDevice(&dev));
1026 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
1027 | 
1028 |     if ((deviceProp.major > major_version) ||
1029 |         (deviceProp.major == major_version && deviceProp.minor >= minor_version))
1030 |     {
1031 |         printf("  GPU Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
1032 |         return true;
1033 |     }
1034 |     else
1035 |     {
1036 |         printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
1037 |         return false;
1038 |     }
1039 | }
1040 | #endif
1041 | 
1042 | // end of CUDA Helper Functions
1043 | 
1044 | 
1045 | #endif
1046 | 


--------------------------------------------------------------------------------
/common/helper_cuda_drvapi.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | // Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects)
 13 | #ifndef HELPER_CUDA_DRVAPI_H
 14 | #define HELPER_CUDA_DRVAPI_H
 15 | 
 16 | #include <stdlib.h>
 17 | #include <stdio.h>
 18 | #include <string.h>
 19 | 
 20 | #include <helper_string.h>
 21 | #include <drvapi_error_string.h>
 22 | 
 23 | #ifndef MAX
 24 | #define MAX(a,b) (a > b ? a : b)
 25 | #endif
 26 | 
 27 | #ifndef EXIT_WAIVED
 28 | #define EXIT_WAIVED 2
 29 | #endif
 30 | 
 31 | ////////////////////////////////////////////////////////////////////////////////
 32 | // These are CUDA Helper functions
 33 | 
 34 | // add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H
 35 | #ifdef  __cuda_cuda_h__
 36 | // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
 37 | #ifndef checkCudaErrors
 38 | #define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
 39 | 
 40 | // These are the inline versions for all of the SDK helper functions
 41 | inline void __checkCudaErrors(CUresult err, const char *file, const int line)
 42 | {
 43 |     if (CUDA_SUCCESS != err)
 44 |     {
 45 |         fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
 46 |                 err, getCudaDrvErrorString(err), file, line);
 47 |         exit(EXIT_FAILURE);
 48 |     }
 49 | }
 50 | #endif
 51 | 
 52 | #ifdef getLastCudaDrvErrorMsg
 53 | #undef getLastCudaDrvErrorMsg
 54 | #endif
 55 | 
 56 | #define getLastCudaDrvErrorMsg(msg)           __getLastCudaDrvErrorMsg  (msg, __FILE__, __LINE__)
 57 | 
 58 | inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line)
 59 | {
 60 |     CUresult err = cuCtxSynchronize();
 61 | 
 62 |     if (CUDA_SUCCESS != err)
 63 |     {
 64 |         fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg);
 65 |         fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n",
 66 |                 err, getCudaDrvErrorString(err), file, line);
 67 |         exit(EXIT_FAILURE);
 68 |     }
 69 | }
 70 | 
 71 | // This function wraps the CUDA Driver API into a template function
 72 | template <class T>
 73 | inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
 74 | {
 75 |     CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device);
 76 | 
 77 |     if (error_result != CUDA_SUCCESS)
 78 |     {
 79 |         printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result));
 80 |         exit(EXIT_SUCCESS);
 81 |     }
 82 | }
 83 | #endif
 84 | 
 85 | // Beginning of GPU Architecture definitions
 86 | inline int _ConvertSMVer2CoresDRV(int major, int minor)
 87 | {
 88 |     // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
 89 |     typedef struct
 90 |     {
 91 |         int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
 92 |         int Cores;
 93 |     } sSMtoCores;
 94 | 
 95 |     sSMtoCores nGpuArchCoresPerSM[] =
 96 |     {
 97 |         { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
 98 |         { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
 99 |         { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
100 |         { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
101 |         { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
102 |         { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
103 |         { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
104 |         { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
105 |         { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
106 |         { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
107 |         {   -1, -1 }
108 |     };
109 | 
110 |     int index = 0;
111 | 
112 |     while (nGpuArchCoresPerSM[index].SM != -1)
113 |     {
114 |         if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
115 |         {
116 |             return nGpuArchCoresPerSM[index].Cores;
117 |         }
118 | 
119 |         index++;
120 |     }
121 | 
122 |     // If we don't find the values, we default use the previous one to run properly
123 |     printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
124 |     return nGpuArchCoresPerSM[7].Cores;
125 | }
126 | // end of GPU Architecture definitions
127 | 
128 | #ifdef __cuda_cuda_h__
129 | // General GPU Device CUDA Initialization
130 | inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
131 | {
132 |     int cuDevice = 0;
133 |     int deviceCount = 0;
134 |     CUresult err = cuInit(0);
135 | 
136 |     if (CUDA_SUCCESS == err)
137 |     {
138 |         checkCudaErrors(cuDeviceGetCount(&deviceCount));
139 |     }
140 | 
141 |     if (deviceCount == 0)
142 |     {
143 |         fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
144 |         exit(EXIT_FAILURE);
145 |     }
146 | 
147 |     int dev = 0;
148 |     dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device=");
149 | 
150 |     if (dev < 0)
151 |     {
152 |         dev = 0;
153 |     }
154 | 
155 |     if (dev > deviceCount-1)
156 |     {
157 |         fprintf(stderr, "\n");
158 |         fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
159 |         fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
160 |         fprintf(stderr, "\n");
161 |         return -dev;
162 |     }
163 | 
164 |     checkCudaErrors(cuDeviceGet(&cuDevice, dev));
165 |     char name[100];
166 |     cuDeviceGetName(name, 100, cuDevice);
167 | 
168 |     int computeMode;
169 |     getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
170 | 
171 |     if (computeMode == CU_COMPUTEMODE_PROHIBITED)
172 |     {
173 |         fprintf(stderr, "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no threads can use this CUDA Device.\n");
174 |         return -1;
175 |     }
176 | 
177 |     if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false)
178 |     {
179 |         printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
180 |     }
181 | 
182 |     return dev;
183 | }
184 | 
185 | // This function returns the best GPU based on performance
186 | inline int gpuGetMaxGflopsDeviceIdDRV()
187 | {
188 |     CUdevice current_device = 0, max_perf_device = 0;
189 |     int device_count        = 0, sm_per_multiproc = 0;
190 |     int max_compute_perf    = 0, best_SM_arch     = 0;
191 |     int major = 0, minor = 0   , multiProcessorCount, clockRate;
192 | 
193 |     cuInit(0);
194 |     checkCudaErrors(cuDeviceGetCount(&device_count));
195 | 
196 |     if (device_count == 0)
197 |     {
198 |         fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
199 |         exit(EXIT_FAILURE);
200 |     }
201 | 
202 |     // Find the best major SM Architecture GPU device
203 |     while (current_device < device_count)
204 |     {
205 |         checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
206 | 
207 |         if (major > 0 && major < 9999)
208 |         {
209 |             best_SM_arch = MAX(best_SM_arch, major);
210 |         }
211 | 
212 |         current_device++;
213 |     }
214 | 
215 |     // Find the best CUDA capable GPU device
216 |     current_device = 0;
217 | 
218 |     while (current_device < device_count)
219 |     {
220 |         checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
221 |                                              CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
222 |                                              current_device));
223 |         checkCudaErrors(cuDeviceGetAttribute(&clockRate,
224 |                                              CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
225 |                                              current_device));
226 |         checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
227 | 
228 |         int computeMode;
229 |         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
230 | 
231 |         if (computeMode != CU_COMPUTEMODE_PROHIBITED)
232 |         {
233 |             if (major == 9999 && minor == 9999)
234 |             {
235 |                 sm_per_multiproc = 1;
236 |             }
237 |             else
238 |             {
239 |                 sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
240 |             }
241 | 
242 |             int compute_perf  = multiProcessorCount * sm_per_multiproc * clockRate;
243 | 
244 |             if (compute_perf  > max_compute_perf)
245 |             {
246 |                 // If we find GPU with SM major > 2, search only these
247 |                 if (best_SM_arch > 2)
248 |                 {
249 |                     // If our device==dest_SM_arch, choose this, or else pass
250 |                     if (major == best_SM_arch)
251 |                     {
252 |                         max_compute_perf  = compute_perf;
253 |                         max_perf_device   = current_device;
254 |                     }
255 |                 }
256 |                 else
257 |                 {
258 |                     max_compute_perf  = compute_perf;
259 |                     max_perf_device   = current_device;
260 |                 }
261 |             }
262 |         }
263 | 
264 |         ++current_device;
265 |     }
266 | 
267 |     return max_perf_device;
268 | }
269 | 
270 | // This function returns the best Graphics GPU based on performance
271 | inline int gpuGetMaxGflopsGLDeviceIdDRV()
272 | {
273 |     CUdevice current_device = 0, max_perf_device = 0;
274 |     int device_count     = 0, sm_per_multiproc = 0;
275 |     int max_compute_perf = 0, best_SM_arch     = 0;
276 |     int major = 0, minor = 0, multiProcessorCount, clockRate;
277 |     int bTCC = 0;
278 |     char deviceName[256];
279 | 
280 |     cuInit(0);
281 |     checkCudaErrors(cuDeviceGetCount(&device_count));
282 | 
283 |     if (device_count == 0)
284 |     {
285 |         fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n");
286 |         exit(EXIT_FAILURE);
287 |     }
288 | 
289 |     // Find the best major SM Architecture GPU device that are graphics devices
290 |     while (current_device < device_count)
291 |     {
292 |         checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
293 |         checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
294 | 
295 | #if CUDA_VERSION >= 3020
296 |         checkCudaErrors(cuDeviceGetAttribute(&bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
297 | #else
298 | 
299 |         // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
300 |         if (deviceName[0] == 'T')
301 |         {
302 |             bTCC = 1;
303 |         }
304 | 
305 | #endif
306 | 
307 |         int computeMode;
308 |         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
309 | 
310 |         if (computeMode != CU_COMPUTEMODE_PROHIBITED)
311 |         {
312 |             if (!bTCC)
313 |             {
314 |                 if (major > 0 && major < 9999)
315 |                 {
316 |                     best_SM_arch = MAX(best_SM_arch, major);
317 |                 }
318 |             }
319 |         }
320 | 
321 |         current_device++;
322 |     }
323 | 
324 |     // Find the best CUDA capable GPU device
325 |     current_device = 0;
326 | 
327 |     while (current_device < device_count)
328 |     {
329 |         checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
330 |                                              CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
331 |                                              current_device));
332 |         checkCudaErrors(cuDeviceGetAttribute(&clockRate,
333 |                                              CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
334 |                                              current_device));
335 |         checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
336 | 
337 | #if CUDA_VERSION >= 3020
338 |         checkCudaErrors(cuDeviceGetAttribute(&bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
339 | #else
340 | 
341 |         // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
342 |         if (deviceName[0] == 'T')
343 |         {
344 |             bTCC = 1;
345 |         }
346 | 
347 | #endif
348 | 
349 |         int computeMode;
350 |         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
351 | 
352 |         if (computeMode != CU_COMPUTEMODE_PROHIBITED)
353 |         {
354 |             if (major == 9999 && minor == 9999)
355 |             {
356 |                 sm_per_multiproc = 1;
357 |             }
358 |             else
359 |             {
360 |                 sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
361 |             }
362 | 
363 |             // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor
364 |             if (!bTCC)   // Is this GPU running the TCC driver?  If so we pass on this
365 |             {
366 |                 int compute_perf  = multiProcessorCount * sm_per_multiproc * clockRate;
367 | 
368 |                 if (compute_perf  > max_compute_perf)
369 |                 {
370 |                     // If we find GPU with SM major > 2, search only these
371 |                     if (best_SM_arch > 2)
372 |                     {
373 |                         // If our device = dest_SM_arch, then we pick this one
374 |                         if (major == best_SM_arch)
375 |                         {
376 |                             max_compute_perf  = compute_perf;
377 |                             max_perf_device   = current_device;
378 |                         }
379 |                     }
380 |                     else
381 |                     {
382 |                         max_compute_perf  = compute_perf;
383 |                         max_perf_device   = current_device;
384 |                     }
385 |                 }
386 |             }
387 |         }
388 | 
389 |         ++current_device;
390 |     }
391 | 
392 |     return max_perf_device;
393 | }
394 | 
395 | // General initialization call to pick the best CUDA Device
396 | inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
397 | {
398 |     CUdevice cuDevice;
399 |     int devID = 0;
400 | 
401 |     // If the command-line has a device number specified, use it
402 |     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
403 |     {
404 |         devID = gpuDeviceInitDRV(argc, argv);
405 | 
406 |         if (devID < 0)
407 |         {
408 |             printf("exiting...\n");
409 |             exit(EXIT_SUCCESS);
410 |         }
411 |     }
412 |     else
413 |     {
414 |         // Otherwise pick the device with highest Gflops/s
415 |         char name[100];
416 |         devID = gpuGetMaxGflopsDeviceIdDRV();
417 |         checkCudaErrors(cuDeviceGet(&cuDevice, devID));
418 |         cuDeviceGetName(name, 100, cuDevice);
419 |         printf("> Using CUDA Device [%d]: %s\n", devID, name);
420 |     }
421 | 
422 |     cuDeviceGet(&cuDevice, devID);
423 | 
424 |     return cuDevice;
425 | }
426 | 
427 | // This function will pick the best CUDA device available with OpenGL interop
428 | inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv)
429 | {
430 |     CUdevice cuDevice;
431 |     int devID = 0;
432 | 
433 |     // If the command-line has a device number specified, use it
434 |     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
435 |     {
436 |         devID = gpuDeviceInitDRV(argc, (const char **)argv);
437 | 
438 |         if (devID < 0)
439 |         {
440 |             printf("no CUDA capable devices found, exiting...\n");
441 |             exit(EXIT_SUCCESS);
442 |         }
443 |     }
444 |     else
445 |     {
446 |         char name[100];
447 |         // Otherwise pick the device with highest Gflops/s
448 |         devID = gpuGetMaxGflopsGLDeviceIdDRV();
449 |         checkCudaErrors(cuDeviceGet(&cuDevice, devID));
450 |         cuDeviceGetName(name, 100, cuDevice);
451 |         printf("> Using CUDA/GL Device [%d]: %s\n", devID, name);
452 |     }
453 | 
454 |     return devID;
455 | }
456 | 
457 | // General check for CUDA GPU SM Capabilities
458 | inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
459 | {
460 |     CUdevice cuDevice;
461 |     char name[256];
462 |     int major = 0, minor = 0;
463 | 
464 |     checkCudaErrors(cuDeviceGet(&cuDevice, devID));
465 |     checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
466 |     checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
467 | 
468 |     if ((major > major_version) ||
469 |         (major == major_version && minor >= minor_version))
470 |     {
471 |         printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
472 |         return true;
473 |     }
474 |     else
475 |     {
476 |         printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
477 |         return false;
478 |     }
479 | }
480 | #endif
481 | 
482 | // end of CUDA Helper Functions
483 | 
484 | #endif
485 | 


--------------------------------------------------------------------------------
/common/helper_cuda_gl.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | #ifndef HELPER_CUDA_GL_H
 13 | #define HELPER_CUDA_GL_H
 14 | 
 15 | #include <stdio.h>
 16 | #include <string.h>
 17 | #include <stdlib.h>
 18 | 
 19 | // includes, graphics
 20 | #if defined (__APPLE__) || defined(MACOSX)
 21 | #include <OpenGL/gl.h>
 22 | #include <OpenGL/glu.h>
 23 | #else
 24 | #include <GL/gl.h>
 25 | #include <GL/glu.h>
 26 | #endif
 27 | 
 28 | #ifndef EXIT_WAIVED
 29 | #define EXIT_WAIVED 2
 30 | #endif
 31 | 
 32 | #ifdef __DRIVER_TYPES_H__
 33 | #ifndef DEVICE_RESET
 34 | #define DEVICE_RESET cudaDeviceReset()
 35 | #endif
 36 | #else
 37 | #ifndef DEVICE_RESET
 38 | #define DEVICE_RESET
 39 | #endif
 40 | #endif
 41 | 
 42 | #ifdef __CUDA_GL_INTEROP_H__
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | // These are CUDA OpenGL Helper functions
 45 | 
 46 | inline int gpuGLDeviceInit(int ARGC, const char **ARGV)
 47 | {
 48 |     int deviceCount;
 49 |     checkCudaErrors(cudaGetDeviceCount(&deviceCount));
 50 | 
 51 |     if (deviceCount == 0)
 52 |     {
 53 |         fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
 54 |         exit(EXIT_FAILURE);
 55 |     }
 56 | 
 57 |     int dev = 0;
 58 |     dev = getCmdLineArgumentInt(ARGC, ARGV, "device=");
 59 | 
 60 |     if (dev < 0)
 61 |     {
 62 |         dev = 0;
 63 |     }
 64 | 
 65 |     if (dev > deviceCount-1)
 66 |     {
 67 |         fprintf(stderr, "\n");
 68 |         fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
 69 |         fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
 70 |         fprintf(stderr, "\n");
 71 |         return -dev;
 72 |     }
 73 | 
 74 |     cudaDeviceProp deviceProp;
 75 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
 76 | 
 77 |     if (deviceProp.computeMode == cudaComputeModeProhibited)
 78 |     {
 79 |         fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
 80 |         return -1;
 81 |     }
 82 | 
 83 |     if (deviceProp.major < 1)
 84 |     {
 85 |         fprintf(stderr, "Error: device does not support CUDA.\n");
 86 |         exit(EXIT_FAILURE);
 87 |     }
 88 | 
 89 |     if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false)
 90 |     {
 91 |         fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
 92 |     }
 93 | 
 94 |     checkCudaErrors(cudaGLSetGLDevice(dev));
 95 |     return dev;
 96 | }
 97 | 
 98 | // This function will pick the best CUDA device available with OpenGL interop
 99 | inline int findCudaGLDevice(int argc, const char **argv)
100 | {
101 |     int devID = 0;
102 | 
103 |     // If the command-line has a device number specified, use it
104 |     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
105 |     {
106 |         devID = gpuGLDeviceInit(argc, (const char **)argv);
107 | 
108 |         if (devID < 0)
109 |         {
110 |             printf("no CUDA capable devices found, exiting...\n");
111 |             DEVICE_RESET
112 |             exit(EXIT_SUCCESS);
113 |         }
114 |     }
115 |     else
116 |     {
117 |         // Otherwise pick the device with highest Gflops/s
118 |         devID = gpuGetMaxGflopsDeviceId();
119 |         cudaGLSetGLDevice(devID);
120 |     }
121 | 
122 |     return devID;
123 | }
124 | 
125 | ////////////////////////////////////////////////////////////////////////////
126 | //! Check for OpenGL error
127 | //! @return bool if no GL error has been encountered, otherwise 0
128 | //! @param file  __FILE__ macro
129 | //! @param line  __LINE__ macro
130 | //! @note The GL error is listed on stderr
131 | //! @note This function should be used via the CHECK_ERROR_GL() macro
132 | ////////////////////////////////////////////////////////////////////////////
133 | inline bool
134 | sdkCheckErrorGL(const char *file, const int line)
135 | {
136 |     bool ret_val = true;
137 | 
138 |     // check for error
139 |     GLenum gl_error = glGetError();
140 | 
141 |     if (gl_error != GL_NO_ERROR)
142 |     {
143 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
144 |         char tmpStr[512];
145 |         // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
146 |         // when the user double clicks on the error line in the Output pane. Like any compile error.
147 |         sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error));
148 |         fprintf(stderr, "%s", tmpStr);
149 | #endif
150 |         fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
151 |         fprintf(stderr, "%s\n", gluErrorString(gl_error));
152 |         ret_val = false;
153 |     }
154 | 
155 |     return ret_val;
156 | }
157 | 
158 | #define SDK_CHECK_ERROR_GL()                                              \
159 |     if( false == sdkCheckErrorGL( __FILE__, __LINE__)) {                  \
160 |         DEVICE_RESET                                                      \
161 |         exit(EXIT_FAILURE);                                               \
162 |     }
163 | #endif
164 | 
165 | #endif
166 | 


--------------------------------------------------------------------------------
/common/helper_functions.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 3 |  *
 4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 |  * with this source code for terms and conditions that govern your use of
 6 |  * this software. Any use, reproduction, disclosure, or distribution of
 7 |  * this software and related documentation outside the terms of the EULA
 8 |  * is strictly prohibited.
 9 |  *
10 |  */
11 | 
12 | // These are helper functions for the SDK samples (string parsing, timers, image helpers, etc)
13 | #ifndef HELPER_FUNCTIONS_H
14 | #define HELPER_FUNCTIONS_H
15 | 
16 | #ifdef WIN32
17 | #pragma warning(disable:4996)
18 | #endif
19 | 
20 | // includes, project
21 | #include <stdio.h>
22 | #include <stdlib.h>
23 | #include <string>
24 | #include <assert.h>
25 | #include <exception.h>
26 | #include <math.h>
27 | 
28 | #include <fstream>
29 | #include <vector>
30 | #include <iostream>
31 | #include <algorithm>
32 | 
33 | // includes, timer, string parsing, image helpers
34 | #include <helper_timer.h>   // helper functions for timers
35 | #include <helper_string.h>  // helper functions for string parsing
36 | #include <helper_image.h>   // helper functions for image compare, dump, data comparisons
37 | 
38 | #ifndef EXIT_WAIVED
39 | #define EXIT_WAIVED 2
40 | #endif
41 | 
42 | #endif //  HELPER_FUNCTIONS_H
43 | 


--------------------------------------------------------------------------------
/common/helper_image.h:
--------------------------------------------------------------------------------
   1 | /**
   2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
   3 |  *
   4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
   5 |  * with this source code for terms and conditions that govern your use of
   6 |  * this software. Any use, reproduction, disclosure, or distribution of
   7 |  * this software and related documentation outside the terms of the EULA
   8 |  * is strictly prohibited.
   9 |  *
  10 |  */
  11 | 
  12 | // These are helper functions for the SDK samples (image,bitmap)
  13 | #ifndef HELPER_IMAGE_H
  14 | #define HELPER_IMAGE_H
  15 | 
  16 | #include <string>
  17 | #include <fstream>
  18 | #include <vector>
  19 | #include <iostream>
  20 | #include <algorithm>
  21 | 
  22 | #include <assert.h>
  23 | #include <exception.h>
  24 | #include <math.h>
  25 | 
  26 | #ifndef MIN
  27 | #define MIN(a,b) ((a < b) ? a : b)
  28 | #endif
  29 | #ifndef MAX
  30 | #define MAX(a,b) ((a > b) ? a : b)
  31 | #endif
  32 | 
  33 | #ifndef EXIT_WAIVED
  34 | #define EXIT_WAIVED 2
  35 | #endif
  36 | 
  37 | #include <helper_string.h>
  38 | 
  39 | // namespace unnamed (internal)
  40 | namespace
  41 | {
  42 |     //! size of PGM file header
  43 |     const unsigned int PGMHeaderSize = 0x40;
  44 | 
  45 |     // types
  46 | 
  47 |     //! Data converter from unsigned char / unsigned byte to type T
  48 |     template<class T>
  49 |     struct ConverterFromUByte;
  50 | 
  51 |     //! Data converter from unsigned char / unsigned byte
  52 |     template<>
  53 |     struct ConverterFromUByte<unsigned char>
  54 |     {
  55 |         //! Conversion operator
  56 |         //! @return converted value
  57 |         //! @param  val  value to convert
  58 |         float operator()(const unsigned char &val)
  59 |         {
  60 |             return static_cast<unsigned char>(val);
  61 |         }
  62 |     };
  63 | 
  64 |     //! Data converter from unsigned char / unsigned byte to float
  65 |     template<>
  66 |     struct ConverterFromUByte<float>
  67 |     {
  68 |         //! Conversion operator
  69 |         //! @return converted value
  70 |         //! @param  val  value to convert
  71 |         float operator()(const unsigned char &val)
  72 |         {
  73 |             return static_cast<float>(val) / 255.0f;
  74 |         }
  75 |     };
  76 | 
  77 |     //! Data converter from unsigned char / unsigned byte to type T
  78 |     template<class T>
  79 |     struct ConverterToUByte;
  80 | 
  81 |     //! Data converter from unsigned char / unsigned byte to unsigned int
  82 |     template<>
  83 |     struct ConverterToUByte<unsigned char>
  84 |     {
  85 |         //! Conversion operator (essentially a passthru
  86 |         //! @return converted value
  87 |         //! @param  val  value to convert
  88 |         unsigned char operator()(const unsigned char &val)
  89 |         {
  90 |             return val;
  91 |         }
  92 |     };
  93 | 
  94 |     //! Data converter from unsigned char / unsigned byte to unsigned int
  95 |     template<>
  96 |     struct ConverterToUByte<float>
  97 |     {
  98 |         //! Conversion operator
  99 |         //! @return converted value
 100 |         //! @param  val  value to convert
 101 |         unsigned char operator()(const float &val)
 102 |         {
 103 |             return static_cast<unsigned char>(val * 255.0f);
 104 |         }
 105 |     };
 106 | }
 107 | 
 108 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 109 | #ifndef FOPEN
 110 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
 111 | #endif
 112 | #ifndef FOPEN_FAIL
 113 | #define FOPEN_FAIL(result) (result != 0)
 114 | #endif
 115 | #ifndef SSCANF
 116 | #define SSCANF sscanf_s
 117 | #endif
 118 | #else
 119 | #ifndef FOPEN
 120 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
 121 | #endif
 122 | #ifndef FOPEN_FAIL
 123 | #define FOPEN_FAIL(result) (result == NULL)
 124 | #endif
 125 | #ifndef SSCANF
 126 | #define SSCANF sscanf
 127 | #endif
 128 | #endif
 129 | 
 130 | inline bool
 131 | __loadPPM(const char *file, unsigned char **data,
 132 |           unsigned int *w, unsigned int *h, unsigned int *channels)
 133 | {
 134 |     FILE *fp = NULL;
 135 | 
 136 |     if (FOPEN_FAIL(FOPEN(fp, file, "rb")))
 137 |     {
 138 |         std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
 139 |         return false;
 140 |     }
 141 | 
 142 |     // check header
 143 |     char header[PGMHeaderSize];
 144 | 
 145 |     if (fgets(header, PGMHeaderSize, fp) == NULL)
 146 |     {
 147 |         std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
 148 |         return false;
 149 |     }
 150 | 
 151 |     if (strncmp(header, "P5", 2) == 0)
 152 |     {
 153 |         *channels = 1;
 154 |     }
 155 |     else if (strncmp(header, "P6", 2) == 0)
 156 |     {
 157 |         *channels = 3;
 158 |     }
 159 |     else
 160 |     {
 161 |         std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
 162 |         *channels = 0;
 163 |         return false;
 164 |     }
 165 | 
 166 |     // parse header, read maxval, width and height
 167 |     unsigned int width = 0;
 168 |     unsigned int height = 0;
 169 |     unsigned int maxval = 0;
 170 |     unsigned int i = 0;
 171 | 
 172 |     while (i < 3)
 173 |     {
 174 |         if (fgets(header, PGMHeaderSize, fp) == NULL)
 175 |         {
 176 |             std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
 177 |             return false;
 178 |         }
 179 | 
 180 |         if (header[0] == '#')
 181 |         {
 182 |             continue;
 183 |         }
 184 | 
 185 |         if (i == 0)
 186 |         {
 187 |             i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
 188 |         }
 189 |         else if (i == 1)
 190 |         {
 191 |             i += SSCANF(header, "%u %u", &height, &maxval);
 192 |         }
 193 |         else if (i == 2)
 194 |         {
 195 |             i += SSCANF(header, "%u", &maxval);
 196 |         }
 197 |     }
 198 | 
 199 |     // check if given handle for the data is initialized
 200 |     if (NULL != *data)
 201 |     {
 202 |         if (*w != width || *h != height)
 203 |         {
 204 |             std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
 205 |         }
 206 |     }
 207 |     else
 208 |     {
 209 |         *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels);
 210 |         *w = width;
 211 |         *h = height;
 212 |     }
 213 | 
 214 |     // read and close file
 215 |     if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0)
 216 |     {
 217 |         std::cerr << "__LoadPPM() read data returned error." << std::endl;
 218 |     }
 219 | 
 220 |     fclose(fp);
 221 | 
 222 |     return true;
 223 | }
 224 | 
 225 | template <class T>
 226 | inline bool
 227 | sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h)
 228 | {
 229 |     unsigned char *idata = NULL;
 230 |     unsigned int channels;
 231 | 
 232 |     if (true != __loadPPM(file, &idata, w, h, &channels))
 233 |     {
 234 |         return false;
 235 |     }
 236 | 
 237 |     unsigned int size = *w **h * channels;
 238 | 
 239 |     // initialize mem if necessary
 240 |     // the correct size is checked / set in loadPGMc()
 241 |     if (NULL == *data)
 242 |     {
 243 |         *data = (T *) malloc(sizeof(T) * size);
 244 |     }
 245 | 
 246 |     // copy and cast data
 247 |     std::transform(idata, idata + size, *data, ConverterFromUByte<T>());
 248 | 
 249 |     free(idata);
 250 | 
 251 |     return true;
 252 | }
 253 | 
 254 | template <class T>
 255 | inline bool
 256 | sdkLoadPPM4(const char *file, T **data,
 257 |             unsigned int *w,unsigned int *h)
 258 | {
 259 |     unsigned char *idata = 0;
 260 |     unsigned int channels;
 261 | 
 262 |     if (__loadPPM(file, &idata, w, h, &channels))
 263 |     {
 264 |         // pad 4th component
 265 |         int size = *w **h;
 266 |         // keep the original pointer
 267 |         unsigned char *idata_orig = idata;
 268 |         *data = (T *) malloc(sizeof(T) * size * 4);
 269 |         unsigned char *ptr = *data;
 270 | 
 271 |         for (int i=0; i<size; i++)
 272 |         {
 273 |             *ptr++ = *idata++;
 274 |             *ptr++ = *idata++;
 275 |             *ptr++ = *idata++;
 276 |             *ptr++ = 0;
 277 |         }
 278 | 
 279 |         free(idata_orig);
 280 |         return true;
 281 |     }
 282 |     else
 283 |     {
 284 |         free(idata);
 285 |         return false;
 286 |     }
 287 | }
 288 | 
 289 | inline bool
 290 | __savePPM(const char *file, unsigned char *data,
 291 |           unsigned int w, unsigned int h, unsigned int channels)
 292 | {
 293 |     assert(NULL != data);
 294 |     assert(w > 0);
 295 |     assert(h > 0);
 296 | 
 297 |     std::fstream fh(file, std::fstream::out | std::fstream::binary);
 298 | 
 299 |     if (fh.bad())
 300 |     {
 301 |         std::cerr << "__savePPM() : Opening file failed." << std::endl;
 302 |         return false;
 303 |     }
 304 | 
 305 |     if (channels == 1)
 306 |     {
 307 |         fh << "P5\n";
 308 |     }
 309 |     else if (channels == 3)
 310 |     {
 311 |         fh << "P6\n";
 312 |     }
 313 |     else
 314 |     {
 315 |         std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
 316 |         return false;
 317 |     }
 318 | 
 319 |     fh << w << "\n" << h << "\n" << 0xff << std::endl;
 320 | 
 321 |     for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
 322 |     {
 323 |         fh << data[i];
 324 |     }
 325 | 
 326 |     fh.flush();
 327 | 
 328 |     if (fh.bad())
 329 |     {
 330 |         std::cerr << "__savePPM() : Writing data failed." << std::endl;
 331 |         return false;
 332 |     }
 333 | 
 334 |     fh.close();
 335 | 
 336 |     return true;
 337 | }
 338 | 
 339 | template<class T>
 340 | inline bool
 341 | sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h)
 342 | {
 343 |     unsigned int size = w * h;
 344 |     unsigned char *idata =
 345 |         (unsigned char *) malloc(sizeof(unsigned char) * size);
 346 | 
 347 |     std::transform(data, data + size, idata, ConverterToUByte<T>());
 348 | 
 349 |     // write file
 350 |     bool result = __savePPM(file, idata, w, h, 1);
 351 | 
 352 |     // cleanup
 353 |     free(idata);
 354 | 
 355 |     return result;
 356 | }
 357 | 
 358 | inline bool
 359 | sdkSavePPM4ub(const char *file, unsigned char *data,
 360 |               unsigned int w, unsigned int h)
 361 | {
 362 |     // strip 4th component
 363 |     int size = w * h;
 364 |     unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3);
 365 |     unsigned char *ptr = ndata;
 366 | 
 367 |     for (int i=0; i<size; i++)
 368 |     {
 369 |         *ptr++ = *data++;
 370 |         *ptr++ = *data++;
 371 |         *ptr++ = *data++;
 372 |         data++;
 373 |     }
 374 | 
 375 |     bool result = __savePPM(file, ndata, w, h, 3);
 376 |     free(ndata);
 377 |     return result;
 378 | }
 379 | 
 380 | 
 381 | //////////////////////////////////////////////////////////////////////////////
 382 | //! Read file \filename and return the data
 383 | //! @return bool if reading the file succeeded, otherwise false
 384 | //! @param filename name of the source file
 385 | //! @param data  uninitialized pointer, returned initialized and pointing to
 386 | //!        the data read
 387 | //! @param len  number of data elements in data, -1 on error
 388 | //////////////////////////////////////////////////////////////////////////////
 389 | template<class T>
 390 | inline bool
 391 | sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
 392 | {
 393 |     // check input arguments
 394 |     assert(NULL != filename);
 395 |     assert(NULL != len);
 396 | 
 397 |     // intermediate storage for the data read
 398 |     std::vector<T>  data_read;
 399 | 
 400 |     // open file for reading
 401 |     FILE *fh = NULL;
 402 | 
 403 |     // check if filestream is valid
 404 |     if (FOPEN_FAIL(FOPEN(fh, filename, "r")))
 405 |     {
 406 |         printf("Unable to open input file: %s\n", filename);
 407 |         return false;
 408 |     }
 409 | 
 410 |     // read all data elements
 411 |     T token;
 412 | 
 413 |     while (!feof(fh))
 414 |     {
 415 |         fscanf(fh, "%f", &token);
 416 |         data_read.push_back(token);
 417 |     }
 418 | 
 419 |     // the last element is read twice
 420 |     data_read.pop_back();
 421 |     fclose(fh);
 422 | 
 423 |     // check if the given handle is already initialized
 424 |     if (NULL != *data)
 425 |     {
 426 |         if (*len != data_read.size())
 427 |         {
 428 |             std::cerr << "sdkReadFile() : Initialized memory given but "
 429 |                       << "size  mismatch with signal read "
 430 |                       << "(data read / data init = " << (unsigned int)data_read.size()
 431 |                       <<  " / " << *len << ")" << std::endl;
 432 | 
 433 |             return false;
 434 |         }
 435 |     }
 436 |     else
 437 |     {
 438 |         // allocate storage for the data read
 439 |         *data = (T *) malloc(sizeof(T) * data_read.size());
 440 |         // store signal size
 441 |         *len = static_cast<unsigned int>(data_read.size());
 442 |     }
 443 | 
 444 |     // copy data
 445 |     memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
 446 | 
 447 |     return true;
 448 | }
 449 | 
 450 | //////////////////////////////////////////////////////////////////////////////
 451 | //! Read file \filename and return the data
 452 | //! @return bool if reading the file succeeded, otherwise false
 453 | //! @param filename name of the source file
 454 | //! @param data  uninitialized pointer, returned initialized and pointing to
 455 | //!        the data read
 456 | //! @param len  number of data elements in data, -1 on error
 457 | //////////////////////////////////////////////////////////////////////////////
 458 | template<class T>
 459 | inline bool
 460 | sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose)
 461 | {
 462 |     // check input arguments
 463 |     assert(NULL != filename);
 464 |     assert(NULL != len);
 465 | 
 466 |     // open file for reading
 467 |     FILE *fh = fopen(filename, "rb");
 468 | 
 469 |     if (fh == NULL && verbose)
 470 |     {
 471 |         std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
 472 |         return false;
 473 |     }
 474 | 
 475 |     // check if the given handle is already initialized
 476 |     // allocate storage for the data read
 477 |     data[block_num] = (T *) malloc(block_size);
 478 | 
 479 |     // read all data elements
 480 |     fseek(fh, block_num * block_size, SEEK_SET);
 481 |     *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh);
 482 | 
 483 |     fclose(fh);
 484 | 
 485 |     return true;
 486 | }
 487 | 
 488 | //////////////////////////////////////////////////////////////////////////////
 489 | //! Write a data file \filename
 490 | //! @return true if writing the file succeeded, otherwise false
 491 | //! @param filename name of the source file
 492 | //! @param data  data to write
 493 | //! @param len  number of data elements in data, -1 on error
 494 | //! @param epsilon  epsilon for comparison
 495 | //////////////////////////////////////////////////////////////////////////////
 496 | template<class T, class S>
 497 | inline bool
 498 | sdkWriteFile(const char *filename, const T *data, unsigned int len,
 499 |              const S epsilon, bool verbose, bool append = false)
 500 | {
 501 |     assert(NULL != filename);
 502 |     assert(NULL != data);
 503 | 
 504 |     // open file for writing
 505 |     //    if (append) {
 506 |     std::fstream fh(filename, std::fstream::out | std::fstream::ate);
 507 | 
 508 |     if (verbose)
 509 |     {
 510 |         std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl;
 511 |     }
 512 | 
 513 |     /*    } else {
 514 |             std::fstream fh(filename, std::fstream::out);
 515 |             if (verbose) {
 516 |                 std::cerr << "sdkWriteFile() : Open file " << filename << " for write." << std::endl;
 517 |             }
 518 |         }
 519 |     */
 520 | 
 521 |     // check if filestream is valid
 522 |     if (! fh.good())
 523 |     {
 524 |         if (verbose)
 525 |         {
 526 |             std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
 527 |         }
 528 | 
 529 |         return false;
 530 |     }
 531 | 
 532 |     // first write epsilon
 533 |     fh << "# " << epsilon << "\n";
 534 | 
 535 |     // write data
 536 |     for (unsigned int i = 0; (i < len) && (fh.good()); ++i)
 537 |     {
 538 |         fh << data[i] << ' ';
 539 |     }
 540 | 
 541 |     // Check if writing succeeded
 542 |     if (! fh.good())
 543 |     {
 544 |         if (verbose)
 545 |         {
 546 |             std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
 547 |         }
 548 | 
 549 |         return false;
 550 |     }
 551 | 
 552 |     // file ends with nl
 553 |     fh << std::endl;
 554 | 
 555 |     return true;
 556 | }
 557 | 
 558 | //////////////////////////////////////////////////////////////////////////////
 559 | //! Compare two arrays of arbitrary type
 560 | //! @return  true if \a reference and \a data are identical, otherwise false
 561 | //! @param reference  timer_interface to the reference data / gold image
 562 | //! @param data       handle to the computed data
 563 | //! @param len        number of elements in reference and data
 564 | //! @param epsilon    epsilon to use for the comparison
 565 | //////////////////////////////////////////////////////////////////////////////
 566 | template<class T, class S>
 567 | inline bool
 568 | compareData(const T *reference, const T *data, const unsigned int len,
 569 |             const S epsilon, const float threshold)
 570 | {
 571 |     assert(epsilon >= 0);
 572 | 
 573 |     bool result = true;
 574 |     unsigned int error_count = 0;
 575 | 
 576 |     for (unsigned int i = 0; i < len; ++i)
 577 |     {
 578 |         float diff = (float)reference[i] - (float)data[i];
 579 |         bool comp = (diff <= epsilon) && (diff >= -epsilon);
 580 |         result &= comp;
 581 | 
 582 |         error_count += !comp;
 583 | 
 584 | #if 0
 585 | 
 586 |         if (! comp)
 587 |         {
 588 |             std::cerr << "ERROR, i = " << i << ",\t "
 589 |                       << reference[i] << " / "
 590 |                       << data[i]
 591 |                       << " (reference / data)\n";
 592 |         }
 593 | 
 594 | #endif
 595 |     }
 596 | 
 597 |     if (threshold == 0.0f)
 598 |     {
 599 |         return (result) ? true : false;
 600 |     }
 601 |     else
 602 |     {
 603 |         if (error_count)
 604 |         {
 605 |             printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
 606 |         }
 607 | 
 608 |         return (len*threshold > error_count) ? true : false;
 609 |     }
 610 | }
 611 | 
 612 | #ifndef __MIN_EPSILON_ERROR
 613 | #define __MIN_EPSILON_ERROR 1e-3f
 614 | #endif
 615 | 
 616 | //////////////////////////////////////////////////////////////////////////////
 617 | //! Compare two arrays of arbitrary type
 618 | //! @return  true if \a reference and \a data are identical, otherwise false
 619 | //! @param reference  handle to the reference data / gold image
 620 | //! @param data       handle to the computed data
 621 | //! @param len        number of elements in reference and data
 622 | //! @param epsilon    epsilon to use for the comparison
 623 | //! @param epsilon    threshold % of (# of bytes) for pass/fail
 624 | //////////////////////////////////////////////////////////////////////////////
 625 | template<class T, class S>
 626 | inline bool
 627 | compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len,
 628 |                             const S epsilon, const float threshold)
 629 | {
 630 |     assert(epsilon >= 0);
 631 | 
 632 |     // If we set epsilon to be 0, let's set a minimum threshold
 633 |     float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
 634 |     int error_count = 0;
 635 |     bool result = true;
 636 | 
 637 |     for (unsigned int i = 0; i < len; ++i)
 638 |     {
 639 |         float diff = fabs((float)reference[i] - (float)data[i]);
 640 |         bool comp = (diff < max_error);
 641 |         result &= comp;
 642 | 
 643 |         if (! comp)
 644 |         {
 645 |             error_count++;
 646 | #if 0
 647 | 
 648 |             if (error_count < 50)
 649 |             {
 650 |                 printf("\n    ERROR(epsilon=%4.3f), i=%d, (ref)0x%02x / (data)0x%02x / (diff)%d\n",
 651 |                        max_error, i,
 652 |                        *(unsigned int *)&reference[i],
 653 |                        *(unsigned int *)&data[i],
 654 |                        (unsigned int)diff);
 655 |             }
 656 | 
 657 | #endif
 658 |         }
 659 |     }
 660 | 
 661 |     if (threshold == 0.0f)
 662 |     {
 663 |         if (error_count)
 664 |         {
 665 |             printf("total # of errors = %d\n", error_count);
 666 |         }
 667 | 
 668 |         return (error_count == 0) ? true : false;
 669 |     }
 670 |     else
 671 |     {
 672 |         if (error_count)
 673 |         {
 674 |             printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
 675 |         }
 676 | 
 677 |         return ((len*threshold > error_count) ? true : false);
 678 |     }
 679 | }
 680 | 
 681 | inline
 682 | void sdkDumpBin(void *data, unsigned int bytes, const char *filename)
 683 | {
 684 |     printf("sdkDumpBin: <%s>\n", filename);
 685 |     FILE *fp;
 686 |     FOPEN(fp, filename, "wb");
 687 |     fwrite(data, bytes, 1, fp);
 688 |     fflush(fp);
 689 |     fclose(fp);
 690 | }
 691 | 
 692 | inline
 693 | bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
 694 | {
 695 |     unsigned int *src_buffer, *ref_buffer;
 696 |     FILE *src_fp = NULL, *ref_fp = NULL;
 697 | 
 698 |     unsigned long error_count = 0;
 699 |     size_t fsize = 0;
 700 | 
 701 |     if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
 702 |     {
 703 |         printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n", src_file);
 704 |         error_count++;
 705 |     }
 706 | 
 707 |     char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
 708 | 
 709 |     if (ref_file_path == NULL)
 710 |     {
 711 |         printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n", ref_file, exec_path);
 712 |         printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
 713 |         printf("Aborting comparison!\n");
 714 |         printf("  FAILED\n");
 715 |         error_count++;
 716 | 
 717 |         if (src_fp)
 718 |         {
 719 |             fclose(src_fp);
 720 |         }
 721 | 
 722 |         if (ref_fp)
 723 |         {
 724 |             fclose(ref_fp);
 725 |         }
 726 |     }
 727 |     else
 728 |     {
 729 |         if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
 730 |         {
 731 |             printf("compareBin2Bin <unsigned int>  unable to open ref_file: %s\n", ref_file_path);
 732 |             error_count++;
 733 |         }
 734 | 
 735 |         if (src_fp && ref_fp)
 736 |         {
 737 |             src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
 738 |             ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
 739 | 
 740 |             fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
 741 |             fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
 742 | 
 743 |             printf("> compareBin2Bin <unsigned int> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
 744 |             printf("   src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
 745 |             printf("   ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
 746 | 
 747 |             if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
 748 |             {
 749 |                 error_count++;
 750 |             }
 751 | 
 752 |             fclose(src_fp);
 753 |             fclose(ref_fp);
 754 | 
 755 |             free(src_buffer);
 756 |             free(ref_buffer);
 757 |         }
 758 |         else
 759 |         {
 760 |             if (src_fp)
 761 |             {
 762 |                 fclose(src_fp);
 763 |             }
 764 | 
 765 |             if (ref_fp)
 766 |             {
 767 |                 fclose(ref_fp);
 768 |             }
 769 |         }
 770 |     }
 771 | 
 772 |     if (error_count == 0)
 773 |     {
 774 |         printf("  OK\n");
 775 |     }
 776 |     else
 777 |     {
 778 |         printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
 779 |     }
 780 | 
 781 |     return (error_count == 0);  // returns true if all pixels pass
 782 | }
 783 | 
 784 | inline
 785 | bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
 786 | {
 787 |     float *src_buffer, *ref_buffer;
 788 |     FILE *src_fp = NULL, *ref_fp = NULL;
 789 |     size_t fsize = 0;
 790 | 
 791 |     unsigned long error_count = 0;
 792 | 
 793 |     if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
 794 |     {
 795 |         printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
 796 |         error_count = 1;
 797 |     }
 798 | 
 799 |     char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
 800 | 
 801 |     if (ref_file_path == NULL)
 802 |     {
 803 |         printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file, exec_path);
 804 |         printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", exec_path);
 805 |         printf("Aborting comparison!\n");
 806 |         printf("  FAILED\n");
 807 |         error_count++;
 808 | 
 809 |         if (src_fp)
 810 |         {
 811 |             fclose(src_fp);
 812 |         }
 813 | 
 814 |         if (ref_fp)
 815 |         {
 816 |             fclose(ref_fp);
 817 |         }
 818 |     }
 819 |     else
 820 |     {
 821 |         if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
 822 |         {
 823 |             printf("compareBin2Bin <float> unable to open ref_file: %s\n", ref_file_path);
 824 |             error_count = 1;
 825 |         }
 826 | 
 827 |         if (src_fp && ref_fp)
 828 |         {
 829 |             src_buffer = (float *)malloc(nelements*sizeof(float));
 830 |             ref_buffer = (float *)malloc(nelements*sizeof(float));
 831 | 
 832 |             fsize = fread(src_buffer, nelements, sizeof(float), src_fp);
 833 |             fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp);
 834 | 
 835 |             printf("> compareBin2Bin <float> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
 836 |             printf("   src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
 837 |             printf("   ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
 838 | 
 839 |             if (!compareDataAsFloatThreshold<float, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
 840 |             {
 841 |                 error_count++;
 842 |             }
 843 | 
 844 |             fclose(src_fp);
 845 |             fclose(ref_fp);
 846 | 
 847 |             free(src_buffer);
 848 |             free(ref_buffer);
 849 |         }
 850 |         else
 851 |         {
 852 |             if (src_fp)
 853 |             {
 854 |                 fclose(src_fp);
 855 |             }
 856 | 
 857 |             if (ref_fp)
 858 |             {
 859 |                 fclose(ref_fp);
 860 |             }
 861 |         }
 862 |     }
 863 | 
 864 |     if (error_count == 0)
 865 |     {
 866 |         printf("  OK\n");
 867 |     }
 868 |     else
 869 |     {
 870 |         printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
 871 |     }
 872 | 
 873 |     return (error_count == 0);  // returns true if all pixels pass
 874 | }
 875 | 
 876 | inline bool
 877 | sdkCompareL2fe(const float *reference, const float *data,
 878 |                const unsigned int len, const float epsilon)
 879 | {
 880 |     assert(epsilon >= 0);
 881 | 
 882 |     float error = 0;
 883 |     float ref = 0;
 884 | 
 885 |     for (unsigned int i = 0; i < len; ++i)
 886 |     {
 887 | 
 888 |         float diff = reference[i] - data[i];
 889 |         error += diff * diff;
 890 |         ref += reference[i] * reference[i];
 891 |     }
 892 | 
 893 |     float normRef = sqrtf(ref);
 894 | 
 895 |     if (fabs(ref) < 1e-7)
 896 |     {
 897 | #ifdef _DEBUG
 898 |         std::cerr << "ERROR, reference l2-norm is 0\n";
 899 | #endif
 900 |         return false;
 901 |     }
 902 | 
 903 |     float normError = sqrtf(error);
 904 |     error = normError / normRef;
 905 |     bool result = error < epsilon;
 906 | #ifdef _DEBUG
 907 | 
 908 |     if (! result)
 909 |     {
 910 |         std::cerr << "ERROR, l2-norm error "
 911 |                   << error << " is greater than epsilon " << epsilon << "\n";
 912 |     }
 913 | 
 914 | #endif
 915 | 
 916 |     return result;
 917 | }
 918 | 
 919 | inline bool
 920 | sdkLoadPPMub(const char *file, unsigned char **data,
 921 |              unsigned int *w,unsigned int *h)
 922 | {
 923 |     unsigned int channels;
 924 |     return __loadPPM(file, data, w, h, &channels);
 925 | }
 926 | 
 927 | inline bool
 928 | sdkLoadPPM4ub(const char *file, unsigned char **data,
 929 |               unsigned int *w, unsigned int *h)
 930 | {
 931 |     unsigned char *idata = 0;
 932 |     unsigned int channels;
 933 | 
 934 |     if (__loadPPM(file, &idata, w, h, &channels))
 935 |     {
 936 |         // pad 4th component
 937 |         int size = *w **h;
 938 |         // keep the original pointer
 939 |         unsigned char *idata_orig = idata;
 940 |         *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4);
 941 |         unsigned char *ptr = *data;
 942 | 
 943 |         for (int i=0; i<size; i++)
 944 |         {
 945 |             *ptr++ = *idata++;
 946 |             *ptr++ = *idata++;
 947 |             *ptr++ = *idata++;
 948 |             *ptr++ = 0;
 949 |         }
 950 | 
 951 |         free(idata_orig);
 952 |         return true;
 953 |     }
 954 |     else
 955 |     {
 956 |         free(idata);
 957 |         return false;
 958 |     }
 959 | }
 960 | 
 961 | 
 962 | inline bool
 963 | sdkComparePPM(const char *src_file, const char *ref_file,
 964 |               const float epsilon, const float threshold, bool verboseErrors)
 965 | {
 966 |     unsigned char *src_data, *ref_data;
 967 |     unsigned long error_count = 0;
 968 |     unsigned int ref_width, ref_height;
 969 |     unsigned int src_width, src_height;
 970 | 
 971 |     if (src_file == NULL || ref_file == NULL)
 972 |     {
 973 |         if (verboseErrors)
 974 |         {
 975 |             std::cerr << "PPMvsPPM: src_file or ref_file is NULL.  Aborting comparison\n";
 976 |         }
 977 | 
 978 |         return false;
 979 |     }
 980 | 
 981 |     if (verboseErrors)
 982 |     {
 983 |         std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
 984 |         std::cerr << ">         (b)reference: <" << ref_file << ">\n";
 985 |     }
 986 | 
 987 | 
 988 |     if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true)
 989 |     {
 990 |         if (verboseErrors)
 991 |         {
 992 |             std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n";
 993 |         }
 994 | 
 995 |         return false;
 996 |     }
 997 | 
 998 |     if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true)
 999 |     {
1000 |         std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n";
1001 |         return false;
1002 |     }
1003 | 
1004 |     if (src_height != ref_height || src_width != ref_width)
1005 |     {
1006 |         if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width <<
1007 |                                          "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
1008 |     }
1009 | 
1010 |     if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width <<
1011 |                                      "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
1012 | 
1013 |     if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false)
1014 |     {
1015 |         error_count=1;
1016 |     }
1017 | 
1018 |     if (error_count == 0)
1019 |     {
1020 |         if (verboseErrors)
1021 |         {
1022 |             std::cerr << "    OK\n\n";
1023 |         }
1024 |     }
1025 |     else
1026 |     {
1027 |         if (verboseErrors)
1028 |         {
1029 |             std::cerr << "    FAILURE!  "<<error_count<<" errors...\n\n";
1030 |         }
1031 |     }
1032 | 
1033 |     return (error_count == 0)? true : false;  // returns true if all pixels pass
1034 | }
1035 | 
1036 | inline bool
1037 | sdkComparePGM(const char *src_file, const char *ref_file,
1038 |               const float epsilon, const float threshold, bool verboseErrors)
1039 | {
1040 |     unsigned char *src_data = 0, *ref_data = 0;
1041 |     unsigned long error_count = 0;
1042 |     unsigned int ref_width, ref_height;
1043 |     unsigned int src_width, src_height;
1044 | 
1045 |     if (src_file == NULL || ref_file == NULL)
1046 |     {
1047 |         if (verboseErrors)
1048 |         {
1049 |             std::cerr << "PGMvsPGM: src_file or ref_file is NULL.  Aborting comparison\n";
1050 |         }
1051 | 
1052 |         return false;
1053 |     }
1054 | 
1055 |     if (verboseErrors)
1056 |     {
1057 |         std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
1058 |         std::cerr << ">         (b)reference: <" << ref_file << ">\n";
1059 |     }
1060 | 
1061 | 
1062 |     if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true)
1063 |     {
1064 |         if (verboseErrors)
1065 |         {
1066 |             std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n";
1067 |         }
1068 | 
1069 |         return false;
1070 |     }
1071 | 
1072 |     if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true)
1073 |     {
1074 |         std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n";
1075 |         return false;
1076 |     }
1077 | 
1078 |     if (src_height != ref_height || src_width != ref_width)
1079 |     {
1080 |         if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width <<
1081 |                                          "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
1082 |     }
1083 | 
1084 |     if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width <<
1085 |                                      "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
1086 | 
1087 |     if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false)
1088 |     {
1089 |         error_count=1;
1090 |     }
1091 | 
1092 |     if (error_count == 0)
1093 |     {
1094 |         if (verboseErrors)
1095 |         {
1096 |             std::cerr << "    OK\n\n";
1097 |         }
1098 |     }
1099 |     else
1100 |     {
1101 |         if (verboseErrors)
1102 |         {
1103 |             std::cerr << "    FAILURE!  "<<error_count<<" errors...\n\n";
1104 |         }
1105 |     }
1106 | 
1107 |     return (error_count == 0)? true : false;  // returns true if all pixels pass
1108 | }
1109 | 
1110 | #endif // HELPER_IMAGE_H
1111 | 


--------------------------------------------------------------------------------
/common/helper_string.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | // These are helper functions for the SDK samples (string parsing, timers, etc)
 13 | #ifndef STRING_HELPER_H
 14 | #define STRING_HELPER_H
 15 | 
 16 | #include <stdio.h>
 17 | #include <stdlib.h>
 18 | #include <fstream>
 19 | #include <string>
 20 | 
 21 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 22 | #ifndef _CRT_SECURE_NO_DEPRECATE
 23 | #define _CRT_SECURE_NO_DEPRECATE
 24 | #endif
 25 | #ifndef STRCASECMP
 26 | #define STRCASECMP  _stricmp
 27 | #endif
 28 | #ifndef STRNCASECMP
 29 | #define STRNCASECMP _strnicmp
 30 | #endif
 31 | #ifndef STRCPY
 32 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
 33 | #endif
 34 | 
 35 | #ifndef FOPEN
 36 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
 37 | #endif
 38 | #ifndef FOPEN_FAIL
 39 | #define FOPEN_FAIL(result) (result != 0)
 40 | #endif
 41 | #ifndef SSCANF
 42 | #define SSCANF sscanf_s
 43 | #endif
 44 | #ifndef SPRINTF
 45 | #define SPRINTF sprintf_s
 46 | #endif
 47 | #else // Linux Includes
 48 | #include <string.h>
 49 | #include <strings.h>
 50 | 
 51 | #ifndef STRCASECMP
 52 | #define STRCASECMP  strcasecmp
 53 | #endif
 54 | #ifndef STRNCASECMP
 55 | #define STRNCASECMP strncasecmp
 56 | #endif
 57 | #ifndef STRCPY
 58 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
 59 | #endif
 60 | 
 61 | #ifndef FOPEN
 62 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
 63 | #endif
 64 | #ifndef FOPEN_FAIL
 65 | #define FOPEN_FAIL(result) (result == NULL)
 66 | #endif
 67 | #ifndef SSCANF
 68 | #define SSCANF sscanf
 69 | #endif
 70 | #ifndef SPRINTF
 71 | #define SPRINTF sprintf
 72 | #endif
 73 | #endif
 74 | 
 75 | #ifndef EXIT_WAIVED
 76 | #define EXIT_WAIVED 2
 77 | #endif
 78 | 
 79 | // CUDA Utility Helper Functions
 80 | inline int stringRemoveDelimiter(char delimiter, const char *string)
 81 | {
 82 |     int string_start = 0;
 83 | 
 84 |     while (string[string_start] == delimiter)
 85 |     {
 86 |         string_start++;
 87 |     }
 88 | 
 89 |     if (string_start >= (int)strlen(string)-1)
 90 |     {
 91 |         return 0;
 92 |     }
 93 | 
 94 |     return string_start;
 95 | }
 96 | 
 97 | inline int getFileExtension(char *filename, char **extension)
 98 | {
 99 |     int string_length = (int)strlen(filename);
100 | 
101 |     while (filename[string_length--] != '.')
102 |     {
103 |         if (string_length == 0)
104 |             break;
105 |     }
106 | 
107 |     if (string_length > 0) string_length += 2;
108 | 
109 |     if (string_length == 0)
110 |         *extension = NULL;
111 |     else
112 |         *extension = &filename[string_length];
113 | 
114 |     return string_length;
115 | }
116 | 
117 | 
118 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
119 | {
120 |     bool bFound = false;
121 | 
122 |     if (argc >= 1)
123 |     {
124 |         for (int i=1; i < argc; i++)
125 |         {
126 |             int string_start = stringRemoveDelimiter('-', argv[i]);
127 |             const char *string_argv = &argv[i][string_start];
128 | 
129 |             const char *equal_pos = strchr(string_argv, '=');
130 |             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
131 | 
132 |             int length = (int)strlen(string_ref);
133 | 
134 |             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
135 |             {
136 |                 bFound = true;
137 |                 continue;
138 |             }
139 |         }
140 |     }
141 | 
142 |     return bFound;
143 | }
144 | 
145 | // This function wraps the CUDA Driver API into a template function
146 | template <class T>
147 | inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
148 | {
149 |     bool bFound = false;
150 | 
151 |     if (argc >= 1)
152 |     {
153 |         for (int i=1; i < argc; i++)
154 |         {
155 |             int string_start = stringRemoveDelimiter('-', argv[i]);
156 |             const char *string_argv = &argv[i][string_start];
157 |             int length = (int)strlen(string_ref);
158 | 
159 |             if (!STRNCASECMP(string_argv, string_ref, length))
160 |             {
161 |                 if (length+1 <= (int)strlen(string_argv))
162 |                 {
163 |                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
164 |                     *value = (T)atoi(&string_argv[length + auto_inc]);
165 |                 }
166 | 
167 |                 bFound = true;
168 |                 i=argc;
169 |             }
170 |         }
171 |     }
172 | 
173 |     return bFound;
174 | }
175 | 
176 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
177 | {
178 |     bool bFound = false;
179 |     int value = -1;
180 | 
181 |     if (argc >= 1)
182 |     {
183 |         for (int i=1; i < argc; i++)
184 |         {
185 |             int string_start = stringRemoveDelimiter('-', argv[i]);
186 |             const char *string_argv = &argv[i][string_start];
187 |             int length = (int)strlen(string_ref);
188 | 
189 |             if (!STRNCASECMP(string_argv, string_ref, length))
190 |             {
191 |                 if (length+1 <= (int)strlen(string_argv))
192 |                 {
193 |                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
194 |                     value = atoi(&string_argv[length + auto_inc]);
195 |                 }
196 |                 else
197 |                 {
198 |                     value = 0;
199 |                 }
200 | 
201 |                 bFound = true;
202 |                 continue;
203 |             }
204 |         }
205 |     }
206 | 
207 |     if (bFound)
208 |     {
209 |         return value;
210 |     }
211 |     else
212 |     {
213 |         return 0;
214 |     }
215 | }
216 | 
217 | inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
218 | {
219 |     bool bFound = false;
220 |     float value = -1;
221 | 
222 |     if (argc >= 1)
223 |     {
224 |         for (int i=1; i < argc; i++)
225 |         {
226 |             int string_start = stringRemoveDelimiter('-', argv[i]);
227 |             const char *string_argv = &argv[i][string_start];
228 |             int length = (int)strlen(string_ref);
229 | 
230 |             if (!STRNCASECMP(string_argv, string_ref, length))
231 |             {
232 |                 if (length+1 <= (int)strlen(string_argv))
233 |                 {
234 |                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
235 |                     value = (float)atof(&string_argv[length + auto_inc]);
236 |                 }
237 |                 else
238 |                 {
239 |                     value = 0.f;
240 |                 }
241 | 
242 |                 bFound = true;
243 |                 continue;
244 |             }
245 |         }
246 |     }
247 | 
248 |     if (bFound)
249 |     {
250 |         return value;
251 |     }
252 |     else
253 |     {
254 |         return 0;
255 |     }
256 | }
257 | 
258 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
259 |                                      const char *string_ref, char **string_retval)
260 | {
261 |     bool bFound = false;
262 | 
263 |     if (argc >= 1)
264 |     {
265 |         for (int i=1; i < argc; i++)
266 |         {
267 |             int string_start = stringRemoveDelimiter('-', argv[i]);
268 |             char *string_argv = (char *)&argv[i][string_start];
269 |             int length = (int)strlen(string_ref);
270 | 
271 |             if (!STRNCASECMP(string_argv, string_ref, length))
272 |             {
273 |                 *string_retval = &string_argv[length+1];
274 |                 bFound = true;
275 |                 continue;
276 |             }
277 |         }
278 |     }
279 | 
280 |     if (!bFound)
281 |     {
282 |         *string_retval = NULL;
283 |     }
284 | 
285 |     return bFound;
286 | }
287 | 
288 | //////////////////////////////////////////////////////////////////////////////
289 | //! Find the path for a file assuming that
290 | //! files are found in the searchPath.
291 | //!
292 | //! @return the path if succeeded, otherwise 0
293 | //! @param filename         name of the file
294 | //! @param executable_path  optional absolute path of the executable
295 | //////////////////////////////////////////////////////////////////////////////
296 | inline char *sdkFindFilePath(const char *filename, const char *executable_path)
297 | {
298 |     // <executable_name> defines a variable that is replaced with the name of the executable
299 | 
300 |     // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
301 |     // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
302 |     const char *searchPath[] =
303 |     {
304 |         "./",                                       // same dir
305 |         "./common/",                                // "/common/" subdir
306 |         "./common/data/",                           // "/common/data/" subdir
307 |         "./data/",                                  // "/data/" subdir
308 |         "./src/",                                   // "/src/" subdir
309 |         "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
310 |         "./inc/",                                   // "/inc/" subdir
311 |         "./0_Simple/",                              // "/0_Simple/" subdir
312 |         "./1_Utilities/",                           // "/1_Utilities/" subdir
313 |         "./2_Graphics/",                            // "/2_Graphics/" subdir
314 |         "./3_Imaging/",                             // "/3_Imaging/" subdir
315 |         "./4_Financial/",                           // "/4_Financial/" subdir
316 |         "./5_Simulations/",                         // "/5_Simulations/" subdir
317 |         "./6_Advanced/",                            // "/6_Advanced/" subdir
318 |         "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
319 |         "./samples/",                               // "/samples/" subdir
320 | 
321 |         "../",                                      // up 1 in tree
322 |         "../common/",                               // up 1 in tree, "/common/" subdir
323 |         "../common/data/",                          // up 1 in tree, "/common/data/" subdir
324 |         "../data/",                                 // up 1 in tree, "/data/" subdir
325 |         "../src/",                                  // up 1 in tree, "/src/" subdir
326 |         "../inc/",                                  // up 1 in tree, "/inc/" subdir
327 | 
328 |         "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
329 |         "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
330 |         "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
331 |         "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
332 |         "../4_Financial/<executable_name>/data/",    // up 1 in tree, "/4_Financial/<executable_name>/" subdir
333 |         "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
334 |         "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
335 |         "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
336 |         "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
337 |         "../../",                                        // up 2 in tree
338 |         "../../common/",                                 // up 2 in tree, "/common/" subdir
339 |         "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
340 |         "../../data/",                                   // up 2 in tree, "/data/" subdir
341 |         "../../src/",                                    // up 2 in tree, "/src/" subdir
342 |         "../../inc/",                                    // up 2 in tree, "/inc/" subdir
343 |         "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
344 |         "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
345 |         "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
346 |         "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
347 |         "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
348 |         "../../4_Financial/<executable_name>/data/",     // up 2 in tree, "/4_Financial/<executable_name>/" subdir
349 |         "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
350 |         "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
351 |         "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
352 |         "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
353 |         "../../../",                                        // up 3 in tree
354 |         "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
355 |         "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
356 |         "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
357 |         "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
358 |         "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
359 |         "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
360 |         "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
361 |         "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
362 |         "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
363 |         "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
364 |         "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
365 |         "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
366 |         "../../../4_Financial/<executable_name>/data/",     // up 3 in tree, "/4_Financial/<executable_name>/" subdir
367 |         "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
368 |         "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
369 |         "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
370 |         "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
371 |         "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
372 |         "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
373 |         "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
374 |         "../../../../",                                // up 4 in tree
375 |         "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
376 |         "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
377 |         "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
378 |         "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
379 |         "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
380 |         "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
381 |         "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
382 |         "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
383 |         "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
384 |         "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
385 |         "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
386 |         "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
387 |         "../../../../4_Financial/<executable_name>/data/",  // up 4 in tree, "/4_Financial/<executable_name>/" subdir
388 |         "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
389 |         "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
390 |         "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
391 |         "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
392 |         "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
393 |         "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
394 |         "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
395 |         "../../../../../",                                // up 5 in tree
396 |         "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
397 |         "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
398 |         "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
399 |         "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
400 |         "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
401 |         "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
402 |         "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
403 |         "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
404 |         "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
405 |         "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
406 |         "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
407 |         "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
408 |         "../../../../../4_Financial/<executable_name>/data/",  // up 5 in tree, "/4_Financial/<executable_name>/" subdir
409 |         "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
410 |         "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
411 |         "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
412 |         "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
413 |         "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
414 |         "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
415 |     };
416 | 
417 |     // Extract the executable name
418 |     std::string executable_name;
419 | 
420 |     if (executable_path != 0)
421 |     {
422 |         executable_name = std::string(executable_path);
423 | 
424 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
425 |         // Windows path delimiter
426 |         size_t delimiter_pos = executable_name.find_last_of('\\');
427 |         executable_name.erase(0, delimiter_pos + 1);
428 | 
429 |         if (executable_name.rfind(".exe") != std::string::npos)
430 |         {
431 |             // we strip .exe, only if the .exe is found
432 |             executable_name.resize(executable_name.size() - 4);
433 |         }
434 | 
435 | #else
436 |         // Linux & OSX path delimiter
437 |         size_t delimiter_pos = executable_name.find_last_of('/');
438 |         executable_name.erase(0,delimiter_pos+1);
439 | #endif
440 |     }
441 | 
442 |     // Loop over all search paths and return the first hit
443 |     for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
444 |     {
445 |         std::string path(searchPath[i]);
446 |         size_t executable_name_pos = path.find("<executable_name>");
447 | 
448 |         // If there is executable_name variable in the searchPath
449 |         // replace it with the value
450 |         if (executable_name_pos != std::string::npos)
451 |         {
452 |             if (executable_path != 0)
453 |             {
454 |                 path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
455 |             }
456 |             else
457 |             {
458 |                 // Skip this path entry if no executable argument is given
459 |                 continue;
460 |             }
461 |         }
462 | 
463 | #ifdef _DEBUG
464 |         printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
465 | #endif
466 | 
467 |         // Test if the file exists
468 |         path.append(filename);
469 |         FILE *fp;
470 |         FOPEN(fp, path.c_str(), "rb");
471 | 
472 |         if (fp != NULL)
473 |         {
474 |             fclose(fp);
475 |             // File found
476 |             // returning an allocated array here for backwards compatibility reasons
477 |             char *file_path = (char *) malloc(path.length() + 1);
478 |             STRCPY(file_path, path.length() + 1, path.c_str());
479 |             return file_path;
480 |         }
481 | 
482 |         if (fp)
483 |         {
484 |             fclose(fp);
485 |         }
486 |     }
487 | 
488 |     // File not found
489 |     return 0;
490 | }
491 | 
492 | #endif
493 | 


--------------------------------------------------------------------------------
/common/helper_timer.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | // Helper Timing Functions
 13 | #ifndef HELPER_TIMER_H
 14 | #define HELPER_TIMER_H
 15 | 
 16 | #ifndef EXIT_WAIVED
 17 | #define EXIT_WAIVED 2
 18 | #endif
 19 | 
 20 | // includes, system
 21 | #include <vector>
 22 | 
 23 | // includes, project
 24 | #include <exception.h>
 25 | 
 26 | // Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions
 27 | // But rather in a self contained class interface
 28 | class StopWatchInterface
 29 | {
 30 |     public:
 31 |         StopWatchInterface() {};
 32 |         virtual ~StopWatchInterface() {};
 33 | 
 34 |     public:
 35 |         //! Start time measurement
 36 |         virtual void start() = 0;
 37 | 
 38 |         //! Stop time measurement
 39 |         virtual void stop() = 0;
 40 | 
 41 |         //! Reset time counters to zero
 42 |         virtual void reset() = 0;
 43 | 
 44 |         //! Time in msec. after start. If the stop watch is still running (i.e. there
 45 |         //! was no call to stop()) then the elapsed time is returned, otherwise the
 46 |         //! time between the last start() and stop call is returned
 47 |         virtual float getTime() = 0;
 48 | 
 49 |         //! Mean time to date based on the number of times the stopwatch has been
 50 |         //! _stopped_ (ie finished sessions) and the current total time
 51 |         virtual float getAverageTime() = 0;
 52 | };
 53 | 
 54 | 
 55 | //////////////////////////////////////////////////////////////////
 56 | // Begin Stopwatch timer class definitions for all OS platforms //
 57 | //////////////////////////////////////////////////////////////////
 58 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 59 | // includes, system
 60 | #define WINDOWS_LEAN_AND_MEAN
 61 | #include <windows.h>
 62 | #undef min
 63 | #undef max
 64 | 
 65 | //! Windows specific implementation of StopWatch
 66 | class StopWatchWin : public StopWatchInterface
 67 | {
 68 |     public:
 69 |         //! Constructor, default
 70 |         StopWatchWin() :
 71 |             start_time(),     end_time(),
 72 |             diff_time(0.0f),  total_time(0.0f),
 73 |             running(false), clock_sessions(0), freq(0), freq_set(false)
 74 |         {
 75 |             if (! freq_set)
 76 |             {
 77 |                 // helper variable
 78 |                 LARGE_INTEGER temp;
 79 | 
 80 |                 // get the tick frequency from the OS
 81 |                 QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
 82 | 
 83 |                 // convert to type in which it is needed
 84 |                 freq = ((double) temp.QuadPart) / 1000.0;
 85 | 
 86 |                 // rememeber query
 87 |                 freq_set = true;
 88 |             }
 89 |         };
 90 | 
 91 |         // Destructor
 92 |         ~StopWatchWin() { };
 93 | 
 94 |     public:
 95 |         //! Start time measurement
 96 |         inline void start();
 97 | 
 98 |         //! Stop time measurement
 99 |         inline void stop();
100 | 
101 |         //! Reset time counters to zero
102 |         inline void reset();
103 | 
104 |         //! Time in msec. after start. If the stop watch is still running (i.e. there
105 |         //! was no call to stop()) then the elapsed time is returned, otherwise the
106 |         //! time between the last start() and stop call is returned
107 |         inline float getTime();
108 | 
109 |         //! Mean time to date based on the number of times the stopwatch has been
110 |         //! _stopped_ (ie finished sessions) and the current total time
111 |         inline float getAverageTime();
112 | 
113 |     private:
114 |         // member variables
115 | 
116 |         //! Start of measurement
117 |         LARGE_INTEGER  start_time;
118 |         //! End of measurement
119 |         LARGE_INTEGER  end_time;
120 | 
121 |         //! Time difference between the last start and stop
122 |         float  diff_time;
123 | 
124 |         //! TOTAL time difference between starts and stops
125 |         float  total_time;
126 | 
127 |         //! flag if the stop watch is running
128 |         bool running;
129 | 
130 |         //! Number of times clock has been started
131 |         //! and stopped to allow averaging
132 |         int clock_sessions;
133 | 
134 |         //! tick frequency
135 |         double  freq;
136 | 
137 |         //! flag if the frequency has been set
138 |         bool  freq_set;
139 | };
140 | 
141 | // functions, inlined
142 | 
143 | ////////////////////////////////////////////////////////////////////////////////
144 | //! Start time measurement
145 | ////////////////////////////////////////////////////////////////////////////////
146 | inline void
147 | StopWatchWin::start()
148 | {
149 |     QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
150 |     running = true;
151 | }
152 | 
153 | ////////////////////////////////////////////////////////////////////////////////
154 | //! Stop time measurement and increment add to the current diff_time summation
155 | //! variable. Also increment the number of times this clock has been run.
156 | ////////////////////////////////////////////////////////////////////////////////
157 | inline void
158 | StopWatchWin::stop()
159 | {
160 |     QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
161 |     diff_time = (float)
162 |                 (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
163 | 
164 |     total_time += diff_time;
165 |     clock_sessions++;
166 |     running = false;
167 | }
168 | 
169 | ////////////////////////////////////////////////////////////////////////////////
170 | //! Reset the timer to 0. Does not change the timer running state but does
171 | //! recapture this point in time as the current start time if it is running.
172 | ////////////////////////////////////////////////////////////////////////////////
173 | inline void
174 | StopWatchWin::reset()
175 | {
176 |     diff_time = 0;
177 |     total_time = 0;
178 |     clock_sessions = 0;
179 | 
180 |     if (running)
181 |     {
182 |         QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
183 |     }
184 | }
185 | 
186 | 
187 | ////////////////////////////////////////////////////////////////////////////////
188 | //! Time in msec. after start. If the stop watch is still running (i.e. there
189 | //! was no call to stop()) then the elapsed time is returned added to the
190 | //! current diff_time sum, otherwise the current summed time difference alone
191 | //! is returned.
192 | ////////////////////////////////////////////////////////////////////////////////
193 | inline float
194 | StopWatchWin::getTime()
195 | {
196 |     // Return the TOTAL time to date
197 |     float retval = total_time;
198 | 
199 |     if (running)
200 |     {
201 |         LARGE_INTEGER temp;
202 |         QueryPerformanceCounter((LARGE_INTEGER *) &temp);
203 |         retval += (float)
204 |                   (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
205 |     }
206 | 
207 |     return retval;
208 | }
209 | 
210 | ////////////////////////////////////////////////////////////////////////////////
211 | //! Time in msec. for a single run based on the total number of COMPLETED runs
212 | //! and the total time.
213 | ////////////////////////////////////////////////////////////////////////////////
214 | inline float
215 | StopWatchWin::getAverageTime()
216 | {
217 |     return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
218 | }
219 | #else
220 | // Declarations for Stopwatch on Linux and Mac OSX
221 | // includes, system
222 | #include <ctime>
223 | #include <sys/time.h>
224 | 
225 | //! Windows specific implementation of StopWatch
226 | class StopWatchLinux : public StopWatchInterface
227 | {
228 |     public:
229 |         //! Constructor, default
230 |         StopWatchLinux() :
231 |             start_time(), diff_time(0.0), total_time(0.0),
232 |             running(false), clock_sessions(0)
233 |         { };
234 | 
235 |         // Destructor
236 |         virtual ~StopWatchLinux()
237 |         { };
238 | 
239 |     public:
240 |         //! Start time measurement
241 |         inline void start();
242 | 
243 |         //! Stop time measurement
244 |         inline void stop();
245 | 
246 |         //! Reset time counters to zero
247 |         inline void reset();
248 | 
249 |         //! Time in msec. after start. If the stop watch is still running (i.e. there
250 |         //! was no call to stop()) then the elapsed time is returned, otherwise the
251 |         //! time between the last start() and stop call is returned
252 |         inline float getTime();
253 | 
254 |         //! Mean time to date based on the number of times the stopwatch has been
255 |         //! _stopped_ (ie finished sessions) and the current total time
256 |         inline float getAverageTime();
257 | 
258 |     private:
259 | 
260 |         // helper functions
261 | 
262 |         //! Get difference between start time and current time
263 |         inline float getDiffTime();
264 | 
265 |     private:
266 | 
267 |         // member variables
268 | 
269 |         //! Start of measurement
270 |         struct timeval  start_time;
271 | 
272 |         //! Time difference between the last start and stop
273 |         float  diff_time;
274 | 
275 |         //! TOTAL time difference between starts and stops
276 |         float  total_time;
277 | 
278 |         //! flag if the stop watch is running
279 |         bool running;
280 | 
281 |         //! Number of times clock has been started
282 |         //! and stopped to allow averaging
283 |         int clock_sessions;
284 | };
285 | 
286 | // functions, inlined
287 | 
288 | ////////////////////////////////////////////////////////////////////////////////
289 | //! Start time measurement
290 | ////////////////////////////////////////////////////////////////////////////////
291 | inline void
292 | StopWatchLinux::start()
293 | {
294 |     gettimeofday(&start_time, 0);
295 |     running = true;
296 | }
297 | 
298 | ////////////////////////////////////////////////////////////////////////////////
299 | //! Stop time measurement and increment add to the current diff_time summation
300 | //! variable. Also increment the number of times this clock has been run.
301 | ////////////////////////////////////////////////////////////////////////////////
302 | inline void
303 | StopWatchLinux::stop()
304 | {
305 |     diff_time = getDiffTime();
306 |     total_time += diff_time;
307 |     running = false;
308 |     clock_sessions++;
309 | }
310 | 
311 | ////////////////////////////////////////////////////////////////////////////////
312 | //! Reset the timer to 0. Does not change the timer running state but does
313 | //! recapture this point in time as the current start time if it is running.
314 | ////////////////////////////////////////////////////////////////////////////////
315 | inline void
316 | StopWatchLinux::reset()
317 | {
318 |     diff_time = 0;
319 |     total_time = 0;
320 |     clock_sessions = 0;
321 | 
322 |     if (running)
323 |     {
324 |         gettimeofday(&start_time, 0);
325 |     }
326 | }
327 | 
328 | ////////////////////////////////////////////////////////////////////////////////
329 | //! Time in msec. after start. If the stop watch is still running (i.e. there
330 | //! was no call to stop()) then the elapsed time is returned added to the
331 | //! current diff_time sum, otherwise the current summed time difference alone
332 | //! is returned.
333 | ////////////////////////////////////////////////////////////////////////////////
334 | inline float
335 | StopWatchLinux::getTime()
336 | {
337 |     // Return the TOTAL time to date
338 |     float retval = total_time;
339 | 
340 |     if (running)
341 |     {
342 |         retval += getDiffTime();
343 |     }
344 | 
345 |     return retval;
346 | }
347 | 
348 | ////////////////////////////////////////////////////////////////////////////////
349 | //! Time in msec. for a single run based on the total number of COMPLETED runs
350 | //! and the total time.
351 | ////////////////////////////////////////////////////////////////////////////////
352 | inline float
353 | StopWatchLinux::getAverageTime()
354 | {
355 |     return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
356 | }
357 | ////////////////////////////////////////////////////////////////////////////////
358 | 
359 | ////////////////////////////////////////////////////////////////////////////////
360 | inline float
361 | StopWatchLinux::getDiffTime()
362 | {
363 |     struct timeval t_time;
364 |     gettimeofday(&t_time, 0);
365 | 
366 |     // time difference in milli-seconds
367 |     return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
368 |                    + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
369 | }
370 | #endif // WIN32
371 | 
372 | ////////////////////////////////////////////////////////////////////////////////
373 | //! Timer functionality exported
374 | 
375 | ////////////////////////////////////////////////////////////////////////////////
376 | //! Create a new timer
377 | //! @return true if a time has been created, otherwise false
378 | //! @param  name of the new timer, 0 if the creation failed
379 | ////////////////////////////////////////////////////////////////////////////////
380 | inline bool
381 | sdkCreateTimer(StopWatchInterface **timer_interface)
382 | {
383 |     //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
384 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
385 |     *timer_interface = (StopWatchInterface *)new StopWatchWin();
386 | #else
387 |     *timer_interface = (StopWatchInterface *)new StopWatchLinux();
388 | #endif
389 |     return (*timer_interface != NULL) ? true : false;
390 | }
391 | 
392 | 
393 | ////////////////////////////////////////////////////////////////////////////////
394 | //! Delete a timer
395 | //! @return true if a time has been deleted, otherwise false
396 | //! @param  name of the timer to delete
397 | ////////////////////////////////////////////////////////////////////////////////
398 | inline bool
399 | sdkDeleteTimer(StopWatchInterface **timer_interface)
400 | {
401 |     //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
402 |     if (*timer_interface)
403 |     {
404 |         delete *timer_interface;
405 |         *timer_interface = NULL;
406 |     }
407 | 
408 |     return true;
409 | }
410 | 
411 | ////////////////////////////////////////////////////////////////////////////////
412 | //! Start the time with name \a name
413 | //! @param name  name of the timer to start
414 | ////////////////////////////////////////////////////////////////////////////////
415 | inline bool
416 | sdkStartTimer(StopWatchInterface **timer_interface)
417 | {
418 |     //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
419 |     if (*timer_interface)
420 |     {
421 |         (*timer_interface)->start();
422 |     }
423 | 
424 |     return true;
425 | }
426 | 
427 | ////////////////////////////////////////////////////////////////////////////////
428 | //! Stop the time with name \a name. Does not reset.
429 | //! @param name  name of the timer to stop
430 | ////////////////////////////////////////////////////////////////////////////////
431 | inline bool
432 | sdkStopTimer(StopWatchInterface **timer_interface)
433 | {
434 |     // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
435 |     if (*timer_interface)
436 |     {
437 |         (*timer_interface)->stop();
438 |     }
439 | 
440 |     return true;
441 | }
442 | 
443 | ////////////////////////////////////////////////////////////////////////////////
444 | //! Resets the timer's counter.
445 | //! @param name  name of the timer to reset.
446 | ////////////////////////////////////////////////////////////////////////////////
447 | inline bool
448 | sdkResetTimer(StopWatchInterface **timer_interface)
449 | {
450 |     // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
451 |     if (*timer_interface)
452 |     {
453 |         (*timer_interface)->reset();
454 |     }
455 | 
456 |     return true;
457 | }
458 | 
459 | ////////////////////////////////////////////////////////////////////////////////
460 | //! Return the average time for timer execution as the total time
461 | //! for the timer dividied by the number of completed (stopped) runs the timer
462 | //! has made.
463 | //! Excludes the current running time if the timer is currently running.
464 | //! @param name  name of the timer to return the time of
465 | ////////////////////////////////////////////////////////////////////////////////
466 | inline float
467 | sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
468 | {
469 |     //  printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
470 |     if (*timer_interface)
471 |     {
472 |         return (*timer_interface)->getAverageTime();
473 |     }
474 |     else
475 |     {
476 |         return 0.0f;
477 |     }
478 | }
479 | 
480 | ////////////////////////////////////////////////////////////////////////////////
481 | //! Total execution time for the timer over all runs since the last reset
482 | //! or timer creation.
483 | //! @param name  name of the timer to obtain the value of.
484 | ////////////////////////////////////////////////////////////////////////////////
485 | inline float
486 | sdkGetTimerValue(StopWatchInterface **timer_interface)
487 | {
488 |     // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
489 |     if (*timer_interface)
490 |     {
491 |         return (*timer_interface)->getTime();
492 |     }
493 |     else
494 |     {
495 |         return 0.0f;
496 |     }
497 | }
498 | 
499 | #endif // HELPER_TIMER_H
500 | 


--------------------------------------------------------------------------------
/compile.m:
--------------------------------------------------------------------------------
 1 | MATLAB_ROOT = '/afs/cs/package/matlab-r2013b/matlab/r2013b';
 2 | CUDA_ROOT = '/usr/local/cuda-6.0';
 3 | 
 4 | if ismac
 5 |   MATLAB_ROOT = '/Applications/MATLAB_R2014a.app';
 6 |   CUDA_ROOT = '/usr/local/cuda';
 7 | end
 8 | 
 9 | cuda_compile('./src', 'cudaFFTData', MATLAB_ROOT, CUDA_ROOT, './bin', false)
10 | cuda_compile('./src', 'cudaConvFFTData',MATLAB_ROOT, CUDA_ROOT, './bin', false)
11 | cuda_compile('./src', 'cudaConvolutionFFT',MATLAB_ROOT, CUDA_ROOT, './bin', false)
12 | 


--------------------------------------------------------------------------------
/cuda_compile.m:
--------------------------------------------------------------------------------
 1 | function cuda_compile( src_path, func_name, matlab_root, cuda_root, out_path, debug)
 2 | %CUDA_COMPILE general cuda compiling helper for MATLAB version < 2014a
 3 | if nargin < 6
 4 |   debug = false;
 5 | end
 6 | 
 7 | if ~exist('./bin', 'dir')
 8 |   mkdir('./bin')
 9 | end
10 | 
11 | % TODO: For matlab version < 8.0.1, Use the following setting,
12 | % if ~verLessThan('matlab', '8.0.1')
13 | % http://www.mathworks.com/help/distcomp/run-mex-functions-containing-cuda-code.html
14 | %   setenv('MW_NVCC_PATH',[cudaroot '/nvcc'])
15 | %   eval(sprintf('mex -v -largeArrayDims %s.cu',func_name));
16 | % elseif isunix && ~ismac && verLessThan('matlab', '8.0.1')
17 | 
18 | 
19 | % ------------------------------------------------------------------------------
20 | %                                                Check cuda computing capability
21 | % ------------------------------------------------------------------------------
22 | % TODO, CUDA Stream if high CM
23 | gpuInfo = gpuDevice;
24 | fprintf('Your GPU Computing Capability %d\n', str2num(gpuInfo.ComputeCapability));
25 | 
26 | % Remove compiled binary files
27 | eval(['!rm bin/' func_name '.o']);
28 | 
29 | % ------------------------------------------------------------------------------
30 | %                                                    Setup environment variables
31 | % ------------------------------------------------------------------------------
32 | 
33 | % Set debugging flag
34 | if debug
35 |   nvcc_debug_flag = '-g -G -O0';
36 |   mex_debug_flag = '-g';
37 | else
38 |   nvcc_debug_flag = '-O3 -DNDEBUG';
39 |   mex_debug_flag = '';
40 | end
41 | 
42 | if ismac
43 |   matlab_bin_path = '/bin/maci64';
44 | else
45 |   matlab_bin_path = '/bin/glnxa64';
46 | end
47 | 
48 | INCLUDE_PATH = sprintf([...
49 |     '-I./common ',...
50 |     '-I%s/extern/include ',...
51 |     '-I%s/toolbox/distcomp/gpu/extern/include'],...
52 |     matlab_root, matlab_root);
53 | NVCC_OPTS = '-arch=sm_30 -ftz=true -prec-div=false -prec-sqrt=false';
54 | COMPILER_OPTS = '-Xcompiler -fPIC -v';
55 | 
56 | MEX_OPTS = '-largeArrayDims';
57 | MEX_INCLUDE_PATH = sprintf('-I%s/include', cuda_root);
58 | MEX_LIBS = '-lcudart -lcufft -lmwgpu';
59 | MEX_LIBRARY_PATH = ['-L', matlab_root, matlab_bin_path];
60 | 
61 | % ------------------------------------------------------------------------------
62 | %                                                                       Compile
63 | % ------------------------------------------------------------------------------
64 | 
65 | % Compile the object file
66 | compile_string = sprintf([...
67 |     '!%s/bin/nvcc ',...
68 |     '%s ',... % Debug flag
69 |     '%s ',... % Compiler options
70 |     '%s ',... % NVCC_OPTS
71 |     '%s ',... % Include paths
72 |     '-c %s/%s.cu --output-file %s/%s.o'], ...
73 |     cuda_root, nvcc_debug_flag, COMPILER_OPTS, NVCC_OPTS, INCLUDE_PATH, src_path, func_name, out_path, func_name);
74 | disp(compile_string);
75 | eval(compile_string);
76 | 
77 | compile_string = sprintf(['mex ',...
78 |     '%s ',... % Debug flag
79 |     '%s ',... % Mex options
80 |     '%s/%s.o  ',... % Object file
81 |     '%s ',... % Mex library path
82 |     '%s ',... % Mex libraries
83 |     '-outdir %s'],... % Out path
84 |     mex_debug_flag, MEX_OPTS, out_path, func_name, MEX_LIBRARY_PATH, MEX_LIBS, out_path);
85 | disp(compile_string);
86 | eval(compile_string);
87 | 
88 | % % Run system command
89 | % !nvcc -O3 -DNDEBUG -c cudaconv.cu -Xcompiler -fPIC -I/afs/cs/package/matlab-r2013b/matlab/r2013b/extern/include -I/afs/cs/package/matlab-r2013b/matlab/r2013b/toolbox/distcomp/gpu/extern/include
90 | % % Link object
91 | % mex cudaconv.o -L/usr/local/cuda-6.0/lib64 -L/afs/cs/package/matlab-r2013b/matlab/r2013b/bin/glnxa64 -lcudart -lcufft -lmwgpu
92 | % -gencode arch=compute_30,code=sm_30 
93 | 


--------------------------------------------------------------------------------
/demoCudaConvolutionFFT.m:
--------------------------------------------------------------------------------
  1 | % MatlabCUDAConv 
  2 | %
  3 | % To speed up convolutions, I made 
  4 | 
  5 | % ------------------------------------------------------------------------------
  6 | %                                                                       Compile
  7 | % ------------------------------------------------------------------------------
  8 | 
  9 | % Change the following lines
 10 | MATLAB_ROOT = '/afs/cs/package/matlab-r2013b/matlab/r2013b/';
 11 | CUDA_ROOT = '/usr/local/cuda-6.0/';
 12 | 
 13 | if ismac
 14 |   MATLAB_ROOT = '/Applications/MATLAB_R2014a.app/';
 15 |   CUDA_ROOT = '/usr/local/cuda/';
 16 | end
 17 | 
 18 | % Debugging compile
 19 | compile
 20 | addpath('./bin')
 21 | 
 22 | % ------------------------------------------------------------------------------
 23 | %                                                                  Clear the GPU
 24 | % ------------------------------------------------------------------------------
 25 | 
 26 | clear;
 27 | device_id = 1; % 1-base GPU index (MATLAB convention)
 28 | g = gpuDevice(device_id);
 29 | reset(g);
 30 | cos(gpuArray(1)); % force matlab gpu dynamic library loading
 31 | 
 32 | 
 33 | % ------------------------------------------------------------------------------
 34 | %                                                              Experiment setup
 35 | % ------------------------------------------------------------------------------
 36 | 
 37 | n = 64;  % data height
 38 | m = 8; % data width
 39 | k = 5;   % number of channels
 40 | 
 41 | cn = 10; % kernel height
 42 | cm = 4;  % kernel width
 43 | 
 44 | % Make random data
 45 | data = single(rand(n,m));
 46 | for i = 2:k
 47 |   data(:,:,i) = single(rand(n,m));
 48 | end
 49 | 
 50 | % Make random kernel
 51 | kernel = zeros(cn,cm,k,'single');
 52 | kernel(:,:,1) = single(reshape(1:cn*cm,cn,cm));
 53 | for i = 2:k
 54 |   kernel(:,:,i) = single(rand(cn,cm));
 55 | end
 56 | 
 57 | % To verify experiment, put kernel values to specific regions
 58 | data(5:(4+cn),2:(1+cm),1) = kernel(:,:,1);
 59 | data(21:(20+cn),1:cm,2) = kernel(:,:,1);
 60 | data(1:cn,(m-(cm-1)):m,k) = kernel(:,:,1);
 61 | kernel(:,:,k) = kernel(:,:,1);
 62 | 
 63 | % ------------------------------------------------------------------------------
 64 | %                                                         Flip Kernel (Required)
 65 | % ------------------------------------------------------------------------------
 66 | 
 67 | for i = 1:k
 68 |   kernel(:,:,i) = kernel(end:-1:1,end:-1:1,i);
 69 | end
 70 | 
 71 | 
 72 | % ------------------------------------------------------------------------------
 73 | %                                    Matlab convolution (Conv2 and FFT versions)
 74 | % ------------------------------------------------------------------------------
 75 | 
 76 | % Compute convolution using FFT
 77 | % The size of ffted data should be larger than (n + cn - 1)x(m + cm - 1)
 78 | fft_h = 80;
 79 | fft_w = 16;
 80 | matFFTedData = zeros(fft_h,fft_w,k);
 81 | for i = 1:k
 82 |   matFFTedData(:,:,i) = fft2(data(:,:,i),fft_h,fft_w);
 83 | end
 84 | 
 85 | matFFTedKernel = zeros(fft_h, fft_w, k);
 86 | for i = 1:k
 87 |   matFFTedKernel(:,:,i) = fft2(kernel(:,:,i),fft_h,fft_w);
 88 | end
 89 | 
 90 | % Compute using the naive convolution
 91 | matConv = conv2(data(:,:,1),kernel(:,:,1));
 92 | for i = 2:k
 93 |   matConv(:,:,i) = conv2(data(:,:,i),kernel(:,:,i));
 94 | end
 95 | 
 96 | cvmatlab = sum(matConv,3);
 97 | 
 98 | ematlab = matFFTedKernel .* (matFFTedData);
 99 | matFFTConv = ifft2(ematlab(:,:,1));
100 | for i=1:k
101 |     matFFTConv(:,:,i) = ifft2(ematlab(:,:,i));
102 | end
103 | 
104 | 
105 | % ------------------------------------------------------------------------------
106 | %                                       Convolution using GPU cudaConvolutionFFT
107 | % ------------------------------------------------------------------------------
108 | 
109 | % You can feed multiple kernels in a cell format
110 | kernel2 = kernel;
111 | kernel2(1) = 100;
112 | 
113 | kernelCell = {kernel, kernel2, kernel};
114 | 
115 | thread_per_block_width = 8;
116 | thread_per_block_height = 8;
117 | thread_per_block_depth = 8;
118 | thread_per_block_2d_width = 16;
119 | threads_per_block_in =[thread_per_block_width, ...
120 |                     thread_per_block_height, ...
121 |                     thread_per_block_depth, ...
122 |                     thread_per_block_2d_width];
123 | 
124 | [cvcell] = cudaConvolutionFFT(data, ... % Data 
125 |                             cn,...      % Maximum kernel height
126 |                             cm,...      % Maximum kernel width
127 |                             kernelCell,...  % Multiple kernels in a cell
128 |                             threads_per_block_in,... % threads per block
129 |                             device_id-1); % 0-based indexing for GPU Device ID
130 | cvg = cvcell{1}; % Get the result for the first kernel
131 | cvg2 = cvcell{2}; % Get the result for the second kernel (kernel2)
132 | 
133 | % ------------------------------------------------------------------------------
134 | %                                                   Comparison and visualization
135 | % ------------------------------------------------------------------------------
136 | 
137 | % Visualize convolution result
138 | figure(1);  subplot(131); imagesc(sum(matConv,3)); 
139 |             subplot(132); imagesc(real(sum(matFFTConv,3)));  
140 |             subplot(133); imagesc(real(cvg));
141 | 
142 | % Transformed data
143 | figure(2);  imagesc(real(ematlab(:,:,1)));
144 | 
145 | % Compare matlab convolution with cuda FFT convolution
146 | figure(3);  subplot(131); imagesc(cvg); % Convolution output ( using FFT, 
147 |                                         % data is padded with the size of the 
148 |                                         % kernel -1 )
149 |             subplot(132); imagesc(cvg(1:n + cn - 1,1:m + cm - 1)); % Extract 
150 |                                         % exact convolution part that is the 
151 |                                         % same as matlab convolution
152 |             subplot(133); imagesc(cvmatlab); % Visualize matlab convolution output
153 | 
154 | % Compute residual
155 | figure(4);  imagesc(cvg(1:n + cn - 1,1:m + cm - 1) - cvmatlab); colorbar;
156 | 


--------------------------------------------------------------------------------
/src/convolutionFFTkernel.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 | * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
  3 | 	*
  4 | 	* NOTICE TO USER:   
  5 | 	*
  6 | 	* This source code is subject to NVIDIA ownership rights under U.S. and 
  7 | 	* international Copyright laws.  Users and possessors of this source code 
  8 | 	* are hereby granted a nonexclusive, royalty-free license to use this code 
  9 | 	* in individual and commercial software.
 10 | 	*
 11 | 	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
 12 | 	* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
 13 | 	* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
 14 | 	* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
 15 | 	* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 16 | 	* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
 17 | 	* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
 18 | 	* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
 19 | 	* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE 
 20 | 	* OR PERFORMANCE OF THIS SOURCE CODE.  
 21 | 	*
 22 | 	* U.S. Government End Users.   This source code is a "commercial item" as 
 23 | 	* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
 24 | 	* "commercial computer  software"  and "commercial computer software 
 25 | 	* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) 
 26 | 	* and is provided to the U.S. Government only as a commercial end item.  
 27 | 	* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
 28 | 	* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
 29 | 	* source code with only those rights set forth herein. 
 30 | 	*
 31 | 	* Any use of this source code in individual and commercial software must 
 32 | 	* include, in the user documentation and internal comments to the code,
 33 | 	* the above Disclaimer and U.S. Government End Users Notice.
 34 | */
 35 | 
 36 | 
 37 | 
 38 | #define IMUL(a, b) __mul24(a, b)
 39 | 
 40 | 
 41 | ////////////////////////////////////////////////////////////////////////////////
 42 | // Copy input data array to the upper left corner and pad by border values
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | texture<float, 3, cudaReadModeElementType> texData;
 45 | 
 46 | __global__ void padData(
 47 | 	float *d_PaddedData,
 48 | 	int fftW,
 49 | 	int fftH,
 50 | 	int dataW,
 51 | 	int dataH,
 52 | 	int featureDim,
 53 | 	int kernelW,
 54 | 	int kernelH,
 55 | 	int kernelX,
 56 | 	int kernelY
 57 | ){
 58 | 	const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 59 | 	const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
 60 | 	const int borderW = dataW + kernelX;
 61 | 	const int borderH = dataH + kernelY;
 62 | 	int dx;
 63 | 	int dy;
 64 | 
 65 | 	if(x < fftW && y < fftH){
 66 | 		if(x < dataW) dx = x;
 67 | 		if(y < dataH) dy = y;
 68 | 		if(x >= dataW && x < borderW) dx = dataW - 1;
 69 | 		if(y >= dataH && y < borderH) dy = dataH - 1;
 70 | 		if(x >= borderW) dx = 0;
 71 | 		if(y >= borderH) dy = 0;
 72 | 
 73 | 		d_PaddedData[IMUL(y, fftW) + x] =
 74 | 			tex2D(texData, (float)dx + 0.5f, (float)dy + 0.5f);
 75 | 	}
 76 | }
 77 | 
 78 | 
 79 | 
 80 | ////////////////////////////////////////////////////////////////////////////////
 81 | // Modulate Fourier image of padded data by Fourier image of padded kernel
 82 | // and normalize by FFT size
 83 | ////////////////////////////////////////////////////////////////////////////////
 84 | __device__ void complexMulAndScale(Complex& a, Complex b, float c){
 85 | 	Complex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
 86 | 	a = t;
 87 | }
 88 | 
 89 | __global__ void modulateAndNormalize(
 90 | 	Complex *fft_PaddedData,
 91 | 	Complex *fft_PaddedKernel,
 92 | 	int dataN
 93 | ){
 94 | 	const int     tid = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 95 | 	const int threadN = IMUL(blockDim.x, gridDim.x);
 96 | 	const float     q = 1.0f / (float)dataN;
 97 | 
 98 | 	for(int i = tid; i < dataN; i += threadN)
 99 | 		complexMulAndScale(fft_PaddedData[i], fft_PaddedKernel[i], q);
100 | }
101 | 


--------------------------------------------------------------------------------
/src/cudaConvFFTData.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include <cufft.h>
  3 | #include "mex.h"
  4 | #include "gpu/mxGPUArray.h"
  5 | // #include "common/helper_cuda.h"
  6 | #include "cudaConvFFTData.h"
  7 | #include "cudaConvFFTData.cuh"
  8 | 
  9 | static bool debug = false;
 10 | 
 11 | enum OUT_INDEX{
 12 |     CONVOLUTION_CELL_INDEX
 13 | };
 14 | 
 15 | enum IN_INDEX{
 16 |     FFT_DATA_INDEX,
 17 |     KERNLE_CELL_INDEX,
 18 |     THREAD_SIZE_INDEX // Optional
 19 | };
 20 | 
 21 | ////////////////////////////////////////////////////////////////////////////////
 22 | // Mex Entry
 23 | ////////////////////////////////////////////////////////////////////////////////
 24 | void mexFunction(int nlhs, mxArray *plhs[],
 25 |                  int nrhs, mxArray const *prhs[])
 26 | {
 27 |     /* Declare all variables.*/
 28 |     const mxGPUArray *mxFFTData;
 29 |     const mxGPUArray *mxKernel;
 30 |     mxGPUArray *mxFFTKernel;
 31 |     mxGPUArray *mxConvolution;
 32 |     mxArray *convolutionResult;
 33 |     
 34 |     /* cufftComplex is float2 */
 35 |     const cufftComplex *d_CFFT_DATA;
 36 |     cufftComplex *d_CFFT_KERNEL;
 37 |     cufftComplex *d_FFTEProd;
 38 | 
 39 |     float *d_CONVOLUTION;
 40 |     float *d_IFFTEProd;
 41 | 
 42 |     float *h_Kernel;
 43 |     float *h_CONVOLUTION;
 44 |     float *d_Kernel;
 45 |     float *d_PaddedKernel;
 46 | 
 47 |     char const * const errId = "cudaConvFFTData:InvalidInput";
 48 | 
 49 |     /* Choose a reasonably sized number of threads for the block. */
 50 |     int THREAD_PER_BLOCK_H = 16;
 51 |     int THREAD_PER_BLOCK_W = 8;
 52 |     int THREAD_PER_BLOCK_D = 8;
 53 |     int THREAD_PER_BLOCK_2D = 32;
 54 | 
 55 |     const mwSize * mxKernel_Dim;
 56 |     const mwSize * mxFFT_Dim;
 57 |     // int MblocksPerGrid, NblocksPerGrid;
 58 |     int KERNEL_H, KERNEL_W, N_KERNEL,
 59 |         CFFT_H, CFFT_W, FFT_H, FFT_W, FEATURE_DIM,
 60 |         KERNEL_SIZE, CFFT_SIZE, FFT_SIZE, CONV_SIZE;
 61 | 
 62 |     /* Initialize the MathWorks GPU API. */
 63 |     // If initialized mxInitGPU do nothing
 64 |     if (mxInitGPU() != MX_GPU_SUCCESS)
 65 |         mexErrMsgTxt("mxInitGPU fail");
 66 |     
 67 |     /* Throw an error if the input is not a GPU array. */
 68 |     if ( (nrhs <  (KERNLE_CELL_INDEX + 1)) || (nrhs > (THREAD_SIZE_INDEX + 1) ) || !mxIsGPUArray(prhs[FFT_DATA_INDEX]) )
 69 |         mexErrMsgIdAndTxt(errId, "The data must be FFT-ed real array in GPU");
 70 | 
 71 |     if (( nrhs > THREAD_SIZE_INDEX)  && mxGetNumberOfElements(prhs[THREAD_SIZE_INDEX]) != 4)
 72 |         mexErrMsgIdAndTxt(errId, "CUDA Thread Size must be 4 integers : THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nYou must choose size such that total thread will not be larger than MaxThreadsPerBlock");
 73 | 
 74 |     if ( nrhs > THREAD_SIZE_INDEX ){
 75 |         const double* threadSize = (double *)mxGetData(prhs[THREAD_SIZE_INDEX]);
 76 |         THREAD_PER_BLOCK_H = (int)threadSize[0];
 77 |         THREAD_PER_BLOCK_W = (int)threadSize[1];
 78 |         THREAD_PER_BLOCK_D = (int)threadSize[2];
 79 |         THREAD_PER_BLOCK_2D = (int)threadSize[3];
 80 |         if(debug) fprintf(stderr,"Thread size: H=%d, W=%d, D=%d, 2D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D);
 81 |     }
 82 | 
 83 |     // cudaDeviceProp dev;
 84 |     // cudaGetDeviceProperties(&dev,0);
 85 |     // int success = checkDeviceProp(dev);
 86 | 
 87 |     mxFFTData = mxGPUCreateFromMxArray(prhs[FFT_DATA_INDEX]);
 88 |     mxFFT_Dim = mxGPUGetDimensions(mxFFTData);
 89 | 
 90 |     // FFT Dim
 91 |     // In CUDA, R2C fft will create only N/2 + 1 points. This is due to the Hermitian symmetry of the points.
 92 |     CFFT_H = mxFFT_Dim[0];
 93 |     CFFT_W = mxFFT_Dim[1];
 94 | 
 95 |     FFT_H = (mxFFT_Dim[0] - 1) * 2;
 96 |     FFT_W = mxFFT_Dim[1];
 97 | 
 98 |     FEATURE_DIM = mxFFT_Dim[2];
 99 | 
100 |     CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2);
101 |     FFT_SIZE  = FFT_W  * FFT_H  * FEATURE_DIM * sizeof(float);
102 |     CONV_SIZE = FFT_W  * FFT_H  * sizeof(float);
103 |     
104 |     if(debug) fprintf(stderr,"FFT Data size: h=%d, w=%d, f=%d\n", FFT_H, FFT_W, FEATURE_DIM);
105 | 
106 |     if (mxGetClassID(prhs[KERNLE_CELL_INDEX]) != mxCELL_CLASS)
107 |         mexErrMsgIdAndTxt(errId, "Kernel must be a cell array");
108 | 
109 |     mwSize nKernel = mxGetNumberOfElements(prhs[KERNLE_CELL_INDEX]);
110 |     N_KERNEL = (int)nKernel;
111 |     plhs[CONVOLUTION_CELL_INDEX] = mxCreateCellMatrix(1, N_KERNEL);
112 | 
113 |     if(debug) fprintf(stderr,"N Kernel: %d\n", N_KERNEL);
114 | 
115 | 
116 |     /* Set block size and thread size */
117 |     dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D);
118 |     dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x), 
119 |                         iDivUp(FFT_H, threadBlock3D.y), 
120 |                         iDivUp(FEATURE_DIM, threadBlock3D.z));
121 | 
122 |     dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D);
123 |     dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x), 
124 |                         iDivUp(FFT_H, threadBlock2D.y));
125 | 
126 | 
127 |     /*  Pad Kernel */
128 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedKernel,    FFT_SIZE));
129 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_IFFTEProd,       FFT_SIZE));
130 | 
131 |     /* Create a GPUArray to hold the result and get its underlying pointer. */
132 |     // mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
133 |     // FFT_dims[0] = FFT_H;
134 |     // FFT_dims[1] = FFT_W;
135 |     // FFT_dims[2] = FEATURE_DIM;
136 | 
137 |     d_CFFT_DATA = (cufftComplex *)mxGPUGetDataReadOnly(mxFFTData);
138 | 
139 |     // mxConvolution = mxGPUCreateGPUArray(2,
140 |     //                         FFT_dims, // Third element will not be accessed
141 |     //                         mxSINGLE_CLASS,
142 |     //                         mxREAL,
143 |     //                         MX_GPU_DO_NOT_INITIALIZE);
144 | 
145 |     // d_CONVOLUTION = (cufftReal *)(mxGPUGetData(mxConvolution));
146 | 
147 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE));
148 | 
149 |     // mxFFTKernel = mxGPUCreateGPUArray(3,
150 |     //                         mxFFT_Dim,
151 |     //                         mxSINGLE_CLASS,
152 |     //                         mxCOMPLEX,
153 |     //                         MX_GPU_DO_NOT_INITIALIZE);
154 | 
155 |     // d_CFFT_KERNEL = (cufftComplex *)(mxGPUGetData(mxFFTKernel));
156 | 
157 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE));
158 | 
159 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE));
160 | 
161 |     /* FFT Kernel */
162 |     int BATCH = FEATURE_DIM;
163 |     int FFT_Dims[] = { FFT_W, FFT_H };
164 |     int CFFT_Dims[] = { CFFT_W, CFFT_H };
165 | 
166 |     int idist = FFT_W * FFT_H;
167 |     int odist = CFFT_W * CFFT_H;
168 | 
169 |     cufftHandle FFTplan_R2C, FFTplan_C2R;
170 |     CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C, 
171 |         2, // rank
172 |         FFT_Dims, 
173 |         FFT_Dims, 1, idist, // *inembed, istride, idist
174 |         CFFT_Dims, 1, odist, // *onembed, ostride, odist
175 |         CUFFT_R2C, 
176 |         BATCH)); // batch
177 | 
178 |     CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_C2R, 
179 |         2, // rank
180 |         FFT_Dims,
181 |         CFFT_Dims, 1, odist, // *inembed, istride, idist
182 |         FFT_Dims, 1, idist, // *onembed, ostride, odist
183 |         CUFFT_C2R, 
184 |         BATCH)); // batch
185 |     
186 |     mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
187 |     FFT_dims[0] = FFT_H;
188 |     FFT_dims[1] = FFT_W;
189 | 
190 |     /* For each kernel iterate */
191 |     for (int kernelIdx = 0; kernelIdx < N_KERNEL; kernelIdx++){
192 |         
193 |         // Get Kernel Data
194 |         const mxArray *mxCurrentCell = mxGetCell(prhs[KERNLE_CELL_INDEX], kernelIdx);
195 |         if (!mxIsGPUArray(mxCurrentCell)){
196 |             
197 |             if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 )
198 |                 mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
199 | 
200 |             h_Kernel = (float *)mxGetData(mxCurrentCell);
201 |             mxKernel_Dim = mxGetDimensions(mxCurrentCell);
202 | 
203 |             // Kernel dimensions
204 |             KERNEL_H = mxKernel_Dim[0];
205 |             KERNEL_W = mxKernel_Dim[1];
206 |             KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
207 | 
208 |             CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Kernel, KERNEL_SIZE));
209 |             CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
210 |             mxKernel = NULL;
211 |         }else{ // Kernel is GPU Array
212 |             mxKernel = mxGPUCreateFromMxArray(mxCurrentCell);
213 | 
214 |             if ( mxGPUGetClassID(mxKernel) != mxSINGLE_CLASS || mxGPUGetNumberOfDimensions(mxKernel) != 3 )
215 |                 mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
216 | 
217 |             mxKernel_Dim = mxGPUGetDimensions(mxKernel);
218 | 
219 |             // Kernel dimensions
220 |             KERNEL_H = mxKernel_Dim[0];
221 |             KERNEL_W = mxKernel_Dim[1];
222 |             KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
223 | 
224 |             d_Kernel = (float *)mxGPUGetDataReadOnly(mxKernel);
225 |         }
226 | 
227 |         if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W);
228 | 
229 |         if (FEATURE_DIM != mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ){
230 |             mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size");
231 |         }
232 | 
233 |         padData<<<dataBlockGrid3D, threadBlock3D>>>(
234 |                 d_PaddedKernel,
235 |                 d_Kernel,
236 |                 FFT_W,
237 |                 FFT_H,
238 |                 KERNEL_W,
239 |                 KERNEL_H,
240 |                 FEATURE_DIM
241 |             );
242 | 
243 | 
244 |         CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedKernel, d_CFFT_KERNEL));
245 |         CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
246 | 
247 |         if(debug) fprintf(stderr,"FFT done\n");
248 | 
249 |         
250 |         /* Hadamard product, Element-wise multiplication in frequency domain */
251 |         /* If execute the following, second compile of this file create MATLAB error */
252 |         elementwiseProductAndNormalize<<<dataBlockGrid3D, threadBlock3D>>>(
253 |                 d_FFTEProd, // out
254 |                 d_CFFT_DATA, // in data
255 |                 d_CFFT_KERNEL,   // in kernel
256 |                 CFFT_H,
257 |                 CFFT_W,
258 |                 FEATURE_DIM,
259 |                 1.0f / (FFT_W * FFT_H)
260 |             );
261 | 
262 |         CUFFT_SAFE_CALL(cufftExecC2R(FFTplan_C2R, d_FFTEProd, d_IFFTEProd));
263 |         CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
264 | 
265 |         sumAlongFeatures<<<dataBlockGrid2D, threadBlock2D>>>(
266 |                 d_CONVOLUTION,
267 |                 d_IFFTEProd,
268 |                 FFT_H,
269 |                 FFT_W,
270 |                 FEATURE_DIM
271 |             );
272 | 
273 | 
274 | 
275 |         convolutionResult = mxCreateNumericArray(2, FFT_dims, mxSINGLE_CLASS, mxREAL);
276 |         h_CONVOLUTION = (float *)mxGetData(convolutionResult);
277 |         CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(h_CONVOLUTION, d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost));
278 | 
279 |         mxSetCell(plhs[CONVOLUTION_CELL_INDEX], kernelIdx, convolutionResult);
280 | 
281 |         if(mxKernel == NULL) cudaFree(d_Kernel);
282 |     }
283 |     // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel);
284 | 
285 |     /*
286 |      * The mxGPUArray pointers are host-side structures that refer to device
287 |      * data. These must be destroyed before leaving the MEX function.
288 |      */
289 |     mxGPUDestroyGPUArray(mxFFTData);
290 |     // mxGPUDestroyGPUArray(mxConvolution);    
291 |     // mxGPUDestroyGPUArray(mxFFTKernel);
292 |     
293 |     cufftDestroy(FFTplan_R2C);
294 |     cufftDestroy(FFTplan_C2R);
295 | 
296 |     if(mxKernel != NULL) mxGPUDestroyGPUArray(mxKernel);
297 | 
298 |     cudaFree(d_PaddedKernel);
299 |     cudaFree(d_IFFTEProd);
300 |     cudaFree(d_CONVOLUTION);
301 |     cudaFree(d_CFFT_KERNEL);
302 |     cudaFree(d_FFTEProd);
303 |     
304 | 
305 |     mxFree(FFT_dims);
306 | }
307 | 


--------------------------------------------------------------------------------
/src/cudaConvFFTData.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_CONV_FFT_DATA_CUH
 2 | #define CUDA_CONV_FFT_DATA_CUH
 3 | 
 4 | /*
 5 |  * Device Code
 6 |  */
 7 | 
 8 | ////////////////////////////////////////////////////////////////////////////////
 9 | // Pad data with zeros, 
10 | ////////////////////////////////////////////////////////////////////////////////
11 | __global__ void padData(
12 |     float *d_PaddedData,
13 |     const float *d_Data,
14 |     int fftW,
15 |     int fftH,
16 |     int dataW,
17 |     int dataH,
18 |     int FEATURE_DIM
19 | ){
20 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
21 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
22 |     const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
23 | 
24 |     if(x < fftW && y < fftH && z < FEATURE_DIM){
25 |         if(x < dataW && y < dataH)
26 |             d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 
27 |                     d_Data[ IMUL(z, IMUL(dataH, dataW)) + IMUL(x, dataH ) + y];
28 |         else
29 |             d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 0;
30 |     }
31 | }
32 | 
33 | ////////////////////////////////////////////////////////////////////////////////
34 | // Modulate Fourier image of padded data by Fourier image of padded kernel
35 | // and normalize by FFT size
36 | ////////////////////////////////////////////////////////////////////////////////
37 | __device__ void complexMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
38 |     const cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
39 |     out = t;
40 | }
41 | 
42 | __device__ void complexConjMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
43 |     const cufftComplex t = {c * (a.x * b.x + a.y * b.y), c * (a.y * b.x - a.x * b.y)};
44 |     out = t;
45 | }
46 | 
47 | __global__ void elementwiseProductAndNormalize(
48 |     cufftComplex *fft_Output,
49 |     const cufftComplex *fft_PaddedData,
50 |     const cufftComplex *fft_PaddedKernel,
51 |     int FFT_H,
52 |     int FFT_W,
53 |     int FEATURE_DIM,
54 |     float scale
55 | ){
56 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
57 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
58 |     const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
59 |     
60 |     if(x < FFT_W && y < FFT_H && z < FEATURE_DIM){
61 |         // int i = IMUL(z, IMUL(FFT_W, FFT_H)) + IMUL(FFT_H, x) + y;
62 |         int i = z * FFT_W * FFT_H + FFT_H * x + y;
63 |         // complexConjMulAndScale(fft_Output[i], fft_PaddedData[i], fft_PaddedKernel[i], scale);
64 |         fft_Output[i].x = scale * (fft_PaddedData[i].x * fft_PaddedKernel[i].x - fft_PaddedData[i].y * fft_PaddedKernel[i].y);
65 |         fft_Output[i].y = scale * (fft_PaddedData[i].y * fft_PaddedKernel[i].x + fft_PaddedData[i].x * fft_PaddedKernel[i].y);
66 |     }
67 | }
68 | 
69 | /* Support in-place computation, i.e. input and output can be the same */
70 | __global__ void sumAlongFeatures(
71 |     float *convolutionResult,
72 |     const float *convolutionPerFeature,
73 |     int FFT_H,
74 |     int FFT_W,
75 |     int FEATURE_DIM
76 | ){
77 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
78 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
79 | 
80 |     if(x < FFT_W && y < FFT_H){
81 |         const int result_i = IMUL(FFT_H, x) + y;
82 |         const int N = IMUL(FFT_W, FFT_H);
83 | 
84 |         float acc = convolutionPerFeature[result_i];
85 |         int zN = N;
86 |         for (int z = 1; z < FEATURE_DIM; z++){
87 |             acc += convolutionPerFeature[zN + result_i];
88 |             zN += N;
89 |         }
90 |         convolutionResult[result_i] = acc;
91 |     }
92 | }
93 |     
94 | 
95 | #endif


--------------------------------------------------------------------------------
/src/cudaConvFFTData.h:
--------------------------------------------------------------------------------
  1 | #ifndef CUDA_CONV_FFT_DATA
  2 | #define CUDA_CONV_FFT_DATA
  3 | 
  4 | #  define IMUL(a, b) __mul24(a, b)
  5 | 
  6 | #  define CUDA_SAFE_CALL_NO_SYNC( call) do {                                 \
  7 |     cudaError err = call;                                                    \
  8 |     if( cudaSuccess != err) {                                                \
  9 |         printf("Cuda error in file '%s' in line %i Error : %d.\n",            \
 10 |                 __FILE__, __LINE__, err);                                         \
 11 |         exit(EXIT_FAILURE);                                                  \
 12 |     } } while (0)
 13 | 
 14 | #  define CUDA_SAFE_CALL( call) do {                                         \
 15 |     CUDA_SAFE_CALL_NO_SYNC(call);                                            \
 16 |     cudaError err = cudaThreadSynchronize();                                 \
 17 |     if( cudaSuccess != err) {                                                \
 18 |         printf("Cuda error in file '%s' in line %i Error : %d.\n",            \
 19 |                 __FILE__, __LINE__,err);                                        \
 20 |         exit(EXIT_FAILURE);                                                  \
 21 |     } } while (0)
 22 | 
 23 | #  define CUFFT_SAFE_CALL( call) do {                                        \
 24 |     cufftResult err = call;                                                  \
 25 |     if( CUFFT_SUCCESS != err) {                                              \
 26 |         printf("CUFFT error in file '%s' in line %i Error : %d.\n",            \
 27 |                 __FILE__, __LINE__,err);                                         \
 28 |         exit(EXIT_FAILURE);                                                  \
 29 |     } } while (0)
 30 | 
 31 | 
 32 | ////////////////////////////////////////////////////////////////////////////////
 33 | // Helper functions
 34 | ////////////////////////////////////////////////////////////////////////////////
 35 | //Round a / b to nearest higher integer value
 36 | int iDivUp(int a, int b){
 37 |     return (a % b != 0) ? (a / b + 1) : (a / b);
 38 | }
 39 | 
 40 | //Align a to nearest higher multiple of b
 41 | int iAlignUp(int a, int b){
 42 |     return (a % b != 0) ?  (a - a % b + b) : a;
 43 | }
 44 | 
 45 | 
 46 | 
 47 | int checkDeviceProp ( cudaDeviceProp p ) {
 48 |     int support = p.canMapHostMemory;
 49 | 
 50 |     if(support == 0) printf( "%s does not support mapping host memory.\n", p.name);
 51 |     else             printf( "%s supports mapping host memory.\n",p.name);
 52 | 
 53 |     support = p.concurrentKernels;
 54 |     if(support == 0) printf("%s does not support concurrent kernels\n", p.name);
 55 |     else printf("%s supports concurrent kernels\n",p.name);
 56 | 
 57 |     support = p.kernelExecTimeoutEnabled;
 58 |     if(support == 0) printf("%s kernelExecTimeout disabled\n", p.name);
 59 |     else printf("%s kernelExecTimeout enabled\n",p.name);
 60 | 
 61 |     printf("compute capability : %d.%d \n", p.major,p.minor);
 62 |     printf("number of multiprocessors : %d \n", p.multiProcessorCount);
 63 | 
 64 |     return support;
 65 | }
 66 | 
 67 | int computeFFTsize(int dataSize){
 68 |     //Highest non-zero bit position of dataSize
 69 |     int hiBit;
 70 |     //Neares lower and higher powers of two numbers for dataSize
 71 |     unsigned int lowPOT, hiPOT;
 72 | 
 73 |     //Align data size to a multiple of half-warp
 74 |     //in order to have each line starting at properly aligned addresses
 75 |     //for coalesced global memory writes in padKernel() and padData()
 76 |     dataSize = iAlignUp(dataSize, 16);
 77 | 
 78 |     //Find highest non-zero bit
 79 |     for(hiBit = 31; hiBit >= 0; hiBit--)
 80 |         if(dataSize & (1U << hiBit)) break;
 81 | 
 82 |     //No need to align, if already power of two
 83 |     lowPOT = 1U << hiBit;
 84 |     if(lowPOT == dataSize) return dataSize;
 85 | 
 86 |     //Align to a nearest higher power of two, if the size is small enough,
 87 |     //else align only to a nearest higher multiple of 512,
 88 |     //in order to save computation and memory bandwidth
 89 |     hiPOT = 1U << (hiBit + 1);
 90 |     //if(hiPOT <= 1024)
 91 |         return hiPOT;
 92 |     //else 
 93 |     //  return iAlignUp(dataSize, 512);
 94 | }
 95 | 
 96 | int computeFFTsize16(int dataSize){
 97 |     // Compute the multiple of 16
 98 |     int mod = dataSize / 16;
 99 |     int rem = dataSize % 16;
100 | 
101 |     return (mod * 16) + ((rem > 0)?16:0);
102 | }
103 | 
104 | #endif


--------------------------------------------------------------------------------
/src/cudaConvFFTDataStreams.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include <cufft.h>
  3 | #include "mex.h"
  4 | #include "gpu/mxGPUArray.h"
  5 | // #include "common/helper_cuda.h"
  6 | #include "cudaConvFFTDataStream.h"
  7 | 
  8 | 
  9 | const int N_MAX_PARALLEL = 32;
 10 | static bool debug = true;
 11 | 
 12 | /*
 13 |  * Device Code
 14 |  */
 15 | 
 16 | ////////////////////////////////////////////////////////////////////////////////
 17 | // Pad data with zeros, 
 18 | ////////////////////////////////////////////////////////////////////////////////
 19 | __global__ void padData(
 20 |     float *d_PaddedData,
 21 |     const float *d_Data,
 22 |     int fftW,
 23 |     int fftH,
 24 |     int dataW,
 25 |     int dataH,
 26 |     int FEATURE_DIM
 27 | ){
 28 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 29 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
 30 |     const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
 31 | 
 32 |     if(x < fftW && y < fftH && z < FEATURE_DIM){
 33 |         if(x < dataW && y < dataH)
 34 |             d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 
 35 |                     d_Data[ IMUL(z, IMUL(dataH, dataW)) + IMUL(x, dataH ) + y];
 36 |         else
 37 |             d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 0;
 38 |     }
 39 | }
 40 | 
 41 | ////////////////////////////////////////////////////////////////////////////////
 42 | // Modulate Fourier image of padded data by Fourier image of padded kernel
 43 | // and normalize by FFT size
 44 | ////////////////////////////////////////////////////////////////////////////////
 45 | __device__ void complexMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
 46 |     const cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
 47 |     out = t;
 48 | }
 49 | 
 50 | __device__ void complexConjMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
 51 |     const cufftComplex t = {c * (a.x * b.x + a.y * b.y), c * (a.y * b.x - a.x * b.y)};
 52 |     out = t;
 53 | }
 54 | 
 55 | __global__ void elementwiseProductAndNormalize(
 56 |     cufftComplex *fft_Output,
 57 |     const cufftComplex *fft_PaddedData,
 58 |     const cufftComplex *fft_PaddedKernel,
 59 |     int FFT_H,
 60 |     int FFT_W,
 61 |     int FEATURE_DIM,
 62 |     float scale
 63 | ){
 64 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 65 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
 66 |     const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
 67 |     
 68 |     if(x < FFT_W && y < FFT_H && z < FEATURE_DIM){
 69 |         // int i = IMUL(z, IMUL(FFT_W, FFT_H)) + IMUL(FFT_H, x) + y;
 70 |         int i = z * FFT_W * FFT_H + FFT_H * x + y;
 71 |         // complexConjMulAndScale(fft_Output[i], fft_PaddedData[i], fft_PaddedKernel[i], scale);
 72 |         fft_Output[i].x = scale * (fft_PaddedData[i].x * fft_PaddedKernel[i].x - fft_PaddedData[i].y * fft_PaddedKernel[i].y);
 73 |         fft_Output[i].y = scale * (fft_PaddedData[i].y * fft_PaddedKernel[i].x + fft_PaddedData[i].x * fft_PaddedKernel[i].y);
 74 |     }
 75 | }
 76 | 
 77 | /* Support in-place computation, i.e. input and output can be the same */
 78 | __global__ void sumAlongFeatures(
 79 |     float *convolutionResult,
 80 |     const float *convolutionPerFeature,
 81 |     int FFT_H,
 82 |     int FFT_W,
 83 |     int FEATURE_DIM
 84 | ){
 85 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 86 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
 87 | 
 88 |     if(x < FFT_W && y < FFT_H){
 89 |         const int result_i = IMUL(FFT_H, x) + y;
 90 |         const int N = IMUL(FFT_W, FFT_H);
 91 | 
 92 |         convolutionResult[result_i] = convolutionPerFeature[result_i];
 93 |         for (int z = 1; z < FEATURE_DIM; z++){
 94 |             convolutionResult[result_i] += 
 95 |                 convolutionPerFeature[IMUL(z, N) + result_i];
 96 |         }
 97 |     }
 98 | }
 99 | 
100 | /*
101 |  * Host code
102 |  */
103 | 
104 | ////////////////////////////////////////////////////////////////////////////////
105 | // Helper functions
106 | ////////////////////////////////////////////////////////////////////////////////
107 | //Round a / b to nearest higher integer value
108 | int iDivUp(int a, int b){
109 |     return (a % b != 0) ? (a / b + 1) : (a / b);
110 | }
111 | 
112 | //Align a to nearest higher multiple of b
113 | int iAlignUp(int a, int b){
114 |     return (a % b != 0) ?  (a - a % b + b) : a;
115 | }
116 | 
117 | 
118 | ////////////////////////////////////////////////////////////////////////////////
119 | // Mex Entry
120 | ////////////////////////////////////////////////////////////////////////////////
121 | void mexFunction(int nlhs, mxArray *plhs[],
122 |                  int nrhs, mxArray const *prhs[])
123 | {
124 |     ConvPlan plan[N_MAX_PARALLEL];
125 | 
126 |     /* Declare all variables.*/
127 |     const mxGPUArray *mxFFTData;
128 |     const mxGPUArray *mxKernel;
129 |     mxGPUArray *mxFFTKernel;
130 |     mxGPUArray *mxConvolution;
131 | 
132 |     cufftComplex **d_CFFT_DATA_PER_GPU;
133 | 
134 |     /* concurrent kernel executions */
135 |     int N_GPU; 
136 |     int N_BATCH_PER_GPU = 2;
137 | 
138 |     char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
139 | 
140 |     /* Choose a reasonably sized number of threads for the block. */
141 |     int THREAD_PER_BLOCK_H = 16;
142 |     int THREAD_PER_BLOCK_W = 8;
143 |     int THREAD_PER_BLOCK_D = 8;
144 |     int THREAD_PER_BLOCK_2D = 32;
145 | 
146 |     // const mwSize * mxKernel_Dim;
147 |     const mwSize * mxFFT_Dim;
148 |     // int MblocksPerGrid, NblocksPerGrid;
149 |     int KERNEL_H, KERNEL_W, N_KERNEL,
150 |         CFFT_H, CFFT_W, FFT_H, FFT_W, FEATURE_DIM,
151 |         KERNEL_SIZE, CFFT_SIZE, FFT_SIZE, CONV_SIZE;
152 | 
153 |     int gpuIdx, streamIdx, planIdx;
154 | 
155 |     /* Initialize the MathWorks GPU API. */
156 |     mxInitGPU();
157 |     
158 |     /* Throw an error if the input is not a GPU array. */
159 |     if ( (nrhs < 2) || (nrhs > 3) || !mxIsGPUArray(prhs[0]) )
160 |         mexErrMsgIdAndTxt(errId, "The data must be FFT-ed real array in GPU");
161 | 
162 |     if (( nrhs == 3)  && mxGetNumberOfElements(prhs[2]) != 4)
163 |         mexErrMsgIdAndTxt(errId, "CUDA Thread Size must be 4 integers : THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nYou must choose size such that total thread will not be larger than MaxThreadsPerBlock");
164 | 
165 |     if ( nrhs == 3 ){
166 |         const double* threadSize = (double *)mxGetData(prhs[2]);
167 |         THREAD_PER_BLOCK_H = (int)threadSize[0];
168 |         THREAD_PER_BLOCK_W = (int)threadSize[1];
169 |         THREAD_PER_BLOCK_D = (int)threadSize[2];
170 |         THREAD_PER_BLOCK_2D = (int)threadSize[3];
171 |         if(debug) printf("Thread size: H=%d, W=%d, D=%d, D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D);
172 |     }
173 | 
174 |     cudaDeviceProp dev;
175 |     cudaGetDeviceProperties(&dev,0);
176 |     int success = checkDeviceProp(dev);
177 | 
178 |     mxFFTData = mxGPUCreateFromMxArray(prhs[0]);
179 |     mxFFT_Dim = mxGPUGetDimensions(mxFFTData);
180 | 
181 |     // FFT Dim
182 |     // In CUDA, R2C fft will create only N/2 + 1 points. This is due to the Hermitian symmetry of the points.
183 |     CFFT_H = mxFFT_Dim[0];
184 |     CFFT_W = mxFFT_Dim[1];
185 | 
186 |     FFT_H = (mxFFT_Dim[0] - 1) * 2;
187 |     FFT_W = mxFFT_Dim[1];
188 | 
189 |     FEATURE_DIM = mxFFT_Dim[2];
190 | 
191 |     CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2);
192 |     FFT_SIZE  = FFT_W  * FFT_H  * FEATURE_DIM * sizeof(float);
193 |     CONV_SIZE = FFT_W  * FFT_H  * sizeof(float);
194 |     
195 |     if(debug) printf("FFT Data size: h=%d, w=%d, f=%d\n", FFT_H, FFT_W, FEATURE_DIM);
196 | 
197 |     if (mxGetClassID(prhs[1]) != mxCELL_CLASS)
198 |         mexErrMsgIdAndTxt(errId, "Kernel must be a cell array");
199 | 
200 |     mwSize nKernel = mxGetNumberOfElements(prhs[1]);
201 |     N_KERNEL = (int)nKernel;
202 |     plhs[0] = mxCreateCellMatrix(1, N_KERNEL);
203 |     
204 |     if(debug) printf("N Kernel: %d\n", N_KERNEL);
205 | 
206 | 
207 |     /* Set block size and thread size */
208 |     dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D);
209 |     dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x), 
210 |                         iDivUp(FFT_H, threadBlock3D.y), 
211 |                         iDivUp(FEATURE_DIM, threadBlock3D.z));
212 | 
213 |     dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D);
214 |     dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x), 
215 |                         iDivUp(FFT_H, threadBlock2D.y));
216 | 
217 | 
218 |     /* Find number of cuda capable devices */
219 |     CUDA_SAFE_CALL(cudaGetDeviceCount(&N_GPU));
220 |     if(debug) printf( "CUDA-capable device count: %i\n", N_GPU);
221 |     
222 |     CUDA_SAFE_CALL(cudaSetDevice(0));
223 |     d_CFFT_DATA_PER_GPU = (cufftComplex **)malloc(N_GPU * sizeof(float));
224 | 
225 |     /*  Pad Kernel */
226 |     // CUDA_SAFE_CALL(cudaMalloc((void **)&d_PaddedKernel,    FFT_SIZE));
227 |     // CUDA_SAFE_CALL(cudaMalloc((void **)&d_IFFTEProd,       FFT_SIZE));
228 | 
229 |     /* Create a GPUArray to hold the result and get its underlying pointer. */
230 |     mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
231 |     FFT_dims[0] = FFT_H;
232 |     FFT_dims[1] = FFT_W;
233 |     FFT_dims[2] = FEATURE_DIM;
234 | 
235 |     d_CFFT_DATA_PER_GPU[0] = (cufftComplex *)mxGPUGetDataReadOnly(mxFFTData);
236 | 
237 |     // mxConvolution = mxGPUCreateGPUArray(2,
238 |     //                         FFT_dims, // Third element will not be accessed
239 |     //                         mxSINGLE_CLASS,
240 |     //                         mxREAL,
241 |     //                         MX_GPU_DO_NOT_INITIALIZE);
242 | 
243 |     // d_CONVOLUTION = (cufftReal *)(mxGPUGetData(mxConvolution));
244 | 
245 |     // CUDA_SAFE_CALL(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE));
246 | 
247 |     // mxFFTKernel = mxGPUCreateGPUArray(3,
248 |     //                         mxFFT_Dim,
249 |     //                         mxSINGLE_CLASS,
250 |     //                         mxCOMPLEX,
251 |     //                         MX_GPU_DO_NOT_INITIALIZE);
252 | 
253 |     // d_CFFT_KERNEL = (cufftComplex *)(mxGPUGetData(mxFFTKernel));
254 | 
255 |     // CUDA_SAFE_CALL(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE));
256 | 
257 |     // CUDA_SAFE_CALL(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE));
258 | 
259 |     /* FFT Kernel */
260 |     int BATCH = FEATURE_DIM;
261 |     int FFT_Dims[] = { FFT_W, FFT_H };
262 |     int CFFT_Dims[] = { CFFT_W, CFFT_H };
263 | 
264 |     int idist = FFT_W * FFT_H;
265 |     int odist = CFFT_W * CFFT_H;
266 |     
267 |     // mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
268 |     //     FFT_dims[0] = FFT_H;
269 |     //     FFT_dims[1] = FFT_W;
270 | 
271 |     N_GPU = 1;
272 |     //Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked)
273 |     for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++)
274 |     {
275 |         // Set GPU
276 |         CUDA_SAFE_CALL(cudaSetDevice(gpuIdx));
277 |         // if (gpuIdx != 0) CUDA_SAFE_CALL();
278 |         /* COPY mxFFTData to individual GPU */
279 |         if (gpuIdx > 0) {
280 |             if(debug) printf("start inter gpu copy from 0 to %d\n", gpuIdx);
281 |             CUDA_SAFE_CALL(cudaMalloc((void **)&d_CFFT_DATA_PER_GPU[gpuIdx], CFFT_SIZE));
282 |             CUDA_SAFE_CALL(cudaMemcpyPeerAsync(d_CFFT_DATA_PER_GPU[gpuIdx],
283 |                     gpuIdx,
284 |                     d_CFFT_DATA_PER_GPU[0],
285 |                     0,
286 |                     CFFT_SIZE,
287 |                     plan[0].stream));
288 |             if(debug) printf("end gpu copy from 0 to %d\n", gpuIdx);
289 |         }
290 | 
291 |         // Set Streams
292 |         for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){
293 |             planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx;
294 | 
295 |             CUDA_SAFE_CALL(cudaStreamCreate(&plan[planIdx].stream));
296 |             
297 |             // Cufft Plans
298 |             CUFFT_SAFE_CALL(cufftPlanMany(&plan[planIdx].FFTplan_R2C, 
299 |                 2, // rank
300 |                 FFT_Dims, 
301 |                 FFT_Dims, 1, idist, // *inembed, istride, idist
302 |                 CFFT_Dims, 1, odist, // *onembed, ostride, odist
303 |                 CUFFT_R2C, 
304 |                 BATCH)); // batch
305 |             cufftSetStream(plan[planIdx].FFTplan_R2C, plan[planIdx].stream);
306 | 
307 |             CUFFT_SAFE_CALL(cufftPlanMany(&plan[planIdx].FFTplan_C2R, 
308 |                 2, // rank
309 |                 FFT_Dims,
310 |                 CFFT_Dims, 1, odist, // *inembed, istride, idist
311 |                 FFT_Dims, 1, idist, // *onembed, ostride, odist
312 |                 CUFFT_C2R, 
313 |                 BATCH)); // batch
314 |             cufftSetStream(plan[planIdx].FFTplan_C2R, plan[planIdx].stream);
315 | 
316 |             plan[planIdx].d_CFFT_DATA = d_CFFT_DATA_PER_GPU[gpuIdx];
317 | 
318 |             //Allocate memory
319 |             CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_CFFT_KERNEL, CFFT_SIZE));
320 |             CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_FFTEProd,    CFFT_SIZE));
321 |             CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_CONVOLUTION, CONV_SIZE));
322 |             CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_IFFTEProd,       FFT_SIZE));
323 |             // d_Kernel, dynamically set
324 |             CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_PaddedKernel,    FFT_SIZE));
325 |             // h_Kernel, dynamically set
326 |             // CUDA_SAFE_CALL(cudaMallocHost((void **)&plan[planIdx].h_CONVOLUTION,    CONV_SIZE));
327 |         }
328 |     }
329 |     
330 | 
331 |     /* For each kernel iterate */
332 |     int N_PLANS = N_GPU * N_BATCH_PER_GPU;
333 |     printf("N Plans %d\n",N_PLANS);
334 | 
335 |     int kernelIdx = 0;
336 |     int lastPlanIdx;
337 | 
338 |     while(kernelIdx < N_KERNEL){
339 |         if(debug) printf( "Kernel: %d\n",kernelIdx);
340 | 
341 |         for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++){
342 |             if (kernelIdx >= N_KERNEL) break;
343 | 
344 |             // Set GPU
345 |             CUDA_SAFE_CALL(cudaSetDevice(gpuIdx));
346 |             
347 |             // Set Streams
348 |             for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){
349 |                 planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx;
350 | 
351 |                 // Get Kernel Data
352 |                 const mxArray *mxCurrentCell = mxGetCell(prhs[1], kernelIdx);
353 |                 {
354 |                     if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 )
355 |                         mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
356 | 
357 |                     if(debug) printf("Start plan %d\n", planIdx);
358 | 
359 |                     plan[planIdx].h_Kernel = (float *)mxGetData(mxCurrentCell);
360 |                     plan[planIdx].mxKernel_Dim = mxGetDimensions(mxCurrentCell);
361 | 
362 |                     // Kernel dimensions
363 |                     KERNEL_H = plan[planIdx].mxKernel_Dim[0];
364 |                     KERNEL_W = plan[planIdx].mxKernel_Dim[1];
365 |                     KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
366 | 
367 |                     if(debug) printf("Start copy\n");
368 |                     // CUDA_SAFE_CALL(cudaHostRegister(plan[planIdx].h_Kernel, KERNEL_SIZE, cudaHostRegisterPortable));
369 |                     // CUDA_SAFE_CALL(cudaHostGetDevicePointer((void **) &plan[planIdx].d_Kernel, (void *)plan[planIdx].h_Kernel, 0));
370 |                     CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_Kernel, KERNEL_SIZE));
371 |                     CUDA_SAFE_CALL(cudaMemcpyAsync(plan[planIdx].d_Kernel, plan[planIdx].h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice, plan[planIdx].stream));
372 |                     // CUDA_SAFE_CALL(cudaMemcpy(plan[planIdx].d_Kernel, plan[planIdx].h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
373 |                     mxKernel = NULL;
374 |                 }
375 | 
376 |                 if(debug) printf("Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W);
377 | 
378 |                 if (FEATURE_DIM != plan[planIdx].mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ){
379 |                     mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size");
380 |                 }
381 | 
382 |                 // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
383 |                 if(debug) printf("Sync before padding\n");
384 |                 padData<<<dataBlockGrid3D, threadBlock3D, 0, plan[planIdx].stream>>>(
385 |                     plan[planIdx].d_PaddedKernel,
386 |                     plan[planIdx].d_Kernel,
387 |                     FFT_W,
388 |                     FFT_H,
389 |                     KERNEL_W,
390 |                     KERNEL_H,
391 |                     FEATURE_DIM
392 |                     );
393 |                 if(debug) printf("Padding done\n");
394 | 
395 |                 CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
396 |                 CUFFT_SAFE_CALL(cufftExecR2C(plan[planIdx].FFTplan_R2C, plan[planIdx].d_PaddedKernel, plan[planIdx].d_CFFT_KERNEL));
397 |                 // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
398 | 
399 |                 if(debug) printf("FFT done\n");
400 |                 
401 |                 /* Hadamard product, Element-wise multiplication in frequency domain */
402 |                 /* If execute the following, second compile of this file create MATLAB error */
403 |                 elementwiseProductAndNormalize<<<dataBlockGrid3D, threadBlock3D, 0, plan[planIdx].stream>>>(
404 |                         plan[planIdx].d_FFTEProd, // out
405 |                         plan[planIdx].d_CFFT_DATA, // in data
406 |                         plan[planIdx].d_CFFT_KERNEL,   // in kernel
407 |                         CFFT_H,
408 |                         CFFT_W,
409 |                         FEATURE_DIM,
410 |                         1.0f / (FFT_W * FFT_H)
411 |                     );
412 |                 if(debug) printf("Eprod done\n");
413 |                 CUFFT_SAFE_CALL(cufftExecC2R(plan[planIdx].FFTplan_C2R, plan[planIdx].d_FFTEProd, plan[planIdx].d_IFFTEProd));
414 |                 // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
415 |                 if(debug) printf("Second fft done\n");
416 |                 sumAlongFeatures<<<dataBlockGrid2D, threadBlock2D, 0, plan[planIdx].stream>>>(
417 |                         plan[planIdx].d_CONVOLUTION,
418 |                         plan[planIdx].d_IFFTEProd,
419 |                         FFT_H,
420 |                         FFT_W,
421 |                         FEATURE_DIM
422 |                     );
423 |                 if(debug) printf("sum along features done\n");
424 |                 // CUDA_SAFE_CALL(cudaHostUnregister(plan[planIdx].h_Kernel));
425 | 
426 |                 plan[planIdx].convolutionResult = mxCreateNumericArray(2, FFT_dims, mxSINGLE_CLASS, mxREAL);
427 |                 plan[planIdx].h_CONVOLUTION = (float *)mxGetData(plan[planIdx].convolutionResult);
428 | 
429 |                 // CUDA_SAFE_CALL(cudaHostRegister(plan[planIdx].h_CONVOLUTION, CONV_SIZE, cudaHostRegisterPortable));
430 |                 CUDA_SAFE_CALL(cudaMemcpyAsync(plan[planIdx].h_CONVOLUTION, plan[planIdx].d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost, plan[planIdx].stream));
431 |                 // CUDA_SAFE_CALL(cudaMemcpy(plan[planIdx].h_CONVOLUTION, plan[planIdx].d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost));
432 | 
433 |                 if(debug) printf("Copy done\n");
434 |  
435 |                 // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
436 |                 if(debug) printf("Sync done\n");
437 | 
438 |                 mxSetCell(plhs[0], kernelIdx, plan[planIdx].convolutionResult);
439 |                 if(debug) printf("Setting Cell done\n");
440 |                 // if(debug){
441 |                 //     for(int i = 0; i < 10; i++)
442 |                 //         printf("%f\n", plan[planIdx].h_CONVOLUTION[i]);
443 |                 // }
444 |                 kernelIdx = kernelIdx + 1;
445 |                 if (kernelIdx >= N_KERNEL) break;
446 |             }
447 |         }
448 | 
449 |         lastPlanIdx = planIdx;
450 |         if(debug) printf("lastPlanIdx : %d\n", lastPlanIdx);
451 | 
452 |         for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++){
453 |             if (planIdx > lastPlanIdx ) break;
454 | 
455 |             // Set GPU
456 |             CUDA_SAFE_CALL(cudaSetDevice(gpuIdx));
457 |             
458 |             // Set Streams
459 |             for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){
460 |                 planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx;
461 |                 if (planIdx > lastPlanIdx ) break;
462 |                 CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
463 |                 CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_Kernel));
464 |                 // CUDA_SAFE_CALL(cudaHostUnregister(plan[planIdx].h_Kernel));
465 |                 // CUDA_SAFE_CALL(cudaHostUnregister(plan[planIdx].h_CONVOLUTION));
466 |                 if(debug) printf("Synchronize %d\n", planIdx);
467 |             }
468 |         }
469 |     }
470 |     
471 |     // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel);
472 | 
473 |     /*
474 |      * The mxGPUArray pointers are host-side structures that refer to device
475 |      * data. These must be destroyed before leaving the MEX function.
476 |      */
477 |     mxGPUDestroyGPUArray(mxFFTData);
478 |     // mxGPUDestroyGPUArray(mxConvolution);    
479 |     // mxGPUDestroyGPUArray(mxFFTKernel);
480 | 
481 |     // if(mxKernel == NULL) mxGPUDestroyGPUArray(mxKernel);
482 | 
483 |     for ( gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++)
484 |     {
485 |         // Set GPU
486 |         CUDA_SAFE_CALL(cudaSetDevice(gpuIdx));
487 |         if(debug) printf( "free DATA per GPU %d\n", gpuIdx);
488 |         CUDA_SAFE_CALL(cudaFree(d_CFFT_DATA_PER_GPU[gpuIdx]));
489 |         // Set Streams
490 |         for (int streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){
491 |             int planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx;
492 |             
493 |             cufftDestroy(plan[planIdx].FFTplan_R2C);
494 |             cufftDestroy(plan[planIdx].FFTplan_C2R);
495 | 
496 |             if(debug) printf( "free plans\n");
497 | 
498 |             //Allocate memory
499 |             CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_CFFT_KERNEL));
500 |             CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_FFTEProd));
501 |             CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_CONVOLUTION));
502 |             CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_IFFTEProd));
503 |             // d_Kernel
504 |             CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_PaddedKernel));
505 |             // h_Kernel
506 |             // CUDA_SAFE_CALL(cudaFreeHost(plan[planIdx].h_CONVOLUTION));
507 |             if(debug) printf( "free stream\n");
508 |             CUDA_SAFE_CALL(cudaStreamDestroy(plan[planIdx].stream));
509 |         }
510 | 
511 |         // cudaDeviceReset causes the driver to clean up all state. While
512 |         // not mandatory in normal operation, it is good practice.  It is also
513 |         // needed to ensure correct operation when the application is being
514 |         // profiled. Calling cudaDeviceReset causes all profile data to be
515 |         // flushed before the application exits
516 |         cudaDeviceReset();
517 |     }
518 |     
519 |     // // if(mxKernel == NULL) cudaFree(d_Kernel);
520 | 
521 |     mxFree(FFT_dims);
522 | }
523 | 


--------------------------------------------------------------------------------
/src/cudaConvolutionFFT.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include <cufft.h>
  3 | #include "mex.h"
  4 | #include "gpu/mxGPUArray.h"
  5 | // #include "common/helper_cuda.h"
  6 | #include "cudaConvFFTData.h"
  7 | #include "cudaConvFFTData.cuh"
  8 | 
  9 | static bool debug = false;
 10 | 
 11 | enum OUT_INDEX{
 12 |     CONVOLUTION_CELL_INDEX
 13 | };
 14 | 
 15 | enum IN_INDEX{
 16 |     DATA_INDEX,
 17 |     MAX_KERNEL_H_INDEX,
 18 |     MAX_KERNEL_W_INDEX,
 19 |     KERNLE_CELL_INDEX,
 20 |     THREAD_SIZE_INDEX, // Optional
 21 |     GPU_INDEX          // Optional
 22 | };
 23 | 
 24 | ////////////////////////////////////////////////////////////////////////////////
 25 | // Mex Entry
 26 | ////////////////////////////////////////////////////////////////////////////////
 27 | void mexFunction(int nlhs, mxArray *plhs[],
 28 |                  int nrhs, mxArray const *prhs[])
 29 | {
 30 |     char const * const errId = "cudaConvFFTData:InvalidInput";
 31 | 
 32 |     /* Choose a reasonably sized number of threads for the block. */
 33 |     int THREAD_PER_BLOCK_H = 16;
 34 |     int THREAD_PER_BLOCK_W = 8;
 35 |     int THREAD_PER_BLOCK_D = 8;
 36 |     int THREAD_PER_BLOCK_2D = 32;
 37 | 
 38 |     /* Initialize the MathWorks GPU API. */
 39 |     // If initialized mxInitGPU do nothing
 40 |     if (mxInitGPU() != MX_GPU_SUCCESS)
 41 |         mexErrMsgTxt("mxInitGPU fail");
 42 |     
 43 | 
 44 |     /* Throw an error if the number of inputs mismatch */
 45 |     if ( (nrhs <  (KERNLE_CELL_INDEX + 1)) || (nrhs > (GPU_INDEX + 1) ))
 46 |         mexErrMsgIdAndTxt(errId, "Wrong number of inputs");
 47 | 
 48 | 
 49 |     /*  Set data */
 50 |     const mxArray *mxDATA = prhs[DATA_INDEX];
 51 |     if (mxIsGPUArray(mxDATA) || 
 52 |             mxGetNumberOfDimensions(mxDATA) != 3 || 
 53 |             mxGetClassID(mxDATA) != mxSINGLE_CLASS)
 54 |         mexErrMsgTxt("Invalid data input");
 55 | 
 56 | 
 57 |     /* Kernel dimensions */
 58 |     int MAX_KERNEL_H = (int)mxGetScalar(prhs[MAX_KERNEL_H_INDEX]);
 59 |     int MAX_KERNEL_W = (int)mxGetScalar(prhs[MAX_KERNEL_W_INDEX]);
 60 |     if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n",MAX_KERNEL_H,MAX_KERNEL_W);
 61 | 
 62 | 
 63 |     /* Kernel Input */
 64 |     if (mxGetClassID(prhs[KERNLE_CELL_INDEX]) != mxCELL_CLASS)
 65 |         mexErrMsgIdAndTxt(errId, "Kernel must be a cell array");
 66 |     mwSize nKernel = mxGetNumberOfElements(prhs[KERNLE_CELL_INDEX]);
 67 |     int N_KERNEL = (int)nKernel;
 68 |     if(debug) fprintf(stderr,"N Kernel: %d\n", N_KERNEL);
 69 | 
 70 | 
 71 |     /* Thread size */
 72 |     if (( nrhs > THREAD_SIZE_INDEX)  && mxGetNumberOfElements(prhs[THREAD_SIZE_INDEX]) != 4)
 73 |         mexErrMsgIdAndTxt(errId, "CUDA Thread Size must be 4 integers : THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nYou must choose size such that total thread will not be larger than MaxThreadsPerBlock");
 74 | 
 75 |     if ( nrhs > THREAD_SIZE_INDEX ){
 76 |         const double* threadSize = (double *)mxGetData(prhs[THREAD_SIZE_INDEX]);
 77 |         THREAD_PER_BLOCK_H = (int)threadSize[0];
 78 |         THREAD_PER_BLOCK_W = (int)threadSize[1];
 79 |         THREAD_PER_BLOCK_D = (int)threadSize[2];
 80 |         THREAD_PER_BLOCK_2D = (int)threadSize[3];
 81 |         if(debug) fprintf(stderr,"Thread size: H=%d, W=%d, D=%d, 2D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D);
 82 |     }
 83 | 
 84 |     int GPU_ID = 0;
 85 |     if (nrhs > GPU_INDEX ){
 86 |        GPU_ID = (int)mxGetScalar(prhs[GPU_INDEX]); 
 87 |        if(debug) fprintf(stderr,"Using GPU : %d\n", GPU_ID);
 88 |        cudaSetDevice(GPU_ID);
 89 |     }
 90 | 
 91 | 
 92 |     /*  FFT Data */
 93 |     // Data dimensions
 94 |     const mwSize *DATA_dims = mxGetDimensions(mxDATA);
 95 |     int DATA_H = DATA_dims[0];
 96 |     int DATA_W = DATA_dims[1];
 97 |     int FEATURE_DIM = DATA_dims[2];
 98 | 
 99 |     float *h_Data = (float *)mxGetData(mxDATA);
100 |     if(debug) fprintf(stderr,"Data size: h=%d, w=%d, f=%d\n",DATA_H,DATA_W,FEATURE_DIM); 
101 | 
102 |     // Width and height of padding
103 |     int PADDING_H = MAX_KERNEL_H - 1;
104 |     int PADDING_W = MAX_KERNEL_W - 1;
105 | 
106 |     // Derive FFT size from data and kernel dimensions
107 |     // FFT_H = computeFFTsize(DATA_H + PADDING_H);
108 |     // FFT_W = computeFFTsize(DATA_W + PADDING_W);
109 |     int FFT_H = computeFFTsize16(DATA_H + PADDING_H);
110 |     int FFT_W = computeFFTsize16(DATA_W + PADDING_W);
111 |     int CFFT_W = FFT_W;
112 |     int CFFT_H = FFT_H/2 + 1;
113 | 
114 |     if(debug) fprintf(stderr,"FFT size: h=%d, w=%d\n",FFT_H,FFT_W);
115 | 
116 |     int DATA_SIZE = DATA_W * DATA_H * FEATURE_DIM * sizeof(float);
117 |     int FFT_SIZE  = FFT_W  * FFT_H  * FEATURE_DIM * sizeof(float);
118 |     int CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2);
119 |     int CONV_SIZE = FFT_W  * FFT_H  * sizeof(float);
120 |     
121 |     int BATCH = FEATURE_DIM;
122 |     int FFT_Dims[] = { FFT_W, FFT_H };
123 |     int CFFT_Dims[] = { CFFT_W, CFFT_H };
124 |     int idist = FFT_W * FFT_H;
125 |     int odist = CFFT_W * CFFT_H;
126 | 
127 |     cufftHandle FFTplan_R2C, FFTplan_C2R;
128 |     CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C, 
129 |         2, // rank
130 |         FFT_Dims, 
131 |         FFT_Dims, 1, idist, // *inembed, istride, idist
132 |         CFFT_Dims, 1, odist, // *onembed, ostride, odist
133 |         CUFFT_R2C, 
134 |         BATCH)); // batch
135 | 
136 |     CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_C2R, 
137 |         2, // rank
138 |         FFT_Dims,
139 |         CFFT_Dims, 1, odist, // *inembed, istride, idist
140 |         FFT_Dims, 1, idist, // *onembed, ostride, odist
141 |         CUFFT_C2R, 
142 |         BATCH)); // batch
143 | 
144 |     float *d_Data;
145 |     float *d_PaddedData;
146 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Data,         DATA_SIZE));
147 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedData,   FFT_SIZE));
148 |     CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice));
149 | 
150 |     dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D);
151 |     dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x), 
152 |                         iDivUp(FFT_H, threadBlock3D.y), 
153 |                         iDivUp(FEATURE_DIM, threadBlock3D.z));
154 | 
155 |     padData<<<dataBlockGrid3D, threadBlock3D>>>(
156 |         d_PaddedData,
157 |         d_Data,
158 |         FFT_W,
159 |         FFT_H,
160 |         DATA_W,
161 |         DATA_H,
162 |         FEATURE_DIM
163 |         );
164 | 
165 |     cufftComplex *d_CFFT_DATA;
166 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_DATA,     CFFT_SIZE));
167 |     CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedData, d_CFFT_DATA));
168 |     CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
169 |     cudaFree(d_Data);
170 | 
171 | 
172 | 
173 | 
174 |     
175 | 
176 |     /* Convolution FFT */
177 |     // Set Variables 
178 |     float *d_IFFTEProd;
179 |     float *d_CONVOLUTION;
180 |     cufftComplex *d_CFFT_KERNEL;
181 |     cufftComplex *d_FFTEProd;
182 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_IFFTEProd,    FFT_SIZE));
183 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CONVOLUTION,  CONV_SIZE));
184 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_KERNEL,  CFFT_SIZE));
185 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_FFTEProd,     CFFT_SIZE));
186 |     
187 |     const mxArray *mxCurrentCell;
188 |     const mxGPUArray *mxKernel;
189 |     const mwSize *mxKernel_Dim;
190 |     float *h_Kernel;
191 |     float *d_Kernel;
192 |     int KERNEL_H, KERNEL_W, KERNEL_SIZE;
193 | 
194 |     dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D);
195 |     dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x), 
196 |                         iDivUp(FFT_H, threadBlock2D.y));
197 |     
198 |     mwSize mwCONV_Dims[2];
199 |     mwCONV_Dims[0] = FFT_H;
200 |     mwCONV_Dims[1] = FFT_W;
201 | 
202 |     plhs[CONVOLUTION_CELL_INDEX] = mxCreateCellMatrix(1, N_KERNEL);
203 | 
204 |     for (int kernelIdx = 0; kernelIdx < N_KERNEL; kernelIdx++){
205 |         
206 |         // Get Kernel Data
207 |         mxCurrentCell = mxGetCell(prhs[KERNLE_CELL_INDEX], kernelIdx);
208 |         if (!mxIsGPUArray(mxCurrentCell)){
209 |             
210 |             if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 )
211 |                 mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
212 | 
213 |             h_Kernel = (float *)mxGetData(mxCurrentCell);
214 |             mxKernel_Dim = mxGetDimensions(mxCurrentCell);
215 | 
216 |             // Kernel dimensions
217 |             KERNEL_H = mxKernel_Dim[0];
218 |             KERNEL_W = mxKernel_Dim[1];
219 |             KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
220 | 
221 |             CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Kernel, KERNEL_SIZE));
222 |             CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
223 |             mxKernel = NULL;
224 |         }else{ // Kernel is GPU Array
225 |             mxKernel = mxGPUCreateFromMxArray(mxCurrentCell);
226 | 
227 |             if ( mxGPUGetClassID(mxKernel) != mxSINGLE_CLASS || mxGPUGetNumberOfDimensions(mxKernel) != 3 )
228 |                 mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
229 | 
230 |             mxKernel_Dim = mxGPUGetDimensions(mxKernel);
231 | 
232 |             // Kernel dimensions
233 |             KERNEL_H = mxKernel_Dim[0];
234 |             KERNEL_W = mxKernel_Dim[1];
235 |             KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
236 | 
237 |             d_Kernel = (float *)mxGPUGetDataReadOnly(mxKernel);
238 |         }
239 | 
240 |         if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W);
241 | 
242 |         if (FEATURE_DIM != mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H )
243 |             mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size");
244 | 
245 |         padData<<<dataBlockGrid3D, threadBlock3D>>>(
246 |                 d_PaddedData,
247 |                 d_Kernel,
248 |                 FFT_W,
249 |                 FFT_H,
250 |                 KERNEL_W,
251 |                 KERNEL_H,
252 |                 FEATURE_DIM
253 |             );
254 | 
255 |         CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedData, d_CFFT_KERNEL));
256 |         CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
257 | 
258 |         if(debug) fprintf(stderr,"FFT done\n");
259 | 
260 |         
261 |         /* Hadamard product, Element-wise multiplication in frequency domain */
262 |         /* If execute the following, second compile of this file create MATLAB error */
263 |         elementwiseProductAndNormalize<<<dataBlockGrid3D, threadBlock3D>>>(
264 |                 d_FFTEProd, // out
265 |                 d_CFFT_DATA, // in data
266 |                 d_CFFT_KERNEL,   // in kernel
267 |                 CFFT_H,
268 |                 CFFT_W,
269 |                 FEATURE_DIM,
270 |                 1.0f / (FFT_W * FFT_H)
271 |             );
272 | 
273 |         CUFFT_SAFE_CALL(cufftExecC2R(FFTplan_C2R, d_FFTEProd, d_IFFTEProd));
274 |         CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
275 | 
276 |         sumAlongFeatures<<<dataBlockGrid2D, threadBlock2D>>>(
277 |                 d_CONVOLUTION,
278 |                 d_IFFTEProd,
279 |                 FFT_H,
280 |                 FFT_W,
281 |                 FEATURE_DIM
282 |             );
283 | 
284 |         mxArray * convolutionResult = mxCreateNumericArray(2, mwCONV_Dims, mxSINGLE_CLASS, mxREAL);
285 |         float * h_CONVOLUTION = (float *)mxGetData(convolutionResult);
286 |         CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(h_CONVOLUTION, d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost));
287 | 
288 |         mxSetCell(plhs[CONVOLUTION_CELL_INDEX], kernelIdx, convolutionResult);
289 |         if(mxKernel == NULL) cudaFree(d_Kernel);
290 |         else mxGPUDestroyGPUArray(mxKernel);
291 |     }
292 |     // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel);
293 | 
294 |     /*
295 |      * The mxGPUArray pointers are host-side structures that refer to device
296 |      * data. These must be destroyed before leaving the MEX function.
297 |      */
298 |     // mxGPUDestroyGPUArray(mxFFTData);
299 |     // mxGPUDestroyGPUArray(mxConvolution);    
300 |     // mxGPUDestroyGPUArray(mxFFTKernel);
301 |     
302 |     cufftDestroy(FFTplan_R2C);
303 |     cufftDestroy(FFTplan_C2R);
304 | 
305 |     cudaFree(d_CFFT_DATA);
306 |     cudaFree(d_IFFTEProd);
307 |     cudaFree(d_CONVOLUTION);
308 |     cudaFree(d_CFFT_KERNEL);
309 |     cudaFree(d_FFTEProd);
310 |     cudaFree(d_PaddedData);
311 | }
312 | 


--------------------------------------------------------------------------------
/src/cudaFFTData.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include <cufft.h>
  3 | #include "mex.h"
  4 | #include "gpu/mxGPUArray.h"
  5 | #include "cudaConvFFTData.h"
  6 | #include "cudaConvFFTData.cuh"
  7 | 
  8 | static bool debug = false;
  9 | 
 10 | enum IN_INDEX{
 11 |     DATA_INDEX,
 12 |     KERNEL_H_INDEX,
 13 |     KERNEL_W_INDEX
 14 | };
 15 | ////////////////////////////////////////////////////////////////////////////////
 16 | // Mex Entry
 17 | ////////////////////////////////////////////////////////////////////////////////
 18 | void mexFunction(int nlhs, mxArray *plhs[],
 19 |                  int nrhs, mxArray const *prhs[])
 20 | {
 21 |     /* Declare all variables.*/
 22 |     const mxArray *mxDATA = prhs[DATA_INDEX];
 23 |     mxGPUArray *FFT_DATA;
 24 |     float2 *d_CFFT_DATA;
 25 |     float *h_Data;
 26 |     float *d_Data;
 27 |     float *d_PaddedData;
 28 |     char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
 29 |     char const * const errMsg = "Invalid input to MEX file.";
 30 | 
 31 |     /* Choose a reasonably sized number of threads for the block. */
 32 |     int const THREAD_PER_BLOCK_H = 16;
 33 |     int const THREAD_PER_BLOCK_W = 8;
 34 |     int const THREAD_PER_BLOCK_D = 8;
 35 | 
 36 |     // int MblocksPerGrid, NblocksPerGrid;
 37 |     int KERNEL_H, KERNEL_W, DATA_H, DATA_W, 
 38 |         PADDING_H, PADDING_W, FFT_H, FFT_W, FEATURE_DIM,
 39 |         DATA_SIZE, FFT_SIZE, CFFT_SIZE;
 40 | 
 41 |     
 42 |     /* Initialize the MathWorks GPU API. */
 43 |     // If initialized mxInitGPU do nothing
 44 |     if (mxInitGPU() != MX_GPU_SUCCESS)
 45 |         mexErrMsgTxt("mxInitGPU fail");
 46 | 
 47 |     
 48 |     /* Throw an error if the input is not a GPU array. */
 49 |     if ((nrhs!=3) ||
 50 |             mxIsGPUArray(mxDATA) || 
 51 |             mxGetNumberOfDimensions(mxDATA) != 3 || 
 52 |             mxGetClassID(mxDATA) != mxSINGLE_CLASS) {
 53 |         mexErrMsgIdAndTxt(errId, errMsg);
 54 |     }
 55 | 
 56 | 
 57 |     // Kernel dimensions
 58 |     KERNEL_H = (int)mxGetScalar(prhs[KERNEL_H_INDEX]);
 59 |     KERNEL_W = (int)mxGetScalar(prhs[KERNEL_W_INDEX]);
 60 |     if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n",KERNEL_H,KERNEL_W);
 61 | 
 62 |     // Data dimensions
 63 |     const mwSize *DATA_dims = mxGetDimensions(mxDATA);
 64 |     DATA_H = DATA_dims[0];
 65 |     DATA_W = DATA_dims[1];
 66 |     FEATURE_DIM = DATA_dims[2];
 67 | 
 68 |     h_Data = (float *)mxGetData(mxDATA);
 69 |     if(debug) fprintf(stderr,"Data size: h=%d, w=%d, f=%d\n",DATA_H,DATA_W,FEATURE_DIM); 
 70 | 
 71 |     // Width and height of padding
 72 |     PADDING_H = KERNEL_H - 1;
 73 |     PADDING_W = KERNEL_W - 1;
 74 | 
 75 |     // Derive FFT size from data and kernel dimensions
 76 |     // FFT_H = computeFFTsize(DATA_H + PADDING_H);
 77 |     // FFT_W = computeFFTsize(DATA_W + PADDING_W);
 78 |     FFT_H = computeFFTsize16(DATA_H + PADDING_H);
 79 |     FFT_W = computeFFTsize16(DATA_W + PADDING_W);
 80 | 
 81 |     if(debug) fprintf(stderr,"FFT size: h=%d, w=%d\n",FFT_H,FFT_W);
 82 | 
 83 |     DATA_SIZE = DATA_W * DATA_H * FEATURE_DIM * sizeof(float);
 84 |     FFT_SIZE  = FFT_W  * FFT_H  * FEATURE_DIM * sizeof(float);
 85 |     // CFFT_SIZE = FFT_W  * FFT_H  * FEATURE_DIM * sizeof(float2);
 86 | 
 87 |     // Allocate memory for input
 88 |     // No need to initialize using mxCalloc
 89 | 
 90 |     mwSize CFFT_dims[3];
 91 | 
 92 |     CFFT_dims[0] = FFT_H/2 + 1;
 93 |     CFFT_dims[1] = FFT_W;
 94 |     CFFT_dims[2] = FEATURE_DIM;
 95 | 
 96 |     /* Wrap the result up as a MATLAB gpuArray for return. */
 97 |     FFT_DATA = mxGPUCreateGPUArray(3,
 98 |                                 CFFT_dims,
 99 |                                 mxSINGLE_CLASS,
100 |                                 mxCOMPLEX,
101 |                                 MX_GPU_INITIALIZE_VALUES);
102 |     
103 |     d_CFFT_DATA = (float2 *)mxGPUGetData(FFT_DATA);
104 |     
105 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Data,        DATA_SIZE));
106 |     CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedData,  FFT_SIZE));
107 | 
108 |     CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice));
109 | 
110 |     dim3 threadBlock(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D);
111 |     dim3 dataBlockGrid( iDivUp(FFT_W, threadBlock.x), 
112 |                         iDivUp(FFT_H, threadBlock.y), 
113 |                         iDivUp(FEATURE_DIM, threadBlock.z));
114 | 
115 |     padData<<<dataBlockGrid, threadBlock>>>(
116 |             d_PaddedData,
117 |             d_Data,
118 |             FFT_W,
119 |             FFT_H,
120 |             DATA_W,
121 |             DATA_H,
122 |             FEATURE_DIM
123 |         );
124 | 
125 |     if(debug) fprintf(stderr,"Padding\n");
126 | 
127 |     int BATCH = FEATURE_DIM;
128 |     int FFT_Dims[] = { FFT_W, FFT_H };
129 | 
130 |     int idist = FFT_W * FFT_H;
131 |     int odist = FFT_W * (FFT_H/2 + 1);
132 |     
133 |     int inembed[] = {FFT_W, FFT_H};
134 |     int onembed[] = {FFT_W, FFT_H/2 + 1};
135 | 
136 |     cufftHandle FFTplan_R2C;
137 |     CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C, 
138 |         2, // rank
139 |         FFT_Dims, 
140 |         inembed, 1, idist, // *inembed, istride, idist
141 |         onembed, 1, odist, // *onembed, ostride, odist
142 |         CUFFT_R2C, 
143 |         BATCH)); // batch
144 | 
145 | 
146 |     CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedData, d_CFFT_DATA));
147 |     CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
148 |     if(debug) fprintf(stderr,"Sync\n");
149 | 
150 |     plhs[0] = mxGPUCreateMxArrayOnGPU(FFT_DATA);
151 |     if(debug) fprintf(stderr,"plhs\n");
152 |     /*
153 |      * The mxGPUArray pointers are host-side structures that refer to device
154 |      * data. These must be destroyed before leaving the MEX function.
155 |      */
156 |     mxGPUDestroyGPUArray(FFT_DATA);
157 |     cufftDestroy(FFTplan_R2C);
158 |     cudaFree(d_Data);
159 |     cudaFree(d_PaddedData);
160 | }
161 | 


--------------------------------------------------------------------------------