├── .gitignore
├── LICENSE
├── README.md
├── common
│   ├── helper_cuda.h
│   ├── helper_cuda_drvapi.h
│   ├── helper_cuda_gl.h
│   ├── helper_functions.h
│   ├── helper_image.h
│   ├── helper_math.h
│   ├── helper_string.h
│   └── helper_timer.h
├── compile.m
├── cuda_compile.m
├── demoCudaConvolutionFFT.m
└── src
    ├── convolutionFFTkernel.cu
    ├── cudaConvFFTData.cu
    ├── cudaConvFFTData.cuh
    ├── cudaConvFFTData.h
    ├── cudaConvFFTDataStreams.cu
    ├── cudaConvolutionFFT.cu
    ├── cudaFFTData.cu
    └── cutil.h
/.gitignore:
--------------------------------------------------------------------------------
1 | *.*~
2 | *.o
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | {description}
294 | Copyright (C) {year} {fullname}
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | {signature of Ty Coon}, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | CUDA-FFT-Convolution
2 | ==============
3 |
4 | A standard multi-threaded CPU convolution is very inefficient and slow for very large kernels. This package provides GPU convolution using a Fast Fourier Transform (FFT) implementation in CUDA.
5 |
6 | Standard convolution in the time domain takes O(nm) time, whereas convolution in the frequency domain takes O((n+m) log (n+m)) time, where n is the data length and m is the kernel length.
7 |
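For intuition, the frequency-domain route replaces the sliding-window sum with a point-wise product of the two spectra. A minimal MATLAB sketch of this idea (CPU-side, for illustration only; the package performs the equivalent steps with CUDA FFTs on the GPU):

```matlab
% Illustrative FFT-based 2-D convolution (full convolution via zero-padding).
data   = rand(512, 512, 'single');
kernel = rand(65, 65, 'single');

% Pad both operands to the full output size before transforming.
outH = size(data, 1) + size(kernel, 1) - 1;
outW = size(data, 2) + size(kernel, 2) - 1;

F = fft2(data,   outH, outW);        % spectrum of the data
G = fft2(kernel, outH, outW);        % spectrum of the kernel
convResult = real(ifft2(F .* G));    % point-wise product, then inverse FFT
```
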
8 | ## cudaConvolutionFFT.cu
9 |
10 | The main file takes the data, the maximum kernel height and width, and the convolution kernels (multiple kernels in cell format), and returns the convolution results corresponding to each kernel.
11 |
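A hypothetical call sketch, assuming the compiled MEX function keeps the file name `cudaConvolutionFFT` and the argument order described above (see `demoCudaConvolutionFFT.m` for the authoritative interface):

```matlab
% Hypothetical usage sketch -- names and argument order follow the description
% above; consult demoCudaConvolutionFFT.m for the actual interface.
data    = single(rand(600, 800));           % input data
kernels = {single(rand(40, 40)), ...        % convolution kernels in cell format
           single(rand(64, 32))};
maxKernelH = 64;                            % maximum kernel height
maxKernelW = 40;                            % maximum kernel width

results = cudaConvolutionFFT(data, maxKernelH, maxKernelW, kernels);
```
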
12 | ## Usage and Instructions
13 |
14 | 1. Download the repo.
15 |
16 | ```
17 | git clone http://github.com/chrischoy/MatlabCUDAConv
18 | ```
19 |
20 | 2. Go to the repo. Open MATLAB and type
21 |
22 | ```
23 | compile
24 | ```
25 |
26 | 3. Run the demo. The demo file `demoCudaConvolutionFFT.m` contains detailed instructions and example usage.
27 |
28 |
29 | ```
30 | demoCudaConvolutionFFT
31 | ```
32 |
33 | ## Output
34 |
35 | 
36 |
37 | ### More resources
38 |
39 | [http://chrischoy.org/projects/cuda-fft-convolution](http://chrischoy.org/projects/cuda-fft-convolution)
40 |
--------------------------------------------------------------------------------
/common/helper_cuda.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | ////////////////////////////////////////////////////////////////////////////////
13 | // These are CUDA Helper functions for initialization and error checking
14 |
15 | #ifndef HELPER_CUDA_H
16 | #define HELPER_CUDA_H
17 |
18 | #pragma once
19 |
20 | #include <stdio.h>
21 | #include <stdlib.h>
22 | #include <string.h>
23 |
24 | #include <helper_string.h>
25 |
26 | /*
27 | inline void __ExitInTime(int seconds)
28 | {
29 | fprintf(stdout, "> exiting in %d seconds: ", seconds);
30 | fflush(stdout);
31 | time_t t;
32 | int count;
33 |
34 | for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
35 | fprintf(stdout, "%d...", count);
36 | #if defined(WIN32)
37 | Sleep(1000);
38 | #else
39 | sleep(1);
40 | #endif
41 | }
42 |
43 | fprintf(stdout,"done!\n\n");
44 | fflush(stdout);
45 | }
46 |
47 | #define EXIT_TIME_DELAY 2
48 |
49 | inline void EXIT_DELAY(int return_code)
50 | {
51 | __ExitInTime(EXIT_TIME_DELAY);
52 | exit(return_code);
53 | }
54 | */
55 |
56 | #ifndef EXIT_WAIVED
57 | #define EXIT_WAIVED 2
58 | #endif
59 |
60 | // Note: your SDK sample is required to include the proper header files; please
61 | // refer to the CUDA examples for the needed CUDA headers, which may change depending
62 | // on which CUDA functions are used.
63 |
64 | // CUDA Runtime error messages
65 | #ifdef __DRIVER_TYPES_H__
66 | static const char *_cudaGetErrorEnum(cudaError_t error)
67 | {
68 | switch (error)
69 | {
70 | case cudaSuccess:
71 | return "cudaSuccess";
72 |
73 | case cudaErrorMissingConfiguration:
74 | return "cudaErrorMissingConfiguration";
75 |
76 | case cudaErrorMemoryAllocation:
77 | return "cudaErrorMemoryAllocation";
78 |
79 | case cudaErrorInitializationError:
80 | return "cudaErrorInitializationError";
81 |
82 | case cudaErrorLaunchFailure:
83 | return "cudaErrorLaunchFailure";
84 |
85 | case cudaErrorPriorLaunchFailure:
86 | return "cudaErrorPriorLaunchFailure";
87 |
88 | case cudaErrorLaunchTimeout:
89 | return "cudaErrorLaunchTimeout";
90 |
91 | case cudaErrorLaunchOutOfResources:
92 | return "cudaErrorLaunchOutOfResources";
93 |
94 | case cudaErrorInvalidDeviceFunction:
95 | return "cudaErrorInvalidDeviceFunction";
96 |
97 | case cudaErrorInvalidConfiguration:
98 | return "cudaErrorInvalidConfiguration";
99 |
100 | case cudaErrorInvalidDevice:
101 | return "cudaErrorInvalidDevice";
102 |
103 | case cudaErrorInvalidValue:
104 | return "cudaErrorInvalidValue";
105 |
106 | case cudaErrorInvalidPitchValue:
107 | return "cudaErrorInvalidPitchValue";
108 |
109 | case cudaErrorInvalidSymbol:
110 | return "cudaErrorInvalidSymbol";
111 |
112 | case cudaErrorMapBufferObjectFailed:
113 | return "cudaErrorMapBufferObjectFailed";
114 |
115 | case cudaErrorUnmapBufferObjectFailed:
116 | return "cudaErrorUnmapBufferObjectFailed";
117 |
118 | case cudaErrorInvalidHostPointer:
119 | return "cudaErrorInvalidHostPointer";
120 |
121 | case cudaErrorInvalidDevicePointer:
122 | return "cudaErrorInvalidDevicePointer";
123 |
124 | case cudaErrorInvalidTexture:
125 | return "cudaErrorInvalidTexture";
126 |
127 | case cudaErrorInvalidTextureBinding:
128 | return "cudaErrorInvalidTextureBinding";
129 |
130 | case cudaErrorInvalidChannelDescriptor:
131 | return "cudaErrorInvalidChannelDescriptor";
132 |
133 | case cudaErrorInvalidMemcpyDirection:
134 | return "cudaErrorInvalidMemcpyDirection";
135 |
136 | case cudaErrorAddressOfConstant:
137 | return "cudaErrorAddressOfConstant";
138 |
139 | case cudaErrorTextureFetchFailed:
140 | return "cudaErrorTextureFetchFailed";
141 |
142 | case cudaErrorTextureNotBound:
143 | return "cudaErrorTextureNotBound";
144 |
145 | case cudaErrorSynchronizationError:
146 | return "cudaErrorSynchronizationError";
147 |
148 | case cudaErrorInvalidFilterSetting:
149 | return "cudaErrorInvalidFilterSetting";
150 |
151 | case cudaErrorInvalidNormSetting:
152 | return "cudaErrorInvalidNormSetting";
153 |
154 | case cudaErrorMixedDeviceExecution:
155 | return "cudaErrorMixedDeviceExecution";
156 |
157 | case cudaErrorCudartUnloading:
158 | return "cudaErrorCudartUnloading";
159 |
160 | case cudaErrorUnknown:
161 | return "cudaErrorUnknown";
162 |
163 | case cudaErrorNotYetImplemented:
164 | return "cudaErrorNotYetImplemented";
165 |
166 | case cudaErrorMemoryValueTooLarge:
167 | return "cudaErrorMemoryValueTooLarge";
168 |
169 | case cudaErrorInvalidResourceHandle:
170 | return "cudaErrorInvalidResourceHandle";
171 |
172 | case cudaErrorNotReady:
173 | return "cudaErrorNotReady";
174 |
175 | case cudaErrorInsufficientDriver:
176 | return "cudaErrorInsufficientDriver";
177 |
178 | case cudaErrorSetOnActiveProcess:
179 | return "cudaErrorSetOnActiveProcess";
180 |
181 | case cudaErrorInvalidSurface:
182 | return "cudaErrorInvalidSurface";
183 |
184 | case cudaErrorNoDevice:
185 | return "cudaErrorNoDevice";
186 |
187 | case cudaErrorECCUncorrectable:
188 | return "cudaErrorECCUncorrectable";
189 |
190 | case cudaErrorSharedObjectSymbolNotFound:
191 | return "cudaErrorSharedObjectSymbolNotFound";
192 |
193 | case cudaErrorSharedObjectInitFailed:
194 | return "cudaErrorSharedObjectInitFailed";
195 |
196 | case cudaErrorUnsupportedLimit:
197 | return "cudaErrorUnsupportedLimit";
198 |
199 | case cudaErrorDuplicateVariableName:
200 | return "cudaErrorDuplicateVariableName";
201 |
202 | case cudaErrorDuplicateTextureName:
203 | return "cudaErrorDuplicateTextureName";
204 |
205 | case cudaErrorDuplicateSurfaceName:
206 | return "cudaErrorDuplicateSurfaceName";
207 |
208 | case cudaErrorDevicesUnavailable:
209 | return "cudaErrorDevicesUnavailable";
210 |
211 | case cudaErrorInvalidKernelImage:
212 | return "cudaErrorInvalidKernelImage";
213 |
214 | case cudaErrorNoKernelImageForDevice:
215 | return "cudaErrorNoKernelImageForDevice";
216 |
217 | case cudaErrorIncompatibleDriverContext:
218 | return "cudaErrorIncompatibleDriverContext";
219 |
220 | case cudaErrorPeerAccessAlreadyEnabled:
221 | return "cudaErrorPeerAccessAlreadyEnabled";
222 |
223 | case cudaErrorPeerAccessNotEnabled:
224 | return "cudaErrorPeerAccessNotEnabled";
225 |
226 | case cudaErrorDeviceAlreadyInUse:
227 | return "cudaErrorDeviceAlreadyInUse";
228 |
229 | case cudaErrorProfilerDisabled:
230 | return "cudaErrorProfilerDisabled";
231 |
232 | case cudaErrorProfilerNotInitialized:
233 | return "cudaErrorProfilerNotInitialized";
234 |
235 | case cudaErrorProfilerAlreadyStarted:
236 | return "cudaErrorProfilerAlreadyStarted";
237 |
238 | case cudaErrorProfilerAlreadyStopped:
239 | return "cudaErrorProfilerAlreadyStopped";
240 |
241 | #if __CUDA_API_VERSION >= 0x4000
242 |
243 | case cudaErrorAssert:
244 | return "cudaErrorAssert";
245 |
246 | case cudaErrorTooManyPeers:
247 | return "cudaErrorTooManyPeers";
248 |
249 | case cudaErrorHostMemoryAlreadyRegistered:
250 | return "cudaErrorHostMemoryAlreadyRegistered";
251 |
252 | case cudaErrorHostMemoryNotRegistered:
253 | return "cudaErrorHostMemoryNotRegistered";
254 | #endif
255 |
256 | case cudaErrorStartupFailure:
257 | return "cudaErrorStartupFailure";
258 |
259 | case cudaErrorApiFailureBase:
260 | return "cudaErrorApiFailureBase";
261 | }
262 |
263 |     return "<unknown>";
264 | }
265 | #endif
266 |
267 | #ifdef __cuda_cuda_h__
268 | // CUDA Driver API errors
269 | static const char *_cudaGetErrorEnum(CUresult error)
270 | {
271 | switch (error)
272 | {
273 | case CUDA_SUCCESS:
274 | return "CUDA_SUCCESS";
275 |
276 | case CUDA_ERROR_INVALID_VALUE:
277 | return "CUDA_ERROR_INVALID_VALUE";
278 |
279 | case CUDA_ERROR_OUT_OF_MEMORY:
280 | return "CUDA_ERROR_OUT_OF_MEMORY";
281 |
282 | case CUDA_ERROR_NOT_INITIALIZED:
283 | return "CUDA_ERROR_NOT_INITIALIZED";
284 |
285 | case CUDA_ERROR_DEINITIALIZED:
286 | return "CUDA_ERROR_DEINITIALIZED";
287 |
288 | case CUDA_ERROR_PROFILER_DISABLED:
289 | return "CUDA_ERROR_PROFILER_DISABLED";
290 |
291 | case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
292 | return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
293 |
294 | case CUDA_ERROR_PROFILER_ALREADY_STARTED:
295 | return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
296 |
297 | case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
298 | return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
299 |
300 | case CUDA_ERROR_NO_DEVICE:
301 | return "CUDA_ERROR_NO_DEVICE";
302 |
303 | case CUDA_ERROR_INVALID_DEVICE:
304 | return "CUDA_ERROR_INVALID_DEVICE";
305 |
306 | case CUDA_ERROR_INVALID_IMAGE:
307 | return "CUDA_ERROR_INVALID_IMAGE";
308 |
309 | case CUDA_ERROR_INVALID_CONTEXT:
310 | return "CUDA_ERROR_INVALID_CONTEXT";
311 |
312 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
313 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
314 |
315 | case CUDA_ERROR_MAP_FAILED:
316 | return "CUDA_ERROR_MAP_FAILED";
317 |
318 | case CUDA_ERROR_UNMAP_FAILED:
319 | return "CUDA_ERROR_UNMAP_FAILED";
320 |
321 | case CUDA_ERROR_ARRAY_IS_MAPPED:
322 | return "CUDA_ERROR_ARRAY_IS_MAPPED";
323 |
324 | case CUDA_ERROR_ALREADY_MAPPED:
325 | return "CUDA_ERROR_ALREADY_MAPPED";
326 |
327 | case CUDA_ERROR_NO_BINARY_FOR_GPU:
328 | return "CUDA_ERROR_NO_BINARY_FOR_GPU";
329 |
330 | case CUDA_ERROR_ALREADY_ACQUIRED:
331 | return "CUDA_ERROR_ALREADY_ACQUIRED";
332 |
333 | case CUDA_ERROR_NOT_MAPPED:
334 | return "CUDA_ERROR_NOT_MAPPED";
335 |
336 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
337 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
338 |
339 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
340 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
341 |
342 | case CUDA_ERROR_ECC_UNCORRECTABLE:
343 | return "CUDA_ERROR_ECC_UNCORRECTABLE";
344 |
345 | case CUDA_ERROR_UNSUPPORTED_LIMIT:
346 | return "CUDA_ERROR_UNSUPPORTED_LIMIT";
347 |
348 | case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
349 | return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
350 |
351 | case CUDA_ERROR_INVALID_SOURCE:
352 | return "CUDA_ERROR_INVALID_SOURCE";
353 |
354 | case CUDA_ERROR_FILE_NOT_FOUND:
355 | return "CUDA_ERROR_FILE_NOT_FOUND";
356 |
357 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
358 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
359 |
360 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
361 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
362 |
363 | case CUDA_ERROR_OPERATING_SYSTEM:
364 | return "CUDA_ERROR_OPERATING_SYSTEM";
365 |
366 | case CUDA_ERROR_INVALID_HANDLE:
367 | return "CUDA_ERROR_INVALID_HANDLE";
368 |
369 | case CUDA_ERROR_NOT_FOUND:
370 | return "CUDA_ERROR_NOT_FOUND";
371 |
372 | case CUDA_ERROR_NOT_READY:
373 | return "CUDA_ERROR_NOT_READY";
374 |
375 | case CUDA_ERROR_LAUNCH_FAILED:
376 | return "CUDA_ERROR_LAUNCH_FAILED";
377 |
378 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
379 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
380 |
381 | case CUDA_ERROR_LAUNCH_TIMEOUT:
382 | return "CUDA_ERROR_LAUNCH_TIMEOUT";
383 |
384 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
385 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
386 |
387 | case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
388 | return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
389 |
390 | case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
391 | return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
392 |
393 | case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
394 | return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
395 |
396 | case CUDA_ERROR_CONTEXT_IS_DESTROYED:
397 | return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
398 |
399 | case CUDA_ERROR_ASSERT:
400 | return "CUDA_ERROR_ASSERT";
401 |
402 | case CUDA_ERROR_TOO_MANY_PEERS:
403 | return "CUDA_ERROR_TOO_MANY_PEERS";
404 |
405 | case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
406 | return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
407 |
408 | case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
409 | return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
410 |
411 | case CUDA_ERROR_UNKNOWN:
412 | return "CUDA_ERROR_UNKNOWN";
413 | }
414 |
415 |     return "<unknown>";
416 | }
417 | #endif
418 |
419 | #ifdef CUBLAS_API_H_
420 | // cuBLAS API errors
421 | static const char *_cudaGetErrorEnum(cublasStatus_t error)
422 | {
423 | switch (error)
424 | {
425 | case CUBLAS_STATUS_SUCCESS:
426 | return "CUBLAS_STATUS_SUCCESS";
427 |
428 | case CUBLAS_STATUS_NOT_INITIALIZED:
429 | return "CUBLAS_STATUS_NOT_INITIALIZED";
430 |
431 | case CUBLAS_STATUS_ALLOC_FAILED:
432 | return "CUBLAS_STATUS_ALLOC_FAILED";
433 |
434 | case CUBLAS_STATUS_INVALID_VALUE:
435 | return "CUBLAS_STATUS_INVALID_VALUE";
436 |
437 | case CUBLAS_STATUS_ARCH_MISMATCH:
438 | return "CUBLAS_STATUS_ARCH_MISMATCH";
439 |
440 | case CUBLAS_STATUS_MAPPING_ERROR:
441 | return "CUBLAS_STATUS_MAPPING_ERROR";
442 |
443 | case CUBLAS_STATUS_EXECUTION_FAILED:
444 | return "CUBLAS_STATUS_EXECUTION_FAILED";
445 |
446 | case CUBLAS_STATUS_INTERNAL_ERROR:
447 | return "CUBLAS_STATUS_INTERNAL_ERROR";
448 | }
449 |
450 |     return "<unknown>";
451 | }
452 | #endif
453 |
454 | #ifdef _CUFFT_H_
455 | // cuFFT API errors
456 | static const char *_cudaGetErrorEnum(cufftResult error)
457 | {
458 | switch (error)
459 | {
460 | case CUFFT_SUCCESS:
461 | return "CUFFT_SUCCESS";
462 |
463 | case CUFFT_INVALID_PLAN:
464 | return "CUFFT_INVALID_PLAN";
465 |
466 | case CUFFT_ALLOC_FAILED:
467 | return "CUFFT_ALLOC_FAILED";
468 |
469 | case CUFFT_INVALID_TYPE:
470 | return "CUFFT_INVALID_TYPE";
471 |
472 | case CUFFT_INVALID_VALUE:
473 | return "CUFFT_INVALID_VALUE";
474 |
475 | case CUFFT_INTERNAL_ERROR:
476 | return "CUFFT_INTERNAL_ERROR";
477 |
478 | case CUFFT_EXEC_FAILED:
479 | return "CUFFT_EXEC_FAILED";
480 |
481 | case CUFFT_SETUP_FAILED:
482 | return "CUFFT_SETUP_FAILED";
483 |
484 | case CUFFT_INVALID_SIZE:
485 | return "CUFFT_INVALID_SIZE";
486 |
487 | case CUFFT_UNALIGNED_DATA:
488 | return "CUFFT_UNALIGNED_DATA";
489 | }
490 |
491 |     return "<unknown>";
492 | }
493 | #endif
494 |
495 |
496 | #ifdef CUSPARSEAPI
497 | // cuSPARSE API errors
498 | static const char *_cudaGetErrorEnum(cusparseStatus_t error)
499 | {
500 | switch (error)
501 | {
502 | case CUSPARSE_STATUS_SUCCESS:
503 | return "CUSPARSE_STATUS_SUCCESS";
504 |
505 | case CUSPARSE_STATUS_NOT_INITIALIZED:
506 | return "CUSPARSE_STATUS_NOT_INITIALIZED";
507 |
508 | case CUSPARSE_STATUS_ALLOC_FAILED:
509 | return "CUSPARSE_STATUS_ALLOC_FAILED";
510 |
511 | case CUSPARSE_STATUS_INVALID_VALUE:
512 | return "CUSPARSE_STATUS_INVALID_VALUE";
513 |
514 | case CUSPARSE_STATUS_ARCH_MISMATCH:
515 | return "CUSPARSE_STATUS_ARCH_MISMATCH";
516 |
517 | case CUSPARSE_STATUS_MAPPING_ERROR:
518 | return "CUSPARSE_STATUS_MAPPING_ERROR";
519 |
520 | case CUSPARSE_STATUS_EXECUTION_FAILED:
521 | return "CUSPARSE_STATUS_EXECUTION_FAILED";
522 |
523 | case CUSPARSE_STATUS_INTERNAL_ERROR:
524 | return "CUSPARSE_STATUS_INTERNAL_ERROR";
525 |
526 | case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
527 | return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
528 | }
529 |
530 |     return "<unknown>";
531 | }
532 | #endif
533 |
534 | #ifdef CURAND_H_
535 | // cuRAND API errors
536 | static const char *_cudaGetErrorEnum(curandStatus_t error)
537 | {
538 | switch (error)
539 | {
540 | case CURAND_STATUS_SUCCESS:
541 | return "CURAND_STATUS_SUCCESS";
542 |
543 | case CURAND_STATUS_VERSION_MISMATCH:
544 | return "CURAND_STATUS_VERSION_MISMATCH";
545 |
546 | case CURAND_STATUS_NOT_INITIALIZED:
547 | return "CURAND_STATUS_NOT_INITIALIZED";
548 |
549 | case CURAND_STATUS_ALLOCATION_FAILED:
550 | return "CURAND_STATUS_ALLOCATION_FAILED";
551 |
552 | case CURAND_STATUS_TYPE_ERROR:
553 | return "CURAND_STATUS_TYPE_ERROR";
554 |
555 | case CURAND_STATUS_OUT_OF_RANGE:
556 | return "CURAND_STATUS_OUT_OF_RANGE";
557 |
558 | case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
559 | return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
560 |
561 | case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
562 | return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
563 |
564 | case CURAND_STATUS_LAUNCH_FAILURE:
565 | return "CURAND_STATUS_LAUNCH_FAILURE";
566 |
567 | case CURAND_STATUS_PREEXISTING_FAILURE:
568 | return "CURAND_STATUS_PREEXISTING_FAILURE";
569 |
570 | case CURAND_STATUS_INITIALIZATION_FAILED:
571 | return "CURAND_STATUS_INITIALIZATION_FAILED";
572 |
573 | case CURAND_STATUS_ARCH_MISMATCH:
574 | return "CURAND_STATUS_ARCH_MISMATCH";
575 |
576 | case CURAND_STATUS_INTERNAL_ERROR:
577 | return "CURAND_STATUS_INTERNAL_ERROR";
578 | }
579 |
580 |     return "<unknown>";
581 | }
582 | #endif
583 |
584 | #ifdef NV_NPPIDEFS_H
585 | // NPP API errors
586 | static const char *_cudaGetErrorEnum(NppStatus error)
587 | {
588 | switch (error)
589 | {
590 | case NPP_NOT_SUPPORTED_MODE_ERROR:
591 | return "NPP_NOT_SUPPORTED_MODE_ERROR";
592 |
593 | case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
594 | return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
595 |
596 | case NPP_RESIZE_NO_OPERATION_ERROR:
597 | return "NPP_RESIZE_NO_OPERATION_ERROR";
598 |
599 | case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
600 | return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
601 |
602 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
603 |
604 | case NPP_BAD_ARG_ERROR:
605 | return "NPP_BAD_ARGUMENT_ERROR";
606 |
607 | case NPP_COEFF_ERROR:
608 | return "NPP_COEFFICIENT_ERROR";
609 |
610 | case NPP_RECT_ERROR:
611 | return "NPP_RECTANGLE_ERROR";
612 |
613 | case NPP_QUAD_ERROR:
614 | return "NPP_QUADRANGLE_ERROR";
615 |
616 | case NPP_MEM_ALLOC_ERR:
617 | return "NPP_MEMORY_ALLOCATION_ERROR";
618 |
619 | case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
620 | return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
621 |
622 | case NPP_INVALID_INPUT:
623 | return "NPP_INVALID_INPUT";
624 |
625 | case NPP_POINTER_ERROR:
626 | return "NPP_POINTER_ERROR";
627 |
628 | case NPP_WARNING:
629 | return "NPP_WARNING";
630 |
631 | case NPP_ODD_ROI_WARNING:
632 | return "NPP_ODD_ROI_WARNING";
633 | #else
634 |
635 | // These are for CUDA 5.5 or higher
636 | case NPP_BAD_ARGUMENT_ERROR:
637 | return "NPP_BAD_ARGUMENT_ERROR";
638 |
639 | case NPP_COEFFICIENT_ERROR:
640 | return "NPP_COEFFICIENT_ERROR";
641 |
642 | case NPP_RECTANGLE_ERROR:
643 | return "NPP_RECTANGLE_ERROR";
644 |
645 | case NPP_QUADRANGLE_ERROR:
646 | return "NPP_QUADRANGLE_ERROR";
647 |
648 | case NPP_MEMORY_ALLOCATION_ERR:
649 | return "NPP_MEMORY_ALLOCATION_ERROR";
650 |
651 | case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
652 | return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
653 |
654 | case NPP_INVALID_HOST_POINTER_ERROR:
655 | return "NPP_INVALID_HOST_POINTER_ERROR";
656 |
657 | case NPP_INVALID_DEVICE_POINTER_ERROR:
658 | return "NPP_INVALID_DEVICE_POINTER_ERROR";
659 | #endif
660 |
661 | case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
662 | return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
663 |
664 | case NPP_TEXTURE_BIND_ERROR:
665 | return "NPP_TEXTURE_BIND_ERROR";
666 |
667 | case NPP_WRONG_INTERSECTION_ROI_ERROR:
668 | return "NPP_WRONG_INTERSECTION_ROI_ERROR";
669 |
670 | case NPP_NOT_EVEN_STEP_ERROR:
671 | return "NPP_NOT_EVEN_STEP_ERROR";
672 |
673 | case NPP_INTERPOLATION_ERROR:
674 | return "NPP_INTERPOLATION_ERROR";
675 |
676 | case NPP_RESIZE_FACTOR_ERROR:
677 | return "NPP_RESIZE_FACTOR_ERROR";
678 |
679 | case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
680 | return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
681 |
682 |
683 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
684 |
685 | case NPP_MEMFREE_ERR:
686 | return "NPP_MEMFREE_ERR";
687 |
688 | case NPP_MEMSET_ERR:
689 | return "NPP_MEMSET_ERR";
690 |
691 | case NPP_MEMCPY_ERR:
692 | return "NPP_MEMCPY_ERROR";
693 |
694 | case NPP_MIRROR_FLIP_ERR:
695 | return "NPP_MIRROR_FLIP_ERR";
696 | #else
697 |
698 | case NPP_MEMFREE_ERROR:
699 | return "NPP_MEMFREE_ERROR";
700 |
701 | case NPP_MEMSET_ERROR:
702 | return "NPP_MEMSET_ERROR";
703 |
704 | case NPP_MEMCPY_ERROR:
705 | return "NPP_MEMCPY_ERROR";
706 |
707 | case NPP_MIRROR_FLIP_ERROR:
708 | return "NPP_MIRROR_FLIP_ERROR";
709 | #endif
710 |
711 | case NPP_ALIGNMENT_ERROR:
712 | return "NPP_ALIGNMENT_ERROR";
713 |
714 | case NPP_STEP_ERROR:
715 | return "NPP_STEP_ERROR";
716 |
717 | case NPP_SIZE_ERROR:
718 | return "NPP_SIZE_ERROR";
719 |
720 | case NPP_NULL_POINTER_ERROR:
721 | return "NPP_NULL_POINTER_ERROR";
722 |
723 | case NPP_CUDA_KERNEL_EXECUTION_ERROR:
724 | return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
725 |
726 | case NPP_NOT_IMPLEMENTED_ERROR:
727 | return "NPP_NOT_IMPLEMENTED_ERROR";
728 |
729 | case NPP_ERROR:
730 | return "NPP_ERROR";
731 |
732 | case NPP_SUCCESS:
733 | return "NPP_SUCCESS";
734 |
735 | case NPP_WRONG_INTERSECTION_QUAD_WARNING:
736 | return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
737 |
738 | case NPP_MISALIGNED_DST_ROI_WARNING:
739 | return "NPP_MISALIGNED_DST_ROI_WARNING";
740 |
741 | case NPP_AFFINE_QUAD_INCORRECT_WARNING:
742 | return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
743 |
744 | case NPP_DOUBLE_SIZE_WARNING:
745 | return "NPP_DOUBLE_SIZE_WARNING";
746 |
747 | case NPP_WRONG_INTERSECTION_ROI_WARNING:
748 | return "NPP_WRONG_INTERSECTION_ROI_WARNING";
749 | }
750 |
751 |     return "<unknown>";
752 | }
753 | #endif
754 |
755 | #ifdef __DRIVER_TYPES_H__
756 | #ifndef DEVICE_RESET
757 | #define DEVICE_RESET cudaDeviceReset();
758 | #endif
759 | #else
760 | #ifndef DEVICE_RESET
761 | #define DEVICE_RESET
762 | #endif
763 | #endif
764 |
765 | template< typename T >
766 | void check(T result, char const *const func, const char *const file, int const line)
767 | {
768 | if (result)
769 | {
770 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
771 |             file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
772 | DEVICE_RESET
773 | // Make sure we call CUDA Device Reset before exiting
774 | exit(EXIT_FAILURE);
775 | }
776 | }
777 |
778 | #ifdef __DRIVER_TYPES_H__
779 | // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
780 | #define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )
781 |
782 | // This will output the proper error string when calling cudaGetLastError
783 | #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
784 |
785 | inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
786 | {
787 | cudaError_t err = cudaGetLastError();
788 |
789 | if (cudaSuccess != err)
790 | {
791 | fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
792 | file, line, errorMessage, (int)err, cudaGetErrorString(err));
793 | DEVICE_RESET
794 | exit(EXIT_FAILURE);
795 | }
796 | }
797 | #endif
798 |
799 | #ifndef MAX
800 | #define MAX(a,b) (a > b ? a : b)
801 | #endif
802 |
803 | // Beginning of GPU Architecture definitions
804 | inline int _ConvertSMVer2Cores(int major, int minor)
805 | {
806 | // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
807 | typedef struct
808 | {
809 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
810 | int Cores;
811 | } sSMtoCores;
812 |
813 | sSMtoCores nGpuArchCoresPerSM[] =
814 | {
815 | { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
816 | { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
817 | { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
818 | { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
819 | { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
820 | { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
821 | { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
822 | { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
823 | { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
824 | { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
825 | { -1, -1 }
826 | };
827 |
828 | int index = 0;
829 |
830 | while (nGpuArchCoresPerSM[index].SM != -1)
831 | {
832 | if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
833 | {
834 | return nGpuArchCoresPerSM[index].Cores;
835 | }
836 |
837 | index++;
838 | }
839 |
840 | // If we don't find the values, we default use the previous one to run properly
841 | printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
842 | return nGpuArchCoresPerSM[7].Cores;
843 | }
844 | // end of GPU Architecture definitions
845 |
846 | #ifdef __CUDA_RUNTIME_H__
847 | // General GPU Device CUDA Initialization
848 | inline int gpuDeviceInit(int devID)
849 | {
850 | int device_count;
851 | checkCudaErrors(cudaGetDeviceCount(&device_count));
852 |
853 | if (device_count == 0)
854 | {
855 | fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
856 | exit(EXIT_FAILURE);
857 | }
858 |
859 | if (devID < 0)
860 | {
861 | devID = 0;
862 | }
863 |
864 | if (devID > device_count-1)
865 | {
866 | fprintf(stderr, "\n");
867 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
868 | fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
869 | fprintf(stderr, "\n");
870 | return -devID;
871 | }
872 |
873 | cudaDeviceProp deviceProp;
874 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
875 |
876 | if (deviceProp.computeMode == cudaComputeModeProhibited)
877 | {
878 | fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n");
879 | return -1;
880 | }
881 |
882 | if (deviceProp.major < 1)
883 | {
884 | fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
885 | exit(EXIT_FAILURE);
886 | }
887 |
888 | checkCudaErrors(cudaSetDevice(devID));
889 | printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
890 |
891 | return devID;
892 | }
893 |
894 | // This function returns the best GPU (with maximum GFLOPS)
895 | inline int gpuGetMaxGflopsDeviceId()
896 | {
897 | int current_device = 0, sm_per_multiproc = 0;
898 | int max_perf_device = 0;
899 | int device_count = 0, best_SM_arch = 0;
900 |
901 | unsigned long long max_compute_perf = 0;
902 | cudaDeviceProp deviceProp;
903 | cudaGetDeviceCount(&device_count);
904 |
905 | checkCudaErrors(cudaGetDeviceCount(&device_count));
906 |
907 | if (device_count == 0)
908 | {
909 | fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
910 | exit(EXIT_FAILURE);
911 | }
912 |
913 | // Find the best major SM Architecture GPU device
914 | while (current_device < device_count)
915 | {
916 | cudaGetDeviceProperties(&deviceProp, current_device);
917 |
918 | // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
919 | if (deviceProp.computeMode != cudaComputeModeProhibited)
920 | {
921 | if (deviceProp.major > 0 && deviceProp.major < 9999)
922 | {
923 | best_SM_arch = MAX(best_SM_arch, deviceProp.major);
924 | }
925 | }
926 |
927 | current_device++;
928 | }
929 |
930 | // Find the best CUDA capable GPU device
931 | current_device = 0;
932 |
933 | while (current_device < device_count)
934 | {
935 | cudaGetDeviceProperties(&deviceProp, current_device);
936 |
937 | // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
938 | if (deviceProp.computeMode != cudaComputeModeProhibited)
939 | {
940 | if (deviceProp.major == 9999 && deviceProp.minor == 9999)
941 | {
942 | sm_per_multiproc = 1;
943 | }
944 | else
945 | {
946 | sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
947 | }
948 |
949 | unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
950 |
951 | if (compute_perf > max_compute_perf)
952 | {
953 | // If we find GPU with SM major > 2, search only these
954 | if (best_SM_arch > 2)
955 | {
956 | // If our device==dest_SM_arch, choose this, or else pass
957 | if (deviceProp.major == best_SM_arch)
958 | {
959 | max_compute_perf = compute_perf;
960 | max_perf_device = current_device;
961 | }
962 | }
963 | else
964 | {
965 | max_compute_perf = compute_perf;
966 | max_perf_device = current_device;
967 | }
968 | }
969 | }
970 |
971 | ++current_device;
972 | }
973 |
974 | return max_perf_device;
975 | }
976 |
977 |
978 | // Initialization code to find the best CUDA Device
979 | inline int findCudaDevice(int argc, const char **argv)
980 | {
981 | cudaDeviceProp deviceProp;
982 | int devID = 0;
983 |
984 | // If the command-line has a device number specified, use it
985 | if (checkCmdLineFlag(argc, argv, "device"))
986 | {
987 | devID = getCmdLineArgumentInt(argc, argv, "device=");
988 |
989 | if (devID < 0)
990 | {
991 | printf("Invalid command line parameter\n ");
992 | exit(EXIT_FAILURE);
993 | }
994 | else
995 | {
996 | devID = gpuDeviceInit(devID);
997 |
998 | if (devID < 0)
999 | {
1000 | printf("exiting...\n");
1001 | exit(EXIT_FAILURE);
1002 | }
1003 | }
1004 | }
1005 | else
1006 | {
1007 | // Otherwise pick the device with highest Gflops/s
1008 | devID = gpuGetMaxGflopsDeviceId();
1009 | checkCudaErrors(cudaSetDevice(devID));
1010 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
1011 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
1012 | }
1013 |
1014 | return devID;
1015 | }
1016 |
1017 | // General check for CUDA GPU SM Capabilities
1018 | inline bool checkCudaCapabilities(int major_version, int minor_version)
1019 | {
1020 | cudaDeviceProp deviceProp;
1021 | deviceProp.major = 0;
1022 | deviceProp.minor = 0;
1023 | int dev;
1024 |
1025 | checkCudaErrors(cudaGetDevice(&dev));
1026 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
1027 |
1028 | if ((deviceProp.major > major_version) ||
1029 | (deviceProp.major == major_version && deviceProp.minor >= minor_version))
1030 | {
1031 | printf(" GPU Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
1032 | return true;
1033 | }
1034 | else
1035 | {
1036 | printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
1037 | return false;
1038 | }
1039 | }
1040 | #endif
1041 |
1042 | // end of CUDA Helper Functions
1043 |
1044 |
1045 | #endif
1046 |
--------------------------------------------------------------------------------
/common/helper_cuda_drvapi.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects)
13 | #ifndef HELPER_CUDA_DRVAPI_H
14 | #define HELPER_CUDA_DRVAPI_H
15 |
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <string.h>
19 |
20 | #include <helper_string.h>
21 | #include <drvapi_error_string.h>
22 |
23 | #ifndef MAX
24 | #define MAX(a,b) (a > b ? a : b)
25 | #endif
26 |
27 | #ifndef EXIT_WAIVED
28 | #define EXIT_WAIVED 2
29 | #endif
30 |
31 | ////////////////////////////////////////////////////////////////////////////////
32 | // These are CUDA Helper functions
33 |
34 | // add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H
35 | #ifdef __cuda_cuda_h__
36 | // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
37 | #ifndef checkCudaErrors
38 | #define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
39 |
40 | // These are the inline versions for all of the SDK helper functions
41 | inline void __checkCudaErrors(CUresult err, const char *file, const int line)
42 | {
43 | if (CUDA_SUCCESS != err)
44 | {
45 | fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
46 | err, getCudaDrvErrorString(err), file, line);
47 | exit(EXIT_FAILURE);
48 | }
49 | }
50 | #endif
51 |
52 | #ifdef getLastCudaDrvErrorMsg
53 | #undef getLastCudaDrvErrorMsg
54 | #endif
55 |
56 | #define getLastCudaDrvErrorMsg(msg) __getLastCudaDrvErrorMsg (msg, __FILE__, __LINE__)
57 |
58 | inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line)
59 | {
60 | CUresult err = cuCtxSynchronize();
61 |
62 | if (CUDA_SUCCESS != err)
63 | {
64 | fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg);
65 | fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n",
66 | err, getCudaDrvErrorString(err), file, line);
67 | exit(EXIT_FAILURE);
68 | }
69 | }
70 |
71 | // This function wraps the CUDA Driver API into a template function
72 | template <class T>
73 | inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
74 | {
75 | CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device);
76 |
77 | if (error_result != CUDA_SUCCESS)
78 | {
79 | printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result));
80 | exit(EXIT_SUCCESS);
81 | }
82 | }
83 | #endif
84 |
85 | // Beginning of GPU Architecture definitions
86 | inline int _ConvertSMVer2CoresDRV(int major, int minor)
87 | {
88 | // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
89 | typedef struct
90 | {
91 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
92 | int Cores;
93 | } sSMtoCores;
94 |
95 | sSMtoCores nGpuArchCoresPerSM[] =
96 | {
97 | { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
98 | { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
99 | { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
100 | { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
101 | { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
102 | { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
103 | { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
104 | { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
105 | { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
106 | { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
107 | { -1, -1 }
108 | };
109 |
110 | int index = 0;
111 |
112 | while (nGpuArchCoresPerSM[index].SM != -1)
113 | {
114 | if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
115 | {
116 | return nGpuArchCoresPerSM[index].Cores;
117 | }
118 |
119 | index++;
120 | }
121 |
122 | // If we don't find the values, we default use the previous one to run properly
123 | printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
124 | return nGpuArchCoresPerSM[7].Cores;
125 | }
126 | // end of GPU Architecture definitions
127 |
128 | #ifdef __cuda_cuda_h__
129 | // General GPU Device CUDA Initialization
130 | inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
131 | {
132 | int cuDevice = 0;
133 | int deviceCount = 0;
134 | CUresult err = cuInit(0);
135 |
136 | if (CUDA_SUCCESS == err)
137 | {
138 | checkCudaErrors(cuDeviceGetCount(&deviceCount));
139 | }
140 |
141 | if (deviceCount == 0)
142 | {
143 | fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
144 | exit(EXIT_FAILURE);
145 | }
146 |
147 | int dev = 0;
148 | dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device=");
149 |
150 | if (dev < 0)
151 | {
152 | dev = 0;
153 | }
154 |
155 | if (dev > deviceCount-1)
156 | {
157 | fprintf(stderr, "\n");
158 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
159 | fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
160 | fprintf(stderr, "\n");
161 | return -dev;
162 | }
163 |
164 | checkCudaErrors(cuDeviceGet(&cuDevice, dev));
165 | char name[100];
166 | cuDeviceGetName(name, 100, cuDevice);
167 |
168 | int computeMode;
169 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
170 |
171 | if (computeMode == CU_COMPUTEMODE_PROHIBITED)
172 | {
173 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use this CUDA Device.\n");
174 | return -1;
175 | }
176 |
177 | if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false)
178 | {
179 | printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
180 | }
181 |
182 | return dev;
183 | }
184 |
185 | // This function returns the best GPU based on performance
186 | inline int gpuGetMaxGflopsDeviceIdDRV()
187 | {
188 | CUdevice current_device = 0, max_perf_device = 0;
189 | int device_count = 0, sm_per_multiproc = 0;
190 | int max_compute_perf = 0, best_SM_arch = 0;
191 | int major = 0, minor = 0 , multiProcessorCount, clockRate;
192 |
193 | cuInit(0);
194 | checkCudaErrors(cuDeviceGetCount(&device_count));
195 |
196 | if (device_count == 0)
197 | {
198 | fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
199 | exit(EXIT_FAILURE);
200 | }
201 |
202 | // Find the best major SM Architecture GPU device
203 | while (current_device < device_count)
204 | {
205 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
206 |
207 | if (major > 0 && major < 9999)
208 | {
209 | best_SM_arch = MAX(best_SM_arch, major);
210 | }
211 |
212 | current_device++;
213 | }
214 |
215 | // Find the best CUDA capable GPU device
216 | current_device = 0;
217 |
218 | while (current_device < device_count)
219 | {
220 | checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
221 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
222 | current_device));
223 | checkCudaErrors(cuDeviceGetAttribute(&clockRate,
224 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
225 | current_device));
226 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
227 |
228 | int computeMode;
229 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
230 |
231 | if (computeMode != CU_COMPUTEMODE_PROHIBITED)
232 | {
233 | if (major == 9999 && minor == 9999)
234 | {
235 | sm_per_multiproc = 1;
236 | }
237 | else
238 | {
239 | sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
240 | }
241 |
242 | int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
243 |
244 | if (compute_perf > max_compute_perf)
245 | {
246 | // If we find GPU with SM major > 2, search only these
247 | if (best_SM_arch > 2)
248 | {
249 | // If our device==best_SM_arch, choose this, or else pass
250 | if (major == best_SM_arch)
251 | {
252 | max_compute_perf = compute_perf;
253 | max_perf_device = current_device;
254 | }
255 | }
256 | else
257 | {
258 | max_compute_perf = compute_perf;
259 | max_perf_device = current_device;
260 | }
261 | }
262 | }
263 |
264 | ++current_device;
265 | }
266 |
267 | return max_perf_device;
268 | }
269 |
270 | // This function returns the best Graphics GPU based on performance
271 | inline int gpuGetMaxGflopsGLDeviceIdDRV()
272 | {
273 | CUdevice current_device = 0, max_perf_device = 0;
274 | int device_count = 0, sm_per_multiproc = 0;
275 | int max_compute_perf = 0, best_SM_arch = 0;
276 | int major = 0, minor = 0, multiProcessorCount, clockRate;
277 | int bTCC = 0;
278 | char deviceName[256];
279 |
280 | cuInit(0);
281 | checkCudaErrors(cuDeviceGetCount(&device_count));
282 |
283 | if (device_count == 0)
284 | {
285 | fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n");
286 | exit(EXIT_FAILURE);
287 | }
288 |
289 | // Find the best major SM Architecture GPU devices that are graphics devices
290 | while (current_device < device_count)
291 | {
292 | checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
293 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
294 |
295 | #if CUDA_VERSION >= 3020
296 | checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
297 | #else
298 |
299 | // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
300 | if (deviceName[0] == 'T')
301 | {
302 | bTCC = 1;
303 | }
304 |
305 | #endif
306 |
307 | int computeMode;
308 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
309 |
310 | if (computeMode != CU_COMPUTEMODE_PROHIBITED)
311 | {
312 | if (!bTCC)
313 | {
314 | if (major > 0 && major < 9999)
315 | {
316 | best_SM_arch = MAX(best_SM_arch, major);
317 | }
318 | }
319 | }
320 |
321 | current_device++;
322 | }
323 |
324 | // Find the best CUDA capable GPU device
325 | current_device = 0;
326 |
327 | while (current_device < device_count)
328 | {
329 | checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
330 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
331 | current_device));
332 | checkCudaErrors(cuDeviceGetAttribute(&clockRate,
333 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
334 | current_device));
335 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
336 |
337 | #if CUDA_VERSION >= 3020
338 | checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
339 | #else
340 |
341 | // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
342 | if (deviceName[0] == 'T')
343 | {
344 | bTCC = 1;
345 | }
346 |
347 | #endif
348 |
349 | int computeMode;
350 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
351 |
352 | if (computeMode != CU_COMPUTEMODE_PROHIBITED)
353 | {
354 | if (major == 9999 && minor == 9999)
355 | {
356 | sm_per_multiproc = 1;
357 | }
358 | else
359 | {
360 | sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
361 | }
362 |
363 | // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
364 | if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
365 | {
366 | int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
367 |
368 | if (compute_perf > max_compute_perf)
369 | {
370 | // If we find GPU with SM major > 2, search only these
371 | if (best_SM_arch > 2)
372 | {
373 | // If our device == best_SM_arch, then we pick this one
374 | if (major == best_SM_arch)
375 | {
376 | max_compute_perf = compute_perf;
377 | max_perf_device = current_device;
378 | }
379 | }
380 | else
381 | {
382 | max_compute_perf = compute_perf;
383 | max_perf_device = current_device;
384 | }
385 | }
386 | }
387 | }
388 |
389 | ++current_device;
390 | }
391 |
392 | return max_perf_device;
393 | }
394 |
395 | // General initialization call to pick the best CUDA Device
396 | inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
397 | {
398 | CUdevice cuDevice;
399 | int devID = 0;
400 |
401 | // If the command-line has a device number specified, use it
402 | if (checkCmdLineFlag(argc, (const char **)argv, "device"))
403 | {
404 | devID = gpuDeviceInitDRV(argc, argv);
405 |
406 | if (devID < 0)
407 | {
408 | printf("exiting...\n");
409 | exit(EXIT_SUCCESS);
410 | }
411 | }
412 | else
413 | {
414 | // Otherwise pick the device with highest Gflops/s
415 | char name[100];
416 | devID = gpuGetMaxGflopsDeviceIdDRV();
417 | checkCudaErrors(cuDeviceGet(&cuDevice, devID));
418 | cuDeviceGetName(name, 100, cuDevice);
419 | printf("> Using CUDA Device [%d]: %s\n", devID, name);
420 | }
421 |
422 | cuDeviceGet(&cuDevice, devID);
423 |
424 | return cuDevice;
425 | }
426 |
427 | // This function will pick the best CUDA device available with OpenGL interop
428 | inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv)
429 | {
430 | CUdevice cuDevice;
431 | int devID = 0;
432 |
433 | // If the command-line has a device number specified, use it
434 | if (checkCmdLineFlag(argc, (const char **)argv, "device"))
435 | {
436 | devID = gpuDeviceInitDRV(argc, (const char **)argv);
437 |
438 | if (devID < 0)
439 | {
440 | printf("no CUDA capable devices found, exiting...\n");
441 | exit(EXIT_SUCCESS);
442 | }
443 | }
444 | else
445 | {
446 | char name[100];
447 | // Otherwise pick the device with highest Gflops/s
448 | devID = gpuGetMaxGflopsGLDeviceIdDRV();
449 | checkCudaErrors(cuDeviceGet(&cuDevice, devID));
450 | cuDeviceGetName(name, 100, cuDevice);
451 | printf("> Using CUDA/GL Device [%d]: %s\n", devID, name);
452 | }
453 |
454 | return devID;
455 | }
456 |
457 | // General check for CUDA GPU SM Capabilities
458 | inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
459 | {
460 | CUdevice cuDevice;
461 | char name[256];
462 | int major = 0, minor = 0;
463 |
464 | checkCudaErrors(cuDeviceGet(&cuDevice, devID));
465 | checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
466 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
467 |
468 | if ((major > major_version) ||
469 | (major == major_version && minor >= minor_version))
470 | {
471 | printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
472 | return true;
473 | }
474 | else
475 | {
476 | printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
477 | return false;
478 | }
479 | }
480 | #endif
481 |
482 | // end of CUDA Helper Functions
483 |
484 | #endif
485 |
--------------------------------------------------------------------------------
/common/helper_cuda_gl.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | #ifndef HELPER_CUDA_GL_H
13 | #define HELPER_CUDA_GL_H
14 |
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <string.h>
18 |
19 | // includes, graphics
20 | #if defined (__APPLE__) || defined(MACOSX)
21 | #include <OpenGL/gl.h>
22 | #include <OpenGL/glu.h>
23 | #else
24 | #include <GL/gl.h>
25 | #include <GL/glu.h>
26 | #endif
27 |
28 | #ifndef EXIT_WAIVED
29 | #define EXIT_WAIVED 2
30 | #endif
31 |
32 | #ifdef __DRIVER_TYPES_H__
33 | #ifndef DEVICE_RESET
34 | #define DEVICE_RESET cudaDeviceReset()
35 | #endif
36 | #else
37 | #ifndef DEVICE_RESET
38 | #define DEVICE_RESET
39 | #endif
40 | #endif
41 |
42 | #ifdef __CUDA_GL_INTEROP_H__
43 | ////////////////////////////////////////////////////////////////////////////////
44 | // These are CUDA OpenGL Helper functions
45 |
46 | inline int gpuGLDeviceInit(int ARGC, const char **ARGV)
47 | {
48 | int deviceCount;
49 | checkCudaErrors(cudaGetDeviceCount(&deviceCount));
50 |
51 | if (deviceCount == 0)
52 | {
53 | fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
54 | exit(EXIT_FAILURE);
55 | }
56 |
57 | int dev = 0;
58 | dev = getCmdLineArgumentInt(ARGC, ARGV, "device=");
59 |
60 | if (dev < 0)
61 | {
62 | dev = 0;
63 | }
64 |
65 | if (dev > deviceCount-1)
66 | {
67 | fprintf(stderr, "\n");
68 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
69 | fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
70 | fprintf(stderr, "\n");
71 | return -dev;
72 | }
73 |
74 | cudaDeviceProp deviceProp;
75 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
76 |
77 | if (deviceProp.computeMode == cudaComputeModeProhibited)
78 | {
79 | fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
80 | return -1;
81 | }
82 |
83 | if (deviceProp.major < 1)
84 | {
85 | fprintf(stderr, "Error: device does not support CUDA.\n");
86 | exit(EXIT_FAILURE);
87 | }
88 |
89 | if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false)
90 | {
91 | fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
92 | }
93 |
94 | checkCudaErrors(cudaGLSetGLDevice(dev));
95 | return dev;
96 | }
97 |
98 | // This function will pick the best CUDA device available with OpenGL interop
99 | inline int findCudaGLDevice(int argc, const char **argv)
100 | {
101 | int devID = 0;
102 |
103 | // If the command-line has a device number specified, use it
104 | if (checkCmdLineFlag(argc, (const char **)argv, "device"))
105 | {
106 | devID = gpuGLDeviceInit(argc, (const char **)argv);
107 |
108 | if (devID < 0)
109 | {
110 | printf("no CUDA capable devices found, exiting...\n");
111 | DEVICE_RESET
112 | exit(EXIT_SUCCESS);
113 | }
114 | }
115 | else
116 | {
117 | // Otherwise pick the device with highest Gflops/s
118 | devID = gpuGetMaxGflopsDeviceId();
119 | cudaGLSetGLDevice(devID);
120 | }
121 |
122 | return devID;
123 | }
124 |
125 | ////////////////////////////////////////////////////////////////////////////
126 | //! Check for OpenGL error
127 | //! @return true if no GL error has been encountered, otherwise false
128 | //! @param file __FILE__ macro
129 | //! @param line __LINE__ macro
130 | //! @note The GL error is listed on stderr
131 | //! @note This function should be used via the CHECK_ERROR_GL() macro
132 | ////////////////////////////////////////////////////////////////////////////
133 | inline bool
134 | sdkCheckErrorGL(const char *file, const int line)
135 | {
136 | bool ret_val = true;
137 |
138 | // check for error
139 | GLenum gl_error = glGetError();
140 |
141 | if (gl_error != GL_NO_ERROR)
142 | {
143 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
144 | char tmpStr[512];
145 | // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
146 | // when the user double clicks on the error line in the Output pane. Like any compile error.
147 | sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error));
148 | fprintf(stderr, "%s", tmpStr);
149 | #endif
150 | fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
151 | fprintf(stderr, "%s\n", gluErrorString(gl_error));
152 | ret_val = false;
153 | }
154 |
155 | return ret_val;
156 | }
157 |
158 | #define SDK_CHECK_ERROR_GL() \
159 | if( false == sdkCheckErrorGL( __FILE__, __LINE__)) { \
160 | DEVICE_RESET \
161 | exit(EXIT_FAILURE); \
162 | }
163 | #endif
164 |
165 | #endif
166 |
--------------------------------------------------------------------------------
/common/helper_functions.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // These are helper functions for the SDK samples (string parsing, timers, image helpers, etc)
13 | #ifndef HELPER_FUNCTIONS_H
14 | #define HELPER_FUNCTIONS_H
15 |
16 | #ifdef WIN32
17 | #pragma warning(disable:4996)
18 | #endif
19 |
20 | // includes, project
21 | #include <stdio.h>
22 | #include <stdlib.h>
23 | #include <assert.h>
24 | #include <exception>
25 | #include <math.h>
26 | #include <string.h>
27 |
28 | #include <fstream>
29 | #include <vector>
30 | #include <iostream>
31 | #include <algorithm>
32 |
33 | // includes, timer, string parsing, image helpers
34 | #include <helper_timer.h> // helper functions for timers
35 | #include <helper_string.h> // helper functions for string parsing
36 | #include <helper_image.h> // helper functions for image compare, dump, data comparisons
37 |
38 | #ifndef EXIT_WAIVED
39 | #define EXIT_WAIVED 2
40 | #endif
41 |
42 | #endif // HELPER_FUNCTIONS_H
43 |
--------------------------------------------------------------------------------
/common/helper_image.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // These are helper functions for the SDK samples (image,bitmap)
13 | #ifndef HELPER_IMAGE_H
14 | #define HELPER_IMAGE_H
15 |
16 | #include <string>
17 | #include <fstream>
18 | #include <iostream>
19 | #include <algorithm>
20 | #include <vector>
21 |
22 | #include <math.h>
23 | #include <assert.h>
24 | #include <stdio.h>
25 |
26 | #ifndef MIN
27 | #define MIN(a,b) ((a < b) ? a : b)
28 | #endif
29 | #ifndef MAX
30 | #define MAX(a,b) ((a > b) ? a : b)
31 | #endif
32 |
33 | #ifndef EXIT_WAIVED
34 | #define EXIT_WAIVED 2
35 | #endif
36 |
37 | #include <helper_string.h>
38 |
39 | // namespace unnamed (internal)
40 | namespace
41 | {
42 | //! size of PGM file header
43 | const unsigned int PGMHeaderSize = 0x40;
44 |
45 | // types
46 |
47 | //! Data converter from unsigned char / unsigned byte to type T
48 | template <class T>
49 | struct ConverterFromUByte;
50 |
51 | //! Data converter from unsigned char / unsigned byte
52 | template<>
53 | struct ConverterFromUByte<unsigned char>
54 | {
55 | //! Conversion operator
56 | //! @return converted value
57 | //! @param val value to convert
58 | float operator()(const unsigned char &val)
59 | {
60 | return static_cast<unsigned char>(val);
61 | }
62 | };
63 |
64 | //! Data converter from unsigned char / unsigned byte to float
65 | template<>
66 | struct ConverterFromUByte<float>
67 | {
68 | //! Conversion operator
69 | //! @return converted value
70 | //! @param val value to convert
71 | float operator()(const unsigned char &val)
72 | {
73 | return static_cast<float>(val) / 255.0f;
74 | }
75 | };
76 |
77 | //! Data converter from unsigned char / unsigned byte to type T
78 | template <class T>
79 | struct ConverterToUByte;
80 |
81 | //! Data converter from unsigned char / unsigned byte to unsigned int
82 | template<>
83 | struct ConverterToUByte<unsigned char>
84 | {
85 | //! Conversion operator (essentially a pass-through)
86 | //! @return converted value
87 | //! @param val value to convert
88 | unsigned char operator()(const unsigned char &val)
89 | {
90 | return val;
91 | }
92 | };
93 |
94 | //! Data converter from unsigned char / unsigned byte to unsigned int
95 | template<>
96 | struct ConverterToUByte<float>
97 | {
98 | //! Conversion operator
99 | //! @return converted value
100 | //! @param val value to convert
101 | unsigned char operator()(const float &val)
102 | {
103 | return static_cast<unsigned char>(val * 255.0f);
104 | }
105 | };
106 | }
107 |
108 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
109 | #ifndef FOPEN
110 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
111 | #endif
112 | #ifndef FOPEN_FAIL
113 | #define FOPEN_FAIL(result) (result != 0)
114 | #endif
115 | #ifndef SSCANF
116 | #define SSCANF sscanf_s
117 | #endif
118 | #else
119 | #ifndef FOPEN
120 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
121 | #endif
122 | #ifndef FOPEN_FAIL
123 | #define FOPEN_FAIL(result) (result == NULL)
124 | #endif
125 | #ifndef SSCANF
126 | #define SSCANF sscanf
127 | #endif
128 | #endif
129 |
130 | inline bool
131 | __loadPPM(const char *file, unsigned char **data,
132 | unsigned int *w, unsigned int *h, unsigned int *channels)
133 | {
134 | FILE *fp = NULL;
135 |
136 | if (FOPEN_FAIL(FOPEN(fp, file, "rb")))
137 | {
138 | std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
139 | return false;
140 | }
141 |
142 | // check header
143 | char header[PGMHeaderSize];
144 |
145 | if (fgets(header, PGMHeaderSize, fp) == NULL)
146 | {
147 | std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
148 | return false;
149 | }
150 |
151 | if (strncmp(header, "P5", 2) == 0)
152 | {
153 | *channels = 1;
154 | }
155 | else if (strncmp(header, "P6", 2) == 0)
156 | {
157 | *channels = 3;
158 | }
159 | else
160 | {
161 | std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
162 | *channels = 0;
163 | return false;
164 | }
165 |
166 | // parse header, read maxval, width and height
167 | unsigned int width = 0;
168 | unsigned int height = 0;
169 | unsigned int maxval = 0;
170 | unsigned int i = 0;
171 |
172 | while (i < 3)
173 | {
174 | if (fgets(header, PGMHeaderSize, fp) == NULL)
175 | {
176 | std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
177 | return false;
178 | }
179 |
180 | if (header[0] == '#')
181 | {
182 | continue;
183 | }
184 |
185 | if (i == 0)
186 | {
187 | i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
188 | }
189 | else if (i == 1)
190 | {
191 | i += SSCANF(header, "%u %u", &height, &maxval);
192 | }
193 | else if (i == 2)
194 | {
195 | i += SSCANF(header, "%u", &maxval);
196 | }
197 | }
198 |
199 | // check if given handle for the data is initialized
200 | if (NULL != *data)
201 | {
202 | if (*w != width || *h != height)
203 | {
204 | std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
205 | }
206 | }
207 | else
208 | {
209 | *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels);
210 | *w = width;
211 | *h = height;
212 | }
213 |
214 | // read and close file
215 | if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0)
216 | {
217 | std::cerr << "__LoadPPM() read data returned error." << std::endl;
218 | }
219 |
220 | fclose(fp);
221 |
222 | return true;
223 | }
224 |
225 | template <class T>
226 | inline bool
227 | sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h)
228 | {
229 | unsigned char *idata = NULL;
230 | unsigned int channels;
231 |
232 | if (true != __loadPPM(file, &idata, w, h, &channels))
233 | {
234 | return false;
235 | }
236 |
237 | unsigned int size = *w **h * channels;
238 |
239 | // initialize mem if necessary
240 | // the correct size is checked / set in loadPGMc()
241 | if (NULL == *data)
242 | {
243 | *data = (T *) malloc(sizeof(T) * size);
244 | }
245 |
246 | // copy and cast data
247 | std::transform(idata, idata + size, *data, ConverterFromUByte<T>());
248 |
249 | free(idata);
250 |
251 | return true;
252 | }
253 |
254 | template <class T>
255 | inline bool
256 | sdkLoadPPM4(const char *file, T **data,
257 | unsigned int *w,unsigned int *h)
258 | {
259 | unsigned char *idata = 0;
260 | unsigned int channels;
261 |
262 | if (__loadPPM(file, &idata, w, h, &channels))
263 | {
264 | // pad 4th component
265 | int size = *w **h;
266 | // keep the original pointer
267 | unsigned char *idata_orig = idata;
268 | *data = (T *) malloc(sizeof(T) * size * 4);
269 | unsigned char *ptr = *data;
270 |
271 | for (int i=0; i<size; i++)
272 | {
273 | *ptr++ = *idata++;
274 | *ptr++ = *idata++;
275 | *ptr++ = *idata++;
276 | *ptr++ = 0;
277 | }
278 |
279 | free(idata_orig);
280 | return true;
281 | }
282 | else
283 | {
284 | free(idata);
285 | return false;
286 | }
287 | }
288 |
289 | inline bool
290 | __savePPM(const char *file, unsigned char *data,
291 | unsigned int w, unsigned int h, unsigned int channels)
292 | {
293 | assert(NULL != data);
294 | assert(w > 0);
295 | assert(h > 0);
296 |
297 | std::fstream fh(file, std::fstream::out | std::fstream::binary);
298 |
299 | if (fh.bad())
300 | {
301 | std::cerr << "__savePPM() : Opening file failed." << std::endl;
302 | return false;
303 | }
304 |
305 | if (channels == 1)
306 | {
307 | fh << "P5\n";
308 | }
309 | else if (channels == 3)
310 | {
311 | fh << "P6\n";
312 | }
313 | else
314 | {
315 | std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
316 | return false;
317 | }
318 |
319 | fh << w << "\n" << h << "\n" << 0xff << std::endl;
320 |
321 | for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
322 | {
323 | fh << data[i];
324 | }
325 |
326 | fh.flush();
327 |
328 | if (fh.bad())
329 | {
330 | std::cerr << "__savePPM() : Writing data failed." << std::endl;
331 | return false;
332 | }
333 |
334 | fh.close();
335 |
336 | return true;
337 | }
338 |
339 | template <class T>
340 | inline bool
341 | sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h)
342 | {
343 | unsigned int size = w * h;
344 | unsigned char *idata =
345 | (unsigned char *) malloc(sizeof(unsigned char) * size);
346 |
347 | std::transform(data, data + size, idata, ConverterToUByte<T>());
348 |
349 | // write file
350 | bool result = __savePPM(file, idata, w, h, 1);
351 |
352 | // cleanup
353 | free(idata);
354 |
355 | return result;
356 | }
357 |
358 | inline bool
359 | sdkSavePPM4ub(const char *file, unsigned char *data,
360 | unsigned int w, unsigned int h)
361 | {
362 | // strip 4th component
363 | int size = w * h;
364 | unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3);
365 | unsigned char *ptr = ndata;
366 |
367 | for (int i=0; i<size; i++)
368 | {
369 | *ptr++ = *data++;
370 | *ptr++ = *data++;
371 | *ptr++ = *data++;
372 | data++;
373 | }
374 |
375 | bool result = __savePPM(file, ndata, w, h, 3);
376 | free(ndata);
377 | return result;
378 | }
379 |
380 | //////////////////////////////////////////////////////////////////////////////
381 | //! Read file \filename and return the data
382 | //! @return bool if reading the file succeeded, otherwise false
383 | //! @param filename name of the source file
384 | //! @param data uninitialized pointer, returned initialized and pointing to
385 | //! the data read
386 | //! @param len number of data elements in data, -1 on error
387 | //////////////////////////////////////////////////////////////////////////////
388 |
389 | template <class T>
390 | inline bool
391 | sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
392 | {
393 | // check input arguments
394 | assert(NULL != filename);
395 | assert(NULL != len);
396 |
397 | // intermediate storage for the data read
398 | std::vector<T> data_read;
399 |
400 | // open file for reading
401 | FILE *fh = NULL;
402 |
403 | // check if filestream is valid
404 | if (FOPEN_FAIL(FOPEN(fh, filename, "r")))
405 | {
406 | printf("Unable to open input file: %s\n", filename);
407 | return false;
408 | }
409 |
410 | // read all data elements
411 | T token;
412 |
413 | while (!feof(fh))
414 | {
415 | fscanf(fh, "%f", &token);
416 | data_read.push_back(token);
417 | }
418 |
419 | // the last element is read twice
420 | data_read.pop_back();
421 | fclose(fh);
422 |
423 | // check if the given handle is already initialized
424 | if (NULL != *data)
425 | {
426 | if (*len != data_read.size())
427 | {
428 | std::cerr << "sdkReadFile() : Initialized memory given but "
429 | << "size mismatch with signal read "
430 | << "(data read / data init = " << (unsigned int)data_read.size()
431 | << " / " << *len << ")" << std::endl;
432 |
433 | return false;
434 | }
435 | }
436 | else
437 | {
438 | // allocate storage for the data read
439 | *data = (T *) malloc(sizeof(T) * data_read.size());
440 | // store signal size
441 | *len = static_cast<unsigned int>(data_read.size());
442 | }
443 |
444 | // copy data
445 | memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
446 |
447 | return true;
448 | }
449 |
450 | //////////////////////////////////////////////////////////////////////////////
451 | //! Read file \filename and return the data
452 | //! @return bool if reading the file succeeded, otherwise false
453 | //! @param filename name of the source file
454 | //! @param data uninitialized pointer, returned initialized and pointing to
455 | //! the data read
456 | //! @param len number of data elements in data, -1 on error
457 | //////////////////////////////////////////////////////////////////////////////
458 | template <class T>
459 | inline bool
460 | sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose)
461 | {
462 | // check input arguments
463 | assert(NULL != filename);
464 | assert(NULL != len);
465 |
466 | // open file for reading
467 | FILE *fh = fopen(filename, "rb");
468 |
469 | if (fh == NULL && verbose)
470 | {
471 | std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
472 | return false;
473 | }
474 |
475 | // check if the given handle is already initialized
476 | // allocate storage for the data read
477 | data[block_num] = (T *) malloc(block_size);
478 |
479 | // read all data elements
480 | fseek(fh, block_num * block_size, SEEK_SET);
481 | *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh);
482 |
483 | fclose(fh);
484 |
485 | return true;
486 | }
487 |
488 | //////////////////////////////////////////////////////////////////////////////
489 | //! Write a data file \filename
490 | //! @return true if writing the file succeeded, otherwise false
491 | //! @param filename name of the source file
492 | //! @param data data to write
493 | //! @param len number of data elements in data, -1 on error
494 | //! @param epsilon epsilon for comparison
495 | //////////////////////////////////////////////////////////////////////////////
496 | template <class T, class S>
497 | inline bool
498 | sdkWriteFile(const char *filename, const T *data, unsigned int len,
499 | const S epsilon, bool verbose, bool append = false)
500 | {
501 | assert(NULL != filename);
502 | assert(NULL != data);
503 |
504 | // open file for writing
505 | // if (append) {
506 | std::fstream fh(filename, std::fstream::out | std::fstream::ate);
507 |
508 | if (verbose)
509 | {
510 | std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl;
511 | }
512 |
513 | /* } else {
514 | std::fstream fh(filename, std::fstream::out);
515 | if (verbose) {
516 | std::cerr << "sdkWriteFile() : Open file " << filename << " for write." << std::endl;
517 | }
518 | }
519 | */
520 |
521 | // check if filestream is valid
522 | if (! fh.good())
523 | {
524 | if (verbose)
525 | {
526 | std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
527 | }
528 |
529 | return false;
530 | }
531 |
532 | // first write epsilon
533 | fh << "# " << epsilon << "\n";
534 |
535 | // write data
536 | for (unsigned int i = 0; (i < len) && (fh.good()); ++i)
537 | {
538 | fh << data[i] << ' ';
539 | }
540 |
541 | // Check if writing succeeded
542 | if (! fh.good())
543 | {
544 | if (verbose)
545 | {
546 | std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
547 | }
548 |
549 | return false;
550 | }
551 |
552 | // file ends with nl
553 | fh << std::endl;
554 |
555 | return true;
556 | }
557 |
558 | //////////////////////////////////////////////////////////////////////////////
559 | //! Compare two arrays of arbitrary type
560 | //! @return true if \a reference and \a data are identical, otherwise false
561 | //! @param reference handle to the reference data / gold image
562 | //! @param data handle to the computed data
563 | //! @param len number of elements in reference and data
564 | //! @param epsilon epsilon to use for the comparison
565 | //////////////////////////////////////////////////////////////////////////////
566 | template <class T, class S>
567 | inline bool
568 | compareData(const T *reference, const T *data, const unsigned int len,
569 | const S epsilon, const float threshold)
570 | {
571 | assert(epsilon >= 0);
572 |
573 | bool result = true;
574 | unsigned int error_count = 0;
575 |
576 | for (unsigned int i = 0; i < len; ++i)
577 | {
578 | float diff = (float)reference[i] - (float)data[i];
579 | bool comp = (diff <= epsilon) && (diff >= -epsilon);
580 | result &= comp;
581 |
582 | error_count += !comp;
583 |
584 | #if 0
585 |
586 | if (! comp)
587 | {
588 | std::cerr << "ERROR, i = " << i << ",\t "
589 | << reference[i] << " / "
590 | << data[i]
591 | << " (reference / data)\n";
592 | }
593 |
594 | #endif
595 | }
596 |
597 | if (threshold == 0.0f)
598 | {
599 | return (result) ? true : false;
600 | }
601 | else
602 | {
603 | if (error_count)
604 | {
605 | printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
606 | }
607 |
608 | return (len*threshold > error_count) ? true : false;
609 | }
610 | }
611 |
612 | #ifndef __MIN_EPSILON_ERROR
613 | #define __MIN_EPSILON_ERROR 1e-3f
614 | #endif
615 |
616 | //////////////////////////////////////////////////////////////////////////////
617 | //! Compare two arrays of arbitrary type
618 | //! @return true if \a reference and \a data are identical, otherwise false
619 | //! @param reference handle to the reference data / gold image
620 | //! @param data handle to the computed data
621 | //! @param len number of elements in reference and data
622 | //! @param epsilon epsilon to use for the comparison
623 | //! @param threshold threshold % of (# of bytes) for pass/fail
624 | //////////////////////////////////////////////////////////////////////////////
625 | template <class T, class S>
626 | inline bool
627 | compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len,
628 | const S epsilon, const float threshold)
629 | {
630 | assert(epsilon >= 0);
631 |
632 | // If we set epsilon to be 0, let's set a minimum threshold
633 | float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
634 | int error_count = 0;
635 | bool result = true;
636 |
637 | for (unsigned int i = 0; i < len; ++i)
638 | {
639 | float diff = fabs((float)reference[i] - (float)data[i]);
640 | bool comp = (diff < max_error);
641 | result &= comp;
642 |
643 | if (! comp)
644 | {
645 | error_count++;
646 | #if 0
647 |
648 | if (error_count < 50)
649 | {
650 | printf("\n ERROR(epsilon=%4.3f), i=%d, (ref)0x%02x / (data)0x%02x / (diff)%d\n",
651 | max_error, i,
652 | *(unsigned int *)&reference[i],
653 | *(unsigned int *)&data[i],
654 | (unsigned int)diff);
655 | }
656 |
657 | #endif
658 | }
659 | }
660 |
661 | if (threshold == 0.0f)
662 | {
663 | if (error_count)
664 | {
665 | printf("total # of errors = %d\n", error_count);
666 | }
667 |
668 | return (error_count == 0) ? true : false;
669 | }
670 | else
671 | {
672 | if (error_count)
673 | {
674 | printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
675 | }
676 |
677 | return ((len*threshold > error_count) ? true : false);
678 | }
679 | }
680 |
681 | inline
682 | void sdkDumpBin(void *data, unsigned int bytes, const char *filename)
683 | {
684 | printf("sdkDumpBin: <%s>\n", filename);
685 | FILE *fp;
686 | FOPEN(fp, filename, "wb");
687 | fwrite(data, bytes, 1, fp);
688 | fflush(fp);
689 | fclose(fp);
690 | }
691 |
692 | inline
693 | bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
694 | {
695 | unsigned int *src_buffer, *ref_buffer;
696 | FILE *src_fp = NULL, *ref_fp = NULL;
697 |
698 | unsigned long error_count = 0;
699 | size_t fsize = 0;
700 |
701 | if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
702 | {
703 | printf("compareBin2Bin unable to open src_file: %s\n", src_file);
704 | error_count++;
705 | }
706 |
707 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
708 |
709 | if (ref_file_path == NULL)
710 | {
711 | printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path);
712 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
713 | printf("Aborting comparison!\n");
714 | printf(" FAILED\n");
715 | error_count++;
716 |
717 | if (src_fp)
718 | {
719 | fclose(src_fp);
720 | }
721 |
722 | if (ref_fp)
723 | {
724 | fclose(ref_fp);
725 | }
726 | }
727 | else
728 | {
729 | if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
730 | {
731 | printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path);
732 | error_count++;
733 | }
734 |
735 | if (src_fp && ref_fp)
736 | {
737 | src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
738 | ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
739 |
740 | fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
741 | fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
742 |
743 | printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
744 | printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
745 | printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
746 |
747 | if (!compareData(ref_buffer, src_buffer, nelements, epsilon, threshold))
748 | {
749 | error_count++;
750 | }
751 |
752 | fclose(src_fp);
753 | fclose(ref_fp);
754 |
755 | free(src_buffer);
756 | free(ref_buffer);
757 | }
758 | else
759 | {
760 | if (src_fp)
761 | {
762 | fclose(src_fp);
763 | }
764 |
765 | if (ref_fp)
766 | {
767 | fclose(ref_fp);
768 | }
769 | }
770 | }
771 |
772 | if (error_count == 0)
773 | {
774 | printf(" OK\n");
775 | }
776 | else
777 | {
778 | printf(" FAILURE: %d errors...\n", (unsigned int)error_count);
779 | }
780 |
781 | return (error_count == 0); // returns true if all pixels pass
782 | }
783 |
784 | inline
785 | bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
786 | {
787 | float *src_buffer, *ref_buffer;
788 | FILE *src_fp = NULL, *ref_fp = NULL;
789 | size_t fsize = 0;
790 |
791 | unsigned long error_count = 0;
792 |
793 | if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
794 | {
795 | printf("compareBin2Bin unable to open src_file: %s\n", src_file);
796 | error_count = 1;
797 | }
798 |
799 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
800 |
801 | if (ref_file_path == NULL)
802 | {
803 | printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path);
804 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", exec_path);
805 | printf("Aborting comparison!\n");
806 | printf(" FAILED\n");
807 | error_count++;
808 |
809 | if (src_fp)
810 | {
811 | fclose(src_fp);
812 | }
813 |
814 | if (ref_fp)
815 | {
816 | fclose(ref_fp);
817 | }
818 | }
819 | else
820 | {
821 | if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
822 | {
823 | printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path);
824 | error_count = 1;
825 | }
826 |
827 | if (src_fp && ref_fp)
828 | {
829 | src_buffer = (float *)malloc(nelements*sizeof(float));
830 | ref_buffer = (float *)malloc(nelements*sizeof(float));
831 |
832 | fsize = fread(src_buffer, nelements, sizeof(float), src_fp);
833 | fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp);
834 |
835 | printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
836 | printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
837 | printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
838 |
839 | if (!compareDataAsFloatThreshold(ref_buffer, src_buffer, nelements, epsilon, threshold))
840 | {
841 | error_count++;
842 | }
843 |
844 | fclose(src_fp);
845 | fclose(ref_fp);
846 |
847 | free(src_buffer);
848 | free(ref_buffer);
849 | }
850 | else
851 | {
852 | if (src_fp)
853 | {
854 | fclose(src_fp);
855 | }
856 |
857 | if (ref_fp)
858 | {
859 | fclose(ref_fp);
860 | }
861 | }
862 | }
863 |
864 | if (error_count == 0)
865 | {
866 | printf(" OK\n");
867 | }
868 | else
869 | {
870 | printf(" FAILURE: %d errors...\n", (unsigned int)error_count);
871 | }
872 |
873 | return (error_count == 0); // returns true if all pixels pass
874 | }
875 |
876 | inline bool
877 | sdkCompareL2fe(const float *reference, const float *data,
878 | const unsigned int len, const float epsilon)
879 | {
880 | assert(epsilon >= 0);
881 |
882 | float error = 0;
883 | float ref = 0;
884 |
885 | for (unsigned int i = 0; i < len; ++i)
886 | {
887 |
888 | float diff = reference[i] - data[i];
889 | error += diff * diff;
890 | ref += reference[i] * reference[i];
891 | }
892 |
893 | float normRef = sqrtf(ref);
894 |
895 | if (fabs(ref) < 1e-7)
896 | {
897 | #ifdef _DEBUG
898 | std::cerr << "ERROR, reference l2-norm is 0\n";
899 | #endif
900 | return false;
901 | }
902 |
903 | float normError = sqrtf(error);
904 | error = normError / normRef;
905 | bool result = error < epsilon;
906 | #ifdef _DEBUG
907 |
908 | if (! result)
909 | {
910 | std::cerr << "ERROR, l2-norm error "
911 | << error << " is greater than epsilon " << epsilon << "\n";
912 | }
913 |
914 | #endif
915 |
916 | return result;
917 | }
918 |
919 | inline bool
920 | sdkLoadPPMub(const char *file, unsigned char **data,
921 | unsigned int *w,unsigned int *h)
922 | {
923 | unsigned int channels;
924 | return __loadPPM(file, data, w, h, &channels);
925 | }
926 |
927 | inline bool
928 | sdkLoadPPM4ub(const char *file, unsigned char **data,
929 | unsigned int *w, unsigned int *h)
930 | {
931 | unsigned char *idata = 0;
932 | unsigned int channels;
933 |
934 | if (__loadPPM(file, &idata, w, h, &channels))
935 | {
936 | // pad 4th component
937 | int size = *w **h;
938 | // keep the original pointer
939 | unsigned char *idata_orig = idata;
940 | *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4);
941 | unsigned char *ptr = *data;
942 |
943 | for (int i=0; i<size; i++)
944 | {
945 | *ptr++ = *idata++;
946 | *ptr++ = *idata++;
947 | *ptr++ = *idata++;
948 | *ptr++ = 0;
949 | }
950 |
951 | free(idata_orig);
952 | return true;
953 | }
954 | else
955 | {
956 | free(idata);
957 | return false;
958 | }
959 | }
960 |
961 |
962 | inline bool
963 | sdkComparePPM(const char *src_file, const char *ref_file,
964 | const float epsilon, const float threshold, bool verboseErrors)
965 | {
966 | unsigned char *src_data, *ref_data;
967 | unsigned long error_count = 0;
968 | unsigned int ref_width, ref_height;
969 | unsigned int src_width, src_height;
970 |
971 | if (src_file == NULL || ref_file == NULL)
972 | {
973 | if (verboseErrors)
974 | {
975 | std::cerr << "PPMvsPPM: src_file or ref_file is NULL. Aborting comparison\n";
976 | }
977 |
978 | return false;
979 | }
980 |
981 | if (verboseErrors)
982 | {
983 | std::cerr << "> Compare (a)rendered: <" << src_file << ">\n";
984 | std::cerr << "> (b)reference: <" << ref_file << ">\n";
985 | }
986 |
987 |
988 | if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true)
989 | {
990 | if (verboseErrors)
991 | {
992 | std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n";
993 | }
994 |
995 | return false;
996 | }
997 |
998 | if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true)
999 | {
1000 | std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n";
1001 | return false;
1002 | }
1003 |
1004 | if (src_height != ref_height || src_width != ref_width)
1005 | {
1006 | if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width <<
1007 | "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
1008 | }
1009 |
1010 | if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width <<
1011 | "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
1012 |
1013 | if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false)
1014 | {
1015 | error_count=1;
1016 | }
1017 |
1018 | if (error_count == 0)
1019 | {
1020 | if (verboseErrors)
1021 | {
1022 | std::cerr << " OK\n\n";
1023 | }
1024 | }
1025 | else
1026 | {
1027 | if (verboseErrors)
1028 | {
1029 | std::cerr << " FAILURE! "<<error_count<<" errors...\n\n";
1030 | }
1031 |
1032 | return false;
1033 | }
1034 |
1035 | return true;
1036 | }
1037 |
1038 | inline bool
1039 | sdkComparePGM(const char *src_file, const char *ref_file,
1040 | const float epsilon, const float threshold, bool verboseErrors)
1041 | {
1042 | unsigned char *src_data = 0, *ref_data = 0;
1043 | unsigned long error_count = 0;
1044 | unsigned int ref_width, ref_height;
1045 | unsigned int src_width, src_height;
1046 |
1047 | if (src_file == NULL || ref_file == NULL)
1048 | {
1049 | if (verboseErrors)
1050 | {
1051 | std::cerr << "PGMvsPGM: src_file or ref_file is NULL. Aborting comparison\n";
1052 | }
1053 | return false;
1054 | }
1055 | if (verboseErrors)
1056 | {
1057 | std::cerr << "> Compare (a)rendered: <" << src_file << ">\n";
1058 | std::cerr << "> (b)reference: <" << ref_file << ">\n";
1059 | }
1060 |
1061 |
1062 | if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true)
1063 | {
1064 | if (verboseErrors)
1065 | {
1066 | std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n";
1067 | }
1068 |
1069 | return false;
1070 | }
1071 |
1072 | if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true)
1073 | {
1074 | std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n";
1075 | return false;
1076 | }
1077 |
1078 | if (src_height != ref_height || src_width != ref_width)
1079 | {
1080 | if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width <<
1081 | "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
1082 | }
1083 |
1084 | if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width <<
1085 | "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
1086 |
1087 | if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false)
1088 | {
1089 | error_count=1;
1090 | }
1091 |
1092 | if (error_count == 0)
1093 | {
1094 | if (verboseErrors)
1095 | {
1096 | std::cerr << " OK\n\n";
1097 | }
1098 | }
1099 | else
1100 | {
1101 | if (verboseErrors)
1102 | {
1103 | std::cerr << " FAILURE! "<<error_count<<" errors...\n\n";
1104 | }
1105 |
1106 | return false;
1107 | }
1108 |
1109 | return true;
1110 | }
1111 |
1112 | #endif // HELPER_IMAGE_H
1113 |
--------------------------------------------------------------------------------
/common/helper_string.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // These are helper functions for the SDK samples (string parsing, timers)
13 | #ifndef STRING_HELPER_H
14 | #define STRING_HELPER_H
15 |
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <fstream>
19 | #include <string>
20 |
21 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
22 | #ifndef _CRT_SECURE_NO_DEPRECATE
23 | #define _CRT_SECURE_NO_DEPRECATE
24 | #endif
25 | #ifndef STRCASECMP
26 | #define STRCASECMP _stricmp
27 | #endif
28 | #ifndef STRNCASECMP
29 | #define STRNCASECMP _strnicmp
30 | #endif
31 | #ifndef STRCPY
32 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
33 | #endif
34 |
35 | #ifndef FOPEN
36 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
37 | #endif
38 | #ifndef FOPEN_FAIL
39 | #define FOPEN_FAIL(result) (result != 0)
40 | #endif
41 | #ifndef SSCANF
42 | #define SSCANF sscanf_s
43 | #endif
44 | #ifndef SPRINTF
45 | #define SPRINTF sprintf_s
46 | #endif
47 | #else // Linux Includes
48 | #include <string.h>
49 | #include <strings.h>
50 |
51 | #ifndef STRCASECMP
52 | #define STRCASECMP strcasecmp
53 | #endif
54 | #ifndef STRNCASECMP
55 | #define STRNCASECMP strncasecmp
56 | #endif
57 | #ifndef STRCPY
58 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
59 | #endif
60 |
61 | #ifndef FOPEN
62 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
63 | #endif
64 | #ifndef FOPEN_FAIL
65 | #define FOPEN_FAIL(result) (result == NULL)
66 | #endif
67 | #ifndef SSCANF
68 | #define SSCANF sscanf
69 | #endif
70 | #ifndef SPRINTF
71 | #define SPRINTF sprintf
72 | #endif
73 | #endif
74 |
75 | #ifndef EXIT_WAIVED
76 | #define EXIT_WAIVED 2
77 | #endif
78 |
79 | // CUDA Utility Helper Functions
80 | inline int stringRemoveDelimiter(char delimiter, const char *string)
81 | {
82 | int string_start = 0;
83 |
84 | while (string[string_start] == delimiter)
85 | {
86 | string_start++;
87 | }
88 |
89 | if (string_start >= (int)strlen(string)-1)
90 | {
91 | return 0;
92 | }
93 |
94 | return string_start;
95 | }
96 |
97 | inline int getFileExtension(char *filename, char **extension)
98 | {
99 | int string_length = (int)strlen(filename);
100 |
101 | while (filename[string_length--] != '.')
102 | {
103 | if (string_length == 0)
104 | break;
105 | }
106 |
107 | if (string_length > 0) string_length += 2;
108 |
109 | if (string_length == 0)
110 | *extension = NULL;
111 | else
112 | *extension = &filename[string_length];
113 |
114 | return string_length;
115 | }
116 |
117 |
118 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
119 | {
120 | bool bFound = false;
121 |
122 | if (argc >= 1)
123 | {
124 | for (int i=1; i < argc; i++)
125 | {
126 | int string_start = stringRemoveDelimiter('-', argv[i]);
127 | const char *string_argv = &argv[i][string_start];
128 |
129 | const char *equal_pos = strchr(string_argv, '=');
130 | int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
131 |
132 | int length = (int)strlen(string_ref);
133 |
134 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
135 | {
136 | bFound = true;
137 | continue;
138 | }
139 | }
140 | }
141 |
142 | return bFound;
143 | }
144 |
145 | // This function wraps the CUDA Driver API into a template function
146 | template <class T>
147 | inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
148 | {
149 | bool bFound = false;
150 |
151 | if (argc >= 1)
152 | {
153 | for (int i=1; i < argc; i++)
154 | {
155 | int string_start = stringRemoveDelimiter('-', argv[i]);
156 | const char *string_argv = &argv[i][string_start];
157 | int length = (int)strlen(string_ref);
158 |
159 | if (!STRNCASECMP(string_argv, string_ref, length))
160 | {
161 | if (length+1 <= (int)strlen(string_argv))
162 | {
163 | int auto_inc = (string_argv[length] == '=') ? 1 : 0;
164 | *value = (T)atoi(&string_argv[length + auto_inc]);
165 | }
166 |
167 | bFound = true;
168 | i=argc;
169 | }
170 | }
171 | }
172 |
173 | return bFound;
174 | }
175 |
176 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
177 | {
178 | bool bFound = false;
179 | int value = -1;
180 |
181 | if (argc >= 1)
182 | {
183 | for (int i=1; i < argc; i++)
184 | {
185 | int string_start = stringRemoveDelimiter('-', argv[i]);
186 | const char *string_argv = &argv[i][string_start];
187 | int length = (int)strlen(string_ref);
188 |
189 | if (!STRNCASECMP(string_argv, string_ref, length))
190 | {
191 | if (length+1 <= (int)strlen(string_argv))
192 | {
193 | int auto_inc = (string_argv[length] == '=') ? 1 : 0;
194 | value = atoi(&string_argv[length + auto_inc]);
195 | }
196 | else
197 | {
198 | value = 0;
199 | }
200 |
201 | bFound = true;
202 | continue;
203 | }
204 | }
205 | }
206 |
207 | if (bFound)
208 | {
209 | return value;
210 | }
211 | else
212 | {
213 | return 0;
214 | }
215 | }
216 |
217 | inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
218 | {
219 | bool bFound = false;
220 | float value = -1;
221 |
222 | if (argc >= 1)
223 | {
224 | for (int i=1; i < argc; i++)
225 | {
226 | int string_start = stringRemoveDelimiter('-', argv[i]);
227 | const char *string_argv = &argv[i][string_start];
228 | int length = (int)strlen(string_ref);
229 |
230 | if (!STRNCASECMP(string_argv, string_ref, length))
231 | {
232 | if (length+1 <= (int)strlen(string_argv))
233 | {
234 | int auto_inc = (string_argv[length] == '=') ? 1 : 0;
235 | value = (float)atof(&string_argv[length + auto_inc]);
236 | }
237 | else
238 | {
239 | value = 0.f;
240 | }
241 |
242 | bFound = true;
243 | continue;
244 | }
245 | }
246 | }
247 |
248 | if (bFound)
249 | {
250 | return value;
251 | }
252 | else
253 | {
254 | return 0;
255 | }
256 | }
257 |
258 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
259 | const char *string_ref, char **string_retval)
260 | {
261 | bool bFound = false;
262 |
263 | if (argc >= 1)
264 | {
265 | for (int i=1; i < argc; i++)
266 | {
267 | int string_start = stringRemoveDelimiter('-', argv[i]);
268 | char *string_argv = (char *)&argv[i][string_start];
269 | int length = (int)strlen(string_ref);
270 |
271 | if (!STRNCASECMP(string_argv, string_ref, length))
272 | {
273 | *string_retval = &string_argv[length+1];
274 | bFound = true;
275 | continue;
276 | }
277 | }
278 | }
279 |
280 | if (!bFound)
281 | {
282 | *string_retval = NULL;
283 | }
284 |
285 | return bFound;
286 | }
287 |
288 | //////////////////////////////////////////////////////////////////////////////
289 | //! Find the path for a file assuming that
290 | //! files are found in the searchPath.
291 | //!
292 | //! @return the path if succeeded, otherwise 0
293 | //! @param filename name of the file
294 | //! @param executable_path optional absolute path of the executable
295 | //////////////////////////////////////////////////////////////////////////////
296 | inline char *sdkFindFilePath(const char *filename, const char *executable_path)
297 | {
298 | // <executable_name> defines a variable that is replaced with the name of the executable
299 |
300 | // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
301 | // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
302 | const char *searchPath[] =
303 | {
304 | "./", // same dir
305 | "./common/", // "/common/" subdir
306 | "./common/data/", // "/common/data/" subdir
307 | "./data/", // "/data/" subdir
308 | "./src/", // "/src/" subdir
309 | "./src//data/", // "/src//data/" subdir
310 | "./inc/", // "/inc/" subdir
311 | "./0_Simple/", // "/0_Simple/" subdir
312 | "./1_Utilities/", // "/1_Utilities/" subdir
313 | "./2_Graphics/", // "/2_Graphics/" subdir
314 | "./3_Imaging/", // "/3_Imaging/" subdir
315 | "./4_Financial/", // "/4_Financial/" subdir
316 | "./5_Simulations/", // "/5_Simulations/" subdir
317 | "./6_Advanced/", // "/6_Advanced/" subdir
318 | "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir
319 | "./samples/", // "/samples/" subdir
320 |
321 | "../", // up 1 in tree
322 | "../common/", // up 1 in tree, "/common/" subdir
323 | "../common/data/", // up 1 in tree, "/common/data/" subdir
324 | "../data/", // up 1 in tree, "/data/" subdir
325 | "../src/", // up 1 in tree, "/src/" subdir
326 | "../inc/", // up 1 in tree, "/inc/" subdir
327 |
328 | "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir
329 | "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir
330 | "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir
331 | "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir
332 | "../4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir
333 | "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir
334 | "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir
335 | "../7_CUDALibraries//data/",// up 1 in tree, "/7_CUDALibraries//" subdir
336 | "../samples//data/", // up 1 in tree, "/samples//" subdir
337 | "../../", // up 2 in tree
338 | "../../common/", // up 2 in tree, "/common/" subdir
339 | "../../common/data/", // up 2 in tree, "/common/data/" subdir
340 | "../../data/", // up 2 in tree, "/data/" subdir
341 | "../../src/", // up 2 in tree, "/src/" subdir
342 | "../../inc/", // up 2 in tree, "/inc/" subdir
343 | "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir
344 | "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir
345 | "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir
346 | "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir
347 | "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir
348 | "../../4_Financial//data/", // up 2 in tree, "/4_Financial//" subdir
349 | "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir
350 | "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir
351 | "../../7_CUDALibraries//data/", // up 2 in tree, "/7_CUDALibraries//" subdir
352 | "../../samples//data/", // up 2 in tree, "/samples//" subdir
353 | "../../../", // up 3 in tree
354 | "../../../src//", // up 3 in tree, "/src//" subdir
355 | "../../../src//data/", // up 3 in tree, "/src//data/" subdir
356 | "../../../src//src/", // up 3 in tree, "/src//src/" subdir
357 | "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir
358 | "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir
359 | "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir
360 | "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir
361 | "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir
362 | "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir
363 | "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir
364 | "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir
365 | "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir
366 | "../../../4_Financial//data/", // up 3 in tree, "/4_Financial//" subdir
367 | "../../../5_Simulations//data/", // up 3 in tree, "/5_Simulations//" subdir
368 | "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir
369 | "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" subdir
370 | "../../../samples//data/", // up 3 in tree, "/samples//" subdir
371 | "../../../common/", // up 3 in tree, "../../../common/" subdir
372 | "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
373 | "../../../data/", // up 3 in tree, "../../../data/" subdir
374 | "../../../../", // up 4 in tree
375 | "../../../../src//", // up 4 in tree, "/src//" subdir
376 | "../../../../src//data/", // up 4 in tree, "/src//data/" subdir
377 | "../../../../src//src/", // up 4 in tree, "/src//src/" subdir
378 | "../../../../src//inc/", // up 4 in tree, "/src//inc/" subdir
379 | "../../../../sandbox//", // up 4 in tree, "/sandbox//" subdir
380 | "../../../../sandbox//data/", // up 4 in tree, "/sandbox//data/" subdir
381 | "../../../../sandbox//src/", // up 4 in tree, "/sandbox//src/" subdir
382 | "../../../../sandbox//inc/", // up 4 in tree, "/sandbox//inc/" subdir
383 | "../../../../0_Simple//data/", // up 4 in tree, "/0_Simple//" subdir
384 | "../../../../1_Utilities//data/", // up 4 in tree, "/1_Utilities//" subdir
385 | "../../../../2_Graphics//data/", // up 4 in tree, "/2_Graphics//" subdir
386 | "../../../../3_Imaging//data/", // up 4 in tree, "/3_Imaging//" subdir
387 | "../../../../4_Financial//data/", // up 4 in tree, "/4_Financial//" subdir
388 | "../../../../5_Simulations//data/",// up 4 in tree, "/5_Simulations//" subdir
389 | "../../../../6_Advanced//data/", // up 4 in tree, "/6_Advanced//" subdir
390 | "../../../../7_CUDALibraries//data/", // up 4 in tree, "/7_CUDALibraries//" subdir
391 | "../../../../samples//data/", // up 4 in tree, "/samples//" subdir
392 | "../../../../common/", // up 4 in tree, "../../../common/" subdir
393 | "../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir
394 | "../../../../data/", // up 4 in tree, "../../../data/" subdir
395 | "../../../../../", // up 5 in tree
396 | "../../../../../src//", // up 5 in tree, "/src//" subdir
397 | "../../../../../src//data/", // up 5 in tree, "/src//data/" subdir
398 | "../../../../../src//src/", // up 5 in tree, "/src//src/" subdir
399 | "../../../../../src//inc/", // up 5 in tree, "/src//inc/" subdir
400 | "../../../../../sandbox//", // up 5 in tree, "/sandbox//" subdir
401 | "../../../../../sandbox//data/", // up 5 in tree, "/sandbox//data/" subdir
402 | "../../../../../sandbox//src/", // up 5 in tree, "/sandbox//src/" subdir
403 | "../../../../../sandbox//inc/", // up 5 in tree, "/sandbox//inc/" subdir
404 | "../../../../../0_Simple//data/", // up 5 in tree, "/0_Simple//" subdir
405 | "../../../../../1_Utilities//data/", // up 5 in tree, "/1_Utilities//" subdir
406 | "../../../../../2_Graphics//data/", // up 5 in tree, "/2_Graphics//" subdir
407 | "../../../../../3_Imaging//data/", // up 5 in tree, "/3_Imaging//" subdir
408 | "../../../../../4_Financial//data/", // up 5 in tree, "/4_Financial//" subdir
409 | "../../../../../5_Simulations//data/",// up 5 in tree, "/5_Simulations//" subdir
410 | "../../../../../6_Advanced//data/", // up 5 in tree, "/6_Advanced//" subdir
411 | "../../../../../7_CUDALibraries//data/", // up 5 in tree, "/7_CUDALibraries//" subdir
412 | "../../../../../samples//data/", // up 5 in tree, "/samples//" subdir
413 | "../../../../../common/", // up 5 in tree, "../../../common/" subdir
414 | "../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir
415 | };
416 |
417 | // Extract the executable name
418 | std::string executable_name;
419 |
420 | if (executable_path != 0)
421 | {
422 | executable_name = std::string(executable_path);
423 |
424 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
425 | // Windows path delimiter
426 | size_t delimiter_pos = executable_name.find_last_of('\\');
427 | executable_name.erase(0, delimiter_pos + 1);
428 |
429 | if (executable_name.rfind(".exe") != std::string::npos)
430 | {
431 | // we strip .exe, only if the .exe is found
432 | executable_name.resize(executable_name.size() - 4);
433 | }
434 |
435 | #else
436 | // Linux & OSX path delimiter
437 | size_t delimiter_pos = executable_name.find_last_of('/');
438 | executable_name.erase(0,delimiter_pos+1);
439 | #endif
440 | }
441 |
442 | // Loop over all search paths and return the first hit
443 | for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
444 | {
445 | std::string path(searchPath[i]);
446 |         size_t executable_name_pos = path.find("<executable_name>");
447 |
448 | // If there is executable_name variable in the searchPath
449 | // replace it with the value
450 | if (executable_name_pos != std::string::npos)
451 | {
452 | if (executable_path != 0)
453 | {
454 |                 path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
455 | }
456 | else
457 | {
458 | // Skip this path entry if no executable argument is given
459 | continue;
460 | }
461 | }
462 |
463 | #ifdef _DEBUG
464 | printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
465 | #endif
466 |
467 | // Test if the file exists
468 | path.append(filename);
469 | FILE *fp;
470 | FOPEN(fp, path.c_str(), "rb");
471 |
472 | if (fp != NULL)
473 | {
474 | fclose(fp);
475 | // File found
476 | // returning an allocated array here for backwards compatibility reasons
477 | char *file_path = (char *) malloc(path.length() + 1);
478 | STRCPY(file_path, path.length() + 1, path.c_str());
479 | return file_path;
480 | }
481 |
482 | if (fp)
483 | {
484 | fclose(fp);
485 | }
486 | }
487 |
488 | // File not found
489 | return 0;
490 | }
491 |
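// Editorial usage sketch (not part of the original header): sdkFindFilePath
// walks the searchPath table above, substitutes <executable_name> where it
// appears, and returns a malloc'd path on the first hit or 0 on failure, so
// the caller owns (and should free) the returned buffer. "data.bin" below is
// only a placeholder file name.
//
//     char *path = sdkFindFilePath("data.bin", argv[0]);
//     if (path != 0)
//     {
//         FILE *fp = fopen(path, "rb");
//         /* ... use the file ... */
//         if (fp) fclose(fp);
//         free(path);
//     }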
492 | #endif
493 |
--------------------------------------------------------------------------------
/common/helper_timer.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // Helper Timing Functions
13 | #ifndef HELPER_TIMER_H
14 | #define HELPER_TIMER_H
15 |
16 | #ifndef EXIT_WAIVED
17 | #define EXIT_WAIVED 2
18 | #endif
19 |
20 | // includes, system
21 | #include
22 |
23 | // includes, project
24 | #include
25 |
26 | // Definition of the StopWatch interface; used when we don't want the CUT functions,
27 | // but rather a self-contained class interface
28 | class StopWatchInterface
29 | {
30 | public:
31 | StopWatchInterface() {};
32 | virtual ~StopWatchInterface() {};
33 |
34 | public:
35 | //! Start time measurement
36 | virtual void start() = 0;
37 |
38 | //! Stop time measurement
39 | virtual void stop() = 0;
40 |
41 | //! Reset time counters to zero
42 | virtual void reset() = 0;
43 |
44 | //! Time in msec. after start. If the stop watch is still running (i.e. there
45 | //! was no call to stop()) then the elapsed time is returned, otherwise the
46 | //! time between the last start() and stop call is returned
47 | virtual float getTime() = 0;
48 |
49 | //! Mean time to date based on the number of times the stopwatch has been
50 | //! _stopped_ (ie finished sessions) and the current total time
51 | virtual float getAverageTime() = 0;
52 | };
53 |
54 |
55 | //////////////////////////////////////////////////////////////////
56 | // Begin Stopwatch timer class definitions for all OS platforms //
57 | //////////////////////////////////////////////////////////////////
58 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
59 | // includes, system
60 | #define WIN32_LEAN_AND_MEAN
61 | #include <windows.h>
62 | #undef min
63 | #undef max
64 |
65 | //! Windows specific implementation of StopWatch
66 | class StopWatchWin : public StopWatchInterface
67 | {
68 | public:
69 | //! Constructor, default
70 | StopWatchWin() :
71 | start_time(), end_time(),
72 | diff_time(0.0f), total_time(0.0f),
73 | running(false), clock_sessions(0), freq(0), freq_set(false)
74 | {
75 | if (! freq_set)
76 | {
77 | // helper variable
78 | LARGE_INTEGER temp;
79 |
80 | // get the tick frequency from the OS
81 | QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
82 |
83 | // convert to type in which it is needed
84 | freq = ((double) temp.QuadPart) / 1000.0;
85 |
86 |             // remember query
87 | freq_set = true;
88 | }
89 | };
90 |
91 | // Destructor
92 | ~StopWatchWin() { };
93 |
94 | public:
95 | //! Start time measurement
96 | inline void start();
97 |
98 | //! Stop time measurement
99 | inline void stop();
100 |
101 | //! Reset time counters to zero
102 | inline void reset();
103 |
104 | //! Time in msec. after start. If the stop watch is still running (i.e. there
105 | //! was no call to stop()) then the elapsed time is returned, otherwise the
106 | //! time between the last start() and stop call is returned
107 | inline float getTime();
108 |
109 | //! Mean time to date based on the number of times the stopwatch has been
110 | //! _stopped_ (ie finished sessions) and the current total time
111 | inline float getAverageTime();
112 |
113 | private:
114 | // member variables
115 |
116 | //! Start of measurement
117 | LARGE_INTEGER start_time;
118 | //! End of measurement
119 | LARGE_INTEGER end_time;
120 |
121 | //! Time difference between the last start and stop
122 | float diff_time;
123 |
124 | //! TOTAL time difference between starts and stops
125 | float total_time;
126 |
127 | //! flag if the stop watch is running
128 | bool running;
129 |
130 | //! Number of times clock has been started
131 | //! and stopped to allow averaging
132 | int clock_sessions;
133 |
134 | //! tick frequency
135 | double freq;
136 |
137 | //! flag if the frequency has been set
138 | bool freq_set;
139 | };
140 |
141 | // functions, inlined
142 |
143 | ////////////////////////////////////////////////////////////////////////////////
144 | //! Start time measurement
145 | ////////////////////////////////////////////////////////////////////////////////
146 | inline void
147 | StopWatchWin::start()
148 | {
149 | QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
150 | running = true;
151 | }
152 |
153 | ////////////////////////////////////////////////////////////////////////////////
154 | //! Stop time measurement and add the elapsed time to the diff_time
155 | //! summation variable. Also increment the number of times this clock has been run.
156 | ////////////////////////////////////////////////////////////////////////////////
157 | inline void
158 | StopWatchWin::stop()
159 | {
160 | QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
161 | diff_time = (float)
162 | (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
163 |
164 | total_time += diff_time;
165 | clock_sessions++;
166 | running = false;
167 | }
168 |
169 | ////////////////////////////////////////////////////////////////////////////////
170 | //! Reset the timer to 0. Does not change the timer running state but does
171 | //! recapture this point in time as the current start time if it is running.
172 | ////////////////////////////////////////////////////////////////////////////////
173 | inline void
174 | StopWatchWin::reset()
175 | {
176 | diff_time = 0;
177 | total_time = 0;
178 | clock_sessions = 0;
179 |
180 | if (running)
181 | {
182 | QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
183 | }
184 | }
185 |
186 |
187 | ////////////////////////////////////////////////////////////////////////////////
188 | //! Time in msec. after start. If the stop watch is still running (i.e. there
189 | //! was no call to stop()) then the elapsed time is returned added to the
190 | //! current diff_time sum, otherwise the current summed time difference alone
191 | //! is returned.
192 | ////////////////////////////////////////////////////////////////////////////////
193 | inline float
194 | StopWatchWin::getTime()
195 | {
196 | // Return the TOTAL time to date
197 | float retval = total_time;
198 |
199 | if (running)
200 | {
201 | LARGE_INTEGER temp;
202 | QueryPerformanceCounter((LARGE_INTEGER *) &temp);
203 | retval += (float)
204 | (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
205 | }
206 |
207 | return retval;
208 | }
209 |
210 | ////////////////////////////////////////////////////////////////////////////////
211 | //! Time in msec. for a single run based on the total number of COMPLETED runs
212 | //! and the total time.
213 | ////////////////////////////////////////////////////////////////////////////////
214 | inline float
215 | StopWatchWin::getAverageTime()
216 | {
217 | return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
218 | }
219 | #else
220 | // Declarations for Stopwatch on Linux and Mac OSX
221 | // includes, system
222 | #include <ctime>
223 | #include <sys/time.h>
224 |
225 | //! Linux/OS X specific implementation of StopWatch
226 | class StopWatchLinux : public StopWatchInterface
227 | {
228 | public:
229 | //! Constructor, default
230 | StopWatchLinux() :
231 | start_time(), diff_time(0.0), total_time(0.0),
232 | running(false), clock_sessions(0)
233 | { };
234 |
235 | // Destructor
236 | virtual ~StopWatchLinux()
237 | { };
238 |
239 | public:
240 | //! Start time measurement
241 | inline void start();
242 |
243 | //! Stop time measurement
244 | inline void stop();
245 |
246 | //! Reset time counters to zero
247 | inline void reset();
248 |
249 | //! Time in msec. after start. If the stop watch is still running (i.e. there
250 | //! was no call to stop()) then the elapsed time is returned, otherwise the
251 | //! time between the last start() and stop call is returned
252 | inline float getTime();
253 |
254 | //! Mean time to date based on the number of times the stopwatch has been
255 | //! _stopped_ (ie finished sessions) and the current total time
256 | inline float getAverageTime();
257 |
258 | private:
259 |
260 | // helper functions
261 |
262 | //! Get difference between start time and current time
263 | inline float getDiffTime();
264 |
265 | private:
266 |
267 | // member variables
268 |
269 | //! Start of measurement
270 | struct timeval start_time;
271 |
272 | //! Time difference between the last start and stop
273 | float diff_time;
274 |
275 | //! TOTAL time difference between starts and stops
276 | float total_time;
277 |
278 | //! flag if the stop watch is running
279 | bool running;
280 |
281 | //! Number of times clock has been started
282 | //! and stopped to allow averaging
283 | int clock_sessions;
284 | };
285 |
286 | // functions, inlined
287 |
288 | ////////////////////////////////////////////////////////////////////////////////
289 | //! Start time measurement
290 | ////////////////////////////////////////////////////////////////////////////////
291 | inline void
292 | StopWatchLinux::start()
293 | {
294 | gettimeofday(&start_time, 0);
295 | running = true;
296 | }
297 |
298 | ////////////////////////////////////////////////////////////////////////////////
299 | //! Stop time measurement and add the elapsed time to the diff_time
300 | //! summation variable. Also increment the number of times this clock has been run.
301 | ////////////////////////////////////////////////////////////////////////////////
302 | inline void
303 | StopWatchLinux::stop()
304 | {
305 | diff_time = getDiffTime();
306 | total_time += diff_time;
307 | running = false;
308 | clock_sessions++;
309 | }
310 |
311 | ////////////////////////////////////////////////////////////////////////////////
312 | //! Reset the timer to 0. Does not change the timer running state but does
313 | //! recapture this point in time as the current start time if it is running.
314 | ////////////////////////////////////////////////////////////////////////////////
315 | inline void
316 | StopWatchLinux::reset()
317 | {
318 | diff_time = 0;
319 | total_time = 0;
320 | clock_sessions = 0;
321 |
322 | if (running)
323 | {
324 | gettimeofday(&start_time, 0);
325 | }
326 | }
327 |
328 | ////////////////////////////////////////////////////////////////////////////////
329 | //! Time in msec. after start. If the stop watch is still running (i.e. there
330 | //! was no call to stop()) then the elapsed time is returned added to the
331 | //! current diff_time sum, otherwise the current summed time difference alone
332 | //! is returned.
333 | ////////////////////////////////////////////////////////////////////////////////
334 | inline float
335 | StopWatchLinux::getTime()
336 | {
337 | // Return the TOTAL time to date
338 | float retval = total_time;
339 |
340 | if (running)
341 | {
342 | retval += getDiffTime();
343 | }
344 |
345 | return retval;
346 | }
347 |
348 | ////////////////////////////////////////////////////////////////////////////////
349 | //! Time in msec. for a single run based on the total number of COMPLETED runs
350 | //! and the total time.
351 | ////////////////////////////////////////////////////////////////////////////////
352 | inline float
353 | StopWatchLinux::getAverageTime()
354 | {
355 | return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
356 | }
357 | ////////////////////////////////////////////////////////////////////////////////
358 |
359 | ////////////////////////////////////////////////////////////////////////////////
360 | inline float
361 | StopWatchLinux::getDiffTime()
362 | {
363 | struct timeval t_time;
364 | gettimeofday(&t_time, 0);
365 |
366 | // time difference in milli-seconds
367 | return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
368 | + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
369 | }
370 | #endif // WIN32
371 |
372 | ////////////////////////////////////////////////////////////////////////////////
373 | //! Timer functionality exported
374 |
375 | ////////////////////////////////////////////////////////////////////////////////
376 | //! Create a new timer
377 | //! @return true if a timer has been created, otherwise false
378 | //! @param timer_interface pointer that receives the new timer, 0 if the creation failed
379 | ////////////////////////////////////////////////////////////////////////////////
380 | inline bool
381 | sdkCreateTimer(StopWatchInterface **timer_interface)
382 | {
383 | //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
384 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
385 | *timer_interface = (StopWatchInterface *)new StopWatchWin();
386 | #else
387 | *timer_interface = (StopWatchInterface *)new StopWatchLinux();
388 | #endif
389 | return (*timer_interface != NULL) ? true : false;
390 | }
391 |
392 |
393 | ////////////////////////////////////////////////////////////////////////////////
394 | //! Delete a timer
395 | //! @return true if a timer has been deleted, otherwise false
396 | //! @param timer_interface pointer to the timer to delete
397 | ////////////////////////////////////////////////////////////////////////////////
398 | inline bool
399 | sdkDeleteTimer(StopWatchInterface **timer_interface)
400 | {
401 | //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
402 | if (*timer_interface)
403 | {
404 | delete *timer_interface;
405 | *timer_interface = NULL;
406 | }
407 |
408 | return true;
409 | }
410 |
411 | ////////////////////////////////////////////////////////////////////////////////
412 | //! Start the timer referenced by \a timer_interface
413 | //! @param timer_interface pointer to the timer to start
414 | ////////////////////////////////////////////////////////////////////////////////
415 | inline bool
416 | sdkStartTimer(StopWatchInterface **timer_interface)
417 | {
418 | //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
419 | if (*timer_interface)
420 | {
421 | (*timer_interface)->start();
422 | }
423 |
424 | return true;
425 | }
426 |
427 | ////////////////////////////////////////////////////////////////////////////////
428 | //! Stop the timer referenced by \a timer_interface. Does not reset.
429 | //! @param timer_interface pointer to the timer to stop
430 | ////////////////////////////////////////////////////////////////////////////////
431 | inline bool
432 | sdkStopTimer(StopWatchInterface **timer_interface)
433 | {
434 | // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
435 | if (*timer_interface)
436 | {
437 | (*timer_interface)->stop();
438 | }
439 |
440 | return true;
441 | }
442 |
443 | ////////////////////////////////////////////////////////////////////////////////
444 | //! Resets the timer's counter.
445 | //! @param timer_interface pointer to the timer to reset.
446 | ////////////////////////////////////////////////////////////////////////////////
447 | inline bool
448 | sdkResetTimer(StopWatchInterface **timer_interface)
449 | {
450 | // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
451 | if (*timer_interface)
452 | {
453 | (*timer_interface)->reset();
454 | }
455 |
456 | return true;
457 | }
458 |
459 | ////////////////////////////////////////////////////////////////////////////////
460 | //! Return the average time for timer execution as the total time
461 | //! for the timer divided by the number of completed (stopped) runs the timer
462 | //! has made.
463 | //! Excludes the current running time if the timer is currently running.
464 | //! @param timer_interface pointer to the timer to query
465 | ////////////////////////////////////////////////////////////////////////////////
466 | inline float
467 | sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
468 | {
469 | // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
470 | if (*timer_interface)
471 | {
472 | return (*timer_interface)->getAverageTime();
473 | }
474 | else
475 | {
476 | return 0.0f;
477 | }
478 | }
479 |
480 | ////////////////////////////////////////////////////////////////////////////////
481 | //! Total execution time for the timer over all runs since the last reset
482 | //! or timer creation.
483 | //! @param timer_interface pointer to the timer to obtain the value of.
484 | ////////////////////////////////////////////////////////////////////////////////
485 | inline float
486 | sdkGetTimerValue(StopWatchInterface **timer_interface)
487 | {
488 | // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
489 | if (*timer_interface)
490 | {
491 | return (*timer_interface)->getTime();
492 | }
493 | else
494 | {
495 | return 0.0f;
496 | }
497 | }
498 |
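// Editorial usage sketch (not part of the original header): a minimal example
// of driving the sdk*Timer wrappers defined above. The values printed are in
// milliseconds, matching getTime()/getAverageTime().
//
//     StopWatchInterface *timer = NULL;
//     sdkCreateTimer(&timer);
//     sdkStartTimer(&timer);
//     /* ... work to be timed ... */
//     sdkStopTimer(&timer);
//     printf("elapsed: %f ms, average: %f ms\n",
//            sdkGetTimerValue(&timer), sdkGetAverageTimerValue(&timer));
//     sdkDeleteTimer(&timer);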
499 | #endif // HELPER_TIMER_H
500 |
--------------------------------------------------------------------------------
/compile.m:
--------------------------------------------------------------------------------
1 | MATLAB_ROOT = '/afs/cs/package/matlab-r2013b/matlab/r2013b';
2 | CUDA_ROOT = '/usr/local/cuda-6.0';
3 |
4 | if ismac
5 | MATLAB_ROOT = '/Applications/MATLAB_R2014a.app';
6 | CUDA_ROOT = '/usr/local/cuda';
7 | end
8 |
9 | cuda_compile('./src', 'cudaFFTData', MATLAB_ROOT, CUDA_ROOT, './bin', false)
10 | cuda_compile('./src', 'cudaConvFFTData',MATLAB_ROOT, CUDA_ROOT, './bin', false)
11 | cuda_compile('./src', 'cudaConvolutionFFT',MATLAB_ROOT, CUDA_ROOT, './bin', false)
12 |
--------------------------------------------------------------------------------
/cuda_compile.m:
--------------------------------------------------------------------------------
1 | function cuda_compile( src_path, func_name, matlab_root, cuda_root, out_path, debug)
2 | %CUDA_COMPILE general cuda compiling helper for MATLAB version < 2014a
3 | if nargin < 6
4 | debug = false;
5 | end
6 |
7 | if ~exist('./bin', 'dir')
8 | mkdir('./bin')
9 | end
10 |
11 | % TODO: For matlab version < 8.0.1, Use the following setting,
12 | % if ~verLessThan('matlab', '8.0.1')
13 | % http://www.mathworks.com/help/distcomp/run-mex-functions-containing-cuda-code.html
14 | % setenv('MW_NVCC_PATH',[cudaroot '/nvcc'])
15 | % eval(sprintf('mex -v -largeArrayDims %s.cu',func_name));
16 | % elseif isunix && ~ismac && verLessThan('matlab', '8.0.1')
17 |
18 |
19 | % ------------------------------------------------------------------------------
20 | % Check cuda computing capability
21 | % ------------------------------------------------------------------------------
22 | % TODO: use CUDA streams when the compute capability is high enough
23 | gpuInfo = gpuDevice;
24 | fprintf('Your GPU Computing Capability %s\n', gpuInfo.ComputeCapability);
25 |
26 | % Remove compiled binary files
27 | eval(['!rm bin/' func_name '.o']);
28 |
29 | % ------------------------------------------------------------------------------
30 | % Setup environment variables
31 | % ------------------------------------------------------------------------------
32 |
33 | % Set debugging flag
34 | if debug
35 | nvcc_debug_flag = '-g -G -O0';
36 | mex_debug_flag = '-g';
37 | else
38 | nvcc_debug_flag = '-O3 -DNDEBUG';
39 | mex_debug_flag = '';
40 | end
41 |
42 | if ismac
43 | matlab_bin_path = '/bin/maci64';
44 | else
45 | matlab_bin_path = '/bin/glnxa64';
46 | end
47 |
48 | INCLUDE_PATH = sprintf([...
49 | '-I./common ',...
50 | '-I%s/extern/include ',...
51 | '-I%s/toolbox/distcomp/gpu/extern/include'],...
52 | matlab_root, matlab_root);
53 | NVCC_OPTS = '-arch=sm_30 -ftz=true -prec-div=false -prec-sqrt=false';
54 | COMPILER_OPTS = '-Xcompiler -fPIC -v';
55 |
56 | MEX_OPTS = '-largeArrayDims';
57 | MEX_INCLUDE_PATH = sprintf('-I%s/include', cuda_root);
58 | MEX_LIBS = '-lcudart -lcufft -lmwgpu';
59 | MEX_LIBRARY_PATH = ['-L', matlab_root, matlab_bin_path];
60 |
61 | % ------------------------------------------------------------------------------
62 | % Compile
63 | % ------------------------------------------------------------------------------
64 |
65 | % Compile the object file
66 | compile_string = sprintf([...
67 | '!%s/bin/nvcc ',...
68 | '%s ',... % Debug flag
69 | '%s ',... % Compiler options
70 | '%s ',... % NVCC_OPTS
71 | '%s ',... % Include paths
72 | '-c %s/%s.cu --output-file %s/%s.o'], ...
73 | cuda_root, nvcc_debug_flag, COMPILER_OPTS, NVCC_OPTS, INCLUDE_PATH, src_path, func_name, out_path, func_name);
74 | disp(compile_string);
75 | eval(compile_string);
76 |
77 | compile_string = sprintf(['mex ',...
78 | '%s ',... % Debug flag
79 | '%s ',... % Mex options
80 | '%s/%s.o ',... % Object file
81 | '%s ',... % Mex library path
82 | '%s ',... % Mex libraries
83 | '-outdir %s'],... % Out path
84 | mex_debug_flag, MEX_OPTS, out_path, func_name, MEX_LIBRARY_PATH, MEX_LIBS, out_path);
85 | disp(compile_string);
86 | eval(compile_string);
87 |
88 | % % Run system command
89 | % !nvcc -O3 -DNDEBUG -c cudaconv.cu -Xcompiler -fPIC -I/afs/cs/package/matlab-r2013b/matlab/r2013b/extern/include -I/afs/cs/package/matlab-r2013b/matlab/r2013b/toolbox/distcomp/gpu/extern/include
90 | % % Link object
91 | % mex cudaconv.o -L/usr/local/cuda-6.0/lib64 -L/afs/cs/package/matlab-r2013b/matlab/r2013b/bin/glnxa64 -lcudart -lcufft -lmwgpu
92 | % -gencode arch=compute_30,code=sm_30
93 |
--------------------------------------------------------------------------------
/demoCudaConvolutionFFT.m:
--------------------------------------------------------------------------------
1 | % MatlabCUDAConv
2 | %
3 | % To speed up convolutions, this demo compiles and runs the CUDA FFT-based
4 | % convolution MEX functions in ./src.
4 |
5 | % ------------------------------------------------------------------------------
6 | % Compile
7 | % ------------------------------------------------------------------------------
8 |
9 | % Change the following lines
10 | MATLAB_ROOT = '/afs/cs/package/matlab-r2013b/matlab/r2013b/';
11 | CUDA_ROOT = '/usr/local/cuda-6.0/';
12 |
13 | if ismac
14 | MATLAB_ROOT = '/Applications/MATLAB_R2014a.app/';
15 | CUDA_ROOT = '/usr/local/cuda/';
16 | end
17 |
18 | % Debugging compile
19 | compile
20 | addpath('./bin')
21 |
22 | % ------------------------------------------------------------------------------
23 | % Clear the GPU
24 | % ------------------------------------------------------------------------------
25 |
26 | clear;
27 | device_id = 1; % 1-based GPU index (MATLAB convention)
28 | g = gpuDevice(device_id);
29 | reset(g);
30 | cos(gpuArray(1)); % force matlab gpu dynamic library loading
31 |
32 |
33 | % ------------------------------------------------------------------------------
34 | % Experiment setup
35 | % ------------------------------------------------------------------------------
36 |
37 | n = 64; % data height
38 | m = 8; % data width
39 | k = 5; % number of channels
40 |
41 | cn = 10; % kernel height
42 | cm = 4; % kernel width
43 |
44 | % Make random data
45 | data = single(rand(n,m));
46 | for i = 2:k
47 | data(:,:,i) = single(rand(n,m));
48 | end
49 |
50 | % Make random kernel
51 | kernel = zeros(cn,cm,k,'single');
52 | kernel(:,:,1) = single(reshape(1:cn*cm,cn,cm));
53 | for i = 2:k
54 | kernel(:,:,i) = single(rand(cn,cm));
55 | end
56 |
57 | % To verify the experiment, place kernel values in specific regions of the data
58 | data(5:(4+cn),2:(1+cm),1) = kernel(:,:,1);
59 | data(21:(20+cn),1:cm,2) = kernel(:,:,1);
60 | data(1:cn,(m-(cm-1)):m,k) = kernel(:,:,1);
61 | kernel(:,:,k) = kernel(:,:,1);
62 |
63 | % ------------------------------------------------------------------------------
64 | % Flip Kernel (Required)
65 | % ------------------------------------------------------------------------------
66 |
67 | for i = 1:k
68 | kernel(:,:,i) = kernel(end:-1:1,end:-1:1,i);
69 | end
70 |
71 |
72 | % ------------------------------------------------------------------------------
73 | % Matlab convolution (Conv2 and FFT versions)
74 | % ------------------------------------------------------------------------------
75 |
76 | % Compute convolution using FFT
77 | % The FFT size must be at least (n + cn - 1) x (m + cm - 1)
78 | fft_h = 80;
79 | fft_w = 16;
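% (Illustrative note: these sizes can also be derived from the data/kernel
%  dimensions, e.g. fft_h = 16*ceil((n + cn - 1)/16) = 80 and
%  fft_w = 16*ceil((m + cm - 1)/16) = 16, matching the multiple-of-16 padding
%  used by computeFFTsize16 in src/cudaConvFFTData.h.)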
80 | matFFTedData = zeros(fft_h,fft_w,k);
81 | for i = 1:k
82 | matFFTedData(:,:,i) = fft2(data(:,:,i),fft_h,fft_w);
83 | end
84 |
85 | matFFTedKernel = zeros(fft_h, fft_w, k);
86 | for i = 1:k
87 | matFFTedKernel(:,:,i) = fft2(kernel(:,:,i),fft_h,fft_w);
88 | end
89 |
90 | % Compute using the naive convolution
91 | matConv = conv2(data(:,:,1),kernel(:,:,1));
92 | for i = 2:k
93 | matConv(:,:,i) = conv2(data(:,:,i),kernel(:,:,i));
94 | end
95 |
96 | cvmatlab = sum(matConv,3);
97 |
98 | ematlab = matFFTedKernel .* (matFFTedData);
99 | matFFTConv = ifft2(ematlab(:,:,1));
100 | for i=1:k
101 | matFFTConv(:,:,i) = ifft2(ematlab(:,:,i));
102 | end
103 |
104 |
105 | % ------------------------------------------------------------------------------
106 | % Convolution using GPU cudaConvolutionFFT
107 | % ------------------------------------------------------------------------------
108 |
109 | % You can feed multiple kernels in a cell format
110 | kernel2 = kernel;
111 | kernel2(1) = 100;
112 |
113 | kernelCell = {kernel, kernel2, kernel};
114 |
115 | thread_per_block_width = 8;
116 | thread_per_block_height = 8;
117 | thread_per_block_depth = 8;
118 | thread_per_block_2d_width = 16;
119 | threads_per_block_in =[thread_per_block_width, ...
120 | thread_per_block_height, ...
121 | thread_per_block_depth, ...
122 | thread_per_block_2d_width];
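% (Illustrative note: with these values the 3D thread blocks use 8*8*8 = 512
%  threads and the 2D blocks use 16*16 = 256 threads, both below the
%  per-block thread limit that the MEX input check warns about.)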
123 |
124 | [cvcell] = cudaConvolutionFFT(data, ... % Data
125 | cn,... % Maximum kernel height
126 | cm,... % Maximum kernel width
127 | kernelCell,... % Multiple kernels in a cell
128 | threads_per_block_in,... % threads per block
129 | device_id-1); % 0-based indexing for GPU Device ID
130 | cvg = cvcell{1}; % Get the result for the first kernel
131 | cvg2 = cvcell{2}; % Get the result for the second kernel (kernel2)
132 |
133 | % ------------------------------------------------------------------------------
134 | % Comparison and visualization
135 | % ------------------------------------------------------------------------------
136 |
137 | % Visualize convolution result
138 | figure(1); subplot(131); imagesc(sum(matConv,3));
139 | subplot(132); imagesc(real(sum(matFFTConv,3)));
140 | subplot(133); imagesc(real(cvg));
141 |
142 | % Transformed data
143 | figure(2); imagesc(real(ematlab(:,:,1)));
144 |
145 | % Compare matlab convolution with cuda FFT convolution
146 | figure(3); subplot(131); imagesc(cvg); % Convolution output ( using FFT,
147 | % data is padded with the size of the
148 | % kernel -1 )
149 | subplot(132); imagesc(cvg(1:n + cn - 1,1:m + cm - 1)); % Extract
150 | % exact convolution part that is the
151 | % same as matlab convolution
152 | subplot(133); imagesc(cvmatlab); % Visualize matlab convolution output
153 |
154 | % Compute residual
155 | figure(4); imagesc(cvg(1:n + cn - 1,1:m + cm - 1) - cvmatlab); colorbar;
156 |
--------------------------------------------------------------------------------
/src/convolutionFFTkernel.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
3 | *
4 | * NOTICE TO USER:
5 | *
6 | * This source code is subject to NVIDIA ownership rights under U.S. and
7 | * international Copyright laws. Users and possessors of this source code
8 | * are hereby granted a nonexclusive, royalty-free license to use this code
9 | * in individual and commercial software.
10 | *
11 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
12 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
13 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
14 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
15 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
16 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
17 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
18 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
19 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
20 | * OR PERFORMANCE OF THIS SOURCE CODE.
21 | *
22 | * U.S. Government End Users. This source code is a "commercial item" as
23 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
24 | * "commercial computer software" and "commercial computer software
25 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
26 | * and is provided to the U.S. Government only as a commercial end item.
27 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
28 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
29 | * source code with only those rights set forth herein.
30 | *
31 | * Any use of this source code in individual and commercial software must
32 | * include, in the user documentation and internal comments to the code,
33 | * the above Disclaimer and U.S. Government End Users Notice.
34 | */
35 |
36 |
37 |
38 | #define IMUL(a, b) __mul24(a, b)
39 |
40 |
41 | ////////////////////////////////////////////////////////////////////////////////
42 | // Copy input data array to the upper left corner and pad by border values
43 | ////////////////////////////////////////////////////////////////////////////////
44 | texture<float, 2, cudaReadModeElementType> texData;
45 |
46 | __global__ void padData(
47 | float *d_PaddedData,
48 | int fftW,
49 | int fftH,
50 | int dataW,
51 | int dataH,
52 | int featureDim,
53 | int kernelW,
54 | int kernelH,
55 | int kernelX,
56 | int kernelY
57 | ){
58 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
59 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
60 | const int borderW = dataW + kernelX;
61 | const int borderH = dataH + kernelY;
62 | int dx;
63 | int dy;
64 |
65 | if(x < fftW && y < fftH){
66 | if(x < dataW) dx = x;
67 | if(y < dataH) dy = y;
68 | if(x >= dataW && x < borderW) dx = dataW - 1;
69 | if(y >= dataH && y < borderH) dy = dataH - 1;
70 | if(x >= borderW) dx = 0;
71 | if(y >= borderH) dy = 0;
72 |
73 | d_PaddedData[IMUL(y, fftW) + x] =
74 | tex2D(texData, (float)dx + 0.5f, (float)dy + 0.5f);
75 | }
76 | }
77 |
78 |
79 |
80 | ////////////////////////////////////////////////////////////////////////////////
81 | // Modulate Fourier image of padded data by Fourier image of padded kernel
82 | // and normalize by FFT size
83 | ////////////////////////////////////////////////////////////////////////////////
84 | __device__ void complexMulAndScale(Complex& a, Complex b, float c){
85 | Complex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
86 | a = t;
87 | }
88 |
89 | __global__ void modulateAndNormalize(
90 | Complex *fft_PaddedData,
91 | Complex *fft_PaddedKernel,
92 | int dataN
93 | ){
94 | const int tid = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
95 | const int threadN = IMUL(blockDim.x, gridDim.x);
96 | const float q = 1.0f / (float)dataN;
97 |
98 | for(int i = tid; i < dataN; i += threadN)
99 | complexMulAndScale(fft_PaddedData[i], fft_PaddedKernel[i], q);
100 | }
101 |
--------------------------------------------------------------------------------
/src/cudaConvFFTData.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <cufft.h>
3 | #include "mex.h"
4 | #include "gpu/mxGPUArray.h"
5 | // #include "common/helper_cuda.h"
6 | #include "cudaConvFFTData.h"
7 | #include "cudaConvFFTData.cuh"
8 |
9 | static bool debug = false;
10 |
11 | enum OUT_INDEX{
12 | CONVOLUTION_CELL_INDEX
13 | };
14 |
15 | enum IN_INDEX{
16 | FFT_DATA_INDEX,
17 | KERNLE_CELL_INDEX,
18 | THREAD_SIZE_INDEX // Optional
19 | };
20 |
21 | ////////////////////////////////////////////////////////////////////////////////
22 | // Mex Entry
23 | ////////////////////////////////////////////////////////////////////////////////
24 | void mexFunction(int nlhs, mxArray *plhs[],
25 | int nrhs, mxArray const *prhs[])
26 | {
27 | /* Declare all variables.*/
28 | const mxGPUArray *mxFFTData;
29 | const mxGPUArray *mxKernel;
30 | mxGPUArray *mxFFTKernel;
31 | mxGPUArray *mxConvolution;
32 | mxArray *convolutionResult;
33 |
34 | /* cufftComplex is float2 */
35 | const cufftComplex *d_CFFT_DATA;
36 | cufftComplex *d_CFFT_KERNEL;
37 | cufftComplex *d_FFTEProd;
38 |
39 | float *d_CONVOLUTION;
40 | float *d_IFFTEProd;
41 |
42 | float *h_Kernel;
43 | float *h_CONVOLUTION;
44 | float *d_Kernel;
45 | float *d_PaddedKernel;
46 |
47 | char const * const errId = "cudaConvFFTData:InvalidInput";
48 |
49 | /* Choose a reasonably sized number of threads for the block. */
50 | int THREAD_PER_BLOCK_H = 16;
51 | int THREAD_PER_BLOCK_W = 8;
52 | int THREAD_PER_BLOCK_D = 8;
53 | int THREAD_PER_BLOCK_2D = 32;
54 |
55 | const mwSize * mxKernel_Dim;
56 | const mwSize * mxFFT_Dim;
57 | // int MblocksPerGrid, NblocksPerGrid;
58 | int KERNEL_H, KERNEL_W, N_KERNEL,
59 | CFFT_H, CFFT_W, FFT_H, FFT_W, FEATURE_DIM,
60 | KERNEL_SIZE, CFFT_SIZE, FFT_SIZE, CONV_SIZE;
61 |
62 | /* Initialize the MathWorks GPU API. */
63 |     // If already initialized, mxInitGPU does nothing
64 | if (mxInitGPU() != MX_GPU_SUCCESS)
65 | mexErrMsgTxt("mxInitGPU fail");
66 |
67 | /* Throw an error if the input is not a GPU array. */
68 | if ( (nrhs < (KERNLE_CELL_INDEX + 1)) || (nrhs > (THREAD_SIZE_INDEX + 1) ) || !mxIsGPUArray(prhs[FFT_DATA_INDEX]) )
69 | mexErrMsgIdAndTxt(errId, "The data must be FFT-ed real array in GPU");
70 |
71 | if (( nrhs > THREAD_SIZE_INDEX) && mxGetNumberOfElements(prhs[THREAD_SIZE_INDEX]) != 4)
72 | mexErrMsgIdAndTxt(errId, "CUDA Thread Size must be 4 integers : THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nYou must choose size such that total thread will not be larger than MaxThreadsPerBlock");
73 |
74 | if ( nrhs > THREAD_SIZE_INDEX ){
75 | const double* threadSize = (double *)mxGetData(prhs[THREAD_SIZE_INDEX]);
76 | THREAD_PER_BLOCK_H = (int)threadSize[0];
77 | THREAD_PER_BLOCK_W = (int)threadSize[1];
78 | THREAD_PER_BLOCK_D = (int)threadSize[2];
79 | THREAD_PER_BLOCK_2D = (int)threadSize[3];
80 | if(debug) fprintf(stderr,"Thread size: H=%d, W=%d, D=%d, 2D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D);
81 | }
82 |
83 | // cudaDeviceProp dev;
84 | // cudaGetDeviceProperties(&dev,0);
85 | // int success = checkDeviceProp(dev);
86 |
87 | mxFFTData = mxGPUCreateFromMxArray(prhs[FFT_DATA_INDEX]);
88 | mxFFT_Dim = mxGPUGetDimensions(mxFFTData);
89 |
90 | // FFT Dim
91 |     // In CUDA, an R2C FFT produces only N/2 + 1 points along the transformed dimension, due to Hermitian symmetry.
92 | CFFT_H = mxFFT_Dim[0];
93 | CFFT_W = mxFFT_Dim[1];
94 |
95 | FFT_H = (mxFFT_Dim[0] - 1) * 2;
96 | FFT_W = mxFFT_Dim[1];
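    // (For example, the 80x16 FFT used by demoCudaConvolutionFFT.m would arrive
    //  here as a 41x16xFEATURE_DIM complex array, so FFT_H = (41 - 1) * 2 = 80.)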
97 |
98 | FEATURE_DIM = mxFFT_Dim[2];
99 |
100 | CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2);
101 | FFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float);
102 | CONV_SIZE = FFT_W * FFT_H * sizeof(float);
103 |
104 | if(debug) fprintf(stderr,"FFT Data size: h=%d, w=%d, f=%d\n", FFT_H, FFT_W, FEATURE_DIM);
105 |
106 | if (mxGetClassID(prhs[KERNLE_CELL_INDEX]) != mxCELL_CLASS)
107 | mexErrMsgIdAndTxt(errId, "Kernel must be a cell array");
108 |
109 | mwSize nKernel = mxGetNumberOfElements(prhs[KERNLE_CELL_INDEX]);
110 | N_KERNEL = (int)nKernel;
111 | plhs[CONVOLUTION_CELL_INDEX] = mxCreateCellMatrix(1, N_KERNEL);
112 |
113 | if(debug) fprintf(stderr,"N Kernel: %d\n", N_KERNEL);
114 |
115 |
116 | /* Set block size and thread size */
117 | dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D);
118 | dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x),
119 | iDivUp(FFT_H, threadBlock3D.y),
120 | iDivUp(FEATURE_DIM, threadBlock3D.z));
121 |
122 | dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D);
123 | dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x),
124 | iDivUp(FFT_H, threadBlock2D.y));
125 |
126 |
127 | /* Pad Kernel */
128 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedKernel, FFT_SIZE));
129 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_IFFTEProd, FFT_SIZE));
130 |
131 | /* Create a GPUArray to hold the result and get its underlying pointer. */
132 | // mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
133 | // FFT_dims[0] = FFT_H;
134 | // FFT_dims[1] = FFT_W;
135 | // FFT_dims[2] = FEATURE_DIM;
136 |
137 | d_CFFT_DATA = (cufftComplex *)mxGPUGetDataReadOnly(mxFFTData);
138 |
139 | // mxConvolution = mxGPUCreateGPUArray(2,
140 | // FFT_dims, // Third element will not be accessed
141 | // mxSINGLE_CLASS,
142 | // mxREAL,
143 | // MX_GPU_DO_NOT_INITIALIZE);
144 |
145 | // d_CONVOLUTION = (cufftReal *)(mxGPUGetData(mxConvolution));
146 |
147 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE));
148 |
149 | // mxFFTKernel = mxGPUCreateGPUArray(3,
150 | // mxFFT_Dim,
151 | // mxSINGLE_CLASS,
152 | // mxCOMPLEX,
153 | // MX_GPU_DO_NOT_INITIALIZE);
154 |
155 | // d_CFFT_KERNEL = (cufftComplex *)(mxGPUGetData(mxFFTKernel));
156 |
157 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE));
158 |
159 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE));
160 |
161 | /* FFT Kernel */
162 | int BATCH = FEATURE_DIM;
163 | int FFT_Dims[] = { FFT_W, FFT_H };
164 | int CFFT_Dims[] = { CFFT_W, CFFT_H };
165 |
166 | int idist = FFT_W * FFT_H;
167 | int odist = CFFT_W * CFFT_H;
168 |
169 | cufftHandle FFTplan_R2C, FFTplan_C2R;
170 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C,
171 | 2, // rank
172 | FFT_Dims,
173 | FFT_Dims, 1, idist, // *inembed, istride, idist
174 | CFFT_Dims, 1, odist, // *onembed, ostride, odist
175 | CUFFT_R2C,
176 | BATCH)); // batch
177 |
178 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_C2R,
179 | 2, // rank
180 | FFT_Dims,
181 | CFFT_Dims, 1, odist, // *inembed, istride, idist
182 | FFT_Dims, 1, idist, // *onembed, ostride, odist
183 | CUFFT_C2R,
184 | BATCH)); // batch
185 |
186 | mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
187 | FFT_dims[0] = FFT_H;
188 | FFT_dims[1] = FFT_W;
189 |
190 | /* For each kernel iterate */
191 | for (int kernelIdx = 0; kernelIdx < N_KERNEL; kernelIdx++){
192 |
193 | // Get Kernel Data
194 | const mxArray *mxCurrentCell = mxGetCell(prhs[KERNLE_CELL_INDEX], kernelIdx);
195 | if (!mxIsGPUArray(mxCurrentCell)){
196 |
197 | if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 )
198 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
199 |
200 | h_Kernel = (float *)mxGetData(mxCurrentCell);
201 | mxKernel_Dim = mxGetDimensions(mxCurrentCell);
202 |
203 | // Kernel dimensions
204 | KERNEL_H = mxKernel_Dim[0];
205 | KERNEL_W = mxKernel_Dim[1];
206 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
207 |
208 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Kernel, KERNEL_SIZE));
209 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
210 | mxKernel = NULL;
211 | }else{ // Kernel is GPU Array
212 | mxKernel = mxGPUCreateFromMxArray(mxCurrentCell);
213 |
214 | if ( mxGPUGetClassID(mxKernel) != mxSINGLE_CLASS || mxGPUGetNumberOfDimensions(mxKernel) != 3 )
215 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1");
216 |
217 | mxKernel_Dim = mxGPUGetDimensions(mxKernel);
218 |
219 | // Kernel dimensions
220 | KERNEL_H = mxKernel_Dim[0];
221 | KERNEL_W = mxKernel_Dim[1];
222 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
223 |
224 | d_Kernel = (float *)mxGPUGetDataReadOnly(mxKernel);
225 | }
226 |
227 | if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W);
228 |
229 | if (FEATURE_DIM != mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ){
230 | mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size");
231 | }
232 |
233 |         padData<<<dataBlockGrid3D, threadBlock3D>>>(
234 | d_PaddedKernel,
235 | d_Kernel,
236 | FFT_W,
237 | FFT_H,
238 | KERNEL_W,
239 | KERNEL_H,
240 | FEATURE_DIM
241 | );
242 |
243 |
244 | CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedKernel, d_CFFT_KERNEL));
245 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
246 |
247 | if(debug) fprintf(stderr,"FFT done\n");
248 |
249 |
250 | /* Hadamard product, Element-wise multiplication in frequency domain */
251 |         /* If the following is executed, a second compile of this file creates a MATLAB error */
252 |         elementwiseProductAndNormalize<<<dataBlockGrid3D, threadBlock3D>>>(
253 | d_FFTEProd, // out
254 | d_CFFT_DATA, // in data
255 | d_CFFT_KERNEL, // in kernel
256 | CFFT_H,
257 | CFFT_W,
258 | FEATURE_DIM,
259 | 1.0f / (FFT_W * FFT_H)
260 | );
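        // (cuFFT transforms are unnormalized, so the 1/(FFT_W * FFT_H) scale
        //  passed above makes the R2C -> C2R round trip return correctly
        //  scaled convolution values.)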
261 |
262 | CUFFT_SAFE_CALL(cufftExecC2R(FFTplan_C2R, d_FFTEProd, d_IFFTEProd));
263 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize());
264 |
265 |         sumAlongFeatures<<<dataBlockGrid2D, threadBlock2D>>>(
266 | d_CONVOLUTION,
267 | d_IFFTEProd,
268 | FFT_H,
269 | FFT_W,
270 | FEATURE_DIM
271 | );
272 |
273 |
274 |
275 | convolutionResult = mxCreateNumericArray(2, FFT_dims, mxSINGLE_CLASS, mxREAL);
276 | h_CONVOLUTION = (float *)mxGetData(convolutionResult);
277 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(h_CONVOLUTION, d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost));
278 |
279 | mxSetCell(plhs[CONVOLUTION_CELL_INDEX], kernelIdx, convolutionResult);
280 |
281 | if(mxKernel == NULL) cudaFree(d_Kernel);
282 | }
283 | // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel);
284 |
285 | /*
286 | * The mxGPUArray pointers are host-side structures that refer to device
287 | * data. These must be destroyed before leaving the MEX function.
288 | */
289 | mxGPUDestroyGPUArray(mxFFTData);
290 | // mxGPUDestroyGPUArray(mxConvolution);
291 | // mxGPUDestroyGPUArray(mxFFTKernel);
292 |
293 | cufftDestroy(FFTplan_R2C);
294 | cufftDestroy(FFTplan_C2R);
295 |
296 | if(mxKernel != NULL) mxGPUDestroyGPUArray(mxKernel);
297 |
298 | cudaFree(d_PaddedKernel);
299 | cudaFree(d_IFFTEProd);
300 | cudaFree(d_CONVOLUTION);
301 | cudaFree(d_CFFT_KERNEL);
302 | cudaFree(d_FFTEProd);
303 |
304 |
305 | mxFree(FFT_dims);
306 | }
307 |
--------------------------------------------------------------------------------
/src/cudaConvFFTData.cuh:
--------------------------------------------------------------------------------
1 | #ifndef CUDA_CONV_FFT_DATA_CUH
2 | #define CUDA_CONV_FFT_DATA_CUH
3 |
4 | /*
5 | * Device Code
6 | */
7 |
8 | ////////////////////////////////////////////////////////////////////////////////
9 | // Pad data with zeros,
10 | ////////////////////////////////////////////////////////////////////////////////
11 | __global__ void padData(
12 | float *d_PaddedData,
13 | const float *d_Data,
14 | int fftW,
15 | int fftH,
16 | int dataW,
17 | int dataH,
18 | int FEATURE_DIM
19 | ){
20 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
21 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
22 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
23 |
24 | if(x < fftW && y < fftH && z < FEATURE_DIM){
25 | if(x < dataW && y < dataH)
26 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] =
27 | d_Data[ IMUL(z, IMUL(dataH, dataW)) + IMUL(x, dataH ) + y];
28 | else
29 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 0;
30 | }
31 | }
32 |
33 | ////////////////////////////////////////////////////////////////////////////////
34 | // Modulate Fourier image of padded data by Fourier image of padded kernel
35 | // and normalize by FFT size
36 | ////////////////////////////////////////////////////////////////////////////////
37 | __device__ void complexMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
38 | const cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
39 | out = t;
40 | }
41 |
42 | __device__ void complexConjMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
43 | const cufftComplex t = {c * (a.x * b.x + a.y * b.y), c * (a.y * b.x - a.x * b.y)};
44 | out = t;
45 | }
46 |
47 | __global__ void elementwiseProductAndNormalize(
48 | cufftComplex *fft_Output,
49 | const cufftComplex *fft_PaddedData,
50 | const cufftComplex *fft_PaddedKernel,
51 | int FFT_H,
52 | int FFT_W,
53 | int FEATURE_DIM,
54 | float scale
55 | ){
56 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
57 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
58 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
59 |
60 | if(x < FFT_W && y < FFT_H && z < FEATURE_DIM){
61 | // int i = IMUL(z, IMUL(FFT_W, FFT_H)) + IMUL(FFT_H, x) + y;
62 | int i = z * FFT_W * FFT_H + FFT_H * x + y;
63 | // complexConjMulAndScale(fft_Output[i], fft_PaddedData[i], fft_PaddedKernel[i], scale);
64 | fft_Output[i].x = scale * (fft_PaddedData[i].x * fft_PaddedKernel[i].x - fft_PaddedData[i].y * fft_PaddedKernel[i].y);
65 | fft_Output[i].y = scale * (fft_PaddedData[i].y * fft_PaddedKernel[i].x + fft_PaddedData[i].x * fft_PaddedKernel[i].y);
66 | }
67 | }
68 |
69 | /* Support in-place computation, i.e. input and output can be the same */
70 | __global__ void sumAlongFeatures(
71 | float *convolutionResult,
72 | const float *convolutionPerFeature,
73 | int FFT_H,
74 | int FFT_W,
75 | int FEATURE_DIM
76 | ){
77 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
78 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
79 |
80 | if(x < FFT_W && y < FFT_H){
81 | const int result_i = IMUL(FFT_H, x) + y;
82 | const int N = IMUL(FFT_W, FFT_H);
83 |
84 | float acc = convolutionPerFeature[result_i];
85 | int zN = N;
86 | for (int z = 1; z < FEATURE_DIM; z++){
87 | acc += convolutionPerFeature[zN + result_i];
88 | zN += N;
89 | }
90 | convolutionResult[result_i] = acc;
91 | }
92 | }
93 |
94 |
95 | #endif
--------------------------------------------------------------------------------
/src/cudaConvFFTData.h:
--------------------------------------------------------------------------------
1 | #ifndef CUDA_CONV_FFT_DATA
2 | #define CUDA_CONV_FFT_DATA
3 |
4 | # define IMUL(a, b) __mul24(a, b)
5 |
6 | # define CUDA_SAFE_CALL_NO_SYNC( call) do { \
7 | cudaError err = call; \
8 | if( cudaSuccess != err) { \
9 | printf("Cuda error in file '%s' in line %i Error : %d.\n", \
10 | __FILE__, __LINE__, err); \
11 | exit(EXIT_FAILURE); \
12 | } } while (0)
13 |
14 | # define CUDA_SAFE_CALL( call) do { \
15 | CUDA_SAFE_CALL_NO_SYNC(call); \
16 | cudaError err = cudaThreadSynchronize(); \
17 | if( cudaSuccess != err) { \
18 | printf("Cuda error in file '%s' in line %i Error : %d.\n", \
19 | __FILE__, __LINE__,err); \
20 | exit(EXIT_FAILURE); \
21 | } } while (0)
22 |
23 | # define CUFFT_SAFE_CALL( call) do { \
24 | cufftResult err = call; \
25 | if( CUFFT_SUCCESS != err) { \
26 | printf("CUFFT error in file '%s' in line %i Error : %d.\n", \
27 | __FILE__, __LINE__,err); \
28 | exit(EXIT_FAILURE); \
29 | } } while (0)
30 |
31 |
32 | ////////////////////////////////////////////////////////////////////////////////
33 | // Helper functions
34 | ////////////////////////////////////////////////////////////////////////////////
35 | //Round a / b to nearest higher integer value
36 | int iDivUp(int a, int b){
37 | return (a % b != 0) ? (a / b + 1) : (a / b);
38 | }
39 |
40 | //Align a to nearest higher multiple of b
41 | int iAlignUp(int a, int b){
42 | return (a % b != 0) ? (a - a % b + b) : a;
43 | }
44 |
45 |
46 |
47 | int checkDeviceProp ( cudaDeviceProp p ) {
48 | int support = p.canMapHostMemory;
49 |
50 | if(support == 0) printf( "%s does not support mapping host memory.\n", p.name);
51 | else printf( "%s supports mapping host memory.\n",p.name);
52 |
53 | support = p.concurrentKernels;
54 | if(support == 0) printf("%s does not support concurrent kernels\n", p.name);
55 | else printf("%s supports concurrent kernels\n",p.name);
56 |
57 | support = p.kernelExecTimeoutEnabled;
58 | if(support == 0) printf("%s kernelExecTimeout disabled\n", p.name);
59 | else printf("%s kernelExecTimeout enabled\n",p.name);
60 |
61 | printf("compute capability : %d.%d \n", p.major,p.minor);
62 | printf("number of multiprocessors : %d \n", p.multiProcessorCount);
63 |
64 | return support;
65 | }
66 |
67 | int computeFFTsize(int dataSize){
68 | //Highest non-zero bit position of dataSize
69 | int hiBit;
70 |     //Nearest lower and higher powers of two for dataSize
71 | unsigned int lowPOT, hiPOT;
72 |
73 | //Align data size to a multiple of half-warp
74 | //in order to have each line starting at properly aligned addresses
75 | //for coalesced global memory writes in padKernel() and padData()
76 | dataSize = iAlignUp(dataSize, 16);
77 |
78 | //Find highest non-zero bit
79 | for(hiBit = 31; hiBit >= 0; hiBit--)
80 | if(dataSize & (1U << hiBit)) break;
81 |
82 | //No need to align, if already power of two
83 | lowPOT = 1U << hiBit;
84 | if(lowPOT == dataSize) return dataSize;
85 |
86 | //Align to a nearest higher power of two, if the size is small enough,
87 | //else align only to a nearest higher multiple of 512,
88 | //in order to save computation and memory bandwidth
89 | hiPOT = 1U << (hiBit + 1);
90 | //if(hiPOT <= 1024)
91 | return hiPOT;
92 | //else
93 | // return iAlignUp(dataSize, 512);
94 | }
95 |
96 | int computeFFTsize16(int dataSize){
97 | // Compute the multiple of 16
98 | int mod = dataSize / 16;
99 | int rem = dataSize % 16;
100 |
101 | return (mod * 16) + ((rem > 0)?16:0);
102 | }
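
// Editorial example (not part of the original header): computeFFTsize16(64 + 10 - 1) == 80
// and computeFFTsize16(8 + 4 - 1) == 16, the FFT sizes used in demoCudaConvolutionFFT.m.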
103 |
104 | #endif
--------------------------------------------------------------------------------
/src/cudaConvFFTDataStreams.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <cufft.h>
3 | #include "mex.h"
4 | #include "gpu/mxGPUArray.h"
5 | // #include "common/helper_cuda.h"
6 | #include "cudaConvFFTDataStream.h"
7 |
8 |
9 | const int N_MAX_PARALLEL = 32;
10 | static bool debug = true;
11 |
12 | /*
13 | * Device Code
14 | */
15 |
16 | ////////////////////////////////////////////////////////////////////////////////
17 | // Pad data with zeros,
18 | ////////////////////////////////////////////////////////////////////////////////
19 | __global__ void padData(
20 | float *d_PaddedData,
21 | const float *d_Data,
22 | int fftW,
23 | int fftH,
24 | int dataW,
25 | int dataH,
26 | int FEATURE_DIM
27 | ){
28 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
29 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
30 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
31 |
32 | if(x < fftW && y < fftH && z < FEATURE_DIM){
33 | if(x < dataW && y < dataH)
34 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] =
35 | d_Data[ IMUL(z, IMUL(dataH, dataW)) + IMUL(x, dataH ) + y];
36 | else
37 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 0;
38 | }
39 | }
40 |
41 | ////////////////////////////////////////////////////////////////////////////////
42 | // Modulate Fourier image of padded data by Fourier image of padded kernel
43 | // and normalize by FFT size
44 | ////////////////////////////////////////////////////////////////////////////////
45 | __device__ void complexMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
46 | const cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
47 | out = t;
48 | }
49 |
50 | __device__ void complexConjMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){
51 | const cufftComplex t = {c * (a.x * b.x + a.y * b.y), c * (a.y * b.x - a.x * b.y)};
52 | out = t;
53 | }
54 |
55 | __global__ void elementwiseProductAndNormalize(
56 | cufftComplex *fft_Output,
57 | const cufftComplex *fft_PaddedData,
58 | const cufftComplex *fft_PaddedKernel,
59 | int FFT_H,
60 | int FFT_W,
61 | int FEATURE_DIM,
62 | float scale
63 | ){
64 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
65 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
66 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z;
67 |
68 | if(x < FFT_W && y < FFT_H && z < FEATURE_DIM){
69 | // int i = IMUL(z, IMUL(FFT_W, FFT_H)) + IMUL(FFT_H, x) + y;
70 | int i = z * FFT_W * FFT_H + FFT_H * x + y;
71 | // complexConjMulAndScale(fft_Output[i], fft_PaddedData[i], fft_PaddedKernel[i], scale);
72 | fft_Output[i].x = scale * (fft_PaddedData[i].x * fft_PaddedKernel[i].x - fft_PaddedData[i].y * fft_PaddedKernel[i].y);
73 | fft_Output[i].y = scale * (fft_PaddedData[i].y * fft_PaddedKernel[i].x + fft_PaddedData[i].x * fft_PaddedKernel[i].y);
74 | }
75 | }
76 |
77 | /* Support in-place computation, i.e. input and output can be the same */
78 | __global__ void sumAlongFeatures(
79 | float *convolutionResult,
80 | const float *convolutionPerFeature,
81 | int FFT_H,
82 | int FFT_W,
83 | int FEATURE_DIM
84 | ){
85 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
86 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
87 |
88 | if(x < FFT_W && y < FFT_H){
89 | const int result_i = IMUL(FFT_H, x) + y;
90 | const int N = IMUL(FFT_W, FFT_H);
91 |
92 | convolutionResult[result_i] = convolutionPerFeature[result_i];
93 | for (int z = 1; z < FEATURE_DIM; z++){
94 | convolutionResult[result_i] +=
95 | convolutionPerFeature[IMUL(z, N) + result_i];
96 | }
97 | }
98 | }
99 |
100 | /*
101 | * Host code
102 | */
103 |
104 | ////////////////////////////////////////////////////////////////////////////////
105 | // Helper functions
106 | ////////////////////////////////////////////////////////////////////////////////
107 | //Round a / b to nearest higher integer value
108 | int iDivUp(int a, int b){
109 | return (a % b != 0) ? (a / b + 1) : (a / b);
110 | }
111 |
112 | //Align a to nearest higher multiple of b
113 | int iAlignUp(int a, int b){
114 | return (a % b != 0) ? (a - a % b + b) : a;
115 | }
116 |
117 |
118 | ////////////////////////////////////////////////////////////////////////////////
119 | // Mex Entry
120 | ////////////////////////////////////////////////////////////////////////////////
121 | void mexFunction(int nlhs, mxArray *plhs[],
122 | int nrhs, mxArray const *prhs[])
123 | {
124 | ConvPlan plan[N_MAX_PARALLEL];
125 |
126 | /* Declare all variables.*/
127 | const mxGPUArray *mxFFTData;
128 | const mxGPUArray *mxKernel;
129 | mxGPUArray *mxFFTKernel;
130 | mxGPUArray *mxConvolution;
131 |
132 | cufftComplex **d_CFFT_DATA_PER_GPU;
133 |
134 | /* concurrent kernel executions */
135 | int N_GPU;
136 | int N_BATCH_PER_GPU = 2;
137 |
138 | char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
139 |
140 | /* Choose a reasonably sized number of threads for the block. */
141 | int THREAD_PER_BLOCK_H = 16;
142 | int THREAD_PER_BLOCK_W = 8;
143 | int THREAD_PER_BLOCK_D = 8;
144 | int THREAD_PER_BLOCK_2D = 32;
145 |
146 | // const mwSize * mxKernel_Dim;
147 | const mwSize * mxFFT_Dim;
148 | // int MblocksPerGrid, NblocksPerGrid;
149 | int KERNEL_H, KERNEL_W, N_KERNEL,
150 | CFFT_H, CFFT_W, FFT_H, FFT_W, FEATURE_DIM,
151 | KERNEL_SIZE, CFFT_SIZE, FFT_SIZE, CONV_SIZE;
152 |
153 | int gpuIdx, streamIdx, planIdx;
154 |
155 | /* Initialize the MathWorks GPU API. */
156 | mxInitGPU();
157 |
158 | /* Throw an error if the input is not a GPU array. */
159 | if ( (nrhs < 2) || (nrhs > 3) || !mxIsGPUArray(prhs[0]) )
160 | mexErrMsgIdAndTxt(errId, "The data must be the FFT of a real array stored on the GPU");
161 |
162 | if (( nrhs == 3) && mxGetNumberOfElements(prhs[2]) != 4)
163 | mexErrMsgIdAndTxt(errId, "CUDA thread size must be 4 integers: THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nChoose sizes so that the total number of threads per block does not exceed MaxThreadsPerBlock");
164 |
165 | if ( nrhs == 3 ){
166 | const double* threadSize = (double *)mxGetData(prhs[2]);
167 | THREAD_PER_BLOCK_H = (int)threadSize[0];
168 | THREAD_PER_BLOCK_W = (int)threadSize[1];
169 | THREAD_PER_BLOCK_D = (int)threadSize[2];
170 | THREAD_PER_BLOCK_2D = (int)threadSize[3];
171 | if(debug) printf("Thread size: H=%d, W=%d, D=%d, 2D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D);
172 | }
173 |
174 | cudaDeviceProp dev;
175 | cudaGetDeviceProperties(&dev,0);
176 | int success = checkDeviceProp(dev);
177 |
178 | mxFFTData = mxGPUCreateFromMxArray(prhs[0]);
179 | mxFFT_Dim = mxGPUGetDimensions(mxFFTData);
180 |
181 | // FFT Dim
182 | // In CUDA, an R2C FFT stores only N/2 + 1 complex points along the transformed dimension, because the spectrum of a real signal is Hermitian-symmetric.
183 | CFFT_H = mxFFT_Dim[0];
184 | CFFT_W = mxFFT_Dim[1];
185 |
186 | FFT_H = (mxFFT_Dim[0] - 1) * 2;
187 | FFT_W = mxFFT_Dim[1];
188 |
189 | FEATURE_DIM = mxFFT_Dim[2];
190 |
191 | CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2);
192 | FFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float);
193 | CONV_SIZE = FFT_W * FFT_H * sizeof(float);
194 |
195 | if(debug) printf("FFT Data size: h=%d, w=%d, f=%d\n", FFT_H, FFT_W, FEATURE_DIM);
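// Numeric example (added): a 200 x 150 x 32 real input produces an R2C result of size
// (200/2 + 1) x 150 x 32 = 101 x 150 x 32, so CFFT_H = 101, FFT_H = (101 - 1) * 2 = 200,
// FFT_W = 150, FEATURE_DIM = 32.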
196 |
197 | if (mxGetClassID(prhs[1]) != mxCELL_CLASS)
198 | mexErrMsgIdAndTxt(errId, "Kernel must be a cell array");
199 |
200 | mwSize nKernel = mxGetNumberOfElements(prhs[1]);
201 | N_KERNEL = (int)nKernel;
202 | plhs[0] = mxCreateCellMatrix(1, N_KERNEL);
203 |
204 | if(debug) printf("N Kernel: %d\n", N_KERNEL);
205 |
206 |
207 | /* Set block size and thread size */
208 | dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D);
209 | dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x),
210 | iDivUp(FFT_H, threadBlock3D.y),
211 | iDivUp(FEATURE_DIM, threadBlock3D.z));
212 |
213 | dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D);
214 | dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x),
215 | iDivUp(FFT_H, threadBlock2D.y));
216 |
217 |
218 | /* Find number of cuda capable devices */
219 | CUDA_SAFE_CALL(cudaGetDeviceCount(&N_GPU));
220 | if(debug) printf( "CUDA-capable device count: %i\n", N_GPU);
221 |
222 | CUDA_SAFE_CALL(cudaSetDevice(0));
223 | d_CFFT_DATA_PER_GPU = (cufftComplex **)malloc(N_GPU * sizeof(cufftComplex *)); // one device pointer per GPU
224 |
225 | /* Pad Kernel */
226 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_PaddedKernel, FFT_SIZE));
227 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_IFFTEProd, FFT_SIZE));
228 |
229 | /* Create a GPUArray to hold the result and get its underlying pointer. */
230 | mwSize *FFT_dims = (mwSize *)mxMalloc(3 * sizeof(mwSize)); // three entries: H, W, FEATURE_DIM
231 | FFT_dims[0] = FFT_H;
232 | FFT_dims[1] = FFT_W;
233 | FFT_dims[2] = FEATURE_DIM;
234 |
235 | d_CFFT_DATA_PER_GPU[0] = (cufftComplex *)mxGPUGetDataReadOnly(mxFFTData);
236 |
237 | // mxConvolution = mxGPUCreateGPUArray(2,
238 | // FFT_dims, // Third element will not be accessed
239 | // mxSINGLE_CLASS,
240 | // mxREAL,
241 | // MX_GPU_DO_NOT_INITIALIZE);
242 |
243 | // d_CONVOLUTION = (cufftReal *)(mxGPUGetData(mxConvolution));
244 |
245 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE));
246 |
247 | // mxFFTKernel = mxGPUCreateGPUArray(3,
248 | // mxFFT_Dim,
249 | // mxSINGLE_CLASS,
250 | // mxCOMPLEX,
251 | // MX_GPU_DO_NOT_INITIALIZE);
252 |
253 | // d_CFFT_KERNEL = (cufftComplex *)(mxGPUGetData(mxFFTKernel));
254 |
255 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE));
256 |
257 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE));
258 |
259 | /* FFT Kernel */
260 | int BATCH = FEATURE_DIM;
261 | int FFT_Dims[] = { FFT_W, FFT_H };
262 | int CFFT_Dims[] = { CFFT_W, CFFT_H };
263 |
264 | int idist = FFT_W * FFT_H;
265 | int odist = CFFT_W * CFFT_H;
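// Layout note (added): each of the BATCH = FEATURE_DIM transforms covers one feature slice;
// consecutive input slices are idist = FFT_W * FFT_H real samples apart, and consecutive
// output slices are odist = CFFT_W * CFFT_H complex samples apart.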
266 |
267 | // mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize));
268 | // FFT_dims[0] = FFT_H;
269 | // FFT_dims[1] = FFT_W;
270 |
271 | N_GPU = 1; // force single-GPU execution; overrides the device count detected above
272 | //Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked)
273 | for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++)
274 | {
275 | // Set GPU
276 | CUDA_SAFE_CALL(cudaSetDevice(gpuIdx));
277 | // if (gpuIdx != 0) CUDA_SAFE_CALL();
278 | /* COPY mxFFTData to individual GPU */
279 | if (gpuIdx > 0) {
280 | if(debug) printf("start inter gpu copy from 0 to %d\n", gpuIdx);
281 | CUDA_SAFE_CALL(cudaMalloc((void **)&d_CFFT_DATA_PER_GPU[gpuIdx], CFFT_SIZE));
282 | CUDA_SAFE_CALL(cudaMemcpyPeerAsync(d_CFFT_DATA_PER_GPU[gpuIdx],
283 | gpuIdx,
284 | d_CFFT_DATA_PER_GPU[0],
285 | 0,
286 | CFFT_SIZE,
287 | plan[0].stream));
288 | if(debug) printf("end gpu copy from 0 to %d\n", gpuIdx);
289 | }
290 |
291 | // Set Streams
292 | for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){
293 | planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx;
294 |
295 | CUDA_SAFE_CALL(cudaStreamCreate(&plan[planIdx].stream));
296 |
297 | // Cufft Plans
298 | CUFFT_SAFE_CALL(cufftPlanMany(&plan[planIdx].FFTplan_R2C,
299 | 2, // rank
300 | FFT_Dims,
301 | FFT_Dims, 1, idist, // *inembed, istride, idist
302 | CFFT_Dims, 1, odist, // *onembed, ostride, odist
303 | CUFFT_R2C,
304 | BATCH)); // batch
305 | cufftSetStream(plan[planIdx].FFTplan_R2C, plan[planIdx].stream);
306 |
307 | CUFFT_SAFE_CALL(cufftPlanMany(&plan[planIdx].FFTplan_C2R,
308 | 2, // rank
309 | FFT_Dims,
310 | CFFT_Dims, 1, odist, // *inembed, istride, idist
311 | FFT_Dims, 1, idist, // *onembed, ostride, odist
312 | CUFFT_C2R,
313 | BATCH)); // batch
314 | cufftSetStream(plan[planIdx].FFTplan_C2R, plan[planIdx].stream);
315 |
316 | plan[planIdx].d_CFFT_DATA = d_CFFT_DATA_PER_GPU[gpuIdx];
317 |
318 | //Allocate memory
319 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_CFFT_KERNEL, CFFT_SIZE));
320 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_FFTEProd, CFFT_SIZE));
321 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_CONVOLUTION, CONV_SIZE));
322 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_IFFTEProd, FFT_SIZE));
323 | // d_Kernel, dynamically set
324 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_PaddedKernel, FFT_SIZE));
325 | // h_Kernel, dynamically set
326 | // CUDA_SAFE_CALL(cudaMallocHost((void **)&plan[planIdx].h_CONVOLUTION, CONV_SIZE));
327 | }
328 | }
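// Note (added): the streams, cuFFT plans, and device buffers created per plan above would
// normally be released with cudaStreamDestroy, cufftDestroy, and cudaFree before mexFunction
// returns.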
329 |
330 |
331 | /* For each kernel iterate */
332 | int N_PLANS = N_GPU * N_BATCH_PER_GPU;
333 | printf("N Plans %d\n",N_PLANS);
334 |
335 | int kernelIdx = 0;
336 | int lastPlanIdx;
337 |
338 | while(kernelIdx < N_KERNEL){
339 | if(debug) printf( "Kernel: %d\n",kernelIdx);
340 |
341 | for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++){
342 | if (kernelIdx >= N_KERNEL) break;
343 |
344 | // Set GPU
345 | CUDA_SAFE_CALL(cudaSetDevice(gpuIdx));
346 |
347 | // Set Streams
348 | for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){
349 | planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx;
350 |
351 | // Get Kernel Data
352 | const mxArray *mxCurrentCell = mxGetCell(prhs[1], kernelIdx);
353 | {
354 | if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 )
355 | mexErrMsgIdAndTxt(errId, "Each kernel must be a single-precision (float) array with three dimensions, i.e. more than one feature");
356 |
357 | if(debug) printf("Start plan %d\n", planIdx);
358 |
359 | plan[planIdx].h_Kernel = (float *)mxGetData(mxCurrentCell);
360 | plan[planIdx].mxKernel_Dim = mxGetDimensions(mxCurrentCell);
361 |
362 | // Kernel dimensions
363 | KERNEL_H = plan[planIdx].mxKernel_Dim[0];
364 | KERNEL_W = plan[planIdx].mxKernel_Dim[1];
365 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float);
366 |
367 | if(debug) printf("Start copy\n");
368 | // CUDA_SAFE_CALL(cudaHostRegister(plan[planIdx].h_Kernel, KERNEL_SIZE, cudaHostRegisterPortable));
369 | // CUDA_SAFE_CALL(cudaHostGetDevicePointer((void **) &plan[planIdx].d_Kernel, (void *)plan[planIdx].h_Kernel, 0));
370 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_Kernel, KERNEL_SIZE));
371 | CUDA_SAFE_CALL(cudaMemcpyAsync(plan[planIdx].d_Kernel, plan[planIdx].h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice, plan[planIdx].stream));
372 | // CUDA_SAFE_CALL(cudaMemcpy(plan[planIdx].d_Kernel, plan[planIdx].h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice));
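// Note (added): h_Kernel comes from mxGetData and is therefore pageable, so this cudaMemcpyAsync
// is staged through a pinned buffer and will not reliably overlap with kernel execution; pinning
// the buffer first (e.g. the cudaHostRegister call commented out above) would be needed for a
// truly asynchronous copy.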
373 | mxKernel = NULL;
374 | }
375 |
376 | if(debug) printf("Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W);
377 |
378 | if (FEATURE_DIM != plan[planIdx].mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ){
379 | mexErrMsgIdAndTxt(errId, "The kernel and data must have the same number of features, and the kernel must not be larger than the data");
380 | }
381 |
382 | // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
383 | if(debug) printf("Sync before padding\n");
384 | padData<<<dataBlockGrid3D, threadBlock3D, 0, plan[planIdx].stream>>>(
385 | plan[planIdx].d_PaddedKernel,
386 | plan[planIdx].d_Kernel,
387 | FFT_W,
388 | FFT_H,
389 | KERNEL_W,
390 | KERNEL_H,
391 | FEATURE_DIM
392 | );
393 | if(debug) printf("Padding done\n");
394 |
395 | CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
396 | CUFFT_SAFE_CALL(cufftExecR2C(plan[planIdx].FFTplan_R2C, plan[planIdx].d_PaddedKernel, plan[planIdx].d_CFFT_KERNEL));
397 | // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream));
398 |
399 | if(debug) printf("FFT done\n");
400 |
401 | /* Hadamard product, Element-wise multiplication in frequency domain */
402 | /* If the following is executed, compiling this file a second time causes a MATLAB error */
403 | elementwiseProductAndNormalize<<<dataBlockGrid3D, threadBlock3D, 0, plan[planIdx].stream>>>(