├── .gitignore ├── LICENSE ├── README.md ├── common ├── helper_cuda.h ├── helper_cuda_drvapi.h ├── helper_cuda_gl.h ├── helper_functions.h ├── helper_image.h ├── helper_math.h ├── helper_string.h └── helper_timer.h ├── compile.m ├── cuda_compile.m ├── demoCudaConvolutionFFT.m └── src ├── convolutionFFTkernel.cu ├── cudaConvFFTData.cu ├── cudaConvFFTData.cuh ├── cudaConvFFTData.h ├── cudaConvFFTDataStreams.cu ├── cudaConvolutionFFT.cu ├── cudaFFTData.cu └── cutil.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | *.o 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. 
We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. 
(Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CUDA-FFT-Convolution 2 | ============== 3 | 4 | Standard multi-threaded CPU convolution is very inefficient and slow for very large kernels. This package provides GPU convolution based on a Fast Fourier Transform (FFT) implementation in CUDA. 5 | 6 | Standard convolution in the time domain takes O(nm) time, whereas convolution in the frequency domain takes O((n+m) log (n+m)) time, where n is the data length and m is the kernel length. 7 | 8 | ## cudaConvolutionFFT.cu 9 | 10 | The main file takes the data, the maximum kernel height and width, and the convolution kernels (multiple kernels in cell format), and returns the convolution results that correspond to those kernels. 11 | 12 | ## Usage and Instructions 13 | 14 | 1. Clone the repo. 15 | 16 | ``` 17 | git clone http://github.com/chrischoy/MatlabCUDAConv 18 | ``` 19 | 20 | 2. Go to the repo directory, open MATLAB, and type 21 | 22 | ``` 23 | compile 24 | ``` 25 | 26 | 3. Run the demo. The demo file `demoCudaConvolutionFFT.m` contains detailed instructions and demo usage. 27 | 28 | 29 | ``` 30 | demoCudaConvolutionFFT 31 | ``` 32 | 33 | ## Output 34 | 35 | ![](https://dl.dropboxusercontent.com/u/57360783/cudafft_matlabfft_conv.png) 36 | 37 | ### More resources 38 | 39 | [http://chrischoy.org/projects/cuda-fft-convolution](http://chrischoy.org/projects/cuda-fft-convolution) 40 | -------------------------------------------------------------------------------- /common/helper_cuda.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited.
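The FFT-based convolution the README describes typically follows the standard cuFFT pattern: zero-pad the data and each kernel to a common FFT size (at least data size + kernel size - 1 per dimension for linear convolution), take forward real-to-complex FFTs, multiply point-wise in the frequency domain, and take one inverse FFT per kernel. The sketch below only illustrates that pattern; the names (`fftConvolve`, `modulateAndNormalize`) and the fixed 256x256 padded size are illustrative assumptions rather than this repo's actual interface, and error checking is omitted for brevity.

```
// Minimal sketch of FFT-domain convolution with cuFFT (illustrative, not the repo's API).
// Build with: nvcc fft_conv_sketch.cu -lcufft
#include <cuda_runtime.h>
#include <cufft.h>

// Point-wise complex multiply with normalization by 1/(fftH*fftW);
// this is the frequency-domain equivalent of spatial convolution.
__global__ void modulateAndNormalize(cufftComplex *d, const cufftComplex *k,
                                     int n, float scale)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    cufftComplex a = d[i], b = k[i];
    cufftComplex c;
    c.x = (a.x * b.x - a.y * b.y) * scale;
    c.y = (a.x * b.y + a.y * b.x) * scale;
    d[i] = c;
}

// Convolve zero-padded data with a zero-padded kernel, both fftH x fftW floats.
// The spatial-domain result overwrites d_paddedData.
void fftConvolve(float *d_paddedData, float *d_paddedKernel, int fftH, int fftW)
{
    int freqSize = fftH * (fftW / 2 + 1);   // size of the R2C spectrum
    cufftComplex *d_dataSpectrum, *d_kernelSpectrum;
    cudaMalloc((void **)&d_dataSpectrum,   freqSize * sizeof(cufftComplex));
    cudaMalloc((void **)&d_kernelSpectrum, freqSize * sizeof(cufftComplex));

    cufftHandle fwdPlan, invPlan;
    cufftPlan2d(&fwdPlan, fftH, fftW, CUFFT_R2C);
    cufftPlan2d(&invPlan, fftH, fftW, CUFFT_C2R);

    // Forward FFTs of the padded data and kernel.
    cufftExecR2C(fwdPlan, d_paddedData,   d_dataSpectrum);
    cufftExecR2C(fwdPlan, d_paddedKernel, d_kernelSpectrum);

    // Point-wise multiply in the frequency domain, then inverse FFT.
    int threads = 256, blocks = (freqSize + threads - 1) / threads;
    modulateAndNormalize<<<blocks, threads>>>(d_dataSpectrum, d_kernelSpectrum,
                                              freqSize, 1.0f / (float)(fftH * fftW));
    cufftExecC2R(invPlan, d_dataSpectrum, d_paddedData);

    cufftDestroy(fwdPlan);
    cufftDestroy(invPlan);
    cudaFree(d_dataSpectrum);
    cudaFree(d_kernelSpectrum);
}

int main()
{
    const int fftH = 256, fftW = 256;       // padded sizes; chosen arbitrarily here
    float *d_data, *d_kernel;
    cudaMalloc((void **)&d_data,   fftH * fftW * sizeof(float));
    cudaMalloc((void **)&d_kernel, fftH * fftW * sizeof(float));
    cudaMemset(d_data,   0, fftH * fftW * sizeof(float));
    cudaMemset(d_kernel, 0, fftH * fftW * sizeof(float));

    fftConvolve(d_data, d_kernel, fftH, fftW);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    cudaFree(d_kernel);
    return 0;
}
```

A caller would then crop the padded result back to the desired ('full', 'same', or 'valid') output size before returning it.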
9 | * 10 | */ 11 | 12 | //////////////////////////////////////////////////////////////////////////////// 13 | // These are CUDA Helper functions for initialization and error checking 14 | 15 | #ifndef HELPER_CUDA_H 16 | #define HELPER_CUDA_H 17 | 18 | #pragma once 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | /* 27 | inline void __ExitInTime(int seconds) 28 | { 29 | fprintf(stdout, "> exiting in %d seconds: ", seconds); 30 | fflush(stdout); 31 | time_t t; 32 | int count; 33 | 34 | for (t=time(0)+seconds, count=seconds; time(0) < t; count--) { 35 | fprintf(stdout, "%d...", count); 36 | #if defined(WIN32) 37 | Sleep(1000); 38 | #else 39 | sleep(1); 40 | #endif 41 | } 42 | 43 | fprintf(stdout,"done!\n\n"); 44 | fflush(stdout); 45 | } 46 | 47 | #define EXIT_TIME_DELAY 2 48 | 49 | inline void EXIT_DELAY(int return_code) 50 | { 51 | __ExitInTime(EXIT_TIME_DELAY); 52 | exit(return_code); 53 | } 54 | */ 55 | 56 | #ifndef EXIT_WAIVED 57 | #define EXIT_WAIVED 2 58 | #endif 59 | 60 | // Note, it is required that your SDK sample to include the proper header files, please 61 | // refer the CUDA examples for examples of the needed CUDA headers, which may change depending 62 | // on which CUDA functions are used. 63 | 64 | // CUDA Runtime error messages 65 | #ifdef __DRIVER_TYPES_H__ 66 | static const char *_cudaGetErrorEnum(cudaError_t error) 67 | { 68 | switch (error) 69 | { 70 | case cudaSuccess: 71 | return "cudaSuccess"; 72 | 73 | case cudaErrorMissingConfiguration: 74 | return "cudaErrorMissingConfiguration"; 75 | 76 | case cudaErrorMemoryAllocation: 77 | return "cudaErrorMemoryAllocation"; 78 | 79 | case cudaErrorInitializationError: 80 | return "cudaErrorInitializationError"; 81 | 82 | case cudaErrorLaunchFailure: 83 | return "cudaErrorLaunchFailure"; 84 | 85 | case cudaErrorPriorLaunchFailure: 86 | return "cudaErrorPriorLaunchFailure"; 87 | 88 | case cudaErrorLaunchTimeout: 89 | return "cudaErrorLaunchTimeout"; 90 | 91 | case cudaErrorLaunchOutOfResources: 92 | return "cudaErrorLaunchOutOfResources"; 93 | 94 | case cudaErrorInvalidDeviceFunction: 95 | return "cudaErrorInvalidDeviceFunction"; 96 | 97 | case cudaErrorInvalidConfiguration: 98 | return "cudaErrorInvalidConfiguration"; 99 | 100 | case cudaErrorInvalidDevice: 101 | return "cudaErrorInvalidDevice"; 102 | 103 | case cudaErrorInvalidValue: 104 | return "cudaErrorInvalidValue"; 105 | 106 | case cudaErrorInvalidPitchValue: 107 | return "cudaErrorInvalidPitchValue"; 108 | 109 | case cudaErrorInvalidSymbol: 110 | return "cudaErrorInvalidSymbol"; 111 | 112 | case cudaErrorMapBufferObjectFailed: 113 | return "cudaErrorMapBufferObjectFailed"; 114 | 115 | case cudaErrorUnmapBufferObjectFailed: 116 | return "cudaErrorUnmapBufferObjectFailed"; 117 | 118 | case cudaErrorInvalidHostPointer: 119 | return "cudaErrorInvalidHostPointer"; 120 | 121 | case cudaErrorInvalidDevicePointer: 122 | return "cudaErrorInvalidDevicePointer"; 123 | 124 | case cudaErrorInvalidTexture: 125 | return "cudaErrorInvalidTexture"; 126 | 127 | case cudaErrorInvalidTextureBinding: 128 | return "cudaErrorInvalidTextureBinding"; 129 | 130 | case cudaErrorInvalidChannelDescriptor: 131 | return "cudaErrorInvalidChannelDescriptor"; 132 | 133 | case cudaErrorInvalidMemcpyDirection: 134 | return "cudaErrorInvalidMemcpyDirection"; 135 | 136 | case cudaErrorAddressOfConstant: 137 | return "cudaErrorAddressOfConstant"; 138 | 139 | case cudaErrorTextureFetchFailed: 140 | return "cudaErrorTextureFetchFailed"; 141 | 142 | case cudaErrorTextureNotBound: 143 | 
return "cudaErrorTextureNotBound"; 144 | 145 | case cudaErrorSynchronizationError: 146 | return "cudaErrorSynchronizationError"; 147 | 148 | case cudaErrorInvalidFilterSetting: 149 | return "cudaErrorInvalidFilterSetting"; 150 | 151 | case cudaErrorInvalidNormSetting: 152 | return "cudaErrorInvalidNormSetting"; 153 | 154 | case cudaErrorMixedDeviceExecution: 155 | return "cudaErrorMixedDeviceExecution"; 156 | 157 | case cudaErrorCudartUnloading: 158 | return "cudaErrorCudartUnloading"; 159 | 160 | case cudaErrorUnknown: 161 | return "cudaErrorUnknown"; 162 | 163 | case cudaErrorNotYetImplemented: 164 | return "cudaErrorNotYetImplemented"; 165 | 166 | case cudaErrorMemoryValueTooLarge: 167 | return "cudaErrorMemoryValueTooLarge"; 168 | 169 | case cudaErrorInvalidResourceHandle: 170 | return "cudaErrorInvalidResourceHandle"; 171 | 172 | case cudaErrorNotReady: 173 | return "cudaErrorNotReady"; 174 | 175 | case cudaErrorInsufficientDriver: 176 | return "cudaErrorInsufficientDriver"; 177 | 178 | case cudaErrorSetOnActiveProcess: 179 | return "cudaErrorSetOnActiveProcess"; 180 | 181 | case cudaErrorInvalidSurface: 182 | return "cudaErrorInvalidSurface"; 183 | 184 | case cudaErrorNoDevice: 185 | return "cudaErrorNoDevice"; 186 | 187 | case cudaErrorECCUncorrectable: 188 | return "cudaErrorECCUncorrectable"; 189 | 190 | case cudaErrorSharedObjectSymbolNotFound: 191 | return "cudaErrorSharedObjectSymbolNotFound"; 192 | 193 | case cudaErrorSharedObjectInitFailed: 194 | return "cudaErrorSharedObjectInitFailed"; 195 | 196 | case cudaErrorUnsupportedLimit: 197 | return "cudaErrorUnsupportedLimit"; 198 | 199 | case cudaErrorDuplicateVariableName: 200 | return "cudaErrorDuplicateVariableName"; 201 | 202 | case cudaErrorDuplicateTextureName: 203 | return "cudaErrorDuplicateTextureName"; 204 | 205 | case cudaErrorDuplicateSurfaceName: 206 | return "cudaErrorDuplicateSurfaceName"; 207 | 208 | case cudaErrorDevicesUnavailable: 209 | return "cudaErrorDevicesUnavailable"; 210 | 211 | case cudaErrorInvalidKernelImage: 212 | return "cudaErrorInvalidKernelImage"; 213 | 214 | case cudaErrorNoKernelImageForDevice: 215 | return "cudaErrorNoKernelImageForDevice"; 216 | 217 | case cudaErrorIncompatibleDriverContext: 218 | return "cudaErrorIncompatibleDriverContext"; 219 | 220 | case cudaErrorPeerAccessAlreadyEnabled: 221 | return "cudaErrorPeerAccessAlreadyEnabled"; 222 | 223 | case cudaErrorPeerAccessNotEnabled: 224 | return "cudaErrorPeerAccessNotEnabled"; 225 | 226 | case cudaErrorDeviceAlreadyInUse: 227 | return "cudaErrorDeviceAlreadyInUse"; 228 | 229 | case cudaErrorProfilerDisabled: 230 | return "cudaErrorProfilerDisabled"; 231 | 232 | case cudaErrorProfilerNotInitialized: 233 | return "cudaErrorProfilerNotInitialized"; 234 | 235 | case cudaErrorProfilerAlreadyStarted: 236 | return "cudaErrorProfilerAlreadyStarted"; 237 | 238 | case cudaErrorProfilerAlreadyStopped: 239 | return "cudaErrorProfilerAlreadyStopped"; 240 | 241 | #if __CUDA_API_VERSION >= 0x4000 242 | 243 | case cudaErrorAssert: 244 | return "cudaErrorAssert"; 245 | 246 | case cudaErrorTooManyPeers: 247 | return "cudaErrorTooManyPeers"; 248 | 249 | case cudaErrorHostMemoryAlreadyRegistered: 250 | return "cudaErrorHostMemoryAlreadyRegistered"; 251 | 252 | case cudaErrorHostMemoryNotRegistered: 253 | return "cudaErrorHostMemoryNotRegistered"; 254 | #endif 255 | 256 | case cudaErrorStartupFailure: 257 | return "cudaErrorStartupFailure"; 258 | 259 | case cudaErrorApiFailureBase: 260 | return "cudaErrorApiFailureBase"; 261 | } 262 | 263 | return ""; 264 
| } 265 | #endif 266 | 267 | #ifdef __cuda_cuda_h__ 268 | // CUDA Driver API errors 269 | static const char *_cudaGetErrorEnum(CUresult error) 270 | { 271 | switch (error) 272 | { 273 | case CUDA_SUCCESS: 274 | return "CUDA_SUCCESS"; 275 | 276 | case CUDA_ERROR_INVALID_VALUE: 277 | return "CUDA_ERROR_INVALID_VALUE"; 278 | 279 | case CUDA_ERROR_OUT_OF_MEMORY: 280 | return "CUDA_ERROR_OUT_OF_MEMORY"; 281 | 282 | case CUDA_ERROR_NOT_INITIALIZED: 283 | return "CUDA_ERROR_NOT_INITIALIZED"; 284 | 285 | case CUDA_ERROR_DEINITIALIZED: 286 | return "CUDA_ERROR_DEINITIALIZED"; 287 | 288 | case CUDA_ERROR_PROFILER_DISABLED: 289 | return "CUDA_ERROR_PROFILER_DISABLED"; 290 | 291 | case CUDA_ERROR_PROFILER_NOT_INITIALIZED: 292 | return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; 293 | 294 | case CUDA_ERROR_PROFILER_ALREADY_STARTED: 295 | return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; 296 | 297 | case CUDA_ERROR_PROFILER_ALREADY_STOPPED: 298 | return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; 299 | 300 | case CUDA_ERROR_NO_DEVICE: 301 | return "CUDA_ERROR_NO_DEVICE"; 302 | 303 | case CUDA_ERROR_INVALID_DEVICE: 304 | return "CUDA_ERROR_INVALID_DEVICE"; 305 | 306 | case CUDA_ERROR_INVALID_IMAGE: 307 | return "CUDA_ERROR_INVALID_IMAGE"; 308 | 309 | case CUDA_ERROR_INVALID_CONTEXT: 310 | return "CUDA_ERROR_INVALID_CONTEXT"; 311 | 312 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: 313 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; 314 | 315 | case CUDA_ERROR_MAP_FAILED: 316 | return "CUDA_ERROR_MAP_FAILED"; 317 | 318 | case CUDA_ERROR_UNMAP_FAILED: 319 | return "CUDA_ERROR_UNMAP_FAILED"; 320 | 321 | case CUDA_ERROR_ARRAY_IS_MAPPED: 322 | return "CUDA_ERROR_ARRAY_IS_MAPPED"; 323 | 324 | case CUDA_ERROR_ALREADY_MAPPED: 325 | return "CUDA_ERROR_ALREADY_MAPPED"; 326 | 327 | case CUDA_ERROR_NO_BINARY_FOR_GPU: 328 | return "CUDA_ERROR_NO_BINARY_FOR_GPU"; 329 | 330 | case CUDA_ERROR_ALREADY_ACQUIRED: 331 | return "CUDA_ERROR_ALREADY_ACQUIRED"; 332 | 333 | case CUDA_ERROR_NOT_MAPPED: 334 | return "CUDA_ERROR_NOT_MAPPED"; 335 | 336 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: 337 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; 338 | 339 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER: 340 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; 341 | 342 | case CUDA_ERROR_ECC_UNCORRECTABLE: 343 | return "CUDA_ERROR_ECC_UNCORRECTABLE"; 344 | 345 | case CUDA_ERROR_UNSUPPORTED_LIMIT: 346 | return "CUDA_ERROR_UNSUPPORTED_LIMIT"; 347 | 348 | case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: 349 | return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; 350 | 351 | case CUDA_ERROR_INVALID_SOURCE: 352 | return "CUDA_ERROR_INVALID_SOURCE"; 353 | 354 | case CUDA_ERROR_FILE_NOT_FOUND: 355 | return "CUDA_ERROR_FILE_NOT_FOUND"; 356 | 357 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: 358 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; 359 | 360 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: 361 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; 362 | 363 | case CUDA_ERROR_OPERATING_SYSTEM: 364 | return "CUDA_ERROR_OPERATING_SYSTEM"; 365 | 366 | case CUDA_ERROR_INVALID_HANDLE: 367 | return "CUDA_ERROR_INVALID_HANDLE"; 368 | 369 | case CUDA_ERROR_NOT_FOUND: 370 | return "CUDA_ERROR_NOT_FOUND"; 371 | 372 | case CUDA_ERROR_NOT_READY: 373 | return "CUDA_ERROR_NOT_READY"; 374 | 375 | case CUDA_ERROR_LAUNCH_FAILED: 376 | return "CUDA_ERROR_LAUNCH_FAILED"; 377 | 378 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: 379 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; 380 | 381 | case CUDA_ERROR_LAUNCH_TIMEOUT: 382 | return "CUDA_ERROR_LAUNCH_TIMEOUT"; 383 | 384 | case 
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: 385 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; 386 | 387 | case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: 388 | return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; 389 | 390 | case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: 391 | return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; 392 | 393 | case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: 394 | return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; 395 | 396 | case CUDA_ERROR_CONTEXT_IS_DESTROYED: 397 | return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; 398 | 399 | case CUDA_ERROR_ASSERT: 400 | return "CUDA_ERROR_ASSERT"; 401 | 402 | case CUDA_ERROR_TOO_MANY_PEERS: 403 | return "CUDA_ERROR_TOO_MANY_PEERS"; 404 | 405 | case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: 406 | return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; 407 | 408 | case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: 409 | return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; 410 | 411 | case CUDA_ERROR_UNKNOWN: 412 | return "CUDA_ERROR_UNKNOWN"; 413 | } 414 | 415 | return ""; 416 | } 417 | #endif 418 | 419 | #ifdef CUBLAS_API_H_ 420 | // cuBLAS API errors 421 | static const char *_cudaGetErrorEnum(cublasStatus_t error) 422 | { 423 | switch (error) 424 | { 425 | case CUBLAS_STATUS_SUCCESS: 426 | return "CUBLAS_STATUS_SUCCESS"; 427 | 428 | case CUBLAS_STATUS_NOT_INITIALIZED: 429 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 430 | 431 | case CUBLAS_STATUS_ALLOC_FAILED: 432 | return "CUBLAS_STATUS_ALLOC_FAILED"; 433 | 434 | case CUBLAS_STATUS_INVALID_VALUE: 435 | return "CUBLAS_STATUS_INVALID_VALUE"; 436 | 437 | case CUBLAS_STATUS_ARCH_MISMATCH: 438 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 439 | 440 | case CUBLAS_STATUS_MAPPING_ERROR: 441 | return "CUBLAS_STATUS_MAPPING_ERROR"; 442 | 443 | case CUBLAS_STATUS_EXECUTION_FAILED: 444 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 445 | 446 | case CUBLAS_STATUS_INTERNAL_ERROR: 447 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 448 | } 449 | 450 | return ""; 451 | } 452 | #endif 453 | 454 | #ifdef _CUFFT_H_ 455 | // cuFFT API errors 456 | static const char *_cudaGetErrorEnum(cufftResult error) 457 | { 458 | switch (error) 459 | { 460 | case CUFFT_SUCCESS: 461 | return "CUFFT_SUCCESS"; 462 | 463 | case CUFFT_INVALID_PLAN: 464 | return "CUFFT_INVALID_PLAN"; 465 | 466 | case CUFFT_ALLOC_FAILED: 467 | return "CUFFT_ALLOC_FAILED"; 468 | 469 | case CUFFT_INVALID_TYPE: 470 | return "CUFFT_INVALID_TYPE"; 471 | 472 | case CUFFT_INVALID_VALUE: 473 | return "CUFFT_INVALID_VALUE"; 474 | 475 | case CUFFT_INTERNAL_ERROR: 476 | return "CUFFT_INTERNAL_ERROR"; 477 | 478 | case CUFFT_EXEC_FAILED: 479 | return "CUFFT_EXEC_FAILED"; 480 | 481 | case CUFFT_SETUP_FAILED: 482 | return "CUFFT_SETUP_FAILED"; 483 | 484 | case CUFFT_INVALID_SIZE: 485 | return "CUFFT_INVALID_SIZE"; 486 | 487 | case CUFFT_UNALIGNED_DATA: 488 | return "CUFFT_UNALIGNED_DATA"; 489 | } 490 | 491 | return ""; 492 | } 493 | #endif 494 | 495 | 496 | #ifdef CUSPARSEAPI 497 | // cuSPARSE API errors 498 | static const char *_cudaGetErrorEnum(cusparseStatus_t error) 499 | { 500 | switch (error) 501 | { 502 | case CUSPARSE_STATUS_SUCCESS: 503 | return "CUSPARSE_STATUS_SUCCESS"; 504 | 505 | case CUSPARSE_STATUS_NOT_INITIALIZED: 506 | return "CUSPARSE_STATUS_NOT_INITIALIZED"; 507 | 508 | case CUSPARSE_STATUS_ALLOC_FAILED: 509 | return "CUSPARSE_STATUS_ALLOC_FAILED"; 510 | 511 | case CUSPARSE_STATUS_INVALID_VALUE: 512 | return "CUSPARSE_STATUS_INVALID_VALUE"; 513 | 514 | case CUSPARSE_STATUS_ARCH_MISMATCH: 515 | return "CUSPARSE_STATUS_ARCH_MISMATCH"; 516 | 517 | case CUSPARSE_STATUS_MAPPING_ERROR: 518 | return 
"CUSPARSE_STATUS_MAPPING_ERROR"; 519 | 520 | case CUSPARSE_STATUS_EXECUTION_FAILED: 521 | return "CUSPARSE_STATUS_EXECUTION_FAILED"; 522 | 523 | case CUSPARSE_STATUS_INTERNAL_ERROR: 524 | return "CUSPARSE_STATUS_INTERNAL_ERROR"; 525 | 526 | case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: 527 | return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 528 | } 529 | 530 | return ""; 531 | } 532 | #endif 533 | 534 | #ifdef CURAND_H_ 535 | // cuRAND API errors 536 | static const char *_cudaGetErrorEnum(curandStatus_t error) 537 | { 538 | switch (error) 539 | { 540 | case CURAND_STATUS_SUCCESS: 541 | return "CURAND_STATUS_SUCCESS"; 542 | 543 | case CURAND_STATUS_VERSION_MISMATCH: 544 | return "CURAND_STATUS_VERSION_MISMATCH"; 545 | 546 | case CURAND_STATUS_NOT_INITIALIZED: 547 | return "CURAND_STATUS_NOT_INITIALIZED"; 548 | 549 | case CURAND_STATUS_ALLOCATION_FAILED: 550 | return "CURAND_STATUS_ALLOCATION_FAILED"; 551 | 552 | case CURAND_STATUS_TYPE_ERROR: 553 | return "CURAND_STATUS_TYPE_ERROR"; 554 | 555 | case CURAND_STATUS_OUT_OF_RANGE: 556 | return "CURAND_STATUS_OUT_OF_RANGE"; 557 | 558 | case CURAND_STATUS_LENGTH_NOT_MULTIPLE: 559 | return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; 560 | 561 | case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: 562 | return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; 563 | 564 | case CURAND_STATUS_LAUNCH_FAILURE: 565 | return "CURAND_STATUS_LAUNCH_FAILURE"; 566 | 567 | case CURAND_STATUS_PREEXISTING_FAILURE: 568 | return "CURAND_STATUS_PREEXISTING_FAILURE"; 569 | 570 | case CURAND_STATUS_INITIALIZATION_FAILED: 571 | return "CURAND_STATUS_INITIALIZATION_FAILED"; 572 | 573 | case CURAND_STATUS_ARCH_MISMATCH: 574 | return "CURAND_STATUS_ARCH_MISMATCH"; 575 | 576 | case CURAND_STATUS_INTERNAL_ERROR: 577 | return "CURAND_STATUS_INTERNAL_ERROR"; 578 | } 579 | 580 | return ""; 581 | } 582 | #endif 583 | 584 | #ifdef NV_NPPIDEFS_H 585 | // NPP API errors 586 | static const char *_cudaGetErrorEnum(NppStatus error) 587 | { 588 | switch (error) 589 | { 590 | case NPP_NOT_SUPPORTED_MODE_ERROR: 591 | return "NPP_NOT_SUPPORTED_MODE_ERROR"; 592 | 593 | case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: 594 | return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; 595 | 596 | case NPP_RESIZE_NO_OPERATION_ERROR: 597 | return "NPP_RESIZE_NO_OPERATION_ERROR"; 598 | 599 | case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: 600 | return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; 601 | 602 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 603 | 604 | case NPP_BAD_ARG_ERROR: 605 | return "NPP_BAD_ARGUMENT_ERROR"; 606 | 607 | case NPP_COEFF_ERROR: 608 | return "NPP_COEFFICIENT_ERROR"; 609 | 610 | case NPP_RECT_ERROR: 611 | return "NPP_RECTANGLE_ERROR"; 612 | 613 | case NPP_QUAD_ERROR: 614 | return "NPP_QUADRANGLE_ERROR"; 615 | 616 | case NPP_MEM_ALLOC_ERR: 617 | return "NPP_MEMORY_ALLOCATION_ERROR"; 618 | 619 | case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: 620 | return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; 621 | 622 | case NPP_INVALID_INPUT: 623 | return "NPP_INVALID_INPUT"; 624 | 625 | case NPP_POINTER_ERROR: 626 | return "NPP_POINTER_ERROR"; 627 | 628 | case NPP_WARNING: 629 | return "NPP_WARNING"; 630 | 631 | case NPP_ODD_ROI_WARNING: 632 | return "NPP_ODD_ROI_WARNING"; 633 | #else 634 | 635 | // These are for CUDA 5.5 or higher 636 | case NPP_BAD_ARGUMENT_ERROR: 637 | return "NPP_BAD_ARGUMENT_ERROR"; 638 | 639 | case NPP_COEFFICIENT_ERROR: 640 | return "NPP_COEFFICIENT_ERROR"; 641 | 642 | case NPP_RECTANGLE_ERROR: 643 | return "NPP_RECTANGLE_ERROR"; 644 | 645 | case NPP_QUADRANGLE_ERROR: 646 | return 
"NPP_QUADRANGLE_ERROR"; 647 | 648 | case NPP_MEMORY_ALLOCATION_ERR: 649 | return "NPP_MEMORY_ALLOCATION_ERROR"; 650 | 651 | case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: 652 | return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; 653 | 654 | case NPP_INVALID_HOST_POINTER_ERROR: 655 | return "NPP_INVALID_HOST_POINTER_ERROR"; 656 | 657 | case NPP_INVALID_DEVICE_POINTER_ERROR: 658 | return "NPP_INVALID_DEVICE_POINTER_ERROR"; 659 | #endif 660 | 661 | case NPP_LUT_NUMBER_OF_LEVELS_ERROR: 662 | return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; 663 | 664 | case NPP_TEXTURE_BIND_ERROR: 665 | return "NPP_TEXTURE_BIND_ERROR"; 666 | 667 | case NPP_WRONG_INTERSECTION_ROI_ERROR: 668 | return "NPP_WRONG_INTERSECTION_ROI_ERROR"; 669 | 670 | case NPP_NOT_EVEN_STEP_ERROR: 671 | return "NPP_NOT_EVEN_STEP_ERROR"; 672 | 673 | case NPP_INTERPOLATION_ERROR: 674 | return "NPP_INTERPOLATION_ERROR"; 675 | 676 | case NPP_RESIZE_FACTOR_ERROR: 677 | return "NPP_RESIZE_FACTOR_ERROR"; 678 | 679 | case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: 680 | return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; 681 | 682 | 683 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 684 | 685 | case NPP_MEMFREE_ERR: 686 | return "NPP_MEMFREE_ERR"; 687 | 688 | case NPP_MEMSET_ERR: 689 | return "NPP_MEMSET_ERR"; 690 | 691 | case NPP_MEMCPY_ERR: 692 | return "NPP_MEMCPY_ERROR"; 693 | 694 | case NPP_MIRROR_FLIP_ERR: 695 | return "NPP_MIRROR_FLIP_ERR"; 696 | #else 697 | 698 | case NPP_MEMFREE_ERROR: 699 | return "NPP_MEMFREE_ERROR"; 700 | 701 | case NPP_MEMSET_ERROR: 702 | return "NPP_MEMSET_ERROR"; 703 | 704 | case NPP_MEMCPY_ERROR: 705 | return "NPP_MEMCPY_ERROR"; 706 | 707 | case NPP_MIRROR_FLIP_ERROR: 708 | return "NPP_MIRROR_FLIP_ERROR"; 709 | #endif 710 | 711 | case NPP_ALIGNMENT_ERROR: 712 | return "NPP_ALIGNMENT_ERROR"; 713 | 714 | case NPP_STEP_ERROR: 715 | return "NPP_STEP_ERROR"; 716 | 717 | case NPP_SIZE_ERROR: 718 | return "NPP_SIZE_ERROR"; 719 | 720 | case NPP_NULL_POINTER_ERROR: 721 | return "NPP_NULL_POINTER_ERROR"; 722 | 723 | case NPP_CUDA_KERNEL_EXECUTION_ERROR: 724 | return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; 725 | 726 | case NPP_NOT_IMPLEMENTED_ERROR: 727 | return "NPP_NOT_IMPLEMENTED_ERROR"; 728 | 729 | case NPP_ERROR: 730 | return "NPP_ERROR"; 731 | 732 | case NPP_SUCCESS: 733 | return "NPP_SUCCESS"; 734 | 735 | case NPP_WRONG_INTERSECTION_QUAD_WARNING: 736 | return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; 737 | 738 | case NPP_MISALIGNED_DST_ROI_WARNING: 739 | return "NPP_MISALIGNED_DST_ROI_WARNING"; 740 | 741 | case NPP_AFFINE_QUAD_INCORRECT_WARNING: 742 | return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; 743 | 744 | case NPP_DOUBLE_SIZE_WARNING: 745 | return "NPP_DOUBLE_SIZE_WARNING"; 746 | 747 | case NPP_WRONG_INTERSECTION_ROI_WARNING: 748 | return "NPP_WRONG_INTERSECTION_ROI_WARNING"; 749 | } 750 | 751 | return ""; 752 | } 753 | #endif 754 | 755 | #ifdef __DRIVER_TYPES_H__ 756 | #ifndef DEVICE_RESET 757 | #define DEVICE_RESET cudaDeviceReset(); 758 | #endif 759 | #else 760 | #ifndef DEVICE_RESET 761 | #define DEVICE_RESET 762 | #endif 763 | #endif 764 | 765 | template< typename T > 766 | void check(T result, char const *const func, const char *const file, int const line) 767 | { 768 | if (result) 769 | { 770 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", 771 | file, line, static_cast(result), _cudaGetErrorEnum(result), func); 772 | DEVICE_RESET 773 | // Make sure we call CUDA Device Reset before exiting 774 | exit(EXIT_FAILURE); 775 | } 776 | } 777 | 778 | #ifdef __DRIVER_TYPES_H__ 779 | // This will output the 
proper CUDA error strings in the event that a CUDA host call returns an error 780 | #define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) 781 | 782 | // This will output the proper error string when calling cudaGetLastError 783 | #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) 784 | 785 | inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) 786 | { 787 | cudaError_t err = cudaGetLastError(); 788 | 789 | if (cudaSuccess != err) 790 | { 791 | fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", 792 | file, line, errorMessage, (int)err, cudaGetErrorString(err)); 793 | DEVICE_RESET 794 | exit(EXIT_FAILURE); 795 | } 796 | } 797 | #endif 798 | 799 | #ifndef MAX 800 | #define MAX(a,b) (a > b ? a : b) 801 | #endif 802 | 803 | // Beginning of GPU Architecture definitions 804 | inline int _ConvertSMVer2Cores(int major, int minor) 805 | { 806 | // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM 807 | typedef struct 808 | { 809 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version 810 | int Cores; 811 | } sSMtoCores; 812 | 813 | sSMtoCores nGpuArchCoresPerSM[] = 814 | { 815 | { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class 816 | { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class 817 | { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class 818 | { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class 819 | { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class 820 | { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class 821 | { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class 822 | { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class 823 | { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class 824 | { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class 825 | { -1, -1 } 826 | }; 827 | 828 | int index = 0; 829 | 830 | while (nGpuArchCoresPerSM[index].SM != -1) 831 | { 832 | if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) 833 | { 834 | return nGpuArchCoresPerSM[index].Cores; 835 | } 836 | 837 | index++; 838 | } 839 | 840 | // If we don't find the values, we default use the previous one to run properly 841 | printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores); 842 | return nGpuArchCoresPerSM[7].Cores; 843 | } 844 | // end of GPU Architecture definitions 845 | 846 | #ifdef __CUDA_RUNTIME_H__ 847 | // General GPU Device CUDA Initialization 848 | inline int gpuDeviceInit(int devID) 849 | { 850 | int device_count; 851 | checkCudaErrors(cudaGetDeviceCount(&device_count)); 852 | 853 | if (device_count == 0) 854 | { 855 | fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); 856 | exit(EXIT_FAILURE); 857 | } 858 | 859 | if (devID < 0) 860 | { 861 | devID = 0; 862 | } 863 | 864 | if (devID > device_count-1) 865 | { 866 | fprintf(stderr, "\n"); 867 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count); 868 | fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. 
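The `checkCudaErrors` and `getLastCudaError` macros defined above are meant to wrap every CUDA runtime call and kernel launch so that failures are reported with file and line before the program exits. A minimal usage sketch follows; the kernel, buffer name, and sizes are hypothetical and only illustrate the pattern, assuming `helper_cuda.h` and `helper_string.h` are on the include path.

```
// Illustrative usage of checkCudaErrors / getLastCudaError / findCudaDevice.
#include <cuda_runtime.h>
#include "helper_cuda.h"

__global__ void fillKernel(float *buf, int n, float value)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) buf[i] = value;
}

int main(int argc, char **argv)
{
    // Picks the device given by "-device=N", or the highest-GFLOPS GPU otherwise.
    int devID = findCudaDevice(argc, (const char **)argv);
    (void)devID;

    const int n = 1 << 20;
    float *d_buf = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_buf, n * sizeof(float)));  // aborts with file/line on error

    fillKernel<<<(n + 255) / 256, 256>>>(d_buf, n, 1.0f);
    getLastCudaError("fillKernel launch failed");                     // checks the asynchronous launch

    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaFree(d_buf));
    return 0;
}
```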
<<\n", devID); 869 | fprintf(stderr, "\n"); 870 | return -devID; 871 | } 872 | 873 | cudaDeviceProp deviceProp; 874 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); 875 | 876 | if (deviceProp.computeMode == cudaComputeModeProhibited) 877 | { 878 | fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); 879 | return -1; 880 | } 881 | 882 | if (deviceProp.major < 1) 883 | { 884 | fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); 885 | exit(EXIT_FAILURE); 886 | } 887 | 888 | checkCudaErrors(cudaSetDevice(devID)); 889 | printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); 890 | 891 | return devID; 892 | } 893 | 894 | // This function returns the best GPU (with maximum GFLOPS) 895 | inline int gpuGetMaxGflopsDeviceId() 896 | { 897 | int current_device = 0, sm_per_multiproc = 0; 898 | int max_perf_device = 0; 899 | int device_count = 0, best_SM_arch = 0; 900 | 901 | unsigned long long max_compute_perf = 0; 902 | cudaDeviceProp deviceProp; 903 | cudaGetDeviceCount(&device_count); 904 | 905 | checkCudaErrors(cudaGetDeviceCount(&device_count)); 906 | 907 | if (device_count == 0) 908 | { 909 | fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); 910 | exit(EXIT_FAILURE); 911 | } 912 | 913 | // Find the best major SM Architecture GPU device 914 | while (current_device < device_count) 915 | { 916 | cudaGetDeviceProperties(&deviceProp, current_device); 917 | 918 | // If this GPU is not running on Compute Mode prohibited, then we can add it to the list 919 | if (deviceProp.computeMode != cudaComputeModeProhibited) 920 | { 921 | if (deviceProp.major > 0 && deviceProp.major < 9999) 922 | { 923 | best_SM_arch = MAX(best_SM_arch, deviceProp.major); 924 | } 925 | } 926 | 927 | current_device++; 928 | } 929 | 930 | // Find the best CUDA capable GPU device 931 | current_device = 0; 932 | 933 | while (current_device < device_count) 934 | { 935 | cudaGetDeviceProperties(&deviceProp, current_device); 936 | 937 | // If this GPU is not running on Compute Mode prohibited, then we can add it to the list 938 | if (deviceProp.computeMode != cudaComputeModeProhibited) 939 | { 940 | if (deviceProp.major == 9999 && deviceProp.minor == 9999) 941 | { 942 | sm_per_multiproc = 1; 943 | } 944 | else 945 | { 946 | sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); 947 | } 948 | 949 | unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; 950 | 951 | if (compute_perf > max_compute_perf) 952 | { 953 | // If we find GPU with SM major > 2, search only these 954 | if (best_SM_arch > 2) 955 | { 956 | // If our device==dest_SM_arch, choose this, or else pass 957 | if (deviceProp.major == best_SM_arch) 958 | { 959 | max_compute_perf = compute_perf; 960 | max_perf_device = current_device; 961 | } 962 | } 963 | else 964 | { 965 | max_compute_perf = compute_perf; 966 | max_perf_device = current_device; 967 | } 968 | } 969 | } 970 | 971 | ++current_device; 972 | } 973 | 974 | return max_perf_device; 975 | } 976 | 977 | 978 | // Initialization code to find the best CUDA Device 979 | inline int findCudaDevice(int argc, const char **argv) 980 | { 981 | cudaDeviceProp deviceProp; 982 | int devID = 0; 983 | 984 | // If the command-line has a device number specified, use it 985 | if (checkCmdLineFlag(argc, argv, "device")) 986 | { 987 | devID = getCmdLineArgumentInt(argc, argv, "device="); 988 | 989 | if (devID < 0) 990 
| { 991 | printf("Invalid command line parameter\n "); 992 | exit(EXIT_FAILURE); 993 | } 994 | else 995 | { 996 | devID = gpuDeviceInit(devID); 997 | 998 | if (devID < 0) 999 | { 1000 | printf("exiting...\n"); 1001 | exit(EXIT_FAILURE); 1002 | } 1003 | } 1004 | } 1005 | else 1006 | { 1007 | // Otherwise pick the device with highest Gflops/s 1008 | devID = gpuGetMaxGflopsDeviceId(); 1009 | checkCudaErrors(cudaSetDevice(devID)); 1010 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); 1011 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 1012 | } 1013 | 1014 | return devID; 1015 | } 1016 | 1017 | // General check for CUDA GPU SM Capabilities 1018 | inline bool checkCudaCapabilities(int major_version, int minor_version) 1019 | { 1020 | cudaDeviceProp deviceProp; 1021 | deviceProp.major = 0; 1022 | deviceProp.minor = 0; 1023 | int dev; 1024 | 1025 | checkCudaErrors(cudaGetDevice(&dev)); 1026 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); 1027 | 1028 | if ((deviceProp.major > major_version) || 1029 | (deviceProp.major == major_version && deviceProp.minor >= minor_version)) 1030 | { 1031 | printf(" GPU Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor); 1032 | return true; 1033 | } 1034 | else 1035 | { 1036 | printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version); 1037 | return false; 1038 | } 1039 | } 1040 | #endif 1041 | 1042 | // end of CUDA Helper Functions 1043 | 1044 | 1045 | #endif 1046 | -------------------------------------------------------------------------------- /common/helper_cuda_drvapi.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | // Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects) 13 | #ifndef HELPER_CUDA_DRVAPI_H 14 | #define HELPER_CUDA_DRVAPI_H 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | #ifndef MAX 24 | #define MAX(a,b) (a > b ? 
a : b) 25 | #endif 26 | 27 | #ifndef EXIT_WAIVED 28 | #define EXIT_WAIVED 2 29 | #endif 30 | 31 | //////////////////////////////////////////////////////////////////////////////// 32 | // These are CUDA Helper functions 33 | 34 | // add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H 35 | #ifdef __cuda_cuda_h__ 36 | // This will output the proper CUDA error strings in the event that a CUDA host call returns an error 37 | #ifndef checkCudaErrors 38 | #define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) 39 | 40 | // These are the inline versions for all of the SDK helper functions 41 | inline void __checkCudaErrors(CUresult err, const char *file, const int line) 42 | { 43 | if (CUDA_SUCCESS != err) 44 | { 45 | fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n", 46 | err, getCudaDrvErrorString(err), file, line); 47 | exit(EXIT_FAILURE); 48 | } 49 | } 50 | #endif 51 | 52 | #ifdef getLastCudaDrvErrorMsg 53 | #undef getLastCudaDrvErrorMsg 54 | #endif 55 | 56 | #define getLastCudaDrvErrorMsg(msg) __getLastCudaDrvErrorMsg (msg, __FILE__, __LINE__) 57 | 58 | inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line) 59 | { 60 | CUresult err = cuCtxSynchronize(); 61 | 62 | if (CUDA_SUCCESS != err) 63 | { 64 | fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg); 65 | fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n", 66 | err, getCudaDrvErrorString(err), file, line); 67 | exit(EXIT_FAILURE); 68 | } 69 | } 70 | 71 | // This function wraps the CUDA Driver API into a template function 72 | template 73 | inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) 74 | { 75 | CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device); 76 | 77 | if (error_result != CUDA_SUCCESS) 78 | { 79 | printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result)); 80 | exit(EXIT_SUCCESS); 81 | } 82 | } 83 | #endif 84 | 85 | // Beginning of GPU Architecture definitions 86 | inline int _ConvertSMVer2CoresDRV(int major, int minor) 87 | { 88 | // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM 89 | typedef struct 90 | { 91 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version 92 | int Cores; 93 | } sSMtoCores; 94 | 95 | sSMtoCores nGpuArchCoresPerSM[] = 96 | { 97 | { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class 98 | { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class 99 | { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class 100 | { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class 101 | { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class 102 | { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class 103 | { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class 104 | { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class 105 | { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class 106 | { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class 107 | { -1, -1 } 108 | }; 109 | 110 | int index = 0; 111 | 112 | while (nGpuArchCoresPerSM[index].SM != -1) 113 | { 114 | if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) 115 | { 116 | return nGpuArchCoresPerSM[index].Cores; 117 | } 118 | 119 | index++; 120 | } 121 | 122 | // If we don't find the values, we default use the previous one to run properly 123 | printf("MapSMtoCores for SM %d.%d 
is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores); 124 | return nGpuArchCoresPerSM[7].Cores; 125 | } 126 | // end of GPU Architecture definitions 127 | 128 | #ifdef __cuda_cuda_h__ 129 | // General GPU Device CUDA Initialization 130 | inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) 131 | { 132 | int cuDevice = 0; 133 | int deviceCount = 0; 134 | CUresult err = cuInit(0); 135 | 136 | if (CUDA_SUCCESS == err) 137 | { 138 | checkCudaErrors(cuDeviceGetCount(&deviceCount)); 139 | } 140 | 141 | if (deviceCount == 0) 142 | { 143 | fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); 144 | exit(EXIT_FAILURE); 145 | } 146 | 147 | int dev = 0; 148 | dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device="); 149 | 150 | if (dev < 0) 151 | { 152 | dev = 0; 153 | } 154 | 155 | if (dev > deviceCount-1) 156 | { 157 | fprintf(stderr, "\n"); 158 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); 159 | fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev); 160 | fprintf(stderr, "\n"); 161 | return -dev; 162 | } 163 | 164 | checkCudaErrors(cuDeviceGet(&cuDevice, dev)); 165 | char name[100]; 166 | cuDeviceGetName(name, 100, cuDevice); 167 | 168 | int computeMode; 169 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); 170 | 171 | if (computeMode == CU_COMPUTEMODE_PROHIBITED) 172 | { 173 | fprintf(stderr, "Error: device is running in , no threads can use this CUDA Device.\n"); 174 | return -1; 175 | } 176 | 177 | if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false) 178 | { 179 | printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); 180 | } 181 | 182 | return dev; 183 | } 184 | 185 | // This function returns the best GPU based on performance 186 | inline int gpuGetMaxGflopsDeviceIdDRV() 187 | { 188 | CUdevice current_device = 0, max_perf_device = 0; 189 | int device_count = 0, sm_per_multiproc = 0; 190 | int max_compute_perf = 0, best_SM_arch = 0; 191 | int major = 0, minor = 0 , multiProcessorCount, clockRate; 192 | 193 | cuInit(0); 194 | checkCudaErrors(cuDeviceGetCount(&device_count)); 195 | 196 | if (device_count == 0) 197 | { 198 | fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); 199 | exit(EXIT_FAILURE); 200 | } 201 | 202 | // Find the best major SM Architecture GPU device 203 | while (current_device < device_count) 204 | { 205 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); 206 | 207 | if (major > 0 && major < 9999) 208 | { 209 | best_SM_arch = MAX(best_SM_arch, major); 210 | } 211 | 212 | current_device++; 213 | } 214 | 215 | // Find the best CUDA capable GPU device 216 | current_device = 0; 217 | 218 | while (current_device < device_count) 219 | { 220 | checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount, 221 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 222 | current_device)); 223 | checkCudaErrors(cuDeviceGetAttribute(&clockRate, 224 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 225 | current_device)); 226 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); 227 | 228 | int computeMode; 229 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); 230 | 231 | if (computeMode != CU_COMPUTEMODE_PROHIBITED) 232 | { 233 | if (major == 9999 && minor == 9999) 234 | { 235 | sm_per_multiproc = 1; 236 | } 237 | else 238 | { 239 | sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); 240 | } 241 | 242 | int 
compute_perf = multiProcessorCount * sm_per_multiproc * clockRate; 243 | 244 | if (compute_perf > max_compute_perf) 245 | { 246 | // If we find GPU with SM major > 2, search only these 247 | if (best_SM_arch > 2) 248 | { 249 | // If our device==dest_SM_arch, choose this, or else pass 250 | if (major == best_SM_arch) 251 | { 252 | max_compute_perf = compute_perf; 253 | max_perf_device = current_device; 254 | } 255 | } 256 | else 257 | { 258 | max_compute_perf = compute_perf; 259 | max_perf_device = current_device; 260 | } 261 | } 262 | } 263 | 264 | ++current_device; 265 | } 266 | 267 | return max_perf_device; 268 | } 269 | 270 | // This function returns the best Graphics GPU based on performance 271 | inline int gpuGetMaxGflopsGLDeviceIdDRV() 272 | { 273 | CUdevice current_device = 0, max_perf_device = 0; 274 | int device_count = 0, sm_per_multiproc = 0; 275 | int max_compute_perf = 0, best_SM_arch = 0; 276 | int major = 0, minor = 0, multiProcessorCount, clockRate; 277 | int bTCC = 0; 278 | char deviceName[256]; 279 | 280 | cuInit(0); 281 | checkCudaErrors(cuDeviceGetCount(&device_count)); 282 | 283 | if (device_count == 0) 284 | { 285 | fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n"); 286 | exit(EXIT_FAILURE); 287 | } 288 | 289 | // Find the best major SM Architecture GPU device that are graphics devices 290 | while (current_device < device_count) 291 | { 292 | checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); 293 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); 294 | 295 | #if CUDA_VERSION >= 3020 296 | checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device)); 297 | #else 298 | 299 | // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1 300 | if (deviceName[0] == 'T') 301 | { 302 | bTCC = 1; 303 | } 304 | 305 | #endif 306 | 307 | int computeMode; 308 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); 309 | 310 | if (computeMode != CU_COMPUTEMODE_PROHIBITED) 311 | { 312 | if (!bTCC) 313 | { 314 | if (major > 0 && major < 9999) 315 | { 316 | best_SM_arch = MAX(best_SM_arch, major); 317 | } 318 | } 319 | } 320 | 321 | current_device++; 322 | } 323 | 324 | // Find the best CUDA capable GPU device 325 | current_device = 0; 326 | 327 | while (current_device < device_count) 328 | { 329 | checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount, 330 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 331 | current_device)); 332 | checkCudaErrors(cuDeviceGetAttribute(&clockRate, 333 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 334 | current_device)); 335 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); 336 | 337 | #if CUDA_VERSION >= 3020 338 | checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device)); 339 | #else 340 | 341 | // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1 342 | if (deviceName[0] == 'T') 343 | { 344 | bTCC = 1; 345 | } 346 | 347 | #endif 348 | 349 | int computeMode; 350 | getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); 351 | 352 | if (computeMode != CU_COMPUTEMODE_PROHIBITED) 353 | { 354 | if (major == 9999 && minor == 9999) 355 | { 356 | sm_per_multiproc = 1; 357 | } 358 | else 359 | { 360 | sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); 361 | } 362 | 363 | // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor 364 | if (!bTCC) // Is this GPU running the TCC driver? 
If so we pass on this 365 | { 366 | int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate; 367 | 368 | if (compute_perf > max_compute_perf) 369 | { 370 | // If we find GPU with SM major > 2, search only these 371 | if (best_SM_arch > 2) 372 | { 373 | // If our device = dest_SM_arch, then we pick this one 374 | if (major == best_SM_arch) 375 | { 376 | max_compute_perf = compute_perf; 377 | max_perf_device = current_device; 378 | } 379 | } 380 | else 381 | { 382 | max_compute_perf = compute_perf; 383 | max_perf_device = current_device; 384 | } 385 | } 386 | } 387 | } 388 | 389 | ++current_device; 390 | } 391 | 392 | return max_perf_device; 393 | } 394 | 395 | // General initialization call to pick the best CUDA Device 396 | inline CUdevice findCudaDeviceDRV(int argc, const char **argv) 397 | { 398 | CUdevice cuDevice; 399 | int devID = 0; 400 | 401 | // If the command-line has a device number specified, use it 402 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) 403 | { 404 | devID = gpuDeviceInitDRV(argc, argv); 405 | 406 | if (devID < 0) 407 | { 408 | printf("exiting...\n"); 409 | exit(EXIT_SUCCESS); 410 | } 411 | } 412 | else 413 | { 414 | // Otherwise pick the device with highest Gflops/s 415 | char name[100]; 416 | devID = gpuGetMaxGflopsDeviceIdDRV(); 417 | checkCudaErrors(cuDeviceGet(&cuDevice, devID)); 418 | cuDeviceGetName(name, 100, cuDevice); 419 | printf("> Using CUDA Device [%d]: %s\n", devID, name); 420 | } 421 | 422 | cuDeviceGet(&cuDevice, devID); 423 | 424 | return cuDevice; 425 | } 426 | 427 | // This function will pick the best CUDA device available with OpenGL interop 428 | inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv) 429 | { 430 | CUdevice cuDevice; 431 | int devID = 0; 432 | 433 | // If the command-line has a device number specified, use it 434 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) 435 | { 436 | devID = gpuDeviceInitDRV(argc, (const char **)argv); 437 | 438 | if (devID < 0) 439 | { 440 | printf("no CUDA capable devices found, exiting...\n"); 441 | exit(EXIT_SUCCESS); 442 | } 443 | } 444 | else 445 | { 446 | char name[100]; 447 | // Otherwise pick the device with highest Gflops/s 448 | devID = gpuGetMaxGflopsGLDeviceIdDRV(); 449 | checkCudaErrors(cuDeviceGet(&cuDevice, devID)); 450 | cuDeviceGetName(name, 100, cuDevice); 451 | printf("> Using CUDA/GL Device [%d]: %s\n", devID, name); 452 | } 453 | 454 | return devID; 455 | } 456 | 457 | // General check for CUDA GPU SM Capabilities 458 | inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID) 459 | { 460 | CUdevice cuDevice; 461 | char name[256]; 462 | int major = 0, minor = 0; 463 | 464 | checkCudaErrors(cuDeviceGet(&cuDevice, devID)); 465 | checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); 466 | checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID)); 467 | 468 | if ((major > major_version) || 469 | (major == major_version && minor >= minor_version)) 470 | { 471 | printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor); 472 | return true; 473 | } 474 | else 475 | { 476 | printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version); 477 | return false; 478 | } 479 | } 480 | #endif 481 | 482 | // end of CUDA Helper Functions 483 | 484 | #endif 485 | -------------------------------------------------------------------------------- /common/helper_cuda_gl.h: 
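Usage note (added for illustration, not part of the original headers): the Driver API helpers shown above in common/helper_cuda_drvapi.h are normally consumed by a small host program that picks a device once at start-up and then creates a context on it. The sketch below is hypothetical except for the helper functions defined above and the standard CUDA Driver API calls; it assumes cuda.h and the helper headers are on the include path and the program links against the CUDA driver library (-lcuda).

// usage_sketch.cpp -- illustrative only, not part of the SDK helpers
#include <cuda.h>               // must come first so __cuda_cuda_h__ is defined
#include <cstdio>
#include "helper_cuda_drvapi.h" // checkCudaErrors, findCudaDeviceDRV, ... (also pulls in the command-line helpers)

int main(int argc, char **argv)
{
    // The helpers call cuInit(0) internally, but doing it explicitly keeps
    // the initialization order obvious.
    checkCudaErrors(cuInit(0));

    // Honors an optional "-device=N" flag; otherwise picks the device with
    // the highest multiProcessorCount * coresPerSM * clockRate product.
    CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);

    CUcontext ctx;
    checkCudaErrors(cuCtxCreate(&ctx, 0, dev));

    char name[100];
    checkCudaErrors(cuDeviceGetName(name, sizeof(name), dev));
    printf("Selected device: %s\n", name);

    // ... load a module / launch work through the Driver API here ...

    checkCudaErrors(cuCtxDestroy(ctx));
    return 0;
}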
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #ifndef HELPER_CUDA_GL_H 13 | #define HELPER_CUDA_GL_H 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | // includes, graphics 20 | #if defined (__APPLE__) || defined(MACOSX) 21 | #include 22 | #include 23 | #else 24 | #include 25 | #include 26 | #endif 27 | 28 | #ifndef EXIT_WAIVED 29 | #define EXIT_WAIVED 2 30 | #endif 31 | 32 | #ifdef __DRIVER_TYPES_H__ 33 | #ifndef DEVICE_RESET 34 | #define DEVICE_RESET cudaDeviceReset() 35 | #endif 36 | #else 37 | #ifndef DEVICE_RESET 38 | #define DEVICE_RESET 39 | #endif 40 | #endif 41 | 42 | #ifdef __CUDA_GL_INTEROP_H__ 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // These are CUDA OpenGL Helper functions 45 | 46 | inline int gpuGLDeviceInit(int ARGC, const char **ARGV) 47 | { 48 | int deviceCount; 49 | checkCudaErrors(cudaGetDeviceCount(&deviceCount)); 50 | 51 | if (deviceCount == 0) 52 | { 53 | fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); 54 | exit(EXIT_FAILURE); 55 | } 56 | 57 | int dev = 0; 58 | dev = getCmdLineArgumentInt(ARGC, ARGV, "device="); 59 | 60 | if (dev < 0) 61 | { 62 | dev = 0; 63 | } 64 | 65 | if (dev > deviceCount-1) 66 | { 67 | fprintf(stderr, "\n"); 68 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); 69 | fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev); 70 | fprintf(stderr, "\n"); 71 | return -dev; 72 | } 73 | 74 | cudaDeviceProp deviceProp; 75 | checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); 76 | 77 | if (deviceProp.computeMode == cudaComputeModeProhibited) 78 | { 79 | fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); 80 | return -1; 81 | } 82 | 83 | if (deviceProp.major < 1) 84 | { 85 | fprintf(stderr, "Error: device does not support CUDA.\n"); 86 | exit(EXIT_FAILURE); 87 | } 88 | 89 | if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false) 90 | { 91 | fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); 92 | } 93 | 94 | checkCudaErrors(cudaGLSetGLDevice(dev)); 95 | return dev; 96 | } 97 | 98 | // This function will pick the best CUDA device available with OpenGL interop 99 | inline int findCudaGLDevice(int argc, const char **argv) 100 | { 101 | int devID = 0; 102 | 103 | // If the command-line has a device number specified, use it 104 | if (checkCmdLineFlag(argc, (const char **)argv, "device")) 105 | { 106 | devID = gpuGLDeviceInit(argc, (const char **)argv); 107 | 108 | if (devID < 0) 109 | { 110 | printf("no CUDA capable devices found, exiting...\n"); 111 | DEVICE_RESET 112 | exit(EXIT_SUCCESS); 113 | } 114 | } 115 | else 116 | { 117 | // Otherwise pick the device with highest Gflops/s 118 | devID = gpuGetMaxGflopsDeviceId(); 119 | cudaGLSetGLDevice(devID); 120 | } 121 | 122 | return devID; 123 | } 124 | 125 | //////////////////////////////////////////////////////////////////////////// 126 | //! Check for OpenGL error 127 | //! @return bool if no GL error has been encountered, otherwise 0 128 | //! 
@param file __FILE__ macro 129 | //! @param line __LINE__ macro 130 | //! @note The GL error is listed on stderr 131 | //! @note This function should be used via the CHECK_ERROR_GL() macro 132 | //////////////////////////////////////////////////////////////////////////// 133 | inline bool 134 | sdkCheckErrorGL(const char *file, const int line) 135 | { 136 | bool ret_val = true; 137 | 138 | // check for error 139 | GLenum gl_error = glGetError(); 140 | 141 | if (gl_error != GL_NO_ERROR) 142 | { 143 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 144 | char tmpStr[512]; 145 | // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line 146 | // when the user double clicks on the error line in the Output pane. Like any compile error. 147 | sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error)); 148 | fprintf(stderr, "%s", tmpStr); 149 | #endif 150 | fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line); 151 | fprintf(stderr, "%s\n", gluErrorString(gl_error)); 152 | ret_val = false; 153 | } 154 | 155 | return ret_val; 156 | } 157 | 158 | #define SDK_CHECK_ERROR_GL() \ 159 | if( false == sdkCheckErrorGL( __FILE__, __LINE__)) { \ 160 | DEVICE_RESET \ 161 | exit(EXIT_FAILURE); \ 162 | } 163 | #endif 164 | 165 | #endif 166 | -------------------------------------------------------------------------------- /common/helper_functions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | // These are helper functions for the SDK samples (string parsing, timers, image helpers, etc) 13 | #ifndef HELPER_FUNCTIONS_H 14 | #define HELPER_FUNCTIONS_H 15 | 16 | #ifdef WIN32 17 | #pragma warning(disable:4996) 18 | #endif 19 | 20 | // includes, project 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | // includes, timer, string parsing, image helpers 34 | #include // helper functions for timers 35 | #include // helper functions for string parsing 36 | #include // helper functions for image compare, dump, data comparisons 37 | 38 | #ifndef EXIT_WAIVED 39 | #define EXIT_WAIVED 2 40 | #endif 41 | 42 | #endif // HELPER_FUNCTIONS_H 43 | -------------------------------------------------------------------------------- /common/helper_image.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | 12 | // These are helper functions for the SDK samples (image,bitmap) 13 | #ifndef HELPER_IMAGE_H 14 | #define HELPER_IMAGE_H 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | #ifndef MIN 27 | #define MIN(a,b) ((a < b) ? a : b) 28 | #endif 29 | #ifndef MAX 30 | #define MAX(a,b) ((a > b) ? a : b) 31 | #endif 32 | 33 | #ifndef EXIT_WAIVED 34 | #define EXIT_WAIVED 2 35 | #endif 36 | 37 | #include 38 | 39 | // namespace unnamed (internal) 40 | namespace 41 | { 42 | //! size of PGM file header 43 | const unsigned int PGMHeaderSize = 0x40; 44 | 45 | // types 46 | 47 | //! Data converter from unsigned char / unsigned byte to type T 48 | template 49 | struct ConverterFromUByte; 50 | 51 | //! Data converter from unsigned char / unsigned byte 52 | template<> 53 | struct ConverterFromUByte 54 | { 55 | //! Conversion operator 56 | //! @return converted value 57 | //! @param val value to convert 58 | float operator()(const unsigned char &val) 59 | { 60 | return static_cast(val); 61 | } 62 | }; 63 | 64 | //! Data converter from unsigned char / unsigned byte to float 65 | template<> 66 | struct ConverterFromUByte 67 | { 68 | //! Conversion operator 69 | //! @return converted value 70 | //! @param val value to convert 71 | float operator()(const unsigned char &val) 72 | { 73 | return static_cast(val) / 255.0f; 74 | } 75 | }; 76 | 77 | //! Data converter from unsigned char / unsigned byte to type T 78 | template 79 | struct ConverterToUByte; 80 | 81 | //! Data converter from unsigned char / unsigned byte to unsigned int 82 | template<> 83 | struct ConverterToUByte 84 | { 85 | //! Conversion operator (essentially a passthru 86 | //! @return converted value 87 | //! @param val value to convert 88 | unsigned char operator()(const unsigned char &val) 89 | { 90 | return val; 91 | } 92 | }; 93 | 94 | //! Data converter from unsigned char / unsigned byte to unsigned int 95 | template<> 96 | struct ConverterToUByte 97 | { 98 | //! Conversion operator 99 | //! @return converted value 100 | //! 
@param val value to convert 101 | unsigned char operator()(const float &val) 102 | { 103 | return static_cast(val * 255.0f); 104 | } 105 | }; 106 | } 107 | 108 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 109 | #ifndef FOPEN 110 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) 111 | #endif 112 | #ifndef FOPEN_FAIL 113 | #define FOPEN_FAIL(result) (result != 0) 114 | #endif 115 | #ifndef SSCANF 116 | #define SSCANF sscanf_s 117 | #endif 118 | #else 119 | #ifndef FOPEN 120 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) 121 | #endif 122 | #ifndef FOPEN_FAIL 123 | #define FOPEN_FAIL(result) (result == NULL) 124 | #endif 125 | #ifndef SSCANF 126 | #define SSCANF sscanf 127 | #endif 128 | #endif 129 | 130 | inline bool 131 | __loadPPM(const char *file, unsigned char **data, 132 | unsigned int *w, unsigned int *h, unsigned int *channels) 133 | { 134 | FILE *fp = NULL; 135 | 136 | if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) 137 | { 138 | std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; 139 | return false; 140 | } 141 | 142 | // check header 143 | char header[PGMHeaderSize]; 144 | 145 | if (fgets(header, PGMHeaderSize, fp) == NULL) 146 | { 147 | std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; 148 | return false; 149 | } 150 | 151 | if (strncmp(header, "P5", 2) == 0) 152 | { 153 | *channels = 1; 154 | } 155 | else if (strncmp(header, "P6", 2) == 0) 156 | { 157 | *channels = 3; 158 | } 159 | else 160 | { 161 | std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; 162 | *channels = 0; 163 | return false; 164 | } 165 | 166 | // parse header, read maxval, width and height 167 | unsigned int width = 0; 168 | unsigned int height = 0; 169 | unsigned int maxval = 0; 170 | unsigned int i = 0; 171 | 172 | while (i < 3) 173 | { 174 | if (fgets(header, PGMHeaderSize, fp) == NULL) 175 | { 176 | std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; 177 | return false; 178 | } 179 | 180 | if (header[0] == '#') 181 | { 182 | continue; 183 | } 184 | 185 | if (i == 0) 186 | { 187 | i += SSCANF(header, "%u %u %u", &width, &height, &maxval); 188 | } 189 | else if (i == 1) 190 | { 191 | i += SSCANF(header, "%u %u", &height, &maxval); 192 | } 193 | else if (i == 2) 194 | { 195 | i += SSCANF(header, "%u", &maxval); 196 | } 197 | } 198 | 199 | // check if given handle for the data is initialized 200 | if (NULL != *data) 201 | { 202 | if (*w != width || *h != height) 203 | { 204 | std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; 205 | } 206 | } 207 | else 208 | { 209 | *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels); 210 | *w = width; 211 | *h = height; 212 | } 213 | 214 | // read and close file 215 | if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0) 216 | { 217 | std::cerr << "__LoadPPM() read data returned error." 
<< std::endl; 218 | } 219 | 220 | fclose(fp); 221 | 222 | return true; 223 | } 224 | 225 | template 226 | inline bool 227 | sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h) 228 | { 229 | unsigned char *idata = NULL; 230 | unsigned int channels; 231 | 232 | if (true != __loadPPM(file, &idata, w, h, &channels)) 233 | { 234 | return false; 235 | } 236 | 237 | unsigned int size = *w **h * channels; 238 | 239 | // initialize mem if necessary 240 | // the correct size is checked / set in loadPGMc() 241 | if (NULL == *data) 242 | { 243 | *data = (T *) malloc(sizeof(T) * size); 244 | } 245 | 246 | // copy and cast data 247 | std::transform(idata, idata + size, *data, ConverterFromUByte()); 248 | 249 | free(idata); 250 | 251 | return true; 252 | } 253 | 254 | template 255 | inline bool 256 | sdkLoadPPM4(const char *file, T **data, 257 | unsigned int *w,unsigned int *h) 258 | { 259 | unsigned char *idata = 0; 260 | unsigned int channels; 261 | 262 | if (__loadPPM(file, &idata, w, h, &channels)) 263 | { 264 | // pad 4th component 265 | int size = *w **h; 266 | // keep the original pointer 267 | unsigned char *idata_orig = idata; 268 | *data = (T *) malloc(sizeof(T) * size * 4); 269 | unsigned char *ptr = *data; 270 | 271 | for (int i=0; i 0); 295 | assert(h > 0); 296 | 297 | std::fstream fh(file, std::fstream::out | std::fstream::binary); 298 | 299 | if (fh.bad()) 300 | { 301 | std::cerr << "__savePPM() : Opening file failed." << std::endl; 302 | return false; 303 | } 304 | 305 | if (channels == 1) 306 | { 307 | fh << "P5\n"; 308 | } 309 | else if (channels == 3) 310 | { 311 | fh << "P6\n"; 312 | } 313 | else 314 | { 315 | std::cerr << "__savePPM() : Invalid number of channels." << std::endl; 316 | return false; 317 | } 318 | 319 | fh << w << "\n" << h << "\n" << 0xff << std::endl; 320 | 321 | for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i) 322 | { 323 | fh << data[i]; 324 | } 325 | 326 | fh.flush(); 327 | 328 | if (fh.bad()) 329 | { 330 | std::cerr << "__savePPM() : Writing data failed." 
<< std::endl; 331 | return false; 332 | } 333 | 334 | fh.close(); 335 | 336 | return true; 337 | } 338 | 339 | template 340 | inline bool 341 | sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h) 342 | { 343 | unsigned int size = w * h; 344 | unsigned char *idata = 345 | (unsigned char *) malloc(sizeof(unsigned char) * size); 346 | 347 | std::transform(data, data + size, idata, ConverterToUByte()); 348 | 349 | // write file 350 | bool result = __savePPM(file, idata, w, h, 1); 351 | 352 | // cleanup 353 | free(idata); 354 | 355 | return result; 356 | } 357 | 358 | inline bool 359 | sdkSavePPM4ub(const char *file, unsigned char *data, 360 | unsigned int w, unsigned int h) 361 | { 362 | // strip 4th component 363 | int size = w * h; 364 | unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3); 365 | unsigned char *ptr = ndata; 366 | 367 | for (int i=0; i 390 | inline bool 391 | sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) 392 | { 393 | // check input arguments 394 | assert(NULL != filename); 395 | assert(NULL != len); 396 | 397 | // intermediate storage for the data read 398 | std::vector data_read; 399 | 400 | // open file for reading 401 | FILE *fh = NULL; 402 | 403 | // check if filestream is valid 404 | if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) 405 | { 406 | printf("Unable to open input file: %s\n", filename); 407 | return false; 408 | } 409 | 410 | // read all data elements 411 | T token; 412 | 413 | while (!feof(fh)) 414 | { 415 | fscanf(fh, "%f", &token); 416 | data_read.push_back(token); 417 | } 418 | 419 | // the last element is read twice 420 | data_read.pop_back(); 421 | fclose(fh); 422 | 423 | // check if the given handle is already initialized 424 | if (NULL != *data) 425 | { 426 | if (*len != data_read.size()) 427 | { 428 | std::cerr << "sdkReadFile() : Initialized memory given but " 429 | << "size mismatch with signal read " 430 | << "(data read / data init = " << (unsigned int)data_read.size() 431 | << " / " << *len << ")" << std::endl; 432 | 433 | return false; 434 | } 435 | } 436 | else 437 | { 438 | // allocate storage for the data read 439 | *data = (T *) malloc(sizeof(T) * data_read.size()); 440 | // store signal size 441 | *len = static_cast(data_read.size()); 442 | } 443 | 444 | // copy data 445 | memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); 446 | 447 | return true; 448 | } 449 | 450 | ////////////////////////////////////////////////////////////////////////////// 451 | //! Read file \filename and return the data 452 | //! @return bool if reading the file succeeded, otherwise false 453 | //! @param filename name of the source file 454 | //! @param data uninitialized pointer, returned initialized and pointing to 455 | //! the data read 456 | //! @param len number of data elements in data, -1 on error 457 | ////////////////////////////////////////////////////////////////////////////// 458 | template 459 | inline bool 460 | sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose) 461 | { 462 | // check input arguments 463 | assert(NULL != filename); 464 | assert(NULL != len); 465 | 466 | // open file for reading 467 | FILE *fh = fopen(filename, "rb"); 468 | 469 | if (fh == NULL && verbose) 470 | { 471 | std::cerr << "sdkReadFile() : Opening file failed." 
<< std::endl; 472 | return false; 473 | } 474 | 475 | // check if the given handle is already initialized 476 | // allocate storage for the data read 477 | data[block_num] = (T *) malloc(block_size); 478 | 479 | // read all data elements 480 | fseek(fh, block_num * block_size, SEEK_SET); 481 | *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh); 482 | 483 | fclose(fh); 484 | 485 | return true; 486 | } 487 | 488 | ////////////////////////////////////////////////////////////////////////////// 489 | //! Write a data file \filename 490 | //! @return true if writing the file succeeded, otherwise false 491 | //! @param filename name of the source file 492 | //! @param data data to write 493 | //! @param len number of data elements in data, -1 on error 494 | //! @param epsilon epsilon for comparison 495 | ////////////////////////////////////////////////////////////////////////////// 496 | template 497 | inline bool 498 | sdkWriteFile(const char *filename, const T *data, unsigned int len, 499 | const S epsilon, bool verbose, bool append = false) 500 | { 501 | assert(NULL != filename); 502 | assert(NULL != data); 503 | 504 | // open file for writing 505 | // if (append) { 506 | std::fstream fh(filename, std::fstream::out | std::fstream::ate); 507 | 508 | if (verbose) 509 | { 510 | std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl; 511 | } 512 | 513 | /* } else { 514 | std::fstream fh(filename, std::fstream::out); 515 | if (verbose) { 516 | std::cerr << "sdkWriteFile() : Open file " << filename << " for write." << std::endl; 517 | } 518 | } 519 | */ 520 | 521 | // check if filestream is valid 522 | if (! fh.good()) 523 | { 524 | if (verbose) 525 | { 526 | std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; 527 | } 528 | 529 | return false; 530 | } 531 | 532 | // first write epsilon 533 | fh << "# " << epsilon << "\n"; 534 | 535 | // write data 536 | for (unsigned int i = 0; (i < len) && (fh.good()); ++i) 537 | { 538 | fh << data[i] << ' '; 539 | } 540 | 541 | // Check if writing succeeded 542 | if (! fh.good()) 543 | { 544 | if (verbose) 545 | { 546 | std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; 547 | } 548 | 549 | return false; 550 | } 551 | 552 | // file ends with nl 553 | fh << std::endl; 554 | 555 | return true; 556 | } 557 | 558 | ////////////////////////////////////////////////////////////////////////////// 559 | //! Compare two arrays of arbitrary type 560 | //! @return true if \a reference and \a data are identical, otherwise false 561 | //! @param reference timer_interface to the reference data / gold image 562 | //! @param data handle to the computed data 563 | //! @param len number of elements in reference and data 564 | //! @param epsilon epsilon to use for the comparison 565 | ////////////////////////////////////////////////////////////////////////////// 566 | template 567 | inline bool 568 | compareData(const T *reference, const T *data, const unsigned int len, 569 | const S epsilon, const float threshold) 570 | { 571 | assert(epsilon >= 0); 572 | 573 | bool result = true; 574 | unsigned int error_count = 0; 575 | 576 | for (unsigned int i = 0; i < len; ++i) 577 | { 578 | float diff = (float)reference[i] - (float)data[i]; 579 | bool comp = (diff <= epsilon) && (diff >= -epsilon); 580 | result &= comp; 581 | 582 | error_count += !comp; 583 | 584 | #if 0 585 | 586 | if (! 
comp) 587 | { 588 | std::cerr << "ERROR, i = " << i << ",\t " 589 | << reference[i] << " / " 590 | << data[i] 591 | << " (reference / data)\n"; 592 | } 593 | 594 | #endif 595 | } 596 | 597 | if (threshold == 0.0f) 598 | { 599 | return (result) ? true : false; 600 | } 601 | else 602 | { 603 | if (error_count) 604 | { 605 | printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count); 606 | } 607 | 608 | return (len*threshold > error_count) ? true : false; 609 | } 610 | } 611 | 612 | #ifndef __MIN_EPSILON_ERROR 613 | #define __MIN_EPSILON_ERROR 1e-3f 614 | #endif 615 | 616 | ////////////////////////////////////////////////////////////////////////////// 617 | //! Compare two arrays of arbitrary type 618 | //! @return true if \a reference and \a data are identical, otherwise false 619 | //! @param reference handle to the reference data / gold image 620 | //! @param data handle to the computed data 621 | //! @param len number of elements in reference and data 622 | //! @param epsilon epsilon to use for the comparison 623 | //! @param epsilon threshold % of (# of bytes) for pass/fail 624 | ////////////////////////////////////////////////////////////////////////////// 625 | template 626 | inline bool 627 | compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len, 628 | const S epsilon, const float threshold) 629 | { 630 | assert(epsilon >= 0); 631 | 632 | // If we set epsilon to be 0, let's set a minimum threshold 633 | float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); 634 | int error_count = 0; 635 | bool result = true; 636 | 637 | for (unsigned int i = 0; i < len; ++i) 638 | { 639 | float diff = fabs((float)reference[i] - (float)data[i]); 640 | bool comp = (diff < max_error); 641 | result &= comp; 642 | 643 | if (! comp) 644 | { 645 | error_count++; 646 | #if 0 647 | 648 | if (error_count < 50) 649 | { 650 | printf("\n ERROR(epsilon=%4.3f), i=%d, (ref)0x%02x / (data)0x%02x / (diff)%d\n", 651 | max_error, i, 652 | *(unsigned int *)&reference[i], 653 | *(unsigned int *)&data[i], 654 | (unsigned int)diff); 655 | } 656 | 657 | #endif 658 | } 659 | } 660 | 661 | if (threshold == 0.0f) 662 | { 663 | if (error_count) 664 | { 665 | printf("total # of errors = %d\n", error_count); 666 | } 667 | 668 | return (error_count == 0) ? true : false; 669 | } 670 | else 671 | { 672 | if (error_count) 673 | { 674 | printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count); 675 | } 676 | 677 | return ((len*threshold > error_count) ? 
true : false); 678 | } 679 | } 680 | 681 | inline 682 | void sdkDumpBin(void *data, unsigned int bytes, const char *filename) 683 | { 684 | printf("sdkDumpBin: <%s>\n", filename); 685 | FILE *fp; 686 | FOPEN(fp, filename, "wb"); 687 | fwrite(data, bytes, 1, fp); 688 | fflush(fp); 689 | fclose(fp); 690 | } 691 | 692 | inline 693 | bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) 694 | { 695 | unsigned int *src_buffer, *ref_buffer; 696 | FILE *src_fp = NULL, *ref_fp = NULL; 697 | 698 | unsigned long error_count = 0; 699 | size_t fsize = 0; 700 | 701 | if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) 702 | { 703 | printf("compareBin2Bin unable to open src_file: %s\n", src_file); 704 | error_count++; 705 | } 706 | 707 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path); 708 | 709 | if (ref_file_path == NULL) 710 | { 711 | printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path); 712 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); 713 | printf("Aborting comparison!\n"); 714 | printf(" FAILED\n"); 715 | error_count++; 716 | 717 | if (src_fp) 718 | { 719 | fclose(src_fp); 720 | } 721 | 722 | if (ref_fp) 723 | { 724 | fclose(ref_fp); 725 | } 726 | } 727 | else 728 | { 729 | if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) 730 | { 731 | printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path); 732 | error_count++; 733 | } 734 | 735 | if (src_fp && ref_fp) 736 | { 737 | src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); 738 | ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); 739 | 740 | fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); 741 | fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); 742 | 743 | printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); 744 | printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); 745 | printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); 746 | 747 | if (!compareData(ref_buffer, src_buffer, nelements, epsilon, threshold)) 748 | { 749 | error_count++; 750 | } 751 | 752 | fclose(src_fp); 753 | fclose(ref_fp); 754 | 755 | free(src_buffer); 756 | free(ref_buffer); 757 | } 758 | else 759 | { 760 | if (src_fp) 761 | { 762 | fclose(src_fp); 763 | } 764 | 765 | if (ref_fp) 766 | { 767 | fclose(ref_fp); 768 | } 769 | } 770 | } 771 | 772 | if (error_count == 0) 773 | { 774 | printf(" OK\n"); 775 | } 776 | else 777 | { 778 | printf(" FAILURE: %d errors...\n", (unsigned int)error_count); 779 | } 780 | 781 | return (error_count == 0); // returns true if all pixels pass 782 | } 783 | 784 | inline 785 | bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) 786 | { 787 | float *src_buffer, *ref_buffer; 788 | FILE *src_fp = NULL, *ref_fp = NULL; 789 | size_t fsize = 0; 790 | 791 | unsigned long error_count = 0; 792 | 793 | if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) 794 | { 795 | printf("compareBin2Bin unable to open src_file: %s\n", src_file); 796 | error_count = 1; 797 | } 798 | 799 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path); 800 | 801 | if (ref_file_path == NULL) 802 | { 803 | printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path); 804 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", 
exec_path); 805 | printf("Aborting comparison!\n"); 806 | printf(" FAILED\n"); 807 | error_count++; 808 | 809 | if (src_fp) 810 | { 811 | fclose(src_fp); 812 | } 813 | 814 | if (ref_fp) 815 | { 816 | fclose(ref_fp); 817 | } 818 | } 819 | else 820 | { 821 | if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) 822 | { 823 | printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path); 824 | error_count = 1; 825 | } 826 | 827 | if (src_fp && ref_fp) 828 | { 829 | src_buffer = (float *)malloc(nelements*sizeof(float)); 830 | ref_buffer = (float *)malloc(nelements*sizeof(float)); 831 | 832 | fsize = fread(src_buffer, nelements, sizeof(float), src_fp); 833 | fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp); 834 | 835 | printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); 836 | printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); 837 | printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); 838 | 839 | if (!compareDataAsFloatThreshold(ref_buffer, src_buffer, nelements, epsilon, threshold)) 840 | { 841 | error_count++; 842 | } 843 | 844 | fclose(src_fp); 845 | fclose(ref_fp); 846 | 847 | free(src_buffer); 848 | free(ref_buffer); 849 | } 850 | else 851 | { 852 | if (src_fp) 853 | { 854 | fclose(src_fp); 855 | } 856 | 857 | if (ref_fp) 858 | { 859 | fclose(ref_fp); 860 | } 861 | } 862 | } 863 | 864 | if (error_count == 0) 865 | { 866 | printf(" OK\n"); 867 | } 868 | else 869 | { 870 | printf(" FAILURE: %d errors...\n", (unsigned int)error_count); 871 | } 872 | 873 | return (error_count == 0); // returns true if all pixels pass 874 | } 875 | 876 | inline bool 877 | sdkCompareL2fe(const float *reference, const float *data, 878 | const unsigned int len, const float epsilon) 879 | { 880 | assert(epsilon >= 0); 881 | 882 | float error = 0; 883 | float ref = 0; 884 | 885 | for (unsigned int i = 0; i < len; ++i) 886 | { 887 | 888 | float diff = reference[i] - data[i]; 889 | error += diff * diff; 890 | ref += reference[i] * reference[i]; 891 | } 892 | 893 | float normRef = sqrtf(ref); 894 | 895 | if (fabs(ref) < 1e-7) 896 | { 897 | #ifdef _DEBUG 898 | std::cerr << "ERROR, reference l2-norm is 0\n"; 899 | #endif 900 | return false; 901 | } 902 | 903 | float normError = sqrtf(error); 904 | error = normError / normRef; 905 | bool result = error < epsilon; 906 | #ifdef _DEBUG 907 | 908 | if (! 
result) 909 | { 910 | std::cerr << "ERROR, l2-norm error " 911 | << error << " is greater than epsilon " << epsilon << "\n"; 912 | } 913 | 914 | #endif 915 | 916 | return result; 917 | } 918 | 919 | inline bool 920 | sdkLoadPPMub(const char *file, unsigned char **data, 921 | unsigned int *w,unsigned int *h) 922 | { 923 | unsigned int channels; 924 | return __loadPPM(file, data, w, h, &channels); 925 | } 926 | 927 | inline bool 928 | sdkLoadPPM4ub(const char *file, unsigned char **data, 929 | unsigned int *w, unsigned int *h) 930 | { 931 | unsigned char *idata = 0; 932 | unsigned int channels; 933 | 934 | if (__loadPPM(file, &idata, w, h, &channels)) 935 | { 936 | // pad 4th component 937 | int size = *w **h; 938 | // keep the original pointer 939 | unsigned char *idata_orig = idata; 940 | *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4); 941 | unsigned char *ptr = *data; 942 | 943 | for (int i=0; i Compare (a)rendered: <" << src_file << ">\n"; 984 | std::cerr << "> (b)reference: <" << ref_file << ">\n"; 985 | } 986 | 987 | 988 | if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) 989 | { 990 | if (verboseErrors) 991 | { 992 | std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n"; 993 | } 994 | 995 | return false; 996 | } 997 | 998 | if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) 999 | { 1000 | std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n"; 1001 | return false; 1002 | } 1003 | 1004 | if (src_height != ref_height || src_width != ref_width) 1005 | { 1006 | if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width << 1007 | "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n"; 1008 | } 1009 | 1010 | if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width << 1011 | "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n"; 1012 | 1013 | if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false) 1014 | { 1015 | error_count=1; 1016 | } 1017 | 1018 | if (error_count == 0) 1019 | { 1020 | if (verboseErrors) 1021 | { 1022 | std::cerr << " OK\n\n"; 1023 | } 1024 | } 1025 | else 1026 | { 1027 | if (verboseErrors) 1028 | { 1029 | std::cerr << " FAILURE! 
"< Compare (a)rendered: <" << src_file << ">\n"; 1058 | std::cerr << "> (b)reference: <" << ref_file << ">\n"; 1059 | } 1060 | 1061 | 1062 | if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) 1063 | { 1064 | if (verboseErrors) 1065 | { 1066 | std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n"; 1067 | } 1068 | 1069 | return false; 1070 | } 1071 | 1072 | if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) 1073 | { 1074 | std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n"; 1075 | return false; 1076 | } 1077 | 1078 | if (src_height != ref_height || src_width != ref_width) 1079 | { 1080 | if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width << 1081 | "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n"; 1082 | } 1083 | 1084 | if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width << 1085 | "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n"; 1086 | 1087 | if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false) 1088 | { 1089 | error_count=1; 1090 | } 1091 | 1092 | if (error_count == 0) 1093 | { 1094 | if (verboseErrors) 1095 | { 1096 | std::cerr << " OK\n\n"; 1097 | } 1098 | } 1099 | else 1100 | { 1101 | if (verboseErrors) 1102 | { 1103 | std::cerr << " FAILURE! "< 17 | #include 18 | #include 19 | #include 20 | 21 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 22 | #ifndef _CRT_SECURE_NO_DEPRECATE 23 | #define _CRT_SECURE_NO_DEPRECATE 24 | #endif 25 | #ifndef STRCASECMP 26 | #define STRCASECMP _stricmp 27 | #endif 28 | #ifndef STRNCASECMP 29 | #define STRNCASECMP _strnicmp 30 | #endif 31 | #ifndef STRCPY 32 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) 33 | #endif 34 | 35 | #ifndef FOPEN 36 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) 37 | #endif 38 | #ifndef FOPEN_FAIL 39 | #define FOPEN_FAIL(result) (result != 0) 40 | #endif 41 | #ifndef SSCANF 42 | #define SSCANF sscanf_s 43 | #endif 44 | #ifndef SPRINTF 45 | #define SPRINTF sprintf_s 46 | #endif 47 | #else // Linux Includes 48 | #include 49 | #include 50 | 51 | #ifndef STRCASECMP 52 | #define STRCASECMP strcasecmp 53 | #endif 54 | #ifndef STRNCASECMP 55 | #define STRNCASECMP strncasecmp 56 | #endif 57 | #ifndef STRCPY 58 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) 59 | #endif 60 | 61 | #ifndef FOPEN 62 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) 63 | #endif 64 | #ifndef FOPEN_FAIL 65 | #define FOPEN_FAIL(result) (result == NULL) 66 | #endif 67 | #ifndef SSCANF 68 | #define SSCANF sscanf 69 | #endif 70 | #ifndef SPRINTF 71 | #define SPRINTF sprintf 72 | #endif 73 | #endif 74 | 75 | #ifndef EXIT_WAIVED 76 | #define EXIT_WAIVED 2 77 | #endif 78 | 79 | // CUDA Utility Helper Functions 80 | inline int stringRemoveDelimiter(char delimiter, const char *string) 81 | { 82 | int string_start = 0; 83 | 84 | while (string[string_start] == delimiter) 85 | { 86 | string_start++; 87 | } 88 | 89 | if (string_start >= (int)strlen(string)-1) 90 | { 91 | return 0; 92 | } 93 | 94 | return string_start; 95 | } 96 | 97 | inline int getFileExtension(char *filename, char **extension) 98 | { 99 | int string_length = (int)strlen(filename); 100 | 101 | while (filename[string_length--] != '.') 102 | { 103 | if (string_length == 0) 104 | break; 105 | } 106 | 107 | if (string_length > 0) 
string_length += 2; 108 | 109 | if (string_length == 0) 110 | *extension = NULL; 111 | else 112 | *extension = &filename[string_length]; 113 | 114 | return string_length; 115 | } 116 | 117 | 118 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 119 | { 120 | bool bFound = false; 121 | 122 | if (argc >= 1) 123 | { 124 | for (int i=1; i < argc; i++) 125 | { 126 | int string_start = stringRemoveDelimiter('-', argv[i]); 127 | const char *string_argv = &argv[i][string_start]; 128 | 129 | const char *equal_pos = strchr(string_argv, '='); 130 | int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 131 | 132 | int length = (int)strlen(string_ref); 133 | 134 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) 135 | { 136 | bFound = true; 137 | continue; 138 | } 139 | } 140 | } 141 | 142 | return bFound; 143 | } 144 | 145 | // This function wraps the CUDA Driver API into a template function 146 | template 147 | inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value) 148 | { 149 | bool bFound = false; 150 | 151 | if (argc >= 1) 152 | { 153 | for (int i=1; i < argc; i++) 154 | { 155 | int string_start = stringRemoveDelimiter('-', argv[i]); 156 | const char *string_argv = &argv[i][string_start]; 157 | int length = (int)strlen(string_ref); 158 | 159 | if (!STRNCASECMP(string_argv, string_ref, length)) 160 | { 161 | if (length+1 <= (int)strlen(string_argv)) 162 | { 163 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 164 | *value = (T)atoi(&string_argv[length + auto_inc]); 165 | } 166 | 167 | bFound = true; 168 | i=argc; 169 | } 170 | } 171 | } 172 | 173 | return bFound; 174 | } 175 | 176 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 177 | { 178 | bool bFound = false; 179 | int value = -1; 180 | 181 | if (argc >= 1) 182 | { 183 | for (int i=1; i < argc; i++) 184 | { 185 | int string_start = stringRemoveDelimiter('-', argv[i]); 186 | const char *string_argv = &argv[i][string_start]; 187 | int length = (int)strlen(string_ref); 188 | 189 | if (!STRNCASECMP(string_argv, string_ref, length)) 190 | { 191 | if (length+1 <= (int)strlen(string_argv)) 192 | { 193 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 194 | value = atoi(&string_argv[length + auto_inc]); 195 | } 196 | else 197 | { 198 | value = 0; 199 | } 200 | 201 | bFound = true; 202 | continue; 203 | } 204 | } 205 | } 206 | 207 | if (bFound) 208 | { 209 | return value; 210 | } 211 | else 212 | { 213 | return 0; 214 | } 215 | } 216 | 217 | inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) 218 | { 219 | bool bFound = false; 220 | float value = -1; 221 | 222 | if (argc >= 1) 223 | { 224 | for (int i=1; i < argc; i++) 225 | { 226 | int string_start = stringRemoveDelimiter('-', argv[i]); 227 | const char *string_argv = &argv[i][string_start]; 228 | int length = (int)strlen(string_ref); 229 | 230 | if (!STRNCASECMP(string_argv, string_ref, length)) 231 | { 232 | if (length+1 <= (int)strlen(string_argv)) 233 | { 234 | int auto_inc = (string_argv[length] == '=') ? 
1 : 0; 235 | value = (float)atof(&string_argv[length + auto_inc]); 236 | } 237 | else 238 | { 239 | value = 0.f; 240 | } 241 | 242 | bFound = true; 243 | continue; 244 | } 245 | } 246 | } 247 | 248 | if (bFound) 249 | { 250 | return value; 251 | } 252 | else 253 | { 254 | return 0; 255 | } 256 | } 257 | 258 | inline bool getCmdLineArgumentString(const int argc, const char **argv, 259 | const char *string_ref, char **string_retval) 260 | { 261 | bool bFound = false; 262 | 263 | if (argc >= 1) 264 | { 265 | for (int i=1; i < argc; i++) 266 | { 267 | int string_start = stringRemoveDelimiter('-', argv[i]); 268 | char *string_argv = (char *)&argv[i][string_start]; 269 | int length = (int)strlen(string_ref); 270 | 271 | if (!STRNCASECMP(string_argv, string_ref, length)) 272 | { 273 | *string_retval = &string_argv[length+1]; 274 | bFound = true; 275 | continue; 276 | } 277 | } 278 | } 279 | 280 | if (!bFound) 281 | { 282 | *string_retval = NULL; 283 | } 284 | 285 | return bFound; 286 | } 287 | 288 | ////////////////////////////////////////////////////////////////////////////// 289 | //! Find the path for a file assuming that 290 | //! files are found in the searchPath. 291 | //! 292 | //! @return the path if succeeded, otherwise 0 293 | //! @param filename name of the file 294 | //! @param executable_path optional absolute path of the executable 295 | ////////////////////////////////////////////////////////////////////////////// 296 | inline char *sdkFindFilePath(const char *filename, const char *executable_path) 297 | { 298 | // defines a variable that is replaced with the name of the executable 299 | 300 | // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files) 301 | // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc 302 | const char *searchPath[] = 303 | { 304 | "./", // same dir 305 | "./common/", // "/common/" subdir 306 | "./common/data/", // "/common/data/" subdir 307 | "./data/", // "/data/" subdir 308 | "./src/", // "/src/" subdir 309 | "./src//data/", // "/src//data/" subdir 310 | "./inc/", // "/inc/" subdir 311 | "./0_Simple/", // "/0_Simple/" subdir 312 | "./1_Utilities/", // "/1_Utilities/" subdir 313 | "./2_Graphics/", // "/2_Graphics/" subdir 314 | "./3_Imaging/", // "/3_Imaging/" subdir 315 | "./4_Financial/", // "/4_Financial/" subdir 316 | "./5_Simulations/", // "/5_Simulations/" subdir 317 | "./6_Advanced/", // "/6_Advanced/" subdir 318 | "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir 319 | "./samples/", // "/samples/" subdir 320 | 321 | "../", // up 1 in tree 322 | "../common/", // up 1 in tree, "/common/" subdir 323 | "../common/data/", // up 1 in tree, "/common/data/" subdir 324 | "../data/", // up 1 in tree, "/data/" subdir 325 | "../src/", // up 1 in tree, "/src/" subdir 326 | "../inc/", // up 1 in tree, "/inc/" subdir 327 | 328 | "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir 329 | "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir 330 | "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir 331 | "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir 332 | "../4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir 333 | "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir 334 | "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir 335 | "../7_CUDALibraries//data/",// up 1 in tree, "/7_CUDALibraries//" subdir 336 | "../samples//data/", // up 1 in 
tree, "/samples//" subdir 337 | "../../", // up 2 in tree 338 | "../../common/", // up 2 in tree, "/common/" subdir 339 | "../../common/data/", // up 2 in tree, "/common/data/" subdir 340 | "../../data/", // up 2 in tree, "/data/" subdir 341 | "../../src/", // up 2 in tree, "/src/" subdir 342 | "../../inc/", // up 2 in tree, "/inc/" subdir 343 | "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir 344 | "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir 345 | "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir 346 | "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir 347 | "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir 348 | "../../4_Financial//data/", // up 2 in tree, "/4_Financial//" subdir 349 | "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir 350 | "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir 351 | "../../7_CUDALibraries//data/", // up 2 in tree, "/7_CUDALibraries//" subdir 352 | "../../samples//data/", // up 2 in tree, "/samples//" subdir 353 | "../../../", // up 3 in tree 354 | "../../../src//", // up 3 in tree, "/src//" subdir 355 | "../../../src//data/", // up 3 in tree, "/src//data/" subdir 356 | "../../../src//src/", // up 3 in tree, "/src//src/" subdir 357 | "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir 358 | "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir 359 | "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir 360 | "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir 361 | "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir 362 | "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir 363 | "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir 364 | "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir 365 | "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir 366 | "../../../4_Financial//data/", // up 3 in tree, "/4_Financial//" subdir 367 | "../../../5_Simulations//data/", // up 3 in tree, "/5_Simulations//" subdir 368 | "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir 369 | "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" subdir 370 | "../../../samples//data/", // up 3 in tree, "/samples//" subdir 371 | "../../../common/", // up 3 in tree, "../../../common/" subdir 372 | "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir 373 | "../../../data/", // up 3 in tree, "../../../data/" subdir 374 | "../../../../", // up 4 in tree 375 | "../../../../src//", // up 4 in tree, "/src//" subdir 376 | "../../../../src//data/", // up 4 in tree, "/src//data/" subdir 377 | "../../../../src//src/", // up 4 in tree, "/src//src/" subdir 378 | "../../../../src//inc/", // up 4 in tree, "/src//inc/" subdir 379 | "../../../../sandbox//", // up 4 in tree, "/sandbox//" subdir 380 | "../../../../sandbox//data/", // up 4 in tree, "/sandbox//data/" subdir 381 | "../../../../sandbox//src/", // up 4 in tree, "/sandbox//src/" subdir 382 | "../../../../sandbox//inc/", // up 4 in tree, "/sandbox//inc/" subdir 383 | "../../../../0_Simple//data/", // up 4 in tree, "/0_Simple//" subdir 384 | "../../../../1_Utilities//data/", // up 4 in tree, "/1_Utilities//" subdir 385 | "../../../../2_Graphics//data/", // up 4 in tree, "/2_Graphics//" subdir 386 | "../../../../3_Imaging//data/", // up 4 in tree, "/3_Imaging//" subdir 387 | "../../../../4_Financial//data/", // up 4 in tree, "/4_Financial//" subdir 388 | 
"../../../../5_Simulations//data/",// up 4 in tree, "/5_Simulations//" subdir 389 | "../../../../6_Advanced//data/", // up 4 in tree, "/6_Advanced//" subdir 390 | "../../../../7_CUDALibraries//data/", // up 4 in tree, "/7_CUDALibraries//" subdir 391 | "../../../../samples//data/", // up 4 in tree, "/samples//" subdir 392 | "../../../../common/", // up 4 in tree, "../../../common/" subdir 393 | "../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir 394 | "../../../../data/", // up 4 in tree, "../../../data/" subdir 395 | "../../../../../", // up 5 in tree 396 | "../../../../../src//", // up 5 in tree, "/src//" subdir 397 | "../../../../../src//data/", // up 5 in tree, "/src//data/" subdir 398 | "../../../../../src//src/", // up 5 in tree, "/src//src/" subdir 399 | "../../../../../src//inc/", // up 5 in tree, "/src//inc/" subdir 400 | "../../../../../sandbox//", // up 5 in tree, "/sandbox//" subdir 401 | "../../../../../sandbox//data/", // up 5 in tree, "/sandbox//data/" subdir 402 | "../../../../../sandbox//src/", // up 5 in tree, "/sandbox//src/" subdir 403 | "../../../../../sandbox//inc/", // up 5 in tree, "/sandbox//inc/" subdir 404 | "../../../../../0_Simple//data/", // up 5 in tree, "/0_Simple//" subdir 405 | "../../../../../1_Utilities//data/", // up 5 in tree, "/1_Utilities//" subdir 406 | "../../../../../2_Graphics//data/", // up 5 in tree, "/2_Graphics//" subdir 407 | "../../../../../3_Imaging//data/", // up 5 in tree, "/3_Imaging//" subdir 408 | "../../../../../4_Financial//data/", // up 5 in tree, "/4_Financial//" subdir 409 | "../../../../../5_Simulations//data/",// up 5 in tree, "/5_Simulations//" subdir 410 | "../../../../../6_Advanced//data/", // up 5 in tree, "/6_Advanced//" subdir 411 | "../../../../../7_CUDALibraries//data/", // up 5 in tree, "/7_CUDALibraries//" subdir 412 | "../../../../../samples//data/", // up 5 in tree, "/samples//" subdir 413 | "../../../../../common/", // up 5 in tree, "../../../common/" subdir 414 | "../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir 415 | }; 416 | 417 | // Extract the executable name 418 | std::string executable_name; 419 | 420 | if (executable_path != 0) 421 | { 422 | executable_name = std::string(executable_path); 423 | 424 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 425 | // Windows path delimiter 426 | size_t delimiter_pos = executable_name.find_last_of('\\'); 427 | executable_name.erase(0, delimiter_pos + 1); 428 | 429 | if (executable_name.rfind(".exe") != std::string::npos) 430 | { 431 | // we strip .exe, only if the .exe is found 432 | executable_name.resize(executable_name.size() - 4); 433 | } 434 | 435 | #else 436 | // Linux & OSX path delimiter 437 | size_t delimiter_pos = executable_name.find_last_of('/'); 438 | executable_name.erase(0,delimiter_pos+1); 439 | #endif 440 | } 441 | 442 | // Loop over all search paths and return the first hit 443 | for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i) 444 | { 445 | std::string path(searchPath[i]); 446 | size_t executable_name_pos = path.find(""); 447 | 448 | // If there is executable_name variable in the searchPath 449 | // replace it with the value 450 | if (executable_name_pos != std::string::npos) 451 | { 452 | if (executable_path != 0) 453 | { 454 | path.replace(executable_name_pos, strlen(""), executable_name); 455 | } 456 | else 457 | { 458 | // Skip this path entry if no executable argument is given 459 | continue; 460 | } 461 | } 462 | 463 | #ifdef _DEBUG 464 | 
printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); 465 | #endif 466 | 467 | // Test if the file exists 468 | path.append(filename); 469 | FILE *fp; 470 | FOPEN(fp, path.c_str(), "rb"); 471 | 472 | if (fp != NULL) 473 | { 474 | fclose(fp); 475 | // File found 476 | // returning an allocated array here for backwards compatibility reasons 477 | char *file_path = (char *) malloc(path.length() + 1); 478 | STRCPY(file_path, path.length() + 1, path.c_str()); 479 | return file_path; 480 | } 481 | 482 | if (fp) 483 | { 484 | fclose(fp); 485 | } 486 | } 487 | 488 | // File not found 489 | return 0; 490 | } 491 | 492 | #endif 493 | -------------------------------------------------------------------------------- /common/helper_timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | // Helper Timing Functions 13 | #ifndef HELPER_TIMER_H 14 | #define HELPER_TIMER_H 15 | 16 | #ifndef EXIT_WAIVED 17 | #define EXIT_WAIVED 2 18 | #endif 19 | 20 | // includes, system 21 | #include 22 | 23 | // includes, project 24 | #include 25 | 26 | // Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions 27 | // But rather in a self contained class interface 28 | class StopWatchInterface 29 | { 30 | public: 31 | StopWatchInterface() {}; 32 | virtual ~StopWatchInterface() {}; 33 | 34 | public: 35 | //! Start time measurement 36 | virtual void start() = 0; 37 | 38 | //! Stop time measurement 39 | virtual void stop() = 0; 40 | 41 | //! Reset time counters to zero 42 | virtual void reset() = 0; 43 | 44 | //! Time in msec. after start. If the stop watch is still running (i.e. there 45 | //! was no call to stop()) then the elapsed time is returned, otherwise the 46 | //! time between the last start() and stop call is returned 47 | virtual float getTime() = 0; 48 | 49 | //! Mean time to date based on the number of times the stopwatch has been 50 | //! _stopped_ (ie finished sessions) and the current total time 51 | virtual float getAverageTime() = 0; 52 | }; 53 | 54 | 55 | ////////////////////////////////////////////////////////////////// 56 | // Begin Stopwatch timer class definitions for all OS platforms // 57 | ////////////////////////////////////////////////////////////////// 58 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 59 | // includes, system 60 | #define WINDOWS_LEAN_AND_MEAN 61 | #include 62 | #undef min 63 | #undef max 64 | 65 | //! Windows specific implementation of StopWatch 66 | class StopWatchWin : public StopWatchInterface 67 | { 68 | public: 69 | //! Constructor, default 70 | StopWatchWin() : 71 | start_time(), end_time(), 72 | diff_time(0.0f), total_time(0.0f), 73 | running(false), clock_sessions(0), freq(0), freq_set(false) 74 | { 75 | if (! 
freq_set) 76 | { 77 | // helper variable 78 | LARGE_INTEGER temp; 79 | 80 | // get the tick frequency from the OS 81 | QueryPerformanceFrequency((LARGE_INTEGER *) &temp); 82 | 83 | // convert to type in which it is needed 84 | freq = ((double) temp.QuadPart) / 1000.0; 85 | 86 | // rememeber query 87 | freq_set = true; 88 | } 89 | }; 90 | 91 | // Destructor 92 | ~StopWatchWin() { }; 93 | 94 | public: 95 | //! Start time measurement 96 | inline void start(); 97 | 98 | //! Stop time measurement 99 | inline void stop(); 100 | 101 | //! Reset time counters to zero 102 | inline void reset(); 103 | 104 | //! Time in msec. after start. If the stop watch is still running (i.e. there 105 | //! was no call to stop()) then the elapsed time is returned, otherwise the 106 | //! time between the last start() and stop call is returned 107 | inline float getTime(); 108 | 109 | //! Mean time to date based on the number of times the stopwatch has been 110 | //! _stopped_ (ie finished sessions) and the current total time 111 | inline float getAverageTime(); 112 | 113 | private: 114 | // member variables 115 | 116 | //! Start of measurement 117 | LARGE_INTEGER start_time; 118 | //! End of measurement 119 | LARGE_INTEGER end_time; 120 | 121 | //! Time difference between the last start and stop 122 | float diff_time; 123 | 124 | //! TOTAL time difference between starts and stops 125 | float total_time; 126 | 127 | //! flag if the stop watch is running 128 | bool running; 129 | 130 | //! Number of times clock has been started 131 | //! and stopped to allow averaging 132 | int clock_sessions; 133 | 134 | //! tick frequency 135 | double freq; 136 | 137 | //! flag if the frequency has been set 138 | bool freq_set; 139 | }; 140 | 141 | // functions, inlined 142 | 143 | //////////////////////////////////////////////////////////////////////////////// 144 | //! Start time measurement 145 | //////////////////////////////////////////////////////////////////////////////// 146 | inline void 147 | StopWatchWin::start() 148 | { 149 | QueryPerformanceCounter((LARGE_INTEGER *) &start_time); 150 | running = true; 151 | } 152 | 153 | //////////////////////////////////////////////////////////////////////////////// 154 | //! Stop time measurement and increment add to the current diff_time summation 155 | //! variable. Also increment the number of times this clock has been run. 156 | //////////////////////////////////////////////////////////////////////////////// 157 | inline void 158 | StopWatchWin::stop() 159 | { 160 | QueryPerformanceCounter((LARGE_INTEGER *) &end_time); 161 | diff_time = (float) 162 | (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); 163 | 164 | total_time += diff_time; 165 | clock_sessions++; 166 | running = false; 167 | } 168 | 169 | //////////////////////////////////////////////////////////////////////////////// 170 | //! Reset the timer to 0. Does not change the timer running state but does 171 | //! recapture this point in time as the current start time if it is running. 172 | //////////////////////////////////////////////////////////////////////////////// 173 | inline void 174 | StopWatchWin::reset() 175 | { 176 | diff_time = 0; 177 | total_time = 0; 178 | clock_sessions = 0; 179 | 180 | if (running) 181 | { 182 | QueryPerformanceCounter((LARGE_INTEGER *) &start_time); 183 | } 184 | } 185 | 186 | 187 | //////////////////////////////////////////////////////////////////////////////// 188 | //! Time in msec. after start. If the stop watch is still running (i.e. there 189 | //! 
was no call to stop()) then the elapsed time is returned added to the
190 | //! current diff_time sum, otherwise the current summed time difference alone
191 | //! is returned.
192 | ////////////////////////////////////////////////////////////////////////////////
193 | inline float
194 | StopWatchWin::getTime()
195 | {
196 | // Return the TOTAL time to date
197 | float retval = total_time;
198 |
199 | if (running)
200 | {
201 | LARGE_INTEGER temp;
202 | QueryPerformanceCounter((LARGE_INTEGER *) &temp);
203 | retval += (float)
204 | (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
205 | }
206 |
207 | return retval;
208 | }
209 |
210 | ////////////////////////////////////////////////////////////////////////////////
211 | //! Time in msec. for a single run based on the total number of COMPLETED runs
212 | //! and the total time.
213 | ////////////////////////////////////////////////////////////////////////////////
214 | inline float
215 | StopWatchWin::getAverageTime()
216 | {
217 | return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
218 | }
219 | #else
220 | // Declarations for Stopwatch on Linux and Mac OSX
221 | // includes, system
222 | #include <ctime>
223 | #include <sys/time.h>
224 |
225 | //! Linux and Mac OSX specific implementation of StopWatch
226 | class StopWatchLinux : public StopWatchInterface
227 | {
228 | public:
229 | //! Constructor, default
230 | StopWatchLinux() :
231 | start_time(), diff_time(0.0), total_time(0.0),
232 | running(false), clock_sessions(0)
233 | { };
234 |
235 | // Destructor
236 | virtual ~StopWatchLinux()
237 | { };
238 |
239 | public:
240 | //! Start time measurement
241 | inline void start();
242 |
243 | //! Stop time measurement
244 | inline void stop();
245 |
246 | //! Reset time counters to zero
247 | inline void reset();
248 |
249 | //! Time in msec. after start. If the stop watch is still running (i.e. there
250 | //! was no call to stop()) then the elapsed time is returned, otherwise the
251 | //! time between the last start() and stop call is returned
252 | inline float getTime();
253 |
254 | //! Mean time to date based on the number of times the stopwatch has been
255 | //! _stopped_ (ie finished sessions) and the current total time
256 | inline float getAverageTime();
257 |
258 | private:
259 |
260 | // helper functions
261 |
262 | //! Get difference between start time and current time
263 | inline float getDiffTime();
264 |
265 | private:
266 |
267 | // member variables
268 |
269 | //! Start of measurement
270 | struct timeval start_time;
271 |
272 | //! Time difference between the last start and stop
273 | float diff_time;
274 |
275 | //! TOTAL time difference between starts and stops
276 | float total_time;
277 |
278 | //! flag if the stop watch is running
279 | bool running;
280 |
281 | //! Number of times clock has been started
282 | //! and stopped to allow averaging
283 | int clock_sessions;
284 | };
285 |
286 | // functions, inlined
287 |
288 | ////////////////////////////////////////////////////////////////////////////////
289 | //! Start time measurement
290 | ////////////////////////////////////////////////////////////////////////////////
291 | inline void
292 | StopWatchLinux::start()
293 | {
294 | gettimeofday(&start_time, 0);
295 | running = true;
296 | }
297 |
298 | ////////////////////////////////////////////////////////////////////////////////
299 | //! Stop time measurement and increment add to the current diff_time summation
300 | //! variable. Also increment the number of times this clock has been run.
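//
// Worked example of the bookkeeping in stop() below (illustrative numbers):
// three start()/stop() pairs measuring 2 ms, 4 ms and 6 ms leave
// total_time = 12 ms and clock_sessions = 3, so getAverageTime() returns
// 4 ms, and a later getTime() call (with the watch stopped) returns 12 ms.
//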
301 | //////////////////////////////////////////////////////////////////////////////// 302 | inline void 303 | StopWatchLinux::stop() 304 | { 305 | diff_time = getDiffTime(); 306 | total_time += diff_time; 307 | running = false; 308 | clock_sessions++; 309 | } 310 | 311 | //////////////////////////////////////////////////////////////////////////////// 312 | //! Reset the timer to 0. Does not change the timer running state but does 313 | //! recapture this point in time as the current start time if it is running. 314 | //////////////////////////////////////////////////////////////////////////////// 315 | inline void 316 | StopWatchLinux::reset() 317 | { 318 | diff_time = 0; 319 | total_time = 0; 320 | clock_sessions = 0; 321 | 322 | if (running) 323 | { 324 | gettimeofday(&start_time, 0); 325 | } 326 | } 327 | 328 | //////////////////////////////////////////////////////////////////////////////// 329 | //! Time in msec. after start. If the stop watch is still running (i.e. there 330 | //! was no call to stop()) then the elapsed time is returned added to the 331 | //! current diff_time sum, otherwise the current summed time difference alone 332 | //! is returned. 333 | //////////////////////////////////////////////////////////////////////////////// 334 | inline float 335 | StopWatchLinux::getTime() 336 | { 337 | // Return the TOTAL time to date 338 | float retval = total_time; 339 | 340 | if (running) 341 | { 342 | retval += getDiffTime(); 343 | } 344 | 345 | return retval; 346 | } 347 | 348 | //////////////////////////////////////////////////////////////////////////////// 349 | //! Time in msec. for a single run based on the total number of COMPLETED runs 350 | //! and the total time. 351 | //////////////////////////////////////////////////////////////////////////////// 352 | inline float 353 | StopWatchLinux::getAverageTime() 354 | { 355 | return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; 356 | } 357 | //////////////////////////////////////////////////////////////////////////////// 358 | 359 | //////////////////////////////////////////////////////////////////////////////// 360 | inline float 361 | StopWatchLinux::getDiffTime() 362 | { 363 | struct timeval t_time; 364 | gettimeofday(&t_time, 0); 365 | 366 | // time difference in milli-seconds 367 | return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) 368 | + (0.001 * (t_time.tv_usec - start_time.tv_usec))); 369 | } 370 | #endif // WIN32 371 | 372 | //////////////////////////////////////////////////////////////////////////////// 373 | //! Timer functionality exported 374 | 375 | //////////////////////////////////////////////////////////////////////////////// 376 | //! Create a new timer 377 | //! @return true if a time has been created, otherwise false 378 | //! @param name of the new timer, 0 if the creation failed 379 | //////////////////////////////////////////////////////////////////////////////// 380 | inline bool 381 | sdkCreateTimer(StopWatchInterface **timer_interface) 382 | { 383 | //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); 384 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 385 | *timer_interface = (StopWatchInterface *)new StopWatchWin(); 386 | #else 387 | *timer_interface = (StopWatchInterface *)new StopWatchLinux(); 388 | #endif 389 | return (*timer_interface != NULL) ? true : false; 390 | } 391 | 392 | 393 | //////////////////////////////////////////////////////////////////////////////// 394 | //! Delete a timer 395 | //! 
@return true if a timer has been deleted, otherwise false
396 | //! @param name of the timer to delete
397 | ////////////////////////////////////////////////////////////////////////////////
398 | inline bool
399 | sdkDeleteTimer(StopWatchInterface **timer_interface)
400 | {
401 | //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
402 | if (*timer_interface)
403 | {
404 | delete *timer_interface;
405 | *timer_interface = NULL;
406 | }
407 |
408 | return true;
409 | }
410 |
411 | ////////////////////////////////////////////////////////////////////////////////
412 | //! Start the timer with name \a name
413 | //! @param name name of the timer to start
414 | ////////////////////////////////////////////////////////////////////////////////
415 | inline bool
416 | sdkStartTimer(StopWatchInterface **timer_interface)
417 | {
418 | //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
419 | if (*timer_interface)
420 | {
421 | (*timer_interface)->start();
422 | }
423 |
424 | return true;
425 | }
426 |
427 | ////////////////////////////////////////////////////////////////////////////////
428 | //! Stop the timer with name \a name. Does not reset.
429 | //! @param name name of the timer to stop
430 | ////////////////////////////////////////////////////////////////////////////////
431 | inline bool
432 | sdkStopTimer(StopWatchInterface **timer_interface)
433 | {
434 | // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
435 | if (*timer_interface)
436 | {
437 | (*timer_interface)->stop();
438 | }
439 |
440 | return true;
441 | }
442 |
443 | ////////////////////////////////////////////////////////////////////////////////
444 | //! Resets the timer's counter.
445 | //! @param name name of the timer to reset.
446 | ////////////////////////////////////////////////////////////////////////////////
447 | inline bool
448 | sdkResetTimer(StopWatchInterface **timer_interface)
449 | {
450 | // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
451 | if (*timer_interface)
452 | {
453 | (*timer_interface)->reset();
454 | }
455 |
456 | return true;
457 | }
458 |
459 | ////////////////////////////////////////////////////////////////////////////////
460 | //! Return the average time for timer execution as the total time
461 | //! for the timer divided by the number of completed (stopped) runs the timer
462 | //! has made.
463 | //! Excludes the current running time if the timer is currently running.
464 | //! @param name name of the timer to return the time of
465 | ////////////////////////////////////////////////////////////////////////////////
466 | inline float
467 | sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
468 | {
469 | // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
470 | if (*timer_interface)
471 | {
472 | return (*timer_interface)->getAverageTime();
473 | }
474 | else
475 | {
476 | return 0.0f;
477 | }
478 | }
479 |
480 | ////////////////////////////////////////////////////////////////////////////////
481 | //! Total execution time for the timer over all runs since the last reset
482 | //! or timer creation.
483 | //! @param name name of the timer to obtain the value of.
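////////////////////////////////////////////////////////////////////////////////
//! Minimal usage sketch of the timer helpers above (an illustrative addition,
//! not part of the original NVIDIA header; the function name
//! sdkTimerUsageSketch is new and is not referenced anywhere else). It assumes
//! the timed workload is placed where indicated and reports the mean time per
//! completed start()/stop() session in milliseconds.
////////////////////////////////////////////////////////////////////////////////
static inline float
sdkTimerUsageSketch()
{
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);           // instantiates StopWatchWin or StopWatchLinux

    for (int run = 0; run < 10; ++run)
    {
        sdkStartTimer(&timer);
        // ... timed workload goes here ...
        sdkStopTimer(&timer);         // each stop() completes one clock session
    }

    // Mean time per completed session, in msec.
    float avg_msec = sdkGetAverageTimerValue(&timer);

    sdkDeleteTimer(&timer);
    return avg_msec;
}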
484 | //////////////////////////////////////////////////////////////////////////////// 485 | inline float 486 | sdkGetTimerValue(StopWatchInterface **timer_interface) 487 | { 488 | // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); 489 | if (*timer_interface) 490 | { 491 | return (*timer_interface)->getTime(); 492 | } 493 | else 494 | { 495 | return 0.0f; 496 | } 497 | } 498 | 499 | #endif // HELPER_TIMER_H 500 | -------------------------------------------------------------------------------- /compile.m: -------------------------------------------------------------------------------- 1 | MATLAB_ROOT = '/afs/cs/package/matlab-r2013b/matlab/r2013b'; 2 | CUDA_ROOT = '/usr/local/cuda-6.0'; 3 | 4 | if ismac 5 | MATLAB_ROOT = '/Applications/MATLAB_R2014a.app'; 6 | CUDA_ROOT = '/usr/local/cuda'; 7 | end 8 | 9 | cuda_compile('./src', 'cudaFFTData', MATLAB_ROOT, CUDA_ROOT, './bin', false) 10 | cuda_compile('./src', 'cudaConvFFTData',MATLAB_ROOT, CUDA_ROOT, './bin', false) 11 | cuda_compile('./src', 'cudaConvolutionFFT',MATLAB_ROOT, CUDA_ROOT, './bin', false) 12 | -------------------------------------------------------------------------------- /cuda_compile.m: -------------------------------------------------------------------------------- 1 | function cuda_compile( src_path, func_name, matlab_root, cuda_root, out_path, debug) 2 | %CUDA_COMPILE general cuda compiling helper for MATLAB version < 2014a 3 | if nargin < 6 4 | debug = false; 5 | end 6 | 7 | if ~exist('./bin', 'dir') 8 | mkdir('./bin') 9 | end 10 | 11 | % TODO: For matlab version < 8.0.1, Use the following setting, 12 | % if ~verLessThan('matlab', '8.0.1') 13 | % http://www.mathworks.com/help/distcomp/run-mex-functions-containing-cuda-code.html 14 | % setenv('MW_NVCC_PATH',[cudaroot '/nvcc']) 15 | % eval(sprintf('mex -v -largeArrayDims %s.cu',func_name)); 16 | % elseif isunix && ~ismac && verLessThan('matlab', '8.0.1') 17 | 18 | 19 | % ------------------------------------------------------------------------------ 20 | % Check cuda computing capability 21 | % ------------------------------------------------------------------------------ 22 | % TODO, CUDA Stream if high CM 23 | gpuInfo = gpuDevice; 24 | fprintf('Your GPU Computing Capability %d\n', str2num(gpuInfo.ComputeCapability)); 25 | 26 | % Remove compiled binary files 27 | eval(['!rm bin/' func_name '.o']); 28 | 29 | % ------------------------------------------------------------------------------ 30 | % Setup environment variables 31 | % ------------------------------------------------------------------------------ 32 | 33 | % Set debugging flag 34 | if debug 35 | nvcc_debug_flag = '-g -G -O0'; 36 | mex_debug_flag = '-g'; 37 | else 38 | nvcc_debug_flag = '-O3 -DNDEBUG'; 39 | mex_debug_flag = ''; 40 | end 41 | 42 | if ismac 43 | matlab_bin_path = '/bin/maci64'; 44 | else 45 | matlab_bin_path = '/bin/glnxa64'; 46 | end 47 | 48 | INCLUDE_PATH = sprintf([... 49 | '-I./common ',... 50 | '-I%s/extern/include ',... 51 | '-I%s/toolbox/distcomp/gpu/extern/include'],... 
52 | matlab_root, matlab_root);
53 | NVCC_OPTS = '-arch=sm_30 -ftz=true -prec-div=false -prec-sqrt=false';
54 | COMPILER_OPTS = '-Xcompiler -fPIC -v';
55 |
56 | MEX_OPTS = '-largeArrayDims';
57 | MEX_INCLUDE_PATH = sprintf('-I%s/include', cuda_root);
58 | MEX_LIBS = '-lcudart -lcufft -lmwgpu';
59 | MEX_LIBRARY_PATH = ['-L', matlab_root, matlab_bin_path];
60 |
61 | % ------------------------------------------------------------------------------
62 | % Compile
63 | % ------------------------------------------------------------------------------
64 |
65 | % Compile the object file
66 | compile_string = sprintf([...
67 | '!%s/bin/nvcc ',...
68 | '%s ',... % Debug flag
69 | '%s ',... % Compiler options
70 | '%s ',... % NVCC_OPTS
71 | '%s ',... % Include paths
72 | '-c %s/%s.cu --output-file %s/%s.o'], ...
73 | cuda_root, nvcc_debug_flag, COMPILER_OPTS, NVCC_OPTS, INCLUDE_PATH, src_path, func_name, out_path, func_name);
74 | disp(compile_string);
75 | eval(compile_string);
76 |
77 | compile_string = sprintf(['mex ',...
78 | '%s ',... % Debug flag
79 | '%s ',... % Mex options
80 | '%s/%s.o ',... % Object file
81 | '%s ',... % Mex library path
82 | '%s ',... % Mex libraries
83 | '-outdir %s'],... % Out path
84 | mex_debug_flag, MEX_OPTS, out_path, func_name, MEX_LIBRARY_PATH, MEX_LIBS, out_path);
85 | disp(compile_string);
86 | eval(compile_string);
87 |
88 | % % Run system command
89 | % !nvcc -O3 -DNDEBUG -c cudaconv.cu -Xcompiler -fPIC -I/afs/cs/package/matlab-r2013b/matlab/r2013b/extern/include -I/afs/cs/package/matlab-r2013b/matlab/r2013b/toolbox/distcomp/gpu/extern/include
90 | % % Link object
91 | % mex cudaconv.o -L/usr/local/cuda-6.0/lib64 -L/afs/cs/package/matlab-r2013b/matlab/r2013b/bin/glnxa64 -lcudart -lcufft -lmwgpu
92 | % -gencode arch=compute_30,code=sm_30
93 |
-------------------------------------------------------------------------------- /demoCudaConvolutionFFT.m: --------------------------------------------------------------------------------
1 | % MatlabCUDAConv
2 | %
3 | % To speed up convolutions, I made CUDA FFT-based convolution MEX functions;
% this script compiles them and compares cudaConvolutionFFT against MATLAB's
% conv2 and fft2-based convolution.
4 |
5 | % ------------------------------------------------------------------------------
6 | % Compile
7 | % ------------------------------------------------------------------------------
8 |
9 | % Change the following lines
10 | MATLAB_ROOT = '/afs/cs/package/matlab-r2013b/matlab/r2013b/';
11 | CUDA_ROOT = '/usr/local/cuda-6.0/';
12 |
13 | if ismac
14 | MATLAB_ROOT = '/Applications/MATLAB_R2014a.app/';
15 | CUDA_ROOT = '/usr/local/cuda/';
16 | end
17 |
18 | % Debugging compile
19 | compile
20 | addpath('./bin')
21 |
22 | % ------------------------------------------------------------------------------
23 | % Clear the GPU
24 | % ------------------------------------------------------------------------------
25 |
26 | clear;
27 | device_id = 1; % 1-based GPU index (MATLAB convention)
28 | g = gpuDevice(device_id);
29 | reset(g);
30 | cos(gpuArray(1)); % force matlab gpu dynamic library loading
31 |
32 |
33 | % ------------------------------------------------------------------------------
34 | % Experiment setup
35 | % ------------------------------------------------------------------------------
36 |
37 | n = 64; % data height
38 | m = 8; % data width
39 | k = 5; % number of channels
40 |
41 | cn = 10; % kernel height
42 | cm = 4; % kernel width
43 |
44 | % Make random data
45 | data = single(rand(n,m));
46 | for i = 2:k
47 | data(:,:,i) = single(rand(n,m));
48 | end
49 |
50 | % Make random kernel
51 | kernel = zeros(cn,cm,k,'single');
52 | kernel(:,:,1) =
single(reshape(1:cn*cm,cn,cm)); 53 | for i = 2:k 54 | kernel(:,:,i) = single(rand(cn,cm)); 55 | end 56 | 57 | % To verify experiment, put kernel values to specific regions 58 | data(5:(4+cn),2:(1+cm),1) = kernel(:,:,1); 59 | data(21:(20+cn),1:cm,2) = kernel(:,:,1); 60 | data(1:cn,(m-(cm-1)):m,k) = kernel(:,:,1); 61 | kernel(:,:,k) = kernel(:,:,1); 62 | 63 | % ------------------------------------------------------------------------------ 64 | % Flip Kernel (Required) 65 | % ------------------------------------------------------------------------------ 66 | 67 | for i = 1:k 68 | kernel(:,:,i) = kernel(end:-1:1,end:-1:1,i); 69 | end 70 | 71 | 72 | % ------------------------------------------------------------------------------ 73 | % Matlab convolution (Conv2 and FFT versions) 74 | % ------------------------------------------------------------------------------ 75 | 76 | % Compute convolution using FFT 77 | % The size of ffted data should be larger than (n + cn - 1)x(m + cm - 1) 78 | fft_h = 80; 79 | fft_w = 16; 80 | matFFTedData = zeros(fft_h,fft_w,k); 81 | for i = 1:k 82 | matFFTedData(:,:,i) = fft2(data(:,:,i),fft_h,fft_w); 83 | end 84 | 85 | matFFTedKernel = zeros(fft_h, fft_w, k); 86 | for i = 1:k 87 | matFFTedKernel(:,:,i) = fft2(kernel(:,:,i),fft_h,fft_w); 88 | end 89 | 90 | % Compute using the naive convolution 91 | matConv = conv2(data(:,:,1),kernel(:,:,1)); 92 | for i = 2:k 93 | matConv(:,:,i) = conv2(data(:,:,i),kernel(:,:,i)); 94 | end 95 | 96 | cvmatlab = sum(matConv,3); 97 | 98 | ematlab = matFFTedKernel .* (matFFTedData); 99 | matFFTConv = ifft2(ematlab(:,:,1)); 100 | for i=1:k 101 | matFFTConv(:,:,i) = ifft2(ematlab(:,:,i)); 102 | end 103 | 104 | 105 | % ------------------------------------------------------------------------------ 106 | % Convolution using GPU cudaConvolutionFFT 107 | % ------------------------------------------------------------------------------ 108 | 109 | % You can feed multiple kernels in a cell format 110 | kernel2 = kernel; 111 | kernel2(1) = 100; 112 | 113 | kernelCell = {kernel, kernel2, kernel}; 114 | 115 | thread_per_block_width = 8; 116 | thread_per_block_height = 8; 117 | thread_per_block_depth = 8; 118 | thread_per_block_2d_width = 16; 119 | threads_per_block_in =[thread_per_block_width, ... 120 | thread_per_block_height, ... 121 | thread_per_block_depth, ... 122 | thread_per_block_2d_width]; 123 | 124 | [cvcell] = cudaConvolutionFFT(data, ... % Data 125 | cn,... % Maximum kernel height 126 | cm,... % Maximum kernel width 127 | kernelCell,... % Multiple kernels in a cell 128 | threads_per_block_in,... 
% threads per block 129 | device_id-1); % 0-based indexing for GPU Device ID 130 | cvg = cvcell{1}; % Get the result for the first kernel 131 | cvg2 = cvcell{2}; % Get the result for the second kernel (kernel2) 132 | 133 | % ------------------------------------------------------------------------------ 134 | % Comparison and visualization 135 | % ------------------------------------------------------------------------------ 136 | 137 | % Visualize convolution result 138 | figure(1); subplot(131); imagesc(sum(matConv,3)); 139 | subplot(132); imagesc(real(sum(matFFTConv,3))); 140 | subplot(133); imagesc(real(cvg)); 141 | 142 | % Transformed data 143 | figure(2); imagesc(real(ematlab(:,:,1))); 144 | 145 | % Compare matlab convolution with cuda FFT convolution 146 | figure(3); subplot(131); imagesc(cvg); % Convolution output ( using FFT, 147 | % data is padded with the size of the 148 | % kernel -1 ) 149 | subplot(132); imagesc(cvg(1:n + cn - 1,1:m + cm - 1)); % Extract 150 | % exact convolution part that is the 151 | % same as matlab convolution 152 | subplot(133); imagesc(cvmatlab); % Visualize matlab convolution output 153 | 154 | % Compute residual 155 | figure(4); imagesc(cvg(1:n + cn - 1,1:m + cm - 1) - cvmatlab); colorbar; 156 | -------------------------------------------------------------------------------- /src/convolutionFFTkernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. Users and possessors of this source code 8 | * are hereby granted a nonexclusive, royalty-free license to use this code 9 | * in individual and commercial software. 10 | * 11 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 12 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 13 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 14 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 15 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 16 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 17 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 18 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 19 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 20 | * OR PERFORMANCE OF THIS SOURCE CODE. 21 | * 22 | * U.S. Government End Users. This source code is a "commercial item" as 23 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 24 | * "commercial computer software" and "commercial computer software 25 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 26 | * and is provided to the U.S. Government only as a commercial end item. 27 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 28 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 29 | * source code with only those rights set forth herein. 30 | * 31 | * Any use of this source code in individual and commercial software must 32 | * include, in the user documentation and internal comments to the code, 33 | * the above Disclaimer and U.S. Government End Users Notice. 
34 | */ 35 | 36 | 37 | 38 | #define IMUL(a, b) __mul24(a, b) 39 | 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Copy input data array to the upper left corner and pad by border values 43 | //////////////////////////////////////////////////////////////////////////////// 44 | texture texData; 45 | 46 | __global__ void padData( 47 | float *d_PaddedData, 48 | int fftW, 49 | int fftH, 50 | int dataW, 51 | int dataH, 52 | int featureDim, 53 | int kernelW, 54 | int kernelH, 55 | int kernelX, 56 | int kernelY 57 | ){ 58 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 59 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 60 | const int borderW = dataW + kernelX; 61 | const int borderH = dataH + kernelY; 62 | int dx; 63 | int dy; 64 | 65 | if(x < fftW && y < fftH){ 66 | if(x < dataW) dx = x; 67 | if(y < dataH) dy = y; 68 | if(x >= dataW && x < borderW) dx = dataW - 1; 69 | if(y >= dataH && y < borderH) dy = dataH - 1; 70 | if(x >= borderW) dx = 0; 71 | if(y >= borderH) dy = 0; 72 | 73 | d_PaddedData[IMUL(y, fftW) + x] = 74 | tex2D(texData, (float)dx + 0.5f, (float)dy + 0.5f); 75 | } 76 | } 77 | 78 | 79 | 80 | //////////////////////////////////////////////////////////////////////////////// 81 | // Modulate Fourier image of padded data by Fourier image of padded kernel 82 | // and normalize by FFT size 83 | //////////////////////////////////////////////////////////////////////////////// 84 | __device__ void complexMulAndScale(Complex& a, Complex b, float c){ 85 | Complex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; 86 | a = t; 87 | } 88 | 89 | __global__ void modulateAndNormalize( 90 | Complex *fft_PaddedData, 91 | Complex *fft_PaddedKernel, 92 | int dataN 93 | ){ 94 | const int tid = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 95 | const int threadN = IMUL(blockDim.x, gridDim.x); 96 | const float q = 1.0f / (float)dataN; 97 | 98 | for(int i = tid; i < dataN; i += threadN) 99 | complexMulAndScale(fft_PaddedData[i], fft_PaddedKernel[i], q); 100 | } 101 | -------------------------------------------------------------------------------- /src/cudaConvFFTData.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "mex.h" 4 | #include "gpu/mxGPUArray.h" 5 | // #include "common/helper_cuda.h" 6 | #include "cudaConvFFTData.h" 7 | #include "cudaConvFFTData.cuh" 8 | 9 | static bool debug = false; 10 | 11 | enum OUT_INDEX{ 12 | CONVOLUTION_CELL_INDEX 13 | }; 14 | 15 | enum IN_INDEX{ 16 | FFT_DATA_INDEX, 17 | KERNLE_CELL_INDEX, 18 | THREAD_SIZE_INDEX // Optional 19 | }; 20 | 21 | //////////////////////////////////////////////////////////////////////////////// 22 | // Mex Entry 23 | //////////////////////////////////////////////////////////////////////////////// 24 | void mexFunction(int nlhs, mxArray *plhs[], 25 | int nrhs, mxArray const *prhs[]) 26 | { 27 | /* Declare all variables.*/ 28 | const mxGPUArray *mxFFTData; 29 | const mxGPUArray *mxKernel; 30 | mxGPUArray *mxFFTKernel; 31 | mxGPUArray *mxConvolution; 32 | mxArray *convolutionResult; 33 | 34 | /* cufftComplex is float2 */ 35 | const cufftComplex *d_CFFT_DATA; 36 | cufftComplex *d_CFFT_KERNEL; 37 | cufftComplex *d_FFTEProd; 38 | 39 | float *d_CONVOLUTION; 40 | float *d_IFFTEProd; 41 | 42 | float *h_Kernel; 43 | float *h_CONVOLUTION; 44 | float *d_Kernel; 45 | float *d_PaddedKernel; 46 | 47 | char const * const errId = "cudaConvFFTData:InvalidInput"; 48 | 49 | /* Choose a reasonably sized number 
of threads for the block. */ 50 | int THREAD_PER_BLOCK_H = 16; 51 | int THREAD_PER_BLOCK_W = 8; 52 | int THREAD_PER_BLOCK_D = 8; 53 | int THREAD_PER_BLOCK_2D = 32; 54 | 55 | const mwSize * mxKernel_Dim; 56 | const mwSize * mxFFT_Dim; 57 | // int MblocksPerGrid, NblocksPerGrid; 58 | int KERNEL_H, KERNEL_W, N_KERNEL, 59 | CFFT_H, CFFT_W, FFT_H, FFT_W, FEATURE_DIM, 60 | KERNEL_SIZE, CFFT_SIZE, FFT_SIZE, CONV_SIZE; 61 | 62 | /* Initialize the MathWorks GPU API. */ 63 | // If initialized mxInitGPU do nothing 64 | if (mxInitGPU() != MX_GPU_SUCCESS) 65 | mexErrMsgTxt("mxInitGPU fail"); 66 | 67 | /* Throw an error if the input is not a GPU array. */ 68 | if ( (nrhs < (KERNLE_CELL_INDEX + 1)) || (nrhs > (THREAD_SIZE_INDEX + 1) ) || !mxIsGPUArray(prhs[FFT_DATA_INDEX]) ) 69 | mexErrMsgIdAndTxt(errId, "The data must be FFT-ed real array in GPU"); 70 | 71 | if (( nrhs > THREAD_SIZE_INDEX) && mxGetNumberOfElements(prhs[THREAD_SIZE_INDEX]) != 4) 72 | mexErrMsgIdAndTxt(errId, "CUDA Thread Size must be 4 integers : THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nYou must choose size such that total thread will not be larger than MaxThreadsPerBlock"); 73 | 74 | if ( nrhs > THREAD_SIZE_INDEX ){ 75 | const double* threadSize = (double *)mxGetData(prhs[THREAD_SIZE_INDEX]); 76 | THREAD_PER_BLOCK_H = (int)threadSize[0]; 77 | THREAD_PER_BLOCK_W = (int)threadSize[1]; 78 | THREAD_PER_BLOCK_D = (int)threadSize[2]; 79 | THREAD_PER_BLOCK_2D = (int)threadSize[3]; 80 | if(debug) fprintf(stderr,"Thread size: H=%d, W=%d, D=%d, 2D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D); 81 | } 82 | 83 | // cudaDeviceProp dev; 84 | // cudaGetDeviceProperties(&dev,0); 85 | // int success = checkDeviceProp(dev); 86 | 87 | mxFFTData = mxGPUCreateFromMxArray(prhs[FFT_DATA_INDEX]); 88 | mxFFT_Dim = mxGPUGetDimensions(mxFFTData); 89 | 90 | // FFT Dim 91 | // In CUDA, R2C fft will create only N/2 + 1 points. This is due to the Hermitian symmetry of the points. 
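    // Worked example of the size bookkeeping below (illustrative): a real
    // 80 x 16 x K array transformed with a batched R2C FFT arrives here as a
    // 41 x 16 x K cufftComplex array, so the full height is recovered as
    // FFT_H = (41 - 1) * 2 = 80 while the width stays 16. This recovery
    // assumes the un-transformed FFT height was even.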
92 | CFFT_H = mxFFT_Dim[0]; 93 | CFFT_W = mxFFT_Dim[1]; 94 | 95 | FFT_H = (mxFFT_Dim[0] - 1) * 2; 96 | FFT_W = mxFFT_Dim[1]; 97 | 98 | FEATURE_DIM = mxFFT_Dim[2]; 99 | 100 | CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2); 101 | FFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float); 102 | CONV_SIZE = FFT_W * FFT_H * sizeof(float); 103 | 104 | if(debug) fprintf(stderr,"FFT Data size: h=%d, w=%d, f=%d\n", FFT_H, FFT_W, FEATURE_DIM); 105 | 106 | if (mxGetClassID(prhs[KERNLE_CELL_INDEX]) != mxCELL_CLASS) 107 | mexErrMsgIdAndTxt(errId, "Kernel must be a cell array"); 108 | 109 | mwSize nKernel = mxGetNumberOfElements(prhs[KERNLE_CELL_INDEX]); 110 | N_KERNEL = (int)nKernel; 111 | plhs[CONVOLUTION_CELL_INDEX] = mxCreateCellMatrix(1, N_KERNEL); 112 | 113 | if(debug) fprintf(stderr,"N Kernel: %d\n", N_KERNEL); 114 | 115 | 116 | /* Set block size and thread size */ 117 | dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D); 118 | dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x), 119 | iDivUp(FFT_H, threadBlock3D.y), 120 | iDivUp(FEATURE_DIM, threadBlock3D.z)); 121 | 122 | dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D); 123 | dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x), 124 | iDivUp(FFT_H, threadBlock2D.y)); 125 | 126 | 127 | /* Pad Kernel */ 128 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedKernel, FFT_SIZE)); 129 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_IFFTEProd, FFT_SIZE)); 130 | 131 | /* Create a GPUArray to hold the result and get its underlying pointer. */ 132 | // mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize)); 133 | // FFT_dims[0] = FFT_H; 134 | // FFT_dims[1] = FFT_W; 135 | // FFT_dims[2] = FEATURE_DIM; 136 | 137 | d_CFFT_DATA = (cufftComplex *)mxGPUGetDataReadOnly(mxFFTData); 138 | 139 | // mxConvolution = mxGPUCreateGPUArray(2, 140 | // FFT_dims, // Third element will not be accessed 141 | // mxSINGLE_CLASS, 142 | // mxREAL, 143 | // MX_GPU_DO_NOT_INITIALIZE); 144 | 145 | // d_CONVOLUTION = (cufftReal *)(mxGPUGetData(mxConvolution)); 146 | 147 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE)); 148 | 149 | // mxFFTKernel = mxGPUCreateGPUArray(3, 150 | // mxFFT_Dim, 151 | // mxSINGLE_CLASS, 152 | // mxCOMPLEX, 153 | // MX_GPU_DO_NOT_INITIALIZE); 154 | 155 | // d_CFFT_KERNEL = (cufftComplex *)(mxGPUGetData(mxFFTKernel)); 156 | 157 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE)); 158 | 159 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE)); 160 | 161 | /* FFT Kernel */ 162 | int BATCH = FEATURE_DIM; 163 | int FFT_Dims[] = { FFT_W, FFT_H }; 164 | int CFFT_Dims[] = { CFFT_W, CFFT_H }; 165 | 166 | int idist = FFT_W * FFT_H; 167 | int odist = CFFT_W * CFFT_H; 168 | 169 | cufftHandle FFTplan_R2C, FFTplan_C2R; 170 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C, 171 | 2, // rank 172 | FFT_Dims, 173 | FFT_Dims, 1, idist, // *inembed, istride, idist 174 | CFFT_Dims, 1, odist, // *onembed, ostride, odist 175 | CUFFT_R2C, 176 | BATCH)); // batch 177 | 178 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_C2R, 179 | 2, // rank 180 | FFT_Dims, 181 | CFFT_Dims, 1, odist, // *inembed, istride, idist 182 | FFT_Dims, 1, idist, // *onembed, ostride, odist 183 | CUFFT_C2R, 184 | BATCH)); // batch 185 | 186 | mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize)); 187 | FFT_dims[0] = FFT_H; 188 | FFT_dims[1] = FFT_W; 189 | 190 | /* For each kernel iterate */ 191 | for (int kernelIdx = 0; kernelIdx < N_KERNEL; kernelIdx++){ 192 | 193 | // Get Kernel Data 194 | 
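        // Per-kernel pipeline (summary of the steps below): read the kernel from
        // the cell array (host array or gpuArray), zero-pad it to FFT_H x FFT_W,
        // run a batched R2C FFT, multiply element-wise with the pre-FFT-ed data
        // scaled by 1/(FFT_W*FFT_H), inverse-transform with C2R, sum the
        // per-feature planes, and copy the single-channel result back to host.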
const mxArray *mxCurrentCell = mxGetCell(prhs[KERNLE_CELL_INDEX], kernelIdx); 195 | if (!mxIsGPUArray(mxCurrentCell)){ 196 | 197 | if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 ) 198 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1"); 199 | 200 | h_Kernel = (float *)mxGetData(mxCurrentCell); 201 | mxKernel_Dim = mxGetDimensions(mxCurrentCell); 202 | 203 | // Kernel dimensions 204 | KERNEL_H = mxKernel_Dim[0]; 205 | KERNEL_W = mxKernel_Dim[1]; 206 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float); 207 | 208 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Kernel, KERNEL_SIZE)); 209 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice)); 210 | mxKernel = NULL; 211 | }else{ // Kernel is GPU Array 212 | mxKernel = mxGPUCreateFromMxArray(mxCurrentCell); 213 | 214 | if ( mxGPUGetClassID(mxKernel) != mxSINGLE_CLASS || mxGPUGetNumberOfDimensions(mxKernel) != 3 ) 215 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1"); 216 | 217 | mxKernel_Dim = mxGPUGetDimensions(mxKernel); 218 | 219 | // Kernel dimensions 220 | KERNEL_H = mxKernel_Dim[0]; 221 | KERNEL_W = mxKernel_Dim[1]; 222 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float); 223 | 224 | d_Kernel = (float *)mxGPUGetDataReadOnly(mxKernel); 225 | } 226 | 227 | if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W); 228 | 229 | if (FEATURE_DIM != mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ){ 230 | mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size"); 231 | } 232 | 233 | padData<<>>( 234 | d_PaddedKernel, 235 | d_Kernel, 236 | FFT_W, 237 | FFT_H, 238 | KERNEL_W, 239 | KERNEL_H, 240 | FEATURE_DIM 241 | ); 242 | 243 | 244 | CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedKernel, d_CFFT_KERNEL)); 245 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize()); 246 | 247 | if(debug) fprintf(stderr,"FFT done\n"); 248 | 249 | 250 | /* Hadamard product, Element-wise multiplication in frequency domain */ 251 | /* If execute the following, second compile of this file create MATLAB error */ 252 | elementwiseProductAndNormalize<<>>( 253 | d_FFTEProd, // out 254 | d_CFFT_DATA, // in data 255 | d_CFFT_KERNEL, // in kernel 256 | CFFT_H, 257 | CFFT_W, 258 | FEATURE_DIM, 259 | 1.0f / (FFT_W * FFT_H) 260 | ); 261 | 262 | CUFFT_SAFE_CALL(cufftExecC2R(FFTplan_C2R, d_FFTEProd, d_IFFTEProd)); 263 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize()); 264 | 265 | sumAlongFeatures<<>>( 266 | d_CONVOLUTION, 267 | d_IFFTEProd, 268 | FFT_H, 269 | FFT_W, 270 | FEATURE_DIM 271 | ); 272 | 273 | 274 | 275 | convolutionResult = mxCreateNumericArray(2, FFT_dims, mxSINGLE_CLASS, mxREAL); 276 | h_CONVOLUTION = (float *)mxGetData(convolutionResult); 277 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(h_CONVOLUTION, d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost)); 278 | 279 | mxSetCell(plhs[CONVOLUTION_CELL_INDEX], kernelIdx, convolutionResult); 280 | 281 | if(mxKernel == NULL) cudaFree(d_Kernel); 282 | } 283 | // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel); 284 | 285 | /* 286 | * The mxGPUArray pointers are host-side structures that refer to device 287 | * data. These must be destroyed before leaving the MEX function. 
288 | */ 289 | mxGPUDestroyGPUArray(mxFFTData); 290 | // mxGPUDestroyGPUArray(mxConvolution); 291 | // mxGPUDestroyGPUArray(mxFFTKernel); 292 | 293 | cufftDestroy(FFTplan_R2C); 294 | cufftDestroy(FFTplan_C2R); 295 | 296 | if(mxKernel != NULL) mxGPUDestroyGPUArray(mxKernel); 297 | 298 | cudaFree(d_PaddedKernel); 299 | cudaFree(d_IFFTEProd); 300 | cudaFree(d_CONVOLUTION); 301 | cudaFree(d_CFFT_KERNEL); 302 | cudaFree(d_FFTEProd); 303 | 304 | 305 | mxFree(FFT_dims); 306 | } 307 | -------------------------------------------------------------------------------- /src/cudaConvFFTData.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_CONV_FFT_DATA_CUH 2 | #define CUDA_CONV_FFT_DATA_CUH 3 | 4 | /* 5 | * Device Code 6 | */ 7 | 8 | //////////////////////////////////////////////////////////////////////////////// 9 | // Pad data with zeros, 10 | //////////////////////////////////////////////////////////////////////////////// 11 | __global__ void padData( 12 | float *d_PaddedData, 13 | const float *d_Data, 14 | int fftW, 15 | int fftH, 16 | int dataW, 17 | int dataH, 18 | int FEATURE_DIM 19 | ){ 20 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 21 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 22 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z; 23 | 24 | if(x < fftW && y < fftH && z < FEATURE_DIM){ 25 | if(x < dataW && y < dataH) 26 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 27 | d_Data[ IMUL(z, IMUL(dataH, dataW)) + IMUL(x, dataH ) + y]; 28 | else 29 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 0; 30 | } 31 | } 32 | 33 | //////////////////////////////////////////////////////////////////////////////// 34 | // Modulate Fourier image of padded data by Fourier image of padded kernel 35 | // and normalize by FFT size 36 | //////////////////////////////////////////////////////////////////////////////// 37 | __device__ void complexMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){ 38 | const cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; 39 | out = t; 40 | } 41 | 42 | __device__ void complexConjMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){ 43 | const cufftComplex t = {c * (a.x * b.x + a.y * b.y), c * (a.y * b.x - a.x * b.y)}; 44 | out = t; 45 | } 46 | 47 | __global__ void elementwiseProductAndNormalize( 48 | cufftComplex *fft_Output, 49 | const cufftComplex *fft_PaddedData, 50 | const cufftComplex *fft_PaddedKernel, 51 | int FFT_H, 52 | int FFT_W, 53 | int FEATURE_DIM, 54 | float scale 55 | ){ 56 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 57 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 58 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z; 59 | 60 | if(x < FFT_W && y < FFT_H && z < FEATURE_DIM){ 61 | // int i = IMUL(z, IMUL(FFT_W, FFT_H)) + IMUL(FFT_H, x) + y; 62 | int i = z * FFT_W * FFT_H + FFT_H * x + y; 63 | // complexConjMulAndScale(fft_Output[i], fft_PaddedData[i], fft_PaddedKernel[i], scale); 64 | fft_Output[i].x = scale * (fft_PaddedData[i].x * fft_PaddedKernel[i].x - fft_PaddedData[i].y * fft_PaddedKernel[i].y); 65 | fft_Output[i].y = scale * (fft_PaddedData[i].y * fft_PaddedKernel[i].x + fft_PaddedData[i].x * fft_PaddedKernel[i].y); 66 | } 67 | } 68 | 69 | /* Support in-place computation, i.e. 
input and output can be the same */ 70 | __global__ void sumAlongFeatures( 71 | float *convolutionResult, 72 | const float *convolutionPerFeature, 73 | int FFT_H, 74 | int FFT_W, 75 | int FEATURE_DIM 76 | ){ 77 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 78 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 79 | 80 | if(x < FFT_W && y < FFT_H){ 81 | const int result_i = IMUL(FFT_H, x) + y; 82 | const int N = IMUL(FFT_W, FFT_H); 83 | 84 | float acc = convolutionPerFeature[result_i]; 85 | int zN = N; 86 | for (int z = 1; z < FEATURE_DIM; z++){ 87 | acc += convolutionPerFeature[zN + result_i]; 88 | zN += N; 89 | } 90 | convolutionResult[result_i] = acc; 91 | } 92 | } 93 | 94 | 95 | #endif -------------------------------------------------------------------------------- /src/cudaConvFFTData.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_CONV_FFT_DATA 2 | #define CUDA_CONV_FFT_DATA 3 | 4 | # define IMUL(a, b) __mul24(a, b) 5 | 6 | # define CUDA_SAFE_CALL_NO_SYNC( call) do { \ 7 | cudaError err = call; \ 8 | if( cudaSuccess != err) { \ 9 | printf("Cuda error in file '%s' in line %i Error : %d.\n", \ 10 | __FILE__, __LINE__, err); \ 11 | exit(EXIT_FAILURE); \ 12 | } } while (0) 13 | 14 | # define CUDA_SAFE_CALL( call) do { \ 15 | CUDA_SAFE_CALL_NO_SYNC(call); \ 16 | cudaError err = cudaThreadSynchronize(); \ 17 | if( cudaSuccess != err) { \ 18 | printf("Cuda error in file '%s' in line %i Error : %d.\n", \ 19 | __FILE__, __LINE__,err); \ 20 | exit(EXIT_FAILURE); \ 21 | } } while (0) 22 | 23 | # define CUFFT_SAFE_CALL( call) do { \ 24 | cufftResult err = call; \ 25 | if( CUFFT_SUCCESS != err) { \ 26 | printf("CUFFT error in file '%s' in line %i Error : %d.\n", \ 27 | __FILE__, __LINE__,err); \ 28 | exit(EXIT_FAILURE); \ 29 | } } while (0) 30 | 31 | 32 | //////////////////////////////////////////////////////////////////////////////// 33 | // Helper functions 34 | //////////////////////////////////////////////////////////////////////////////// 35 | //Round a / b to nearest higher integer value 36 | int iDivUp(int a, int b){ 37 | return (a % b != 0) ? (a / b + 1) : (a / b); 38 | } 39 | 40 | //Align a to nearest higher multiple of b 41 | int iAlignUp(int a, int b){ 42 | return (a % b != 0) ? 
(a - a % b + b) : a; 43 | } 44 | 45 | 46 | 47 | int checkDeviceProp ( cudaDeviceProp p ) { 48 | int support = p.canMapHostMemory; 49 | 50 | if(support == 0) printf( "%s does not support mapping host memory.\n", p.name); 51 | else printf( "%s supports mapping host memory.\n",p.name); 52 | 53 | support = p.concurrentKernels; 54 | if(support == 0) printf("%s does not support concurrent kernels\n", p.name); 55 | else printf("%s supports concurrent kernels\n",p.name); 56 | 57 | support = p.kernelExecTimeoutEnabled; 58 | if(support == 0) printf("%s kernelExecTimeout disabled\n", p.name); 59 | else printf("%s kernelExecTimeout enabled\n",p.name); 60 | 61 | printf("compute capability : %d.%d \n", p.major,p.minor); 62 | printf("number of multiprocessors : %d \n", p.multiProcessorCount); 63 | 64 | return support; 65 | } 66 | 67 | int computeFFTsize(int dataSize){ 68 | //Highest non-zero bit position of dataSize 69 | int hiBit; 70 | //Neares lower and higher powers of two numbers for dataSize 71 | unsigned int lowPOT, hiPOT; 72 | 73 | //Align data size to a multiple of half-warp 74 | //in order to have each line starting at properly aligned addresses 75 | //for coalesced global memory writes in padKernel() and padData() 76 | dataSize = iAlignUp(dataSize, 16); 77 | 78 | //Find highest non-zero bit 79 | for(hiBit = 31; hiBit >= 0; hiBit--) 80 | if(dataSize & (1U << hiBit)) break; 81 | 82 | //No need to align, if already power of two 83 | lowPOT = 1U << hiBit; 84 | if(lowPOT == dataSize) return dataSize; 85 | 86 | //Align to a nearest higher power of two, if the size is small enough, 87 | //else align only to a nearest higher multiple of 512, 88 | //in order to save computation and memory bandwidth 89 | hiPOT = 1U << (hiBit + 1); 90 | //if(hiPOT <= 1024) 91 | return hiPOT; 92 | //else 93 | // return iAlignUp(dataSize, 512); 94 | } 95 | 96 | int computeFFTsize16(int dataSize){ 97 | // Compute the multiple of 16 98 | int mod = dataSize / 16; 99 | int rem = dataSize % 16; 100 | 101 | return (mod * 16) + ((rem > 0)?16:0); 102 | } 103 | 104 | #endif -------------------------------------------------------------------------------- /src/cudaConvFFTDataStreams.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "mex.h" 4 | #include "gpu/mxGPUArray.h" 5 | // #include "common/helper_cuda.h" 6 | #include "cudaConvFFTDataStream.h" 7 | 8 | 9 | const int N_MAX_PARALLEL = 32; 10 | static bool debug = true; 11 | 12 | /* 13 | * Device Code 14 | */ 15 | 16 | //////////////////////////////////////////////////////////////////////////////// 17 | // Pad data with zeros, 18 | //////////////////////////////////////////////////////////////////////////////// 19 | __global__ void padData( 20 | float *d_PaddedData, 21 | const float *d_Data, 22 | int fftW, 23 | int fftH, 24 | int dataW, 25 | int dataH, 26 | int FEATURE_DIM 27 | ){ 28 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 29 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 30 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z; 31 | 32 | if(x < fftW && y < fftH && z < FEATURE_DIM){ 33 | if(x < dataW && y < dataH) 34 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 35 | d_Data[ IMUL(z, IMUL(dataH, dataW)) + IMUL(x, dataH ) + y]; 36 | else 37 | d_PaddedData[IMUL(z, IMUL(fftW, fftH)) + IMUL(x, fftH) + y] = 0; 38 | } 39 | } 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Modulate Fourier image of padded data by 
Fourier image of padded kernel 43 | // and normalize by FFT size 44 | //////////////////////////////////////////////////////////////////////////////// 45 | __device__ void complexMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){ 46 | const cufftComplex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)}; 47 | out = t; 48 | } 49 | 50 | __device__ void complexConjMulAndScale(cufftComplex &out, cufftComplex a, cufftComplex b, float c){ 51 | const cufftComplex t = {c * (a.x * b.x + a.y * b.y), c * (a.y * b.x - a.x * b.y)}; 52 | out = t; 53 | } 54 | 55 | __global__ void elementwiseProductAndNormalize( 56 | cufftComplex *fft_Output, 57 | const cufftComplex *fft_PaddedData, 58 | const cufftComplex *fft_PaddedKernel, 59 | int FFT_H, 60 | int FFT_W, 61 | int FEATURE_DIM, 62 | float scale 63 | ){ 64 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 65 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 66 | const int z = IMUL(blockDim.z, blockIdx.z) + threadIdx.z; 67 | 68 | if(x < FFT_W && y < FFT_H && z < FEATURE_DIM){ 69 | // int i = IMUL(z, IMUL(FFT_W, FFT_H)) + IMUL(FFT_H, x) + y; 70 | int i = z * FFT_W * FFT_H + FFT_H * x + y; 71 | // complexConjMulAndScale(fft_Output[i], fft_PaddedData[i], fft_PaddedKernel[i], scale); 72 | fft_Output[i].x = scale * (fft_PaddedData[i].x * fft_PaddedKernel[i].x - fft_PaddedData[i].y * fft_PaddedKernel[i].y); 73 | fft_Output[i].y = scale * (fft_PaddedData[i].y * fft_PaddedKernel[i].x + fft_PaddedData[i].x * fft_PaddedKernel[i].y); 74 | } 75 | } 76 | 77 | /* Support in-place computation, i.e. input and output can be the same */ 78 | __global__ void sumAlongFeatures( 79 | float *convolutionResult, 80 | const float *convolutionPerFeature, 81 | int FFT_H, 82 | int FFT_W, 83 | int FEATURE_DIM 84 | ){ 85 | const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x; 86 | const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y; 87 | 88 | if(x < FFT_W && y < FFT_H){ 89 | const int result_i = IMUL(FFT_H, x) + y; 90 | const int N = IMUL(FFT_W, FFT_H); 91 | 92 | convolutionResult[result_i] = convolutionPerFeature[result_i]; 93 | for (int z = 1; z < FEATURE_DIM; z++){ 94 | convolutionResult[result_i] += 95 | convolutionPerFeature[IMUL(z, N) + result_i]; 96 | } 97 | } 98 | } 99 | 100 | /* 101 | * Host code 102 | */ 103 | 104 | //////////////////////////////////////////////////////////////////////////////// 105 | // Helper functions 106 | //////////////////////////////////////////////////////////////////////////////// 107 | //Round a / b to nearest higher integer value 108 | int iDivUp(int a, int b){ 109 | return (a % b != 0) ? (a / b + 1) : (a / b); 110 | } 111 | 112 | //Align a to nearest higher multiple of b 113 | int iAlignUp(int a, int b){ 114 | return (a % b != 0) ? 
(a - a % b + b) : a; 115 | } 116 | 117 | 118 | //////////////////////////////////////////////////////////////////////////////// 119 | // Mex Entry 120 | //////////////////////////////////////////////////////////////////////////////// 121 | void mexFunction(int nlhs, mxArray *plhs[], 122 | int nrhs, mxArray const *prhs[]) 123 | { 124 | ConvPlan plan[N_MAX_PARALLEL]; 125 | 126 | /* Declare all variables.*/ 127 | const mxGPUArray *mxFFTData; 128 | const mxGPUArray *mxKernel; 129 | mxGPUArray *mxFFTKernel; 130 | mxGPUArray *mxConvolution; 131 | 132 | cufftComplex **d_CFFT_DATA_PER_GPU; 133 | 134 | /* concurrent kernel executions */ 135 | int N_GPU; 136 | int N_BATCH_PER_GPU = 2; 137 | 138 | char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput"; 139 | 140 | /* Choose a reasonably sized number of threads for the block. */ 141 | int THREAD_PER_BLOCK_H = 16; 142 | int THREAD_PER_BLOCK_W = 8; 143 | int THREAD_PER_BLOCK_D = 8; 144 | int THREAD_PER_BLOCK_2D = 32; 145 | 146 | // const mwSize * mxKernel_Dim; 147 | const mwSize * mxFFT_Dim; 148 | // int MblocksPerGrid, NblocksPerGrid; 149 | int KERNEL_H, KERNEL_W, N_KERNEL, 150 | CFFT_H, CFFT_W, FFT_H, FFT_W, FEATURE_DIM, 151 | KERNEL_SIZE, CFFT_SIZE, FFT_SIZE, CONV_SIZE; 152 | 153 | int gpuIdx, streamIdx, planIdx; 154 | 155 | /* Initialize the MathWorks GPU API. */ 156 | mxInitGPU(); 157 | 158 | /* Throw an error if the input is not a GPU array. */ 159 | if ( (nrhs < 2) || (nrhs > 3) || !mxIsGPUArray(prhs[0]) ) 160 | mexErrMsgIdAndTxt(errId, "The data must be FFT-ed real array in GPU"); 161 | 162 | if (( nrhs == 3) && mxGetNumberOfElements(prhs[2]) != 4) 163 | mexErrMsgIdAndTxt(errId, "CUDA Thread Size must be 4 integers : THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nYou must choose size such that total thread will not be larger than MaxThreadsPerBlock"); 164 | 165 | if ( nrhs == 3 ){ 166 | const double* threadSize = (double *)mxGetData(prhs[2]); 167 | THREAD_PER_BLOCK_H = (int)threadSize[0]; 168 | THREAD_PER_BLOCK_W = (int)threadSize[1]; 169 | THREAD_PER_BLOCK_D = (int)threadSize[2]; 170 | THREAD_PER_BLOCK_2D = (int)threadSize[3]; 171 | if(debug) printf("Thread size: H=%d, W=%d, D=%d, D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D); 172 | } 173 | 174 | cudaDeviceProp dev; 175 | cudaGetDeviceProperties(&dev,0); 176 | int success = checkDeviceProp(dev); 177 | 178 | mxFFTData = mxGPUCreateFromMxArray(prhs[0]); 179 | mxFFT_Dim = mxGPUGetDimensions(mxFFTData); 180 | 181 | // FFT Dim 182 | // In CUDA, R2C fft will create only N/2 + 1 points. This is due to the Hermitian symmetry of the points. 
183 | CFFT_H = mxFFT_Dim[0]; 184 | CFFT_W = mxFFT_Dim[1]; 185 | 186 | FFT_H = (mxFFT_Dim[0] - 1) * 2; 187 | FFT_W = mxFFT_Dim[1]; 188 | 189 | FEATURE_DIM = mxFFT_Dim[2]; 190 | 191 | CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2); 192 | FFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float); 193 | CONV_SIZE = FFT_W * FFT_H * sizeof(float); 194 | 195 | if(debug) printf("FFT Data size: h=%d, w=%d, f=%d\n", FFT_H, FFT_W, FEATURE_DIM); 196 | 197 | if (mxGetClassID(prhs[1]) != mxCELL_CLASS) 198 | mexErrMsgIdAndTxt(errId, "Kernel must be a cell array"); 199 | 200 | mwSize nKernel = mxGetNumberOfElements(prhs[1]); 201 | N_KERNEL = (int)nKernel; 202 | plhs[0] = mxCreateCellMatrix(1, N_KERNEL); 203 | 204 | if(debug) printf("N Kernel: %d\n", N_KERNEL); 205 | 206 | 207 | /* Set block size and thread size */ 208 | dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D); 209 | dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x), 210 | iDivUp(FFT_H, threadBlock3D.y), 211 | iDivUp(FEATURE_DIM, threadBlock3D.z)); 212 | 213 | dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D); 214 | dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x), 215 | iDivUp(FFT_H, threadBlock2D.y)); 216 | 217 | 218 | /* Find number of cuda capable devices */ 219 | CUDA_SAFE_CALL(cudaGetDeviceCount(&N_GPU)); 220 | if(debug) printf( "CUDA-capable device count: %i\n", N_GPU); 221 | 222 | CUDA_SAFE_CALL(cudaSetDevice(0)); 223 | d_CFFT_DATA_PER_GPU = (cufftComplex **)malloc(N_GPU * sizeof(float)); 224 | 225 | /* Pad Kernel */ 226 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_PaddedKernel, FFT_SIZE)); 227 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_IFFTEProd, FFT_SIZE)); 228 | 229 | /* Create a GPUArray to hold the result and get its underlying pointer. */ 230 | mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize)); 231 | FFT_dims[0] = FFT_H; 232 | FFT_dims[1] = FFT_W; 233 | FFT_dims[2] = FEATURE_DIM; 234 | 235 | d_CFFT_DATA_PER_GPU[0] = (cufftComplex *)mxGPUGetDataReadOnly(mxFFTData); 236 | 237 | // mxConvolution = mxGPUCreateGPUArray(2, 238 | // FFT_dims, // Third element will not be accessed 239 | // mxSINGLE_CLASS, 240 | // mxREAL, 241 | // MX_GPU_DO_NOT_INITIALIZE); 242 | 243 | // d_CONVOLUTION = (cufftReal *)(mxGPUGetData(mxConvolution)); 244 | 245 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE)); 246 | 247 | // mxFFTKernel = mxGPUCreateGPUArray(3, 248 | // mxFFT_Dim, 249 | // mxSINGLE_CLASS, 250 | // mxCOMPLEX, 251 | // MX_GPU_DO_NOT_INITIALIZE); 252 | 253 | // d_CFFT_KERNEL = (cufftComplex *)(mxGPUGetData(mxFFTKernel)); 254 | 255 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE)); 256 | 257 | // CUDA_SAFE_CALL(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE)); 258 | 259 | /* FFT Kernel */ 260 | int BATCH = FEATURE_DIM; 261 | int FFT_Dims[] = { FFT_W, FFT_H }; 262 | int CFFT_Dims[] = { CFFT_W, CFFT_H }; 263 | 264 | int idist = FFT_W * FFT_H; 265 | int odist = CFFT_W * CFFT_H; 266 | 267 | // mwSize *FFT_dims = (mwSize *)mxMalloc(2 * sizeof(mwSize)); 268 | // FFT_dims[0] = FFT_H; 269 | // FFT_dims[1] = FFT_W; 270 | 271 | N_GPU = 1; 272 | //Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked) 273 | for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++) 274 | { 275 | // Set GPU 276 | CUDA_SAFE_CALL(cudaSetDevice(gpuIdx)); 277 | // if (gpuIdx != 0) CUDA_SAFE_CALL(); 278 | /* COPY mxFFTData to individual GPU */ 279 | if (gpuIdx > 0) { 280 | if(debug) printf("start inter gpu copy from 0 to %d\n", gpuIdx); 281 | 
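            // The FFT-ed data initially lives on device 0 (it arrived as a MATLAB
            // gpuArray). Each additional device gets its own copy: a CFFT_SIZE
            // buffer is allocated below and filled with cudaMemcpyPeerAsync, which
            // transfers directly between devices without staging through the host.
            // The copy is asynchronous and is enqueued on plan[0].stream.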
CUDA_SAFE_CALL(cudaMalloc((void **)&d_CFFT_DATA_PER_GPU[gpuIdx], CFFT_SIZE)); 282 | CUDA_SAFE_CALL(cudaMemcpyPeerAsync(d_CFFT_DATA_PER_GPU[gpuIdx], 283 | gpuIdx, 284 | d_CFFT_DATA_PER_GPU[0], 285 | 0, 286 | CFFT_SIZE, 287 | plan[0].stream)); 288 | if(debug) printf("end gpu copy from 0 to %d\n", gpuIdx); 289 | } 290 | 291 | // Set Streams 292 | for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){ 293 | planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx; 294 | 295 | CUDA_SAFE_CALL(cudaStreamCreate(&plan[planIdx].stream)); 296 | 297 | // Cufft Plans 298 | CUFFT_SAFE_CALL(cufftPlanMany(&plan[planIdx].FFTplan_R2C, 299 | 2, // rank 300 | FFT_Dims, 301 | FFT_Dims, 1, idist, // *inembed, istride, idist 302 | CFFT_Dims, 1, odist, // *onembed, ostride, odist 303 | CUFFT_R2C, 304 | BATCH)); // batch 305 | cufftSetStream(plan[planIdx].FFTplan_R2C, plan[planIdx].stream); 306 | 307 | CUFFT_SAFE_CALL(cufftPlanMany(&plan[planIdx].FFTplan_C2R, 308 | 2, // rank 309 | FFT_Dims, 310 | CFFT_Dims, 1, odist, // *inembed, istride, idist 311 | FFT_Dims, 1, idist, // *onembed, ostride, odist 312 | CUFFT_C2R, 313 | BATCH)); // batch 314 | cufftSetStream(plan[planIdx].FFTplan_C2R, plan[planIdx].stream); 315 | 316 | plan[planIdx].d_CFFT_DATA = d_CFFT_DATA_PER_GPU[gpuIdx]; 317 | 318 | //Allocate memory 319 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_CFFT_KERNEL, CFFT_SIZE)); 320 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_FFTEProd, CFFT_SIZE)); 321 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_CONVOLUTION, CONV_SIZE)); 322 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_IFFTEProd, FFT_SIZE)); 323 | // d_Kernel, dynamically set 324 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_PaddedKernel, FFT_SIZE)); 325 | // h_Kernel, dynamically set 326 | // CUDA_SAFE_CALL(cudaMallocHost((void **)&plan[planIdx].h_CONVOLUTION, CONV_SIZE)); 327 | } 328 | } 329 | 330 | 331 | /* For each kernel iterate */ 332 | int N_PLANS = N_GPU * N_BATCH_PER_GPU; 333 | printf("N Plans %d\n",N_PLANS); 334 | 335 | int kernelIdx = 0; 336 | int lastPlanIdx; 337 | 338 | while(kernelIdx < N_KERNEL){ 339 | if(debug) printf( "Kernel: %d\n",kernelIdx); 340 | 341 | for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++){ 342 | if (kernelIdx >= N_KERNEL) break; 343 | 344 | // Set GPU 345 | CUDA_SAFE_CALL(cudaSetDevice(gpuIdx)); 346 | 347 | // Set Streams 348 | for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){ 349 | planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx; 350 | 351 | // Get Kernel Data 352 | const mxArray *mxCurrentCell = mxGetCell(prhs[1], kernelIdx); 353 | { 354 | if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 ) 355 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1"); 356 | 357 | if(debug) printf("Start plan %d\n", planIdx); 358 | 359 | plan[planIdx].h_Kernel = (float *)mxGetData(mxCurrentCell); 360 | plan[planIdx].mxKernel_Dim = mxGetDimensions(mxCurrentCell); 361 | 362 | // Kernel dimensions 363 | KERNEL_H = plan[planIdx].mxKernel_Dim[0]; 364 | KERNEL_W = plan[planIdx].mxKernel_Dim[1]; 365 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float); 366 | 367 | if(debug) printf("Start copy\n"); 368 | // CUDA_SAFE_CALL(cudaHostRegister(plan[planIdx].h_Kernel, KERNEL_SIZE, cudaHostRegisterPortable)); 369 | // CUDA_SAFE_CALL(cudaHostGetDevicePointer((void **) &plan[planIdx].d_Kernel, (void *)plan[planIdx].h_Kernel, 0)); 370 | CUDA_SAFE_CALL(cudaMalloc((void **)&plan[planIdx].d_Kernel, KERNEL_SIZE)); 371 | 
CUDA_SAFE_CALL(cudaMemcpyAsync(plan[planIdx].d_Kernel, plan[planIdx].h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice, plan[planIdx].stream)); 372 | // CUDA_SAFE_CALL(cudaMemcpy(plan[planIdx].d_Kernel, plan[planIdx].h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice)); 373 | mxKernel = NULL; 374 | } 375 | 376 | if(debug) printf("Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W); 377 | 378 | if (FEATURE_DIM != plan[planIdx].mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ){ 379 | mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size"); 380 | } 381 | 382 | // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream)); 383 | if(debug) printf("Sync before padding\n"); 384 | padData<<<dataBlockGrid3D, threadBlock3D, 0, plan[planIdx].stream>>>( 385 | plan[planIdx].d_PaddedKernel, 386 | plan[planIdx].d_Kernel, 387 | FFT_W, 388 | FFT_H, 389 | KERNEL_W, 390 | KERNEL_H, 391 | FEATURE_DIM 392 | ); 393 | if(debug) printf("Padding done\n"); 394 | 395 | CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream)); 396 | CUFFT_SAFE_CALL(cufftExecR2C(plan[planIdx].FFTplan_R2C, plan[planIdx].d_PaddedKernel, plan[planIdx].d_CFFT_KERNEL)); 397 | // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream)); 398 | 399 | if(debug) printf("FFT done\n"); 400 | 401 | /* Hadamard product, element-wise multiplication in the frequency domain */ 402 | /* If the following is executed, a second compile of this file creates a MATLAB error */ 403 | elementwiseProductAndNormalize<<<dataBlockGrid2D, threadBlock2D, 0, plan[planIdx].stream>>>( 404 | plan[planIdx].d_FFTEProd, // out 405 | plan[planIdx].d_CFFT_DATA, // in data 406 | plan[planIdx].d_CFFT_KERNEL, // in kernel 407 | CFFT_H, 408 | CFFT_W, 409 | FEATURE_DIM, 410 | 1.0f / (FFT_W * FFT_H) 411 | ); 412 | if(debug) printf("Eprod done\n"); 413 | CUFFT_SAFE_CALL(cufftExecC2R(plan[planIdx].FFTplan_C2R, plan[planIdx].d_FFTEProd, plan[planIdx].d_IFFTEProd)); 414 | // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream)); 415 | if(debug) printf("Second fft done\n"); 416 | sumAlongFeatures<<<dataBlockGrid2D, threadBlock2D, 0, plan[planIdx].stream>>>( 417 | plan[planIdx].d_CONVOLUTION, 418 | plan[planIdx].d_IFFTEProd, 419 | FFT_H, 420 | FFT_W, 421 | FEATURE_DIM 422 | ); 423 | if(debug) printf("sum along features done\n"); 424 | // CUDA_SAFE_CALL(cudaHostUnregister(plan[planIdx].h_Kernel)); 425 | 426 | plan[planIdx].convolutionResult = mxCreateNumericArray(2, FFT_dims, mxSINGLE_CLASS, mxREAL); 427 | plan[planIdx].h_CONVOLUTION = (float *)mxGetData(plan[planIdx].convolutionResult); 428 | 429 | // CUDA_SAFE_CALL(cudaHostRegister(plan[planIdx].h_CONVOLUTION, CONV_SIZE, cudaHostRegisterPortable)); 430 | CUDA_SAFE_CALL(cudaMemcpyAsync(plan[planIdx].h_CONVOLUTION, plan[planIdx].d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost, plan[planIdx].stream)); 431 | // CUDA_SAFE_CALL(cudaMemcpy(plan[planIdx].h_CONVOLUTION, plan[planIdx].d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost)); 432 | 433 | if(debug) printf("Copy done\n"); 434 | 435 | // CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream)); 436 | if(debug) printf("Sync done\n"); 437 | 438 | mxSetCell(plhs[0], kernelIdx, plan[planIdx].convolutionResult); 439 | if(debug) printf("Setting Cell done\n"); 440 | // if(debug){ 441 | // for(int i = 0; i < 10; i++) 442 | // printf("%f\n", plan[planIdx].h_CONVOLUTION[i]); 443 | // } 444 | kernelIdx = kernelIdx + 1; 445 | if (kernelIdx >= N_KERNEL) break; 446 | } 447 | } 448 | 449 | lastPlanIdx = planIdx; 450 | if(debug) printf("lastPlanIdx : %d\n", lastPlanIdx); 451 | 452 | for (gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++){ 453 | if (planIdx > lastPlanIdx ) break; 454 | 455 | // Set GPU 456 |
CUDA_SAFE_CALL(cudaSetDevice(gpuIdx)); 457 | 458 | // Set Streams 459 | for (streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){ 460 | planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx; 461 | if (planIdx > lastPlanIdx ) break; 462 | CUDA_SAFE_CALL(cudaStreamSynchronize(plan[planIdx].stream)); 463 | CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_Kernel)); 464 | // CUDA_SAFE_CALL(cudaHostUnregister(plan[planIdx].h_Kernel)); 465 | // CUDA_SAFE_CALL(cudaHostUnregister(plan[planIdx].h_CONVOLUTION)); 466 | if(debug) printf("Synchronize %d\n", planIdx); 467 | } 468 | } 469 | } 470 | 471 | // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel); 472 | 473 | /* 474 | * The mxGPUArray pointers are host-side structures that refer to device 475 | * data. These must be destroyed before leaving the MEX function. 476 | */ 477 | mxGPUDestroyGPUArray(mxFFTData); 478 | // mxGPUDestroyGPUArray(mxConvolution); 479 | // mxGPUDestroyGPUArray(mxFFTKernel); 480 | 481 | // if(mxKernel == NULL) mxGPUDestroyGPUArray(mxKernel); 482 | 483 | for ( gpuIdx = 0; gpuIdx < N_GPU; gpuIdx++) 484 | { 485 | // Set GPU 486 | CUDA_SAFE_CALL(cudaSetDevice(gpuIdx)); 487 | if(debug) printf( "free DATA per GPU %d\n", gpuIdx); 488 | if (gpuIdx > 0) CUDA_SAFE_CALL(cudaFree(d_CFFT_DATA_PER_GPU[gpuIdx])); // entry 0 is owned by the mxGPUArray, not by cudaMalloc 489 | // Set Streams 490 | for (int streamIdx = 0; streamIdx < N_BATCH_PER_GPU; streamIdx++){ 491 | int planIdx = gpuIdx * N_BATCH_PER_GPU + streamIdx; 492 | 493 | cufftDestroy(plan[planIdx].FFTplan_R2C); 494 | cufftDestroy(plan[planIdx].FFTplan_C2R); 495 | 496 | if(debug) printf( "free plans\n"); 497 | 498 | //Free memory 499 | CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_CFFT_KERNEL)); 500 | CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_FFTEProd)); 501 | CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_CONVOLUTION)); 502 | CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_IFFTEProd)); 503 | // d_Kernel 504 | CUDA_SAFE_CALL(cudaFree(plan[planIdx].d_PaddedKernel)); 505 | // h_Kernel 506 | // CUDA_SAFE_CALL(cudaFreeHost(plan[planIdx].h_CONVOLUTION)); 507 | if(debug) printf( "free stream\n"); 508 | CUDA_SAFE_CALL(cudaStreamDestroy(plan[planIdx].stream)); 509 | } 510 | 511 | // cudaDeviceReset causes the driver to clean up all state. While 512 | // not mandatory in normal operation, it is good practice. It is also 513 | // needed to ensure correct operation when the application is being 514 | // profiled.
Calling cudaDeviceReset causes all profile data to be 515 | // flushed before the application exits 516 | cudaDeviceReset(); 517 | } 518 | 519 | // // if(mxKernel == NULL) cudaFree(d_Kernel); 520 | 521 | mxFree(FFT_dims); 522 | } 523 | -------------------------------------------------------------------------------- /src/cudaConvolutionFFT.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | #include <cufft.h> 3 | #include "mex.h" 4 | #include "gpu/mxGPUArray.h" 5 | // #include "common/helper_cuda.h" 6 | #include "cudaConvFFTData.h" 7 | #include "cudaConvFFTData.cuh" 8 | 9 | static bool debug = false; 10 | 11 | enum OUT_INDEX{ 12 | CONVOLUTION_CELL_INDEX 13 | }; 14 | 15 | enum IN_INDEX{ 16 | DATA_INDEX, 17 | MAX_KERNEL_H_INDEX, 18 | MAX_KERNEL_W_INDEX, 19 | KERNLE_CELL_INDEX, 20 | THREAD_SIZE_INDEX, // Optional 21 | GPU_INDEX // Optional 22 | }; 23 | 24 | //////////////////////////////////////////////////////////////////////////////// 25 | // Mex Entry 26 | //////////////////////////////////////////////////////////////////////////////// 27 | void mexFunction(int nlhs, mxArray *plhs[], 28 | int nrhs, mxArray const *prhs[]) 29 | { 30 | char const * const errId = "cudaConvFFTData:InvalidInput"; 31 | 32 | /* Choose a reasonably sized number of threads for the block. */ 33 | int THREAD_PER_BLOCK_H = 16; 34 | int THREAD_PER_BLOCK_W = 8; 35 | int THREAD_PER_BLOCK_D = 8; 36 | int THREAD_PER_BLOCK_2D = 32; 37 | 38 | /* Initialize the MathWorks GPU API. */ 39 | // If already initialized, mxInitGPU does nothing 40 | if (mxInitGPU() != MX_GPU_SUCCESS) 41 | mexErrMsgTxt("mxInitGPU failed"); 42 | 43 | 44 | /* Throw an error if the number of inputs is wrong */ 45 | if ( (nrhs < (KERNLE_CELL_INDEX + 1)) || (nrhs > (GPU_INDEX + 1) )) 46 | mexErrMsgIdAndTxt(errId, "Wrong number of inputs"); 47 | 48 | 49 | /* Set data */ 50 | const mxArray *mxDATA = prhs[DATA_INDEX]; 51 | if (mxIsGPUArray(mxDATA) || 52 | mxGetNumberOfDimensions(mxDATA) != 3 || 53 | mxGetClassID(mxDATA) != mxSINGLE_CLASS) 54 | mexErrMsgTxt("Invalid data input"); 55 | 56 | 57 | /* Kernel dimensions */ 58 | int MAX_KERNEL_H = (int)mxGetScalar(prhs[MAX_KERNEL_H_INDEX]); 59 | int MAX_KERNEL_W = (int)mxGetScalar(prhs[MAX_KERNEL_W_INDEX]); 60 | if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n",MAX_KERNEL_H,MAX_KERNEL_W); 61 | 62 | 63 | /* Kernel Input */ 64 | if (mxGetClassID(prhs[KERNLE_CELL_INDEX]) != mxCELL_CLASS) 65 | mexErrMsgIdAndTxt(errId, "Kernel must be a cell array"); 66 | mwSize nKernel = mxGetNumberOfElements(prhs[KERNLE_CELL_INDEX]); 67 | int N_KERNEL = (int)nKernel; 68 | if(debug) fprintf(stderr,"N Kernel: %d\n", N_KERNEL); 69 | 70 | 71 | /* Thread size */ 72 | if (( nrhs > THREAD_SIZE_INDEX) && mxGetNumberOfElements(prhs[THREAD_SIZE_INDEX]) != 4) 73 | mexErrMsgIdAndTxt(errId, "CUDA thread size must be 4 integers: THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D\nChoose sizes such that the total number of threads per block does not exceed MaxThreadsPerBlock"); 74 | 75 | if ( nrhs > THREAD_SIZE_INDEX ){ 76 | const double* threadSize = (double *)mxGetData(prhs[THREAD_SIZE_INDEX]); 77 | THREAD_PER_BLOCK_H = (int)threadSize[0]; 78 | THREAD_PER_BLOCK_W = (int)threadSize[1]; 79 | THREAD_PER_BLOCK_D = (int)threadSize[2]; 80 | THREAD_PER_BLOCK_2D = (int)threadSize[3]; 81 | if(debug) fprintf(stderr,"Thread size: H=%d, W=%d, D=%d, 2D=%d\n", THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D, THREAD_PER_BLOCK_2D); 82 | } 83 | 84 | int GPU_ID = 0; 85 | if (nrhs > GPU_INDEX ){ 86 | GPU_ID
= (int)mxGetScalar(prhs[GPU_INDEX]); 87 | if(debug) fprintf(stderr,"Using GPU : %d\n", GPU_ID); 88 | cudaSetDevice(GPU_ID); 89 | } 90 | 91 | 92 | /* FFT Data */ 93 | // Data dimensions 94 | const mwSize *DATA_dims = mxGetDimensions(mxDATA); 95 | int DATA_H = DATA_dims[0]; 96 | int DATA_W = DATA_dims[1]; 97 | int FEATURE_DIM = DATA_dims[2]; 98 | 99 | float *h_Data = (float *)mxGetData(mxDATA); 100 | if(debug) fprintf(stderr,"Data size: h=%d, w=%d, f=%d\n",DATA_H,DATA_W,FEATURE_DIM); 101 | 102 | // Width and height of padding 103 | int PADDING_H = MAX_KERNEL_H - 1; 104 | int PADDING_W = MAX_KERNEL_W - 1; 105 | 106 | // Derive FFT size from data and kernel dimensions 107 | // FFT_H = computeFFTsize(DATA_H + PADDING_H); 108 | // FFT_W = computeFFTsize(DATA_W + PADDING_W); 109 | int FFT_H = computeFFTsize16(DATA_H + PADDING_H); 110 | int FFT_W = computeFFTsize16(DATA_W + PADDING_W); 111 | int CFFT_W = FFT_W; 112 | int CFFT_H = FFT_H/2 + 1; 113 | 114 | if(debug) fprintf(stderr,"FFT size: h=%d, w=%d\n",FFT_H,FFT_W); 115 | 116 | int DATA_SIZE = DATA_W * DATA_H * FEATURE_DIM * sizeof(float); 117 | int FFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float); 118 | int CFFT_SIZE = CFFT_W * CFFT_H * FEATURE_DIM * sizeof(float2); 119 | int CONV_SIZE = FFT_W * FFT_H * sizeof(float); 120 | 121 | int BATCH = FEATURE_DIM; 122 | int FFT_Dims[] = { FFT_W, FFT_H }; 123 | int CFFT_Dims[] = { CFFT_W, CFFT_H }; 124 | int idist = FFT_W * FFT_H; 125 | int odist = CFFT_W * CFFT_H; 126 | 127 | cufftHandle FFTplan_R2C, FFTplan_C2R; 128 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C, 129 | 2, // rank 130 | FFT_Dims, 131 | FFT_Dims, 1, idist, // *inembed, istride, idist 132 | CFFT_Dims, 1, odist, // *onembed, ostride, odist 133 | CUFFT_R2C, 134 | BATCH)); // batch 135 | 136 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_C2R, 137 | 2, // rank 138 | FFT_Dims, 139 | CFFT_Dims, 1, odist, // *inembed, istride, idist 140 | FFT_Dims, 1, idist, // *onembed, ostride, odist 141 | CUFFT_C2R, 142 | BATCH)); // batch 143 | 144 | float *d_Data; 145 | float *d_PaddedData; 146 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Data, DATA_SIZE)); 147 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedData, FFT_SIZE)); 148 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice)); 149 | 150 | dim3 threadBlock3D(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D); 151 | dim3 dataBlockGrid3D( iDivUp(FFT_W, threadBlock3D.x), 152 | iDivUp(FFT_H, threadBlock3D.y), 153 | iDivUp(FEATURE_DIM, threadBlock3D.z)); 154 | 155 | padData<<<dataBlockGrid3D, threadBlock3D>>>( 156 | d_PaddedData, 157 | d_Data, 158 | FFT_W, 159 | FFT_H, 160 | DATA_W, 161 | DATA_H, 162 | FEATURE_DIM 163 | ); 164 | 165 | cufftComplex *d_CFFT_DATA; 166 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_DATA, CFFT_SIZE)); 167 | CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedData, d_CFFT_DATA)); 168 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize()); 169 | cudaFree(d_Data); 170 | 171 | 172 | 173 | 174 | 175 | 176 | /* Convolution FFT */ 177 | // Set Variables 178 | float *d_IFFTEProd; 179 | float *d_CONVOLUTION; 180 | cufftComplex *d_CFFT_KERNEL; 181 | cufftComplex *d_FFTEProd; 182 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_IFFTEProd, FFT_SIZE)); 183 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CONVOLUTION, CONV_SIZE)); 184 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_CFFT_KERNEL, CFFT_SIZE)); 185 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_FFTEProd, CFFT_SIZE)); 186 | 187 | const mxArray *mxCurrentCell; 188 | const mxGPUArray *mxKernel; 189 | const mwSize
*mxKernel_Dim; 190 | float *h_Kernel; 191 | float *d_Kernel; 192 | int KERNEL_H, KERNEL_W, KERNEL_SIZE; 193 | 194 | dim3 threadBlock2D( THREAD_PER_BLOCK_2D, THREAD_PER_BLOCK_2D); 195 | dim3 dataBlockGrid2D( iDivUp(FFT_W, threadBlock2D.x), 196 | iDivUp(FFT_H, threadBlock2D.y)); 197 | 198 | mwSize mwCONV_Dims[2]; 199 | mwCONV_Dims[0] = FFT_H; 200 | mwCONV_Dims[1] = FFT_W; 201 | 202 | plhs[CONVOLUTION_CELL_INDEX] = mxCreateCellMatrix(1, N_KERNEL); 203 | 204 | for (int kernelIdx = 0; kernelIdx < N_KERNEL; kernelIdx++){ 205 | 206 | // Get Kernel Data 207 | mxCurrentCell = mxGetCell(prhs[KERNLE_CELL_INDEX], kernelIdx); 208 | if (!mxIsGPUArray(mxCurrentCell)){ 209 | 210 | if( mxGetClassID(mxCurrentCell) != mxSINGLE_CLASS || mxGetNumberOfDimensions(mxCurrentCell) != 3 ) 211 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1"); 212 | 213 | h_Kernel = (float *)mxGetData(mxCurrentCell); 214 | mxKernel_Dim = mxGetDimensions(mxCurrentCell); 215 | 216 | // Kernel dimensions 217 | KERNEL_H = mxKernel_Dim[0]; 218 | KERNEL_W = mxKernel_Dim[1]; 219 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float); 220 | 221 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Kernel, KERNEL_SIZE)); 222 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Kernel, h_Kernel, KERNEL_SIZE, cudaMemcpyHostToDevice)); 223 | mxKernel = NULL; 224 | }else{ // Kernel is GPU Array 225 | mxKernel = mxGPUCreateFromMxArray(mxCurrentCell); 226 | 227 | if ( mxGPUGetClassID(mxKernel) != mxSINGLE_CLASS || mxGPUGetNumberOfDimensions(mxKernel) != 3 ) 228 | mexErrMsgIdAndTxt(errId, "Kernels must be of type float and have features larger than 1"); 229 | 230 | mxKernel_Dim = mxGPUGetDimensions(mxKernel); 231 | 232 | // Kernel dimensions 233 | KERNEL_H = mxKernel_Dim[0]; 234 | KERNEL_W = mxKernel_Dim[1]; 235 | KERNEL_SIZE = KERNEL_W * KERNEL_H * FEATURE_DIM * sizeof(float); 236 | 237 | d_Kernel = (float *)mxGPUGetDataReadOnly(mxKernel); 238 | } 239 | 240 | if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n", KERNEL_H, KERNEL_W); 241 | 242 | if (FEATURE_DIM != mxKernel_Dim[2] || KERNEL_W > FFT_W || KERNEL_H > FFT_H ) 243 | mexErrMsgIdAndTxt(errId, "Kernel and Data must have the same number of features and kernel size should be smaller than data size"); 244 | 245 | padData<<<dataBlockGrid3D, threadBlock3D>>>( 246 | d_PaddedData, 247 | d_Kernel, 248 | FFT_W, 249 | FFT_H, 250 | KERNEL_W, 251 | KERNEL_H, 252 | FEATURE_DIM 253 | ); 254 | 255 | CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedData, d_CFFT_KERNEL)); 256 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize()); 257 | 258 | if(debug) fprintf(stderr,"FFT done\n"); 259 | 260 | 261 | /* Hadamard product, element-wise multiplication in the frequency domain */ 262 | /* If the following is executed, a second compile of this file creates a MATLAB error */ 263 | elementwiseProductAndNormalize<<<dataBlockGrid2D, threadBlock2D>>>( 264 | d_FFTEProd, // out 265 | d_CFFT_DATA, // in data 266 | d_CFFT_KERNEL, // in kernel 267 | CFFT_H, 268 | CFFT_W, 269 | FEATURE_DIM, 270 | 1.0f / (FFT_W * FFT_H) 271 | ); 272 | 273 | CUFFT_SAFE_CALL(cufftExecC2R(FFTplan_C2R, d_FFTEProd, d_IFFTEProd)); 274 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize()); 275 | 276 | sumAlongFeatures<<<dataBlockGrid2D, threadBlock2D>>>( 277 | d_CONVOLUTION, 278 | d_IFFTEProd, 279 | FFT_H, 280 | FFT_W, 281 | FEATURE_DIM 282 | ); 283 | 284 | mxArray * convolutionResult = mxCreateNumericArray(2, mwCONV_Dims, mxSINGLE_CLASS, mxREAL); 285 | float * h_CONVOLUTION = (float *)mxGetData(convolutionResult); 286 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(h_CONVOLUTION, d_CONVOLUTION, CONV_SIZE ,cudaMemcpyDeviceToHost)); 287 |
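The 1.0f / (FFT_W * FFT_H) argument passed to elementwiseProductAndNormalize above compensates for cuFFT transforms being unnormalized: running R2C followed by C2R scales every sample by FFT_W * FFT_H, so the product is divided by that factor while still in the frequency domain. A minimal CPU sketch of the per-element operation, assuming the kernel performs an ordinary complex multiply followed by the scale (the actual device kernel is defined in the repository's kernel sources and is not shown in this excerpt):

#include <stdio.h>

typedef struct { float x, y; } complexf;   /* same layout as cufftComplex */

/* out = (a * b) * scale, with scale = 1 / (FFT_W * FFT_H) */
static complexf mulAndScale(complexf a, complexf b, float scale)
{
    complexf out;
    out.x = (a.x * b.x - a.y * b.y) * scale;
    out.y = (a.x * b.y + a.y * b.x) * scale;
    return out;
}

int main(void)
{
    complexf a = { 2.0f, 1.0f };
    complexf b = { 0.5f, -3.0f };
    complexf c = mulAndScale(a, b, 1.0f / (8 * 8));   /* e.g. an 8 x 8 FFT */
    printf("%f %f\n", c.x, c.y);                      /* (4 - 5.5i) / 64 */
    return 0;
}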
288 | mxSetCell(plhs[CONVOLUTION_CELL_INDEX], kernelIdx, convolutionResult); 289 | if(mxKernel == NULL) cudaFree(d_Kernel); 290 | else mxGPUDestroyGPUArray(mxKernel); 291 | } 292 | // plhs[1] = mxGPUCreateMxArrayOnGPU(mxFFTKernel); 293 | 294 | /* 295 | * The mxGPUArray pointers are host-side structures that refer to device 296 | * data. These must be destroyed before leaving the MEX function. 297 | */ 298 | // mxGPUDestroyGPUArray(mxFFTData); 299 | // mxGPUDestroyGPUArray(mxConvolution); 300 | // mxGPUDestroyGPUArray(mxFFTKernel); 301 | 302 | cufftDestroy(FFTplan_R2C); 303 | cufftDestroy(FFTplan_C2R); 304 | 305 | cudaFree(d_CFFT_DATA); 306 | cudaFree(d_IFFTEProd); 307 | cudaFree(d_CONVOLUTION); 308 | cudaFree(d_CFFT_KERNEL); 309 | cudaFree(d_FFTEProd); 310 | cudaFree(d_PaddedData); 311 | } 312 | -------------------------------------------------------------------------------- /src/cudaFFTData.cu: -------------------------------------------------------------------------------- 1 | #include <cuda_runtime.h> 2 | #include <cufft.h> 3 | #include "mex.h" 4 | #include "gpu/mxGPUArray.h" 5 | #include "cudaConvFFTData.h" 6 | #include "cudaConvFFTData.cuh" 7 | 8 | static bool debug = false; 9 | 10 | enum IN_INDEX{ 11 | DATA_INDEX, 12 | KERNEL_H_INDEX, 13 | KERNEL_W_INDEX 14 | }; 15 | //////////////////////////////////////////////////////////////////////////////// 16 | // Mex Entry 17 | //////////////////////////////////////////////////////////////////////////////// 18 | void mexFunction(int nlhs, mxArray *plhs[], 19 | int nrhs, mxArray const *prhs[]) 20 | { 21 | /* Declare all variables.*/ 22 | const mxArray *mxDATA = prhs[DATA_INDEX]; 23 | mxGPUArray *FFT_DATA; 24 | float2 *d_CFFT_DATA; 25 | float *h_Data; 26 | float *d_Data; 27 | float *d_PaddedData; 28 | char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput"; 29 | char const * const errMsg = "Invalid input to MEX file."; 30 | 31 | /* Choose a reasonably sized number of threads for the block. */ 32 | int const THREAD_PER_BLOCK_H = 16; 33 | int const THREAD_PER_BLOCK_W = 8; 34 | int const THREAD_PER_BLOCK_D = 8; 35 | 36 | // int MblocksPerGrid, NblocksPerGrid; 37 | int KERNEL_H, KERNEL_W, DATA_H, DATA_W, 38 | PADDING_H, PADDING_W, FFT_H, FFT_W, FEATURE_DIM, 39 | DATA_SIZE, FFT_SIZE, CFFT_SIZE; 40 | 41 | 42 | /* Initialize the MathWorks GPU API. */ 43 | // If already initialized, mxInitGPU does nothing 44 | if (mxInitGPU() != MX_GPU_SUCCESS) 45 | mexErrMsgTxt("mxInitGPU failed"); 46 | 47 | 48 | /* Throw an error if the input is not a host-side single-precision 3-D array.
*/ 49 | if ((nrhs!=3) || 50 | mxIsGPUArray(mxDATA) || 51 | mxGetNumberOfDimensions(mxDATA) != 3 || 52 | mxGetClassID(mxDATA) != mxSINGLE_CLASS) { 53 | mexErrMsgIdAndTxt(errId, errMsg); 54 | } 55 | 56 | 57 | // Kernel dimensions 58 | KERNEL_H = (int)mxGetScalar(prhs[KERNEL_H_INDEX]); 59 | KERNEL_W = (int)mxGetScalar(prhs[KERNEL_W_INDEX]); 60 | if(debug) fprintf(stderr,"Kernel size: h=%d, w=%d\n",KERNEL_H,KERNEL_W); 61 | 62 | // Data dimensions 63 | const mwSize *DATA_dims = mxGetDimensions(mxDATA); 64 | DATA_H = DATA_dims[0]; 65 | DATA_W = DATA_dims[1]; 66 | FEATURE_DIM = DATA_dims[2]; 67 | 68 | h_Data = (float *)mxGetData(mxDATA); 69 | if(debug) fprintf(stderr,"Data size: h=%d, w=%d, f=%d\n",DATA_H,DATA_W,FEATURE_DIM); 70 | 71 | // Width and height of padding 72 | PADDING_H = KERNEL_H - 1; 73 | PADDING_W = KERNEL_W - 1; 74 | 75 | // Derive FFT size from data and kernel dimensions 76 | // FFT_H = computeFFTsize(DATA_H + PADDING_H); 77 | // FFT_W = computeFFTsize(DATA_W + PADDING_W); 78 | FFT_H = computeFFTsize16(DATA_H + PADDING_H); 79 | FFT_W = computeFFTsize16(DATA_W + PADDING_W); 80 | 81 | if(debug) fprintf(stderr,"FFT size: h=%d, w=%d\n",FFT_H,FFT_W); 82 | 83 | DATA_SIZE = DATA_W * DATA_H * FEATURE_DIM * sizeof(float); 84 | FFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float); 85 | // CFFT_SIZE = FFT_W * FFT_H * FEATURE_DIM * sizeof(float2); 86 | 87 | // Allocate memory for input 88 | // No need to initialize using mxCalloc 89 | 90 | mwSize CFFT_dims[3]; 91 | 92 | CFFT_dims[0] = FFT_H/2 + 1; 93 | CFFT_dims[1] = FFT_W; 94 | CFFT_dims[2] = FEATURE_DIM; 95 | 96 | /* Wrap the result up as a MATLAB gpuArray for return. */ 97 | FFT_DATA = mxGPUCreateGPUArray(3, 98 | CFFT_dims, 99 | mxSINGLE_CLASS, 100 | mxCOMPLEX, 101 | MX_GPU_INITIALIZE_VALUES); 102 | 103 | d_CFFT_DATA = (float2 *)mxGPUGetData(FFT_DATA); 104 | 105 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_Data, DATA_SIZE)); 106 | CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void **)&d_PaddedData, FFT_SIZE)); 107 | 108 | CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(d_Data, h_Data, DATA_SIZE, cudaMemcpyHostToDevice)); 109 | 110 | dim3 threadBlock(THREAD_PER_BLOCK_H, THREAD_PER_BLOCK_W, THREAD_PER_BLOCK_D); 111 | dim3 dataBlockGrid( iDivUp(FFT_W, threadBlock.x), 112 | iDivUp(FFT_H, threadBlock.y), 113 | iDivUp(FEATURE_DIM, threadBlock.z)); 114 | 115 | padData<<<dataBlockGrid, threadBlock>>>( 116 | d_PaddedData, 117 | d_Data, 118 | FFT_W, 119 | FFT_H, 120 | DATA_W, 121 | DATA_H, 122 | FEATURE_DIM 123 | ); 124 | 125 | if(debug) fprintf(stderr,"Padding\n"); 126 | 127 | int BATCH = FEATURE_DIM; 128 | int FFT_Dims[] = { FFT_W, FFT_H }; 129 | 130 | int idist = FFT_W * FFT_H; 131 | int odist = FFT_W * (FFT_H/2 + 1); 132 | 133 | int inembed[] = {FFT_W, FFT_H}; 134 | int onembed[] = {FFT_W, FFT_H/2 + 1}; 135 | 136 | cufftHandle FFTplan_R2C; 137 | CUFFT_SAFE_CALL(cufftPlanMany(&FFTplan_R2C, 138 | 2, // rank 139 | FFT_Dims, 140 | inembed, 1, idist, // *inembed, istride, idist 141 | onembed, 1, odist, // *onembed, ostride, odist 142 | CUFFT_R2C, 143 | BATCH)); // batch 144 | 145 | 146 | CUFFT_SAFE_CALL(cufftExecR2C(FFTplan_R2C, d_PaddedData, d_CFFT_DATA)); 147 | CUDA_SAFE_CALL_NO_SYNC(cudaDeviceSynchronize()); 148 | if(debug) fprintf(stderr,"Sync\n"); 149 | 150 | plhs[0] = mxGPUCreateMxArrayOnGPU(FFT_DATA); 151 | if(debug) fprintf(stderr,"plhs\n"); 152 | /* 153 | * The mxGPUArray pointers are host-side structures that refer to device 154 | * data. These must be destroyed before leaving the MEX function.
155 | */ 156 | mxGPUDestroyGPUArray(FFT_DATA); 157 | cufftDestroy(FFTplan_R2C); 158 | cudaFree(d_Data); 159 | cudaFree(d_PaddedData); 160 | } 161 | --------------------------------------------------------------------------------
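One more note on the sizing logic used by cudaConvolutionFFT.cu and cudaFFTData.cu above: the input is padded by KERNEL - 1 in each dimension before the forward FFT so that the circular convolution implied by the DFT covers a full linear convolution, and computeFFTsize16 then rounds the padded extent up to a cuFFT-friendly size. A hedged sketch of that arithmetic, assuming computeFFTsize16 rounds up to the next multiple of 16 (its actual definition is elsewhere in the repository and may differ):

#include <stdio.h>

/* Assumed behaviour of computeFFTsize16: round n up to a multiple of 16. */
static int roundUpTo16(int n)
{
    return (n % 16 != 0) ? (n - n % 16 + 16) : n;
}

int main(void)
{
    int DATA_H = 100, DATA_W = 120;    /* example feature map size */
    int KERNEL_H = 7, KERNEL_W = 5;    /* example filter size      */

    /* Linear convolution of lengths N and K needs at least N + K - 1 samples. */
    int FFT_H = roundUpTo16(DATA_H + (KERNEL_H - 1));   /* 106 -> 112 */
    int FFT_W = roundUpTo16(DATA_W + (KERNEL_W - 1));   /* 124 -> 128 */

    printf("FFT size: %d x %d\n", FFT_H, FFT_W);
    return 0;
}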