├── LICENSE
├── Makefile
├── README.md
└── cuda_intercept.cpp

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Christos Konstantinos Matzoros
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Edit the CUDA_PATH variable for your system
2 | CUDA_PATH?=/usr/local/cuda-9.0
3 | 
4 | 
5 | # Set compilation flags
6 | CXX=g++
7 | CXXFLAGS=-Wall -fPIC -shared
8 | LDLIBS=-ldl
9 | 
10 | all: lib_cuda_intercept.so
11 | 
12 | lib_cuda_intercept.so: cuda_intercept.cpp
13 | 	$(CXX) -I$(CUDA_PATH)/include $(CXXFLAGS) -o lib_cuda_intercept.so cuda_intercept.cpp $(LDLIBS)
14 | 
15 | clean:
16 | 	-rm -f lib_cuda_intercept.so
17 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CUDA-Runtime-API-calls-interception
2 | Shared library for intercepting CUDA Runtime API calls. This code was part of my Bachelor's thesis, "A Study on the Computational Exploitation of Remote Virtualized Graphics Cards" (https://bit.ly/37tIG0D).
3 | 
4 | 
5 | Prerequisites:
6 | - GNU/Linux for compilation
7 | - Set the CUDA_PATH variable in the Makefile to the directory
8 |   where CUDA is installed.
9 | 
10 | Tested on:
11 | gcc-6, gcc-7
12 | 
13 | 
14 | How to compile:
15 | $ make
16 | 
17 | To remove:
18 | $ make clean
19 | 
20 | 
21 | How to run (pass the compiled CUDA executable, not the .cu source):
22 | $ LD_PRELOAD=/full_path_to_the_cuda_intercept_directory/cuda_intercept/lib_cuda_intercept.so /full_path_to_the_directory_of_the_CUDA_Program/your_cuda_program
23 | 
24 | e.g.
25 | LD_PRELOAD=/home/cuda_intercept/lib_cuda_intercept.so /home/NVIDIA_CUDA-9.0_Samples/6_Advanced/transpose/transpose
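26 | 
27 | How it works:
28 | Every wrapper in cuda_intercept.cpp follows the same pattern: it prints a trace
29 | line, resolves the real CUDA Runtime symbol once with dlsym(RTLD_NEXT, ...), and
30 | then forwards the call. A condensed, illustrative sketch of that pattern (taken
31 | from the cudaMalloc wrapper in cuda_intercept.cpp; the assert on the resolved
32 | pointer is omitted here for brevity):
33 | 
34 |     typedef cudaError_t (*cudaMalloc_t)(void ** devPtr, size_t size);
35 |     static cudaMalloc_t native_cudaMalloc = NULL;
36 | 
37 |     extern "C" cudaError_t cudaMalloc(void ** devPtr, size_t size) {
38 |         printf("\n>>cudaMalloc interception\n");   // trace the intercepted call
39 |         if (native_cudaMalloc == NULL)             // resolve the real symbol once
40 |             native_cudaMalloc = (cudaMalloc_t)dlsym(RTLD_NEXT,"cudaMalloc");
41 |         return native_cudaMalloc(devPtr,size);     // forward to the real runtime
42 |     }
43 | 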
--------------------------------------------------------------------------------
/cuda_intercept.cpp:
--------------------------------------------------------------------------------
1 | /*********************
2 | 
3 | MIT License
4 | 
5 | Copyright (c) 2020 Christos Konstantinos Matzoros
6 | 
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 
25 | ***********************/
26 | 
27 | 
28 | //Headers
29 | #include <stdio.h>
30 | #include <stdlib.h>
31 | #include <string.h>
32 | #include <assert.h>
33 | #include <list>
34 | #include <dlfcn.h> //for dynamic linking
35 | #include <cuda.h>
36 | #include <cuda_runtime.h>
37 | using namespace std;
38 | 
39 | 
40 | //Per-launch information captured by cudaConfigureCall (grid/block dims) and cudaSetupArgument (arguments)
41 | typedef struct {
42 | dim3 gridDim;
43 | dim3 blockDim;
44 | list<void *> arguments;
45 | int counter;
46 | } kernel_info_t;
47 | 
48 | static list<kernel_info_t> kernels_list;
49 | 
50 | kernel_info_t &kernelInfo() {
51 | static kernel_info_t kernelInfo;
52 | return kernelInfo;
53 | }
54 | 
55 | 
56 | 
57 | /////////////////////////
58 | // PRINT FUNCTIONS //
59 | /////////////////////////
60 | 
61 | void print_grid_dimensions(dim3 gridDim){
62 | if (gridDim.y == 1 && gridDim.z == 1) { //1D grid (x)
63 | printf("gridDim=%d ", gridDim.x);
64 | } else if (gridDim.z == 1) { //2D grid (x,y)
65 | printf("gridDim=[%d,%d] ", gridDim.x, gridDim.y);
66 | } else { //3D grid (x,y,z)
67 | printf("gridDim=[%d,%d,%d] ", gridDim.x, gridDim.y, gridDim.z);
68 | }
69 | }
70 | 
71 | void print_block_dimensions(dim3 blockDim){
72 | if (blockDim.y == 1 && blockDim.z == 1) { //1D block (x)
73 | printf("blockDim=%d ", blockDim.x);
74 | } else if (blockDim.z == 1) { //2D block (x,y)
75 | printf("blockDim=[%d,%d] ", blockDim.x, blockDim.y);
76 | } else { //3D block (x,y,z)
77 | printf("blockDim=[%d,%d,%d] ", blockDim.x, blockDim.y, blockDim.z);
78 | }
79 | }
80 | 
81 | void print_dimensions(dim3 gridDim, dim3 blockDim){
82 | print_grid_dimensions(gridDim);
83 | print_block_dimensions(blockDim);
84 | }
85 | 
86 | void print_args(list<void *> arg){
87 | for (std::list<void *>::iterator it = arg.begin(), end = arg.end(); it != end; ++it) {
88 | unsigned i = std::distance(arg.begin(), it);
89 | printf("%u:%d \n", i, *(static_cast<int *>(*it)));
90 | }
91 | }
92 | 
93 | void print_kernel_invocation(const char *entry) {
94 | printf("New kernel invocation\n");
95 | print_dimensions(kernelInfo().gridDim,kernelInfo().blockDim);
96 | //print_args(kernelInfo().arguments);
97 | printf("\n");
98 | }
99 | 
100 | 
101 | 
102 | ////////////////////////////
103 
| // CALLS INTERCEPTION // 104 | //////////////////////////// 105 | 106 | //*******************************************// 107 | // CUDA Runtime API Error Handling // 108 | //*******************************************// 109 | /// cudaGetErrorName /// 110 | typedef const char* (*cudaGetErrorName_t)(cudaError_t error); 111 | static cudaGetErrorName_t native_cudaGetErrorName = NULL; 112 | 113 | extern "C" const char* cudaGetErrorName(cudaError_t error) { 114 | printf("\n>> cudaGetErrorName interception\n"); 115 | 116 | if (native_cudaGetErrorName == NULL) { 117 | native_cudaGetErrorName = (cudaGetErrorName_t)dlsym(RTLD_NEXT,"cudaGetErrorName"); 118 | } 119 | assert(native_cudaGetErrorName != NULL); 120 | return native_cudaGetErrorName(error); 121 | } 122 | 123 | /// cudaGetErrorString /// 124 | typedef const char* (*cudaGetErrorString_t)(cudaError_t error); 125 | static cudaGetErrorString_t native_cudaGetErrorString = NULL; 126 | 127 | extern "C" const char* cudaGetErrorString(cudaError_t error) { 128 | printf("\n>> cudaGetErrorString interception\n"); 129 | 130 | if (native_cudaGetErrorString == NULL) { 131 | native_cudaGetErrorString = (cudaGetErrorString_t)dlsym(RTLD_NEXT,"cudaGetErrorString"); 132 | } 133 | assert(native_cudaGetErrorString != NULL); 134 | return native_cudaGetErrorString(error); 135 | } 136 | 137 | /// cudaGetLastError /// 138 | typedef cudaError_t (*cudaGetLastError_t)(void); 139 | static cudaGetLastError_t native_cudaGetLastError = NULL; 140 | 141 | extern "C" cudaError_t cudaGetLastError(void) { 142 | printf("\n>> cudaGetLastError interception\n"); 143 | 144 | if (native_cudaGetLastError == NULL) { 145 | native_cudaGetLastError = (cudaGetLastError_t)dlsym(RTLD_NEXT,"cudaGetLastError"); 146 | } 147 | assert(native_cudaGetLastError != NULL); 148 | return native_cudaGetLastError(); 149 | } 150 | 151 | /// cudaGetLastError /// 152 | typedef cudaError_t (*cudaPeekAtLastError_t)(void); 153 | static cudaPeekAtLastError_t native_cudaPeekAtLastError = NULL; 154 | 155 | extern "C" cudaError_t cudaPeekAtLastError(void) { 156 | printf("\n>> cudaPeekAtLastError interception\n"); 157 | 158 | if (native_cudaPeekAtLastError== NULL) { 159 | native_cudaPeekAtLastError = (cudaPeekAtLastError_t)dlsym(RTLD_NEXT,"cudaPeekAtLastError"); 160 | } 161 | assert(native_cudaPeekAtLastError != NULL); 162 | return native_cudaPeekAtLastError(); 163 | } 164 | 165 | 166 | //**********************************************// 167 | // CUDA Runtime API Device Management // 168 | //**********************************************// 169 | /// cudaChooseDevice /// 170 | typedef cudaError_t (*cudaChooseDevice_t)(int * device, const struct cudaDeviceProp * prop); 171 | static cudaChooseDevice_t native_cudaChooseDevice = NULL; 172 | 173 | extern "C" cudaError_t cudaChooseDevice(int * device, const struct cudaDeviceProp * prop) { 174 | printf("\n>>cudaChooseDevice interception \n"); 175 | 176 | if (native_cudaChooseDevice == NULL) { 177 | native_cudaChooseDevice = (cudaChooseDevice_t)dlsym(RTLD_NEXT,"cudaChooseDevice"); 178 | } 179 | assert(native_cudaChooseDevice != NULL); 180 | return native_cudaChooseDevice(device,prop); 181 | } 182 | 183 | /// cudaDeviceGetAttribute /// 184 | typedef cudaError_t (*cudaDeviceGetAttribute_t)(int* value, cudaDeviceAttr attr, int device); 185 | static cudaDeviceGetAttribute_t native_cudaDeviceGetAttribute = NULL; 186 | 187 | extern "C" cudaError_t cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int device) { 188 | printf("\n>>cudaDeviceGetAttribute interception \n"); 189 | 
190 | if (native_cudaDeviceGetAttribute == NULL) {
191 | native_cudaDeviceGetAttribute = (cudaDeviceGetAttribute_t)dlsym(RTLD_NEXT,"cudaDeviceGetAttribute");
192 | }
193 | assert(native_cudaDeviceGetAttribute != NULL);
194 | return native_cudaDeviceGetAttribute(value,attr,device);
195 | }
196 | 
197 | /// cudaDeviceGetByPCIBusId ///
198 | typedef cudaError_t (*cudaDeviceGetByPCIBusId_t)(int* device, const char* pciBusId);
199 | static cudaDeviceGetByPCIBusId_t native_cudaDeviceGetByPCIBusId = NULL;
200 | 
201 | extern "C" cudaError_t cudaDeviceGetByPCIBusId (int* device, const char* pciBusId) {
202 | printf("\n>>cudaDeviceGetByPCIBusId interception\n");
203 | 
204 | if (native_cudaDeviceGetByPCIBusId == NULL) {
205 | native_cudaDeviceGetByPCIBusId = (cudaDeviceGetByPCIBusId_t)dlsym(RTLD_NEXT,"cudaDeviceGetByPCIBusId");
206 | }
207 | assert(native_cudaDeviceGetByPCIBusId != NULL);
208 | return native_cudaDeviceGetByPCIBusId(device,pciBusId);
209 | }
210 | 
211 | /// cudaDeviceGetCacheConfig ///
212 | typedef cudaError_t (*cudaDeviceGetCacheConfig_t)(cudaFuncCache * pCacheConfig);
213 | static cudaDeviceGetCacheConfig_t native_cudaDeviceGetCacheConfig = NULL;
214 | 
215 | extern "C" cudaError_t cudaDeviceGetCacheConfig (cudaFuncCache * pCacheConfig) {
216 | printf("\n>>cudaDeviceGetCacheConfig interception\n");
217 | 
218 | if (native_cudaDeviceGetCacheConfig == NULL) {
219 | native_cudaDeviceGetCacheConfig = (cudaDeviceGetCacheConfig_t)dlsym(RTLD_NEXT,"cudaDeviceGetCacheConfig");
220 | }
221 | assert(native_cudaDeviceGetCacheConfig != NULL);
222 | return native_cudaDeviceGetCacheConfig(pCacheConfig);
223 | }
224 | 
225 | /// cudaDeviceGetLimit ///
226 | typedef cudaError_t (*cudaDeviceGetLimit_t)(size_t* pValue, cudaLimit limit);
227 | static cudaDeviceGetLimit_t native_cudaDeviceGetLimit = NULL;
228 | 
229 | extern "C" cudaError_t cudaDeviceGetLimit (size_t* pValue, cudaLimit limit) {
230 | printf("\n>>cudaDeviceGetLimit interception\n");
231 | 
232 | if (native_cudaDeviceGetLimit == NULL) {
233 | native_cudaDeviceGetLimit = (cudaDeviceGetLimit_t)dlsym(RTLD_NEXT,"cudaDeviceGetLimit");
234 | }
235 | assert(native_cudaDeviceGetLimit != NULL);
236 | return native_cudaDeviceGetLimit(pValue,limit);
237 | }
238 | 
239 | /// cudaDeviceGetNvSciSyncAttributes ///
240 | typedef cudaError_t (*cudaDeviceGetNvSciSyncAttributes_t)( void* nvSciSyncAttrList, int device, int flags);
241 | static cudaDeviceGetNvSciSyncAttributes_t native_cudaDeviceGetNvSciSyncAttributes = NULL;
242 | 
243 | extern "C" cudaError_t cudaDeviceGetNvSciSyncAttributes ( void* nvSciSyncAttrList, int device, int flags) {
244 | printf("\n>>cudaDeviceGetNvSciSyncAttributes interception\n");
245 | 
246 | if (native_cudaDeviceGetNvSciSyncAttributes == NULL) {
247 | native_cudaDeviceGetNvSciSyncAttributes = (cudaDeviceGetNvSciSyncAttributes_t)dlsym(RTLD_NEXT,"cudaDeviceGetNvSciSyncAttributes");
248 | }
249 | assert(native_cudaDeviceGetNvSciSyncAttributes != NULL);
250 | return native_cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList,device,flags);
251 | }
252 | 
253 | /// cudaDeviceGetP2PAttribute ///
254 | typedef cudaError_t (*cudaDeviceGetP2PAttribute_t)(int* value, cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
255 | static cudaDeviceGetP2PAttribute_t native_cudaDeviceGetP2PAttribute = NULL;
256 | 
257 | extern "C" cudaError_t cudaDeviceGetP2PAttribute (int* value, cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) {
258 | printf("\n>>cudaDeviceGetP2PAttribute interception\n");
259 | 
260 | if (native_cudaDeviceGetP2PAttribute == 
NULL) { 261 | native_cudaDeviceGetP2PAttribute = (cudaDeviceGetP2PAttribute_t)dlsym(RTLD_NEXT,"cudaDeviceGetP2PAttribute"); 262 | } 263 | assert(native_cudaDeviceGetP2PAttribute != NULL); 264 | return native_cudaDeviceGetP2PAttribute(value,attr,srcDevice,dstDevice); 265 | } 266 | 267 | /// cudaDeviceGetPCIBusId /// 268 | typedef cudaError_t (*cudaDeviceGetPCIBusId_t)(char* pciBusId, int len, int device); 269 | static cudaDeviceGetPCIBusId_t native_cudaDeviceGetPCIBusId = NULL; 270 | 271 | extern "C" cudaError_t cudaDeviceGetPCIBusId (char* pciBusId, int len, int device) { 272 | printf("\n>>cudaDeviceGetPCIBusId interception\n"); 273 | 274 | if (native_cudaDeviceGetPCIBusId == NULL) { 275 | native_cudaDeviceGetPCIBusId = (cudaDeviceGetPCIBusId_t)dlsym(RTLD_NEXT,"cudaDeviceGetPCIBusId"); 276 | } 277 | assert(native_cudaDeviceGetPCIBusId != NULL); 278 | return native_cudaDeviceGetPCIBusId(pciBusId,len,device); 279 | } 280 | 281 | /// cudaDeviceGetSharedMemConfig /// 282 | typedef cudaError_t (*cudaDeviceGetSharedMemConfig_t)( cudaSharedMemConfig ** pConfig ); 283 | static cudaDeviceGetSharedMemConfig_t native_cudaDeviceGetSharedMemConfig = NULL; 284 | 285 | extern "C" cudaError_t cudaDeviceGetSharedMemConfig (cudaSharedMemConfig ** pConfig ) { 286 | printf("\n>>cudaDeviceGetSharedMemConfig interception\n"); 287 | 288 | if (native_cudaDeviceGetSharedMemConfig == NULL) { 289 | native_cudaDeviceGetSharedMemConfig = (cudaDeviceGetSharedMemConfig_t)dlsym(RTLD_NEXT,"cudaDeviceGetSharedMemConfig"); 290 | } 291 | assert(native_cudaDeviceGetSharedMemConfig != NULL); 292 | return native_cudaDeviceGetSharedMemConfig(pConfig); 293 | } 294 | 295 | /// cudaDeviceGetStreamPriorityRange /// 296 | typedef cudaError_t (*cudaDeviceGetStreamPriorityRange_t)( int* leastPriority, int* greatestPriority); 297 | static cudaDeviceGetStreamPriorityRange_t native_cudaDeviceGetStreamPriorityRange = NULL; 298 | 299 | extern "C" cudaError_t cudaDeviceGetStreamPriorityRange ( int* leastPriority, int* greatestPriority) { 300 | printf("\n>>cudaDeviceGetStreamPriorityRange interception\n"); 301 | 302 | if (native_cudaDeviceGetStreamPriorityRange == NULL) { 303 | native_cudaDeviceGetStreamPriorityRange = (cudaDeviceGetStreamPriorityRange_t)dlsym(RTLD_NEXT,"cudaDeviceGetStreamPriorityRange"); 304 | } 305 | assert(native_cudaDeviceGetStreamPriorityRange != NULL); 306 | return native_cudaDeviceGetStreamPriorityRange(leastPriority,greatestPriority); 307 | } 308 | 309 | /// cudaMalloc3D /// 310 | typedef cudaError_t (*cudaDeviceSetCacheConfig_t)(cudaFuncCache cacheConfig); 311 | static cudaDeviceSetCacheConfig_t native_cudaDeviceSetCacheConfig = NULL; 312 | 313 | extern "C" cudaError_t cudaDeviceSetCacheConfig (cudaFuncCache cacheConfig) { 314 | printf("\n>>cudaDeviceSetCacheConfig interception\n"); 315 | 316 | if (native_cudaDeviceSetCacheConfig == NULL) { 317 | native_cudaDeviceSetCacheConfig = (cudaDeviceSetCacheConfig_t)dlsym(RTLD_NEXT,"cudaDeviceSetCacheConfig"); 318 | } 319 | assert(native_cudaDeviceSetCacheConfig != NULL); 320 | return native_cudaDeviceSetCacheConfig(cacheConfig); 321 | } 322 | 323 | /// cudaDeviceSetLimit /// 324 | typedef cudaError_t (*cudaDeviceSetLimit_t)(cudaLimit limit, size_t value); 325 | static cudaDeviceSetLimit_t native_cudaDeviceSetLimit = NULL; 326 | 327 | extern "C" cudaError_t cudaDeviceSetLimit (cudaLimit limit, size_t value) { 328 | printf("\n>>cudaDeviceSetLimit interception\n"); 329 | 330 | if (native_cudaDeviceSetLimit == NULL) { 331 | native_cudaDeviceSetLimit = 
(cudaDeviceSetLimit_t)dlsym(RTLD_NEXT,"cudaDeviceSetLimit"); 332 | } 333 | assert(native_cudaDeviceSetLimit != NULL); 334 | return native_cudaDeviceSetLimit(limit,value); 335 | } 336 | 337 | /// cudaDeviceSetSharedMemConfig /// 338 | typedef cudaError_t (*cudaDeviceSetSharedMemConfig_t)(cudaSharedMemConfig config); 339 | static cudaDeviceSetSharedMemConfig_t native_cudaDeviceSetSharedMemConfig = NULL; 340 | 341 | extern "C" cudaError_t cudaDeviceSetSharedMemConfig(cudaSharedMemConfig config) { 342 | printf("\n>>cudaDeviceSetSharedMemConfig interception\n"); 343 | 344 | if (native_cudaDeviceSetSharedMemConfig == NULL) { 345 | native_cudaDeviceSetSharedMemConfig = (cudaDeviceSetSharedMemConfig_t)dlsym(RTLD_NEXT,"cudaDeviceSetSharedMemConfig"); 346 | } 347 | assert(native_cudaDeviceSetSharedMemConfig != NULL); 348 | return native_cudaDeviceSetSharedMemConfig(config); 349 | } 350 | 351 | /// cudaDeviceSynchronize /// 352 | typedef cudaError_t (*cudaDeviceSynchronize_t)(void); 353 | static cudaDeviceSynchronize_t native_cudaDeviceSynchronize = NULL; 354 | 355 | extern "C" cudaError_t cudaDeviceSynchronize (void) { 356 | printf("\n>>cudaDeviceSynchronize interception\n"); 357 | 358 | if (native_cudaDeviceSynchronize == NULL) { 359 | native_cudaDeviceSynchronize = (cudaDeviceSynchronize_t)dlsym(RTLD_NEXT,"cudaDeviceSynchronize"); 360 | } 361 | assert(native_cudaDeviceSynchronize != NULL); 362 | return native_cudaDeviceSynchronize(); 363 | } 364 | 365 | /// cudaGetDevice /// 366 | typedef cudaError_t (*cudaGetDevice_t)(int *device); 367 | static cudaGetDevice_t native_cudaGetDevice = NULL; 368 | 369 | extern "C" cudaError_t cudaGetDevice(int *device){ 370 | printf("\n>>cudaGetDevice \n"); 371 | //call of the real function 372 | if (native_cudaGetDevice == NULL) { 373 | native_cudaGetDevice = (cudaGetDevice_t)dlsym(RTLD_NEXT,"cudaGetDevice"); 374 | } 375 | assert(native_cudaGetDevice != NULL); 376 | return native_cudaGetDevice(device); 377 | } 378 | 379 | /// cudaGetDeviceCount /// 380 | typedef cudaError_t (*cudaGetDeviceCount_t)(int * count); 381 | static cudaGetDeviceCount_t native_cudaGetDeviceCount = NULL; 382 | 383 | extern "C" cudaError_t cudaGetDeviceCount(int * count){ 384 | printf("\n>>cudaGetDeviceCount interception \n"); 385 | 386 | if (native_cudaGetDeviceCount == NULL) { 387 | native_cudaGetDeviceCount = (cudaGetDeviceCount_t)dlsym(RTLD_NEXT,"cudaGetDeviceCount"); 388 | } 389 | assert(native_cudaGetDeviceCount != NULL); 390 | return native_cudaGetDeviceCount(count); 391 | } 392 | 393 | /// cudaGetDeviceFlags /// 394 | typedef cudaError_t (*cudaGetDeviceFlags_t)(unsigned int* flags); 395 | static cudaGetDeviceFlags_t native_cudaGetDeviceFlags = NULL; 396 | 397 | extern "C" cudaError_t cudaGetDeviceFlags (unsigned int* flags) { 398 | printf("\n>>cudaGetDeviceFlags interception\n"); 399 | 400 | if (native_cudaGetDeviceFlags == NULL) { 401 | native_cudaGetDeviceFlags = (cudaGetDeviceFlags_t)dlsym(RTLD_NEXT,"cudaGetDeviceFlags"); 402 | } 403 | assert(native_cudaGetDeviceFlags != NULL); 404 | return native_cudaGetDeviceFlags(flags); 405 | } 406 | 407 | /// cudaGetDeviceProperties /// 408 | typedef cudaError_t (*cudaGetDeviceProperties_t)(struct cudaDeviceProp * prop, int device); 409 | static cudaGetDeviceProperties_t native_cudaGetDeviceProperties = NULL; 410 | 411 | extern "C" cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp * prop, int device){ 412 | printf("\n>>cudaGetDeviceProperties interception \n"); 413 | 414 | if (native_cudaGetDeviceProperties == NULL) { 415 | 
native_cudaGetDeviceProperties = (cudaGetDeviceProperties_t)dlsym(RTLD_NEXT,"cudaGetDeviceProperties"); 416 | } 417 | assert(native_cudaGetDeviceProperties != NULL); 418 | return native_cudaGetDeviceProperties(prop,device); 419 | } 420 | 421 | /// cudaIpcCloseMemHandle /// 422 | typedef cudaError_t (*cudaIpcCloseMemHandle_t)(void* devPtr); 423 | static cudaIpcCloseMemHandle_t native_cudaIpcCloseMemHandle = NULL; 424 | 425 | extern "C" cudaError_t cudaIpcCloseMemHandle (void* devPtr) { 426 | printf("\n>>cudaIpcCloseMemHandle interception\n"); 427 | 428 | if (native_cudaIpcCloseMemHandle == NULL) { 429 | native_cudaIpcCloseMemHandle= (cudaIpcCloseMemHandle_t)dlsym(RTLD_NEXT,"cudaIpcCloseMemHandle"); 430 | } 431 | assert(native_cudaIpcCloseMemHandle != NULL); 432 | return native_cudaIpcCloseMemHandle(devPtr); 433 | } 434 | 435 | /// cudaIpcGetEventHandle /// 436 | typedef cudaError_t (*cudaIpcGetEventHandle_t)(cudaIpcEventHandle_t* handle, cudaEvent_t event); 437 | static cudaIpcGetEventHandle_t native_cudaIpcGetEventHandle = NULL; 438 | 439 | extern "C" cudaError_t cudaIpcGetEventHandle (cudaIpcEventHandle_t* handle, cudaEvent_t event) { 440 | printf("\n>>cudaIpcGetEventHandle interception\n"); 441 | 442 | if (native_cudaIpcGetEventHandle == NULL) { 443 | native_cudaIpcGetEventHandle = (cudaIpcGetEventHandle_t)dlsym(RTLD_NEXT,"cudaIpcGetEventHandle"); 444 | } 445 | assert(native_cudaIpcGetEventHandle != NULL); 446 | return native_cudaIpcGetEventHandle(handle,event); 447 | } 448 | 449 | /// cudaIpcGetMemHandle /// 450 | typedef cudaError_t (*cudaIpcGetMemHandle_t)(cudaIpcMemHandle_t* handle, void* devPtr); 451 | static cudaIpcGetMemHandle_t native_cudaIpcGetMemHandle= NULL; 452 | 453 | extern "C" cudaError_t cudaIpcGetMemHandle (cudaIpcMemHandle_t* handle, void* devPtr) { 454 | printf("\n>>cudaIpcGetMemHandle interception\n"); 455 | 456 | if (native_cudaIpcGetMemHandle == NULL) { 457 | native_cudaIpcGetMemHandle = (cudaIpcGetMemHandle_t)dlsym(RTLD_NEXT,"cudaIpcGetMemHandle"); 458 | } 459 | assert(native_cudaIpcGetMemHandle!= NULL); 460 | return native_cudaIpcGetMemHandle(handle,devPtr); 461 | } 462 | 463 | /// cudaIpcOpenEventHandle /// 464 | typedef cudaError_t (*cudaIpcOpenEventHandle_t)(cudaEvent_t* event, cudaIpcEventHandle_t handle); 465 | static cudaIpcOpenEventHandle_t native_cudaIpcOpenEventHandle = NULL; 466 | 467 | extern "C" cudaError_t cudaIpcOpenEventHandle (cudaEvent_t* event, cudaIpcEventHandle_t handle) { 468 | printf("\n>>cudaIpcOpenEventHandle interception\n"); 469 | 470 | if (native_cudaIpcOpenEventHandle== NULL) { 471 | native_cudaIpcOpenEventHandle = (cudaIpcOpenEventHandle_t)dlsym(RTLD_NEXT,"cudaIpcOpenEventHandle"); 472 | } 473 | assert(native_cudaIpcOpenEventHandle != NULL); 474 | return native_cudaIpcOpenEventHandle(event,handle); 475 | } 476 | 477 | /// cudaIpcOpenMemHandle /// 478 | typedef cudaError_t (*cudaIpcOpenMemHandle_t)(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags); 479 | static cudaIpcOpenMemHandle_t native_cudaIpcOpenMemHandle = NULL; 480 | 481 | extern "C" cudaError_t cudaIpcOpenMemHandle (void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { 482 | printf("\n>>cudaIpcOpenMemHandle interception\n"); 483 | 484 | if (native_cudaIpcOpenMemHandle == NULL) { 485 | native_cudaIpcOpenMemHandle = (cudaIpcOpenMemHandle_t)dlsym(RTLD_NEXT,"cudaIpcOpenMemHandle"); 486 | } 487 | assert(native_cudaIpcOpenMemHandle != NULL); 488 | return native_cudaIpcOpenMemHandle(devPtr,handle,flags); 489 | } 490 | 491 | /// cudaSetDevice /// 492 | typedef 
cudaError_t (*cudaSetDevice_t)(int device); 493 | static cudaSetDevice_t native_cudaSetDevice = NULL; 494 | 495 | extern "C" cudaError_t cudaSetDevice(int device){ 496 | printf("\n>>cudaSetDevice interception \n"); 497 | 498 | if (native_cudaSetDevice == NULL) { 499 | native_cudaSetDevice = (cudaSetDevice_t)dlsym(RTLD_NEXT,"cudaSetDevice"); 500 | } 501 | assert(native_cudaSetDevice != NULL); 502 | return native_cudaSetDevice(device); 503 | } 504 | 505 | /// cudaSetDeviceFlags /// 506 | typedef cudaError_t (*cudaSetDeviceFlags_t)(int flags); 507 | static cudaSetDeviceFlags_t native_cudaSetDeviceFlags = NULL; 508 | 509 | extern "C" cudaError_t cudaSetDeviceFlags(int flags){ 510 | printf("\n>>cudaSetDeviceFlags interception \n"); 511 | 512 | if (native_cudaSetDeviceFlags == NULL) { 513 | native_cudaSetDeviceFlags = (cudaSetDeviceFlags_t)dlsym(RTLD_NEXT,"cudaSetDeviceFlags"); 514 | } 515 | assert(native_cudaSetDeviceFlags != NULL); 516 | return native_cudaSetDeviceFlags(flags); 517 | } 518 | 519 | /// cudaSetValidDevices /// 520 | typedef cudaError_t (*cudaSetValidDevices_t)(int * device_arr, int len); 521 | static cudaSetValidDevices_t native_cudaSetValidDevices = NULL; 522 | 523 | extern "C" cudaError_t cudaSetValidDevices(int * device_arr, int len){ 524 | printf("\n>>cudaSetValidDevices interception \n"); 525 | 526 | if (native_cudaSetValidDevices == NULL) { 527 | native_cudaSetValidDevices = (cudaSetValidDevices_t)dlsym(RTLD_NEXT,"cudaSetValidDevices"); 528 | } 529 | assert(native_cudaSetValidDevices != NULL); 530 | return native_cudaSetValidDevices(device_arr,len); 531 | } 532 | 533 | 534 | 535 | //**********************************************// 536 | // CUDA Runtime API Stream Management // 537 | //**********************************************// 538 | /// cudaStreamAttachMemAsync /// 539 | typedef cudaError_t (*cudaStreamAttachMemAsync_t)(cudaStream_t stream, void* devPtr, size_t length, unsigned int flags); 540 | static cudaStreamAttachMemAsync_t native_cudaStreamAttachMemAsync = NULL; 541 | 542 | extern "C" cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, void* devPtr, size_t length, unsigned int flags){ 543 | printf("\n>>cudaStreamAttachMemAsync interception \n"); 544 | 545 | if (native_cudaStreamAttachMemAsync == NULL) { 546 | native_cudaStreamAttachMemAsync = (cudaStreamAttachMemAsync_t)dlsym(RTLD_NEXT,"cudaStreamAttachMemAsync"); 547 | } 548 | assert(native_cudaStreamAttachMemAsync != NULL); 549 | return native_cudaStreamAttachMemAsync(stream,devPtr,length,flags); 550 | } 551 | 552 | 553 | /// cudaStreamCreate /// 554 | typedef cudaError_t (*cudaStreamCreate_t)(cudaStream_t * pStream); 555 | static cudaStreamCreate_t native_cudaStreamCreate = NULL; 556 | 557 | extern "C" cudaError_t cudaStreamCreate(cudaStream_t * pStream){ 558 | printf("\n>>cudaStreamCreate interception \n"); 559 | 560 | if (native_cudaStreamCreate == NULL) { 561 | native_cudaStreamCreate = (cudaStreamCreate_t)dlsym(RTLD_NEXT,"cudaStreamCreate"); 562 | } 563 | assert(native_cudaStreamCreate != NULL); 564 | return native_cudaStreamCreate(pStream); 565 | } 566 | 567 | /// cudaStreamCreateWithFlags /// 568 | typedef cudaError_t (*cudaStreamCreateWithFlags_t)(cudaStream_t* pStream, unsigned int flags); 569 | static cudaStreamCreateWithFlags_t native_cudaStreamCreateWithFlags = NULL; 570 | 571 | extern "C" cudaError_t cudaStreamCreateWithFlags(cudaStream_t* pStream, unsigned int flags){ 572 | printf("\n>>cudaStreamCreateWithFlags interception \n"); 573 | 574 | if (native_cudaStreamCreateWithFlags == NULL) { 
575 | native_cudaStreamCreateWithFlags = (cudaStreamCreateWithFlags_t)dlsym(RTLD_NEXT,"cudaStreamCreateWithFlags"); 576 | } 577 | assert(native_cudaStreamCreateWithFlags != NULL); 578 | return native_cudaStreamCreateWithFlags(pStream,flags); 579 | } 580 | 581 | /// cudaStreamCreateWithPriority /// 582 | typedef cudaError_t (*cudaStreamCreateWithPriority_t)(cudaStream_t* pStream, unsigned int flags, int priority); 583 | static cudaStreamCreateWithPriority_t native_cudaStreamCreateWithPriority = NULL; 584 | 585 | extern "C" cudaError_t cudaStreamCreateWithPriority(cudaStream_t* pStream, unsigned int flags, int priority){ 586 | printf("\n>>cudaStreamCreateWithPriority interception \n"); 587 | 588 | if (native_cudaStreamCreateWithPriority == NULL) { 589 | native_cudaStreamCreateWithPriority = (cudaStreamCreateWithPriority_t)dlsym(RTLD_NEXT,"cudaStreamCreateWithPriority"); 590 | } 591 | assert(native_cudaStreamCreateWithPriority != NULL); 592 | return native_cudaStreamCreateWithPriority(pStream,flags,priority); 593 | } 594 | 595 | /// cudaStreamDestroy /// 596 | typedef cudaError_t (*cudaStreamDestroy_t)(cudaStream_t stream); 597 | static cudaStreamDestroy_t native_cudaStreamDestroy = NULL; 598 | 599 | extern "C" cudaError_t cudaStreamDestroy(cudaStream_t stream){ 600 | printf("\n>>cudaStreamDestroy interception \n"); 601 | 602 | if (native_cudaStreamDestroy == NULL) { 603 | native_cudaStreamDestroy = (cudaStreamDestroy_t)dlsym(RTLD_NEXT,"cudaStreamDestroy"); 604 | } 605 | assert(native_cudaStreamDestroy != NULL); 606 | return native_cudaStreamDestroy(stream); 607 | } 608 | 609 | 610 | /// cudaStreamGetFlags /// 611 | typedef cudaError_t (*cudaStreamGetFlags_t)(cudaStream_t hStream, unsigned int* flags); 612 | static cudaStreamGetFlags_t native_cudaStreamGetFlags= NULL; 613 | 614 | extern "C" cudaError_t cudaStreamGetFlags(cudaStream_t hStream, unsigned int* flags){ 615 | printf("\n>>cudaStreamGetFlags interception \n"); 616 | 617 | if (native_cudaStreamGetFlags == NULL) { 618 | native_cudaStreamGetFlags = (cudaStreamGetFlags_t)dlsym(RTLD_NEXT,"cudaStreamGetFlags"); 619 | } 620 | assert(native_cudaStreamGetFlags != NULL); 621 | return native_cudaStreamGetFlags(hStream,flags); 622 | } 623 | 624 | /// cudaStreamGetPriority /// 625 | typedef cudaError_t (*cudaStreamGetPriority_t)(cudaStream_t hStream, int* priority); 626 | static cudaStreamGetPriority_t native_cudaStreamGetPriority = NULL; 627 | 628 | extern "C" cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority){ 629 | printf("\n>>cudaStreamGetPriority interception \n"); 630 | 631 | if (native_cudaStreamGetPriority == NULL) { 632 | native_cudaStreamGetPriority = (cudaStreamGetPriority_t)dlsym(RTLD_NEXT,"cudaStreamGetPriority"); 633 | } 634 | assert(native_cudaStreamGetPriority != NULL); 635 | return native_cudaStreamGetPriority(hStream,priority); 636 | } 637 | 638 | /// cudaStreamQuery /// 639 | typedef cudaError_t (*cudaStreamQuery_t)(cudaStream_t stream); 640 | static cudaStreamQuery_t native_cudaStreamQuery = NULL; 641 | 642 | extern "C" cudaError_t cudaStreamQuery(cudaStream_t stream){ 643 | printf("\n>>cudaStreamQuery interception \n"); 644 | 645 | if (native_cudaStreamQuery == NULL) { 646 | native_cudaStreamQuery = (cudaStreamQuery_t)dlsym(RTLD_NEXT,"cudaStreamQuery"); 647 | } 648 | assert(native_cudaStreamQuery != NULL); 649 | return native_cudaStreamQuery(stream); 650 | } 651 | 652 | /// cudaStreamSynchronize /// 653 | typedef cudaError_t (*cudaStreamSynchronize_t)(cudaStream_t stream); 654 | static 
cudaStreamSynchronize_t native_cudaStreamSynchronize = NULL; 655 | 656 | extern "C" cudaError_t cudaStreamSynchronize(cudaStream_t stream){ 657 | printf("\n>>cudaStreamSynchronize interception \n"); 658 | 659 | if (native_cudaStreamSynchronize== NULL) { 660 | native_cudaStreamSynchronize = (cudaStreamSynchronize_t)dlsym(RTLD_NEXT,"cudaStreamSynchronize"); 661 | } 662 | assert(native_cudaStreamSynchronize != NULL); 663 | return native_cudaStreamSynchronize(stream); 664 | } 665 | 666 | /// cudaStreamWaitEvent /// 667 | typedef cudaError_t (*cudaStreamWaitEvent_t)(cudaStream_t stream, cudaEvent_t event, unsigned int flags); 668 | static cudaStreamWaitEvent_t native_cudaStreamWaitEvent = NULL; 669 | 670 | extern "C" cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags){ 671 | printf("\n>>cudaStreamWaitEvent interception \n"); 672 | 673 | if (native_cudaStreamWaitEvent == NULL) { 674 | native_cudaStreamWaitEvent = (cudaStreamWaitEvent_t)dlsym(RTLD_NEXT,"cudaStreamWaitEvent"); 675 | } 676 | assert(native_cudaStreamWaitEvent != NULL); 677 | return native_cudaStreamWaitEvent(stream,event,flags); 678 | } 679 | 680 | 681 | 682 | //*********************************************// 683 | // CUDA Runtime API Event Management // 684 | //*********************************************// 685 | /// cudaDriverGetVersion /// 686 | typedef cudaError_t (*cudaEventCreate_t)(cudaEvent_t * event); 687 | static cudaEventCreate_t native_cudaEventCreate = NULL; 688 | 689 | extern "C" cudaError_t cudaEventCreate (cudaEvent_t * event) { 690 | printf("\n>>cudaEventCreate interception\n"); 691 | 692 | if (native_cudaEventCreate == NULL) { 693 | native_cudaEventCreate = (cudaEventCreate_t)dlsym(RTLD_NEXT,"cudaEventCreate"); 694 | } 695 | assert(native_cudaEventCreate != NULL); 696 | return native_cudaEventCreate(event); 697 | } 698 | 699 | /// cudaEventCreateWithFlags /// 700 | typedef cudaError_t (*cudaEventCreateWithFlags_t)(cudaEvent_t * event, int flags); 701 | static cudaEventCreateWithFlags_t native_cudaEventCreateWithFlags = NULL; 702 | 703 | extern "C" cudaError_t cudaEventCreateWithFlags(cudaEvent_t * event, int flags) { 704 | printf("\n>>cudaEventCreateWithFlags interception\n"); 705 | 706 | if (native_cudaEventCreateWithFlags == NULL) { 707 | native_cudaEventCreateWithFlags = (cudaEventCreateWithFlags_t)dlsym(RTLD_NEXT,"cudaEventCreateWithFlags"); 708 | } 709 | assert(native_cudaEventCreateWithFlags != NULL); 710 | return native_cudaEventCreateWithFlags(event,flags); 711 | } 712 | 713 | /// cudaEventDestroy /// 714 | typedef cudaError_t (*cudaEventDestroy_t)(cudaEvent_t event); 715 | static cudaEventDestroy_t native_cudaEventDestroy = NULL; 716 | 717 | extern "C" cudaError_t cudaEventDestroy (cudaEvent_t event) { 718 | printf("\n>>cudaEventDestroy interception\n"); 719 | 720 | if (native_cudaEventDestroy == NULL) { 721 | native_cudaEventDestroy = (cudaEventDestroy_t)dlsym(RTLD_NEXT,"cudaEventDestroy"); 722 | } 723 | 724 | assert(native_cudaEventDestroy != NULL); 725 | return native_cudaEventDestroy(event); 726 | } 727 | 728 | /// cudaEventElapsedTime /// 729 | typedef cudaError_t (*cudaEventElapsedTime_t)(float * ms, cudaEvent_t start, cudaEvent_t end); 730 | static cudaEventElapsedTime_t native_cudaEventElapsedTime = NULL; 731 | 732 | extern "C" cudaError_t cudaEventElapsedTime (float * ms, cudaEvent_t start,cudaEvent_t end) { 733 | printf("\n>>cudaEventElapsedTime interception\n"); 734 | 735 | if (native_cudaEventElapsedTime == NULL) { 736 | native_cudaEventElapsedTime = 
(cudaEventElapsedTime_t)dlsym(RTLD_NEXT,"cudaEventElapsedTime"); 737 | } 738 | assert(native_cudaEventElapsedTime != NULL); 739 | return native_cudaEventElapsedTime(ms,start,end); 740 | } 741 | 742 | /// cudaEventQuery /// 743 | typedef cudaError_t (*cudaEventQuery_t)(cudaEvent_t event); 744 | static cudaEventQuery_t native_cudaEventQuery = NULL; 745 | 746 | extern "C" cudaError_t cudaEventQuery (cudaEvent_t event) { 747 | printf("\n>>cudaEventQuery interception\n"); 748 | 749 | if (native_cudaEventQuery == NULL) { 750 | native_cudaEventQuery = (cudaEventQuery_t)dlsym(RTLD_NEXT,"cudaEventQuery"); 751 | } 752 | assert(native_cudaEventQuery != NULL); 753 | return native_cudaEventQuery(event); 754 | } 755 | 756 | /// cudaEventRecord /// 757 | typedef cudaError_t (*cudaEventRecord_t)(cudaEvent_t event, cudaStream_t stream); 758 | static cudaEventRecord_t native_cudaEventRecord = NULL; 759 | 760 | extern "C" cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { 761 | printf("\n>>cudaEventRecord interception\n"); 762 | 763 | if (native_cudaEventRecord == NULL) { 764 | native_cudaEventRecord = (cudaEventRecord_t)dlsym(RTLD_NEXT,"cudaEventRecord"); 765 | } 766 | assert(native_cudaEventRecord != NULL); 767 | return native_cudaEventRecord(event,stream); 768 | } 769 | 770 | /// cudaEventSynchronize /// 771 | typedef cudaError_t (*cudaEventSynchronize_t)(cudaEvent_t event); 772 | static cudaEventSynchronize_t native_cudaEventSynchronize = NULL; 773 | 774 | extern "C" cudaError_t cudaEventSynchronize (cudaEvent_t event) { 775 | printf("\n>>cudaEventSynchronize interception\n"); 776 | 777 | if (native_cudaEventSynchronize == NULL) { 778 | native_cudaEventSynchronize = (cudaEventSynchronize_t)dlsym(RTLD_NEXT,"cudaEventSynchronize"); 779 | } 780 | assert(native_cudaEventSynchronize != NULL); 781 | return native_cudaEventSynchronize(event); 782 | } 783 | 784 | 785 | //**********************************************// 786 | // CUDA Runtime API Execution Control // 787 | //**********************************************// 788 | // cudaConfigureCall /// 789 | typedef cudaError_t (*cudaConfigureCall_t)(dim3,dim3,size_t,cudaStream_t); 790 | static cudaConfigureCall_t native_CudaConfigureCall = NULL; 791 | 792 | extern "C" cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem=0, cudaStream_t stream=0) { 793 | assert(kernelInfo().counter == 0); 794 | kernelInfo().gridDim = gridDim; 795 | kernelInfo().blockDim = blockDim; 796 | //kernelInfo().counter++; //increase a counter to indicate an expected cudaLaunch to be completed 797 | printf("\n>>cudaConfigureCall interception\n"); 798 | //call of the real function 799 | if (native_CudaConfigureCall == NULL) 800 | native_CudaConfigureCall = (cudaConfigureCall_t)dlsym(RTLD_NEXT,"cudaConfigureCall"); 801 | 802 | assert(native_CudaConfigureCall != NULL); 803 | return native_CudaConfigureCall(gridDim,blockDim,sharedMem,stream); 804 | } 805 | 806 | 807 | /// cudaFuncGetAttributes /// 808 | typedef cudaError_t (*cudaFuncGetAttributes_t)(struct cudaFuncAttributes * attr, const char * func); 809 | static cudaFuncGetAttributes_t native_cudaFuncGetAttributes = NULL; 810 | 811 | extern "C" cudaError_t cudaFuncGetAttributes (struct cudaFuncAttributes * attr, const char * func) { 812 | printf("\n>>cudaFuncGetAttributes interception\n"); 813 | 814 | if (native_cudaFuncGetAttributes == NULL) { 815 | native_cudaFuncGetAttributes = (cudaFuncGetAttributes_t)dlsym(RTLD_NEXT,"cudaFuncGetAttributes"); 816 | } 817 | assert(native_cudaFuncGetAttributes != 
NULL); 818 | return native_cudaFuncGetAttributes(attr,func); 819 | } 820 | 821 | /// cudaFuncSetAttribute /// 822 | typedef cudaError_t (*cudaFuncSetAttribute_t)(const void* func, cudaFuncAttribute attr, int value); 823 | static cudaFuncSetAttribute_t native_cudaFuncSetAttribute = NULL; 824 | 825 | extern "C" cudaError_t cudaFuncSetAttribute (const void* func, cudaFuncAttribute attr, int value) { 826 | printf("\n>>cudaFuncSetAttribute interception\n"); 827 | 828 | if (native_cudaFuncSetAttribute == NULL) { 829 | native_cudaFuncSetAttribute = (cudaFuncSetAttribute_t)dlsym(RTLD_NEXT,"cudaFuncSetAttribute"); 830 | } 831 | assert(native_cudaFuncSetAttribute != NULL); 832 | return native_cudaFuncSetAttribute(func,attr,value); 833 | } 834 | 835 | /// cudaLaunch /// 836 | typedef cudaError_t (*cudaLaunch_t)(const char* entry); 837 | static cudaLaunch_t native_cudaLaunch = NULL; 838 | 839 | extern "C" cudaError_t cudaLaunch( const char* entry){ 840 | //print_kernel_invocation(entry); 841 | //kernelInfo().counter--; 842 | printf("\n>>cudaLaunch interception\n"); 843 | //call of the real function 844 | if (native_cudaLaunch == NULL) { 845 | native_cudaLaunch = (cudaLaunch_t)dlsym(RTLD_NEXT,"cudaLaunch"); 846 | } 847 | assert(native_cudaLaunch != NULL); 848 | return native_cudaLaunch(entry); 849 | } 850 | 851 | 852 | /// cudaFuncSetCacheConfig /// 853 | typedef cudaError_t (*cudaFuncSetCacheConfig_t)(const void* func, cudaFuncCache cacheConfig); 854 | static cudaFuncSetCacheConfig_t native_cudaFuncSetCacheConfig = NULL; 855 | 856 | extern "C" cudaError_t cudaFuncSetCacheConfig (const void* func, cudaFuncCache cacheConfig) { 857 | printf("\n>>cudaFuncSetCacheConfig interception\n"); 858 | 859 | if (native_cudaFuncSetCacheConfig == NULL) { 860 | native_cudaFuncSetCacheConfig = (cudaFuncSetCacheConfig_t)dlsym(RTLD_NEXT,"cudaFuncSetCacheConfig"); 861 | } 862 | assert(native_cudaFuncSetCacheConfig != NULL); 863 | return native_cudaFuncSetCacheConfig(func,cacheConfig); 864 | } 865 | 866 | /// cudaFuncSetSharedMemConfig /// 867 | typedef cudaError_t (*cudaFuncSetSharedMemConfig_t)(const void* func, cudaSharedMemConfig config); 868 | static cudaFuncSetSharedMemConfig_t native_cudaFuncSetSharedMemConfig = NULL; 869 | 870 | extern "C" cudaError_t cudaFuncSetSharedMemConfig (const void* func, cudaSharedMemConfig config) { 871 | printf("\n>>cudaFuncSetSharedMemConfig interception\n"); 872 | 873 | if (native_cudaFuncSetSharedMemConfig == NULL) { 874 | native_cudaFuncSetSharedMemConfig = (cudaFuncSetSharedMemConfig_t)dlsym(RTLD_NEXT,"cudaFuncSetSharedMemConfig"); 875 | } 876 | assert(native_cudaFuncSetSharedMemConfig != NULL); 877 | return native_cudaFuncSetSharedMemConfig(func,config); 878 | } 879 | 880 | /// cudaGetParameterBuffer /// 881 | typedef cudaError_t (*cudaGetParameterBuffer_t)(size_t alignment, size_t size); 882 | static cudaGetParameterBuffer_t native_cudaGetParameterBuffer = NULL; 883 | 884 | extern "C" cudaError_t cudaGetParameterBuffer (size_t alignment, size_t size) { 885 | printf("\n>>cudaGetParameterBuffer interception\n"); 886 | 887 | if (native_cudaGetParameterBuffer == NULL) { 888 | native_cudaGetParameterBuffer = (cudaGetParameterBuffer_t)dlsym(RTLD_NEXT,"cudaGetParameterBuffer"); 889 | } 890 | assert(native_cudaGetParameterBuffer != NULL); 891 | return native_cudaGetParameterBuffer(alignment,size); 892 | } 893 | 894 | /// cudaGetParameterBufferV2 /// 895 | typedef cudaError_t (*cudaGetParameterBufferV2_t)(void* func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize); 896 
| static cudaGetParameterBufferV2_t native_cudaGetParameterBufferV2 = NULL; 897 | 898 | extern "C" cudaError_t cudaGetParameterBufferV2 (void* func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize) { 899 | printf("\n>>cudaGetParameterBufferV2 interception\n"); 900 | 901 | if (native_cudaGetParameterBufferV2 == NULL) { 902 | native_cudaGetParameterBufferV2 = (cudaGetParameterBufferV2_t)dlsym(RTLD_NEXT,"cudaGetParameterBufferV2"); 903 | } 904 | assert(native_cudaGetParameterBufferV2 != NULL); 905 | return native_cudaGetParameterBufferV2(func,gridDimension,blockDimension,sharedMemSize); 906 | } 907 | 908 | /// cudaLaunchCooperativeKernel /// 909 | typedef cudaError_t (*cudaLaunchCooperativeKernel_t)(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream); 910 | static cudaLaunchCooperativeKernel_t native_cudaLaunchCooperativeKernel = NULL; 911 | 912 | extern "C" cudaError_t cudaLaunchCooperativeKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream) { 913 | printf("\n>>cudaLaunchCooperativeKernel interception\n"); 914 | 915 | if (native_cudaLaunchCooperativeKernel == NULL) { 916 | native_cudaLaunchCooperativeKernel = (cudaLaunchCooperativeKernel_t)dlsym(RTLD_NEXT,"cudaLaunchCooperativeKernel"); 917 | } 918 | assert(native_cudaLaunchCooperativeKernel != NULL); 919 | return native_cudaLaunchCooperativeKernel(func,gridDim,blockDim,args,sharedMem,stream); 920 | } 921 | 922 | /// cudaLaunchCooperativeKernelMultiDevice /// 923 | typedef cudaError_t (*cudaLaunchCooperativeKernelMultiDevice_t)(cudaLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags); 924 | static cudaLaunchCooperativeKernelMultiDevice_t native_cudaLaunchCooperativeKernelMultiDevice = NULL; 925 | 926 | extern "C" cudaError_t cudaLaunchCooperativeKernelMultiDevice (cudaLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags) { 927 | printf("\n>>cudaLaunchCooperativeKernelMultiDevice interception\n"); 928 | 929 | if (native_cudaLaunchCooperativeKernelMultiDevice == NULL) { 930 | native_cudaLaunchCooperativeKernelMultiDevice = (cudaLaunchCooperativeKernelMultiDevice_t)dlsym(RTLD_NEXT,"cudaLaunchCooperativeKernelMultiDevice"); 931 | } 932 | assert(native_cudaLaunchCooperativeKernelMultiDevice != NULL); 933 | return native_cudaLaunchCooperativeKernelMultiDevice(launchParamsList,numDevices,flags); 934 | } 935 | 936 | 937 | 938 | /// cudaLaunchKernel /// 939 | typedef cudaError_t (*cudaLaunchKernel_t)(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream); 940 | static cudaLaunchKernel_t native_cudaLaunchKernel = NULL; 941 | 942 | extern "C" cudaError_t cudaLaunchKernel (const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream) { 943 | printf("\n>>cudaLaunchKernel interception\n"); 944 | 945 | if (native_cudaLaunchKernel == NULL) { 946 | native_cudaLaunchKernel = (cudaLaunchKernel_t)dlsym(RTLD_NEXT,"cudaLaunchKernel"); 947 | } 948 | assert(native_cudaLaunchKernel != NULL); 949 | return native_cudaLaunchKernel(func,gridDim,blockDim,args,sharedMem,stream); 950 | } 951 | 952 | /// cudaSetDoubleForDevice /// 953 | typedef cudaError_t (*cudaSetDoubleForDevice_t)(double *d); 954 | static cudaSetDoubleForDevice_t native_cudaSetDoubleForDevice = NULL; 955 | 956 | extern "C" cudaError_t cudaSetDoubleForDevice (double *d) { 957 | printf("\n>>cudaSetDoubleForDevice interception\n"); 958 | 959 | if 
(native_cudaSetDoubleForDevice == NULL) { 960 | native_cudaSetDoubleForDevice = (cudaSetDoubleForDevice_t)dlsym(RTLD_NEXT,"cudaSetDoubleForDevice"); 961 | } 962 | assert(native_cudaSetDoubleForDevice != NULL); 963 | return native_cudaSetDoubleForDevice(d); 964 | } 965 | 966 | /// cudaSetDoubleForHost /// 967 | typedef cudaError_t (*cudaSetDoubleForHost_t)(double *d); 968 | static cudaSetDoubleForHost_t native_cudaSetDoubleForHost = NULL; 969 | 970 | extern "C" cudaError_t cudaSetDoubleForHost (double *d) { 971 | printf("\n>>cudaSetDoubleForHost interception\n"); 972 | 973 | if (native_cudaSetDoubleForHost == NULL) { 974 | native_cudaSetDoubleForHost = (cudaSetDoubleForHost_t)dlsym(RTLD_NEXT,"cudaSetDoubleForHost"); 975 | } 976 | assert(native_cudaSetDoubleForHost != NULL); 977 | return native_cudaSetDoubleForHost(d); 978 | } 979 | 980 | /* cudaSetupArgument /// 981 | typedef cudaError_t (*cudaSetupArgument_t)(const void *, size_t, size_t); 982 | static cudaSetupArgument_t native_CudaSetupArgument = NULL; 983 | 984 | extern "C" cudaError_t cudaSetupArgument(const void *arg, size_t size, size_t offset) { 985 | kernelInfo().arguments.push_back(const_cast(arg)); 986 | 987 | //call of the real function 988 | if (native_CudaSetupArgument == NULL) { 989 | native_CudaSetupArgument = (cudaSetupArgument_t)dlsym(RTLD_NEXT,"cudaSetupArgument"); 990 | } 991 | assert(native_CudaSetupArgument != NULL); 992 | return native_CudaSetupArgument(arg, size, offset); 993 | } 994 | */ 995 | 996 | 997 | //**********************************************// 998 | // CUDA Runtime API Memory Management // 999 | //**********************************************// 1000 | /// cudaFree /// 1001 | typedef cudaError_t (*cudaFree_t)(void * devPtr); 1002 | static cudaFree_t native_cudaFree = NULL; 1003 | 1004 | extern "C" cudaError_t cudaFree (void * devPtr) { 1005 | printf("\n>>cudaFree interception\n"); 1006 | 1007 | if (native_cudaFree == NULL) { 1008 | native_cudaFree = (cudaFree_t)dlsym(RTLD_NEXT,"cudaFree"); 1009 | } 1010 | assert(native_cudaFree != NULL); 1011 | return native_cudaFree(devPtr); 1012 | } 1013 | 1014 | 1015 | /// cudaFreeArray /// 1016 | typedef cudaError_t (*cudaFreeArray_t)(struct cudaArray * array); 1017 | static cudaFreeArray_t native_cudaFreeArray = NULL; 1018 | 1019 | extern "C" cudaError_t cudaFreeArray (struct cudaArray * array) { 1020 | printf("\n>>cudaFreeArray interception\n"); 1021 | 1022 | if (native_cudaFreeArray == NULL) { 1023 | native_cudaFreeArray = (cudaFreeArray_t)dlsym(RTLD_NEXT,"cudaFreeArray"); 1024 | } 1025 | assert(native_cudaFreeArray != NULL); 1026 | return native_cudaFreeArray(array); 1027 | } 1028 | 1029 | 1030 | /// cudaFreeHost /// 1031 | typedef cudaError_t (*cudaFreeHost_t)(void * ptr); 1032 | static cudaFreeHost_t native_cudaFreeHost = NULL; 1033 | 1034 | extern "C" cudaError_t cudaFreeHost(void * ptr) { 1035 | printf("\n>>cudaFreeHost interception\n"); 1036 | 1037 | if (native_cudaFreeHost == NULL) { 1038 | native_cudaFreeHost = (cudaFreeHost_t)dlsym(RTLD_NEXT,"cudaFreeHost"); 1039 | } 1040 | assert(native_cudaFreeHost != NULL); 1041 | return native_cudaFreeHost(ptr); 1042 | } 1043 | 1044 | 1045 | /// cudaGetSymbolAddress /// 1046 | typedef cudaError_t (*cudaGetSymbolAddress_t)(void ** devPtr, const char * symbol); 1047 | static cudaGetSymbolAddress_t native_cudaGetSymbolAddress = NULL; 1048 | 1049 | extern "C" cudaError_t cudaGetSymbolAddress (void ** devPtr, const char * symbol) { 1050 | printf("\n>>cudaGetSymbolAddress interception\n"); 1051 | 1052 | if 
(native_cudaGetSymbolAddress == NULL) { 1053 | native_cudaGetSymbolAddress = (cudaGetSymbolAddress_t)dlsym(RTLD_NEXT,"cudaGetSymbolAddress"); 1054 | } 1055 | assert(native_cudaGetSymbolAddress != NULL); 1056 | return native_cudaGetSymbolAddress(devPtr,symbol); 1057 | } 1058 | 1059 | 1060 | /// cudaGetSymbolSize /// 1061 | typedef cudaError_t (*cudaGetSymbolSize_t)(size_t * size, const char * symbol); 1062 | static cudaGetSymbolSize_t native_cudaGetSymbolSize = NULL; 1063 | 1064 | extern "C" cudaError_t cudaGetSymbolSize(size_t * size, const char * symbol) { 1065 | printf("\n>>cudaGetSymbolSize interception\n"); 1066 | 1067 | if (native_cudaGetSymbolSize == NULL) { 1068 | native_cudaGetSymbolSize = (cudaGetSymbolSize_t)dlsym(RTLD_NEXT,"cudaGetSymbolSize"); 1069 | } 1070 | assert(native_cudaGetSymbolSize != NULL); 1071 | return native_cudaGetSymbolSize(size,symbol); 1072 | } 1073 | 1074 | 1075 | /// cudaHostAlloc /// 1076 | typedef cudaError_t (*cudaHostAlloc_t)(void ** ptr, size_t size, unsigned int flags); 1077 | static cudaHostAlloc_t native_cudaHostAlloc = NULL; 1078 | 1079 | extern "C" cudaError_t cudaHostAlloc (void ** ptr, size_t size, unsigned int flags) { 1080 | printf("\n>>cudaHostAlloc interception\n"); 1081 | 1082 | if (native_cudaHostAlloc == NULL) { 1083 | native_cudaHostAlloc = (cudaHostAlloc_t)dlsym(RTLD_NEXT,"cudaHostAlloc"); 1084 | } 1085 | assert(native_cudaHostAlloc != NULL); 1086 | return native_cudaHostAlloc(ptr,size,flags); 1087 | } 1088 | 1089 | 1090 | /// cudaHostGetDevicePointer /// 1091 | typedef cudaError_t (*cudaHostGetDevicePointer_t)(void ** pDevice, void * pHost, unsigned int flags); 1092 | static cudaHostGetDevicePointer_t native_cudaHostGetDevicePointer = NULL; 1093 | 1094 | extern "C" cudaError_t cudaHostGetDevicePointer(void ** pDevice, void * pHost, unsigned int flags) { 1095 | printf("\n>>cudaHostGetDevicePointer interception\n"); 1096 | 1097 | if (native_cudaHostGetDevicePointer == NULL) { 1098 | native_cudaHostGetDevicePointer = (cudaHostGetDevicePointer_t)dlsym(RTLD_NEXT,"cudaHostGetDevicePointer"); 1099 | } 1100 | assert(native_cudaHostGetDevicePointer != NULL); 1101 | return native_cudaHostGetDevicePointer(pDevice,pHost,flags); 1102 | } 1103 | 1104 | 1105 | /// cudaHostGetFlags /// 1106 | typedef cudaError_t (*cudaHostGetFlags_t)(unsigned int * pFlags, void * pHost); 1107 | static cudaHostGetFlags_t native_cudaHostGetFlags = NULL; 1108 | 1109 | extern "C" cudaError_t cudaHostGetFlags(unsigned int * pFlags, void * pHost) { 1110 | printf("\n>>cudaHostGetFlags interception\n"); 1111 | 1112 | if (native_cudaHostGetFlags == NULL) { 1113 | native_cudaHostGetFlags = (cudaHostGetFlags_t)dlsym(RTLD_NEXT,"cudaHostGetFlags"); 1114 | } 1115 | assert(native_cudaHostGetFlags != NULL); 1116 | return native_cudaHostGetFlags(pFlags,pHost); 1117 | } 1118 | 1119 | 1120 | /// cudaMalloc /// 1121 | typedef cudaError_t (*cudaMalloc_t)(void ** devPtr, size_t size); 1122 | static cudaMalloc_t native_cudaMalloc = NULL; 1123 | 1124 | extern "C" cudaError_t cudaMalloc(void ** devPtr, size_t size) { 1125 | printf("\n>>cudaMalloc interception\n"); 1126 | 1127 | if (native_cudaMalloc == NULL) { 1128 | native_cudaMalloc = (cudaMalloc_t)dlsym(RTLD_NEXT,"cudaMalloc"); 1129 | } 1130 | assert(native_cudaMalloc != NULL); 1131 | return native_cudaMalloc(devPtr,size); 1132 | } 1133 | 1134 | 1135 | /// cudaMalloc3D /// 1136 | typedef cudaError_t (*cudaMalloc3D_t)(struct cudaPitchedPtr * pitchedDevPtr, struct cudaExtent extent); 1137 | static cudaMalloc3D_t native_cudaMalloc3D = NULL; 
1138 | 1139 | extern "C" cudaError_t cudaMalloc3D (struct cudaPitchedPtr * pitchedDevPtr, struct cudaExtent extent) { 1140 | printf("\n>>cudaMalloc3D interception\n"); 1141 | 1142 | if (native_cudaMalloc3D == NULL) { 1143 | native_cudaMalloc3D = (cudaMalloc3D_t)dlsym(RTLD_NEXT,"cudaMalloc3D"); 1144 | } 1145 | assert(native_cudaMalloc3D != NULL); 1146 | return native_cudaMalloc3D(pitchedDevPtr,extent); 1147 | } 1148 | 1149 | 1150 | /// cudaMalloc3DArray /// 1151 | typedef cudaError_t (*cudaMalloc3DArray_t)(struct cudaArray ** arrayPtr, const struct cudaChannelFormatDesc * desc, struct cudaExtent extent); 1152 | static cudaMalloc3DArray_t native_cudaMalloc3DArray = NULL; 1153 | 1154 | extern "C" cudaError_t cudaMalloc3DArray (struct cudaArray ** arrayPtr, const struct cudaChannelFormatDesc * desc, struct cudaExtent extent) { 1155 | printf("\n>>cudaMalloc3DArray interception\n"); 1156 | 1157 | if (native_cudaMalloc3DArray == NULL) { 1158 | native_cudaMalloc3DArray = (cudaMalloc3DArray_t)dlsym(RTLD_NEXT,"cudaMalloc3DArray"); 1159 | } 1160 | assert(native_cudaMalloc3DArray != NULL); 1161 | return native_cudaMalloc3DArray(arrayPtr,desc,extent); 1162 | } 1163 | 1164 | 1165 | /// cudaMallocArray /// 1166 | typedef cudaError_t (*cudaMallocArray_t)(struct cudaArray ** arrayPtr, const struct cudaChannelFormatDesc * desc, size_t width, size_t height); 1167 | static cudaMallocArray_t native_cudaMallocArray = NULL; 1168 | 1169 | extern "C" cudaError_t cudaMallocArray (struct cudaArray ** arrayPtr, const struct cudaChannelFormatDesc * desc, size_t width, size_t height) { 1170 | printf("\n>>cudaMallocArray interception\n"); 1171 | 1172 | if (native_cudaMallocArray == NULL) { 1173 | native_cudaMallocArray = (cudaMallocArray_t)dlsym(RTLD_NEXT,"cudaMallocArray"); 1174 | } 1175 | assert(native_cudaMallocArray != NULL); 1176 | return native_cudaMallocArray(arrayPtr,desc,width,height); 1177 | } 1178 | 1179 | 1180 | /// cudaMallocHost /// 1181 | typedef cudaError_t (*cudaMallocHost_t)(void ** ptr,size_t size); 1182 | static cudaMallocHost_t native_cudaMallocHost = NULL; 1183 | 1184 | extern "C" cudaError_t cudaMallocHost (void ** ptr,size_t size) { 1185 | printf("\n>>cudaMallocHost interception\n"); 1186 | 1187 | if (native_cudaMallocHost == NULL) { 1188 | native_cudaMallocHost = (cudaMallocHost_t)dlsym(RTLD_NEXT,"cudaMallocHost"); 1189 | } 1190 | assert(native_cudaMallocHost != NULL); 1191 | return native_cudaMallocHost(ptr,size); 1192 | } 1193 | 1194 | 1195 | /// cudaMallocPitch /// 1196 | typedef cudaError_t (*cudaMallocPitch_t)(void ** devPtr, size_t * pitch, size_t width, size_t height); 1197 | static cudaMallocPitch_t native_cudaMallocPitch = NULL; 1198 | 1199 | extern "C" cudaError_t cudaMallocPitch (void ** devPtr, size_t * pitch, size_t width, size_t height) { 1200 | printf("\n>>cudaMallocPitch interception\n"); 1201 | 1202 | if (native_cudaMallocPitch == NULL) { 1203 | native_cudaMallocPitch = (cudaMallocPitch_t)dlsym(RTLD_NEXT,"cudaMallocPitch"); 1204 | } 1205 | assert(native_cudaMallocPitch != NULL); 1206 | return native_cudaMallocPitch(devPtr,pitch,width,height); 1207 | } 1208 | 1209 | 1210 | /// cudaMemcpy /// 1211 | typedef cudaError_t (*cudaMemcpy_t)(void * dst, const void * src, size_t count, enum cudaMemcpyKind kind); 1212 | static cudaMemcpy_t native_cudaMemcpy = NULL; 1213 | 1214 | extern "C" cudaError_t cudaMemcpy (void * dst, const void * src, size_t count, enum cudaMemcpyKind kind) { 1215 | printf("\n>>cudaMemcpy interception\n"); 1216 | 1217 | if (native_cudaMemcpy == NULL) { 1218 | 
native_cudaMemcpy = (cudaMemcpy_t)dlsym(RTLD_NEXT,"cudaMemcpy"); 1219 | } 1220 | assert(native_cudaMemcpy != NULL); 1221 | return native_cudaMemcpy(dst,src,count,kind); 1222 | } 1223 | 1224 | 1225 | /// cudaMemcpy2D /// 1226 | typedef cudaError_t (*cudaMemcpy2D_t)(void * dst, size_t dpitch, const void * src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); 1227 | static cudaMemcpy2D_t native_cudaMemcpy2D= NULL; 1228 | 1229 | extern "C" cudaError_t cudaMemcpy2D (void * dst, size_t dpitch, const void * src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { 1230 | printf("\n>>cudaMemcpy2D interception\n"); 1231 | 1232 | if (native_cudaMemcpy2D == NULL) { 1233 | native_cudaMemcpy2D = (cudaMemcpy2D_t)dlsym(RTLD_NEXT,"cudaMemcpy2D"); 1234 | } 1235 | assert(native_cudaMemcpy2D != NULL); 1236 | return native_cudaMemcpy2D(dst,dpitch,src,spitch,width,height,kind); 1237 | } 1238 | 1239 | 1240 | /// cudaMemcpy2DArrayToArray /// 1241 | typedef cudaError_t (*cudaMemcpy2DArrayToArray_t)(struct cudaArray * dst, 1242 | size_t wOffsetDst, 1243 | size_t hOffsetDst, 1244 | const struct cudaArray * src, 1245 | size_t wOffsetSrc, 1246 | size_t hOffsetSrc, 1247 | size_t width, 1248 | size_t height, 1249 | enum cudaMemcpyKind kind); 1250 | 1251 | static cudaMemcpy2DArrayToArray_t native_cudaMemcpy2DArrayToArray = NULL; 1252 | 1253 | extern "C" cudaError_t cudaMemcpy2DArrayToArray (struct cudaArray * dst, 1254 | size_t wOffsetDst, 1255 | size_t hOffsetDst, 1256 | const struct cudaArray * src, 1257 | size_t wOffsetSrc, 1258 | size_t hOffsetSrc, 1259 | size_t width, 1260 | size_t height, 1261 | enum cudaMemcpyKind kind) { 1262 | printf("\n>>cudaMalloc3D interception\n"); 1263 | 1264 | if (native_cudaMemcpy2DArrayToArray == NULL) { 1265 | native_cudaMemcpy2DArrayToArray = (cudaMemcpy2DArrayToArray_t)dlsym(RTLD_NEXT,"cudaMemcpy2DArrayToArray"); 1266 | } 1267 | assert(native_cudaMemcpy2DArrayToArray != NULL); 1268 | return native_cudaMemcpy2DArrayToArray(dst,wOffsetDst,hOffsetDst,src,wOffsetSrc,hOffsetSrc,width,height,kind); 1269 | } 1270 | 1271 | 1272 | /// cudaMemcpy2DAsync /// 1273 | typedef cudaError_t (*cudaMemcpy2DAsync_t)(void * dst, 1274 | size_t dpitch, 1275 | const void * src, 1276 | size_t spitch, 1277 | size_t width, 1278 | size_t height, 1279 | enum cudaMemcpyKind kind, 1280 | cudaStream_t stream); 1281 | 1282 | static cudaMemcpy2DAsync_t native_cudaMemcpy2DAsync = NULL; 1283 | 1284 | extern "C" cudaError_t cudaMemcpy2DAsync (void * dst, 1285 | size_t dpitch, 1286 | const void * src, 1287 | size_t spitch, 1288 | size_t width, 1289 | size_t height, 1290 | enum cudaMemcpyKind kind, 1291 | cudaStream_t stream) { 1292 | printf("\n>>cudaMemcpy2DAsync interception\n"); 1293 | 1294 | if (native_cudaMemcpy2DAsync == NULL) { 1295 | native_cudaMemcpy2DAsync = (cudaMemcpy2DAsync_t)dlsym(RTLD_NEXT,"cudaMemcpy2DAsync"); 1296 | } 1297 | assert(native_cudaMemcpy2DAsync != NULL); 1298 | return native_cudaMemcpy2DAsync(dst,dpitch,src,spitch,width,height,kind,stream); 1299 | } 1300 | 1301 | 1302 | /// cudaMemcpy2DFromArray /// 1303 | typedef cudaError_t (*cudaMemcpy2DFromArray_t)(void * dst, 1304 | size_t dpitch, 1305 | const struct cudaArray * src, 1306 | size_t wOffset, 1307 | size_t hOffset, 1308 | size_t width, 1309 | size_t height, 1310 | enum cudaMemcpyKind kind); 1311 | 1312 | static cudaMemcpy2DFromArray_t native_cudaMemcpy2DFromArray = NULL; 1313 | 1314 | extern "C" cudaError_t cudaMemcpy2DFromArray (void * dst, 1315 | size_t dpitch, 1316 | const struct cudaArray * src, 
1317 | size_t wOffset, 1318 | size_t hOffset, 1319 | size_t width, 1320 | size_t height, 1321 | enum cudaMemcpyKind kind){ 1322 | 1323 | printf("\n>>cudaMemcpy2DFromArray interception\n"); 1324 | 1325 | if (native_cudaMemcpy2DFromArray == NULL) { 1326 | native_cudaMemcpy2DFromArray = (cudaMemcpy2DFromArray_t)dlsym(RTLD_NEXT,"cudaMemcpy2DFromArray"); 1327 | } 1328 | assert(native_cudaMemcpy2DFromArray != NULL); 1329 | return native_cudaMemcpy2DFromArray(dst,dpitch,src,wOffset,hOffset,width,height,kind); 1330 | } 1331 | 1332 | 1333 | 1334 | /// cudaMemcpy2DFromArrayAsync /// 1335 | typedef cudaError_t (*cudaMemcpy2DFromArrayAsync_t)(void * dst, 1336 | size_t dpitch, 1337 | const struct cudaArray * src, 1338 | size_t wOffset, 1339 | size_t hOffset, 1340 | size_t width, 1341 | size_t height, 1342 | enum cudaMemcpyKind kind, 1343 | cudaStream_t stream); 1344 | 1345 | static cudaMemcpy2DFromArrayAsync_t native_cudaMemcpy2DFromArrayAsync = NULL; 1346 | 1347 | extern "C" cudaError_t cudaMemcpy2DFromArrayAsync (void * dst, 1348 | size_t dpitch, 1349 | const struct cudaArray * src, 1350 | size_t wOffset, 1351 | size_t hOffset, 1352 | size_t width, 1353 | size_t height, 1354 | enum cudaMemcpyKind kind, 1355 | cudaStream_t stream){ 1356 | 1357 | printf("\n>>cudaMemcpy2DFromArrayAsync interception\n"); 1358 | 1359 | if (native_cudaMemcpy2DFromArrayAsync == NULL) { 1360 | native_cudaMemcpy2DFromArrayAsync = (cudaMemcpy2DFromArrayAsync_t)dlsym(RTLD_NEXT,"cudaMemcpy2DFromArrayAsync"); 1361 | } 1362 | assert(native_cudaMemcpy2DFromArrayAsync != NULL); 1363 | return native_cudaMemcpy2DFromArrayAsync(dst,dpitch,src,wOffset,hOffset,width,height,kind,stream); 1364 | } 1365 | 1366 | 1367 | 1368 | 1369 | 1370 | 1371 | 1372 | 1373 | 1374 | 1375 | /// cudaMemcpy2DToArray /// 1376 | typedef cudaError_t (*cudaMemcpy2DToArray_t)(struct cudaArray * dst, 1377 | size_t wOffset, 1378 | size_t hOffset, 1379 | const void * src, 1380 | size_t spitch, 1381 | size_t width, 1382 | size_t height, 1383 | enum cudaMemcpyKind kind); 1384 | 1385 | static cudaMemcpy2DToArray_t native_cudaMemcpy2DToArray= NULL; 1386 | 1387 | extern "C" cudaError_t cudaMemcpy2DToArray (struct cudaArray * dst, 1388 | size_t wOffset, 1389 | size_t hOffset, 1390 | const void * src, 1391 | size_t spitch, 1392 | size_t width, 1393 | size_t height, 1394 | enum cudaMemcpyKind kind) { 1395 | 1396 | printf("\n>>cudaMemcpy2DToArray interception\n"); 1397 | 1398 | if (native_cudaMemcpy2DToArray == NULL) { 1399 | native_cudaMemcpy2DToArray = (cudaMemcpy2DToArray_t)dlsym(RTLD_NEXT,"cudaMemcpy2DToArray"); 1400 | } 1401 | assert(native_cudaMemcpy2DToArray != NULL); 1402 | return native_cudaMemcpy2DToArray(dst,wOffset,hOffset,src,spitch,width,height,kind); 1403 | } 1404 | 1405 | 1406 | /// cudaMemcpy2DToArrayAsync /// 1407 | typedef cudaError_t (*cudaMemcpy2DToArrayAsync_t)(struct cudaArray * dst, 1408 | size_t wOffset, 1409 | size_t hOffset, 1410 | const void * src, 1411 | size_t spitch, 1412 | size_t width, 1413 | size_t height, 1414 | enum cudaMemcpyKind kind, 1415 | cudaStream_t stream); 1416 | 1417 | static cudaMemcpy2DToArrayAsync_t native_cudaMemcpy2DToArrayAsync = NULL; 1418 | 1419 | extern "C" cudaError_t cudaMemcpy2DToArrayAsync (struct cudaArray * dst, 1420 | size_t wOffset, 1421 | size_t hOffset, 1422 | const void * src, 1423 | size_t spitch, 1424 | size_t width, 1425 | size_t height, 1426 | enum cudaMemcpyKind kind, 1427 | cudaStream_t stream) { 1428 | 1429 | printf("\n>>cudaMemcpy2DToArrayAsync interception\n"); 1430 | 1431 | if 
(native_cudaMemcpy2DToArrayAsync == NULL) { 1432 | native_cudaMemcpy2DToArrayAsync = (cudaMemcpy2DToArrayAsync_t)dlsym(RTLD_NEXT,"cudaMemcpy2DToArrayAsync"); 1433 | } 1434 | assert(native_cudaMemcpy2DToArrayAsync != NULL); 1435 | return native_cudaMemcpy2DToArrayAsync(dst,wOffset,hOffset,src,spitch,width,height,kind,stream); 1436 | } 1437 | 1438 | 1439 | /// cudaMemcpy3D /// 1440 | typedef cudaError_t (*cudaMemcpy3D_t)(const struct cudaMemcpy3DParms * p); 1441 | static cudaMemcpy3D_t native_cudaMemcpy3D = NULL; 1442 | 1443 | extern "C" cudaError_t cudaMemcpy3D (const struct cudaMemcpy3DParms * p) { 1444 | printf("\n>>cudaMemcpy3D interception\n"); 1445 | 1446 | if (native_cudaMemcpy3D== NULL) { 1447 | native_cudaMemcpy3D = (cudaMemcpy3D_t)dlsym(RTLD_NEXT,"cudaMemcpy3D"); 1448 | } 1449 | assert(native_cudaMemcpy3D != NULL); 1450 | return native_cudaMemcpy3D(p); 1451 | } 1452 | 1453 | 1454 | /// cudaMemcpy3DAsync /// 1455 | typedef cudaError_t (*cudaMemcpy3DAsync_t)(const struct cudaMemcpy3DParms * p, cudaStream_t stream); 1456 | static cudaMemcpy3DAsync_t native_cudaMemcpy3DAsync = NULL; 1457 | 1458 | extern "C" cudaError_t cudaMemcpy3DAsync (const struct cudaMemcpy3DParms * p, cudaStream_t stream) { 1459 | printf("\n>>cudaMemcpy3DAsync interception\n"); 1460 | 1461 | if (native_cudaMemcpy3DAsync == NULL) { 1462 | native_cudaMemcpy3DAsync = (cudaMemcpy3DAsync_t)dlsym(RTLD_NEXT,"cudaMemcpy3DAsync"); 1463 | } 1464 | assert(native_cudaMemcpy3DAsync != NULL); 1465 | return native_cudaMemcpy3DAsync(p,stream); 1466 | } 1467 | 1468 | 1469 | /// cudaMemcpyArrayToArray /// 1470 | typedef cudaError_t (*cudaMemcpyArrayToArray_t)(struct cudaArray * dst, 1471 | size_t wOffsetDst, 1472 | size_t hOffsetDst, 1473 | const struct cudaArray * src, 1474 | size_t wOffsetSrc, 1475 | size_t hOffsetSrc, 1476 | size_t count, 1477 | enum cudaMemcpyKind kind); 1478 | 1479 | static cudaMemcpyArrayToArray_t native_cudaMemcpyArrayToArray = NULL; 1480 | 1481 | extern "C" cudaError_t cudaMemcpyArrayToArray(struct cudaArray * dst, 1482 | size_t wOffsetDst, 1483 | size_t hOffsetDst, 1484 | const struct cudaArray * src, 1485 | size_t wOffsetSrc, 1486 | size_t hOffsetSrc, 1487 | size_t count, 1488 | enum cudaMemcpyKind kind){ 1489 | 1490 | printf("\n>>cudaMemcpyArrayToArray interception\n"); 1491 | 1492 | if (native_cudaMemcpyArrayToArray == NULL) { 1493 | native_cudaMemcpyArrayToArray = (cudaMemcpyArrayToArray_t)dlsym(RTLD_NEXT,"cudaMemcpyArrayToArray"); 1494 | } 1495 | assert(native_cudaMemcpyArrayToArray != NULL); 1496 | return native_cudaMemcpyArrayToArray(dst,wOffsetDst,hOffsetDst,src,wOffsetSrc,hOffsetSrc,count,kind); 1497 | } 1498 | 1499 | 1500 | /// cudaMemcpyAsync /// 1501 | typedef cudaError_t (*cudaMemcpyAsync_t)(void * dst, 1502 | const void * src, 1503 | size_t count, 1504 | enum cudaMemcpyKind kind, 1505 | cudaStream_t stream); 1506 | 1507 | static cudaMemcpyAsync_t native_cudaMemcpyAsync = NULL; 1508 | 1509 | extern "C" cudaError_t cudaMemcpyAsync (void * dst, 1510 | const void * src, 1511 | size_t count, 1512 | enum cudaMemcpyKind kind, 1513 | cudaStream_t stream) { 1514 | 1515 | printf("\n>>cudaMemcpyAsync interception\n"); 1516 | 1517 | if (native_cudaMemcpyAsync == NULL) { 1518 | native_cudaMemcpyAsync = (cudaMemcpyAsync_t)dlsym(RTLD_NEXT,"cudaMemcpyAsync"); 1519 | } 1520 | assert(native_cudaMemcpyAsync != NULL); 1521 | return native_cudaMemcpyAsync(dst,src,count,kind,stream); 1522 | } 1523 | 1524 | 1525 | /// cudaMemcpyFromArray /// 1526 | typedef cudaError_t (*cudaMemcpyFromArray_t)(void * dst, 1527 | 
const struct cudaArray * src, 1528 | size_t wOffset, 1529 | size_t hOffset, 1530 | size_t count, 1531 | enum cudaMemcpyKind kind); 1532 | 1533 | static cudaMemcpyFromArray_t native_cudaMemcpyFromArray = NULL; 1534 | 1535 | extern "C" cudaError_t cudaMemcpyFromArray (void * dst, 1536 | const struct cudaArray * src, 1537 | size_t wOffset, 1538 | size_t hOffset, 1539 | size_t count, 1540 | enum cudaMemcpyKind kind){ 1541 | 1542 | printf("\n>>cudaMemcpyFromArray interception\n"); 1543 | 1544 | if (native_cudaMemcpyFromArray == NULL) { 1545 | native_cudaMemcpyFromArray = (cudaMemcpyFromArray_t)dlsym(RTLD_NEXT,"cudaMemcpyFromArray"); 1546 | } 1547 | assert(native_cudaMemcpyFromArray != NULL); 1548 | return native_cudaMemcpyFromArray(dst,src,wOffset,hOffset,count,kind); 1549 | } 1550 | 1551 | 1552 | /// cudaMemcpyFromArrayAsync /// 1553 | typedef cudaError_t (*cudaMemcpyFromArrayAsync_t)(void * dst, 1554 | const struct cudaArray * src, 1555 | size_t wOffset, 1556 | size_t hOffset, 1557 | size_t count, 1558 | enum cudaMemcpyKind kind, 1559 | cudaStream_t stream); 1560 | 1561 | static cudaMemcpyFromArrayAsync_t native_cudaMemcpyFromArrayAsync = NULL; 1562 | 1563 | extern "C" cudaError_t cudaMemcpyFromArrayAsync (void * dst, 1564 | const struct cudaArray * src, 1565 | size_t wOffset, 1566 | size_t hOffset, 1567 | size_t count, 1568 | enum cudaMemcpyKind kind, 1569 | cudaStream_t stream){ 1570 | 1571 | printf("\n>>cudaMemcpyFromArrayAsync interception\n"); 1572 | 1573 | if (native_cudaMemcpyFromArrayAsync == NULL) { 1574 | native_cudaMemcpyFromArrayAsync = (cudaMemcpyFromArrayAsync_t)dlsym(RTLD_NEXT,"cudaMemcpyFromArrayAsync"); 1575 | } 1576 | assert(native_cudaMemcpyFromArrayAsync != NULL); 1577 | return native_cudaMemcpyFromArrayAsync(dst,src,wOffset,hOffset,count,kind,stream); 1578 | } 1579 | 1580 | 1581 | /// cudaMemcpyFromSymbol /// 1582 | typedef cudaError_t (*cudaMemcpyFromSymbol_t)(void * dst, 1583 | const char * symbol, 1584 | size_t count, 1585 | size_t offset, 1586 | enum cudaMemcpyKind kind); 1587 | 1588 | static cudaMemcpyFromSymbol_t native_cudaMemcpyFromSymbol = NULL; 1589 | 1590 | extern "C" cudaError_t cudaMemcpyFromSymbol (void * dst, 1591 | const char * symbol, 1592 | size_t count, 1593 | size_t offset, 1594 | enum cudaMemcpyKind kind) { 1595 | 1596 | printf("\n>>cudaMemcpyFromSymbol interception\n"); 1597 | 1598 | if (native_cudaMemcpyFromSymbol == NULL) { 1599 | native_cudaMemcpyFromSymbol = (cudaMemcpyFromSymbol_t)dlsym(RTLD_NEXT,"cudaMemcpyFromSymbol"); 1600 | } 1601 | assert(native_cudaMemcpyFromSymbol != NULL); 1602 | return native_cudaMemcpyFromSymbol(dst,symbol,count,offset,kind); 1603 | } 1604 | 1605 | 1606 | /// cudaMemcpyFromSymbolAsync /// 1607 | typedef cudaError_t (*cudaMemcpyFromSymbolAsync_t)(void * dst, 1608 | const char * symbol, 1609 | size_t count, 1610 | size_t offset, 1611 | enum cudaMemcpyKind kind, 1612 | cudaStream_t stream); 1613 | 1614 | static cudaMemcpyFromSymbolAsync_t native_cudaMemcpyFromSymbolAsync = NULL; 1615 | 1616 | extern "C" cudaError_t cudaMemcpyFromSymbolAsync (void * dst, 1617 | const char * symbol, 1618 | size_t count, 1619 | size_t offset, 1620 | enum cudaMemcpyKind kind, 1621 | cudaStream_t stream) { 1622 | 1623 | printf("\n>>cudaMemcpyFromSymbolAsync interception\n"); 1624 | 1625 | if (native_cudaMemcpyFromSymbolAsync == NULL) { 1626 | native_cudaMemcpyFromSymbolAsync = (cudaMemcpyFromSymbolAsync_t)dlsym(RTLD_NEXT,"cudaMemcpyFromSymbolAsync"); 1627 | } 1628 | assert(native_cudaMemcpyFromSymbolAsync != NULL); 1629 | return 
native_cudaMemcpyFromSymbolAsync(dst,symbol,count,offset,kind,stream); 1630 | } 1631 | 1632 | 1633 | /// cudaMemcpyToArray /// 1634 | typedef cudaError_t (*cudaMemcpyToArray_t)(struct cudaArray * dst, 1635 | size_t wOffset, 1636 | size_t hOffset, 1637 | const void * src, 1638 | size_t count, 1639 | enum cudaMemcpyKind kind); 1640 | 1641 | static cudaMemcpyToArray_t native_cudaMemcpyToArray = NULL; 1642 | 1643 | extern "C" cudaError_t cudaMemcpyToArray (struct cudaArray * dst, 1644 | size_t wOffset, 1645 | size_t hOffset, 1646 | const void * src, 1647 | size_t count, 1648 | enum cudaMemcpyKind kind) { 1649 | 1650 | printf("\n>>cudaMemcpyToArray interception\n"); 1651 | 1652 | if (native_cudaMemcpyToArray == NULL) { 1653 | native_cudaMemcpyToArray = (cudaMemcpyToArray_t)dlsym(RTLD_NEXT,"cudaMemcpyToArray"); 1654 | } 1655 | assert(native_cudaMemcpyToArray != NULL); 1656 | return native_cudaMemcpyToArray(dst,wOffset,hOffset,src,count,kind); 1657 | } 1658 | 1659 | 1660 | /// cudaMemcpyToArrayAsync /// 1661 | typedef cudaError_t (*cudaMemcpyToArrayAsync_t)(struct cudaArray * dst, 1662 | size_t wOffset, 1663 | size_t hOffset, 1664 | const void * src, 1665 | size_t count, 1666 | enum cudaMemcpyKind kind, 1667 | cudaStream_t stream); 1668 | 1669 | static cudaMemcpyToArrayAsync_t native_cudaMemcpyToArrayAsync = NULL; 1670 | 1671 | extern "C" cudaError_t cudaMemcpyToArrayAsync (struct cudaArray * dst, 1672 | size_t wOffset, 1673 | size_t hOffset, 1674 | const void * src, 1675 | size_t count, 1676 | enum cudaMemcpyKind kind, 1677 | cudaStream_t stream) { 1678 | 1679 | printf("\n>>cudaMemcpyToArrayAsync interception\n"); 1680 | 1681 | if (native_cudaMemcpyToArrayAsync == NULL) { 1682 | native_cudaMemcpyToArrayAsync = (cudaMemcpyToArrayAsync_t)dlsym(RTLD_NEXT,"cudaMemcpyToArrayAsync"); 1683 | } 1684 | assert(native_cudaMemcpyToArrayAsync != NULL); 1685 | return native_cudaMemcpyToArrayAsync(dst,wOffset,hOffset,src,count,kind,stream); 1686 | } 1687 | 1688 | 1689 | /// cudaMemcpyToSymbol /// 1690 | typedef cudaError_t (*cudaMemcpyToSymbol_t)(const char * symbol, 1691 | const void * src, 1692 | size_t count, 1693 | size_t offset, 1694 | enum cudaMemcpyKind kind); 1695 | 1696 | static cudaMemcpyToSymbol_t native_cudaMemcpyToSymbol = NULL; 1697 | 1698 | extern "C" cudaError_t cudaMemcpyToSymbol (const char * symbol, 1699 | const void * src, 1700 | size_t count, 1701 | size_t offset, 1702 | enum cudaMemcpyKind kind) { 1703 | 1704 | printf("\n>>cudaMemcpyToSymbol interception\n"); 1705 | 1706 | if (native_cudaMemcpyToSymbol == NULL) { 1707 | native_cudaMemcpyToSymbol = (cudaMemcpyToSymbol_t)dlsym(RTLD_NEXT,"cudaMemcpyToSymbol"); 1708 | } 1709 | assert(native_cudaMemcpyToSymbol != NULL); 1710 | return native_cudaMemcpyToSymbol(symbol,src,count,offset,kind); 1711 | } 1712 | 1713 | 1714 | /// cudaMemcpyToSymbolAsync /// 1715 | typedef cudaError_t (*cudaMemcpyToSymbolAsync_t)(const char * symbol, 1716 | const void * src, 1717 | size_t count, 1718 | size_t offset, 1719 | enum cudaMemcpyKind kind, 1720 | cudaStream_t stream); 1721 | 1722 | static cudaMemcpyToSymbolAsync_t native_cudaMemcpyToSymbolAsync = NULL; 1723 | 1724 | extern "C" cudaError_t cudaMemcpyToSymbolAsync (const char * symbol, 1725 | const void * src, 1726 | size_t count, 1727 | size_t offset, 1728 | enum cudaMemcpyKind kind, 1729 | cudaStream_t stream) { 1730 | 1731 | printf("\n>>cudaMemcpyToSymbolAsync interception\n"); 1732 | 1733 | if (native_cudaMemcpyToSymbolAsync == NULL) { 1734 | native_cudaMemcpyToSymbolAsync = 
(cudaMemcpyToSymbolAsync_t)dlsym(RTLD_NEXT,"cudaMemcpyToSymbolAsync"); 1735 | } 1736 | assert(native_cudaMemcpyToSymbolAsync != NULL); 1737 | return native_cudaMemcpyToSymbolAsync(symbol,src,count,offset,kind,stream); 1738 | } 1739 | 1740 | 1741 | /// cudaMemset /// 1742 | typedef cudaError_t (*cudaMemset_t)(void * devPtr, int value, size_t count); 1743 | static cudaMemset_t native_cudaMemset = NULL; 1744 | 1745 | extern "C" cudaError_t cudaMemset(void * devPtr, int value, size_t count) { 1746 | printf("\n>>cudaMemset interception\n"); 1747 | 1748 | if (native_cudaMemset == NULL) { 1749 | native_cudaMemset = (cudaMemset_t)dlsym(RTLD_NEXT,"cudaMemset"); 1750 | } 1751 | assert(native_cudaMemset != NULL); 1752 | return native_cudaMemset(devPtr,value,count); 1753 | } 1754 | 1755 | 1756 | /// cudaMemset2D /// 1757 | typedef cudaError_t (*cudaMemset2D_t)(void * devPtr, 1758 | size_t pitch, 1759 | int value, 1760 | size_t width, 1761 | size_t height); 1762 | 1763 | static cudaMemset2D_t native_cudaMemset2D = NULL; 1764 | 1765 | extern "C" cudaError_t cudaMemset2D (void * devPtr, 1766 | size_t pitch, 1767 | int value, 1768 | size_t width, 1769 | size_t height) { 1770 | 1771 | printf("\n>>cudaMemset2D interception\n"); 1772 | 1773 | if (native_cudaMemset2D == NULL) { 1774 | native_cudaMemset2D = (cudaMemset2D_t)dlsym(RTLD_NEXT,"cudaMemset2D"); 1775 | } 1776 | assert(native_cudaMemset2D != NULL); 1777 | return native_cudaMemset2D(devPtr,pitch,value,width,height); 1778 | } 1779 | 1780 | 1781 | /// cudaMemset3D /// 1782 | typedef cudaError_t (*cudaMemset3D_t)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); 1783 | 1784 | static cudaMemset3D_t native_cudaMemset3D = NULL; 1785 | 1786 | extern "C" cudaError_t cudaMemset3D (struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { 1787 | printf("\n>>cudaMemset3D interception\n"); 1788 | 1789 | if (native_cudaMemset3D == NULL) { 1790 | native_cudaMemset3D = (cudaMemset3D_t)dlsym(RTLD_NEXT,"cudaMemset3D"); 1791 | } 1792 | assert(native_cudaMemset3D != NULL); 1793 | return native_cudaMemset3D(pitchedDevPtr,value,extent); 1794 | } 1795 | 1796 | 1797 | 1798 | //***********************************************// 1799 | // CUDA Runtime API Version Management // 1800 | //***********************************************// 1801 | /// cudaDriverGetVersion /// 1802 | typedef cudaError_t (*cudaDriverGetVersion_t)(int * driverVersion); 1803 | static cudaDriverGetVersion_t native_cudaDriverGetVersion = NULL; 1804 | 1805 | extern "C" cudaError_t cudaDriverGetVersion (int * driverVersion) { 1806 | printf("\n>>cudaDriverGetVersion interception\n"); 1807 | 1808 | if (native_cudaDriverGetVersion == NULL) { 1809 | native_cudaDriverGetVersion = (cudaDriverGetVersion_t)dlsym(RTLD_NEXT,"cudaDriverGetVersion"); 1810 | } 1811 | assert(native_cudaDriverGetVersion != NULL); 1812 | return native_cudaDriverGetVersion(driverVersion); 1813 | } 1814 | 1815 | /// cudaRuntimeGetVersion /// 1816 | typedef cudaError_t (*cudaRuntimeGetVersion_t)(int * runtimeVersion); 1817 | static cudaRuntimeGetVersion_t native_cudaRuntimeGetVersion = NULL; 1818 | 1819 | extern "C" cudaError_t cudaRuntimeGetVersion(int * runtimeVersion) { 1820 | printf("\n>>cudaRuntimeGetVersion interception\n"); 1821 | 1822 | if (native_cudaRuntimeGetVersion == NULL) { 1823 | native_cudaRuntimeGetVersion = (cudaRuntimeGetVersion_t)dlsym(RTLD_NEXT,"cudaRuntimeGetVersion"); 1824 | } 1825 | assert(native_cudaRuntimeGetVersion != NULL); 1826 | return
native_cudaRuntimeGetVersion(runtimeVersion); 1827 | } 1828 | 1829 | 1830 | 1831 | //**********************************************// 1832 | // CUDA Runtime API Thread Management // 1833 | //**********************************************// 1834 | /// cudaThreadExit /// 1835 | typedef cudaError_t (*cudaThreadExit_t)(void); 1836 | static cudaThreadExit_t native_cudaThreadExit = NULL; 1837 | 1838 | extern "C" cudaError_t cudaThreadExit(void) { 1839 | printf("\n>>cudaThreadExit interception\n"); 1840 | 1841 | if (native_cudaThreadExit == NULL) { 1842 | native_cudaThreadExit = (cudaThreadExit_t)dlsym(RTLD_NEXT,"cudaThreadExit"); 1843 | } 1844 | assert(native_cudaThreadExit != NULL); 1845 | return native_cudaThreadExit(); 1846 | } 1847 | 1848 | /// cudaThreadSynchronize /// 1849 | typedef cudaError_t (*cudaThreadSynchronize_t)(void); 1850 | static cudaThreadSynchronize_t native_cudaThreadSynchronize = NULL; 1851 | 1852 | extern "C" cudaError_t cudaThreadSynchronize(void) { 1853 | printf("\n>>cudaThreadSynchronize interception\n"); 1854 | 1855 | if (native_cudaThreadSynchronize == NULL) { 1856 | native_cudaThreadSynchronize = (cudaThreadSynchronize_t)dlsym(RTLD_NEXT,"cudaThreadSynchronize"); 1857 | } 1858 | assert(native_cudaThreadSynchronize != NULL); 1859 | return native_cudaThreadSynchronize(); 1860 | } 1861 | --------------------------------------------------------------------------------
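
A note on the pattern: every wrapper in cuda_intercept.cpp follows the same lazy-resolution scheme. On the first interception the real CUDA Runtime entry point is resolved with dlsym(RTLD_NEXT, "..."), cached in a static function pointer, checked with assert, and the arguments are then forwarded unchanged, so the traced program behaves exactly as before apart from the printed line. A minimal sketch of the same scheme applied to one further Runtime call, cudaMemsetAsync, is shown below; this block is illustrative only, is not part of the repository, and assumes the headers the file already includes (cuda_runtime.h, dlfcn.h, assert.h, cstdio).

/// cudaMemsetAsync (illustrative sketch, not present in the original file) ///
typedef cudaError_t (*cudaMemsetAsync_t)(void * devPtr, int value, size_t count, cudaStream_t stream);
static cudaMemsetAsync_t native_cudaMemsetAsync = NULL;

extern "C" cudaError_t cudaMemsetAsync (void * devPtr, int value, size_t count, cudaStream_t stream) {
    printf("\n>>cudaMemsetAsync interception\n");

    // Resolve the real symbol once and cache it; RTLD_NEXT skips this
    // preloaded library and finds the symbol in the next object in the
    // search order (normally libcudart).
    if (native_cudaMemsetAsync == NULL) {
        native_cudaMemsetAsync = (cudaMemsetAsync_t)dlsym(RTLD_NEXT,"cudaMemsetAsync");
    }
    assert(native_cudaMemsetAsync != NULL);
    return native_cudaMemsetAsync(devPtr,value,count,stream);
}

Caching the resolved pointer in a static variable avoids repeating the dlsym lookup on every call, and the assert catches a misspelled or unresolvable symbol name at the first interception instead of dereferencing a NULL pointer later.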