├── scripts ├── lib64 │ ├── libcudart.so.6.5 │ ├── libcudart.so │ ├── rCUDAcommIB.so │ └── rCUDAcommTCP.so ├── Makefile.am ├── mrcudaexec.py.template ├── plotters │ └── overhead.py └── Makefile.in ├── Makefile.am ├── AUTHORS ├── .gitignore ├── tests ├── Makefile.am ├── progs │ ├── gpuaddr.cu │ ├── multigpuaddr.cu │ ├── benchmark.nullker.cudamemcpy.sh │ ├── hello.cu │ ├── benchmark.memcpybw.sh │ ├── nullker.cu │ ├── hellomul.cu │ ├── thread_dev.cu │ ├── memcpybw.cu │ ├── cudamemcpy.cu │ ├── matmul_par.cu │ └── matmul_mul.cu ├── check_record.c └── check_comm.c ├── NEWS ├── src ├── Makefile.am ├── comm.h ├── common.h ├── intercomm_mem.h ├── intercomm.h ├── mrcuda.h ├── intercomm_mem.c ├── intercomm.c ├── comm.c ├── intercomm_interface.h ├── record.h ├── interface.c └── datatypes.h ├── notes └── func-list.txt ├── README ├── configure.ac ├── config.h.in ├── ChangeLog ├── results ├── nullker-mhelper.out └── memcpybw-mhelper.out └── INSTALL /scripts/lib64/libcudart.so.6.5: -------------------------------------------------------------------------------- 1 | libcudart.so -------------------------------------------------------------------------------- /scripts/lib64/libcudart.so: -------------------------------------------------------------------------------- 1 | ../../build/src/.libs/libcudart.so -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | ACLOCAL_AMFLAGS = -I build-aux 2 | SUBDIRS = src . tests scripts 3 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Pak Markthub the creator of this project. 
2 | -------------------------------------------------------------------------------- /scripts/lib64/rCUDAcommIB.so: -------------------------------------------------------------------------------- 1 | /home/pak/Projects/rCUDAv15.07-CUDA7.0/lib/rCUDAcommIB.so -------------------------------------------------------------------------------- /scripts/lib64/rCUDAcommTCP.so: -------------------------------------------------------------------------------- 1 | /home/pak/Projects/rCUDAv15.07-CUDA7.0/lib/rCUDAcommTCP.so -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.swp 3 | *.swo 4 | src/libcudart.so.5.0 5 | lib64/* 6 | build/* 7 | autom4te.cache 8 | *~ 9 | -------------------------------------------------------------------------------- /tests/Makefile.am: -------------------------------------------------------------------------------- 1 | TESTS = check_comm 2 | check_PROGRAMS = check_comm 3 | check_comm_SOURCES = check_comm.c $(top_builddir)/src/comm.h 4 | check_comm_CFLAGS = @CHECK_CFLAGS@ -pthread 5 | check_comm_LDADD = $(top_builddir)/src/libcomm.a @CHECK_LIBS@ 6 | check_comm_LDFLAGS = -pthread -lpthread 7 | 8 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | 2016-09-28 Pak Markthub 2 | * This is the first alpha release version. 3 | * mrCUDA supports multi-GPU remote-to-local GPU migration. 4 | * Only a subset of CUDA Runtime APIs v7.0 are supported, but at least they are enough for LAMMPS to run without problem. 
5 | 6 | -------------------------------------------------------------------------------- /tests/progs/gpuaddr.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CUDA_SAFE_CALL(x) \ 5 | { \ 6 | if ((x) != cudaSuccess) { \ 7 | fprintf(stderr, "Error!"); \ 8 | exit(EXIT_FAILURE); \ 9 | } \ 10 | } 11 | 12 | int main() 13 | { 14 | float *a; 15 | CUDA_SAFE_CALL(cudaMalloc(&a, sizeof(float))); 16 | printf("a is %p\n", a); 17 | getchar(); 18 | return 0; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /scripts/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = mrcudaexec 2 | mrcudaexec_SOURCES = mrcudaexec.py.template 3 | 4 | mrcudaexec$(EXEEXT): mrcudaexec.py.template 5 | cp $< mrcudaexec$(EXEEXT) 6 | ${SED} -i -- 's/{{ RCUDA_LIBCUDART }}/$(shell echo "${RCUDA_LIBCUDART}" | ${SED} -e 's/\//\\\//g')/g' mrcudaexec$(EXEEXT) 7 | ${SED} -i -- 's/{{ NVIDIA_LIBCUDART }}/$(shell echo "${NVIDIA_LIBCUDART}" | ${SED} -e 's/\//\\\//g')/g' mrcudaexec$(EXEEXT) 8 | ${SED} -i -- 's/{{ MRCUDA_LIBPATH }}/$(shell echo "${libdir}" | ${SED} -e 's/\//\\\//g')/g' mrcudaexec$(EXEEXT) 9 | 10 | -------------------------------------------------------------------------------- /tests/progs/multigpuaddr.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CUDA_SAFE_CALL(x) \ 5 | { \ 6 | if ((x) != cudaSuccess) { \ 7 | fprintf(stderr, "Error!"); \ 8 | exit(EXIT_FAILURE); \ 9 | } \ 10 | } 11 | 12 | int main() 13 | { 14 | float *a, *b; 15 | CUDA_SAFE_CALL(cudaSetDevice(0)); 16 | CUDA_SAFE_CALL(cudaMalloc(&a, sizeof(float))); 17 | CUDA_SAFE_CALL(cudaSetDevice(1)); 18 | CUDA_SAFE_CALL(cudaMalloc(&b, sizeof(float))); 19 | printf("a on device 0 is %p\n", a); 20 | printf("b on device 1 is %p\n", b); 21 | return 0; 22 | } 23 | 24 | 
-------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = mhelper 2 | mhelper_SOURCES = mhelper.c intercomm_mem.c 3 | mhelper_LDFLAGS = -lcuda -lcudart $(DEPS_LIBS) 4 | mhelper_CPPFLAGS = -I/usr/local/cuda/include $(DEPS_CFLAGS) 5 | 6 | lib_LTLIBRARIES = libcudart.la 7 | libcudart_la_SOURCES = comm.c interface.c mrcuda.c record.c intercomm.c intercomm_mem.c intercomm_interface.c 8 | libcudart_la_LDFLAGS = -avoid-version -shared -ldl $(DEPS_LIBS) 9 | libcudart_la_CPPFLAGS = -I/usr/local/cuda/include $(DEPS_CFLAGS) 10 | 11 | lib_LIBRARIES = libcomm.a 12 | libcomm_a_SOURCES = comm.c comm.h common.h 13 | libcomm_a_CPPFLAGS = -lpthread -pthread $(DEPS_CFLAGS) 14 | 15 | install-exec-hook: 16 | ${LN_S} ${RCUDA_RCUDACOMMIB} ${libdir} 17 | ${LN_S} ${RCUDA_RCUDACOMMTCP} ${libdir} 18 | ${LN_S} ${libdir}/libcudart.so ${libdir}/libcudart.so.7.0 19 | 20 | -------------------------------------------------------------------------------- /src/comm.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA_COMM__HEADER__ 2 | #define __MRCUDA_COMM__HEADER__ 3 | 4 | #include "common.h" 5 | 6 | /** 7 | * This function starts listening to a signal that tells the system to switch to native CUDA. 8 | * After it receives the signal, this function calls the callback and terminates the socket. 9 | * This function executes the listening process in a different thread; thus, it returns almost immediately. 10 | * Note: if the signal is not well form, this function will simply skips that signal and not calls the callback. 11 | * @param path path for creating a new UNIX socket for listening to the signal. 12 | * @param callback the function that will be called after received a signal. 13 | * @return 0 if success, the error number otherwise. 
14 | */ 15 | int mrcuda_comm_listen_for_signal(char *path, void (*callback)(void)); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /tests/progs/benchmark.nullker.cudamemcpy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #i=0 4 | #while [ $i -lt 10 ] 5 | #do 6 | # echo "nullker mrcuda $i" 7 | # taskset 1 ~/src/mrCUDA/scripts/mrCUDAExec -t IB -f ~/src/mrCUDA/scripts/sf.in -n 2 -- ./nullker 8 | # i=`expr $i + 1` 9 | #done 10 | # 11 | #sleep 1 12 | # 13 | #i=0 14 | #while [ $i -lt 10 ] 15 | #do 16 | # echo "nullker native $i" 17 | # taskset 1 ./nullker 18 | # i=`expr $i + 1` 19 | #done 20 | # 21 | #sleep 1 22 | 23 | i=8 24 | while [ $i -lt 10 ] 25 | do 26 | echo "cudamemcpy mrcuda $i" 27 | taskset 1 ~/src/mrCUDA/scripts/mrCUDAExec -t IB -f ~/src/mrCUDA/scripts/sf.in -n 2 -- ./cudamemcpy 28 | i=`expr $i + 1` 29 | done 30 | 31 | sleep 1 32 | 33 | i=0 34 | while [ $i -lt 10 ] 35 | do 36 | echo "cudamemcpy native $i" 37 | taskset 1 ./cudamemcpy 38 | i=`expr $i + 1` 39 | done 40 | 41 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA_COMMON__HEADER__ 2 | #define __MRCUDA_COMMON__HEADER__ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #if DEBUG 11 | #define DPRINTF(fmt, ...) \ 12 | do {fprintf(stderr, "FILE: " __FILE__ ", LINE: %d, " fmt, __LINE__, ##__VA_ARGS__);} while(0) 13 | #else 14 | #define DPRINTF(fmt, ...) \ 15 | do {;;} while(0) 16 | #endif 17 | 18 | #define REPORT_ERROR_AND_EXIT(...) 
\ 19 | do { \ 20 | perror("FATAL ERROR"); \ 21 | fprintf(stderr, __VA_ARGS__); \ 22 | exit(EXIT_FAILURE); \ 23 | } while(0) 24 | 25 | #define STARTTIMMER() \ 26 | struct timeval t1, t2; \ 27 | gettimeofday(&t1, NULL); 28 | 29 | #define ENDTIMMER(acctime) \ 30 | gettimeofday(&t2, NULL); \ 31 | acctime += (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0; 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /tests/progs/hello.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | const int N = 16; 4 | const int blocksize = 16; 5 | 6 | __global__ 7 | void hello(char *a, int *b) 8 | { 9 | a[threadIdx.x] += b[threadIdx.x]; 10 | } 11 | 12 | int main() 13 | { 14 | char a[N] = "Hello \0\0\0\0\0\0"; 15 | int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 16 | 17 | char *ad; 18 | int *bd; 19 | const int csize = N*sizeof(char); 20 | const int isize = N*sizeof(int); 21 | 22 | printf("%s", a); 23 | 24 | cudaMalloc( (void**)&ad, csize ); 25 | cudaMalloc( (void**)&bd, isize ); 26 | cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice ); 27 | cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice ); 28 | 29 | dim3 dimBlock( blocksize, 1 ); 30 | dim3 dimGrid( 1, 1 ); 31 | hello<<>>(ad, bd); 32 | cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost ); 33 | cudaFree( ad ); 34 | cudaFree( bd ); 35 | 36 | printf("%s\n", a); 37 | return EXIT_SUCCESS; 38 | } 39 | -------------------------------------------------------------------------------- /tests/progs/benchmark.memcpybw.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #k=0 4 | #while [ $k -lt 15 ] 5 | #do 6 | # num=`echo "2^$k" | bc` 7 | # j=0 8 | # while [ $j -lt 10 ] 9 | # do 10 | # memsize=`echo "2^(20+$j)" | bc` 11 | # i=0 12 | # while [ $i -lt 10 ] 13 | # do 14 | # echo "mrcuda $memsize $num" 15 | # taskset 1 ~/src/mrCUDA/scripts/mrCUDAExec -t IB 
-s rc015 --switch-threshold=1 -- ./memcpybw $memsize $num 16 | # i=`expr $i + 1` 17 | # sleep 1 18 | # done 19 | # j=`expr $j + 1` 20 | # done 21 | # k=`expr $k + 1` 22 | #done 23 | 24 | j=0 25 | while [ $j -lt 20 ] 26 | do 27 | memsize=`echo "2^($j)" | bc` 28 | i=0 29 | while [ $i -lt 10 ] 30 | do 31 | echo "mrcuda $memsize 1" 32 | taskset 1 ~/src/mrCUDA/scripts/mrCUDAExec -t IB -s rc015 --switch-threshold=1 -- ./memcpybw $memsize 1 33 | i=`expr $i + 1` 34 | sleep 1 35 | done 36 | j=`expr $j + 1` 37 | done 38 | -------------------------------------------------------------------------------- /tests/check_record.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../src/record.h" 4 | 5 | START_TEST(test_mrcuda_record_cudaRegisterFatBinary) 6 | { 7 | void *fatCubin = malloc(sizeof(void *) * 10); 8 | mrcuda_record_cudaRegisterFatBinary(fatCubin); 9 | ck_assert(mrcudaRecordHeadPtr != NULL); 10 | ck_assert(mrcudaRecordTailPtr != NULL); 11 | ck_assert(strcmp(mrcudaRecordTailPtr->functionName, "cudaRegisterFatBinary") == 0); 12 | ck_assert(mrcudaRecordTailPtr->replayFunction == &mrcuda_replay_cudaRegisterFatBinary); 13 | ck_assert(mrcudaRecordTailPtr->data.cudaRegisterFatBinary.fatCubin == fatCubin); 14 | free(fatCubin); 15 | } 16 | END_TEST 17 | 18 | Suite *comm_suit(void) 19 | { 20 | Suite *s; 21 | TCase *tc_core; 22 | 23 | s = suite_create("Record"); 24 | 25 | tc_core = tcase_create("Core"); 26 | 27 | tcase_add_test(tc_core, test_mrcuda_record_cudaRegisterFatBinary); 28 | suite_add_tcase(s, tc_core); 29 | 30 | return s; 31 | } 32 | 33 | int main(void) 34 | { 35 | int number_failed; 36 | Suite *s; 37 | SRunner *sr; 38 | 39 | s = comm_suit(); 40 | sr = srunner_create(s); 41 | 42 | srunner_run_all(sr, CK_NORMAL); 43 | number_failed = srunner_ntests_failed(sr); 44 | srunner_free(sr); 45 | 46 | return (number_failed == 0) ? 
EXIT_SUCCESS : EXIT_FAILURE; 47 | } 48 | -------------------------------------------------------------------------------- /tests/check_comm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "../src/comm.h" 11 | 12 | static int __getSignalFlag = 0; 13 | 14 | void process_signal(void) 15 | { 16 | __getSignalFlag = 1; 17 | } 18 | 19 | START_TEST(test_mrcuda_comm_listen_for_signal) 20 | { 21 | char *path = "/tmp/mrcuda.pipe"; 22 | int fd; 23 | int ret; 24 | 25 | unlink(path); 26 | ret = mrcuda_comm_listen_for_signal(path, &process_signal); 27 | ck_assert(ret == 0); 28 | 29 | fd = open(path, O_WRONLY); 30 | write(fd, "1", sizeof("1")); 31 | close(fd); 32 | 33 | while(!__getSignalFlag) 34 | sleep(1); 35 | } 36 | END_TEST 37 | 38 | Suite *comm_suit(void) 39 | { 40 | Suite *s; 41 | TCase *tc_core; 42 | 43 | s = suite_create("Comm"); 44 | 45 | tc_core = tcase_create("Core"); 46 | 47 | tcase_add_test(tc_core, test_mrcuda_comm_listen_for_signal); 48 | suite_add_tcase(s, tc_core); 49 | 50 | return s; 51 | } 52 | 53 | int main(void) 54 | { 55 | int number_failed; 56 | Suite *s; 57 | SRunner *sr; 58 | 59 | s = comm_suit(); 60 | sr = srunner_create(s); 61 | 62 | srunner_run_all(sr, CK_NORMAL); 63 | number_failed = srunner_ntests_failed(sr); 64 | srunner_free(sr); 65 | 66 | return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; 67 | } 68 | -------------------------------------------------------------------------------- /src/intercomm_mem.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA_INTERCOMM_MEM__HEADER__ 2 | #define __MRCUDA_INTERCOMM_MEM__HEADER__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "datatypes.h" 9 | 10 | /** 11 | * Malloc memory on shared-memory region. 12 | * @param size the size of memory to be allocated. 
13 | * @return a ptr to a MRCUDASharedMemLocalInfo_t on success. NULL otherwise. 14 | */ 15 | MRCUDASharedMemLocalInfo_t *mhelper_mem_malloc(size_t size); 16 | 17 | /** 18 | * Detach and destroy the shared region specified by the sharedMemInfo. 19 | * @param sharedMemInfo the information of the shared region. 20 | * @return 0 on success; other number otherwise. 21 | */ 22 | int mhelper_mem_free(MRCUDASharedMemLocalInfo_t *sharedMemInfo); 23 | 24 | /** 25 | * Get the memory region associated with the specified sharedMem. 26 | * @param sharedMem the minimum information of the shared region. 27 | * @return a ptr to a MRCUDASharedMemLocalInfo_t on success. NULL otherwise. 28 | */ 29 | MRCUDASharedMemLocalInfo_t *mhelper_mem_get(MRCUDASharedMem_t sharedMem); 30 | 31 | /** 32 | * Detach the shared region specified by the sharedMemInfo. 33 | * @param sharedMemInfo the information of the shared region. 34 | * @return 0 on success; another number otherwise. 35 | */ 36 | int mhelper_mem_detach(MRCUDASharedMemLocalInfo_t *sharedMemInfo); 37 | 38 | #endif /* __MRCUDA_INTERCOMM_MEM__HEADER__ */ 39 | 40 | -------------------------------------------------------------------------------- /src/intercomm.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA_INTERCOMM__HEADER__ 2 | #define __MRCUDA_INTERCOMM__HEADER__ 3 | 4 | #include "datatypes.h" 5 | #include "intercomm_mem.h" 6 | 7 | /** 8 | * Create a helper process and assign the mrcudaGPU to it. 9 | * @param mrcudaGPU the GPU information to assign to the created process. 10 | * @param helperProgPath the path to the helper application. 11 | * @param gpuID the ID of the GPU the helper application will use. 12 | * @return a ptr to the created process on success; NULL otherwise. 13 | */ 14 | MHelperProcess_t *mhelper_create(MRCUDAGPU_t *mrcudaGPU, const char *helperProgPath, int gpuID); 15 | 16 | /** 17 | * Destroy the helper process. 
18 | * @param process the process to be destroyed. 19 | * @return 0 on success; another number otherwise. 20 | */ 21 | int mhelper_destroy(MHelperProcess_t *process); 22 | 23 | /** 24 | * Ask the process to execute the command. 25 | * @param process the process that will execute the specified command. 26 | * @param command the command to be executed on the process. 27 | * @return the result of the execution. 28 | */ 29 | MHelperResult_t mhelper_call(MHelperProcess_t *process, MHelperCommand_t command); 30 | 31 | /** 32 | * Generate a unique ID for a command to be used with the specified mrcudaGPU. 33 | * @param mrcudaGPU a ptr to a MRCUDAGPU_t 34 | * @return a unique ID 35 | */ 36 | int mhelper_generate_command_id(MRCUDAGPU_t *mrcudaGPU); 37 | 38 | #endif /* __MRCUDA_INTERCOMM__HEADER__ */ 39 | 40 | -------------------------------------------------------------------------------- /tests/progs/nullker.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define CUDA_SAFE_CALL(func) \ 6 | { \ 7 | if ((func) != cudaSuccess ) { \ 8 | fprintf(stderr, "ERROR\n"); \ 9 | exit(EXIT_FAILURE); \ 10 | } \ 11 | } 12 | 13 | static inline double get_elapsed_time(struct timeval *begin, struct timeval *end) 14 | { 15 | return (end->tv_sec - begin->tv_sec) * 1000 16 | + (end->tv_usec - begin->tv_usec) / 1000.0; 17 | } 18 | 19 | __global__ 20 | void null() 21 | { 22 | } 23 | 24 | int main() 25 | { 26 | int i = 0; 27 | struct timeval t1, t2; 28 | dim3 dimBlock( 1, 1 ); 29 | dim3 dimGrid( 1, 1 ); 30 | 31 | /* Initialize phase to force migration */ 32 | CUDA_SAFE_CALL(cudaSetDevice(0)); 33 | while (i < 20) { 34 | null<<>>(); 35 | i++; 36 | } 37 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 38 | CUDA_SAFE_CALL(cudaSetDevice(1)); 39 | i = 0; 40 | while (i < 20) { 41 | null<<>>(); 42 | i++; 43 | } 44 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 45 | 46 | //CUDA_SAFE_CALL(cudaSetDevice(0)); 47 | /* mhelper benchmark 
phase */ 48 | for (int iter = 0; iter < 15; iter++) { 49 | int j = (1 << (10 + iter)) - 1; 50 | i = 0; 51 | gettimeofday(&t1, NULL); 52 | while (i < j) { 53 | null<<>>(); 54 | i++; 55 | } 56 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 57 | gettimeofday(&t2, NULL); 58 | printf("%d %f\n", j + 1, get_elapsed_time(&t1, &t2)); 59 | } 60 | 61 | return EXIT_SUCCESS; 62 | } 63 | -------------------------------------------------------------------------------- /src/mrcuda.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA__HEADER__ 2 | #define __MRCUDA__HEADER__ 3 | 4 | #include 5 | #include 6 | #include "common.h" 7 | #include "datatypes.h" 8 | 9 | extern MRCUDASym_t *mrcudaSymNvidia; 10 | extern MRCUDASym_t *mrcudaSymRCUDA; 11 | 12 | extern int mrcudaNumGPUs; 13 | extern MRCUDAGPU_t *mrcudaGPUList; 14 | 15 | extern GHashTable *mrcudaGPUThreadMap; 16 | 17 | extern MRCUDAState_e mrcudaState; 18 | 19 | /** 20 | * Get the GPU assigned to the calling thread. 21 | * @return a pointer to the assigned GPU. 22 | */ 23 | MRCUDAGPU_t *mrcuda_get_current_gpu(); 24 | 25 | /** 26 | * Set the GPU assigned to the calling thread. 27 | * @param device virtual device ID. 28 | */ 29 | void mrcuda_set_current_gpu(int device); 30 | 31 | 32 | /** 33 | * Initialize mrCUDA. 34 | * Print error and terminate the program if an error occurs. 35 | */ 36 | void mrcuda_init(); 37 | 38 | /** 39 | * Finalize mrCUDA. 40 | */ 41 | int mrcuda_fini(); 42 | 43 | /** 44 | * Switch the specified mrcudaGPU from rCUDA to native. 45 | * @param mrcudaGPU a ptr to the mrcudaGPU to be switched. 46 | * @param toGPUNumber the native GPU number to be moved to. 47 | */ 48 | void mrcuda_switch(MRCUDAGPU_t *mrcudaGPU, int toGPUNumber); 49 | 50 | /** 51 | * Create a barrier such that subsequent calls are blocked until the barrier is released. 52 | * @param mrcudaGPU a ptr to the GPU a barrier will be created on. 
53 | */ 54 | void mrcuda_function_call_lock(MRCUDAGPU_t *mrcudaGPU); 55 | 56 | /** 57 | * Release the barrier; thus, allow subsequent calls to be processed normally. 58 | * @param mrcudaGPU a ptr to the GPU the barrier will be released. 59 | */ 60 | void mrcuda_function_call_release(MRCUDAGPU_t *mrcudaGPU); 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /tests/progs/hellomul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define CUDA_SAFE_CALL(func) \ 5 | { \ 6 | if ((func) != cudaSuccess ) { \ 7 | fprintf(stderr, "ERROR\n"); \ 8 | exit(EXIT_FAILURE); \ 9 | } \ 10 | } 11 | 12 | const int N = 16; 13 | const int blocksize = 16; 14 | 15 | __global__ 16 | void hello(char *a, int *b) 17 | { 18 | a[threadIdx.x] += b[threadIdx.x]; 19 | } 20 | 21 | __global__ 22 | void null() 23 | { 24 | } 25 | 26 | 27 | int main() 28 | { 29 | int i = 0; 30 | char a[N] = "Hello \0\0\0\0\0\0"; 31 | int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 32 | 33 | char *ad; 34 | int *bd; 35 | const int csize = N*sizeof(char); 36 | const int isize = N*sizeof(int); 37 | dim3 dimBlock( blocksize, 1 ); 38 | dim3 dimGrid( 1, 1 ); 39 | 40 | printf("%s", a); 41 | 42 | CUDA_SAFE_CALL(cudaSetDevice(1)); 43 | CUDA_SAFE_CALL(cudaMalloc( (void**)&ad, csize )); 44 | CUDA_SAFE_CALL(cudaMalloc( (void**)&bd, isize )); 45 | CUDA_SAFE_CALL(cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice )); 46 | CUDA_SAFE_CALL(cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice )); 47 | 48 | CUDA_SAFE_CALL(cudaSetDevice(0)); 49 | while (i < 20) { 50 | null<<>>(); 51 | i++; 52 | } 53 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 54 | 55 | CUDA_SAFE_CALL(cudaSetDevice(1)); 56 | i = 0; 57 | while (i < 20) { 58 | null<<>>(); 59 | i++; 60 | } 61 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 62 | 63 | hello<<>>(ad, bd); 64 | CUDA_SAFE_CALL(cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost )); 65 | 66 | 
printf("%s\n", a); 67 | return EXIT_SUCCESS; 68 | } 69 | 70 | -------------------------------------------------------------------------------- /tests/progs/thread_dev.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | void *thread_main2(void *opaque) 8 | { 9 | int *devMem; 10 | int device; 11 | pid_t pid; 12 | pid = syscall(SYS_gettid); 13 | printf("Thread 2: thread %d\n", pid); 14 | cudaGetDevice(&device); 15 | printf("Thread 2: Device %d\n", device); 16 | cudaSetDevice(0); 17 | cudaMalloc(&devMem, sizeof(int) * 16); 18 | printf("Thread 2: Addr %p\n", devMem); 19 | cudaGetDevice(&device); 20 | printf("Thread 2: Device %d\n", device); 21 | return NULL; 22 | } 23 | 24 | void *thread_main1(void *opaque) 25 | { 26 | int *devMem; 27 | int device; 28 | pthread_t t; 29 | pid_t pid; 30 | pid = syscall(SYS_gettid); 31 | printf("Thread 1: thread %d\n", pid); 32 | cudaGetDevice(&device); 33 | printf("Thread 1: Device %d\n", device); 34 | cudaSetDevice(1); 35 | cudaMalloc(&devMem, sizeof(int) * 16); 36 | printf("Thread 1: Addr %p\n", devMem); 37 | cudaGetDevice(&device); 38 | printf("Thread 1: Device %d\n", device); 39 | pthread_create(&t, NULL, thread_main2, NULL); 40 | cudaGetDevice(&device); 41 | printf("Thread 1: Device %d\n", device); 42 | pthread_join(t, NULL); 43 | cudaGetDevice(&device); 44 | printf("Thread 1: Device %d\n", device); 45 | return NULL; 46 | } 47 | 48 | int main() 49 | { 50 | int *devMem; 51 | int device; 52 | pthread_t t; 53 | pid_t pid; 54 | pid = syscall(SYS_gettid); 55 | printf("Main: thread %d\n", pid); 56 | cudaSetDevice(0); 57 | cudaMalloc(&devMem, sizeof(int) * 32); 58 | printf("Main: Addr %p\n", devMem); 59 | pthread_create(&t, NULL, thread_main1, NULL); 60 | cudaGetDevice(&device); 61 | printf("Main: Device %d\n", device); 62 | pthread_join(t, NULL); 63 | cudaGetDevice(&device); 64 | printf("Main: Device %d\n", device); 65 | return 0; 66 | 
} 67 | -------------------------------------------------------------------------------- /tests/progs/memcpybw.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define CUDA_SAFE_CALL(func) \ 7 | { \ 8 | if ((func) != cudaSuccess ) { \ 9 | fprintf(stderr, "ERROR\n"); \ 10 | exit(EXIT_FAILURE); \ 11 | } \ 12 | } 13 | 14 | static inline double get_elapsed_time(struct timeval *begin, struct timeval *end) 15 | { 16 | return (end->tv_sec - begin->tv_sec) * 1000 17 | + (end->tv_usec - begin->tv_usec) / 1000.0; 18 | } 19 | 20 | __global__ 21 | void null() 22 | { 23 | } 24 | 25 | int main(int argc, char *argv[]) 26 | { 27 | int i = 0; 28 | struct timeval t1, t2; 29 | dim3 dimBlock( 1, 1 ); 30 | dim3 dimGrid( 1, 1 ); 31 | char *pDev, *pHost; 32 | char *endpoint; 33 | size_t memsize; 34 | int num; 35 | size_t secSize; 36 | 37 | if (argc < 3) { 38 | fprintf(stderr, "prog memsize num\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | 42 | memsize = strtol(argv[1], &endpoint, 10); 43 | if (*endpoint != '\0') { 44 | fprintf(stderr, "memsize has to be long int.\n"); 45 | exit(EXIT_FAILURE); 46 | } 47 | 48 | num = (int)strtol(argv[2], &endpoint, 10); 49 | if (*endpoint != '\0') { 50 | fprintf(stderr, "num has to be int.\n"); 51 | exit(EXIT_FAILURE); 52 | } 53 | 54 | secSize = memsize / num; 55 | 56 | /* Initialize phase to force migration */ 57 | if ((pHost = (char *)malloc(sizeof(char) * memsize)) == NULL) { 58 | perror("MALLOC ERROR:"); 59 | exit(EXIT_FAILURE); 60 | } 61 | 62 | CUDA_SAFE_CALL(cudaMalloc(&pDev, sizeof(char) * secSize)); 63 | CUDA_SAFE_CALL(cudaMemcpy(pDev, pHost, sizeof(char) * secSize, cudaMemcpyHostToDevice)); 64 | CUDA_SAFE_CALL(cudaFree(pDev)); 65 | gettimeofday(&t1, NULL); 66 | for (i = 0; i < num; i++) { 67 | CUDA_SAFE_CALL(cudaMalloc(&pDev, sizeof(char) * secSize)); 68 | CUDA_SAFE_CALL(cudaMemcpy(pDev, pHost, sizeof(char) * secSize, cudaMemcpyHostToDevice)); 69 | } 70 | 
gettimeofday(&t2, NULL); 71 | printf("Elapsed Time: %f\n", get_elapsed_time(&t1, &t2)); 72 | while (i < 2000) { 73 | null<<>>(); 74 | i++; 75 | } 76 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 77 | return EXIT_SUCCESS; 78 | } 79 | 80 | -------------------------------------------------------------------------------- /tests/progs/cudamemcpy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define MEMSIZE (1 << 30) 6 | 7 | #define CUDA_SAFE_CALL(func) \ 8 | { \ 9 | if ((func) != cudaSuccess ) { \ 10 | fprintf(stderr, "ERROR\n"); \ 11 | exit(EXIT_FAILURE); \ 12 | } \ 13 | } 14 | 15 | static inline double get_elapsed_time(struct timeval *begin, struct timeval *end) 16 | { 17 | return (end->tv_sec - begin->tv_sec) * 1000 18 | + (end->tv_usec - begin->tv_usec) / 1000.0; 19 | } 20 | 21 | __global__ 22 | void null() 23 | { 24 | } 25 | 26 | int main() 27 | { 28 | int i = 0; 29 | struct timeval t1, t2; 30 | dim3 dimBlock( 1, 1 ); 31 | dim3 dimGrid( 1, 1 ); 32 | char *pDev0, *pDev1, *pHost; 33 | 34 | /* Initialize phase to force migration */ 35 | if ((pHost = (char *)malloc(sizeof(char) * MEMSIZE)) == NULL) { 36 | perror("MALLOC ERROR:"); 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | CUDA_SAFE_CALL(cudaSetDevice(0)); 41 | CUDA_SAFE_CALL(cudaMalloc(&pDev0, sizeof(char) * MEMSIZE)); 42 | CUDA_SAFE_CALL(cudaMemcpy(pDev0, pHost, sizeof(char) * MEMSIZE, cudaMemcpyHostToDevice)); 43 | while (i < 2000) { 44 | null<<>>(); 45 | i++; 46 | } 47 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 48 | CUDA_SAFE_CALL(cudaMemcpy(pDev0, pHost, sizeof(char) * MEMSIZE, cudaMemcpyHostToDevice)); 49 | 50 | CUDA_SAFE_CALL(cudaSetDevice(1)); 51 | CUDA_SAFE_CALL(cudaMalloc(&pDev1, sizeof(char) * MEMSIZE)); 52 | CUDA_SAFE_CALL(cudaMemcpy(pDev1, pHost, sizeof(char) * MEMSIZE, cudaMemcpyHostToDevice)); 53 | i = 0; 54 | while (i < 2000) { 55 | null<<>>(); 56 | i++; 57 | } 58 | CUDA_SAFE_CALL(cudaDeviceSynchronize()); 59 | 
CUDA_SAFE_CALL(cudaMemcpy(pDev1, pHost, sizeof(char) * MEMSIZE, cudaMemcpyHostToDevice)); 60 | 61 | //CUDA_SAFE_CALL(cudaSetDevice(0)); 62 | /* mhelper benchmark phase */ 63 | for (int iter = 0; iter < 20; iter++) { 64 | int size = sizeof(char) * (1 << (10 + iter)); 65 | gettimeofday(&t1, NULL); 66 | for (int j = 0; j < 1000; j++) 67 | CUDA_SAFE_CALL(cudaMemcpy(pDev1, pHost, size, cudaMemcpyHostToDevice)); 68 | gettimeofday(&t2, NULL); 69 | printf("%d %f\n", size, get_elapsed_time(&t1, &t2)); 70 | } 71 | 72 | return EXIT_SUCCESS; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /notes/func-list.txt: -------------------------------------------------------------------------------- 1 | extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void); 2 | extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func); 3 | extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); 4 | extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); 5 | extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags); 6 | extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count); 7 | extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); 8 | extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset); 9 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); 10 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr); 11 | extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0)); 12 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); 13 | extern 
__host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)); 14 | extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f); 15 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); 16 | extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream); 17 | extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total); 18 | extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device); 19 | extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags ); 20 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device); 21 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); 22 | 23 | /* Maybe we don't need these, hopefully */ 24 | CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); 25 | CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); 26 | CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); 27 | CUresult CUDAAPI cuDeviceGetCount(int *count); 28 | CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); 29 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | mrCUDA: Migratable rCUDA 2 | 3 | What is it? 4 | =========== 5 | 6 | mrCUDA is an extension of rCUDA (http://rcuda.net), which aims at enabling 7 | remote-to-local GPU migration. We develop this project in order to solve the 8 | performance problems caused by remote GPU communication: overhead from rCUDA, 9 | and network congestion. 
By using mrCUDA, a user can migrate execution on a 10 | remote GPU to a local GPU when one becomes available. mrCUDA works seamlessly 11 | with rCUDA and programs that use CUDA Runtime API. There is no need to recompile 12 | the program in order to use mrCUDA. More information regarding mrCUDA can be 13 | found in: 14 | 15 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "Serving More GPU Jobs, with 16 | Low Penalty, using Remote GPU Execution and Migration." IEEE Cluster 2016. 17 | 18 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "Finishing GPU Jobs 19 | running on a Multi-GPU Batch-Queue Node-Sharing System Earlier with Remote 20 | GPU Execution and Migration." ISC2016. 21 | 22 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "Reducing Remote GPU 23 | Execution's Overhead with mrCUDA." GTC2016. 24 | 25 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "Serving More GPU Jobs 26 | in Multi-GPU Batch-Queue Systems using Remote GPU Execution and Migration 27 | (Unrefereed Workshop manuscript)." IPSJ SIG Notes 2016-HPC-153. 28 | 29 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "mrCUDA: Low-Overhead 30 | Middleware for Transparently Migrating CUDA Execution from Remote to Local 31 | GPUs." SC15. 32 | 33 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "mrCUDA: Low-Overhead 34 | Middleware for Transparently Migrating CUDA Execution from Remote to Local 35 | GPUs." GTC Japan 2015. 36 | 37 | * Pak Markthub, Akihiro Nomura, and Satoshi Matsuoka. "mrCUDA: A middleware 38 | for migrating rCUDA virtual GPUs to native GPUs (Unrefereed Workshop 39 | manuscript)." IPSJ SIG Notes 2015-HPC-150 (SWoPP2015). 
40 | 41 | Installation 42 | ============ 43 | 44 | Prerequisites 45 | ------------- 46 | 47 | - check 48 | - CUDA7.0 49 | - glibc-2.0 50 | - Python2.7 51 | - rCUDAv15.07 52 | 53 | How to install 54 | -------------- 55 | 56 | mkdir build 57 | cd build 58 | ../configure --prefix=~/mrCUDA-bin --with-rcuda= 59 | make 60 | make install 61 | 62 | Note: We recommend you to specify --prefix because mrCUDA creates its own 63 | libcudart.so that might conflict with the installed libcudart.so from NVIDIA on 64 | your system. 65 | 66 | How to use? 67 | =========== 68 | 69 | 1. Make sure your program works with rCUDAv15.07. 70 | 2. Start rCUDAd on a node. 71 | 3. Go to mrCUDA's installed directory. 72 | 4. cd bin 73 | 5. ./mrcudaexec -s -t \ 74 | --switch-threshold= -- 75 | 76 | Notes: 77 | 1. By specifying --switch-threshold, mrCUDA will automatically migrate execution 78 | when it encounters 'cudaLaunch' more than the specified number. This is helpful 79 | for testing mrCUDA's migration functionality. 80 | 81 | 2. In future release, mrCUDA will create a UNIX socket that you can send a 82 | migration command in to start GPU migration. 83 | 84 | Acknowledgement 85 | =============== 86 | 87 | This research was supported by JST, CREST (Research Area: Advanced Core 88 | Technologies for Big Data Integration). 89 | 90 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.69]) 2 | 3 | AC_INIT([mrcuda], [7.0.0], [markthub.p.aa@m.titech.ac.jp]) 4 | AC_CONFIG_AUX_DIR([build-aux]) 5 | AM_INIT_AUTOMAKE 6 | AC_CONFIG_MACRO_DIR([build-aux]) 7 | AC_CONFIG_HEADERS([config.h]) 8 | AC_CONFIG_FILES([Makefile src/Makefile tests/Makefile scripts/Makefile]) 9 | AC_ENABLE_SHARED(yes) 10 | AC_ENABLE_STATIC(no) 11 | 12 | AC_ARG_WITH([rcuda], [ 13 | AS_HELP_STRING([--with-rcuda=[RCUDA_PATH]], 14 | [optionally specify the installation path of rCUDA.] 
15 | ) 16 | ]) 17 | 18 | AC_ARG_WITH([nvcc], [ 19 | AS_HELP_STRING([--with-nvcc=[nvcc]], 20 | [optionally specify nvcc you want to use.] 21 | )], [ 22 | AC_SUBST(NVCC, "${with_nvcc}") 23 | ] 24 | ) 25 | 26 | AC_ARG_WITH([libcudart], [ 27 | AS_HELP_STRING([--with-libcudart=[libcudart]], 28 | [optionally specify CUDA libcudart you want to use.] 29 | )], [ 30 | AC_SUBST(NVIDIA_LIBCUDART, "${with_libcudart}") 31 | ] 32 | ) 33 | 34 | AC_ARG_ENABLE([debug], 35 | AS_HELP_STRING([--enable-debug], [Enable debug output])) 36 | AS_IF([test "x$enable_debug" = "xyes"], [ 37 | AC_DEFINE(DEBUG, 1, [Define if --enable-debug option is found.]) 38 | ]) 39 | 40 | # Checks for programs 41 | AM_PROG_AR 42 | AC_PROG_CC 43 | AM_PROG_CC_C_O 44 | AC_PROG_INSTALL 45 | AC_PROG_LN_S 46 | AM_PATH_PYTHON([2.7]) 47 | 48 | if test x"${NVCC}" == x""; then 49 | AC_PATH_PROG(NVCC, nvcc, no) 50 | if test x"${NVCC}" == x"no"; then 51 | AC_MSG_ERROR([Cannot find nvcc.]) 52 | fi 53 | fi 54 | 55 | if test x"${NVIDIA_LIBCUDART}" == x""; then 56 | AC_PATH_PROG(NVIDIA_LIBCUDART, [libcudart.so], no, [$LD_LIBRARY_PATH$PATH_SEPARATOR$LIBRARY_PATH]) 57 | if test x"${NVIDIA_LIBCUDART}" == x"no"; then 58 | AC_MSG_ERROR([Cannot find CUDA's libcudart.so.]) 59 | fi 60 | fi 61 | 62 | AC_PATH_PROG(RCUDA_RCUDACOMMIB, rCUDAcommIB.so, no, [$with_rcuda/lib$PATH_SEPARATOR$PATH]) 63 | if test x"${RCUDA_RCUDACOMMIB}" == x"no"; then 64 | AC_MSG_ERROR([Cannot find rCUDA. Make sure rCUDA is installed on your system.]) 65 | fi 66 | 67 | AC_PATH_PROG(RCUDA_RCUDACOMMTCP, rCUDAcommTCP.so, no, [$with_rcuda/lib$PATH_SEPARATOR$PATH]) 68 | if test x"${RCUDA_RCUDACOMMTCP}" == x"no"; then 69 | AC_MSG_ERROR([Cannot find rCUDA. Make sure rCUDA is installed on your system.]) 70 | fi 71 | 72 | AC_PATH_PROG(RCUDA_LIBCUDART, libcudart.so, no, [$with_rcuda/lib$PATH_SEPARATOR$PATH]) 73 | if test x"${RCUDA_LIBCUDART}" == x"no"; then 74 | AC_MSG_ERROR([Cannot find rCUDA. 
Make sure rCUDA is installed on your system.]) 75 | fi 76 | 77 | # Checks for modules 78 | PKG_CHECK_MODULES([CHECK], [check >= 0.9.4]) 79 | PKG_CHECK_MODULES([DEPS], [glib-2.0 >= 2.24.1]) 80 | LT_INIT 81 | 82 | # Checks for libraries. 83 | # FIXME: Replace `main' with a function in `-lcuda': 84 | AC_CHECK_LIB([cuda], [cuCtxCreate]) 85 | # FIXME: Replace `main' with a function in `-lcudart': 86 | AC_CHECK_LIB([cudart], [cudaMemcpy]) 87 | # FIXME: Replace `main' with a function in `-ldl': 88 | AC_CHECK_LIB([dl], [dlsym]) 89 | # FIXME: Replace `main' with a function in `-lpthread': 90 | AC_CHECK_LIB([pthread], [pthread_mutex_init]) 91 | 92 | # Checks for header files. 93 | AC_CHECK_HEADERS([fcntl.h stdlib.h string.h sys/time.h unistd.h]) 94 | 95 | # Checks for typedefs, structures, and compiler characteristics. 96 | AC_C_INLINE 97 | AC_TYPE_PID_T 98 | AC_TYPE_SIZE_T 99 | AC_TYPE_SSIZE_T 100 | 101 | # Checks for library functions. 102 | AC_FUNC_FORK 103 | AC_FUNC_MALLOC 104 | AC_CHECK_FUNCS([dup2 gettimeofday mempcpy mkfifo strtol]) 105 | 106 | AC_OUTPUT 107 | -------------------------------------------------------------------------------- /src/intercomm_mem.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | #include "datatypes.h" 9 | #include "intercomm_mem.h" 10 | 11 | #define DEV_RANDOM "/dev/urandom" 12 | 13 | static int initRand = 0; 14 | 15 | /** 16 | * Generate a key to be associated with a shared memory region. 17 | * @return a key. 
18 | */ 19 | static key_t generate_key() 20 | { 21 | FILE *f; 22 | unsigned int seed; 23 | size_t remainingSize = sizeof(unsigned int); 24 | size_t readSize = 0; 25 | 26 | if (!initRand) { 27 | f = fopen(DEV_RANDOM, "r"); 28 | while (remainingSize > 0) { 29 | if ((readSize = fread(&seed, remainingSize, 1, f)) == 0) 30 | REPORT_ERROR_AND_EXIT("Cannot read from " DEV_RANDOM ".\n"); 31 | remainingSize -= readSize; 32 | } 33 | fclose(f); 34 | srand(seed); 35 | initRand = !initRand; 36 | } 37 | return (key_t)rand(); 38 | } 39 | 40 | /** 41 | * Malloc memory on shared-memory region. 42 | * @param size the size of memory to be allocated. 43 | * @return a ptr to a MRCUDASharedMemLocalInfo_t on success. NULL otherwise. 44 | */ 45 | MRCUDASharedMemLocalInfo_t *mhelper_mem_malloc(size_t size) 46 | { 47 | MRCUDASharedMemLocalInfo_t *sharedMemInfo = calloc(1, sizeof(MRCUDASharedMemLocalInfo_t)); 48 | if (sharedMemInfo == NULL) 49 | goto __mhelper_mem_malloc_err_0; 50 | sharedMemInfo->sharedMem.key = generate_key(); 51 | if ((sharedMemInfo->shmid = shmget(sharedMemInfo->sharedMem.key, size, IPC_CREAT | IPC_EXCL | 0600)) <= 0) 52 | goto __mhelper_mem_malloc_err_1; 53 | if ((sharedMemInfo->startAddr = shmat(sharedMemInfo->shmid, NULL, 0)) == NULL) 54 | goto __mhelper_mem_malloc_err_2; 55 | sharedMemInfo->sharedMem.size = size; 56 | return sharedMemInfo; 57 | 58 | __mhelper_mem_malloc_err_2: 59 | shmctl(sharedMemInfo->shmid, IPC_RMID, NULL); 60 | __mhelper_mem_malloc_err_1: 61 | free(sharedMemInfo); 62 | __mhelper_mem_malloc_err_0: 63 | return NULL; 64 | } 65 | 66 | /** 67 | * Detach and destroy the shared region specified by the sharedMemInfo. 68 | * @param sharedMemInfo the information of the shared region. 69 | * @return 0 on success; other number otherwise. 
70 | */ 71 | int mhelper_mem_free(MRCUDASharedMemLocalInfo_t *sharedMemInfo) 72 | { 73 | int ret = shmctl(sharedMemInfo->shmid, IPC_RMID, NULL); 74 | if (ret == 0) 75 | free(sharedMemInfo); 76 | return ret; 77 | } 78 | 79 | /** 80 | * Get the memory region associated with the specified sharedMem. 81 | * @param sharedMem the minimum information of the shared region. 82 | * @return a ptr to a MRCUDASharedMemLocalInfo_t on success. NULL otherwise. 83 | */ 84 | MRCUDASharedMemLocalInfo_t *mhelper_mem_get(MRCUDASharedMem_t sharedMem) 85 | { 86 | MRCUDASharedMemLocalInfo_t *sharedMemInfo = calloc(1, sizeof(MRCUDASharedMemLocalInfo_t)); 87 | if (sharedMemInfo == NULL) 88 | goto __mhelper_mem_get_err_0; 89 | if ((sharedMemInfo->shmid = shmget(sharedMem.key, sharedMem.size, 0666)) <= 0) 90 | goto __mhelper_mem_get_err_1; 91 | if ((sharedMemInfo->startAddr = shmat(sharedMemInfo->shmid, NULL, 0)) == NULL) 92 | goto __mhelper_mem_get_err_1; 93 | sharedMemInfo->sharedMem = sharedMem; 94 | return sharedMemInfo; 95 | 96 | __mhelper_mem_get_err_1: 97 | free(sharedMemInfo); 98 | __mhelper_mem_get_err_0: 99 | return NULL; 100 | } 101 | 102 | /** 103 | * Detach the shared region specified by the sharedMemInfo. 104 | * @param sharedMemInfo the information of the shared region. 105 | * @return 0 on success; another number otherwise. 106 | */ 107 | int mhelper_mem_detach(MRCUDASharedMemLocalInfo_t *sharedMemInfo) 108 | { 109 | return shmdt(sharedMemInfo->startAddr); 110 | } 111 | 112 | -------------------------------------------------------------------------------- /config.h.in: -------------------------------------------------------------------------------- 1 | /* config.h.in. Generated from configure.ac by autoheader. */ 2 | 3 | /* Define if --enable-debug option is found. */ 4 | #undef DEBUG 5 | 6 | /* Define to 1 if you have the header file. */ 7 | #undef HAVE_DLFCN_H 8 | 9 | /* Define to 1 if you have the `dup2' function. 
*/ 10 | #undef HAVE_DUP2 11 | 12 | /* Define to 1 if you have the header file. */ 13 | #undef HAVE_FCNTL_H 14 | 15 | /* Define to 1 if you have the `fork' function. */ 16 | #undef HAVE_FORK 17 | 18 | /* Define to 1 if you have the `gettimeofday' function. */ 19 | #undef HAVE_GETTIMEOFDAY 20 | 21 | /* Define to 1 if you have the header file. */ 22 | #undef HAVE_INTTYPES_H 23 | 24 | /* Define to 1 if you have the `cuda' library (-lcuda). */ 25 | #undef HAVE_LIBCUDA 26 | 27 | /* Define to 1 if you have the `cudart' library (-lcudart). */ 28 | #undef HAVE_LIBCUDART 29 | 30 | /* Define to 1 if you have the `dl' library (-ldl). */ 31 | #undef HAVE_LIBDL 32 | 33 | /* Define to 1 if you have the `pthread' library (-lpthread). */ 34 | #undef HAVE_LIBPTHREAD 35 | 36 | /* Define to 1 if your system has a GNU libc compatible `malloc' function, and 37 | to 0 otherwise. */ 38 | #undef HAVE_MALLOC 39 | 40 | /* Define to 1 if you have the header file. */ 41 | #undef HAVE_MEMORY_H 42 | 43 | /* Define to 1 if you have the `mempcpy' function. */ 44 | #undef HAVE_MEMPCPY 45 | 46 | /* Define to 1 if you have the `mkfifo' function. */ 47 | #undef HAVE_MKFIFO 48 | 49 | /* Define to 1 if you have the header file. */ 50 | #undef HAVE_STDINT_H 51 | 52 | /* Define to 1 if you have the header file. */ 53 | #undef HAVE_STDLIB_H 54 | 55 | /* Define to 1 if you have the header file. */ 56 | #undef HAVE_STRINGS_H 57 | 58 | /* Define to 1 if you have the header file. */ 59 | #undef HAVE_STRING_H 60 | 61 | /* Define to 1 if you have the `strtol' function. */ 62 | #undef HAVE_STRTOL 63 | 64 | /* Define to 1 if you have the header file. */ 65 | #undef HAVE_SYS_STAT_H 66 | 67 | /* Define to 1 if you have the header file. */ 68 | #undef HAVE_SYS_TIME_H 69 | 70 | /* Define to 1 if you have the header file. */ 71 | #undef HAVE_SYS_TYPES_H 72 | 73 | /* Define to 1 if you have the header file. */ 74 | #undef HAVE_UNISTD_H 75 | 76 | /* Define to 1 if you have the `vfork' function. 
*/ 77 | #undef HAVE_VFORK 78 | 79 | /* Define to 1 if you have the header file. */ 80 | #undef HAVE_VFORK_H 81 | 82 | /* Define to 1 if `fork' works. */ 83 | #undef HAVE_WORKING_FORK 84 | 85 | /* Define to 1 if `vfork' works. */ 86 | #undef HAVE_WORKING_VFORK 87 | 88 | /* Define to the sub-directory in which libtool stores uninstalled libraries. 89 | */ 90 | #undef LT_OBJDIR 91 | 92 | /* Define to 1 if your C compiler doesn't accept -c and -o together. */ 93 | #undef NO_MINUS_C_MINUS_O 94 | 95 | /* Name of package */ 96 | #undef PACKAGE 97 | 98 | /* Define to the address where bug reports for this package should be sent. */ 99 | #undef PACKAGE_BUGREPORT 100 | 101 | /* Define to the full name of this package. */ 102 | #undef PACKAGE_NAME 103 | 104 | /* Define to the full name and version of this package. */ 105 | #undef PACKAGE_STRING 106 | 107 | /* Define to the one symbol short name of this package. */ 108 | #undef PACKAGE_TARNAME 109 | 110 | /* Define to the home page for this package. */ 111 | #undef PACKAGE_URL 112 | 113 | /* Define to the version of this package. */ 114 | #undef PACKAGE_VERSION 115 | 116 | /* Define to 1 if you have the ANSI C header files. */ 117 | #undef STDC_HEADERS 118 | 119 | /* Version number of package */ 120 | #undef VERSION 121 | 122 | /* Define to `__inline__' or `__inline' if that's what the C compiler 123 | calls it, or to nothing if 'inline' is not supported under any name. */ 124 | #ifndef __cplusplus 125 | #undef inline 126 | #endif 127 | 128 | /* Define to rpl_malloc if the replacement function should be used. */ 129 | #undef malloc 130 | 131 | /* Define to `int' if does not define. */ 132 | #undef pid_t 133 | 134 | /* Define to `unsigned int' if does not define. */ 135 | #undef size_t 136 | 137 | /* Define to `int' if does not define. */ 138 | #undef ssize_t 139 | 140 | /* Define as `fork' if `vfork' does not work. 
*/ 141 | #undef vfork 142 | -------------------------------------------------------------------------------- /src/intercomm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "intercomm.h" 8 | #include "datatypes.h" 9 | 10 | /** 11 | * Create a helper process and assign the mrcudaGPU to it. 12 | * @param mrcudaGPU the GPU information to assign to the created process. 13 | * @param helperProgPath the path to the helper application. 14 | * @param gpuID the ID of the GPU the helper application will use. 15 | * @return a ptr to the created process on success; NULL otherwise. 16 | */ 17 | MHelperProcess_t *mhelper_create(MRCUDAGPU_t *mrcudaGPU, const char *helperProgPath, int gpuID) 18 | { 19 | int rPipePair[2], wPipePair[2]; 20 | MHelperProcess_t *mhelperProcess; 21 | pid_t pid; 22 | char gpuIDStr[15]; 23 | 24 | if (pipe(rPipePair) != 0) 25 | goto __mhelper_create_err_0; 26 | if (pipe(wPipePair) != 0) 27 | goto __mhelper_create_err_1; 28 | if ((mhelperProcess = malloc(sizeof(MHelperProcess_t))) == NULL) 29 | goto __mhelper_create_err_2; 30 | pid = fork(); 31 | if (pid == 0) { // child process 32 | close(wPipePair[1]); 33 | close(rPipePair[0]); 34 | dup2(wPipePair[0], fileno(stdin)); 35 | dup2(rPipePair[1], fileno(stdout)); 36 | sprintf(gpuIDStr, "%d", gpuID); 37 | execl(helperProgPath, helperProgPath, gpuIDStr, "\0"); 38 | perror("Helper Program Exec"); 39 | _exit(EXIT_FAILURE); 40 | } 41 | else if (pid < 0) // error; cannot fork 42 | goto __mhelper_create_err_3; 43 | else { // parent process 44 | close(wPipePair[0]); 45 | close(rPipePair[1]); 46 | mhelperProcess->readPipe = rPipePair[0]; 47 | mhelperProcess->writePipe = wPipePair[1]; 48 | mhelperProcess->pid = pid; 49 | if (mhelper_int_init(&(mhelperProcess->handle), mhelperProcess) != 0) 50 | goto __mhelper_create_err_3; 51 | mrcudaGPU->mhelperProcess = mhelperProcess; 52 | return mhelperProcess; 53 | } 
54 | 55 | __mhelper_create_err_3: 56 | free(mhelperProcess); 57 | __mhelper_create_err_2: 58 | close(wPipePair[0]); 59 | close(wPipePair[1]); 60 | __mhelper_create_err_1: 61 | close(rPipePair[0]); 62 | close(rPipePair[1]); 63 | __mhelper_create_err_0: 64 | return NULL; 65 | } 66 | 67 | /** 68 | * Destroy the helper process. 69 | * @param process the process to be destroyed. 70 | * @return 0 on success; another number otherwise. 71 | */ 72 | int mhelper_destroy(MHelperProcess_t *process) 73 | { 74 | int ret = kill(process->pid, SIGQUIT); 75 | if (ret == 0) 76 | free(process); 77 | return ret; 78 | } 79 | 80 | /** 81 | * Ask the process to execute the command. 82 | * @param process the process that will execute the specified command. 83 | * @param command the command to be executed on the process. 84 | * @return the result of the execution. 85 | */ 86 | MHelperResult_t mhelper_call(MHelperProcess_t *process, MHelperCommand_t command) 87 | { 88 | ssize_t n; 89 | size_t remainingSize = sizeof(MHelperCommand_t); 90 | char *buf = (char *)&command; 91 | MHelperResult_t result; 92 | 93 | while (remainingSize > 0) { 94 | n = write(process->writePipe, buf, remainingSize); 95 | if (n < 0) 96 | goto __mhelper_call_err_0; 97 | remainingSize -= n; 98 | buf += n; 99 | } 100 | 101 | remainingSize = sizeof(MHelperResult_t); 102 | buf = (char *)&result; 103 | while (remainingSize > 0) { 104 | n = read(process->readPipe, buf, remainingSize); 105 | if (n < 0) 106 | goto __mhelper_call_err_0; 107 | remainingSize -= n; 108 | buf += n; 109 | } 110 | if (result.id != command.id || result.type != command.type) 111 | goto __mhelper_call_err_0; 112 | return result; 113 | 114 | __mhelper_call_err_0: 115 | result.id = command.id; 116 | result.type = command.type; 117 | result.internalError = -1; 118 | result.cudaError = cudaSuccess; 119 | return result; 120 | } 121 | 122 | /** 123 | * Generate a unique ID for a command to be used with the specified mrcudaGPU. 
124 | * @param mrcudaGPU a ptr to a MRCUDAGPU_t 125 | * @return a unique ID 126 | */ 127 | int mhelper_generate_command_id(MRCUDAGPU_t *mrcudaGPU) 128 | { 129 | return (rand() << 4) | mrcudaGPU->virtualNumber; 130 | } 131 | 132 | -------------------------------------------------------------------------------- /src/comm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "comm.h" 11 | 12 | #define LISTEN_BACKLOG 1 13 | 14 | typedef struct __MRCUDAComm 15 | { 16 | pthread_t listeningThread; 17 | 18 | char *path; 19 | void (*callback)(void); 20 | 21 | int fd; 22 | } __MRCUDAComm; 23 | 24 | static __MRCUDAComm __mrcudaCommObj; 25 | 26 | 27 | /** 28 | * Terminate the socket. 29 | */ 30 | static void __mrcuda_comm_fini() 31 | { 32 | DPRINTF("ENTER __mrcuda_comm_fini.\n"); 33 | close(__mrcudaCommObj.fd); 34 | unlink(__mrcudaCommObj.path); 35 | free(__mrcudaCommObj.path); 36 | DPRINTF("EXIT __mrcuda_comm_fini.\n"); 37 | } 38 | 39 | /** 40 | * This function creates a FIFO file specified by the path. 41 | * If it fails to do so for any reasons, it returns the error number; otherwise, return 0 42 | * @param path path of the FIFO file to be created. 43 | * @return 0 if success, otherwise the error number. 
44 | */ 45 | static int __mrcuda_comm_init(char *path) 46 | { 47 | DPRINTF("ENTER __mrcuda_comm_init.\n"); 48 | 49 | DPRINTF("__mrcuda_comm_init allocates __mrcudaCommObj.path\n"); 50 | if((__mrcudaCommObj.path = (char *)malloc(strlen(path) + 1)) == NULL) 51 | goto __mrcuda_comm_init_err_1; 52 | 53 | DPRINTF("__mrcuda_comm_init strcpy path.\n"); 54 | strcpy(__mrcudaCommObj.path, path); 55 | 56 | DPRINTF("__mrcuda_comm_init mkfifo.\n"); 57 | if(mkfifo(__mrcudaCommObj.path, 0666) == -1) 58 | goto __mrcuda_comm_init_err_2; 59 | 60 | DPRINTF("EXIT SUCCESS __mrcuda_comm_init.\n"); 61 | return 0; 62 | 63 | __mrcuda_comm_init_err_2: 64 | free(__mrcudaCommObj.path); 65 | __mrcuda_comm_init_err_1: 66 | DPRINTF("EXIT FAILURE __mrcuda_comm_init.\n"); 67 | return -1; 68 | } 69 | 70 | 71 | /** 72 | * This is the main loop for repeatedly listening to a signal. 73 | * If it receives a correct signal, it terminates the socket and calls the callback. 74 | * This function should be called from a different thread since it blocks the execution. 
75 | */ 76 | static void *__mrcuda_comm_listening_main_loop(void *arg) 77 | { 78 | DPRINTF("ENTER __mrcuda_comm_listening_main_loop.\n"); 79 | 80 | #define BUF_SIZE 1 81 | 82 | char buf[BUF_SIZE]; 83 | ssize_t readSize; 84 | 85 | DPRINTF("__mrcuda_comm_init open file.\n"); 86 | if((__mrcudaCommObj.fd = open(__mrcudaCommObj.path, O_RDONLY)) == -1) 87 | goto __mrcuda_comm_listening_main_loop_err_1; 88 | 89 | while(1) 90 | { 91 | DPRINTF("__mrcuda_comm_listening_main_loop is waiting.\n"); 92 | if((readSize = read(__mrcudaCommObj.fd, buf, BUF_SIZE)) == -1) 93 | goto __mrcuda_comm_listening_main_loop_err_1; 94 | DPRINTF("__mrcuda_comm_listening_main_loop received a signal.\n"); 95 | if(strncmp(buf, "1", BUF_SIZE) == 0) 96 | { 97 | DPRINTF("__mrcuda_comm_listening_main_loop calls the callback.\n"); 98 | __mrcudaCommObj.callback(); 99 | break; 100 | } 101 | } 102 | 103 | __mrcuda_comm_listening_main_loop_err_1: 104 | __mrcuda_comm_fini(); 105 | 106 | DPRINTF("EXIT __mrcuda_comm_listening_main_loop.\n"); 107 | 108 | #undef BUF_SIZE 109 | } 110 | 111 | /** 112 | * This function starts listening to a signal that tells the system to switch to native CUDA. 113 | * After it receives the signal, this function calls the callback and terminates the socket. 114 | * This function executes the listening process in a different thread; thus, it returns almost immediately. 115 | * Note: if the signal is not well form, this function will simply skips that signal and not calls the callback. 116 | * @param path path for creating a new UNIX socket for listening to the signal. 117 | * @param callback the function that will be called after received a signal. 118 | * @return 0 if success, the error number otherwise. 
119 | */ 120 | int mrcuda_comm_listen_for_signal(char *path, void (*callback)(void)) 121 | { 122 | DPRINTF("ENTER mrcuda_comm_listen_for_signal.\n"); 123 | int ret = 0; 124 | if((ret = __mrcuda_comm_init(path)) != 0) 125 | return ret; 126 | __mrcudaCommObj.callback = callback; 127 | 128 | DPRINTF("mrcuda_comm_listen_for_signal creates a thread.\n"); 129 | if((ret = pthread_create(&(__mrcudaCommObj.listeningThread), NULL, &__mrcuda_comm_listening_main_loop, NULL)) != 0) 130 | __mrcuda_comm_fini(); 131 | 132 | 133 | DPRINTF("EXIT mrcuda_comm_listen_for_signal.\n"); 134 | return ret; 135 | } 136 | 137 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2016-09-28 Pak Markthub 2 | 3 | * all git log before the creation of this ChangeLog 4 | c617dd9 (HEAD -> installation, origin/installation) Create libcudart.so.7.0 symlink when installing mrCUDA 5 | c4ae1ca Regenerate all make and configuration scripts on Paris, hopefully it will work fine with other systems 6 | 1cdc3ea Add missing files 7 | 6b6cf38 Modify related files in the installation process 8 | c01849a Add missing files necessary for configure and make 9 | f1508e4 Add the missing config.h.in 10 | 0bcd211 Add the missing aclocal.m4 11 | c45bd5a Make the generation of the linked filenames of rCUDAcomm*.so more generic 12 | cdf5ed4 Create links to rCUDAcommIB.so and rCUDAcommTCP.so when installing mrCUDA 13 | cb954e8 Make now auto-generates correct mrcudaexec 14 | 77893c0 Check for python2.7 in configure 15 | d396f9d Add options for manually specifying NVIDIA's libcudart and nvcc 16 | 0513643 Detect the installation path of rCUDA's libcudart.so, rCUDAcommIB.so, and rCUDAcommTCP.so 17 | 332f4c9 Use absolute path for nvcc after checking 18 | b838985 Add checking for nvcc 19 | cf9a96f Create configure and its supported files 20 | 3913e41 (origin/multi-gpu, origin/master, origin/HEAD, 
github/master, multi-gpu, master) Change the labels of memsync plotter. 21 | 7e799ad Modify code so that it can run on Paris and matrixMul and vectorAdd can use mrCUDA 22 | aee10f1 Change the font size of some figures 23 | 6bc4b07 Add benchmark scripts and programs for mrCUDA's overhead 24 | 4bda4d0 Add manual profiling 25 | bcc42b6 Change many labels' sizes 26 | b2ff7c0 Add plot_record_replay to the overhead.py 27 | df634e0 Add plot_mhelper_memcpybw to the overhead.py 28 | ab7707e Add plot_mhelper_nullker to overhead.py 29 | 151283b Get multi-GPU migration benchmark's results 30 | 23dba8e Change the legend size in memsync-bw plot 31 | 801d4a4 Implement memsync-bw plot in the overhead.py 32 | 87ed363 Get memcpybw-memsync benchmark result 33 | 4fa2a15 Add a plotter overhead.py 34 | 2beb2e4 Remove cudaMemcpy and cudaMemcpyToSymbol profiling 35 | 1709ea6 Add manual profiling 36 | a7f915a Fix mhelper does not exit when the main program exited 37 | 342930e Fix cudaLaunch error bugs 38 | 64ecbcb Fix cudaMemcpy bugs in mhelper 39 | 544ee52 Fix mhelper does not set device bugs 40 | 8c79e14 Fix mhelper communication bugs 41 | 5473461 Fix deadlock in cudaSetDevice 42 | d7a6ad7 Implement mhelper.c 43 | 2ee9d72 Implement intercomm_interface.c 44 | 9ab3d67 Implement some interfaces in intercomm_interface 45 | f5f261e Fix runtime error when using switching for single GPU case 46 | aa2abf4 Fix runtime errors when using only rCUDA or native 47 | eddf55e Fix compliation errors 48 | 32bb9b7 Refactor code to support multi-gpu migration 49 | b40b53a Implement __cudaRegisterFatBinary in mhelper 50 | 5cab822 Partially implement mhelper 51 | fff4bf4 Implement intercomm.c 52 | b1dff8a Partially implement intercomm module 53 | 7c32128 Implement intercomm_mem 54 | 3263bce Roughly define data structures and functions 55 | 5274e9e Merge branch 'rcuda-5.0' 56 | de3d5b2 (origin/rcuda-5.0) Implement multi-GPU matmul 57 | 6b13153 Fix multiple reports of the total sizes of cudaMemcpy and 
cudaMemcpyToSymbol 58 | 65c10b3 Add cudaMemcpy and cudaMemcpyToSymbol profiling 59 | facce7e Add mrcuda_record time 60 | fde0ad0 Remove cudaMemcpyToSymbol replay and use sync symbol instead 61 | 50bdf0a Include mrcuda_replay_cudaMemcpyToSymbol to mrcuda_sync_mem profile 62 | 0aa19c9 Implement manual profiling 63 | 325623b Add mrcudaRecordCache 64 | 953ae29 Implement mrCUDAExec 65 | 7db0609 Add MRCUDA_SWITCH_THRESHOLD support for testing purpose 66 | c7ddadf Add cudaSetDeviceFlags support 67 | a1e7164 mrCUDA works with LAMMPS 68 | 2a765b3 Fix sync_memory bugs 69 | c697d6a Unknown test code 70 | 693042c Hard-code mrcuda_switching when num cudaLaunch reach a certain number 71 | cac9952 Fix segmentation-fault bug 72 | 03e9535 Fix compile-error bugs 73 | b9170c3 Connect modules together 74 | 94cfc0f Partially implement record and replay functions 75 | 2fa6ed4 Partially implement recording system 76 | c0fdd22 Implement and test comm.c 77 | 6335192 Use autotools as the make system 78 | 6bb38d7 Partially implement communication module. 
79 | 7146ceb Implement function interfaces that are needed for LAMMPS 80 | c9e90a6 Implement mrcuda_init and mrcuda_fini 81 | 1123ca9 Partially define some headers 82 | 2c0e9f7 Change directory structure 83 | 92fdc95 Modify matmul_par.cu to make successful migratio more obvious 84 | 6ca6d5b Finish implementing rCUDA to native migration mini prototype 85 | 40db4b4 Successfully run hellowolrd concurrently on rCUDA and native 86 | d116474 Create an example of libcudart's hook 87 | -------------------------------------------------------------------------------- /scripts/mrcudaexec.py.template: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import os 5 | import subprocess 6 | from optparse import OptionParser 7 | 8 | framework_directory = os.path.dirname(os.path.abspath(__file__)) 9 | framework_lib_directory = '{{ MRCUDA_LIBPATH }}' 10 | 11 | def parse_args(): 12 | parser = OptionParser(usage = '%prog -s SERVER [options] -- PROGRAM') 13 | parser.add_option('-t', '--network-type', dest = 'network_type', 14 | choices = ['IB', 'TCP',], 15 | default = 'TCP', 16 | help = 'type of network for rCUDA (IB, TCP) [default = TCP].' 17 | ) 18 | parser.add_option('-n', '--number-of-devices', type = 'int', 19 | default = 1, 20 | dest = 'number_of_devices', 21 | help = 'number of GPU devices to be used [default = 1].' 22 | ) 23 | parser.add_option('-s', '--server', 24 | dest = 'server_address', 25 | help = 'rCUDA server address.', 26 | ) 27 | parser.add_option('-p', '--port', type = 'int', 28 | default = 8308, 29 | dest = 'port', 30 | help = 'Port number [default = 8308].', 31 | ) 32 | parser.add_option('-f', '--server-file', 33 | dest = 'server_file', 34 | help = 'rCUDA server file.' 
35 | ) 36 | parser.add_option('--rcuda-libcudart', 37 | dest = 'rcuda_libcudart', 38 | default = '{{ RCUDA_LIBCUDART }}', 39 | help = 'rCUDA\'s libcudart.so path [default = \'{{ RCUDA_LIBCUDART }}\']' 40 | ) 41 | parser.add_option('--nvidia-libcudart', 42 | dest = 'nvidia_libcudart', 43 | default = '{{ NVIDIA_LIBCUDART }}', 44 | help = 'NVIDIA\'s libcudart.so path [default = \'{{ NVIDIA_LIBCUDART }}\']' 45 | ) 46 | parser.add_option('--switch-threshold', type = 'int', 47 | dest = 'switch_threshold', 48 | default = 0, 49 | help = 'Switching threshold value (positive integer) [default = 0]' 50 | ) 51 | parser.add_option('--sock-path', 52 | dest = 'sock_path', 53 | default = '/tmp/mrcuda.sock', 54 | help = 'Switching socket path [default = /tmp/mrcuda.sock]' 55 | ) 56 | parser.add_option('--mhelper-path', 57 | dest = 'mhelper_path', 58 | default = os.path.join(framework_directory, 'mhelper'), 59 | help = 'mhelper\'s path [default = %s]' % (os.path.join(framework_directory, 'mhelper'),) 60 | ) 61 | 62 | options, args = parser.parse_args() 63 | if not options.server_address and not options.server_file: 64 | parser.error('either -s or -f option is required.') 65 | elif options.server_address and options.server_file: 66 | parser.error('-s and -f options cannot be used at the same time.') 67 | if options.switch_threshold != 'RCUDA' and options.switch_threshold != 'NVIDIA': 68 | try: 69 | int(options.switch_threshold) 70 | except ValueError: 71 | parser.error('Only a positive integer, "RCUDA", or "NVIDIA" are allowed as a value of --switch-threshold.') 72 | if len(args) == 0: 73 | parser.error('Please specify PROGRAM to execute.') 74 | return options, args 75 | 76 | def main(options, args): 77 | ld_lib_path = framework_lib_directory 78 | 79 | program_args = args 80 | 81 | environment = os.environ.copy() 82 | if 'LD_LIBRARY_PATH' in environment: 83 | ld_lib_path = ld_lib_path + ':' + environment.get('LD_LIBRARY_PATH') 84 | 85 | environment['LD_LIBRARY_PATH'] = ld_lib_path 86 
| environment['RCUDAPROTO'] = options.network_type 87 | environment['RCUDA_DEVICE_COUNT'] = str(options.number_of_devices) 88 | environment['MRCUDA_NVIDIA_LIB_PATH'] = options.nvidia_libcudart 89 | environment['MRCUDA_RCUDA_LIB_PATH'] = options.rcuda_libcudart 90 | environment['MRCUDA_SOCK_PATH'] = options.sock_path 91 | environment['MHELPER_PATH'] = options.mhelper_path 92 | 93 | i = 0 94 | if options.server_address: 95 | while i < options.number_of_devices: 96 | environment['RCUDA_DEVICE_' + str(i)] = options.server_address + '@' + str(options.port) + ':' + str(i) 97 | environment['MRCUDA_SWITCH_THRESHOLD_' + str(i)] = str(options.switch_threshold) 98 | i += 1 99 | else: 100 | f = open(options.server_file, 'r') 101 | server_lists = f.readlines() 102 | f.close() 103 | 104 | server_lists = [s.strip() for s in server_lists if len(s.strip()) > 0] 105 | if len(server_lists) == 0: 106 | raise Exception('Server file does not contain any server information.') 107 | 108 | j = 0 109 | while i < options.number_of_devices: 110 | server_info = server_lists[j].split('|') 111 | if len(server_info) != 2: 112 | raise Exception('Server file is not well-formed.') 113 | environment['RCUDA_DEVICE_' + str(i)] = server_info[0] 114 | environment['MRCUDA_SWITCH_THRESHOLD_' + str(i)] = server_info[1] 115 | j += 1 116 | if j >= len(server_lists): 117 | j = 0 118 | i += 1 119 | 120 | p = subprocess.Popen( 121 | program_args, 122 | env = environment 123 | ) 124 | p.wait() 125 | 126 | if __name__ == '__main__': 127 | options, args = parse_args() 128 | main(options, args) 129 | 130 | -------------------------------------------------------------------------------- /src/intercomm_interface.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRUCDA_INTERCOMM_INTERFACE__HEADER__ 2 | #define __MRCUDA_INTERCOMM_INTERFACE__HEADER__ 3 | 4 | #include 5 | 6 | #include "datatypes.h" 7 | 8 | /** 9 | * Initialize a handler with a helper process. 
10 | * @param handler output of initialized handler. 11 | * @param process a ptr to a helper process. 12 | * @return 0 on success; -1 otherwise. 13 | */ 14 | int mhelper_int_init(MRCUDASym_t **handler, MHelperProcess_t *process); 15 | 16 | 17 | /* Interfaces */ 18 | 19 | /** 20 | * Create a context on the helper process. 21 | * @param mrcudaGPU a ptr to a MRCUDAGPU_t a context will be created on. 22 | * @return 0 on success; -1 otherwise. 23 | */ 24 | int mhelper_int_cuCtxCreate_internal(MRCUDAGPU_t *mrcudaGPU); 25 | 26 | void **mhelper_int_cudaRegisterFatBinary(void *fatCubin); 27 | void **mhelper_int_cudaRegisterFatBinary_internal(MRCUDAGPU_t *mrcudaGPU, void *fatCubin); 28 | 29 | void mhelper_int_cudaUnregisterFatBinary(void **fatCubinHandle); 30 | void mhelper_int_cudaUnregisterFatBinary_internal(MRCUDAGPU_t *mrcudaGPU, void **fatCubinHandle); 31 | 32 | void mhelper_int_cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, int size, int constant, int global); 33 | void mhelper_int_cudaRegisterVar_internal(MRCUDAGPU_t *mrcudaGPU, void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, int size, int constant, int global); 34 | 35 | void mhelper_int_cudaRegisterTexture(void **fatCubinHandle, const struct textureReference *hostVar, const void **deviceAddress, const char *deviceName, int dim, int norm, int ext); 36 | void mhelper_int_cudaRegisterTexture_internal(MRCUDAGPU_t *mrcudaGPU, void **fatCubinHandle, const struct textureReference *hostVar, const void **deviceAddress, const char *deviceName, int dim, int norm, int ext); 37 | 38 | void mhelper_int_cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize); 39 | void mhelper_int_cudaRegisterFunction_internal(MRCUDAGPU_t *mrcudaGPU, void **fatCubinHandle, const char *hostFun, char *deviceFun, const 
char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize); 40 | 41 | cudaError_t mhelper_int_cudaLaunch(const void *func); 42 | cudaError_t mhelper_int_cudaLaunch_internal(MRCUDAGPU_t *mrcudaGPU, const void *func); 43 | 44 | cudaError_t mhelper_int_cudaHostAlloc(void **pHost, size_t size, unsigned int flags); 45 | 46 | cudaError_t mhelper_int_cudaDeviceReset(void); 47 | cudaError_t mhelper_int_cudaDeviceReset_internal(MRCUDAGPU_t *mrcudaGPU); 48 | 49 | cudaError_t mhelper_int_cudaDeviceSynchronize(void); 50 | cudaError_t mhelper_int_cudaDeviceSynchronize_internal(MRCUDAGPU_t *mrcudaGPU); 51 | 52 | cudaError_t mhelper_int_cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); 53 | cudaError_t mhelper_int_cudaGetDeviceProperties_internal(MRCUDAGPU_t *mrcudaGPU, struct cudaDeviceProp *prop, int device); 54 | 55 | cudaError_t mhelper_int_cudaMalloc(void **devPtr, size_t size); 56 | cudaError_t mhelper_int_cudaMalloc_internal(MRCUDAGPU_t *mrcudaGPU, void **devPtr, size_t size); 57 | 58 | cudaError_t mhelper_int_cudaFreeHost(void *ptr); 59 | 60 | cudaError_t mhelper_int_cudaFree(void *devPtr); 61 | cudaError_t mhelper_int_cudaFree_internal(MRCUDAGPU_t *mrcudaGPU, void *devPtr); 62 | 63 | cudaError_t mhelper_int_cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream); 64 | cudaError_t mhelper_int_cudaMemcpyToSymbolAsync_internal(MRCUDAGPU_t *mrcudaGPU, const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream); 65 | 66 | cudaError_t mhelper_int_cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream); 67 | cudaError_t mhelper_int_cudaMemcpyFromSymbolAsync_internal(MRCUDAGPU_t *mrcudaGPU, void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream); 68 | 69 | cudaError_t 
mhelper_int_cudaSetupArgument(const void *arg, size_t size, size_t offset); 70 | cudaError_t mhelper_int_cudaSetupArgument_internal(MRCUDAGPU_t *mrcudaGPU, const void *arg, size_t size, size_t offset); 71 | 72 | cudaError_t mhelper_int_cudaStreamSynchronize(cudaStream_t stream); 73 | cudaError_t mhelper_int_cudaStreamSynchronize_internal(MRCUDAGPU_t *mrcudaGPU, cudaStream_t stream); 74 | 75 | cudaError_t mhelper_int_cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, cudaStream_t stream); 76 | cudaError_t mhelper_int_cudaConfigureCall_internal(MRCUDAGPU_t *mrcudaGPU, dim3 gridDim, dim3 blockDim, size_t sharedMem, cudaStream_t stream); 77 | 78 | cudaError_t mhelper_int_cudaGetLastError(void); 79 | cudaError_t mhelper_int_cudaGetLastError_internal(MRCUDAGPU_t *mrcudaGPU); 80 | 81 | cudaError_t mhelper_int_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); 82 | cudaError_t mhelper_int_cudaMemcpy_internal(MRCUDAGPU_t *mrcudaGPU, void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); 83 | 84 | cudaError_t mhelper_int_cudaSetDevice(int device); 85 | cudaError_t mhelper_int_cudaSetDevice_internal(MRCUDAGPU_t *mrcudaGPU, int device); 86 | 87 | cudaError_t mhelper_int_cudaStreamCreate(cudaStream_t *pStream); 88 | cudaError_t mhelper_int_cudaStreamCreate_internal(MRCUDAGPU_t *mrcudaGPU, cudaStream_t *pStream); 89 | 90 | #endif /* __MRCUDA_INTERCOMM_INTERFACE__HEADER__ */ 91 | 92 | -------------------------------------------------------------------------------- /tests/progs/matmul_par.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define BS (16) 8 | #define L (16) 9 | #define M (16) 10 | #define N (16) 11 | 12 | __global__ void matmul(float *A, float *B, float *C, 13 | int l, int m, int n) 14 | { 15 | int i, j, k; 16 | float sum; 17 | 18 | i = blockIdx.y * blockDim.y + threadIdx.y; 19 | j = blockIdx.x * 
blockDim.x + threadIdx.x; 20 | 21 | sum = 0.0; 22 | for (k = 0; k < m; k++) { 23 | sum += A[i * m + k] * B[k * n + j]; 24 | } 25 | C[i*n+j] = sum; 26 | } 27 | 28 | __global__ void thread_matrix(float *A, 29 | int l, int n) 30 | { 31 | int i, j; 32 | 33 | i = blockIdx.y * blockDim.y + threadIdx.y; 34 | j = blockIdx.x * blockDim.x + threadIdx.x; 35 | 36 | A[i * n + j] = i * n + j; 37 | } 38 | 39 | void matmul_cpu(float *A, float *B, float *C, 40 | int l, int m, int n) 41 | { 42 | int i, j, k; 43 | for (i = 0; i < l; i++) { 44 | for (j = 0; j < n; j++) { 45 | float sum = 0.0; 46 | for (k = 0; k < m; k++) { 47 | sum += A[i * m + k] * B[k * n + j]; 48 | } 49 | C[i*n+j] = sum; 50 | } 51 | } 52 | } 53 | 54 | void print_matrix(float *A, int l, int n) 55 | { 56 | int i, j; 57 | for (i = 0; i < l; i++) { 58 | for (j = 0; j < n; j++) { 59 | printf("%f ", A[i * n + j]); 60 | } 61 | printf("\n"); 62 | } 63 | } 64 | 65 | int compare_matrix(float *A, float *B, int l, int n) 66 | { 67 | int i, j; 68 | int ret = 0; 69 | for (i = 0; i < l; i++) { 70 | for (j = 0; j < n; j++) { 71 | if(A[i * n + j] != B[i * n + j]) 72 | ret = -1; 73 | } 74 | } 75 | return ret; 76 | } 77 | 78 | void alloc_matrix(float **m_h, float **m_d, int h, int w) 79 | { 80 | *m_h = (float *)malloc(sizeof(float) * h * w); 81 | cudaMalloc((void **)m_d, sizeof(float) * h * w); 82 | } 83 | 84 | void init_matrix(float *m, int h, int w) 85 | { 86 | int i, j; 87 | for (i = 0; i < h; i++) 88 | for (j = 0; j < w; j++) 89 | m[i * w + j] = (float)(random() % 100); 90 | } 91 | 92 | int check_error(const char *err_msg) 93 | { 94 | cudaError_t err = cudaGetLastError(); 95 | if (err != cudaSuccess) { 96 | fprintf(stderr, "CUDA error: %s: %s.\n", 97 | err_msg, cudaGetErrorString(err)); 98 | return 1; 99 | } 100 | return 0; 101 | } 102 | 103 | double get_elapsed_time(struct timeval *begin, struct timeval *end) 104 | { 105 | return (end->tv_sec - begin->tv_sec) * 1000 106 | + (end->tv_usec - begin->tv_usec) / 1000.0; 107 | } 108 | 
109 | int main(int argc, char *argv[]) 110 | { 111 | float *Ad, *Bd, *Cd; 112 | float *Ah, *Bh, *Ch; 113 | struct timeval t1, t2; 114 | 115 | // prepare matrix A 116 | alloc_matrix(&Ah, &Ad, L, M); 117 | init_matrix(Ah, L, M); 118 | cudaMemcpy(Ad, Ah, sizeof(float) * L * M, 119 | cudaMemcpyHostToDevice); 120 | // do it again for matrix B 121 | alloc_matrix(&Bh, &Bd, M, N); 122 | init_matrix(Bh, M, N); 123 | cudaMemcpy(Bd, Bh, sizeof(float) * M * N, 124 | cudaMemcpyHostToDevice); 125 | // allocate spaces for matrix C 126 | alloc_matrix(&Ch, &Cd, L, N); 127 | 128 | cudaDeviceSynchronize(); 129 | gettimeofday(&t1, NULL); 130 | 131 | // launch matmul kernel 132 | matmul<<>>(Ad, Bd, Cd, L, M, N); 134 | 135 | if (check_error("matmul")) { 136 | exit(EXIT_FAILURE); 137 | } 138 | 139 | cudaDeviceSynchronize(); 140 | gettimeofday(&t2, NULL); 141 | printf("Elapsed time: %f msec\n", get_elapsed_time(&t1, &t2)); 142 | 143 | // obtain the result 144 | cudaMemcpy(Ch, Cd, sizeof(float) * L * N, cudaMemcpyDeviceToHost); 145 | float *C_cpu = (float *)malloc(sizeof(float) * L * N); 146 | matmul_cpu(Ah, Bh, C_cpu, L, M, N); 147 | print_matrix(Ch, L, N); 148 | printf("\n"); 149 | print_matrix(C_cpu, L, N); 150 | printf("\n"); 151 | 152 | if(compare_matrix(Ch, C_cpu, L, N) >= 0) 153 | printf("OK\n"); 154 | else 155 | printf("ERRRRR\n"); 156 | 157 | /* Switch to native */ 158 | /*cudaMalloc(NULL, 0); 159 | printf("Switched to native.....\n"); 160 | printf("Press enter to continue...\n"); 161 | getchar();*/ 162 | 163 | /*thread_matrix<<>>(Cd, L, N); 165 | cudaMemcpy(Ch, Cd, sizeof(float) * L * N, cudaMemcpyDeviceToHost); 166 | print_matrix(Ch, L, N); 167 | printf("\n");*/ 168 | 169 | int i; 170 | for(i = 0; i < 10; i++) 171 | { 172 | if(i == 3) 173 | { 174 | cudaMalloc(NULL, 0); 175 | printf("Switched to native.....\n"); 176 | printf("Press enter to continue...\n"); 177 | getchar(); 178 | } 179 | matmul<<>>(Ad, Bd, Cd, L, M, N); 181 | cudaMemcpy(Ch, Cd, sizeof(float) * L * N, 
cudaMemcpyDeviceToHost); 182 | print_matrix(Ch, L, N); 183 | printf("\n"); 184 | print_matrix(C_cpu, L, N); 185 | printf("\n"); 186 | if(compare_matrix(Ch, C_cpu, L, N) >= 0) 187 | printf("OK\n"); 188 | else 189 | printf("ERRRRR\n"); 190 | } 191 | 192 | free(C_cpu); 193 | 194 | cudaFree(Ad); 195 | cudaFree(Bd); 196 | cudaFree(Cd); 197 | 198 | return 0; 199 | } 200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /src/record.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA_RECORD__HEADER__ 2 | #define __MRCUDA_RECORD__HEADER__ 3 | 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | #include "datatypes.h" 9 | 10 | extern double recordAccTime; 11 | extern double memsyncAccTime; 12 | extern double memsyncrCUDAAccTime; 13 | extern double memsyncNvidiaAccTime; 14 | extern int memsyncNumCalls; 15 | extern double memsyncSize; 16 | 17 | extern MRecordGPU_t *mrecordGPUList; 18 | 19 | /** 20 | * Initialize the record/replay module. 21 | * Exit and report error if found. 22 | */ 23 | void mrcuda_record_init(); 24 | 25 | /** 26 | * Finalize the record/replay module. 27 | */ 28 | void mrcuda_record_fini(); 29 | 30 | /** 31 | * Record a cudaRegisterFatBinary call. 32 | */ 33 | void mrcuda_record_cudaRegisterFatBinary(MRCUDAGPU_t *mrcudaGPU, void* fatCubin, void **fatCubinHandle); 34 | 35 | /** 36 | * Record a cudaRegisterFunction call. 37 | */ 38 | void mrcuda_record_cudaRegisterFunction( 39 | MRCUDAGPU_t *mrcudaGPU, 40 | void **fatCubinHandle, 41 | const char *hostFun, 42 | char *deviceFun, 43 | const char *deviceName, 44 | int thread_limit, 45 | uint3 *tid, 46 | uint3 *bid, 47 | dim3 *bDim, 48 | dim3 *gDim, 49 | int *wSize 50 | ); 51 | 52 | /** 53 | * Record a cudaRegisterVar call. 
54 | */ 55 | void mrcuda_record_cudaRegisterVar( 56 | MRCUDAGPU_t *mrcudaGPU, 57 | void **fatCubinHandle, 58 | char *hostVar, 59 | char *deviceAddress, 60 | const char *deviceName, 61 | int ext, 62 | int size, 63 | int constant, 64 | int global 65 | ); 66 | 67 | /** 68 | * Record a cudaRegisterTexture call. 69 | */ 70 | void mrcuda_record_cudaRegisterTexture( 71 | MRCUDAGPU_t *mrcudaGPU, 72 | void **fatCubinHandle, 73 | const struct textureReference *hostVar, 74 | const void **deviceAddress, 75 | const char *deviceName, 76 | int dim, 77 | int norm, 78 | int ext 79 | ); 80 | 81 | /** 82 | * Record a cudaUnregisterFatBinary call. 83 | */ 84 | void mrcuda_record_cudaUnregisterFatBinary(MRCUDAGPU_t *mrcudaGPU, void **fatCubinHandle); 85 | 86 | /** 87 | * Record a cudaMalloc call. 88 | */ 89 | void mrcuda_record_cudaMalloc(MRCUDAGPU_t *mrcudaGPU, void **devPtr, size_t size); 90 | 91 | /** 92 | * Record a cudaFree call. 93 | */ 94 | void mrcuda_record_cudaFree(MRCUDAGPU_t *mrcudaGPU, void *devPtr); 95 | 96 | /** 97 | * Record a cudaBindTexture call. 98 | */ 99 | void mrcuda_record_cudaBindTexture( 100 | MRCUDAGPU_t *mrcudaGPU, 101 | size_t *offset, 102 | const struct textureReference *texref, 103 | const void *devPtr, 104 | const struct cudaChannelFormatDesc *desc, 105 | size_t size 106 | ); 107 | 108 | /** 109 | * Record a cudaStreamCreate call. 110 | */ 111 | void mrcuda_record_cudaStreamCreate(MRCUDAGPU_t *mrcudaGPU, cudaStream_t *pStream); 112 | 113 | /** 114 | * Record a cudaHostAlloc call. 115 | * The dual function of this call is mrcuda_replay_cudaFreeHost. 116 | */ 117 | void mrcuda_record_cudaHostAlloc(MRCUDAGPU_t *mrcudaGPU, void **pHost, size_t size, unsigned int flags); 118 | 119 | /** 120 | * Record a cudaSetDeviceFlags call. 121 | */ 122 | void mrcuda_record_cudaSetDeviceFlags(MRCUDAGPU_t *mrcudaGPU, unsigned int flags); 123 | 124 | 125 | /** 126 | * Replay a cudaRegisterFatBinary call. 
127 | */ 128 | void mrcuda_replay_cudaRegisterFatBinary(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 129 | 130 | /** 131 | * Replay a cudaRegisterFunction call. 132 | */ 133 | void mrcuda_replay_cudaRegisterFunction(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 134 | 135 | /** 136 | * Replay a cudaRegisterVar call. 137 | */ 138 | void mrcuda_replay_cudaRegisterVar(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 139 | 140 | /** 141 | * Replay a cudaRegisterTexture call. 142 | */ 143 | void mrcuda_replay_cudaRegisterTexture(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 144 | 145 | /** 146 | * Replay a cudaUnregisterFatBinary call. 147 | */ 148 | void mrcuda_replay_cudaUnregisterFatBinary(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 149 | 150 | /** 151 | * Replay a cudaMalloc call. 152 | */ 153 | void mrcuda_replay_cudaMalloc(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 154 | 155 | /** 156 | * Replay a cudaFree call. 157 | */ 158 | void mrcuda_replay_cudaFree(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 159 | 160 | /** 161 | * Replay a cudaBindTexture call. 162 | */ 163 | void mrcuda_replay_cudaBindTexture(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 164 | 165 | /** 166 | * Replay a cudaStreamCreate call. 167 | */ 168 | void mrcuda_replay_cudaStreamCreate(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 169 | 170 | /** 171 | * Replay a cudaFreeHost call. 172 | * This function looks for the library used for allocating the ptr. 173 | * The dual function of this call is mrcuda_record_cudaHostAlloc. 174 | */ 175 | MRCUDASym_t *mrcuda_replay_cudaFreeHost(MRCUDAGPU_t *mrcudaGPU, void *ptr); 176 | 177 | /** 178 | * Replay a cudaSetDeviceFlags call. 179 | */ 180 | void mrcuda_replay_cudaSetDeviceFlags(MRCUDAGPU_t *mrcudaGPU, MRecord_t *record); 181 | 182 | /** 183 | * Download the content of active memory regions to the native device. 184 | * Exit and report error if an error is found. 185 | * @param mrcudaGPU a ptr to a MRCUDAGPU_t that the sync mem will be performed on. 
186 | */ 187 | void mrcuda_sync_mem(MRCUDAGPU_t *mrcudaGPU); 188 | 189 | /** 190 | * Simulate cuda streams on the native CUDA so that the number of streams are equaled to that of rCUDA. 191 | * @param mrcudaGPU a ptr to a MRCUDAGPU_t that the simulate stream will be performed on. 192 | */ 193 | void mrcuda_simulate_stream(MRCUDAGPU_t *mrcudaGPU); 194 | 195 | /** 196 | * Simulate cuCtxCreate on the specified gpuID. 197 | * If mrcudaGPU->status == MRCUDA_GPU_STATUS_HELPER, ask the helper to handle the command. 198 | * @param mrcudaGPU a ptr to a MRCUDAGPU_t. 199 | * @param gpuID the ID of the GPU a context will be created on. 200 | * @return 0 on success; -1 otherwise. 201 | */ 202 | int mrcuda_simulate_cuCtxCreate(MRCUDAGPU_t *mrcudaGPU, int gpuID); 203 | 204 | #endif 205 | -------------------------------------------------------------------------------- /tests/progs/matmul_mul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define BS (16) 8 | #define L (16) 9 | #define M (16) 10 | #define N (16) 11 | 12 | __global__ void matmul(float *A, float *B, float *C, 13 | int l, int m, int n) 14 | { 15 | int i, j, k; 16 | float sum; 17 | 18 | i = blockIdx.y * blockDim.y + threadIdx.y; 19 | j = blockIdx.x * blockDim.x + threadIdx.x; 20 | 21 | sum = 0.0; 22 | for (k = 0; k < m; k++) { 23 | sum += A[i * m + k] * B[k * n + j]; 24 | } 25 | C[i*n+j] = sum; 26 | } 27 | 28 | __global__ void thread_matrix(float *A, 29 | int l, int n) 30 | { 31 | int i, j; 32 | 33 | i = blockIdx.y * blockDim.y + threadIdx.y; 34 | j = blockIdx.x * blockDim.x + threadIdx.x; 35 | 36 | A[i * n + j] = i * n + j; 37 | } 38 | 39 | void matmul_cpu(float *A, float *B, float *C, 40 | int l, int m, int n) 41 | { 42 | int i, j, k; 43 | for (i = 0; i < l; i++) { 44 | for (j = 0; j < n; j++) { 45 | float sum = 0.0; 46 | for (k = 0; k < m; k++) { 47 | sum += A[i * m + k] * B[k * n + j]; 48 | } 49 | C[i*n+j] 
= sum; 50 | } 51 | } 52 | } 53 | 54 | void print_matrix(float *A, int l, int n) 55 | { 56 | int i, j; 57 | for (i = 0; i < l; i++) { 58 | for (j = 0; j < n; j++) { 59 | printf("%f ", A[i * n + j]); 60 | } 61 | printf("\n"); 62 | } 63 | } 64 | 65 | int compare_matrix(float *A, float *B, int l, int n) 66 | { 67 | int i, j; 68 | int ret = 0; 69 | for (i = 0; i < l; i++) { 70 | for (j = 0; j < n; j++) { 71 | if(A[i * n + j] != B[i * n + j]) 72 | ret = -1; 73 | } 74 | } 75 | return ret; 76 | } 77 | 78 | void alloc_matrix(float **m_h, float **m_d, int h, int w) 79 | { 80 | *m_h = (float *)malloc(sizeof(float) * h * w); 81 | cudaMalloc((void **)m_d, sizeof(float) * h * w); 82 | } 83 | 84 | void init_matrix(float *m, int h, int w) 85 | { 86 | int i, j; 87 | for (i = 0; i < h; i++) 88 | for (j = 0; j < w; j++) 89 | m[i * w + j] = (float)(random() % 100); 90 | } 91 | 92 | int check_error(const char *err_msg) 93 | { 94 | cudaError_t err = cudaGetLastError(); 95 | if (err != cudaSuccess) { 96 | fprintf(stderr, "CUDA error: %s: %s.\n", 97 | err_msg, cudaGetErrorString(err)); 98 | return 1; 99 | } 100 | return 0; 101 | } 102 | 103 | double get_elapsed_time(struct timeval *begin, struct timeval *end) 104 | { 105 | return (end->tv_sec - begin->tv_sec) * 1000 106 | + (end->tv_usec - begin->tv_usec) / 1000.0; 107 | } 108 | 109 | int main(int argc, char *argv[]) 110 | { 111 | float *Ad1, *Bd1, *Cd1; 112 | float *Ah1, *Bh1, *Ch1; 113 | float *Ad2, *Bd2, *Cd2; 114 | float *Ah2, *Bh2, *Ch2; 115 | struct timeval t1, t2; 116 | float *C_cpu; 117 | 118 | int num_device = 0; 119 | 120 | if (cudaGetDeviceCount(&num_device) != cudaSuccess || num_device < 2) { 121 | fprintf(stderr, "This program needs at least 2 devices.\n"); 122 | exit(EXIT_FAILURE); 123 | } 124 | 125 | cudaSetDevice(0); 126 | 127 | // prepare matrix A 128 | alloc_matrix(&Ah1, &Ad1, L, M); 129 | init_matrix(Ah1, L, M); 130 | cudaMemcpy(Ad1, Ah1, sizeof(float) * L * M, 131 | cudaMemcpyHostToDevice); 132 | // do it again for 
matrix B 133 | alloc_matrix(&Bh1, &Bd1, M, N); 134 | init_matrix(Bh1, M, N); 135 | cudaMemcpy(Bd1, Bh1, sizeof(float) * M * N, 136 | cudaMemcpyHostToDevice); 137 | // allocate spaces for matrix C 138 | alloc_matrix(&Ch1, &Cd1, L, N); 139 | 140 | cudaDeviceSynchronize(); 141 | gettimeofday(&t1, NULL); 142 | 143 | // launch matmul kernel 144 | matmul<<>>(Ad1, Bd1, Cd1, L, M, N); 146 | 147 | if (check_error("matmul")) { 148 | exit(EXIT_FAILURE); 149 | } 150 | 151 | cudaDeviceSynchronize(); 152 | gettimeofday(&t2, NULL); 153 | printf("Elapsed time: %f msec\n", get_elapsed_time(&t1, &t2)); 154 | 155 | // obtain the result 156 | cudaMemcpy(Ch1, Cd1, sizeof(float) * L * N, cudaMemcpyDeviceToHost); 157 | C_cpu = (float *)malloc(sizeof(float) * L * N); 158 | matmul_cpu(Ah1, Bh1, C_cpu, L, M, N); 159 | print_matrix(Ch1, L, N); 160 | printf("\n"); 161 | print_matrix(C_cpu, L, N); 162 | printf("\n"); 163 | 164 | if(compare_matrix(Ch1, C_cpu, L, N) >= 0) 165 | printf("OK\n"); 166 | else 167 | printf("ERRRRR\n"); 168 | 169 | free(C_cpu); 170 | 171 | cudaSetDevice(1); 172 | 173 | // prepare matrix A 174 | alloc_matrix(&Ah2, &Ad2, L, M); 175 | init_matrix(Ah2, L, M); 176 | cudaMemcpy(Ad2, Ah2, sizeof(float) * L * M, 177 | cudaMemcpyHostToDevice); 178 | // do it again for matrix B 179 | alloc_matrix(&Bh2, &Bd2, M, N); 180 | init_matrix(Bh2, M, N); 181 | cudaMemcpy(Bd2, Bh2, sizeof(float) * M * N, 182 | cudaMemcpyHostToDevice); 183 | // allocate spaces for matrix C 184 | alloc_matrix(&Ch2, &Cd2, L, N); 185 | 186 | cudaDeviceSynchronize(); 187 | gettimeofday(&t1, NULL); 188 | 189 | // launch matmul kernel 190 | matmul<<>>(Ad2, Bd2, Cd2, L, M, N); 192 | 193 | if (check_error("matmul")) { 194 | exit(EXIT_FAILURE); 195 | } 196 | 197 | cudaDeviceSynchronize(); 198 | gettimeofday(&t2, NULL); 199 | printf("Elapsed time: %f msec\n", get_elapsed_time(&t1, &t2)); 200 | 201 | // obtain the result 202 | cudaMemcpy(Ch2, Cd2, sizeof(float) * L * N, cudaMemcpyDeviceToHost); 203 | C_cpu = (float 
*)malloc(sizeof(float) * L * N); 204 | matmul_cpu(Ah2, Bh2, C_cpu, L, M, N); 205 | print_matrix(Ch2, L, N); 206 | printf("\n"); 207 | print_matrix(C_cpu, L, N); 208 | printf("\n"); 209 | 210 | if(compare_matrix(Ch2, C_cpu, L, N) >= 0) 211 | printf("OK\n"); 212 | else 213 | printf("ERRRRR\n"); 214 | 215 | free(C_cpu); 216 | 217 | cudaFree(Ad1); 218 | cudaFree(Bd1); 219 | cudaFree(Cd1); 220 | 221 | cudaFree(Ad2); 222 | cudaFree(Bd2); 223 | cudaFree(Cd2); 224 | 225 | return 0; 226 | } 227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /results/nullker-mhelper.out: -------------------------------------------------------------------------------- 1 | prog lib count num_calls time 2 | nullker mrcuda 0 1024 13.909000 3 | nullker mrcuda 0 2048 27.694000 4 | nullker mrcuda 0 4096 55.480000 5 | nullker mrcuda 0 8192 110.551000 6 | nullker mrcuda 0 16384 220.698000 7 | nullker mrcuda 0 32768 376.599000 8 | nullker mrcuda 0 65536 745.303000 9 | nullker mrcuda 0 131072 1487.305000 10 | nullker mrcuda 0 262144 2981.731000 11 | nullker mrcuda 0 524288 5951.194000 12 | nullker mrcuda 0 1048576 11905.031000 13 | nullker mrcuda 0 2097152 23794.678000 14 | nullker mrcuda 0 4194304 47580.513000 15 | nullker mrcuda 0 8388608 95185.737000 16 | nullker mrcuda 0 16777216 190206.546000 17 | nullker mrcuda 1 1024 13.948000 18 | nullker mrcuda 1 2048 27.847000 19 | nullker mrcuda 1 4096 55.602000 20 | nullker mrcuda 1 8192 110.650000 21 | nullker mrcuda 1 16384 221.341000 22 | nullker mrcuda 1 32768 383.100000 23 | nullker mrcuda 1 65536 746.129000 24 | nullker mrcuda 1 131072 1496.601000 25 | nullker mrcuda 1 262144 2995.116000 26 | nullker mrcuda 1 524288 5985.191000 27 | nullker mrcuda 1 1048576 11979.065000 28 | nullker mrcuda 1 2097152 23947.264000 29 | nullker mrcuda 1 4194304 47903.340000 30 | nullker mrcuda 1 8388608 95821.928000 31 | nullker mrcuda 1 16777216 191631.296000 32 | nullker mrcuda 2 1024 11.762000 33 | nullker mrcuda 2 
2048 23.572000 34 | nullker mrcuda 2 4096 46.782000 35 | nullker mrcuda 2 8192 92.995000 36 | nullker mrcuda 2 16384 186.496000 37 | nullker mrcuda 2 32768 373.267000 38 | nullker mrcuda 2 65536 745.067000 39 | nullker mrcuda 2 131072 1491.059000 40 | nullker mrcuda 2 262144 2975.860000 41 | nullker mrcuda 2 524288 5957.743000 42 | nullker mrcuda 2 1048576 11907.501000 43 | nullker mrcuda 2 2097152 23853.835000 44 | nullker mrcuda 2 4194304 47667.329000 45 | nullker mrcuda 2 8388608 95264.386000 46 | nullker mrcuda 2 16777216 190606.974000 47 | nullker mrcuda 3 1024 13.931000 48 | nullker mrcuda 3 2048 27.930000 49 | nullker mrcuda 3 4096 55.193000 50 | nullker mrcuda 3 8192 109.578000 51 | nullker mrcuda 3 16384 220.805000 52 | nullker mrcuda 3 32768 376.383000 53 | nullker mrcuda 3 65536 744.637000 54 | nullker mrcuda 3 131072 1473.593000 55 | nullker mrcuda 3 262144 2953.483000 56 | nullker mrcuda 3 524288 5922.429000 57 | nullker mrcuda 3 1048576 11851.660000 58 | nullker mrcuda 3 2097152 23698.863000 59 | nullker mrcuda 3 4194304 47307.894000 60 | nullker mrcuda 3 8388608 94766.222000 61 | nullker mrcuda 3 16777216 189753.937000 62 | nullker mrcuda 4 1024 12.148000 63 | nullker mrcuda 4 2048 24.513000 64 | nullker mrcuda 4 4096 48.559000 65 | nullker mrcuda 4 8192 96.675000 66 | nullker mrcuda 4 16384 194.782000 67 | nullker mrcuda 4 32768 387.978000 68 | nullker mrcuda 4 65536 779.653000 69 | nullker mrcuda 4 131072 1559.794000 70 | nullker mrcuda 4 262144 3126.936000 71 | nullker mrcuda 4 524288 6238.395000 72 | nullker mrcuda 4 1048576 12470.579000 73 | nullker mrcuda 4 2097152 24921.762000 74 | nullker mrcuda 4 4194304 49788.988000 75 | nullker mrcuda 4 8388608 99457.578000 76 | nullker mrcuda 4 16777216 199119.432000 77 | nullker mrcuda 5 1024 14.360000 78 | nullker mrcuda 5 2048 28.614000 79 | nullker mrcuda 5 4096 57.200000 80 | nullker mrcuda 5 8192 113.692000 81 | nullker mrcuda 5 16384 227.503000 82 | nullker mrcuda 5 32768 388.961000 83 | nullker 
mrcuda 5 65536 767.622000 84 | nullker mrcuda 5 131072 1537.665000 85 | nullker mrcuda 5 262144 3072.827000 86 | nullker mrcuda 5 524288 6149.966000 87 | nullker mrcuda 5 1048576 12304.688000 88 | nullker mrcuda 5 2097152 24605.251000 89 | nullker mrcuda 5 4194304 49200.814000 90 | nullker mrcuda 5 8388608 98456.404000 91 | nullker mrcuda 5 16777216 196701.645000 92 | nullker mrcuda 6 1024 14.162000 93 | nullker mrcuda 6 2048 28.289000 94 | nullker mrcuda 6 4096 56.480000 95 | nullker mrcuda 6 8192 112.021000 96 | nullker mrcuda 6 16384 223.856000 97 | nullker mrcuda 6 32768 391.024000 98 | nullker mrcuda 6 65536 755.072000 99 | nullker mrcuda 6 131072 1515.169000 100 | nullker mrcuda 6 262144 3039.992000 101 | nullker mrcuda 6 524288 6077.132000 102 | nullker mrcuda 6 1048576 12119.706000 103 | nullker mrcuda 6 2097152 24327.747000 104 | nullker mrcuda 6 4194304 48537.433000 105 | nullker mrcuda 6 8388608 97030.690000 106 | nullker mrcuda 6 16777216 194138.853000 107 | nullker mrcuda 7 1024 13.810000 108 | nullker mrcuda 7 2048 27.852000 109 | nullker mrcuda 7 4096 55.083000 110 | nullker mrcuda 7 8192 109.188000 111 | nullker mrcuda 7 16384 218.314000 112 | nullker mrcuda 7 32768 376.721000 113 | nullker mrcuda 7 65536 735.194000 114 | nullker mrcuda 7 131072 1481.617000 115 | nullker mrcuda 7 262144 2954.298000 116 | nullker mrcuda 7 524288 5911.131000 117 | nullker mrcuda 7 1048576 11806.652000 118 | nullker mrcuda 7 2097152 23656.850000 119 | nullker mrcuda 7 4194304 47241.286000 120 | nullker mrcuda 7 8388608 94611.828000 121 | nullker mrcuda 7 16777216 189050.351000 122 | nullker mrcuda 8 1024 14.095000 123 | nullker mrcuda 8 2048 28.081000 124 | nullker mrcuda 8 4096 55.970000 125 | nullker mrcuda 8 8192 110.973000 126 | nullker mrcuda 8 16384 222.415000 127 | nullker mrcuda 8 32768 390.757000 128 | nullker mrcuda 8 65536 751.369000 129 | nullker mrcuda 8 131072 1505.198000 130 | nullker mrcuda 8 262144 3009.886000 131 | nullker mrcuda 8 524288 6017.065000 
132 | nullker mrcuda 8 1048576 12057.644000 133 | nullker mrcuda 8 2097152 24091.687000 134 | nullker mrcuda 8 4194304 48175.926000 135 | nullker mrcuda 8 8388608 96237.943000 136 | nullker mrcuda 8 16777216 192701.657000 137 | nullker mrcuda 9 1024 14.060000 138 | nullker mrcuda 9 2048 28.105000 139 | nullker mrcuda 9 4096 56.126000 140 | nullker mrcuda 9 8192 111.257000 141 | nullker mrcuda 9 16384 222.588000 142 | nullker mrcuda 9 32768 388.535000 143 | nullker mrcuda 9 65536 749.658000 144 | nullker mrcuda 9 131072 1509.148000 145 | nullker mrcuda 9 262144 3006.658000 146 | nullker mrcuda 9 524288 6021.239000 147 | nullker mrcuda 9 1048576 12045.826000 148 | nullker mrcuda 9 2097152 24086.604000 149 | nullker mrcuda 9 4194304 48086.814000 150 | nullker mrcuda 9 8388608 96193.359000 151 | nullker mrcuda 9 16777216 192125.235000 152 | nullker native 0 1024 1.693000 153 | nullker native 0 2048 3.328000 154 | nullker native 0 4096 6.596000 155 | nullker native 0 8192 12.316000 156 | nullker native 0 16384 24.535000 157 | nullker native 0 32768 49.045000 158 | nullker native 0 65536 98.111000 159 | nullker native 0 131072 196.670000 160 | nullker native 0 262144 392.933000 161 | nullker native 0 524288 787.145000 162 | nullker native 0 1048576 1578.461000 163 | nullker native 0 2097152 3150.883000 164 | nullker native 0 4194304 6305.144000 165 | nullker native 0 8388608 12608.160000 166 | nullker native 0 16777216 25167.428000 167 | nullker native 1 1024 1.930000 168 | nullker native 1 2048 3.870000 169 | nullker native 1 4096 7.645000 170 | nullker native 1 8192 14.576000 171 | nullker native 1 16384 29.061000 172 | nullker native 1 32768 59.444000 173 | nullker native 1 65536 116.105000 174 | nullker native 1 131072 232.417000 175 | nullker native 1 262144 395.501000 176 | nullker native 1 524288 782.965000 177 | nullker native 1 1048576 1572.530000 178 | nullker native 1 2097152 3140.093000 179 | nullker native 1 4194304 6273.480000 180 | nullker native 1 8388608 
12540.560000 181 | nullker native 1 16777216 25076.293000 182 | nullker native 2 1024 1.931000 183 | nullker native 2 2048 3.850000 184 | nullker native 2 4096 7.639000 185 | nullker native 2 8192 14.556000 186 | nullker native 2 16384 29.103000 187 | nullker native 2 32768 58.037000 188 | nullker native 2 65536 116.313000 189 | nullker native 2 131072 229.299000 190 | nullker native 2 262144 392.810000 191 | nullker native 2 524288 784.319000 192 | nullker native 2 1048576 1571.662000 193 | nullker native 2 2097152 3142.692000 194 | nullker native 2 4194304 6281.323000 195 | nullker native 2 8388608 12563.783000 196 | nullker native 2 16777216 25143.079000 197 | nullker native 3 1024 1.925000 198 | nullker native 3 2048 3.851000 199 | nullker native 3 4096 7.658000 200 | nullker native 3 8192 14.506000 201 | nullker native 3 16384 29.193000 202 | nullker native 3 32768 58.076000 203 | nullker native 3 65536 116.383000 204 | nullker native 3 131072 230.686000 205 | nullker native 3 262144 391.538000 206 | nullker native 3 524288 781.478000 207 | nullker native 3 1048576 1567.546000 208 | nullker native 3 2097152 3139.788000 209 | nullker native 3 4194304 6269.944000 210 | nullker native 3 8388608 12546.773000 211 | nullker native 3 16777216 25069.748000 212 | nullker native 4 1024 1.924000 213 | nullker native 4 2048 3.862000 214 | nullker native 4 4096 7.633000 215 | nullker native 4 8192 14.547000 216 | nullker native 4 16384 29.111000 217 | nullker native 4 32768 58.003000 218 | nullker native 4 65536 116.376000 219 | nullker native 4 131072 232.829000 220 | nullker native 4 262144 398.024000 221 | nullker native 4 524288 784.508000 222 | nullker native 4 1048576 1573.480000 223 | nullker native 4 2097152 3139.734000 224 | nullker native 4 4194304 6275.518000 225 | nullker native 4 8388608 12546.614000 226 | nullker native 4 16777216 25070.691000 227 | nullker native 5 1024 1.670000 228 | nullker native 5 2048 3.293000 229 | nullker native 5 4096 6.543000 230 | 
nullker native 5 8192 12.294000 231 | nullker native 5 16384 24.577000 232 | nullker native 5 32768 48.953000 233 | nullker native 5 65536 97.914000 234 | nullker native 5 131072 195.743000 235 | nullker native 5 262144 392.773000 236 | nullker native 5 524288 783.770000 237 | nullker native 5 1048576 1574.795000 238 | nullker native 5 2097152 3143.471000 239 | nullker native 5 4194304 6282.858000 240 | nullker native 5 8388608 12580.392000 241 | nullker native 5 16777216 25153.583000 242 | nullker native 6 1024 1.916000 243 | nullker native 6 2048 3.872000 244 | nullker native 6 4096 7.634000 245 | nullker native 6 8192 14.544000 246 | nullker native 6 16384 29.210000 247 | nullker native 6 32768 58.203000 248 | nullker native 6 65536 116.366000 249 | nullker native 6 131072 232.485000 250 | nullker native 6 262144 393.469000 251 | nullker native 6 524288 783.498000 252 | nullker native 6 1048576 1571.326000 253 | nullker native 6 2097152 3140.647000 254 | nullker native 6 4194304 6264.444000 255 | nullker native 6 8388608 12527.807000 256 | nullker native 6 16777216 25067.070000 257 | nullker native 7 1024 1.926000 258 | nullker native 7 2048 3.839000 259 | nullker native 7 4096 7.660000 260 | nullker native 7 8192 14.561000 261 | nullker native 7 16384 29.069000 262 | nullker native 7 32768 58.140000 263 | nullker native 7 65536 116.144000 264 | nullker native 7 131072 228.217000 265 | nullker native 7 262144 392.149000 266 | nullker native 7 524288 783.238000 267 | nullker native 7 1048576 1572.047000 268 | nullker native 7 2097152 3140.495000 269 | nullker native 7 4194304 6267.781000 270 | nullker native 7 8388608 12546.230000 271 | nullker native 7 16777216 25100.963000 272 | nullker native 8 1024 1.688000 273 | nullker native 8 2048 3.300000 274 | nullker native 8 4096 6.539000 275 | nullker native 8 8192 12.336000 276 | nullker native 8 16384 24.511000 277 | nullker native 8 32768 48.958000 278 | nullker native 8 65536 97.993000 279 | nullker native 8 
131072 195.946000 280 | nullker native 8 262144 392.690000 281 | nullker native 8 524288 784.694000 282 | nullker native 8 1048576 1573.828000 283 | nullker native 8 2097152 3146.828000 284 | nullker native 8 4194304 6291.588000 285 | nullker native 8 8388608 12585.252000 286 | nullker native 8 16777216 25207.118000 287 | nullker native 9 1024 1.916000 288 | nullker native 9 2048 3.869000 289 | nullker native 9 4096 7.611000 290 | nullker native 9 8192 14.613000 291 | nullker native 9 16384 29.131000 292 | nullker native 9 32768 58.367000 293 | nullker native 9 65536 116.631000 294 | nullker native 9 131072 233.111000 295 | nullker native 9 262144 395.504000 296 | nullker native 9 524288 785.364000 297 | nullker native 9 1048576 1573.954000 298 | nullker native 9 2097152 3150.368000 299 | nullker native 9 4194304 6289.309000 300 | nullker native 9 8388608 12580.401000 301 | nullker native 9 16777216 25135.924000 302 | -------------------------------------------------------------------------------- /scripts/plotters/overhead.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.lines as mlines 3 | import matplotlib.markers as mmarkers 4 | import matplotlib.font_manager 5 | from matplotlib import rcParams 6 | 7 | rcParams['mathtext.fontset'] = 'custom' 8 | 9 | import numpy as np 10 | 11 | import csv 12 | import argparse 13 | import math 14 | 15 | COLOR = ['b', 'g', 'r', 'c', 'm', 'y',] 16 | 17 | def parseargs(): 18 | """ 19 | Manage the program arguments. 
def parseargs():
    """
    Parse the program arguments.

    Returns:
        argparse.Namespace with 'type' (which overhead plot to produce)
        and 'resultfile' (an open handle to the benchmark result file).
    """
    parser = argparse.ArgumentParser(
        description = 'mrCUDA overhead benchmark result plotter'
    )
    parser.add_argument('type',
        choices = ('memsync', 'memsync-bw', 'mhelper-nullker', 'mhelper-memcpybw', 'record-replay',),
        help = 'Overhead type'
    )
    parser.add_argument('resultfile', type = argparse.FileType('r'),
        help = 'Result file (csv)'
    )
    return parser.parse_args()

def read_memsync_input(input_file):
    """
    Read a memsync benchmark result file (space-delimited csv).

    All time values are in ms; all sizes are in B.  Rows whose
    num_regions has an odd base-2 logarithm are dropped to thin the
    data set.  Every surviving row gains the derived fields
    'size_per_region' (B) and 'bw' (MB/s, based on nvidia_time).
    """
    reader = csv.DictReader(input_file, delimiter = ' ')
    result = list()
    for row in reader:
        row['total_size'] = int(row['total_size'])
        row['num_regions'] = int(row['num_regions'])
        # Filter out some results to reduce size
        if math.log(row['num_regions'], 2) % 2 == 1:
            continue
        row['memsync_time'] = float(row['memsync_time'])
        row['rcuda_time'] = float(row['rcuda_time'])
        row['local_time'] = float(row['local_time'])
        row['nvidia_time'] = float(row['nvidia_time'])
        row['other_time'] = float(row['other_time'])
        row['size_per_region'] = float(row['total_size']) / float(row['num_regions'])
        row['bw'] = row['total_size'] / row['nvidia_time'] * (10 ** -3) # MB / s
        result.append(row)
    return result

def plot_memsync(input_data):
    """
    Plot measured vs predicted memsync time, grouped by num_regions.

    The model parameters in 'properties' (bandwidth coefficients and
    per-byte memsync cost) were fitted offline — TODO confirm source.
    """
    properties = {
        'bw_coef': 0.04721 * (10 ** 6), # 1 / s
        'bw_max': 4778.505 * (10 ** 6), # B / s
        'memsync_coef': 5.686 * (10 ** -11), # s / B
        'memsync_const': 0, # s
    }

    # Group measured points by num_regions; compute the model prediction
    # once per (num_regions, size_per_region) pair.
    group_dict = dict()
    predicted_dict = dict()
    for data in input_data:
        if data['num_regions'] not in group_dict:
            group_dict[data['num_regions']] = [list(), list(),]
        group_data = group_dict[data['num_regions']]
        group_data[0].append(data['size_per_region'])
        group_data[1].append(data['local_time'] / 1000)

        if data['num_regions'] not in predicted_dict:
            predicted_dict[data['num_regions']] = dict()
        if data['size_per_region'] not in predicted_dict[data['num_regions']]:
            predicted_dict[data['num_regions']][data['size_per_region']] = data['num_regions'] * (properties['memsync_coef'] * data['size_per_region'] + properties['memsync_const'] + data['size_per_region'] / min(properties['bw_max'], properties['bw_coef'] * data['size_per_region']))

    legend_list = list()
    i = 0
    for num_regions, group_data in sorted(group_dict.items(), key = lambda item: item[0]):
        plt.scatter(group_data[0], group_data[1],
            c = COLOR[i % len(COLOR)],
            marker = 'o' if i < len(COLOR) else '+',
            s = 40
        )
        x, y = zip(*sorted(predicted_dict[num_regions].items(), key = lambda item: item[0]))
        p = plt.plot(x, y, COLOR[i % len(COLOR)], linewidth = 4)
        # Raw string: '\m' is an invalid escape sequence in a plain literal.
        legend_list.append((p[0], r'$\mathbf{2^{%d}}$ regions' % (math.log(num_regions, 2),),))
        i += 1

    p = mlines.Line2D([], [], color = 'black', linewidth = 4)
    legend_list.append((p, 'Predicted',))
    p = mlines.Line2D([], [], color = 'black', marker = 'o', markersize = 16, linestyle = 'None')
    legend_list.append((p, 'Measured',))

    legend_list.reverse()

    # zip() returns a lazy iterator on Python 3; unpack instead of indexing.
    handles, labels = zip(*legend_list)
    plt.legend(handles, labels,
        loc = 'upper left',
        prop = matplotlib.font_manager.FontProperties(size = 30, weight = 'bold')
    )
    plt.xscale('log', basex = 2)
    plt.yscale('log', basey = 10)
    plt.xlim(xmin = 0)
    plt.ylim(ymin = 0)

    plt.xlabel(r'$\mathbf{data\_size_i}$ (B)', size = 40, weight = 'bold')
    plt.ylabel('Time (s)', size = 40, weight = 'bold')

    plt.xticks(size = 35, weight = 'bold')
    plt.yticks(size = 35, weight = 'bold')

    plt.show()

def plot_memsync_bw(input_data):
    """
    Plot measured vs predicted memsync bandwidth against region size.
    """
    properties = {
        'bw_coef': 0.04721 * (10 ** 6), # 1 / s
        'bw_max': 4778.505 * (10 ** 6), # B / s
        'memsync_coef': 5.686 * (10 ** -11), # s / B
        'memsync_const': 0, # s
    }

    measured_data = [(row['size_per_region'], row['bw'],) for row in input_data]
    # Predict the bandwidth (MB/s) for each distinct region size.
    region_sizes = sorted({point[0] for point in measured_data})
    predicted_data = [(size_per_region, min(properties['bw_max'], properties['bw_coef'] * size_per_region) * (10 ** -6),) for size_per_region in region_sizes]

    legend_list = list()
    p = plt.scatter(
        [point[0] for point in measured_data],
        [point[1] for point in measured_data],
        c = COLOR[0],
        marker = 'o',
        s = 40
    )
    legend_list.append((p, 'Measured',))
    x, y = zip(*predicted_data)
    plt.plot(x, y, COLOR[0], linewidth = 4)
    p = mlines.Line2D([], [], color = COLOR[0], linewidth = 4)
    legend_list.append((p, 'Predicted',))

    # zip() returns a lazy iterator on Python 3; unpack instead of indexing.
    handles, labels = zip(*legend_list)
    plt.legend(handles, labels,
        loc = 'upper left',
        prop = matplotlib.font_manager.FontProperties(size = 30, weight = 'bold')
    )
    plt.xscale('log', basex = 2)
    plt.yscale('log', basey = 10)
    plt.xlim(xmin = 0)
    plt.ylim(ymin = 0)

    plt.xlabel('Size per region (B)', size = 30, weight = 'bold')
    plt.ylabel('Bandwidth (MB / s)', size = 30, weight = 'bold')

    plt.xticks(size = 25, weight = 'bold')
    plt.yticks(size = 25, weight = 'bold')

    plt.show()

def read_mhelper_input(input_file):
    """
    Read an mhelper benchmark result file (space-delimited csv).

    All time values are in ms; all sizes are in B.  Depending on the
    benchmark, rows carry either a 'num_calls' or a 'size_per_call'
    column; whichever is present is converted to int.
    """
    reader = csv.DictReader(input_file, delimiter = ' ')
    result = list()
    for row in reader:
        row['count'] = int(row['count'])
        row['time'] = float(row['time'])
        if 'num_calls' in row:
            row['num_calls'] = int(row['num_calls'])
        else:
            row['size_per_call'] = int(row['size_per_call'])
        result.append(row)
    return result

def plot_mhelper_nullker(input_data):
    """
    Plot mhelper null-kernel overhead: measured mrCUDA time minus the
    average native time, against the number of calls, plus the fitted
    linear prediction.
    """
    properties = {
        'coefd': 6.87138 * (10 ** -10), # s
        'coefc': 9.98263 * (10 ** -6), # s
        'const': 0.00293373, # s
    }

    # Bucket run times by num_calls, separately for native and mrCUDA.
    native_data = dict()
    mrcuda_data = dict()
    for data in input_data:
        if data['lib'] == 'native':
            data_dict = native_data
        else:
            data_dict = mrcuda_data
        if data['num_calls'] not in data_dict:
            data_dict[data['num_calls']] = list()
        data_dict[data['num_calls']].append(data['time'])

    x_values = list()
    y_values = list()

    # dict.iterkeys() no longer exists on Python 3; iterate directly.
    for num_calls in native_data:
        avg_time = np.average(native_data[num_calls])
        for time in mrcuda_data[num_calls]:
            x_values.append(num_calls)
            y_values.append((time - avg_time) * (10 ** -3)) # seconds

    legend_list = list()

    p = plt.scatter(
        x_values,
        y_values,
        c = COLOR[0],
        marker = 'o',
        s = 40
    )
    legend_list.append((p, 'Measured',))

    x_values = sorted(set(x_values))
    y_values = [properties['coefc'] * x + properties['const'] for x in x_values]

    plt.plot(x_values, y_values, COLOR[0], linewidth = 4)
    p = mlines.Line2D([], [], color = COLOR[0], linewidth = 4)
    legend_list.append((p, 'Predicted',))

    # zip() returns a lazy iterator on Python 3; unpack instead of indexing.
    handles, labels = zip(*legend_list)
    plt.legend(handles, labels,
        loc = 'upper left',
        prop = matplotlib.font_manager.FontProperties(size = 30, weight = 'bold')
    )
    plt.xscale('log', basex = 2)
    plt.yscale('log', basey = 10)
    plt.xlim(xmin = 0)
    plt.ylim(ymin = 0)

    plt.xlabel('Number of calls', size = 30, weight = 'bold')
    plt.ylabel('Time (s)', size = 30, weight = 'bold')

    plt.xticks(size = 25, weight = 'bold')
    plt.yticks(size = 25, weight = 'bold')

    plt.show()

def plot_mhelper_memcpybw(input_data):
    """
    Plot mhelper cudaMemcpy overhead: measured mrCUDA time minus the
    average native time, against the size per call, plus the fitted
    linear prediction for a fixed number of calls.
    """
    properties = {
        'coefd': 6.87138 * (10 ** -10), # s
        'coefc': 9.98263 * (10 ** -6), # s
        'const': 0.00293373, # s
        'num_calls': 1000,
    }

    # Bucket run times by size_per_call, separately for native and mrCUDA.
    native_data = dict()
    mrcuda_data = dict()
    for data in input_data:
        if data['lib'] == 'native':
            data_dict = native_data
        else:
            data_dict = mrcuda_data
        if data['size_per_call'] not in data_dict:
            data_dict[data['size_per_call']] = list()
        data_dict[data['size_per_call']].append(data['time'])

    x_values = list()
    y_values = list()

    # dict.iterkeys() no longer exists on Python 3; iterate directly.
    for size_per_call in native_data:
        avg_time = np.average(native_data[size_per_call])
        for time in mrcuda_data[size_per_call]:
            x_values.append(size_per_call)
            y_values.append((time - avg_time) * (10 ** -3)) # seconds

    legend_list = list()

    p = plt.scatter(
        x_values,
        y_values,
        c = COLOR[0],
        marker = 'o',
        s = 40
    )
    legend_list.append((p, 'Measured',))

    x_values = sorted(set(x_values))
    y_values = [properties['coefd'] * x * properties['num_calls'] + properties['coefc'] * properties['num_calls'] + properties['const'] for x in x_values]

    plt.plot(x_values, y_values, COLOR[0], linewidth = 4)
    p = mlines.Line2D([], [], color = COLOR[0], linewidth = 4)
    legend_list.append((p, 'Predicted',))

    # zip() returns a lazy iterator on Python 3; unpack instead of indexing.
    handles, labels = zip(*legend_list)
    plt.legend(handles, labels,
        loc = 'upper left',
        prop = matplotlib.font_manager.FontProperties(size = 40, weight = 'bold')
    )
    plt.xscale('log', basex = 2)
    plt.yscale('log', basey = 10)
    plt.xlim(xmin = 0)
    plt.ylim(ymin = 0)

    plt.xlabel('Size per calls (B)', size = 40, weight = 'bold')
    plt.ylabel('Time (s)', size = 40, weight = 'bold')

    plt.xticks(size = 35, weight = 'bold')
    plt.yticks(size = 35, weight = 'bold')

    plt.show()

def read_record_replay_input(input_file):
    """
    Read a record-replay benchmark result file (comma-delimited csv).

    All time values are in s.  Rows with an empty
    'mrcuda_switch num_replay' field are skipped.  The derived
    'mrcuda_replay time' is the switch time minus the memory-sync time.
    """
    reader = csv.DictReader(input_file, delimiter = ',')
    result = list()
    for row in reader:
        if row['mrcuda_switch num_replay']:
            row['mrcuda_record time'] = float(row['mrcuda_record time'])
            row['mrcuda_switch time'] = float(row['mrcuda_switch time'])
            row['mrcuda_sync_mem time'] = float(row['mrcuda_sync_mem time'])
            row['mrcuda_replay time'] = row['mrcuda_switch time'] - row['mrcuda_sync_mem time']
            row['mrcuda_switch num_replay'] = int(row['mrcuda_switch num_replay'])
            result.append(row)
    return result

def plot_record_replay(input_data):
    """
    Plot record and replay overhead (measured and predicted) against the
    number of replayed calls.  Record time goes on the left y-axis (ms),
    replay time on the right y-axis (s).
    """
    properties = {
        'record_coef': 2.825 * (10 ** -7), # s
        'record_const': 0.3437 * (10 ** -3), # s
        'replay_coef': 1.031 * (10 ** -6), # s
        'replay_const': 1.2437, # s
    }

    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    legend_list = list()

    x_values = [row['mrcuda_switch num_replay'] for row in input_data]

    p = ax1.scatter(
        x_values,
        [row['mrcuda_record time'] for row in input_data],
        c = COLOR[0],
        marker = 'o',
        s = 40
    )
    legend_list.append((p, 'Record Overhead (Measured)',))

    p = ax2.scatter(
        x_values,
        [row['mrcuda_replay time'] for row in input_data],
        c = COLOR[1],
        marker = 'o',
        s = 40
    )
    legend_list.append((p, 'Replay Overhead (Measured)',))

    x_values = sorted(set(x_values))

    ax1.plot(
        x_values,
        [properties['record_coef'] * x + properties['record_const'] for x in x_values],
        COLOR[0],
        linewidth = 4
    )
    p = mlines.Line2D([], [], color = COLOR[0], linewidth = 4)
    legend_list.append((p, 'Record Overhead (Predicted)',))

    ax2.plot(
        x_values,
        [properties['replay_coef'] * x + properties['replay_const'] for x in x_values],
        COLOR[1],
        linewidth = 4
    )
    p = mlines.Line2D([], [], color = COLOR[1], linewidth = 4)
    legend_list.append((p, 'Replay Overhead (Predicted)',))

    # zip() returns a lazy iterator on Python 3; unpack instead of indexing.
    handles, labels = zip(*legend_list)
    plt.legend(handles, labels,
        loc = 'lower right',
        prop = matplotlib.font_manager.FontProperties(size = 30, weight = 'bold')
    )
    #plt.xscale('log', basex = 2)
    #plt.yscale('log', basey = 10)
    ax1.set_xlim(xmin = 0)
    ax2.set_xlim(xmin = 0)
    ax1.set_ylim(ymin = 0)
    ax2.set_ylim(ymin = 0)

    ax1.set_xlabel('num_record (x10,000)', size = 30, weight = 'bold')
    ax1.set_ylabel('Record Time (ms)', size = 30, weight = 'bold')
    ax2.set_ylabel('Replay Time (s)', size = 30, weight = 'bold')

    # Relabel the x ticks in units of 10,000 records.  Floor division
    # keeps the Python 2 integer-division semantics on Python 3.
    ax1.set_xticklabels(['%d' % (int(label) // 10000,) for label in ax1.get_xticks().tolist()])

    for label in ax1.get_xticklabels():
        label.set_fontsize(25)
        label.set_fontweight('bold')

    # Relabel the left y ticks in ms (data is in s).
    ax1.set_yticklabels(['%d' % (float(label) * 1000,) for label in ax1.get_yticks().tolist()])

    for label in ax1.get_yticklabels():
        label.set_fontsize(25)
        label.set_fontweight('bold')
    for label in ax2.get_yticklabels():
        label.set_fontsize(25)
        label.set_fontweight('bold')

    plt.show()

def main():
    """
    Main function: parse the arguments and dispatch to the reader and
    plotter matching the requested overhead type.
    """
    args = parseargs()

    if args.type == 'memsync':
        input_data = read_memsync_input(args.resultfile)
        plot_memsync(input_data)
    elif args.type == 'memsync-bw':
        input_data = read_memsync_input(args.resultfile)
        plot_memsync_bw(input_data)
    elif args.type == 'mhelper-nullker':
        input_data = read_mhelper_input(args.resultfile)
        plot_mhelper_nullker(input_data)
    elif args.type == 'mhelper-memcpybw':
        input_data = read_mhelper_input(args.resultfile)
        plot_mhelper_memcpybw(input_data)
    elif args.type == 'record-replay':
        input_data = read_record_replay_input(args.resultfile)
        plot_record_replay(input_data)
397 | """ 398 | args = parseargs() 399 | 400 | if args.type == 'memsync': 401 | input_data = read_memsync_input(args.resultfile) 402 | plot_memsync(input_data) 403 | elif args.type == 'memsync-bw': 404 | input_data = read_memsync_input(args.resultfile) 405 | plot_memsync_bw(input_data) 406 | elif args.type == 'mhelper-nullker': 407 | input_data = read_mhelper_input(args.resultfile) 408 | plot_mhelper_nullker(input_data) 409 | elif args.type == 'mhelper-memcpybw': 410 | input_data = read_mhelper_input(args.resultfile) 411 | plot_mhelper_memcpybw(input_data) 412 | elif args.type == 'record-replay': 413 | input_data = read_record_replay_input(args.resultfile) 414 | plot_record_replay(input_data) 415 | 416 | if __name__ == "__main__": 417 | main() 418 | 419 | -------------------------------------------------------------------------------- /results/memcpybw-mhelper.out: -------------------------------------------------------------------------------- 1 | prog lib count size_per_call time 2 | cudamemcpy mrcuda 0 1024 23.707000 3 | cudamemcpy mrcuda 0 2048 23.586000 4 | cudamemcpy mrcuda 0 4096 24.085000 5 | cudamemcpy mrcuda 0 8192 27.142000 6 | cudamemcpy mrcuda 0 16384 32.360000 7 | cudamemcpy mrcuda 0 32768 43.379000 8 | cudamemcpy mrcuda 0 65536 65.852000 9 | cudamemcpy mrcuda 0 131072 107.484000 10 | cudamemcpy mrcuda 0 262144 190.487000 11 | cudamemcpy mrcuda 0 524288 360.121000 12 | cudamemcpy mrcuda 0 1048576 740.892000 13 | cudamemcpy mrcuda 0 2097152 1502.484000 14 | cudamemcpy mrcuda 0 4194304 3162.210000 15 | cudamemcpy mrcuda 0 8388608 6589.956000 16 | cudamemcpy mrcuda 0 16777216 14099.772000 17 | cudamemcpy mrcuda 0 33554432 28670.813000 18 | cudamemcpy mrcuda 0 67108864 57351.585000 19 | cudamemcpy mrcuda 0 134217728 114402.483000 20 | cudamemcpy mrcuda 0 268435456 228174.487000 21 | cudamemcpy mrcuda 0 536870912 456561.788000 22 | cudamemcpy mrcuda 1 1024 25.535000 23 | cudamemcpy mrcuda 1 2048 24.536000 24 | cudamemcpy mrcuda 1 4096 24.919000 25 | 
cudamemcpy mrcuda 1 8192 28.170000 26 | cudamemcpy mrcuda 1 16384 33.451000 27 | cudamemcpy mrcuda 1 32768 44.960000 28 | cudamemcpy mrcuda 1 65536 67.520000 29 | cudamemcpy mrcuda 1 131072 109.729000 30 | cudamemcpy mrcuda 1 262144 192.256000 31 | cudamemcpy mrcuda 1 524288 363.553000 32 | cudamemcpy mrcuda 1 1048576 774.701000 33 | cudamemcpy mrcuda 1 2097152 1615.745000 34 | cudamemcpy mrcuda 1 4194304 3289.496000 35 | cudamemcpy mrcuda 1 8388608 6818.549000 36 | cudamemcpy mrcuda 1 16777216 14308.595000 37 | cudamemcpy mrcuda 1 33554432 28948.875000 38 | cudamemcpy mrcuda 1 67108864 57716.634000 39 | cudamemcpy mrcuda 1 134217728 114734.186000 40 | cudamemcpy mrcuda 1 268435456 228719.460000 41 | cudamemcpy mrcuda 1 536870912 457770.822000 42 | cudamemcpy mrcuda 2 1024 24.210000 43 | cudamemcpy mrcuda 2 2048 23.541000 44 | cudamemcpy mrcuda 2 4096 23.846000 45 | cudamemcpy mrcuda 2 8192 27.095000 46 | cudamemcpy mrcuda 2 16384 32.419000 47 | cudamemcpy mrcuda 2 32768 43.485000 48 | cudamemcpy mrcuda 2 65536 65.377000 49 | cudamemcpy mrcuda 2 131072 107.146000 50 | cudamemcpy mrcuda 2 262144 190.121000 51 | cudamemcpy mrcuda 2 524288 360.972000 52 | cudamemcpy mrcuda 2 1048576 769.155000 53 | cudamemcpy mrcuda 2 2097152 1610.051000 54 | cudamemcpy mrcuda 2 4194304 3281.864000 55 | cudamemcpy mrcuda 2 8388608 6782.289000 56 | cudamemcpy mrcuda 2 16777216 14296.611000 57 | cudamemcpy mrcuda 2 33554432 28939.121000 58 | cudamemcpy mrcuda 2 67108864 57711.919000 59 | cudamemcpy mrcuda 2 134217728 115116.285000 60 | cudamemcpy mrcuda 2 268435456 229418.731000 61 | cudamemcpy mrcuda 2 536870912 458720.893000 62 | cudamemcpy mrcuda 3 1024 24.432000 63 | cudamemcpy mrcuda 3 2048 23.433000 64 | cudamemcpy mrcuda 3 4096 23.725000 65 | cudamemcpy mrcuda 3 8192 26.918000 66 | cudamemcpy mrcuda 3 16384 32.127000 67 | cudamemcpy mrcuda 3 32768 43.121000 68 | cudamemcpy mrcuda 3 65536 65.396000 69 | cudamemcpy mrcuda 3 131072 107.548000 70 | cudamemcpy mrcuda 3 262144 
190.862000 71 | cudamemcpy mrcuda 3 524288 363.774000 72 | cudamemcpy mrcuda 3 1048576 733.667000 73 | cudamemcpy mrcuda 3 2097152 1561.821000 74 | cudamemcpy mrcuda 3 4194304 3264.504000 75 | cudamemcpy mrcuda 3 8388608 6770.502000 76 | cudamemcpy mrcuda 3 16777216 14221.611000 77 | cudamemcpy mrcuda 3 33554432 28949.865000 78 | cudamemcpy mrcuda 3 67108864 57812.058000 79 | cudamemcpy mrcuda 3 134217728 115184.516000 80 | cudamemcpy mrcuda 3 268435456 229561.263000 81 | cudamemcpy mrcuda 3 536870912 458857.855000 82 | cudamemcpy mrcuda 4 1024 23.803000 83 | cudamemcpy mrcuda 4 2048 23.921000 84 | cudamemcpy mrcuda 4 4096 24.169000 85 | cudamemcpy mrcuda 4 8192 27.698000 86 | cudamemcpy mrcuda 4 16384 32.877000 87 | cudamemcpy mrcuda 4 32768 43.996000 88 | cudamemcpy mrcuda 4 65536 66.135000 89 | cudamemcpy mrcuda 4 131072 107.902000 90 | cudamemcpy mrcuda 4 262144 190.761000 91 | cudamemcpy mrcuda 4 524288 357.221000 92 | cudamemcpy mrcuda 4 1048576 734.587000 93 | cudamemcpy mrcuda 4 2097152 1501.452000 94 | cudamemcpy mrcuda 4 4194304 3157.658000 95 | cudamemcpy mrcuda 4 8388608 6724.800000 96 | cudamemcpy mrcuda 4 16777216 14084.432000 97 | cudamemcpy mrcuda 4 33554432 28745.811000 98 | cudamemcpy mrcuda 4 67108864 57456.041000 99 | cudamemcpy mrcuda 4 134217728 114529.454000 100 | cudamemcpy mrcuda 4 268435456 228555.257000 101 | cudamemcpy mrcuda 4 536870912 456908.974000 102 | cudamemcpy mrcuda 5 1024 24.740000 103 | cudamemcpy mrcuda 5 2048 24.142000 104 | cudamemcpy mrcuda 5 4096 24.591000 105 | cudamemcpy mrcuda 5 8192 27.863000 106 | cudamemcpy mrcuda 5 16384 33.229000 107 | cudamemcpy mrcuda 5 32768 44.054000 108 | cudamemcpy mrcuda 5 65536 66.328000 109 | cudamemcpy mrcuda 5 131072 108.365000 110 | cudamemcpy mrcuda 5 262144 190.897000 111 | cudamemcpy mrcuda 5 524288 362.404000 112 | cudamemcpy mrcuda 5 1048576 768.185000 113 | cudamemcpy mrcuda 5 2097152 1609.308000 114 | cudamemcpy mrcuda 5 4194304 3284.896000 115 | cudamemcpy mrcuda 5 8388608 
6900.502000 116 | cudamemcpy mrcuda 5 16777216 14266.649000 117 | cudamemcpy mrcuda 5 33554432 28960.732000 118 | cudamemcpy mrcuda 5 67108864 57796.792000 119 | cudamemcpy mrcuda 5 134217728 115037.879000 120 | cudamemcpy mrcuda 5 268435456 229401.095000 121 | cudamemcpy mrcuda 5 536870912 458578.661000 122 | cudamemcpy mrcuda 6 1024 23.720000 123 | cudamemcpy mrcuda 6 2048 23.777000 124 | cudamemcpy mrcuda 6 4096 24.075000 125 | cudamemcpy mrcuda 6 8192 27.386000 126 | cudamemcpy mrcuda 6 16384 32.648000 127 | cudamemcpy mrcuda 6 32768 43.408000 128 | cudamemcpy mrcuda 6 65536 65.562000 129 | cudamemcpy mrcuda 6 131072 107.481000 130 | cudamemcpy mrcuda 6 262144 190.448000 131 | cudamemcpy mrcuda 6 524288 361.721000 132 | cudamemcpy mrcuda 6 1048576 775.303000 133 | cudamemcpy mrcuda 6 2097152 1602.592000 134 | cudamemcpy mrcuda 6 4194304 3262.345000 135 | cudamemcpy mrcuda 6 8388608 6864.071000 136 | cudamemcpy mrcuda 6 16777216 14261.377000 137 | cudamemcpy mrcuda 6 33554432 28879.013000 138 | cudamemcpy mrcuda 6 67108864 57695.091000 139 | cudamemcpy mrcuda 6 134217728 114821.480000 140 | cudamemcpy mrcuda 6 268435456 228831.121000 141 | cudamemcpy mrcuda 6 536870912 457399.061000 142 | cudamemcpy mrcuda 7 1024 24.467000 143 | cudamemcpy mrcuda 7 2048 23.916000 144 | cudamemcpy mrcuda 7 4096 24.277000 145 | cudamemcpy mrcuda 7 8192 27.448000 146 | cudamemcpy mrcuda 7 16384 32.568000 147 | cudamemcpy mrcuda 7 32768 43.604000 148 | cudamemcpy mrcuda 7 65536 65.713000 149 | cudamemcpy mrcuda 7 131072 108.453000 150 | cudamemcpy mrcuda 7 262144 191.636000 151 | cudamemcpy mrcuda 7 524288 361.887000 152 | cudamemcpy mrcuda 7 1048576 774.704000 153 | cudamemcpy mrcuda 7 2097152 1610.828000 154 | cudamemcpy mrcuda 7 4194304 3275.267000 155 | cudamemcpy mrcuda 7 8388608 6902.061000 156 | cudamemcpy mrcuda 7 16777216 14316.970000 157 | cudamemcpy mrcuda 7 33554432 29001.885000 158 | cudamemcpy mrcuda 7 67108864 57918.016000 159 | cudamemcpy mrcuda 7 134217728 
115225.240000 160 | cudamemcpy mrcuda 7 268435456 229559.663000 161 | cudamemcpy mrcuda 7 536870912 458412.680000 162 | cudamemcpy mrcuda 8 1024 24.201000 163 | cudamemcpy mrcuda 8 2048 23.573000 164 | cudamemcpy mrcuda 8 4096 24.008000 165 | cudamemcpy mrcuda 8 8192 27.317000 166 | cudamemcpy mrcuda 8 16384 32.523000 167 | cudamemcpy mrcuda 8 32768 43.500000 168 | cudamemcpy mrcuda 8 65536 65.584000 169 | cudamemcpy mrcuda 8 131072 107.949000 170 | cudamemcpy mrcuda 8 262144 190.804000 171 | cudamemcpy mrcuda 8 524288 364.756000 172 | cudamemcpy mrcuda 8 1048576 724.120000 173 | cudamemcpy mrcuda 8 2097152 1550.997000 174 | cudamemcpy mrcuda 8 4194304 3218.520000 175 | cudamemcpy mrcuda 8 8388608 6666.086000 176 | cudamemcpy mrcuda 8 16777216 14107.568000 177 | cudamemcpy mrcuda 8 33554432 28795.421000 178 | cudamemcpy mrcuda 8 67108864 57638.160000 179 | cudamemcpy mrcuda 8 134217728 114926.156000 180 | cudamemcpy mrcuda 8 268435456 229095.012000 181 | cudamemcpy mrcuda 8 536870912 457751.274000 182 | cudamemcpy mrcuda 9 1024 23.594000 183 | cudamemcpy mrcuda 9 2048 24.050000 184 | cudamemcpy mrcuda 9 4096 24.403000 185 | cudamemcpy mrcuda 9 8192 27.678000 186 | cudamemcpy mrcuda 9 16384 32.980000 187 | cudamemcpy mrcuda 9 32768 43.657000 188 | cudamemcpy mrcuda 9 65536 65.738000 189 | cudamemcpy mrcuda 9 131072 107.550000 190 | cudamemcpy mrcuda 9 262144 190.718000 191 | cudamemcpy mrcuda 9 524288 363.023000 192 | cudamemcpy mrcuda 9 1048576 729.539000 193 | cudamemcpy mrcuda 9 2097152 1505.266000 194 | cudamemcpy mrcuda 9 4194304 3158.483000 195 | cudamemcpy mrcuda 9 8388608 6669.139000 196 | cudamemcpy mrcuda 9 16777216 14043.886000 197 | cudamemcpy mrcuda 9 33554432 28737.694000 198 | cudamemcpy mrcuda 9 67108864 57545.285000 199 | cudamemcpy mrcuda 9 134217728 114763.498000 200 | cudamemcpy mrcuda 9 268435456 229028.840000 201 | cudamemcpy mrcuda 9 536870912 457445.702000 202 | cudamemcpy native 0 1024 4.898000 203 | cudamemcpy native 0 2048 5.050000 204 | 
cudamemcpy native 0 4096 5.473000 205 | cudamemcpy native 0 8192 6.790000 206 | cudamemcpy native 0 16384 9.148000 207 | cudamemcpy native 0 32768 13.953000 208 | cudamemcpy native 0 65536 23.594000 209 | cudamemcpy native 0 131072 36.605000 210 | cudamemcpy native 0 262144 66.875000 211 | cudamemcpy native 0 524288 127.410000 212 | cudamemcpy native 0 1048576 249.071000 213 | cudamemcpy native 0 2097152 423.865000 214 | cudamemcpy native 0 4194304 773.866000 215 | cudamemcpy native 0 8388608 1464.670000 216 | cudamemcpy native 0 16777216 2860.749000 217 | cudamemcpy native 0 33554432 5636.800000 218 | cudamemcpy native 0 67108864 11195.433000 219 | cudamemcpy native 0 134217728 22313.544000 220 | cudamemcpy native 0 268435456 44545.160000 221 | cudamemcpy native 0 536870912 88965.398000 222 | cudamemcpy native 1 1024 4.922000 223 | cudamemcpy native 1 2048 5.040000 224 | cudamemcpy native 1 4096 5.501000 225 | cudamemcpy native 1 8192 6.835000 226 | cudamemcpy native 1 16384 9.161000 227 | cudamemcpy native 1 32768 13.996000 228 | cudamemcpy native 1 65536 23.676000 229 | cudamemcpy native 1 131072 36.652000 230 | cudamemcpy native 1 262144 66.976000 231 | cudamemcpy native 1 524288 127.428000 232 | cudamemcpy native 1 1048576 248.992000 233 | cudamemcpy native 1 2097152 423.326000 234 | cudamemcpy native 1 4194304 772.537000 235 | cudamemcpy native 1 8388608 1464.817000 236 | cudamemcpy native 1 16777216 2855.708000 237 | cudamemcpy native 1 33554432 5633.196000 238 | cudamemcpy native 1 67108864 11194.047000 239 | cudamemcpy native 1 134217728 22306.524000 240 | cudamemcpy native 1 268435456 44531.106000 241 | cudamemcpy native 1 536870912 88942.431000 242 | cudamemcpy native 2 1024 4.927000 243 | cudamemcpy native 2 2048 5.030000 244 | cudamemcpy native 2 4096 5.484000 245 | cudamemcpy native 2 8192 6.796000 246 | cudamemcpy native 2 16384 9.124000 247 | cudamemcpy native 2 32768 13.966000 248 | cudamemcpy native 2 65536 23.586000 249 | cudamemcpy native 2 
131072 36.658000 250 | cudamemcpy native 2 262144 66.871000 251 | cudamemcpy native 2 524288 127.386000 252 | cudamemcpy native 2 1048576 276.266000 253 | cudamemcpy native 2 2097152 449.664000 254 | cudamemcpy native 2 4194304 800.574000 255 | cudamemcpy native 2 8388608 1490.734000 256 | cudamemcpy native 2 16777216 2890.811000 257 | cudamemcpy native 2 33554432 5670.958000 258 | cudamemcpy native 2 67108864 11224.960000 259 | cudamemcpy native 2 134217728 22320.847000 260 | cudamemcpy native 2 268435456 44527.648000 261 | cudamemcpy native 2 536870912 88956.920000 262 | cudamemcpy native 3 1024 4.914000 263 | cudamemcpy native 3 2048 5.064000 264 | cudamemcpy native 3 4096 5.478000 265 | cudamemcpy native 3 8192 6.788000 266 | cudamemcpy native 3 16384 9.145000 267 | cudamemcpy native 3 32768 13.995000 268 | cudamemcpy native 3 65536 23.598000 269 | cudamemcpy native 3 131072 36.605000 270 | cudamemcpy native 3 262144 66.904000 271 | cudamemcpy native 3 524288 127.529000 272 | cudamemcpy native 3 1048576 249.258000 273 | cudamemcpy native 3 2097152 423.666000 274 | cudamemcpy native 3 4194304 772.672000 275 | cudamemcpy native 3 8388608 1464.225000 276 | cudamemcpy native 3 16777216 2858.230000 277 | cudamemcpy native 3 33554432 5633.703000 278 | cudamemcpy native 3 67108864 11185.736000 279 | cudamemcpy native 3 134217728 22286.092000 280 | cudamemcpy native 3 268435456 44494.399000 281 | cudamemcpy native 3 536870912 88906.943000 282 | cudamemcpy native 4 1024 4.929000 283 | cudamemcpy native 4 2048 5.058000 284 | cudamemcpy native 4 4096 5.505000 285 | cudamemcpy native 4 8192 6.804000 286 | cudamemcpy native 4 16384 9.168000 287 | cudamemcpy native 4 32768 13.978000 288 | cudamemcpy native 4 65536 23.640000 289 | cudamemcpy native 4 131072 36.665000 290 | cudamemcpy native 4 262144 66.956000 291 | cudamemcpy native 4 524288 127.496000 292 | cudamemcpy native 4 1048576 249.023000 293 | cudamemcpy native 4 2097152 423.554000 294 | cudamemcpy native 4 4194304 
772.551000 295 | cudamemcpy native 4 8388608 1463.784000 296 | cudamemcpy native 4 16777216 2858.779000 297 | cudamemcpy native 4 33554432 5634.908000 298 | cudamemcpy native 4 67108864 11197.449000 299 | cudamemcpy native 4 134217728 22302.144000 300 | cudamemcpy native 4 268435456 44528.614000 301 | cudamemcpy native 4 536870912 88931.180000 302 | cudamemcpy native 5 1024 4.916000 303 | cudamemcpy native 5 2048 5.056000 304 | cudamemcpy native 5 4096 5.495000 305 | cudamemcpy native 5 8192 6.817000 306 | cudamemcpy native 5 16384 9.193000 307 | cudamemcpy native 5 32768 14.084000 308 | cudamemcpy native 5 65536 23.629000 309 | cudamemcpy native 5 131072 36.595000 310 | cudamemcpy native 5 262144 66.844000 311 | cudamemcpy native 5 524288 127.363000 312 | cudamemcpy native 5 1048576 248.870000 313 | cudamemcpy native 5 2097152 423.485000 314 | cudamemcpy native 5 4194304 772.666000 315 | cudamemcpy native 5 8388608 1464.365000 316 | cudamemcpy native 5 16777216 2859.676000 317 | cudamemcpy native 5 33554432 5636.493000 318 | cudamemcpy native 5 67108864 11201.132000 319 | cudamemcpy native 5 134217728 22306.450000 320 | cudamemcpy native 5 268435456 44522.188000 321 | cudamemcpy native 5 536870912 88964.001000 322 | cudamemcpy native 6 1024 4.880000 323 | cudamemcpy native 6 2048 5.054000 324 | cudamemcpy native 6 4096 5.494000 325 | cudamemcpy native 6 8192 6.780000 326 | cudamemcpy native 6 16384 9.165000 327 | cudamemcpy native 6 32768 13.966000 328 | cudamemcpy native 6 65536 23.590000 329 | cudamemcpy native 6 131072 36.656000 330 | cudamemcpy native 6 262144 67.015000 331 | cudamemcpy native 6 524288 127.506000 332 | cudamemcpy native 6 1048576 249.662000 333 | cudamemcpy native 6 2097152 424.632000 334 | cudamemcpy native 6 4194304 773.746000 335 | cudamemcpy native 6 8388608 1468.640000 336 | cudamemcpy native 6 16777216 2864.502000 337 | cudamemcpy native 6 33554432 5649.572000 338 | cudamemcpy native 6 67108864 11222.734000 339 | cudamemcpy native 6 
134217728 22363.116000 340 | cudamemcpy native 6 268435456 44651.063000 341 | cudamemcpy native 6 536870912 89210.271000 342 | cudamemcpy native 7 1024 4.906000 343 | cudamemcpy native 7 2048 5.061000 344 | cudamemcpy native 7 4096 5.507000 345 | cudamemcpy native 7 8192 6.801000 346 | cudamemcpy native 7 16384 9.163000 347 | cudamemcpy native 7 32768 14.023000 348 | cudamemcpy native 7 65536 23.634000 349 | cudamemcpy native 7 131072 36.718000 350 | cudamemcpy native 7 262144 67.052000 351 | cudamemcpy native 7 524288 128.877000 352 | cudamemcpy native 7 1048576 249.558000 353 | cudamemcpy native 7 2097152 424.701000 354 | cudamemcpy native 7 4194304 773.827000 355 | cudamemcpy native 7 8388608 1468.707000 356 | cudamemcpy native 7 16777216 2867.657000 357 | cudamemcpy native 7 33554432 5652.540000 358 | cudamemcpy native 7 67108864 11232.904000 359 | cudamemcpy native 7 134217728 22372.873000 360 | cudamemcpy native 7 268435456 44661.230000 361 | cudamemcpy native 7 536870912 89251.043000 362 | cudamemcpy native 8 1024 4.858000 363 | cudamemcpy native 8 2048 5.027000 364 | cudamemcpy native 8 4096 5.465000 365 | cudamemcpy native 8 8192 6.807000 366 | cudamemcpy native 8 16384 9.141000 367 | cudamemcpy native 8 32768 13.981000 368 | cudamemcpy native 8 65536 23.640000 369 | cudamemcpy native 8 131072 36.645000 370 | cudamemcpy native 8 262144 67.007000 371 | cudamemcpy native 8 524288 127.678000 372 | cudamemcpy native 8 1048576 249.723000 373 | cudamemcpy native 8 2097152 425.068000 374 | cudamemcpy native 8 4194304 775.541000 375 | cudamemcpy native 8 8388608 1469.898000 376 | cudamemcpy native 8 16777216 2869.008000 377 | cudamemcpy native 8 33554432 5658.383000 378 | cudamemcpy native 8 67108864 11239.155000 379 | cudamemcpy native 8 134217728 22381.782000 380 | cudamemcpy native 8 268435456 44677.469000 381 | cudamemcpy native 8 536870912 89271.732000 382 | cudamemcpy native 9 1024 4.917000 383 | cudamemcpy native 9 2048 5.055000 384 | cudamemcpy native 9 
4096 5.491000 385 | cudamemcpy native 9 8192 6.795000 386 | cudamemcpy native 9 16384 9.172000 387 | cudamemcpy native 9 32768 13.993000 388 | cudamemcpy native 9 65536 23.643000 389 | cudamemcpy native 9 131072 36.683000 390 | cudamemcpy native 9 262144 66.824000 391 | cudamemcpy native 9 524288 127.519000 392 | cudamemcpy native 9 1048576 276.583000 393 | cudamemcpy native 9 2097152 450.542000 394 | cudamemcpy native 9 4194304 799.578000 395 | cudamemcpy native 9 8388608 1491.421000 396 | cudamemcpy native 9 16777216 2892.257000 397 | cudamemcpy native 9 33554432 5671.093000 398 | cudamemcpy native 9 67108864 11226.631000 399 | cudamemcpy native 9 134217728 22329.188000 400 | cudamemcpy native 9 268435456 44524.088000 401 | cudamemcpy native 9 536870912 88925.813000 402 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installation Instructions 2 | ************************* 3 | 4 | Copyright (C) 1994-1996, 1999-2002, 2004-2013 Free Software Foundation, 5 | Inc. 6 | 7 | Copying and distribution of this file, with or without modification, 8 | are permitted in any medium without royalty provided the copyright 9 | notice and this notice are preserved. This file is offered as-is, 10 | without warranty of any kind. 11 | 12 | Basic Installation 13 | ================== 14 | 15 | Briefly, the shell commands `./configure; make; make install' should 16 | configure, build, and install this package. The following 17 | more-detailed instructions are generic; see the `README' file for 18 | instructions specific to this package. Some packages provide this 19 | `INSTALL' file but do not implement all of the features documented 20 | below. The lack of an optional feature in a given package is not 21 | necessarily a bug. More recommendations for GNU packages can be found 22 | in *note Makefile Conventions: (standards)Makefile Conventions. 
23 | 24 | The `configure' shell script attempts to guess correct values for 25 | various system-dependent variables used during compilation. It uses 26 | those values to create a `Makefile' in each directory of the package. 27 | It may also create one or more `.h' files containing system-dependent 28 | definitions. Finally, it creates a shell script `config.status' that 29 | you can run in the future to recreate the current configuration, and a 30 | file `config.log' containing compiler output (useful mainly for 31 | debugging `configure'). 32 | 33 | It can also use an optional file (typically called `config.cache' 34 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 35 | the results of its tests to speed up reconfiguring. Caching is 36 | disabled by default to prevent problems with accidental use of stale 37 | cache files. 38 | 39 | If you need to do unusual things to compile the package, please try 40 | to figure out how `configure' could check whether to do them, and mail 41 | diffs or instructions to the address given in the `README' so they can 42 | be considered for the next release. If you are using the cache, and at 43 | some point `config.cache' contains results you don't want to keep, you 44 | may remove or edit it. 45 | 46 | The file `configure.ac' (or `configure.in') is used to create 47 | `configure' by a program called `autoconf'. You need `configure.ac' if 48 | you want to change it or regenerate `configure' using a newer version 49 | of `autoconf'. 50 | 51 | The simplest way to compile this package is: 52 | 53 | 1. `cd' to the directory containing the package's source code and type 54 | `./configure' to configure the package for your system. 55 | 56 | Running `configure' might take a while. While running, it prints 57 | some messages telling which features it is checking for. 58 | 59 | 2. Type `make' to compile the package. 60 | 61 | 3. 
Optionally, type `make check' to run any self-tests that come with 62 | the package, generally using the just-built uninstalled binaries. 63 | 64 | 4. Type `make install' to install the programs and any data files and 65 | documentation. When installing into a prefix owned by root, it is 66 | recommended that the package be configured and built as a regular 67 | user, and only the `make install' phase executed with root 68 | privileges. 69 | 70 | 5. Optionally, type `make installcheck' to repeat any self-tests, but 71 | this time using the binaries in their final installed location. 72 | This target does not install anything. Running this target as a 73 | regular user, particularly if the prior `make install' required 74 | root privileges, verifies that the installation completed 75 | correctly. 76 | 77 | 6. You can remove the program binaries and object files from the 78 | source code directory by typing `make clean'. To also remove the 79 | files that `configure' created (so you can compile the package for 80 | a different kind of computer), type `make distclean'. There is 81 | also a `make maintainer-clean' target, but that is intended mainly 82 | for the package's developers. If you use it, you may have to get 83 | all sorts of other programs in order to regenerate files that came 84 | with the distribution. 85 | 86 | 7. Often, you can also type `make uninstall' to remove the installed 87 | files again. In practice, not all packages have tested that 88 | uninstallation works correctly, even though it is required by the 89 | GNU Coding Standards. 90 | 91 | 8. Some packages, particularly those that use Automake, provide `make 92 | distcheck', which can by used by developers to test that all other 93 | targets like `make install' and `make uninstall' work correctly. 94 | This target is generally not run by end users. 
95 | 96 | Compilers and Options 97 | ===================== 98 | 99 | Some systems require unusual options for compilation or linking that 100 | the `configure' script does not know about. Run `./configure --help' 101 | for details on some of the pertinent environment variables. 102 | 103 | You can give `configure' initial values for configuration parameters 104 | by setting variables in the command line or in the environment. Here 105 | is an example: 106 | 107 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 108 | 109 | *Note Defining Variables::, for more details. 110 | 111 | Compiling For Multiple Architectures 112 | ==================================== 113 | 114 | You can compile the package for more than one kind of computer at the 115 | same time, by placing the object files for each architecture in their 116 | own directory. To do this, you can use GNU `make'. `cd' to the 117 | directory where you want the object files and executables to go and run 118 | the `configure' script. `configure' automatically checks for the 119 | source code in the directory that `configure' is in and in `..'. This 120 | is known as a "VPATH" build. 121 | 122 | With a non-GNU `make', it is safer to compile the package for one 123 | architecture at a time in the source code directory. After you have 124 | installed the package for one architecture, use `make distclean' before 125 | reconfiguring for another architecture. 126 | 127 | On MacOS X 10.5 and later systems, you can create libraries and 128 | executables that work on multiple system types--known as "fat" or 129 | "universal" binaries--by specifying multiple `-arch' options to the 130 | compiler but only a single `-arch' option to the preprocessor. 
Like 131 | this: 132 | 133 | ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 134 | CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 135 | CPP="gcc -E" CXXCPP="g++ -E" 136 | 137 | This is not guaranteed to produce working output in all cases, you 138 | may have to build one architecture at a time and combine the results 139 | using the `lipo' tool if you have problems. 140 | 141 | Installation Names 142 | ================== 143 | 144 | By default, `make install' installs the package's commands under 145 | `/usr/local/bin', include files under `/usr/local/include', etc. You 146 | can specify an installation prefix other than `/usr/local' by giving 147 | `configure' the option `--prefix=PREFIX', where PREFIX must be an 148 | absolute file name. 149 | 150 | You can specify separate installation prefixes for 151 | architecture-specific files and architecture-independent files. If you 152 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 153 | PREFIX as the prefix for installing programs and libraries. 154 | Documentation and other data files still use the regular prefix. 155 | 156 | In addition, if you use an unusual directory layout you can give 157 | options like `--bindir=DIR' to specify different values for particular 158 | kinds of files. Run `configure --help' for a list of the directories 159 | you can set and what kinds of files go in them. In general, the 160 | default for these options is expressed in terms of `${prefix}', so that 161 | specifying just `--prefix' will affect all of the other directory 162 | specifications that were not explicitly provided. 163 | 164 | The most portable way to affect installation locations is to pass the 165 | correct locations to `configure'; however, many packages provide one or 166 | both of the following shortcuts of passing variable assignments to the 167 | `make install' command line to change installation locations without 168 | having to reconfigure or recompile. 
169 | 170 | The first method involves providing an override variable for each 171 | affected directory. For example, `make install 172 | prefix=/alternate/directory' will choose an alternate location for all 173 | directory configuration variables that were expressed in terms of 174 | `${prefix}'. Any directories that were specified during `configure', 175 | but not in terms of `${prefix}', must each be overridden at install 176 | time for the entire installation to be relocated. The approach of 177 | makefile variable overrides for each directory variable is required by 178 | the GNU Coding Standards, and ideally causes no recompilation. 179 | However, some platforms have known limitations with the semantics of 180 | shared libraries that end up requiring recompilation when using this 181 | method, particularly noticeable in packages that use GNU Libtool. 182 | 183 | The second method involves providing the `DESTDIR' variable. For 184 | example, `make install DESTDIR=/alternate/directory' will prepend 185 | `/alternate/directory' before all installation names. The approach of 186 | `DESTDIR' overrides is not required by the GNU Coding Standards, and 187 | does not work on platforms that have drive letters. On the other hand, 188 | it does better at avoiding recompilation issues, and works well even 189 | when some directory options were not specified in terms of `${prefix}' 190 | at `configure' time. 191 | 192 | Optional Features 193 | ================= 194 | 195 | If the package supports it, you can cause programs to be installed 196 | with an extra prefix or suffix on their names by giving `configure' the 197 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 198 | 199 | Some packages pay attention to `--enable-FEATURE' options to 200 | `configure', where FEATURE indicates an optional part of the package. 201 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 202 | is something like `gnu-as' or `x' (for the X Window System). 
The 203 | `README' should mention any `--enable-' and `--with-' options that the 204 | package recognizes. 205 | 206 | For packages that use the X Window System, `configure' can usually 207 | find the X include and library files automatically, but if it doesn't, 208 | you can use the `configure' options `--x-includes=DIR' and 209 | `--x-libraries=DIR' to specify their locations. 210 | 211 | Some packages offer the ability to configure how verbose the 212 | execution of `make' will be. For these packages, running `./configure 213 | --enable-silent-rules' sets the default to minimal output, which can be 214 | overridden with `make V=1'; while running `./configure 215 | --disable-silent-rules' sets the default to verbose, which can be 216 | overridden with `make V=0'. 217 | 218 | Particular systems 219 | ================== 220 | 221 | On HP-UX, the default C compiler is not ANSI C compatible. If GNU 222 | CC is not installed, it is recommended to use the following options in 223 | order to use an ANSI C compiler: 224 | 225 | ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" 226 | 227 | and if that doesn't work, install pre-built binaries of GCC for HP-UX. 228 | 229 | HP-UX `make' updates targets which have the same time stamps as 230 | their prerequisites, which makes it generally unusable when shipped 231 | generated files such as `configure' are involved. Use GNU `make' 232 | instead. 233 | 234 | On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot 235 | parse its `<wchar.h>' header file. The option `-nodtk' can be used as 236 | a workaround. If GNU CC is not installed, it is therefore recommended 237 | to try 238 | 239 | ./configure CC="cc" 240 | 241 | and if that doesn't work, try 242 | 243 | ./configure CC="cc -nodtk" 244 | 245 | On Solaris, don't put `/usr/ucb' early in your `PATH'. This 246 | directory contains several dysfunctional programs; working variants of 247 | these programs are available in `/usr/bin'. 
So, if you need `/usr/ucb' 248 | in your `PATH', put it _after_ `/usr/bin'. 249 | 250 | On Haiku, software installed for all users goes in `/boot/common', 251 | not `/usr/local'. It is recommended to use the following options: 252 | 253 | ./configure --prefix=/boot/common 254 | 255 | Specifying the System Type 256 | ========================== 257 | 258 | There may be some features `configure' cannot figure out 259 | automatically, but needs to determine by the type of machine the package 260 | will run on. Usually, assuming the package is built to be run on the 261 | _same_ architectures, `configure' can figure that out, but if it prints 262 | a message saying it cannot guess the machine type, give it the 263 | `--build=TYPE' option. TYPE can either be a short name for the system 264 | type, such as `sun4', or a canonical name which has the form: 265 | 266 | CPU-COMPANY-SYSTEM 267 | 268 | where SYSTEM can have one of these forms: 269 | 270 | OS 271 | KERNEL-OS 272 | 273 | See the file `config.sub' for the possible values of each field. If 274 | `config.sub' isn't included in this package, then this package doesn't 275 | need to know the machine type. 276 | 277 | If you are _building_ compiler tools for cross-compiling, you should 278 | use the option `--target=TYPE' to select the type of system they will 279 | produce code for. 280 | 281 | If you want to _use_ a cross compiler, that generates code for a 282 | platform different from the build platform, you should specify the 283 | "host" platform (i.e., that on which the generated programs will 284 | eventually be run) with `--host=TYPE'. 285 | 286 | Sharing Defaults 287 | ================ 288 | 289 | If you want to set default values for `configure' scripts to share, 290 | you can create a site shell script called `config.site' that gives 291 | default values for variables like `CC', `cache_file', and `prefix'. 
292 | `configure' looks for `PREFIX/share/config.site' if it exists, then 293 | `PREFIX/etc/config.site' if it exists. Or, you can set the 294 | `CONFIG_SITE' environment variable to the location of the site script. 295 | A warning: not all `configure' scripts look for a site script. 296 | 297 | Defining Variables 298 | ================== 299 | 300 | Variables not defined in a site shell script can be set in the 301 | environment passed to `configure'. However, some packages may run 302 | configure again during the build, and the customized values of these 303 | variables may be lost. In order to avoid this problem, you should set 304 | them in the `configure' command line, using `VAR=value'. For example: 305 | 306 | ./configure CC=/usr/local2/bin/gcc 307 | 308 | causes the specified `gcc' to be used as the C compiler (unless it is 309 | overridden in the site shell script). 310 | 311 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 312 | an Autoconf limitation. Until the limitation is lifted, you can use 313 | this workaround: 314 | 315 | CONFIG_SHELL=/bin/bash ./configure CONFIG_SHELL=/bin/bash 316 | 317 | `configure' Invocation 318 | ====================== 319 | 320 | `configure' recognizes the following options to control how it 321 | operates. 322 | 323 | `--help' 324 | `-h' 325 | Print a summary of all of the options to `configure', and exit. 326 | 327 | `--help=short' 328 | `--help=recursive' 329 | Print a summary of the options unique to this package's 330 | `configure', and exit. The `short' variant lists options used 331 | only in the top level, while the `recursive' variant lists options 332 | also present in any nested packages. 333 | 334 | `--version' 335 | `-V' 336 | Print the version of Autoconf used to generate the `configure' 337 | script, and exit. 338 | 339 | `--cache-file=FILE' 340 | Enable the cache: use and save the results of the tests in FILE, 341 | traditionally `config.cache'. 
FILE defaults to `/dev/null' to 342 | disable caching. 343 | 344 | `--config-cache' 345 | `-C' 346 | Alias for `--cache-file=config.cache'. 347 | 348 | `--quiet' 349 | `--silent' 350 | `-q' 351 | Do not print messages saying which checks are being made. To 352 | suppress all normal output, redirect it to `/dev/null' (any error 353 | messages will still be shown). 354 | 355 | `--srcdir=DIR' 356 | Look for the package's source code in directory DIR. Usually 357 | `configure' can determine that directory automatically. 358 | 359 | `--prefix=DIR' 360 | Use DIR as the installation prefix. *note Installation Names:: 361 | for more details, including other options available for fine-tuning 362 | the installation locations. 363 | 364 | `--no-create' 365 | `-n' 366 | Run the configure checks, but stop before creating any output 367 | files. 368 | 369 | `configure' also accepts some other, not widely useful, options. Run 370 | `configure --help' for more details. 371 | -------------------------------------------------------------------------------- /src/interface.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "mrcuda.h" 3 | #include "datatypes.h" 4 | #include "record.h" 5 | #include "intercomm_interface.h" 6 | 7 | /** 8 | * Interface of __cudaRegisterFatBinary. 9 | */ 10 | void** __cudaRegisterFatBinary(void* fatCubin) 11 | { 12 | void **ret; 13 | MRCUDAGPU_t *gpu; 14 | mrcuda_init(); 15 | gpu = mrcuda_get_current_gpu(); 16 | mrcuda_function_call_lock(gpu); 17 | ret = gpu->defaultHandler->__mrcudaRegisterFatBinary(fatCubin); 18 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 19 | mrcuda_record_cudaRegisterFatBinary(gpu, fatCubin, ret); 20 | mrcuda_function_call_release(gpu); 21 | return ret; 22 | } 23 | 24 | /** 25 | * Interface of __cudaRegisterFunction. 
26 | */ 27 | void __cudaRegisterFunction( 28 | void **fatCubinHandle, 29 | const char *hostFun, 30 | char *deviceFun, 31 | const char *deviceName, 32 | int thread_limit, 33 | uint3 *tid, 34 | uint3 *bid, 35 | dim3 *bDim, 36 | dim3 *gDim, 37 | int *wSize 38 | ) 39 | { 40 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 41 | mrcuda_function_call_lock(gpu); 42 | gpu->defaultHandler->__mrcudaRegisterFunction( 43 | fatCubinHandle, 44 | hostFun, 45 | deviceFun, 46 | deviceName, 47 | thread_limit, 48 | tid, 49 | bid, 50 | bDim, 51 | gDim, 52 | wSize 53 | ); 54 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 55 | mrcuda_record_cudaRegisterFunction( 56 | gpu, 57 | fatCubinHandle, 58 | hostFun, 59 | deviceFun, 60 | deviceName, 61 | thread_limit, 62 | tid, 63 | bid, 64 | bDim, 65 | gDim, 66 | wSize 67 | ); 68 | mrcuda_function_call_release(gpu); 69 | } 70 | 71 | /** 72 | * Interface of __cudaRegisterVar. 73 | */ 74 | void __cudaRegisterVar( 75 | void **fatCubinHandle, 76 | char *hostVar, 77 | char *deviceAddress, 78 | const char *deviceName, 79 | int ext, 80 | int size, 81 | int constant, 82 | int global 83 | ) 84 | { 85 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 86 | mrcuda_function_call_lock(gpu); 87 | gpu->defaultHandler->__mrcudaRegisterVar( 88 | fatCubinHandle, 89 | hostVar, 90 | deviceAddress, 91 | deviceName, 92 | ext, 93 | size, 94 | constant, 95 | global 96 | ); 97 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 98 | mrcuda_record_cudaRegisterVar( 99 | gpu, 100 | fatCubinHandle, 101 | hostVar, 102 | deviceAddress, 103 | deviceName, 104 | ext, 105 | size, 106 | constant, 107 | global 108 | ); 109 | mrcuda_function_call_release(gpu); 110 | } 111 | 112 | /** 113 | * Interface of __cudaRegisterTexture. 
114 | */ 115 | void __cudaRegisterTexture( 116 | void **fatCubinHandle, 117 | const struct textureReference *hostVar, 118 | const void **deviceAddress, 119 | const char *deviceName, 120 | int dim, 121 | int norm, 122 | int ext 123 | ) 124 | { 125 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 126 | mrcuda_function_call_lock(gpu); 127 | gpu->defaultHandler->__mrcudaRegisterTexture( 128 | fatCubinHandle, 129 | hostVar, 130 | deviceAddress, 131 | deviceName, 132 | dim, 133 | norm, 134 | ext 135 | ); 136 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 137 | mrcuda_record_cudaRegisterTexture( 138 | gpu, 139 | fatCubinHandle, 140 | hostVar, 141 | deviceAddress, 142 | deviceName, 143 | dim, 144 | norm, 145 | ext 146 | ); 147 | mrcuda_function_call_release(gpu); 148 | } 149 | 150 | /** 151 | * Interface of __cudaUnregisterFatBinary. 152 | */ 153 | void __cudaUnregisterFatBinary(void **fatCubinHandle) 154 | { 155 | /*mrcuda_function_call_lock(); 156 | mrcudaSymDefault->__mrcudaUnregisterFatBinary( 157 | fatCubinHandle 158 | ); 159 | if(mrcudaSymDefault == mrcudaSymRCUDA) 160 | mrcuda_record_cudaUnregisterFatBinary(fatCubinHandle); 161 | mrcuda_function_call_release();*/ 162 | } 163 | 164 | /** 165 | * Interface of cudaThreadSynchronize. 166 | */ 167 | extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) 168 | { 169 | // cudaThreadSynchronize eventually calls cudaDeviceSynchronize. 170 | // Thus, locking cannot be done here since it will cause dead-lock. 171 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 172 | cudaError_t ret; 173 | ret = gpu->defaultHandler->mrcudaThreadSynchronize(); 174 | return ret; 175 | } 176 | 177 | /** 178 | * Interface of cudaLaunch. 
179 | */ 180 | extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) 181 | { 182 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 183 | cudaError_t ret; 184 | mrcuda_function_call_lock(gpu); 185 | if (gpu->status == MRCUDA_GPU_STATUS_HELPER) 186 | ret = mhelper_int_cudaLaunch_internal(gpu, func); 187 | else 188 | ret = gpu->defaultHandler->mrcudaLaunch(func); 189 | gpu->cudaLaunchCount++; 190 | mrcuda_function_call_release(gpu); 191 | if (gpu->switchThreshold == gpu->cudaLaunchCount) 192 | mrcuda_switch(gpu, gpu->virtualNumber); 193 | return ret; 194 | } 195 | 196 | /** 197 | * Interface of cudaMemcpyToSymbol. 198 | */ 199 | extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol( 200 | const void *symbol, 201 | const void *src, 202 | size_t count, 203 | size_t offset, 204 | enum cudaMemcpyKind kind 205 | ) 206 | { 207 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 208 | cudaError_t ret; 209 | mrcuda_function_call_lock(gpu); 210 | ret = gpu->defaultHandler->mrcudaMemcpyToSymbol(symbol, src, count, offset, kind); 211 | mrcuda_function_call_release(gpu); 212 | return ret; 213 | } 214 | 215 | /** 216 | * Interface of cudaMemcpy. 217 | */ 218 | extern __host__ cudaError_t CUDARTAPI cudaMemcpy( 219 | void *dst, 220 | const void *src, 221 | size_t count, 222 | enum cudaMemcpyKind kind 223 | ) 224 | { 225 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 226 | cudaError_t ret; 227 | mrcuda_function_call_lock(gpu); 228 | ret = gpu->defaultHandler->mrcudaMemcpy(dst, src, count, kind); 229 | mrcuda_function_call_release(gpu); 230 | return ret; 231 | } 232 | 233 | /** 234 | * Interface of cudaHostAlloc. 
235 | */ 236 | extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags) 237 | { 238 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 239 | cudaError_t ret; 240 | //mrcuda_function_call_lock(gpu); 241 | ret = gpu->defaultHandler->mrcudaHostAlloc(pHost, size, flags); 242 | if (!gpu->nativeFromStart) 243 | // This function has to be recorded regardless we are using rCUDA or not. 244 | // This ensures that we calls cudaFreeHost using the right library (rCUDA or native). 245 | // However, GPUs that are running natively from the start don't need to be recorded. 246 | mrcuda_record_cudaHostAlloc(gpu, pHost, size, flags); 247 | //mrcuda_function_call_release(gpu); 248 | return ret; 249 | } 250 | 251 | /** 252 | * Interface of cudaMemset. 253 | */ 254 | extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count) 255 | { 256 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 257 | cudaError_t ret; 258 | mrcuda_function_call_lock(gpu); 259 | ret = gpu->defaultHandler->mrcudaMemset(devPtr, value, count); 260 | mrcuda_function_call_release(gpu); 261 | return ret; 262 | } 263 | 264 | /** 265 | * Interface of cudaFreeHost. 266 | */ 267 | extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) 268 | { 269 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 270 | cudaError_t ret; 271 | //mrcuda_function_call_lock(gpu); 272 | if (!gpu->nativeFromStart) 273 | // Call the right library of cudaFreeHost according to the recorded cudaHostAlloc calls. 274 | mrcuda_replay_cudaFreeHost(gpu, ptr)->mrcudaFreeHost(ptr); 275 | else 276 | gpu->defaultHandler->mrcudaFreeHost(ptr); 277 | //mrcuda_function_call_release(gpu); 278 | return ret; 279 | } 280 | 281 | /** 282 | * Interface of cudaSetupArgument. 
283 | */ 284 | extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) 285 | { 286 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 287 | cudaError_t ret; 288 | mrcuda_function_call_lock(gpu); 289 | ret = gpu->defaultHandler->mrcudaSetupArgument(arg, size, offset); 290 | mrcuda_function_call_release(gpu); 291 | return ret; 292 | } 293 | 294 | /** 295 | * Interface of cudaMalloc. 296 | */ 297 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) 298 | { 299 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 300 | cudaError_t ret; 301 | mrcuda_function_call_lock(gpu); 302 | ret = gpu->defaultHandler->mrcudaMalloc(devPtr, size); 303 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 304 | mrcuda_record_cudaMalloc(gpu, devPtr, size); 305 | mrcuda_function_call_release(gpu); 306 | return ret; 307 | } 308 | 309 | /** 310 | * Interface of cudaFree. 311 | */ 312 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr) 313 | { 314 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 315 | cudaError_t ret; 316 | mrcuda_function_call_lock(gpu); 317 | ret = gpu->defaultHandler->mrcudaFree(devPtr); 318 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 319 | mrcuda_record_cudaFree(gpu, devPtr); 320 | mrcuda_function_call_release(gpu); 321 | return ret; 322 | } 323 | 324 | /** 325 | * Interface of cudaConfigureCall. 326 | */ 327 | extern __host__ cudaError_t CUDARTAPI cudaConfigureCall( 328 | dim3 gridDim, 329 | dim3 blockDim, 330 | size_t sharedMem, 331 | cudaStream_t stream 332 | ) 333 | { 334 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 335 | cudaError_t ret; 336 | mrcuda_function_call_lock(gpu); 337 | ret = gpu->defaultHandler->mrcudaConfigureCall(gridDim, blockDim, sharedMem, stream); 338 | mrcuda_function_call_release(gpu); 339 | return ret; 340 | } 341 | 342 | /** 343 | * Interface of cudaGetLastError. 
344 | */ 345 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void) 346 | { 347 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 348 | cudaError_t ret; 349 | mrcuda_function_call_lock(gpu); 350 | ret = gpu->defaultHandler->mrcudaGetLastError(); 351 | mrcuda_function_call_release(gpu); 352 | return ret; 353 | } 354 | 355 | /** 356 | * Interface of cudaBindTexture. 357 | */ 358 | extern __host__ cudaError_t CUDARTAPI cudaBindTexture( 359 | size_t *offset, 360 | const struct textureReference *texref, 361 | const void *devPtr, 362 | const struct cudaChannelFormatDesc *desc, 363 | size_t size 364 | ) 365 | { 366 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 367 | cudaError_t ret; 368 | mrcuda_function_call_lock(gpu); 369 | ret = gpu->defaultHandler->mrcudaBindTexture(offset, texref, devPtr, desc, size); 370 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 371 | mrcuda_record_cudaBindTexture(gpu, offset, texref, devPtr, desc, size); 372 | mrcuda_function_call_release(gpu); 373 | return ret; 374 | } 375 | 376 | /** 377 | * Interface of cudaCreateChannelDesc. 378 | */ 379 | extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc( 380 | int x, 381 | int y, 382 | int z, 383 | int w, 384 | enum cudaChannelFormatKind f 385 | ) 386 | { 387 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 388 | struct cudaChannelFormatDesc ret; 389 | mrcuda_function_call_lock(gpu); 390 | ret = gpu->defaultHandler->mrcudaCreateChannelDesc(x, y, z, w, f); 391 | mrcuda_function_call_release(gpu); 392 | return ret; 393 | } 394 | 395 | /** 396 | * Interface of cudaGetDeviceProperties. 
397 | */ 398 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties( 399 | struct cudaDeviceProp *prop, 400 | int device 401 | ) 402 | { 403 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 404 | cudaError_t ret; 405 | mrcuda_function_call_lock(gpu); 406 | ret = gpu->defaultHandler->mrcudaGetDeviceProperties(prop, device); 407 | mrcuda_function_call_release(gpu); 408 | return ret; 409 | } 410 | 411 | /** 412 | * Interface of cudaStreamCreate. 413 | */ 414 | extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) 415 | { 416 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 417 | cudaError_t ret; 418 | mrcuda_function_call_lock(gpu); 419 | ret = gpu->defaultHandler->mrcudaStreamCreate(pStream); 420 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 421 | mrcuda_record_cudaStreamCreate(gpu, pStream); 422 | mrcuda_function_call_release(gpu); 423 | return ret; 424 | } 425 | 426 | /** 427 | * Interface of cudaMemGetInfo. 428 | */ 429 | extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total) 430 | { 431 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 432 | cudaError_t ret; 433 | mrcuda_function_call_lock(gpu); 434 | ret = gpu->defaultHandler->mrcudaMemGetInfo(free, total); 435 | mrcuda_function_call_release(gpu); 436 | return ret; 437 | } 438 | 439 | /** 440 | * Interface of cudaSetDevice. 441 | */ 442 | extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) 443 | { 444 | MRCUDAGPU_t *gpu = &(mrcudaGPUList[device]); 445 | cudaError_t ret; 446 | mrcuda_function_call_lock(gpu); 447 | mrcuda_set_current_gpu(device); 448 | mrcuda_function_call_release(gpu); 449 | ret = gpu->defaultHandler->mrcudaSetDevice(device); 450 | return ret; 451 | } 452 | 453 | /** 454 | * Interface of cudaSetDeviceFlags. 
455 | */ 456 | extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) 457 | { 458 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 459 | cudaError_t ret; 460 | mrcuda_function_call_lock(gpu); 461 | ret = gpu->defaultHandler->mrcudaSetDeviceFlags(flags); 462 | if (gpu->status == MRCUDA_GPU_STATUS_RCUDA) 463 | mrcuda_record_cudaSetDeviceFlags(gpu, flags); 464 | mrcuda_function_call_release(gpu); 465 | return ret; 466 | } 467 | 468 | /** 469 | * Interface of cudaGetDevice. 470 | */ 471 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device) 472 | { 473 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 474 | cudaError_t ret; 475 | mrcuda_function_call_lock(gpu); 476 | ret = gpu->defaultHandler->mrcudaGetDevice(device); 477 | mrcuda_function_call_release(gpu); 478 | return ret; 479 | } 480 | 481 | /** 482 | * Interface of cudaGetDeviceCount. 483 | */ 484 | extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count) 485 | { 486 | *count = mrcudaNumGPUs; 487 | return cudaSuccess; 488 | } 489 | 490 | extern __host__ __cudart_builtin__ cudaError_t cudaDeviceSynchronize(void) 491 | { 492 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 493 | cudaError_t ret; 494 | mrcuda_function_call_lock(gpu); 495 | ret = gpu->defaultHandler->mrcudaDeviceSynchronize(); 496 | mrcuda_function_call_release(gpu); 497 | return ret; 498 | } 499 | 500 | /** 501 | * Interface of cudaDeviceReset. 502 | */ 503 | extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) 504 | { 505 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 506 | cudaError_t ret; 507 | mrcuda_function_call_lock(gpu); 508 | ret = gpu->defaultHandler->mrcudaDeviceReset(); 509 | mrcuda_function_call_release(gpu); 510 | return ret; 511 | } 512 | 513 | /** 514 | * Interface of cudaEventCreate. 
515 | */ 516 | extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) 517 | { 518 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 519 | cudaError_t ret; 520 | mrcuda_function_call_lock(gpu); 521 | ret = gpu->defaultHandler->mrcudaEventCreate(event); 522 | mrcuda_function_call_release(gpu); 523 | return ret; 524 | } 525 | 526 | /** 527 | * Interface of cudaEventRecord. 528 | */ 529 | extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream) 530 | { 531 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 532 | cudaError_t ret; 533 | mrcuda_function_call_lock(gpu); 534 | ret = gpu->defaultHandler->mrcudaEventRecord(event, stream); 535 | mrcuda_function_call_release(gpu); 536 | return ret; 537 | } 538 | 539 | /** 540 | * Interface of cudaEventSynchronize. 541 | */ 542 | extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) 543 | { 544 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 545 | cudaError_t ret; 546 | mrcuda_function_call_lock(gpu); 547 | ret = gpu->defaultHandler->mrcudaEventSynchronize(event); 548 | mrcuda_function_call_release(gpu); 549 | return ret; 550 | } 551 | 552 | /** 553 | * Interface of cudaEventElapsedTime. 554 | */ 555 | extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) 556 | { 557 | MRCUDAGPU_t *gpu = mrcuda_get_current_gpu(); 558 | cudaError_t ret; 559 | mrcuda_function_call_lock(gpu); 560 | ret = gpu->defaultHandler->mrcudaEventElapsedTime(ms, start, end); 561 | mrcuda_function_call_release(gpu); 562 | return ret; 563 | } 564 | -------------------------------------------------------------------------------- /scripts/Makefile.in: -------------------------------------------------------------------------------- 1 | # Makefile.in generated by automake 1.13.4 from Makefile.am. 2 | # @configure_input@ 3 | 4 | # Copyright (C) 1994-2013 Free Software Foundation, Inc. 
5 | 6 | # This Makefile.in is free software; the Free Software Foundation 7 | # gives unlimited permission to copy and/or distribute it, 8 | # with or without modifications, as long as this notice is preserved. 9 | 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY, to the extent permitted by law; without 12 | # even the implied warranty of MERCHANTABILITY or FITNESS FOR A 13 | # PARTICULAR PURPOSE. 14 | 15 | @SET_MAKE@ 16 | 17 | VPATH = @srcdir@ 18 | am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' 19 | am__make_running_with_option = \ 20 | case $${target_option-} in \ 21 | ?) ;; \ 22 | *) echo "am__make_running_with_option: internal error: invalid" \ 23 | "target option '$${target_option-}' specified" >&2; \ 24 | exit 1;; \ 25 | esac; \ 26 | has_opt=no; \ 27 | sane_makeflags=$$MAKEFLAGS; \ 28 | if $(am__is_gnu_make); then \ 29 | sane_makeflags=$$MFLAGS; \ 30 | else \ 31 | case $$MAKEFLAGS in \ 32 | *\\[\ \ ]*) \ 33 | bs=\\; \ 34 | sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ 35 | | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ 36 | esac; \ 37 | fi; \ 38 | skip_next=no; \ 39 | strip_trailopt () \ 40 | { \ 41 | flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ 42 | }; \ 43 | for flg in $$sane_makeflags; do \ 44 | test $$skip_next = yes && { skip_next=no; continue; }; \ 45 | case $$flg in \ 46 | *=*|--*) continue;; \ 47 | -*I) strip_trailopt 'I'; skip_next=yes;; \ 48 | -*I?*) strip_trailopt 'I';; \ 49 | -*O) strip_trailopt 'O'; skip_next=yes;; \ 50 | -*O?*) strip_trailopt 'O';; \ 51 | -*l) strip_trailopt 'l'; skip_next=yes;; \ 52 | -*l?*) strip_trailopt 'l';; \ 53 | -[dEDm]) skip_next=yes;; \ 54 | -[JT]) skip_next=yes;; \ 55 | esac; \ 56 | case $$flg in \ 57 | *$$target_option*) has_opt=yes; break;; \ 58 | esac; \ 59 | done; \ 60 | test $$has_opt = yes 61 | am__make_dryrun = (target_option=n; $(am__make_running_with_option)) 62 | am__make_keepgoing = (target_option=k; 
$(am__make_running_with_option)) 63 | pkgdatadir = $(datadir)/@PACKAGE@ 64 | pkgincludedir = $(includedir)/@PACKAGE@ 65 | pkglibdir = $(libdir)/@PACKAGE@ 66 | pkglibexecdir = $(libexecdir)/@PACKAGE@ 67 | am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd 68 | install_sh_DATA = $(install_sh) -c -m 644 69 | install_sh_PROGRAM = $(install_sh) -c 70 | install_sh_SCRIPT = $(install_sh) -c 71 | INSTALL_HEADER = $(INSTALL_DATA) 72 | transform = $(program_transform_name) 73 | NORMAL_INSTALL = : 74 | PRE_INSTALL = : 75 | POST_INSTALL = : 76 | NORMAL_UNINSTALL = : 77 | PRE_UNINSTALL = : 78 | POST_UNINSTALL = : 79 | build_triplet = @build@ 80 | host_triplet = @host@ 81 | bin_PROGRAMS = mrcudaexec$(EXEEXT) 82 | subdir = scripts 83 | DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am 84 | ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 85 | am__aclocal_m4_deps = $(top_srcdir)/build-aux/libtool.m4 \ 86 | $(top_srcdir)/build-aux/ltoptions.m4 \ 87 | $(top_srcdir)/build-aux/ltsugar.m4 \ 88 | $(top_srcdir)/build-aux/ltversion.m4 \ 89 | $(top_srcdir)/build-aux/lt~obsolete.m4 \ 90 | $(top_srcdir)/build-aux/pkg.m4 $(top_srcdir)/configure.ac 91 | am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ 92 | $(ACLOCAL_M4) 93 | mkinstalldirs = $(install_sh) -d 94 | CONFIG_HEADER = $(top_builddir)/config.h 95 | CONFIG_CLEAN_FILES = 96 | CONFIG_CLEAN_VPATH_FILES = 97 | am__installdirs = "$(DESTDIR)$(bindir)" 98 | PROGRAMS = $(bin_PROGRAMS) 99 | am_mrcudaexec_OBJECTS = 100 | mrcudaexec_OBJECTS = $(am_mrcudaexec_OBJECTS) 101 | mrcudaexec_LDADD = $(LDADD) 102 | AM_V_lt = $(am__v_lt_@AM_V@) 103 | am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) 104 | am__v_lt_0 = --silent 105 | am__v_lt_1 = 106 | AM_V_P = $(am__v_P_@AM_V@) 107 | am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) 108 | am__v_P_0 = false 109 | am__v_P_1 = : 110 | AM_V_GEN = $(am__v_GEN_@AM_V@) 111 | am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) 112 | am__v_GEN_0 = @echo " GEN " $@; 113 | am__v_GEN_1 = 114 | AM_V_at = $(am__v_at_@AM_V@) 
115 | am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) 116 | am__v_at_0 = @ 117 | am__v_at_1 = 118 | DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) 119 | COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ 120 | $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) 121 | LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ 122 | $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ 123 | $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ 124 | $(AM_CFLAGS) $(CFLAGS) 125 | AM_V_CC = $(am__v_CC_@AM_V@) 126 | am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) 127 | am__v_CC_0 = @echo " CC " $@; 128 | am__v_CC_1 = 129 | CCLD = $(CC) 130 | LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ 131 | $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ 132 | $(AM_LDFLAGS) $(LDFLAGS) -o $@ 133 | AM_V_CCLD = $(am__v_CCLD_@AM_V@) 134 | am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) 135 | am__v_CCLD_0 = @echo " CCLD " $@; 136 | am__v_CCLD_1 = 137 | SOURCES = $(mrcudaexec_SOURCES) 138 | DIST_SOURCES = $(mrcudaexec_SOURCES) 139 | am__can_run_installinfo = \ 140 | case $$AM_UPDATE_INFO_DIR in \ 141 | n|no|NO) false;; \ 142 | *) (install-info --version) >/dev/null 2>&1;; \ 143 | esac 144 | am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) 145 | # Read a list of newline-separated strings from the standard input, 146 | # and print each of them once, without duplicates. Input order is 147 | # *not* preserved. 148 | am__uniquify_input = $(AWK) '\ 149 | BEGIN { nonempty = 0; } \ 150 | { items[$$0] = 1; nonempty = 1; } \ 151 | END { if (nonempty) { for (i in items) print i; }; } \ 152 | ' 153 | # Make sure the list of sources is unique. This is necessary because, 154 | # e.g., the same source file might be shared among _SOURCES variables 155 | # for different programs/libraries. 
156 | am__define_uniq_tagged_files = \ 157 | list='$(am__tagged_files)'; \ 158 | unique=`for i in $$list; do \ 159 | if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ 160 | done | $(am__uniquify_input)` 161 | ETAGS = etags 162 | CTAGS = ctags 163 | DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) 164 | ACLOCAL = @ACLOCAL@ 165 | AMTAR = @AMTAR@ 166 | AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ 167 | AR = @AR@ 168 | AUTOCONF = @AUTOCONF@ 169 | AUTOHEADER = @AUTOHEADER@ 170 | AUTOMAKE = @AUTOMAKE@ 171 | AWK = @AWK@ 172 | CC = @CC@ 173 | CCDEPMODE = @CCDEPMODE@ 174 | CFLAGS = @CFLAGS@ 175 | CHECK_CFLAGS = @CHECK_CFLAGS@ 176 | CHECK_LIBS = @CHECK_LIBS@ 177 | CPP = @CPP@ 178 | CPPFLAGS = @CPPFLAGS@ 179 | CYGPATH_W = @CYGPATH_W@ 180 | DEFS = @DEFS@ 181 | DEPDIR = @DEPDIR@ 182 | DEPS_CFLAGS = @DEPS_CFLAGS@ 183 | DEPS_LIBS = @DEPS_LIBS@ 184 | DLLTOOL = @DLLTOOL@ 185 | DSYMUTIL = @DSYMUTIL@ 186 | DUMPBIN = @DUMPBIN@ 187 | ECHO_C = @ECHO_C@ 188 | ECHO_N = @ECHO_N@ 189 | ECHO_T = @ECHO_T@ 190 | EGREP = @EGREP@ 191 | EXEEXT = @EXEEXT@ 192 | FGREP = @FGREP@ 193 | GREP = @GREP@ 194 | INSTALL = @INSTALL@ 195 | INSTALL_DATA = @INSTALL_DATA@ 196 | INSTALL_PROGRAM = @INSTALL_PROGRAM@ 197 | INSTALL_SCRIPT = @INSTALL_SCRIPT@ 198 | INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ 199 | LD = @LD@ 200 | LDFLAGS = @LDFLAGS@ 201 | LIBOBJS = @LIBOBJS@ 202 | LIBS = @LIBS@ 203 | LIBTOOL = @LIBTOOL@ 204 | LIPO = @LIPO@ 205 | LN_S = @LN_S@ 206 | LTLIBOBJS = @LTLIBOBJS@ 207 | MAKEINFO = @MAKEINFO@ 208 | MANIFEST_TOOL = @MANIFEST_TOOL@ 209 | MKDIR_P = @MKDIR_P@ 210 | NM = @NM@ 211 | NMEDIT = @NMEDIT@ 212 | NVCC = @NVCC@ 213 | NVIDIA_LIBCUDART = @NVIDIA_LIBCUDART@ 214 | OBJDUMP = @OBJDUMP@ 215 | OBJEXT = @OBJEXT@ 216 | OTOOL = @OTOOL@ 217 | OTOOL64 = @OTOOL64@ 218 | PACKAGE = @PACKAGE@ 219 | PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ 220 | PACKAGE_NAME = @PACKAGE_NAME@ 221 | PACKAGE_STRING = @PACKAGE_STRING@ 222 | PACKAGE_TARNAME = @PACKAGE_TARNAME@ 223 | PACKAGE_URL 
= @PACKAGE_URL@ 224 | PACKAGE_VERSION = @PACKAGE_VERSION@ 225 | PATH_SEPARATOR = @PATH_SEPARATOR@ 226 | PKG_CONFIG = @PKG_CONFIG@ 227 | PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ 228 | PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ 229 | PYTHON = @PYTHON@ 230 | PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ 231 | PYTHON_PLATFORM = @PYTHON_PLATFORM@ 232 | PYTHON_PREFIX = @PYTHON_PREFIX@ 233 | PYTHON_VERSION = @PYTHON_VERSION@ 234 | RANLIB = @RANLIB@ 235 | RCUDA_LIBCUDART = @RCUDA_LIBCUDART@ 236 | RCUDA_RCUDACOMMIB = @RCUDA_RCUDACOMMIB@ 237 | RCUDA_RCUDACOMMTCP = @RCUDA_RCUDACOMMTCP@ 238 | SED = @SED@ 239 | SET_MAKE = @SET_MAKE@ 240 | SHELL = @SHELL@ 241 | STRIP = @STRIP@ 242 | VERSION = @VERSION@ 243 | abs_builddir = @abs_builddir@ 244 | abs_srcdir = @abs_srcdir@ 245 | abs_top_builddir = @abs_top_builddir@ 246 | abs_top_srcdir = @abs_top_srcdir@ 247 | ac_ct_AR = @ac_ct_AR@ 248 | ac_ct_CC = @ac_ct_CC@ 249 | ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ 250 | am__include = @am__include@ 251 | am__leading_dot = @am__leading_dot@ 252 | am__quote = @am__quote@ 253 | am__tar = @am__tar@ 254 | am__untar = @am__untar@ 255 | bindir = @bindir@ 256 | build = @build@ 257 | build_alias = @build_alias@ 258 | build_cpu = @build_cpu@ 259 | build_os = @build_os@ 260 | build_vendor = @build_vendor@ 261 | builddir = @builddir@ 262 | datadir = @datadir@ 263 | datarootdir = @datarootdir@ 264 | docdir = @docdir@ 265 | dvidir = @dvidir@ 266 | exec_prefix = @exec_prefix@ 267 | host = @host@ 268 | host_alias = @host_alias@ 269 | host_cpu = @host_cpu@ 270 | host_os = @host_os@ 271 | host_vendor = @host_vendor@ 272 | htmldir = @htmldir@ 273 | includedir = @includedir@ 274 | infodir = @infodir@ 275 | install_sh = @install_sh@ 276 | libdir = @libdir@ 277 | libexecdir = @libexecdir@ 278 | localedir = @localedir@ 279 | localstatedir = @localstatedir@ 280 | mandir = @mandir@ 281 | mkdir_p = @mkdir_p@ 282 | oldincludedir = @oldincludedir@ 283 | pdfdir = @pdfdir@ 284 | pkgpyexecdir = @pkgpyexecdir@ 285 | pkgpythondir = 
@pkgpythondir@ 286 | prefix = @prefix@ 287 | program_transform_name = @program_transform_name@ 288 | psdir = @psdir@ 289 | pyexecdir = @pyexecdir@ 290 | pythondir = @pythondir@ 291 | sbindir = @sbindir@ 292 | sharedstatedir = @sharedstatedir@ 293 | srcdir = @srcdir@ 294 | sysconfdir = @sysconfdir@ 295 | target_alias = @target_alias@ 296 | top_build_prefix = @top_build_prefix@ 297 | top_builddir = @top_builddir@ 298 | top_srcdir = @top_srcdir@ 299 | mrcudaexec_SOURCES = mrcudaexec.py.template 300 | all: all-am 301 | 302 | .SUFFIXES: 303 | $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) 304 | @for dep in $?; do \ 305 | case '$(am__configure_deps)' in \ 306 | *$$dep*) \ 307 | ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ 308 | && { if test -f $@; then exit 0; else break; fi; }; \ 309 | exit 1;; \ 310 | esac; \ 311 | done; \ 312 | echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu scripts/Makefile'; \ 313 | $(am__cd) $(top_srcdir) && \ 314 | $(AUTOMAKE) --gnu scripts/Makefile 315 | .PRECIOUS: Makefile 316 | Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status 317 | @case '$?' 
in \ 318 | *config.status*) \ 319 | cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ 320 | *) \ 321 | echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ 322 | cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ 323 | esac; 324 | 325 | $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) 326 | cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh 327 | 328 | $(top_srcdir)/configure: $(am__configure_deps) 329 | cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh 330 | $(ACLOCAL_M4): $(am__aclocal_m4_deps) 331 | cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh 332 | $(am__aclocal_m4_deps): 333 | install-binPROGRAMS: $(bin_PROGRAMS) 334 | @$(NORMAL_INSTALL) 335 | @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ 336 | if test -n "$$list"; then \ 337 | echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ 338 | $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ 339 | fi; \ 340 | for p in $$list; do echo "$$p $$p"; done | \ 341 | sed 's/$(EXEEXT)$$//' | \ 342 | while read p p1; do if test -f $$p \ 343 | || test -f $$p1 \ 344 | ; then echo "$$p"; echo "$$p"; else :; fi; \ 345 | done | \ 346 | sed -e 'p;s,.*/,,;n;h' \ 347 | -e 's|.*|.|' \ 348 | -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ 349 | sed 'N;N;N;s,\n, ,g' | \ 350 | $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ 351 | { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ 352 | if ($$2 == $$4) files[d] = files[d] " " $$1; \ 353 | else { print "f", $$3 "/" $$4, $$1; } } \ 354 | END { for (d in files) print "f", d, files[d] }' | \ 355 | while read type dir files; do \ 356 | if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ 357 | test -z "$$files" || { \ 358 | echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ 359 | $(INSTALL_PROGRAM_ENV) $(LIBTOOL) 
$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ 360 | } \ 361 | ; done 362 | 363 | uninstall-binPROGRAMS: 364 | @$(NORMAL_UNINSTALL) 365 | @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ 366 | files=`for p in $$list; do echo "$$p"; done | \ 367 | sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ 368 | -e 's/$$/$(EXEEXT)/' \ 369 | `; \ 370 | test -n "$$list" || exit 0; \ 371 | echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ 372 | cd "$(DESTDIR)$(bindir)" && rm -f $$files 373 | 374 | clean-binPROGRAMS: 375 | @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ 376 | echo " rm -f" $$list; \ 377 | rm -f $$list || exit $$?; \ 378 | test -n "$(EXEEXT)" || exit 0; \ 379 | list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ 380 | echo " rm -f" $$list; \ 381 | rm -f $$list 382 | 383 | mostlyclean-compile: 384 | -rm -f *.$(OBJEXT) 385 | 386 | distclean-compile: 387 | -rm -f *.tab.c 388 | 389 | mostlyclean-libtool: 390 | -rm -f *.lo 391 | 392 | clean-libtool: 393 | -rm -rf .libs _libs 394 | 395 | ID: $(am__tagged_files) 396 | $(am__define_uniq_tagged_files); mkid -fID $$unique 397 | tags: tags-am 398 | TAGS: tags 399 | 400 | tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) 401 | set x; \ 402 | here=`pwd`; \ 403 | $(am__define_uniq_tagged_files); \ 404 | shift; \ 405 | if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ 406 | test -n "$$unique" || unique=$$empty_fix; \ 407 | if test $$# -gt 0; then \ 408 | $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ 409 | "$$@" $$unique; \ 410 | else \ 411 | $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ 412 | $$unique; \ 413 | fi; \ 414 | fi 415 | ctags: ctags-am 416 | 417 | CTAGS: ctags 418 | ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) 419 | $(am__define_uniq_tagged_files); \ 420 | test -z "$(CTAGS_ARGS)$$unique" \ 421 | || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ 422 | $$unique 423 | 
424 | GTAGS: 425 | here=`$(am__cd) $(top_builddir) && pwd` \ 426 | && $(am__cd) $(top_srcdir) \ 427 | && gtags -i $(GTAGS_ARGS) "$$here" 428 | cscopelist: cscopelist-am 429 | 430 | cscopelist-am: $(am__tagged_files) 431 | list='$(am__tagged_files)'; \ 432 | case "$(srcdir)" in \ 433 | [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ 434 | *) sdir=$(subdir)/$(srcdir) ;; \ 435 | esac; \ 436 | for i in $$list; do \ 437 | if test -f "$$i"; then \ 438 | echo "$(subdir)/$$i"; \ 439 | else \ 440 | echo "$$sdir/$$i"; \ 441 | fi; \ 442 | done >> $(top_builddir)/cscope.files 443 | 444 | distclean-tags: 445 | -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags 446 | 447 | distdir: $(DISTFILES) 448 | @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ 449 | topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ 450 | list='$(DISTFILES)'; \ 451 | dist_files=`for file in $$list; do echo $$file; done | \ 452 | sed -e "s|^$$srcdirstrip/||;t" \ 453 | -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ 454 | case $$dist_files in \ 455 | */*) $(MKDIR_P) `echo "$$dist_files" | \ 456 | sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ 457 | sort -u` ;; \ 458 | esac; \ 459 | for file in $$dist_files; do \ 460 | if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ 461 | if test -d $$d/$$file; then \ 462 | dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ 463 | if test -d "$(distdir)/$$file"; then \ 464 | find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ 465 | fi; \ 466 | if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ 467 | cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ 468 | find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ 469 | fi; \ 470 | cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ 471 | else \ 472 | test -f "$(distdir)/$$file" \ 473 | || cp -p $$d/$$file "$(distdir)/$$file" \ 474 | || exit 1; \ 475 | fi; \ 476 | done 477 | check-am: all-am 478 | check: check-am 479 | all-am: Makefile $(PROGRAMS) 480 | installdirs: 481 | for dir in "$(DESTDIR)$(bindir)"; do \ 482 | test -z "$$dir" || $(MKDIR_P) "$$dir"; \ 483 | done 484 | install: install-am 485 | install-exec: install-exec-am 486 | install-data: install-data-am 487 | uninstall: uninstall-am 488 | 489 | install-am: all-am 490 | @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am 491 | 492 | installcheck: installcheck-am 493 | install-strip: 494 | if test -z '$(STRIP)'; then \ 495 | $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ 496 | install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ 497 | install; \ 498 | else \ 499 | $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ 500 | install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ 501 | "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ 502 | fi 503 | mostlyclean-generic: 504 | 505 | clean-generic: 506 | 507 | distclean-generic: 508 | -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) 509 | -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) 510 | 511 | maintainer-clean-generic: 512 | @echo "This command is intended for maintainers to use" 513 | @echo "it deletes files that may require special tools to rebuild." 
514 | clean: clean-am 515 | 516 | clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am 517 | 518 | distclean: distclean-am 519 | -rm -f Makefile 520 | distclean-am: clean-am distclean-compile distclean-generic \ 521 | distclean-tags 522 | 523 | dvi: dvi-am 524 | 525 | dvi-am: 526 | 527 | html: html-am 528 | 529 | html-am: 530 | 531 | info: info-am 532 | 533 | info-am: 534 | 535 | install-data-am: 536 | 537 | install-dvi: install-dvi-am 538 | 539 | install-dvi-am: 540 | 541 | install-exec-am: install-binPROGRAMS 542 | 543 | install-html: install-html-am 544 | 545 | install-html-am: 546 | 547 | install-info: install-info-am 548 | 549 | install-info-am: 550 | 551 | install-man: 552 | 553 | install-pdf: install-pdf-am 554 | 555 | install-pdf-am: 556 | 557 | install-ps: install-ps-am 558 | 559 | install-ps-am: 560 | 561 | installcheck-am: 562 | 563 | maintainer-clean: maintainer-clean-am 564 | -rm -f Makefile 565 | maintainer-clean-am: distclean-am maintainer-clean-generic 566 | 567 | mostlyclean: mostlyclean-am 568 | 569 | mostlyclean-am: mostlyclean-compile mostlyclean-generic \ 570 | mostlyclean-libtool 571 | 572 | pdf: pdf-am 573 | 574 | pdf-am: 575 | 576 | ps: ps-am 577 | 578 | ps-am: 579 | 580 | uninstall-am: uninstall-binPROGRAMS 581 | 582 | .MAKE: install-am install-strip 583 | 584 | .PHONY: CTAGS GTAGS TAGS all all-am check check-am clean \ 585 | clean-binPROGRAMS clean-generic clean-libtool cscopelist-am \ 586 | ctags ctags-am distclean distclean-compile distclean-generic \ 587 | distclean-libtool distclean-tags distdir dvi dvi-am html \ 588 | html-am info info-am install install-am install-binPROGRAMS \ 589 | install-data install-data-am install-dvi install-dvi-am \ 590 | install-exec install-exec-am install-html install-html-am \ 591 | install-info install-info-am install-man install-pdf \ 592 | install-pdf-am install-ps install-ps-am install-strip \ 593 | installcheck installcheck-am installdirs maintainer-clean \ 594 | 
maintainer-clean-generic mostlyclean mostlyclean-compile \ 595 | mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ 596 | tags tags-am uninstall uninstall-am uninstall-binPROGRAMS 597 | 598 | 599 | mrcudaexec$(EXEEXT): mrcudaexec.py.template 600 | cp $< mrcudaexec$(EXEEXT) 601 | ${SED} -i -- 's/{{ RCUDA_LIBCUDART }}/$(shell echo "${RCUDA_LIBCUDART}" | ${SED} -e 's/\//\\\//g')/g' mrcudaexec$(EXEEXT) 602 | ${SED} -i -- 's/{{ NVIDIA_LIBCUDART }}/$(shell echo "${NVIDIA_LIBCUDART}" | ${SED} -e 's/\//\\\//g')/g' mrcudaexec$(EXEEXT) 603 | ${SED} -i -- 's/{{ MRCUDA_LIBPATH }}/$(shell echo "${libdir}" | ${SED} -e 's/\//\\\//g')/g' mrcudaexec$(EXEEXT) 604 | 605 | # Tell versions [3.59,3.63) of GNU make to not export all variables. 606 | # Otherwise a system limit (for SysV at least) may be exceeded. 607 | .NOEXPORT: 608 | -------------------------------------------------------------------------------- /src/datatypes.h: -------------------------------------------------------------------------------- 1 | #ifndef __MRCUDA_DATATYPES__HEADER__ 2 | #define __MRCUDA_DATATYPES__HEADER__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /* Pre-declared Structs */ 13 | typedef struct MHelperProcess_t MHelperProcess_t; 14 | typedef struct MRCUDAGPU_t MRCUDAGPU_t; 15 | 16 | /* Struct of CUDA symbolic pointers */ 17 | typedef struct MRCUDASym_t 18 | { 19 | union { 20 | void *symHandler; 21 | MHelperProcess_t *processHandler; 22 | } handler; 23 | 24 | cudaError_t (*mrcudaDeviceReset)(void); 25 | cudaError_t (*mrcudaDeviceSynchronize)(void); 26 | cudaError_t (*mrcudaDeviceSetLimit)(enum cudaLimit limit, size_t value); 27 | cudaError_t (*mrcudaDeviceGetLimit)(size_t *pValue, enum cudaLimit limit); 28 | cudaError_t (*mrcudaDeviceGetCacheConfig)(enum cudaFuncCache *pCacheConfig); 29 | cudaError_t (*mrcudaDeviceSetCacheConfig)(enum cudaFuncCache cacheConfig); 30 | cudaError_t (*mrcudaDeviceGetSharedMemConfig)(enum 
cudaSharedMemConfig *pConfig); 31 | cudaError_t (*mrcudaDeviceSetSharedMemConfig)(enum cudaSharedMemConfig config); 32 | cudaError_t (*mrcudaDeviceGetByPCIBusId)(int *device, char *pciBusId); 33 | cudaError_t (*mrcudaDeviceGetPCIBusId)(char *pciBusId, int len, int device); 34 | cudaError_t (*mrcudaIpcGetEventHandle)(cudaIpcEventHandle_t *handle, cudaEvent_t event); 35 | cudaError_t (*mrcudaIpcOpenEventHandle)(cudaEvent_t *event, cudaIpcEventHandle_t handle); 36 | cudaError_t (*mrcudaIpcGetMemHandle)(cudaIpcMemHandle_t *handle, void *devPtr); 37 | cudaError_t (*mrcudaIpcOpenMemHandle)(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags); 38 | cudaError_t (*mrcudaIpcCloseMemHandle)(void *devPtr); 39 | cudaError_t (*mrcudaThreadExit)(void); 40 | cudaError_t (*mrcudaThreadSynchronize)(void); 41 | cudaError_t (*mrcudaThreadSetLimit)(enum cudaLimit limit, size_t value); 42 | cudaError_t (*mrcudaThreadGetLimit)(size_t *pValue, enum cudaLimit limit); 43 | cudaError_t (*mrcudaThreadGetCacheConfig)(enum cudaFuncCache *pCacheConfig); 44 | cudaError_t (*mrcudaThreadSetCacheConfig)(enum cudaFuncCache cacheConfig); 45 | cudaError_t (*mrcudaGetLastError)(void); 46 | cudaError_t (*mrcudaPeekAtLastError)(void); 47 | const char* (*mrcudaGetErrorString)(cudaError_t error); 48 | cudaError_t (*mrcudaGetDeviceCount)(int *count); 49 | cudaError_t (*mrcudaGetDeviceProperties)(struct cudaDeviceProp *prop, int device); 50 | cudaError_t (*mrcudaDeviceGetAttribute)(int *value, enum cudaDeviceAttr attr, int device); 51 | cudaError_t (*mrcudaChooseDevice)(int *device, const struct cudaDeviceProp *prop); 52 | cudaError_t (*mrcudaSetDevice)(int device); 53 | cudaError_t (*mrcudaGetDevice)(int *device); 54 | cudaError_t (*mrcudaSetValidDevices)(int *device_arr, int len); 55 | cudaError_t (*mrcudaSetDeviceFlags)( unsigned int flags ); 56 | cudaError_t (*mrcudaStreamCreate)(cudaStream_t *pStream); 57 | cudaError_t (*mrcudaStreamCreateWithFlags)(cudaStream_t *pStream, unsigned int flags); 58 
| cudaError_t (*mrcudaStreamDestroy)(cudaStream_t stream); 59 | cudaError_t (*mrcudaStreamWaitEvent)(cudaStream_t stream, cudaEvent_t event, unsigned int flags); 60 | cudaError_t (*mrcudaStreamAddCallback)(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags); 61 | cudaError_t (*mrcudaStreamSynchronize)(cudaStream_t stream); 62 | cudaError_t (*mrcudaStreamQuery)(cudaStream_t stream); 63 | cudaError_t (*mrcudaEventCreate)(cudaEvent_t *event); 64 | cudaError_t (*mrcudaEventCreateWithFlags)(cudaEvent_t *event, unsigned int flags); 65 | cudaError_t (*mrcudaEventRecord)(cudaEvent_t event, cudaStream_t stream ); 66 | cudaError_t (*mrcudaEventQuery)(cudaEvent_t event); 67 | cudaError_t (*mrcudaEventSynchronize)(cudaEvent_t event); 68 | cudaError_t (*mrcudaEventDestroy)(cudaEvent_t event); 69 | cudaError_t (*mrcudaEventElapsedTime)(float *ms, cudaEvent_t start, cudaEvent_t end); 70 | cudaError_t (*mrcudaConfigureCall)(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream ); 71 | cudaError_t (*mrcudaSetupArgument)(const void *arg, size_t size, size_t offset); 72 | cudaError_t (*mrcudaFuncSetCacheConfig)(const void *func, enum cudaFuncCache cacheConfig); 73 | cudaError_t (*mrcudaFuncSetSharedMemConfig)(const void *func, enum cudaSharedMemConfig config); 74 | cudaError_t (*mrcudaLaunch)(const void *func); 75 | cudaError_t (*mrcudaFuncGetAttributes)(struct cudaFuncAttributes *attr, const void *func); 76 | cudaError_t (*mrcudaSetDoubleForDevice)(double *d); 77 | cudaError_t (*mrcudaSetDoubleForHost)(double *d); 78 | cudaError_t (*mrcudaMalloc)(void **devPtr, size_t size); 79 | cudaError_t (*mrcudaMallocHost)(void **ptr, size_t size); 80 | cudaError_t (*mrcudaMallocPitch)(void **devPtr, size_t *pitch, size_t width, size_t height); 81 | cudaError_t (*mrcudaMallocArray)(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height , unsigned int flags ); 82 | cudaError_t (*mrcudaFree)(void *devPtr); 83 | 
cudaError_t (*mrcudaFreeHost)(void *ptr); 84 | cudaError_t (*mrcudaFreeArray)(cudaArray_t array); 85 | cudaError_t (*mrcudaFreeMipmappedArray)(cudaMipmappedArray_t mipmappedArray); 86 | cudaError_t (*mrcudaHostAlloc)(void **pHost, size_t size, unsigned int flags); 87 | cudaError_t (*mrcudaHostRegister)(void *ptr, size_t size, unsigned int flags); 88 | cudaError_t (*mrcudaHostUnregister)(void *ptr); 89 | cudaError_t (*mrcudaHostGetDevicePointer)(void **pDevice, void *pHost, unsigned int flags); 90 | cudaError_t (*mrcudaHostGetFlags)(unsigned int *pFlags, void *pHost); 91 | cudaError_t (*mrcudaMalloc3D)(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent); 92 | cudaError_t (*mrcudaMalloc3DArray)(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags ); 93 | cudaError_t (*mrcudaMallocMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags ); 94 | cudaError_t (*mrcudaGetMipmappedArrayLevel)(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level); 95 | cudaError_t (*mrcudaMemcpy3D)(const struct cudaMemcpy3DParms *p); 96 | cudaError_t (*mrcudaMemcpy3DPeer)(const struct cudaMemcpy3DPeerParms *p); 97 | cudaError_t (*mrcudaMemcpy3DAsync)(const struct cudaMemcpy3DParms *p, cudaStream_t stream ); 98 | cudaError_t (*mrcudaMemcpy3DPeerAsync)(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream ); 99 | cudaError_t (*mrcudaMemGetInfo)(size_t *free, size_t *total); 100 | cudaError_t (*mrcudaArrayGetInfo)(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array); 101 | cudaError_t (*mrcudaMemcpy)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); 102 | cudaError_t (*mrcudaMemcpyPeer)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count); 103 | cudaError_t (*mrcudaMemcpyToArray)(cudaArray_t dst, 
size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
    /* --- cudaMemcpy* family ------------------------------------------- */
    cudaError_t (*mrcudaMemcpyFromArray)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpyArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpy2D)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpy2DToArray)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpy2DFromArray)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpy2DArrayToArray)(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpyToSymbol)(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpyFromSymbol)(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind);
    cudaError_t (*mrcudaMemcpyAsync)(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpyPeerAsync)(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpyToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpyFromArrayAsync)(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpy2DAsync)(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpy2DToArrayAsync)(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpy2DFromArrayAsync)(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpyToSymbolAsync)(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream);
    cudaError_t (*mrcudaMemcpyFromSymbolAsync)(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream);
    /* --- cudaMemset* family ------------------------------------------- */
    cudaError_t (*mrcudaMemset)(void *devPtr, int value, size_t count);
    cudaError_t (*mrcudaMemset2D)(void *devPtr, size_t pitch, int value, size_t width, size_t height);
    cudaError_t (*mrcudaMemset3D)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
    cudaError_t (*mrcudaMemsetAsync)(void *devPtr, int value, size_t count, cudaStream_t stream);
    cudaError_t (*mrcudaMemset2DAsync)(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
    cudaError_t (*mrcudaMemset3DAsync)(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
    /* --- symbol and pointer queries ----------------------------------- */
    cudaError_t (*mrcudaGetSymbolAddress)(void **devPtr, const void *symbol);
    cudaError_t (*mrcudaGetSymbolSize)(size_t *size, const void *symbol);
    cudaError_t (*mrcudaPointerGetAttributes)(struct cudaPointerAttributes *attributes, const void *ptr);
    /* --- peer access --------------------------------------------------- */
    cudaError_t (*mrcudaDeviceCanAccessPeer)(int *canAccessPeer, int device, int peerDevice);
    cudaError_t (*mrcudaDeviceEnablePeerAccess)(int peerDevice, unsigned int flags);
    cudaError_t (*mrcudaDeviceDisablePeerAccess)(int peerDevice);
    /* --- graphics interop ---------------------------------------------- */
    cudaError_t (*mrcudaGraphicsUnregisterResource)(cudaGraphicsResource_t resource);
    cudaError_t (*mrcudaGraphicsResourceSetMapFlags)(cudaGraphicsResource_t resource, unsigned int flags);
    cudaError_t (*mrcudaGraphicsMapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream);
    cudaError_t (*mrcudaGraphicsUnmapResources)(int count, cudaGraphicsResource_t *resources, cudaStream_t stream);
    cudaError_t (*mrcudaGraphicsResourceGetMappedPointer)(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
    cudaError_t (*mrcudaGraphicsSubResourceGetMappedArray)(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
    cudaError_t (*mrcudaGraphicsResourceGetMappedMipmappedArray)(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource);
    /* --- textures and surfaces ----------------------------------------- */
    cudaError_t (*mrcudaGetChannelDesc)(struct cudaChannelFormatDesc *desc, cudaArray_const_t array);
    struct cudaChannelFormatDesc (*mrcudaCreateChannelDesc)(int x, int y, int z, int w, enum cudaChannelFormatKind f);
    cudaError_t (*mrcudaBindTexture)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size);
    cudaError_t (*mrcudaBindTexture2D)(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
    cudaError_t (*mrcudaBindTextureToArray)(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
    cudaError_t (*mrcudaBindTextureToMipmappedArray)(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc);
    cudaError_t (*mrcudaUnbindTexture)(const struct textureReference *texref);
    cudaError_t (*mrcudaGetTextureAlignmentOffset)(size_t *offset, const struct textureReference *texref);
    cudaError_t (*mrcudaGetTextureReference)(const struct textureReference **texref, const void *symbol);
    cudaError_t (*mrcudaBindSurfaceToArray)(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
    cudaError_t (*mrcudaGetSurfaceReference)(const struct surfaceReference **surfref, const void *symbol);
    cudaError_t (*mrcudaCreateTextureObject)(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
    cudaError_t (*mrcudaDestroyTextureObject)(cudaTextureObject_t texObject);
    cudaError_t (*mrcudaGetTextureObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
    cudaError_t (*mrcudaGetTextureObjectTextureDesc)(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
    cudaError_t (*mrcudaGetTextureObjectResourceViewDesc)(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
    cudaError_t (*mrcudaCreateSurfaceObject)(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
    cudaError_t (*mrcudaDestroySurfaceObject)(cudaSurfaceObject_t surfObject);
    cudaError_t (*mrcudaGetSurfaceObjectResourceDesc)(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
    /* --- driver/runtime version and export table ----------------------- */
    cudaError_t (*mrcudaDriverGetVersion)(int *driverVersion);
    cudaError_t (*mrcudaRuntimeGetVersion)(int *runtimeVersion);
    cudaError_t (*mrcudaGetExportTable)(const void **ppExportTable, const cudaUUID_t *pExportTableId);
    /* --- hidden __cuda* registration entry points.
     * These mirror the undocumented __cudaRegister* interface emitted by
     * nvcc-generated host stubs (names match one-to-one with the
     * __cudaRegisterFatBinary/__cudaRegisterVar/... symbols). */
    void** (*__mrcudaRegisterFatBinary)(void* fatCubin);
    void (*__mrcudaUnregisterFatBinary)(void **fatCubinHandle);
    void (*__mrcudaRegisterVar)(void **fatCubinHandle,char *hostVar,char *deviceAddress,const
char *deviceName,int ext,int size,int constant,int global);
    void (*__mrcudaRegisterTexture)(void **fatCubinHandle,const struct textureReference *hostVar,const void **deviceAddress,const char *deviceName,int dim,int norm,int ext);
    void (*__mrcudaRegisterSurface)(void **fatCubinHandle,const struct surfaceReference *hostVar,const void **deviceAddress,const char *deviceName,int dim,int ext);
    void (*__mrcudaRegisterFunction)(void **fatCubinHandle,const char *hostFun,char *deviceFun,const char *deviceName,int thread_limit,uint3 *tid,uint3 *bid,dim3 *bDim,dim3 *gDim,int *wSize);
    void (*__mrcudaRegisterShared)(void **fatCubinHandle, void **devicePtr);
    void (*__mrcudaRegisterSharedVar)(void **fatCubinHandle,void **devicePtr,size_t size,size_t alignment,int storage);
    int (*__mrcudaSynchronizeThreads)(void** one,void* two);
    void (*__mrcudaTextureFetch)(const void* tex,void* index,int integer,void* val);
    void (*__mrcudaMutexOperation)(int lock);
    cudaError_t (*__mrcudaRegisterDeviceFunction)();
} MRCUDASym_t;

/* Shared-memory-related Structure */

/**
 * Descriptor of one shared-memory region, identified by its System V IPC
 * key and its size.  This struct is embedded in the per-call parameter
 * structs below so that variable-length payloads (symbol names, argument
 * blobs, fat binaries) can be passed between processes.
 */
typedef struct MRCUDASharedMem_t {
    key_t key;      /* shared-memory key */
    size_t size;    /* size of the shared memory region */
} MRCUDASharedMem_t;

/**
 * Process-local view of a shared-memory region: the shmid obtained for
 * sharedMem.key plus startAddr, presumably the address the segment is
 * attached at in this process (TODO confirm against intercomm_mem.c).
 */
typedef struct MRCUDASharedMemLocalInfo_t {
    MRCUDASharedMem_t sharedMem;
    int shmid;          /* System V shared-memory id for sharedMem.key */
    void *startAddr;    /* local attach address of the segment */
} MRCUDASharedMemLocalInfo_t;

/* Group of CUDA-related call parameter struct.
 * Each struct captures the arguments of one intercepted CUDA call so the
 * call can be recorded and/or forwarded to another process.  Pointer-sized
 * payloads that cannot cross a process boundary are carried inside the
 * sharedMem region instead (see the ptr/offset unions below). */

typedef struct cudaRegisterFatBinary_t {
    void *fatCubin;
    void **fatCubinHandle;
    MRCUDASharedMem_t sharedMem;
} cudaRegisterFatBinary_t;

typedef struct cudaRegisterFunction_t {
    void **fatCubinHandle;
    const char *hostFun;
    /* Either a raw pointer (local use) or an offset into sharedMem
     * (IPC use) — the two interpretations share storage. */
    union {
        char *ptr;
        size_t offset; /* relative to the start of the specified shared memory. */
    } deviceFun;
    union {
        const char *ptr;
        size_t offset; /* relative to the start of the specified shared memory. */
    } deviceName;
    int thread_limit;
    uint3* tid;
    uint3* bid;
    dim3* bDim;
    dim3* gDim;
    int* wSize;
    MRCUDASharedMem_t sharedMem;
    /**
     * pointer to cudaRegisterFatBinary_t.fatCubinHandle
     * However, we cannot use it in IPC.
     */
    void ***fatCubinHandlePtr;
} cudaRegisterFunction_t;

typedef struct cudaRegisterVar_t {
    void **fatCubinHandle;
    /* ptr/offset unions: see cudaRegisterFunction_t. */
    union {
        char *ptr;
        size_t offset;
    } hostVar;
    union {
        char *ptr;
        size_t offset;
    } deviceAddress;
    union {
        const char *ptr;
        size_t offset;
    } deviceName;
    int ext;
    int size;
    int constant;
    int global;
    MRCUDASharedMem_t sharedMem;
    /**
     * pointer to cudaRegisterFatBinary_t.fatCubinHandle
     * However, we cannot use it in IPC.
     */
    void ***fatCubinHandlePtr;
} cudaRegisterVar_t;

typedef struct cudaRegisterTexture_t {
    void **fatCubinHandle;
    union {
        const struct textureReference *ptr;
        size_t offset;
    } hostVar;
    const void **deviceAddress;
    union {
        const char *ptr;
        size_t offset;
    } deviceName;
    int dim;
    int norm;
    int ext;
    MRCUDASharedMem_t sharedMem;
    /**
     * pointer to cudaRegisterFatBinary_t.fatCubinHandle
     * However, we cannot use it in IPC.
     */
    void ***fatCubinHandlePtr;
} cudaRegisterTexture_t;

typedef struct cudaUnregisterFatBinary_t {
    void **fatCubinHandle;
    /**
     * pointer to cudaRegisterFatBinary_t.fatCubinHandle
     * However, we cannot use it in IPC.
*/
    void ***fatCubinHandlePtr;
} cudaUnregisterFatBinary_t;

typedef struct cudaMalloc_t {
    void *devPtr;
    size_t size;
} cudaMalloc_t;

typedef struct cudaFree_t {
    void *devPtr;
} cudaFree_t;

typedef struct cudaBindTexture_t {
    size_t offset;
    const struct textureReference *texref;
    const void *devPtr;
    struct cudaChannelFormatDesc desc;  /* stored by value, not by pointer */
    size_t size;
} cudaBindTexture_t;

typedef struct cudaStreamCreate_t {
    cudaStream_t *pStream;  /* caller's output location */
    cudaStream_t stream;    /* the created stream value itself */
} cudaStreamCreate_t;

typedef struct cudaSetDeviceFlags_t {
    unsigned int flags;
} cudaSetDeviceFlags_t;

typedef struct cudaSetDevice_t {
    int device;
} cudaSetDevice_t;

/* NOTE(review): the symbol/src arguments are absent here — presumably the
 * symbol name and payload travel inside sharedMem; confirm against the
 * serialization code. */
typedef struct cudaMemcpyToSymbol_t {
    size_t count;
    size_t offset;
    enum cudaMemcpyKind kind;
    cudaStream_t stream;
    MRCUDASharedMem_t sharedMem;
} cudaMemcpyToSymbol_t;

typedef struct cudaMemcpyFromSymbol_t {
    size_t count;
    size_t offset;
    enum cudaMemcpyKind kind;
    cudaStream_t stream;
    MRCUDASharedMem_t sharedMem;
} cudaMemcpyFromSymbol_t;

typedef struct cudaMemcpy_t {
    void *dst;
    const void *src;
    size_t count;
    enum cudaMemcpyKind kind;
    MRCUDASharedMem_t sharedMem;
} cudaMemcpy_t;

typedef struct cudaLaunch_t {
    const void *func;
} cudaLaunch_t;

typedef struct cudaSetupArgument_t {
    size_t size;
    size_t offset;
    MRCUDASharedMem_t sharedMem;  /* argument bytes live in shared memory */
} cudaSetupArgument_t;

typedef struct cudaConfigureCall_t {
    dim3 gridDim;
    dim3 blockDim;
    size_t sharedMem;   /* dynamic shared-memory size of the launch */
    cudaStream_t stream;
} cudaConfigureCall_t;

typedef struct cudaGetDeviceProperties_t {
    struct cudaDeviceProp prop;
} cudaGetDeviceProperties_t;

typedef struct

/* MRecord Struct */

/**
 * One node of a singly linked call-record list (head/tail kept in
 * MRecordGPU_t).  functionName identifies the recorded call, data holds
 * its parameters, and replayFunc re-executes it against a given GPU.
 */
typedef struct MRecord_t {
    char *functionName;
    int skipMockStream;
    union {
        cudaRegisterFatBinary_t cudaRegisterFatBinary;
        cudaRegisterFunction_t cudaRegisterFunction;
        cudaRegisterVar_t cudaRegisterVar;
        cudaRegisterTexture_t cudaRegisterTexture;
        cudaUnregisterFatBinary_t cudaUnregisterFatBinary;
        cudaMalloc_t cudaMalloc;
        cudaFree_t cudaFree;
        cudaBindTexture_t cudaBindTexture;
        cudaStreamCreate_t cudaStreamCreate;
        cudaSetDeviceFlags_t cudaSetDeviceFlags;
    } data;
    void (*replayFunc)(MRCUDAGPU_t *, struct MRecord_t*);
    struct MRecord_t *next;   /* next record; NULL at the tail */
} MRecord_t;

/* Communication-related Structs */

/* Discriminator for MHelperCommand_t/MHelperResult_t message types. */
typedef enum MHelperCommandType_e {
    MRCOMMAND_TYPE_CUCTXCREATE = 0,
    MRCOMMAND_TYPE_CUDAREGISTERFATBINARY,
    MRCOMMAND_TYPE_CUDAREGISTERFUNCTION,
    MRCOMMAND_TYPE_CUDAREGISTERVAR,
    MRCOMMAND_TYPE_CUDAREGISTERTEXTURE,
    MRCOMMAND_TYPE_CUDAUNREGISTERFATBINARY,
    MRCOMMAND_TYPE_CUDAMALLOC,
    MRCOMMAND_TYPE_CUDAFREE,
    MRCOMMAND_TYPE_CUDABINDTEXTURE,
    MRCOMMAND_TYPE_CUDASTREAMCREATE,
    MRCOMMAND_TYPE_CUDASETDEVICEFLAGS,
    MRCOMMAND_TYPE_CUDASETDEVICE,
    MRCOMMAND_TYPE_CUDAMEMCPYTOSYMBOL,
    MRCOMMAND_TYPE_CUDAMEMCPYTOSYMBOLASYNC,
    MRCOMMAND_TYPE_CUDAMEMCPYFROMSYMBOL,
    MRCOMMAND_TYPE_CUDAMEMCPY,
    MRCOMMAND_TYPE_CUDASETUPARGUMENT,
    MRCOMMAND_TYPE_CUDACONFIGURECALL,
    MRCOMMAND_TYPE_CUDALAUNCH,
    MRCOMMAND_TYPE_CUDADEVICERESET,
    MRCOMMAND_TYPE_CUDADEVICESYNCHRONIZE,
    MRCOMMAND_TYPE_CUDAGETDEVICEPROPERTIES,
    MRCOMMAND_TYPE_CUDASTREAMSYNCHRONIZE,
    MRCOMMAND_TYPE_CUDAGETLASTERROR
} MHelperCommandType_e;

/**
 * A command sent to an mhelper process; id correlates the command with
 * its MHelperResult_t, type selects the active member of args.
 */
typedef struct MHelperCommand_t {
    int id;
    MHelperCommandType_e type;
    union {
        cudaRegisterFatBinary_t
cudaRegisterFatBinary;
        cudaRegisterFunction_t cudaRegisterFunction;
        cudaRegisterVar_t cudaRegisterVar;
        cudaRegisterTexture_t cudaRegisterTexture;
        cudaUnregisterFatBinary_t cudaUnregisterFatBinary;
        cudaMalloc_t cudaMalloc;
        cudaFree_t cudaFree;
        cudaBindTexture_t cudaBindTexture;
        cudaSetDeviceFlags_t cudaSetDeviceFlags;
        cudaSetDevice_t cudaSetDevice;
        cudaMemcpyToSymbol_t cudaMemcpyToSymbol;
        cudaMemcpyFromSymbol_t cudaMemcpyFromSymbol;
        cudaMemcpy_t cudaMemcpy;
        cudaSetupArgument_t cudaSetupArgument;
        cudaConfigureCall_t cudaConfigureCall;
        cudaLaunch_t cudaLaunch;
        cudaStreamSynchronize_t cudaStreamSynchronize;
    } args;
} MHelperCommand_t;

/**
 * Reply for one MHelperCommand_t; id/type echo the command.
 * internalError reports mrCUDA-level failures, cudaError the CUDA status;
 * args carries call-specific output (only for the listed command types).
 */
typedef struct MHelperResult_t {
    int id;
    MHelperCommandType_e type;
    int internalError;
    cudaError_t cudaError;
    union {
        cudaRegisterFatBinary_t cudaRegisterFatBinary;
        cudaGetDeviceProperties_t cudaGetDeviceProperties;
        cudaMalloc_t cudaMalloc;
        cudaMemcpyFromSymbol_t cudaMemcpyFromSymbol;
        cudaMemcpy_t cudaMemcpy;
        cudaStreamCreate_t cudaStreamCreate;
    } args;
} MHelperResult_t;

/**
 * Handle on a spawned mhelper process: its pid, the two pipe fds used to
 * exchange MHelperCommand_t/MHelperResult_t messages, and the symbol
 * table used to dispatch calls.
 */
struct MHelperProcess_t {
    pid_t pid;
    int readPipe;     /* fd for reading results from the helper */
    int writePipe;    /* fd for writing commands to the helper */
    MRCUDASym_t *handle;
};

/* MRecordGPU Struct */

/**
 * Per-GPU record-keeping: the call-record list (head/tail) plus lookup
 * tables (GLib hash tables) for active memory regions, symbols,
 * fatCubin-handle addresses, and host allocations.
 */
typedef struct MRecordGPU_t {
    MRCUDAGPU_t *mrcudaGPU;          /* back-pointer to the owning GPU */
    MRecord_t *mrcudaRecordHeadPtr;
    MRecord_t *mrcudaRecordTailPtr;
    GHashTable *activeMemoryTable;
    GHashTable *activeSymbolTable;
    GHashTable *fatCubinHandleAddrTable;
    GHashTable *hostAllocTable;
} MRecordGPU_t;

/* MRCUDAGPU Struct */

/* Which backend currently serves the GPU: remote rCUDA, the native local
 * runtime, or an mhelper process. */
typedef enum MRCUDAGPUStatus_e {
    MRCUDA_GPU_STATUS_RCUDA = 0,
    MRCUDA_GPU_STATUS_NATIVE,
    MRCUDA_GPU_STATUS_HELPER
} MRCUDAGPUStatus_e;

/**
 * Per-GPU state.  virtualNumber is the device number the application
 * sees, realNumber the underlying device.  mutex serializes access to
 * this struct's mutable fields (acquire before status changes — TODO
 * confirm locking discipline against interface.c).
 */
struct MRCUDAGPU_t {
    int virtualNumber;
    int realNumber;
    int nativeFromStart;       /* nonzero if the GPU started in native mode */
    int switchThreshold;
    int cudaLaunchCount;       /* number of cudaLaunch calls observed */
    MRCUDAGPUStatus_e status;
    pthread_mutex_t mutex;
    MRCUDASym_t *defaultHandler;   /* dispatch table currently in use */
    MRecordGPU_t *mrecordGPU;
    MHelperProcess_t *mhelperProcess;  /* NULL unless in HELPER mode — TODO confirm */
};

/* Lifecycle state of the mrCUDA library as a whole. */
typedef enum MRCUDAState_e {
    MRCUDA_STATE_UNINITIALIZED = 0,
    MRCUDA_STATE_RUNNING,
    MRCUDA_STATE_FINALIZED
} MRCUDAState_e;

#endif /* __MRCUDA_DATATYPES__HEADER__ */