├── code ├── HOWTO ├── data_movement_ex.cpp ├── gpu_selector.cpp ├── basic_parafor.cpp ├── sync.cpp ├── async.cpp ├── timer.cpp ├── gemm_basic.cpp ├── fdad.cpp └── gemm_tile.cpp ├── .gitignore ├── .yamllint ├── README.md ├── .editorconfig ├── .markdownlint.yaml └── LICENSE /code/HOWTO: -------------------------------------------------------------------------------- 1 | *Compile: 2 | dpcpp filename.cpp -o filename_ex 3 | *Run: 4 | ./filename_ex 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | __pycache__ 4 | pelican.auto.py 5 | site-generated/ 6 | .authtokens 7 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | extends: default 4 | 5 | rules: 6 | colons: disable 7 | document-start: disable 8 | line-length: disable 9 | truthy: disable 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # oneAPI - Data Parallel C++ course 2 | 3 | This repo contains the examples used in the course, to help you better understand and try DPC++. 4 | 5 | Have fun, and file an issue if you have any questions. 
6 | 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | charset = utf-8 9 | end_of_line = lf 10 | insert_final_newline = true 11 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | # https://github.com/DavidAnson/markdownlint#rules--aliases 2 | 3 | # MD004 ul-style - Unordered list style 4 | MD004: false 5 | 6 | # MD013 line-length - Line length 7 | MD013: false 8 | 9 | # MD033 no-inline-html - Inline HTML 10 | MD033: false 11 | 12 | # MD034 no-bare-urls - Bare URL used 13 | MD034: false 14 | 15 | # MD041 first-line-heading/first-line-h1 16 | MD041: false 17 | -------------------------------------------------------------------------------- /code/data_movement_ex.cpp: -------------------------------------------------------------------------------- 1 | // Patric Zhao: patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | constexpr int N = 10; 8 | 9 | int main() { 10 | // queue my_gpu_queue(sycl::cpu_selector_v); 11 | queue my_gpu_queue(sycl::gpu_selector_v); 12 | 13 | std::cout << "Selected GPU device: " << 14 | my_gpu_queue.get_device().get_info() << "\n"; 15 | 16 | int *host_mem = malloc_host(N, my_gpu_queue); 17 | int *device_mem = malloc_device(N, my_gpu_queue); 18 | 19 | // Init CPU data 20 | for(int i = 0; i < N; i++) { 21 | host_mem[i] = i; 22 | } 23 | 24 | // Copy from host(CPU) to device(GPU) 25 | my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); 26 | 27 | // do some works on GPU 28 | // ...... 
29 | // 30 | 31 | // Copy back from GPU to CPU 32 | my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); 33 | 34 | printf("\nData Result\n"); 35 | for(int i = 0; i < N; i++) { 36 | printf("%d, ", host_mem[i]); 37 | } 38 | printf("\nTask Done!\n"); 39 | 40 | free(host_mem, my_gpu_queue); 41 | free(device_mem, my_gpu_queue); 42 | 43 | return 0; 44 | } 45 | 46 | -------------------------------------------------------------------------------- /code/gpu_selector.cpp: -------------------------------------------------------------------------------- 1 | // Patric Zhao: patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | int main() { 8 | // queue my_gpu_queue( sycl::cpu_selector_v); 9 | queue my_gpu_queue( sycl::gpu_selector_v); 10 | 11 | std::cout << "Selected GPU device: " << 12 | my_gpu_queue.get_device().get_info() << "\n"; 13 | 14 | std::cout << "max_compute_units: " << 15 | my_gpu_queue.get_device().get_info() << "\n"; 16 | 17 | std::cout << "max_work_item_dimensions: " << 18 | my_gpu_queue.get_device().get_info() << "\n"; 19 | 20 | std::cout << "max_work_group_size: " << 21 | my_gpu_queue.get_device().get_info() << "\n"; 22 | 23 | std::cout << "max_num_sub_groups: " << 24 | my_gpu_queue.get_device().get_info() << "\n"; 25 | 26 | std::cout << "supported sub_group_sizes: "; 27 | for(const auto& num : my_gpu_queue.get_device().get_info() ) 28 | std::cout << num << " "; 29 | std::cout << "\n"; 30 | 31 | std::cout << "max_mem_alloc_size: " << 32 | my_gpu_queue.get_device().get_info() << "\n"; 33 | 34 | std::cout << "global_mem_size: " << 35 | my_gpu_queue.get_device().get_info() << "\n"; 36 | 37 | std::cout << "local_mem_size: " << 38 | my_gpu_queue.get_device().get_info() << "\n"; 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /code/basic_parafor.cpp: -------------------------------------------------------------------------------- 1 | // Patric Zhao: 
patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | constexpr int N = 10; 8 | 9 | int main() { 10 | // queue my_gpu_queue( sycl::cpu_selector_v); 11 | queue my_gpu_queue( sycl::gpu_selector_v); 12 | 13 | std::cout << "Selected GPU device: " << 14 | my_gpu_queue.get_device().get_info() << "\n"; 15 | 16 | int *host_mem = malloc_host(N, my_gpu_queue); 17 | int *device_mem = malloc_device(N, my_gpu_queue); 18 | 19 | // Init CPU data 20 | for(int i = 0; i < N; i++) { 21 | host_mem[i] = i; 22 | } 23 | 24 | // Copy from host(CPU) to device(GPU) 25 | my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); 26 | 27 | // submit the content to the queue for execution 28 | my_gpu_queue.submit([&](handler& h) { 29 | 30 | // Parallel Computation 31 | h.parallel_for(range{N}, [=](id<1> item) { 32 | device_mem[item] *= 2; 33 | }); 34 | 35 | }); 36 | 37 | // wait the computation done 38 | my_gpu_queue.wait(); 39 | 40 | // Copy back from GPU to CPU 41 | my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); 42 | 43 | printf("\nData Result\n"); 44 | for(int i = 0; i < N; i++) { 45 | printf("%d, ", host_mem[i]); 46 | } 47 | printf("\nTask Done!\n"); 48 | 49 | free(host_mem, my_gpu_queue); 50 | free(device_mem, my_gpu_queue); 51 | 52 | return 0; 53 | } 54 | 55 | -------------------------------------------------------------------------------- /code/sync.cpp: -------------------------------------------------------------------------------- 1 | // Patric Zhao: patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | constexpr int64_t N = 10000000; 8 | 9 | int main() { 10 | 11 | // Enable queue profiling 12 | auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()}; 13 | // queue my_gpu_queue(sycl::cpu_selector_v, propList); 14 | queue my_gpu_queue(sycl::gpu_selector_v, propList); 15 | 16 | std::cout << "Selected GPU device: " << 17 | my_gpu_queue.get_device().get_info() << 
"\n"; 18 | 19 | 20 | int *host_mem = malloc_host(N, my_gpu_queue); 21 | int *cpu_mem = malloc_host(N, my_gpu_queue); 22 | int *device_mem = malloc_device(N, my_gpu_queue); 23 | 24 | // Init CPU data 25 | for(int64_t i = 0; i < N; i++) { 26 | host_mem[i] = i % 6666; 27 | } 28 | 29 | float duration_cpu = 0.0; 30 | float duration_gpu = 0.0; 31 | float duration_total = 0.0; 32 | 33 | std::chrono::high_resolution_clock::time_point s_cpu, e_cpu; 34 | std::chrono::high_resolution_clock::time_point s_gpu, e_gpu; 35 | std::chrono::high_resolution_clock::time_point s_t, e_t; 36 | 37 | // warmup 38 | /*********************************************************************/ 39 | my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); 40 | my_gpu_queue.submit([&](handler& h) { 41 | 42 | // Parallel Computation 43 | h.parallel_for(range{N}, [=](id<1> item) { 44 | device_mem[item] *= 2; 45 | }); 46 | 47 | }); 48 | my_gpu_queue.wait(); 49 | /*********************************************************************/ 50 | 51 | // CPU computation 52 | printf("\n Start CPU Computation, Number of Elems = %ld \n", N); 53 | 54 | s_t = std::chrono::high_resolution_clock::now(); 55 | s_cpu = std::chrono::high_resolution_clock::now(); 56 | // CPU code here 57 | for(int64_t i = 0; i < N; i++) { 58 | cpu_mem[i] = host_mem[i] * 2; 59 | } 60 | e_cpu = std::chrono::high_resolution_clock::now(); 61 | duration_cpu = std::chrono::duration(e_cpu - s_cpu).count(); 62 | 63 | 64 | s_gpu = std::chrono::high_resolution_clock::now(); 65 | // submit the content to the queue for execution 66 | auto event = my_gpu_queue.submit([&](handler& h) { 67 | 68 | // Parallel Computation 69 | h.parallel_for(range{N}, [=](id<1> item) { 70 | device_mem[item] *= 2; 71 | }); 72 | 73 | }); 74 | // wait the computation done 75 | my_gpu_queue.wait(); 76 | e_gpu = std::chrono::high_resolution_clock::now(); 77 | e_t = std::chrono::high_resolution_clock::now(); 78 | duration_gpu = std::chrono::duration(e_gpu - 
s_gpu).count(); 79 | duration_total = std::chrono::duration(e_t - s_t).count(); 80 | 81 | // Copy back from GPU to CPU 82 | my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); 83 | 84 | printf("\n CPU Computation, Time = %lf \n", duration_cpu); 85 | printf("\n GPU Computation, Time = %lf \n", duration_gpu); 86 | printf("\n Total Computation, TIme = %lf \n", duration_total); 87 | 88 | printf("\nTask Done!\n"); 89 | 90 | free(cpu_mem, my_gpu_queue); 91 | free(host_mem, my_gpu_queue); 92 | free(device_mem, my_gpu_queue); 93 | 94 | return 0; 95 | } 96 | 97 | -------------------------------------------------------------------------------- /code/async.cpp: -------------------------------------------------------------------------------- 1 | // Patric Zhao, patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | constexpr int64_t N = 10000000; 8 | 9 | int main() { 10 | 11 | // Enable queue profiling 12 | // queue my_gpu_queue(sycl::cpu_selector_v); 13 | queue my_gpu_queue(sycl::gpu_selector_v); 14 | 15 | std::cout << "Selected GPU device: " << 16 | my_gpu_queue.get_device().get_info() << "\n"; 17 | 18 | 19 | int *cpu_out = (int*)malloc(N * sizeof(int)); 20 | int *host_mem = malloc_host(N, my_gpu_queue); 21 | int *device_mem = malloc_device(N, my_gpu_queue); 22 | 23 | // Init CPU data 24 | for(int64_t i = 0; i < N; i++) { 25 | host_mem[i] = i % 6666; 26 | cpu_out[i] = i % 6666; 27 | } 28 | 29 | float duration_cpu = 0.0; 30 | float duration_gpu = 0.0; 31 | float duration_total = 0.0; 32 | 33 | std::chrono::high_resolution_clock::time_point s_cpu, e_cpu; 34 | std::chrono::high_resolution_clock::time_point s_gpu, e_gpu; 35 | std::chrono::high_resolution_clock::time_point s_t, e_t; 36 | 37 | // warmup 38 | /*********************************************************************/ 39 | for(int64_t i = 0; i < N; i++) { 40 | cpu_out[i] = cpu_out[i] * 2; 41 | } 42 | 43 | my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); 44 
| my_gpu_queue.submit([&](handler& h) { 45 | 46 | // Parallel Computation 47 | h.parallel_for(range{N}, [=](id<1> item) { 48 | device_mem[item] *= 2; 49 | }); 50 | 51 | }); 52 | my_gpu_queue.wait(); 53 | /*********************************************************************/ 54 | 55 | printf("\n Start CPU Computation, Number of Elems = %ld \n", N); 56 | 57 | s_t = std::chrono::high_resolution_clock::now(); 58 | 59 | // GPU Computation 60 | // submit the content to the queue for execution 61 | s_gpu = std::chrono::high_resolution_clock::now(); 62 | auto event = my_gpu_queue.submit([&](handler& h) { 63 | // Parallel Computation 64 | h.parallel_for(range{N}, [=](id<1> item) { 65 | device_mem[item] *= 2; 66 | }); 67 | }); 68 | 69 | // CPU computation 70 | s_cpu = std::chrono::high_resolution_clock::now(); 71 | for(int64_t i = 0; i < N; i++) { 72 | cpu_out[i] *= 2; 73 | } 74 | e_cpu = std::chrono::high_resolution_clock::now(); 75 | 76 | // Testing overlapping between CPU and GPU 77 | // Delay the wait() after CPU computation 78 | event.wait(); 79 | e_gpu = std::chrono::high_resolution_clock::now(); 80 | 81 | e_t = std::chrono::high_resolution_clock::now(); 82 | 83 | duration_cpu = std::chrono::duration(e_cpu - s_cpu).count(); 84 | duration_gpu = std::chrono::duration(e_gpu - s_gpu).count(); 85 | duration_total = std::chrono::duration(e_t - s_t).count(); 86 | 87 | // Copy back from GPU to CPU 88 | my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); 89 | 90 | printf("\n CPU Computation, Time = %lf \n", duration_cpu); 91 | printf("\n GPU Computation, Time = %lf \n", duration_gpu); 92 | printf("\n Total Computation, TIme = %lf \n", duration_total); 93 | 94 | free(cpu_out); 95 | free(host_mem, my_gpu_queue); 96 | free(device_mem, my_gpu_queue); 97 | 98 | printf("\nTask Done!\n"); 99 | 100 | return 0; 101 | } 102 | 103 | -------------------------------------------------------------------------------- /code/timer.cpp: 
-------------------------------------------------------------------------------- 1 | // Patric Zhao: patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | constexpr int64_t N = 10000000; 8 | 9 | int main() { 10 | 11 | // Enable queue profiling 12 | auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()}; 13 | // queue my_gpu_queue(sycl::cpu_selector_v, propList); 14 | queue my_gpu_queue(sycl::gpu_selector_v, propList); 15 | std::cout << "Selected GPU device: " << 16 | my_gpu_queue.get_device().get_info() << "\n"; 17 | 18 | 19 | 20 | int *host_mem = malloc_host(N, my_gpu_queue); 21 | int *cpu_mem = malloc_host(N, my_gpu_queue); 22 | int *device_mem = malloc_device(N, my_gpu_queue); 23 | 24 | // Init CPU data 25 | for(int64_t i = 0; i < N; i++) { 26 | host_mem[i] = i % 6666; 27 | } 28 | 29 | float duration_cpu = 0.0; 30 | float duration_gpu_a = 0.0; 31 | float duration_gpu_b = 0.0; 32 | float duration_gpu_c = 0.0; 33 | 34 | std::chrono::high_resolution_clock::time_point s, e; 35 | std::chrono::high_resolution_clock::time_point s_a, e_a; 36 | std::chrono::high_resolution_clock::time_point s_b, e_b; 37 | std::chrono::high_resolution_clock::time_point s_c, e_c; 38 | 39 | // CPU computation 40 | printf("\n Start CPU Computation, Number of Elems = %ld \n", N); 41 | 42 | s = std::chrono::high_resolution_clock::now(); 43 | // CPU code here 44 | for(int64_t i = 0; i < N; i++) { 45 | cpu_mem[i] = host_mem[i] * 2; 46 | } 47 | e = std::chrono::high_resolution_clock::now(); 48 | duration_cpu = std::chrono::duration(e - s).count(); 49 | printf("\n End CPU Computation, Time = %lf \n", duration_cpu); 50 | 51 | 52 | // warmup 53 | /*********************************************************************/ 54 | my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); 55 | my_gpu_queue.submit([&](handler& h) { 56 | 57 | // Parallel Computation 58 | h.parallel_for(range{N}, [=](id<1> item) { 59 | device_mem[item] *= 
2; 60 | }); 61 | 62 | }); 63 | my_gpu_queue.wait(); 64 | /*********************************************************************/ 65 | 66 | s_c = std::chrono::high_resolution_clock::now(); 67 | // Copy from host(CPU) to device(GPU) 68 | my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); 69 | 70 | s_b = std::chrono::high_resolution_clock::now(); 71 | s_a = std::chrono::high_resolution_clock::now(); 72 | // submit the content to the queue for execution 73 | auto event = my_gpu_queue.submit([&](handler& h) { 74 | 75 | // Parallel Computation 76 | h.parallel_for(range{N}, [=](id<1> item) { 77 | device_mem[item] *= 2; 78 | }); 79 | 80 | }); 81 | // wait the computation done 82 | my_gpu_queue.wait(); 83 | e_b = std::chrono::high_resolution_clock::now(); 84 | duration_gpu_b = std::chrono::duration(e_b - s_b).count(); 85 | 86 | duration_gpu_a = 87 | (event.get_profiling_info() - 88 | event.get_profiling_info()) /1000.0f/1000.0f; 89 | 90 | // Copy back from GPU to CPU 91 | my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); 92 | e_c = std::chrono::high_resolution_clock::now(); 93 | duration_gpu_c = std::chrono::duration(e_c - s_c).count(); 94 | 95 | printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a); 96 | printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b); 97 | printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c); 98 | 99 | printf("\nTask Done!\n"); 100 | 101 | return 0; 102 | } 103 | 104 | -------------------------------------------------------------------------------- /code/gemm_basic.cpp: -------------------------------------------------------------------------------- 1 | //Patric Zhao: patric.zhao@gmail.com 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define random_float() (rand() / double(RAND_MAX)) 8 | 9 | using namespace std; 10 | using namespace sycl; 11 | 12 | // return execution time 13 | double gpu_kernel(float *A, float *B, float *C, int M, int N, int K, int block_size, sycl::queue 
&q) { 14 | 15 | // define the workgroup size and mapping 16 | auto grid_rows = (M + block_size - 1) / block_size * block_size; 17 | auto grid_cols = (N + block_size - 1) / block_size * block_size; 18 | auto local_ndrange = range<2>(block_size, block_size); 19 | auto global_ndrange = range<2>(grid_rows, grid_cols); 20 | 21 | double duration = 0.0f; 22 | auto e = q.submit([&](sycl::handler &h) { 23 | h.parallel_for( 24 | sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) { 25 | 26 | int row = index.get_global_id(0); 27 | int col = index.get_global_id(1); 28 | 29 | float sum = 0.0f; 30 | 31 | for (int i = 0; i < K; i++) { 32 | sum += A[row * K + i] * B[i * N + col]; 33 | } 34 | C[row * N + col] = sum; 35 | }); 36 | }); 37 | e.wait(); 38 | 39 | duration += (e.get_profiling_info() - 40 | e.get_profiling_info()) /1000.0f/1000.0f; 41 | 42 | return(duration); 43 | } 44 | 45 | // return execution time 46 | double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) { 47 | 48 | double duration = 0.0; 49 | std::chrono::high_resolution_clock::time_point s, e; 50 | 51 | // Single Thread Computation in CPU 52 | s = std::chrono::high_resolution_clock::now(); 53 | for(int i = 0; i < M; i++) { 54 | for(int j = 0; j < N; j++) { 55 | float sum = 0.0f; 56 | for(int k = 0; k < K; k++) { 57 | sum += cA[i * K + k] * cB[k * N + j]; 58 | } 59 | cC[i * N + j] = sum; 60 | } 61 | } 62 | e = std::chrono::high_resolution_clock::now(); 63 | duration = std::chrono::duration(e - s).count(); 64 | 65 | return(duration); 66 | } 67 | 68 | int verify(float *cpu_res, float *gpu_res, int length){ 69 | int err = 0; 70 | for(int i = 0; i < length; i++) { 71 | if( fabs(cpu_res[i] - gpu_res[i]) > 1e-3) { 72 | err++; 73 | printf("\n%lf, %lf", cpu_res[i], gpu_res[i]); 74 | } 75 | } 76 | return(err); 77 | } 78 | 79 | int gemm(const int M, 80 | const int N, 81 | const int K, 82 | const int block_size, 83 | const int iterations, 84 | sycl::queue &q) { 85 | 86 | cout << 
"Problem size: c(" << M << "," << N << ") =" 87 | << " a(" << M << "," << K << ") *" 88 | << " b(" << K << "," << N << ")\n"; 89 | 90 | auto A = malloc_shared(M * K, q); 91 | auto B = malloc_shared(K * N, q); 92 | auto C = malloc_shared(M * N, q); 93 | auto C_host = malloc_host(M * N, q); 94 | 95 | // init the A/B/C 96 | for(int i=0; i < M * K; i++) { 97 | A[i] = random_float(); 98 | } 99 | 100 | for(int i=0; i < K * N; i++) { 101 | B[i] = random_float(); 102 | } 103 | 104 | for(int i=0; i < M * N; i++) { 105 | C[i] = 0.0f; 106 | C_host[i] = 0.0f; 107 | } 108 | 109 | double flopsPerMatrixMul 110 | = 2.0 * static_cast(M) * static_cast(N) * static_cast(K); 111 | 112 | double duration_gpu = 0.0f; 113 | double duration_cpu = 0.0f; 114 | 115 | // GPU compuation and timer 116 | int warmup = 10; 117 | for (int run = 0; run < iterations + warmup; run++) { 118 | float duration = gpu_kernel(A, B, C, M, N, K, block_size, q); 119 | if(run >= warmup) duration_gpu += duration; 120 | } 121 | duration_gpu = duration_gpu / iterations; 122 | 123 | // CPU compuation and timer 124 | warmup = 2; 125 | for(int run = 0; run < iterations/2 + warmup; run++) { 126 | float duration = cpu_kernel(A, B, C_host, M, N, K); 127 | if(run >= warmup) duration_cpu += duration; 128 | } 129 | duration_cpu = duration_cpu / iterations/2; 130 | 131 | // Compare the resutls of CPU and GPU 132 | int errCode = 0; 133 | errCode = verify(C_host, C, M*N); 134 | if(errCode > 0) printf("\nThere are %d errors\n", errCode); 135 | 136 | printf("\nPerformance Flops = %lf, \n" 137 | "GPU Computation Time = %lf (ms); \n" 138 | "CPU Computaiton Time = %lf (ms); \n", 139 | flopsPerMatrixMul, duration_gpu, duration_cpu); 140 | 141 | free(A, q); 142 | free(B, q); 143 | free(C, q); 144 | free(C_host, q); 145 | 146 | return(errCode); 147 | } 148 | 149 | int main() { 150 | 151 | auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()}; 152 | // queue my_gpu_queue( cl::sycl::cpu_selector_v, 
propList); 153 | queue my_gpu_queue( cl::sycl::gpu_selector_v, propList); 154 | 155 | int errCode = gemm(1024, 1024, 1024, 4, 10, my_gpu_queue); 156 | 157 | return(errCode); 158 | } 159 | -------------------------------------------------------------------------------- /code/fdad.cpp: -------------------------------------------------------------------------------- 1 | // patric zhao, patric.zhao@intel.com 2 | // show SLM usage by Finite Difference Approximating Derivatives (fdad) 3 | #include 4 | #include 5 | using namespace sycl; 6 | 7 | #define random_float() (rand() / double(RAND_MAX)) 8 | #define BLOCK 256 9 | #define CheckResult 0 10 | 11 | constexpr int64_t N = 256 * 256 * 256 + 2; 12 | constexpr float delta = 0.001f; 13 | 14 | void verify(float *gpu, float *cpu, int N) { 15 | int error = 0; 16 | for(int i = 0; i < N; i++) { 17 | if(std::fabs(gpu[i] - cpu[i]) > 10e-3) { 18 | printf("\nError at %d GPU = %f, CPU = %f\n", i, gpu[i], cpu[i]); 19 | error++; 20 | } 21 | if(error > 20) break; 22 | } 23 | return; 24 | } 25 | 26 | int main() { 27 | 28 | // Enable queue profiling 29 | auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()}; 30 | // queue my_gpu_queue(sycl::cpu_selector_v, propList); 31 | queue my_gpu_queue(sycl::gpu_selector_v, propList); 32 | 33 | std::cout << "Selected GPU device: " << 34 | my_gpu_queue.get_device().get_info() << "\n"; 35 | 36 | float *input = malloc_host(N, my_gpu_queue); 37 | float *output_P_cpu = malloc_host(N-2, my_gpu_queue); 38 | 39 | float *input_Q = malloc_device(N, my_gpu_queue); 40 | float *output_P = malloc_device(N-2, my_gpu_queue); 41 | 42 | float *output_P_gpu = malloc_host(N-2, my_gpu_queue); 43 | 44 | // Init CPU data 45 | for(int64_t i = 0; i < N; i++) { 46 | input[i] = random_float(); 47 | } 48 | 49 | // CPU compuatation 50 | printf("\n Start Computation, Number of Elems = %ld \n", N); 51 | for(int64_t i = 0; i < N-2; i++) { 52 | output_P_cpu[i] = (input[i+2] - input[i]) / (2.0f * 
delta); 53 | } 54 | 55 | float duration_gpu_a = 0.0; 56 | float duration_gpu_b = 0.0; 57 | 58 | // Copy from host(CPU) to device(GPU) 59 | my_gpu_queue.memcpy(input_Q, input, N * sizeof(float)).wait(); 60 | 61 | int warmup = 10; 62 | int iteration = 50; 63 | for(int i = 0; i < iteration + warmup; i++) { 64 | 65 | // read/write global memory directly 66 | auto event1 = my_gpu_queue.submit([&](handler& h) { 67 | h.parallel_for(nd_range<1>{N-2, BLOCK}, [=](nd_item<1> item) { 68 | auto global_id = item.get_global_id(0); 69 | output_P[global_id] = (input_Q[global_id +2] - input_Q[global_id]) / (2.0f * delta); 70 | }); 71 | }); 72 | // wait the computation done 73 | my_gpu_queue.wait(); 74 | 75 | if (i >= warmup) { 76 | duration_gpu_a += 77 | (event1.get_profiling_info() - 78 | event1.get_profiling_info()) /1000.0f/1000.0f; 79 | } 80 | 81 | if (CheckResult) { 82 | my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait(); 83 | verify(output_P_gpu, output_P_gpu, N); 84 | } 85 | 86 | // read data to SLM and then computaiton w/ SLM read 87 | // finally write back to global memory 88 | auto event2 = my_gpu_queue.submit([&](handler& h) { 89 | 90 | // Define SLM size per work-group 91 | 92 | // sycl::accessor 94 | // slm_buffer(BLOCK + 2, h); 95 | 96 | //Usage according to the new standards after 2021. 
97 | sycl::local_accessor slm_buffer(BLOCK + 2, h); 98 | 99 | h.parallel_for(nd_range<1>(N-2, BLOCK), [=](nd_item<1> item) { 100 | 101 | auto local_id = item.get_local_id(0); 102 | auto global_id = item.get_global_id(0); 103 | 104 | slm_buffer[local_id] = input_Q[global_id]; 105 | if(local_id == BLOCK-1) { 106 | slm_buffer[BLOCK ] = input_Q[global_id +1]; 107 | slm_buffer[BLOCK+1] = input_Q[global_id +2]; 108 | } 109 | item.barrier(sycl::access::fence_space::local_space); 110 | 111 | output_P[global_id] = (slm_buffer[local_id +2] - slm_buffer[local_id]) / (2.0f * delta); 112 | }); 113 | 114 | }); 115 | my_gpu_queue.wait(); 116 | 117 | if (i >= warmup) { 118 | duration_gpu_b += 119 | (event2.get_profiling_info() - 120 | event2.get_profiling_info()) /1000.0f/1000.0f; 121 | } 122 | 123 | if (CheckResult) { 124 | my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait(); 125 | verify(output_P_gpu, output_P_gpu, N); 126 | } 127 | 128 | } 129 | 130 | printf("\n GPU Computation, GPU Time w/o SLM = %lf \n", duration_gpu_a / iteration); 131 | printf("\n GPU Computation, GPU Time w/ SLM = %lf \n", duration_gpu_b / iteration); 132 | 133 | printf("\nTask Done!\n"); 134 | 135 | free(input_Q, my_gpu_queue); 136 | free(output_P, my_gpu_queue); 137 | free(output_P_cpu, my_gpu_queue); 138 | free(output_P_gpu, my_gpu_queue); 139 | free(input, my_gpu_queue); 140 | 141 | return 0; 142 | } 143 | 144 | -------------------------------------------------------------------------------- /code/gemm_tile.cpp: -------------------------------------------------------------------------------- 1 | //Patri Zhao: patric.zhao@intel.com 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define random_float() (rand() / double(RAND_MAX)) 8 | 9 | using namespace std; 10 | using namespace sycl; 11 | 12 | #define tileY 2 13 | #define tileX 2 14 | 15 | // return execution time 16 | double gpu_kernel(float *A, float *B, float *C, 17 | int M, int N, int K, 18 | int BLOCK, sycl::queue &q) { 
19 | 20 | // define the workgroup size and mapping 21 | auto grid_rows = M / tileY; 22 | auto grid_cols = N / tileX; 23 | auto local_ndrange = range<2>(BLOCK, BLOCK); 24 | auto global_ndrange = range<2>(grid_rows, grid_cols); 25 | 26 | double duration = 0.0f; 27 | 28 | auto e = q.submit([&](sycl::handler &h) { 29 | h.parallel_for( 30 | sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) { 31 | 32 | int row = tileY * index.get_global_id(0); 33 | int col = tileX * index.get_global_id(1); 34 | 35 | float sum[tileY][tileX] = {0.0f}; 36 | float subA[tileY] = {0.0f}; 37 | float subB[tileX] = {0.0f}; 38 | 39 | // core computation 40 | for (int k = 0; k < N; k++) { 41 | 42 | // read data to register 43 | for(int m = 0; m < tileY; m++) { 44 | subA[m] = A[(row + m) * N + k]; 45 | } 46 | 47 | for(int p = 0; p < tileX; p++) { 48 | subB[p] = B[k * N + p + col]; 49 | } 50 | 51 | for (int m = 0; m < tileY; m++) { 52 | for (int p = 0; p < tileX; p++) { 53 | sum[m][p] += subA[m] * subB[p]; 54 | } 55 | } 56 | 57 | } //end of K 58 | 59 | // write results back 60 | for (int m = 0; m < tileY; m++) { 61 | for (int p = 0; p < tileX; p++) { 62 | C[(row + m) * N + col + p] = sum[m][p]; 63 | } 64 | } 65 | 66 | }); 67 | }); 68 | e.wait(); 69 | 70 | duration += (e.get_profiling_info() - 71 | e.get_profiling_info()) /1000.0f/1000.0f; 72 | 73 | return(duration); 74 | } 75 | 76 | // return execution time 77 | double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) { 78 | 79 | double duration = 0.0; 80 | std::chrono::high_resolution_clock::time_point s, e; 81 | 82 | // Single Thread Computation in CPU 83 | s = std::chrono::high_resolution_clock::now(); 84 | for(int i = 0; i < M; i++) { 85 | for(int j = 0; j < N; j++) { 86 | float sum = 0.0f; 87 | for(int k = 0; k < K; k++) { 88 | sum += cA[i * K + k] * cB[k * N + j]; 89 | } 90 | cC[i * N + j] = sum; 91 | } 92 | } 93 | e = std::chrono::high_resolution_clock::now(); 94 | duration = std::chrono::duration(e - 
s).count(); 95 | 96 | return(duration); 97 | } 98 | 99 | int verify(float *cpu_res, float *gpu_res, int length){ 100 | int err = 0; 101 | for(int i = 0; i < length; i++) { 102 | if( fabs(cpu_res[i] - gpu_res[i]) > 1e-3) { 103 | err++; 104 | printf("\n%lf, %lf", cpu_res[i], gpu_res[i]); 105 | } 106 | } 107 | return(err); 108 | } 109 | 110 | int gemm(const int M, 111 | const int N, 112 | const int K, 113 | const int block_size, 114 | const int iterations, 115 | sycl::queue &q) { 116 | 117 | cout << "Problem size: c(" << M << "," << N << ") =" 118 | << " a(" << M << "," << K << ") *" 119 | << " b(" << K << "," << N << ")\n"; 120 | 121 | auto A = malloc_shared(M * K, q); 122 | auto B = malloc_shared(K * N, q); 123 | auto C = malloc_shared(M * N, q); 124 | auto C_host = malloc_host(M * N, q); 125 | 126 | // init the A/B/C 127 | for(int i=0; i < M * K; i++) { 128 | A[i] = random_float(); 129 | } 130 | 131 | for(int i=0; i < K * N; i++) { 132 | B[i] = random_float(); 133 | } 134 | 135 | for(int i=0; i < M * N; i++) { 136 | C[i] = 0.0f; 137 | C_host[i] = 0.0f; 138 | } 139 | 140 | double flopsPerMatrixMul 141 | = 2.0 * static_cast(M) * static_cast(N) * static_cast(K); 142 | 143 | double duration_gpu = 0.0f; 144 | double duration_cpu = 0.0f; 145 | 146 | // GPU compuation and timer 147 | int warmup = 10; 148 | for (int run = 0; run < iterations + warmup; run++) { 149 | float duration = gpu_kernel(A, B, C, M, N, K, block_size, q); 150 | if(run >= warmup) duration_gpu += duration; 151 | } 152 | duration_gpu = duration_gpu / iterations; 153 | 154 | // CPU compuation and timer 155 | warmup = 2; 156 | for(int run = 0; run < iterations/2 + warmup; run++) { 157 | float duration = cpu_kernel(A, B, C_host, M, N, K); 158 | if(run >= warmup) duration_cpu += duration; 159 | } 160 | duration_cpu = duration_cpu / iterations/2; 161 | 162 | // Compare the resutls of CPU and GPU 163 | int errCode = 0; 164 | errCode = verify(C_host, C, M*N); 165 | if(errCode > 0) printf("\nThere are %d 
errors\n", errCode); 166 | 167 | printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K); 168 | printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d", block_size, block_size, tileX, tileY); 169 | printf("\nPerformance Flops = %lf, \n" 170 | "GPU Computation Time = %lf (ms); \n" 171 | "CPU Computaiton Time = %lf (ms); \n", 172 | flopsPerMatrixMul, duration_gpu, duration_cpu); 173 | 174 | free(A, q); 175 | free(B, q); 176 | free(C, q); 177 | free(C_host, q); 178 | 179 | return(errCode); 180 | } 181 | 182 | int main() { 183 | 184 | auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()}; 185 | // queue my_gpu_queue( cl::sycl::cpu_selector_v, propList); 186 | queue my_gpu_queue( cl::sycl::gpu_selector_v, propList); 187 | 188 | int errCode = gemm(512, 512, 512, /* GEMM size, M, N, K */ 189 | 4, /* workgroup size */ 190 | 10, /* repeat time */ 191 | my_gpu_queue); 192 | 193 | return(errCode); 194 | } 195 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | --------------------------------------------------------------------------------