├── .gitignore ├── .gitmodules ├── utils ├── macro.cuh ├── utils.cuh ├── sass_kernel.cuh ├── format_print.cuh └── ptx_export.cuh ├── CMakeLists.txt ├── sass_cubin ├── reg_reuse_bankconflict.sass ├── reg_reuse_double.sass ├── reg_without_bankconflict.sass ├── reg_with_bankconflict.sass ├── warp_schedule.sass ├── shared_bankconflict.sass ├── memory_latency.sass ├── cache_linesize.sass ├── memory_bandwidth_thread.sass └── memory_bandwidth_block.sass ├── compile_sass.py ├── schedule ├── block_schedule.cu └── warp_schedule.cu ├── miscellany ├── reg_bankconflict.cu └── shared_bankconflict.cu ├── memory ├── cache_linesize.cu ├── memory_bandwidth.cu ├── memory_latency.cu └── global_memory_bandwidth.cu └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .vscode/ 3 | 4 | build/ 5 | bin/ 6 | 7 | *.out 8 | *.cubin -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "turingas"] 2 | path = turingas 3 | url = git@github.com:daadaada/turingas.git 4 | -------------------------------------------------------------------------------- /utils/macro.cuh: -------------------------------------------------------------------------------- 1 | // 2 | // CUDA 3 | // Created by sjfeng 4 | // 5 | 6 | #pragma once 7 | 8 | #define UPPER_DIV(x, y) ((x + y - 1) / y) 9 | 10 | constexpr int kWarpSize = 32; 11 | -------------------------------------------------------------------------------- /utils/utils.cuh: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 4 | // 5 | 6 | #pragma once 7 | 8 | #include "./format_print.cuh" 9 | #include "./macro.cuh" 10 | #include "./ptx_export.cuh" 11 | #include "./sass_kernel.cuh" 12 | -------------------------------------------------------------------------------- /utils/sass_kernel.cuh: 
--------------------------------------------------------------------------------
 1 | //
 2 | //
 3 | //
 4 | //
 5 |
 6 | #pragma once
 7 |
 8 | #include "cuda.h"
 9 | #include "cuda_runtime.h"
10 |
11 | // Loads `kernel_name` from the cubin at `cubin_name` via the driver API and launches it
12 | // with the given grid/block dims, dynamic shared memory size, and argument array.
13 | // Returns the sticky runtime error state (cudaPeekAtLastError does not clear it).
14 | cudaError_t launchSassKernel(const char* cubin_name, const char* kernel_name, const dim3& gDim, const dim3& bDim, const int shared_bytes, void** args){
15 |     CUmodule module;
16 |     CUfunction kernel;
17 |
18 |     if (cuModuleLoad(&module, cubin_name) != CUDA_SUCCESS) return cudaErrorUnknown;            // bad path / bad cubin: fail loudly instead of launching garbage
19 |     if (cuModuleGetFunction(&kernel, module, kernel_name) != CUDA_SUCCESS) return cudaErrorUnknown;  // kernel name mismatch
20 |
21 |     // NOTE(review): module is never cuModuleUnload-ed; acceptable for these one-shot benchmarks.
22 |     cuLaunchKernel(kernel,
23 |                    gDim.x, gDim.y, gDim.z,
24 |                    bDim.x, bDim.y, bDim.z,
25 |                    shared_bytes,    // SharedMem Bytes
26 |                    0,               // Stream
27 |                    args, 0);
28 |
29 |     return cudaPeekAtLastError();
30 | }
31 |
--------------------------------------------------------------------------------
/utils/format_print.cuh:
--------------------------------------------------------------------------------
 1 | //
 2 | // C++
 3 | // Created by sjfeng
 4 | //
 5 | //
 6 | #pragma once
 7 |
 8 | #include "stdio.h"
 9 |
10 | // Print `size` floats, `newline` values per row.
11 | void formatArray(float* array, int size, int newline=10){
12 |     for (int i = 0; i < size; ++i){
13 |         printf("%.3f, ", array[i]);
14 |         if (i % newline == newline - 1){
15 |             printf("\n");
16 |         }
17 |     }
18 |     printf("\n\t");
19 | }
20 |
21 | // Print `size` unsigned ints; rows are separated by a "====" rule.
22 | void formatArray(uint* array, int size, int newline=10){
23 |     for (int i = 0; i < size; ++i){
24 |         printf("%3u, ", array[i]);
25 |         if (i % newline == newline - 1){
26 |             printf("=====================\n");
27 |         }
28 |     }
29 |     printf("\n\t");
30 | }
31 |
32 | // Print `size` signed ints; rows are separated by a "====" rule.
33 | void formatArray(int* array, int size, int newline=10){
34 |     for (int i = 0; i < size; ++i){
35 |         printf("%3d, ", array[i]);   // was %3u: wrong printf conversion for signed int
36 |         if (i % newline == newline - 1){
37 |             printf("=====================\n");
38 |         }
39 |     }
40 |     printf("\n\t");
41 | }
42 |
--------------------------------------------------------------------------------
/utils/ptx_export.cuh:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #pragma once
7 |
8 | #include
"cuda.h" 9 | #include "cuda_runtime.h" 10 | #include "./macro.cuh" 11 | 12 | __forceinline__ __device__ uint32_t getClock(){ 13 | uint32_t clock; 14 | asm volatile( 15 | "mov.u32 %0, %%clock; \n\t" 16 | :"=r"(clock)::"memory" 17 | ); 18 | return clock; 19 | } 20 | 21 | __forceinline__ __device__ uint32_t getSmid(){ 22 | uint32_t smid; 23 | asm volatile( 24 | "mov.u32 %0, %%smid; \n\t" 25 | :"=r"(smid)::"memory" 26 | ); 27 | return smid; 28 | } 29 | 30 | __forceinline__ __device__ uint32_t getWarpid(){ 31 | uint32_t warpid; 32 | asm volatile( 33 | "mov.u32 %0, %%warpid; \n\t" 34 | :"=r"(warpid)::"memory" 35 | ); 36 | return warpid; 37 | } 38 | 39 | __forceinline__ __device__ uint32_t getLaneid(){ 40 | uint32_t laneid; 41 | asm volatile( 42 | "mov.u32 %0, %%laneid; \n\t" 43 | :"=r"(laneid)::"memory" 44 | ); 45 | return laneid; 46 | } 47 | 48 | __forceinline__ __device__ void barSync(){ 49 | asm volatile( 50 | "bar.sync 0; \n\t" 51 | ); 52 | } 53 | 54 | __forceinline__ __device__ void ptxExit(){ 55 | asm volatile( 56 | "exit; \n\t" 57 | ); 58 | } 59 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | 3 | project(gpu-arch-microbenchmark 4 | LANGUAGES CXX CUDA) 5 | 6 | enable_language(CUDA) 7 | 8 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin/) 9 | 10 | set(TARGET_ARCH "-gencode arch=compute_80,code=sm_80 \ 11 | -gencode arch=compute_75,code=sm_75 \ 12 | -gencode arch=compute_70,code=sm_70") 13 | 14 | set(CMAKE_CUDA_FLAGS "${CMAKE_NVCC_FLAGS} ${TARGET_ARCH}") 15 | 16 | set(MICROBENCHMARK_SRC memory/memory_latency.cu 17 | memory/memory_bandwidth.cu 18 | memory/cache_linesize.cu 19 | memory/global_memory_bandwidth.cu 20 | miscellany/reg_bankconflict.cu 21 | miscellany/shared_bankconflict.cu 22 | schedule/warp_schedule.cu) 23 | 24 | 25 | message(STATUS ">>> GPU Microbenchmark") 26 | 27 
| foreach(benchmark ${MICROBENCHMARK_SRC}) 28 | get_filename_component(benchmark_exec ${benchmark} NAME_WE) 29 | message(STATUS "Benchmark: ${benchmark_exec}") 30 | add_executable(${benchmark_exec} ${benchmark}) 31 | target_include_directories(${benchmark_exec} PUBLIC ${PROJECT_SOURCE_DIR}/utils) 32 | target_link_libraries(${benchmark_exec} cuda) 33 | endforeach() 34 | 35 | message(STATUS "<<<") 36 | -------------------------------------------------------------------------------- /sass_cubin/reg_reuse_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | --:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2.reuse, v{:}, v0;" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = 4 + i * 2 33 | SASS_CODE += [REG_FFMA.format(reg)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2.reuse, v{:}, v0;" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = 4 + i * 2 55 | SASS_CODE += [REG_IADD3.format(reg)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- 
/sass_cubin/reg_reuse_double.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | --:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2.reuse, v1.reuse, v{:};" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = 4 + i * 2 33 | SASS_CODE += [REG_FFMA.format(reg)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2.reuse, v1.reuse, v{:};" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = 4 + i * 2 55 | SASS_CODE += [REG_IADD3.format(reg)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /sass_cubin/reg_without_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | 
--:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:4 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v{:}, v{:}, v0;" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = i * 2 33 | SASS_CODE += [REG_FFMA.format(reg, reg + 1)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 47 | --:-:-:-:4 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v{:}, v{:}, v0;" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = i * 2 55 | SASS_CODE += [REG_IADD3.format(reg, reg + 1)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /sass_cubin/reg_with_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | --:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2, v{:}, v0;" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = 4 + i * 2 33 | SASS_CODE += [REG_FFMA.format(reg)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 
47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2, v{:}, v0;" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = 4 + i * 2 55 | SASS_CODE += [REG_IADD3.format(reg)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | 66 | 67 | 68 | 69 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /compile_sass.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | def camel_to_snake(name): 6 | token_list = name.split("_") 7 | camel_name = "" 8 | for i, token in enumerate(token_list): 9 | if i == 0: 10 | camel_name += token 11 | else: 12 | camel_name += token.capitalize() 13 | return camel_name 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-arch", type=int, default=75) 19 | 20 | args = parser.parse_args() 21 | 22 | ARCH_LIST = [70, 75, 80] 23 | KERNEL_LIST = ["memory_latency", 24 | "memory_bandwidth_thread", "memory_bandwidth_block", 25 | "cache_linesize", 26 | "reg_reuse_double", "reg_reuse_bankconflict", "reg_with_bankconflict", "reg_without_bankconflict", 27 | "shared_bankconflict", 28 | "warp_schedule"] 29 | 30 | if args.arch not in ARCH_LIST: 31 | print("Unsupported Gpu Arch: ", args.arch) 32 | exit() 33 | 34 | print(">>>") 35 | for kernel in KERNEL_LIST: 36 | source_sass = f"{kernel}.sass" 37 | target_cubin = f"{kernel}.cubin" 38 | target_kernel = camel_to_snake(kernel) 39 | compile_command = f"python3 -m turingas.main -i ../sass_cubin/{source_sass} -o ../sass_cubin/{target_cubin} -arch {args.arch} -name {target_kernel}" 40 | 41 | print(f" compile kernel: {target_kernel}") 42 | os.system(compile_command) 43 | print("<<<") 44 | 
-------------------------------------------------------------------------------- /sass_cubin/warp_schedule.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | run_warp, 4 6 | 7 | 8 | 9 | 0: input_lo 10 | 1: input_hi 11 | 2: output_lo 12 | 3: output_hi 13 | 4: clock_lo 14 | 5: clock_hi 15 | 6-7: c1, c2 16 | 8-9: e1, e2 17 | 10-19 ~ clock_offset_lo, clock_offset_hi, tid, warpid, laneid, warp_offset, warpid32 18 | 20-150 ~ v<0-128> 19 | 20 | 21 | 22 | --:-:0:-:4 S2R tid, SR_TID.X; 23 | 24 | --:-:-:-:2 MOV input_lo, input[0]; 25 | --:-:-:-:2 MOV input_hi, input[1]; 26 | --:-:-:-:2 MOV clock_lo, clock[0]; 27 | --:-:-:-:2 MOV clock_hi, clock[1]; 28 | 29 | 01:-:-:-:4 SHF.R.S32.HI warpid, RZ, 0x5, tid; 30 | 31 | --:-:-:-:4 ISETP.NE.AND P0, PT, warpid, run_warp, PT; 32 | --:-:-:-:5 ISETP.EQ.OR P0, PT, warpid, RZ, !P0; 33 | 34 | --:-:-:-:5 @!P0 EXIT; 35 | 36 | --:-:-:-:4 SHF.L.S32.HI warpid32, RZ, 0x5, warpid; 37 | --:-:-:-:5 IADD3 laneid, tid, -warpid32, RZ; 38 | 39 | --:-:-:-:5 IMAD.WIDE clock_offset_lo, laneid, 0x4, clock_lo; 40 | --:-:-:-:5 ISETP.EQ.AND P1, PT, warpid, RZ, PT; 41 | --:-:-:-:5 @P1 IADD3 clock_offset_lo, clock_offset_lo, 0x80, RZ; 42 | --:-:-:-:4 MOV clock_offset_hi, clock_hi; 43 | 44 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 45 | 46 | REG_FFMA = "--:-:-:-:1 FFMA v0, v{:}, v{:}, v0;" 47 | 48 | SASS_CODE = [] 49 | for i in range(64): 50 | reg = i * 2 51 | SASS_CODE += [REG_FFMA.format(i, i + 1)] 52 | 53 | SASS_CODE += ["--:-:-:-:4 FFMA v0, v1, v2, v0;"] 54 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 55 | 56 | --:-:-:-:6 CS2R c2, SR_CLOCKLO; 57 | 58 | --:-:-:-:6 IADD3 e1, c2, -c1, RZ; 59 | --:-:-:-:4 STG.E.SYS [clock_offset_lo], e1; 60 | 61 | --:-:-:-:5 EXIT; 62 | -------------------------------------------------------------------------------- /sass_cubin/shared_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | 
output, 8
 4 | clock, 8
 5 |
 6 |
 7 |
 8 | 0: input_lo
 9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-9: c<1-4>
15 | 10-13: vA, vB, vC, vD
16 | 14-17: e1, e2, e3, e4
17 | 18-20: x1, x2, x3
18 | 24-27: v<0-3>
19 |
20 |
21 |
22 | --:-:-:-:2      MOV input_lo, input[0];
23 | --:-:-:-:2      MOV input_hi, input[1];
24 | --:-:-:-:2      MOV clock_lo, clock[0];
25 | --:-:-:-:4      MOV clock_hi, clock[1];
26 |
27 |
28 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
29 | --:-:0:-:2      LDS vA, [RZ+0x100];
30 | 01:-:-:-:4      CS2R c2, SR_CLOCKLO;
31 | --:-:-:-:5      IADD3 e1, c2, -c1, RZ;
32 | --:-:-:-:4      STG.E.SYS [clock_lo], e1;
33 |
34 |
35 |
36 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
37 | --:-:0:-:2      LDS.64 v0, [RZ];
38 | --:-:1:-:2      LDS.64 v2, [RZ + 0x8];
39 | 03:-:-:-:4      CS2R c2, SR_CLOCKLO;
40 | --:-:-:-:5      IADD3 e2, c2, -c1, RZ;
41 | --:-:-:-:4      STG.E.SYS [clock_lo + 0x4], e2;
42 |
43 |
44 |
45 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
46 | --:-:0:-:2      LDS vA, [RZ+0x0];
47 | --:-:1:-:2      LDS vB, [RZ+0x80];
48 | --:-:2:-:2      LDS vC, [RZ+0x100];
49 | --:-:3:-:2      LDS vD, [RZ+0x180];
50 | 15:-:-:-:4      CS2R c2, SR_CLOCKLO;
51 | --:-:-:-:5      IADD3 e3, c2, -c1, RZ;
52 | --:-:-:-:4      STG.E.SYS [clock_lo + 0x8], e3;
53 |
54 |
55 |
56 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
57 | --:-:0:-:2      LDS vA, [RZ+0x0];
58 | --:-:1:-:2      LDS vB, [RZ+0x84];
59 | --:-:2:-:2      LDS vC, [RZ+0x108];
60 | --:-:3:-:2      LDS vD, [RZ+0x18c];
61 | 15:-:-:-:4      CS2R c2, SR_CLOCKLO;
62 | --:-:-:-:5      IADD3 e4, c2, -c1, RZ;
63 | --:-:-:-:4      STG.E.SYS [clock_lo + 0xc], e4;
64 |
65 | --:-:-:-:2      EXIT;
66 |
--------------------------------------------------------------------------------
/schedule/block_schedule.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdio>   // NOTE(review): header names were stripped in this export; printf needs <cstdio> — confirm against upstream
 2 | #include <cstdint>  // uint32_t
 3 |
 4 | // Each thread streams 128 floats of A through the L1 cache (inline PTX ld.global.ca)
 5 | // and accumulates them so the loads cannot be dead-code-eliminated.
 6 | __global__ void block_workload(float *A, float *B){
 7 |     int tid = threadIdx.x;
 8 |     // NOTE(review): removed a call to undefined get_global_warpid(); its result was never used
 9 |
10 |     float dummy = 0;
11 |     float vA[4], vB[4], vC[4], vD[4];
12 |     float *ptr;
13 |     ptrdiff_t offset = 0;
14 |
15 |     #pragma unroll
16 |     for (int i = 0; i < 32; ++i){
17 |         offset = i * 4;
18 |         ptr = A + offset;
19 |
20 |         asm volatile(
21 |             "ld.global.ca.f32 %0, [%4]; \n\t"
22 |             "ld.global.ca.f32 %1, [%4+4]; \n\t"
23 |             "ld.global.ca.f32 %2, [%4+8]; \n\t"
24 |             "ld.global.ca.f32 %3, [%4+12]; \n\t"
25 |             :"=f"(vA[0]),"=f"(vB[0]),"=f"(vC[0]),"=f"(vD[0])
26 |             :"l"(ptr):"memory"
27 |         );
28 |         dummy += vA[0];
29 |         dummy += vB[0];
30 |         dummy += vC[0];
31 |         dummy += vD[0];
32 |     }
33 |     B[tid] = dummy;
34 | }
35 |
36 | int main() {
37 |     size_t width = 512;
38 |     size_t bytes = 4 * width;  // sizeof(float) * width
39 |
40 |     // Single block dim: the original declared `dim3 bDim1` twice (32 then 128), which does not compile.
41 |     dim3 bDim(128);
42 |     dim3 gDim(80);
43 |
44 |     float *A;
45 |     float *B;
46 |     uint32_t *cost;
47 |
48 |     cudaMallocManaged(&A, bytes);
49 |     cudaMallocManaged(&B, bytes);
50 |     cudaMallocManaged(&cost, bytes);
51 |
52 |     for (int i = 0; i < width; ++i) {
53 |         A[i] = i;  // was h_A[i]: h_A was never declared; A is managed memory, host-writable directly
54 |     }
55 |
56 |     float totalElapsed;
57 |     cudaEvent_t start_t, stop_t;
58 |     cudaEventCreate(&start_t);
59 |     cudaEventCreate(&stop_t);
60 |
61 |     // Managed memory: no explicit host->device cudaMemcpy is required before the launch.
62 |     cudaEventRecord(start_t, 0);
63 |
64 |     block_workload<<<gDim, bDim>>>(A, B);  // was `warp_workload<0, 1><<>>(d_A, d_B)`: wrong kernel name, mangled launch syntax, undeclared pointers
65 |     printf("%s\n", cudaGetErrorString(cudaGetLastError()));  // never pass a non-literal as the printf format string
66 |
67 |     cudaEventRecord(stop_t, 0);
68 |     cudaEventSynchronize(stop_t);
69 |     // B is managed and the event sync above already ordered the kernel; the old cudaMemcpy(h_B, d_B, ...) used undeclared pointers.
70 |     cudaEventElapsedTime(&totalElapsed, start_t, stop_t);
71 |     printf("\nHost Time Elapsed %f ms", totalElapsed);
72 | }
73 |
--------------------------------------------------------------------------------
/sass_cubin/memory_latency.sass:
--------------------------------------------------------------------------------
 1 |
 2 | input, 8
 3 | output, 8
 4 | clock, 8
 5 |
 6 |
 7 |
 8 | 0: input_lo
 9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-9: c<1-4>
15 | 10-13: vA, vB, vC, vD
16 | 14-17: e1, e2, e3, e4
17 | 18-20: x1, x2, x3
18 | 21-25 ~ tid, warpid, tid32
19 | 32-35 ~ a0, a1, a2, a3
20 | 36-42 ~ smem1,
smem2, smem3, e_s1, e_s2 21 | 43-63 ~ c<5-10> 22 | 64-79 ~ e<5-10> 23 | 24 | 25 | 26 | const_a, 8 27 | 28 | 29 | --:-:-:-:2 MOV input_lo, input[0]; 30 | --:-:-:-:2 MOV input_hi, input[1]; 31 | --:-:-:-:2 MOV clock_lo, clock[0]; 32 | --:-:-:-:4 MOV clock_hi, clock[1]; 33 | 34 | 35 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 36 | --:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo]; 37 | 01:-:-:-:2 CS2R c2, SR_CLOCKLO; 38 | --:-:0:-:2 LDG.E.STRONG.CTA vB, [input_lo+0x4]; 39 | 01:-:-:-:2 CS2R c3, SR_CLOCKLO; 40 | --:-:0:-:2 LDG.E.STRONG.CTA vC, [input_lo+0x8]; 41 | 01:-:-:-:2 CS2R c4, SR_CLOCKLO; 42 | --:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+0x10000]; 43 | 01:-:-:-:2 CS2R c5, SR_CLOCKLO; 44 | 45 | --:-:-:-:4 IADD3 e1, c2, -c1, RZ; 46 | --:-:-:-:4 IADD3 e2, c3, -c2, RZ; 47 | --:-:-:-:4 IADD3 e3, c4, -c3, RZ; 48 | --:-:-:-:4 IADD3 e4, c5, -c4, RZ; 49 | 50 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 51 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2; 52 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3; 53 | --:-:-:-:4 STG.E.SYS [clock_lo+0x1c], e4; 54 | 55 | --:-:-:-:6 NOP; 56 | 57 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 58 | --:-:1:-:2 LDC.E x1, const_a[0]; 59 | 02:-:-:-:2 CS2R c2, SR_CLOCKLO; 60 | --:-:1:-:2 MOV x2, const_a[1]; 61 | 02:-:-:-:2 CS2R c3, SR_CLOCKLO; 62 | --:-:1:-:2 MOV x3, const_a[2]; 63 | 02:-:-:-:4 CS2R c4, SR_CLOCKLO; 64 | 65 | --:-:-:-:4 IADD3 e1, c2, -c1, RZ; 66 | --:-:-:-:4 IADD3 e2, c3, -c2, RZ; 67 | --:-:-:-:4 IADD3 e3, c4, -c3, RZ; 68 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e1; 69 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e2; 70 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e3; 71 | 72 | --:-:-:-:6 NOP; 73 | 74 | --:-:-:-:2 CS2R smem1, SR_CLOCKLO; 75 | --:-:0:-:2 LDS x1, [RZ+0x0]; 76 | 01:-:-:-:4 CS2R smem2, SR_CLOCKLO; 77 | 78 | --:-:-:-:5 IADD3 e_s1, smem2, -smem1, RZ; 79 | --:-:-:-:4 STG.E.SYS [clock_lo+0x18], e_s1; 80 | 81 | --:-:-:-:2 EXIT; 82 | -------------------------------------------------------------------------------- /schedule/warp_schedule.cu: 
-------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 4 | // 5 | 6 | #include "cuda.h" 7 | #include "utils.cuh" 8 | 9 | 10 | __global__ void warpScheduleKernel(float* input, float* output, uint* clock, const int run_warp){ 11 | int tid = threadIdx.x; 12 | int laneid = tid & 0x1f; 13 | int warpid = tid >> 5; 14 | 15 | if (warpid != 0 and warpid != run_warp){ 16 | ptxExit(); 17 | } 18 | 19 | input += tid; 20 | clock += 32 * warpid / run_warp; 21 | 22 | 23 | float array[128]; 24 | float acc = 0; 25 | for (int i = 0; i < 128; ++i){ 26 | array[i] = input[i]; 27 | } 28 | 29 | uint c1 = getClock(); 30 | #pragma unroll 31 | for (int i = 0; i < 128; ++i){ 32 | acc += array[i] * array[i] + 1.0f; 33 | } 34 | uint c2 = getClock(); 35 | 36 | clock[laneid] = c2 - c1; 37 | output[laneid] = acc; 38 | } 39 | 40 | uint sumArray(uint* array, int size){ 41 | uint acc = 0; 42 | for (int i = 0; i < size; ++i){ 43 | acc += array[i]; 44 | } 45 | return acc; 46 | } 47 | 48 | 49 | int main(){ 50 | 51 | float* input_h; 52 | float* input_d; 53 | float* output_h; 54 | float* output_d; 55 | uint32_t* clock_h; 56 | uint32_t* clock_d; 57 | 58 | int size = 4096; 59 | 60 | input_h = static_cast(malloc(sizeof(float) * size)); 61 | output_h = static_cast(malloc(sizeof(float) * size)); 62 | clock_h = static_cast(malloc(sizeof(uint32_t) * size)); 63 | 64 | 65 | cudaMalloc(&input_d, sizeof(float) * size); 66 | cudaMalloc(&output_d, sizeof(float) * size); 67 | cudaMalloc(&clock_d, sizeof(uint32_t) * size); 68 | 69 | cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice); 70 | 71 | 72 | dim3 gDim(1, 1, 1); 73 | dim3 bDim(256, 1, 1); 74 | 75 | const char* cubin_name = "../sass_cubin/warp_schedule.cubin"; 76 | const char* kernel_name = "warpSchedule"; 77 | 78 | printf(">>> SASS Level Warp Scedule Detect\n"); 79 | for (int i = 1; i < 8; ++i){ 80 | void* kernel_args[4] = {&input_d, &output_d, &clock_d, &i}; 81 | cudaMemset(clock_d, 0, 
sizeof(uint) * size); 82 | launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args); 83 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 84 | 85 | printf(" Run Warp <0, %d> Elapsed \t%6u cycle\n", i, sumArray(clock_h, 64)); 86 | cudaDeviceSynchronize(); 87 | } 88 | 89 | 90 | 91 | printf("\n"); 92 | printf(">>> CUDA-C Level Warp Schedule Detect\n"); 93 | for (int i = 1; i < 8; ++i){ 94 | cudaMemset(clock_d, 0, sizeof(uint) * size); 95 | warpScheduleKernel<<>>(input_d, output_d, clock_d, i); 96 | cudaMemcpy(clock_h, clock_d, sizeof(float) * size, cudaMemcpyDeviceToHost); 97 | 98 | printf(" Run Warp <0, %d> Elapsed \t%6u cycle\n", i, sumArray(clock_h, 64)); 99 | cudaDeviceSynchronize(); 100 | } 101 | 102 | return 0; 103 | } -------------------------------------------------------------------------------- /miscellany/reg_bankconflict.cu: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 4 | // 5 | 6 | #include "cuda.h" 7 | #include "utils.cuh" 8 | 9 | 10 | int main(){ 11 | 12 | float* input_h; 13 | float* input_d; 14 | float* output_h; 15 | float* output_d; 16 | uint32_t* clock_h; 17 | uint32_t* clock_d; 18 | 19 | int size = 1024; 20 | 21 | input_h = static_cast(malloc(sizeof(float) * size)); 22 | output_h = static_cast(malloc(sizeof(float) * size)); 23 | clock_h = static_cast(malloc(sizeof(uint32_t) * size)); 24 | 25 | 26 | cudaMalloc(&input_d, sizeof(float) * size); 27 | cudaMalloc(&output_d, sizeof(float) * size); 28 | cudaMalloc(&clock_d, sizeof(uint32_t) * size); 29 | 30 | cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice); 31 | 32 | 33 | dim3 gDim(1, 1, 1); 34 | dim3 bDim(1, 1, 1); 35 | 36 | void* kernel_args[] = {&input_d, &output_d, &clock_d}; 37 | 38 | 39 | const char* cubin_name1 = "../sass_cubin/reg_with_bankconflict.cubin"; 40 | const char* kernel_name1 = "regWithBankconflict"; 41 | launchSassKernel(cubin_name1, kernel_name1, gDim, bDim, 0, 
kernel_args); 42 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 43 | cudaDeviceSynchronize(); 44 | printf(">>> SASS Level Reg With BankConflict IPC Result\n"); 45 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 46 | printf(" IADD3 per \t%.3f cycle\n", static_cast(clock_h[1]) / 64); 47 | 48 | 49 | 50 | 51 | const char* cubin_name2 = "../sass_cubin/reg_without_bankconflict.cubin"; 52 | const char* kernel_name2 = "regWithoutBankconflict"; 53 | launchSassKernel(cubin_name2, kernel_name2, gDim, bDim, 0, kernel_args); 54 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 55 | cudaDeviceSynchronize(); 56 | printf("\n"); 57 | printf(">>> SASS Level Reg Without BankConflict IPC Result\n"); 58 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 59 | printf(" IADD3 per \t%.3f cycle\n", static_cast(clock_h[1]) / 64); 60 | 61 | 62 | 63 | const char* cubin_name3 = "../sass_cubin/reg_reuse_bankconflict.cubin"; 64 | const char* kernel_name3 = "regReuseBankconflict"; 65 | launchSassKernel(cubin_name3, kernel_name3, gDim, bDim, 0, kernel_args); 66 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 67 | cudaDeviceSynchronize(); 68 | printf("\n"); 69 | printf(">>> SASS Level Reg Reuse BankConflict IPC Result\n"); 70 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 71 | printf(" IADD3 per \t%.3f cycle\n", static_cast(clock_h[1]) / 64); 72 | 73 | 74 | 75 | const char* cubin_name4 = "../sass_cubin/reg_reuse_double.cubin"; 76 | const char* kernel_name4 = "regReuseDouble"; 77 | launchSassKernel(cubin_name4, kernel_name4, gDim, bDim, 0, kernel_args); 78 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 79 | cudaDeviceSynchronize(); 80 | printf("\n"); 81 | printf(">>> SASS Level Reg Reuse Double IPC Result\n"); 82 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 83 | printf(" IADD3 per \t%.3f cycle\n", 
static_cast(clock_h[1]) / 64); 84 | 85 | return 0; 86 | } -------------------------------------------------------------------------------- /sass_cubin/cache_linesize.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6: vA 15 | 7: e 16 | 8-240 ~ c<0-200> 17 | 18 | 19 | 20 | 21 | const_a, 1024 22 | 23 | 24 | 25 | 26 | --:-:-:-:2 MOV input_lo, input[0]; 27 | --:-:-:-:2 MOV input_hi, input[1]; 28 | --:-:-:-:2 MOV output_lo, output[0]; 29 | --:-:-:-:2 MOV output_hi, output[1]; 30 | --:-:-:-:2 MOV clock_lo, clock[0]; 31 | --:-:-:-:4 MOV clock_hi, clock[1]; 32 | 33 | 34 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 35 | 36 | SASS_CODE = [] 37 | loop_size = 200 38 | 39 | LDG = "--:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo+{:}];" 40 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 41 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 42 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 43 | 44 | for i in range(loop_size): 45 | SASS_CODE += [LDG.format(hex(i * 4))] 46 | SASS_CODE += [CS2R.format(i+1)] 47 | 48 | for i in range(loop_size): 49 | SASS_CODE += [IADD.format(i+1, i)] 50 | SASS_CODE += [STG.format(hex(i*4), i)] 51 | 52 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ; 61 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 62 | 63 | 64 | SASS_CODE = [] 65 | loop_size = 200 66 | 67 | LDG = "--:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+{:}];" 68 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 69 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 70 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 71 | 72 | for i in range(loop_size): 73 | SASS_CODE += [LDG.format(hex(i * 4))] 74 | SASS_CODE += [CS2R.format(i+1)] 75 | 76 | for i in range(loop_size): 77 | SASS_CODE += [IADD.format(i+1, i)] 78 | SASS_CODE += [STG.format(hex(i*4), i)] 79 | 80 | 
out_ = "\n" + "\n".join(SASS_CODE) + "\n" 81 | 82 | 83 | 84 | 85 | 86 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ; 87 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 88 | 89 | SASS_CODE = [] 90 | loop_size = 200 91 | 92 | LDC = "--:-:0:-:2 LDC.E vA, const_a[{:}];" 93 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 94 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 95 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 96 | 97 | for i in range(loop_size): 98 | SASS_CODE += [LDC.format(i)] 99 | SASS_CODE += [CS2R.format(i+1)] 100 | 101 | for i in range(loop_size): 102 | SASS_CODE += [IADD.format(i+1, i)] 103 | SASS_CODE += [STG.format(hex(i*4), i)] 104 | 105 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 106 | 107 | 108 | 109 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ; 110 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 111 | 112 | SASS_CODE = [] 113 | loop_size = 200 114 | 115 | LDC = "--:-:0:-:2 MOV vA, const_a[{:}];" 116 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 117 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 118 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 119 | 120 | for i in range(loop_size): 121 | SASS_CODE += [LDC.format(i)] 122 | SASS_CODE += [CS2R.format(i+1)] 123 | 124 | for i in range(loop_size): 125 | SASS_CODE += [IADD.format(i+1, i)] 126 | SASS_CODE += [STG.format(hex(i*4), i)] 127 | 128 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 129 | 130 | 131 | --:-:-:-:2 EXIT; 132 | -------------------------------------------------------------------------------- /sass_cubin/memory_bandwidth_thread.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | clock, 8 4 | 5 | 6 | 7 | 0: input_lo 8 | 1: input_hi 9 | 4: clock_lo 10 | 5: clock_hi 11 | 8-11: v<1-4> 12 | 12-30 ~ c<1-12> 13 | 31-40 ~ e<1-6> 14 | 15 | 16 | --:-:-:-:2 MOV input_lo, input[0]; 17 | --:-:-:-:2 MOV input_hi, input[1]; 18 | --:-:-:-:2 MOV clock_lo, clock[0]; 19 | --:-:-:-:4 MOV clock_hi, clock[1]; 20 | 21 | --:-:3:-:1 LDG.E.128.STRONG.CTA v1, [input_lo]; // 
warmup 22 | 23 | ######################################################################################### 24 | 25 | 08:-:-:-:3 CS2R c1, SR_CLOCKLO; 26 | 27 | LDG_128_to_reg = "--:-:0:-:1 LDG.E.128.STRONG.CTA v1, [input_lo+{:}];" 28 | 29 | SASS_CODE = [] 30 | for i in range(1024): 31 | pos = hex(i * 16) 32 | SASS_CODE += [LDG_128_to_reg.format(pos)] 33 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 34 | 35 | 01:-:-:-:6 CS2R c2, SR_CLOCKLO; 36 | 37 | --:-:-:-:3 CS2R c3, SR_CLOCKLO; 38 | 39 | LDS_128_to_reg = "--:-:1:-:1 LDS.128 v1, [RZ+{:}];" 40 | 41 | SASS_CODE = [] 42 | for i in range(256): 43 | pos = hex(i * 16) 44 | SASS_CODE += [LDS_128_to_reg.format(pos)] 45 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 46 | 47 | 02:-:-:-:6 CS2R c4, SR_CLOCKLO; 48 | 49 | ######################################################################################### 50 | 51 | --:-:-:-:3 CS2R c5, SR_CLOCKLO; 52 | 53 | LDG_64_to_reg = "--:-:0:-:1 LDG.E.64.STRONG.CTA v1, [input_lo+{:}];" 54 | 55 | SASS_CODE = [] 56 | for i in range(1024 * 2): 57 | pos = hex(i * 8) 58 | SASS_CODE += [LDG_64_to_reg.format(pos)] 59 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 60 | 61 | 01:-:-:-:6 CS2R c6, SR_CLOCKLO; 62 | 63 | --:-:-:-:3 CS2R c7, SR_CLOCKLO; 64 | 65 | LDS_64_to_reg = "--:-:1:-:1 LDS.64 v1, [RZ+{:}];" 66 | 67 | SASS_CODE = [] 68 | for i in range(256 * 2): 69 | pos = hex(i * 8) 70 | SASS_CODE += [LDS_64_to_reg.format(pos)] 71 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 72 | 73 | 02:-:-:-:6 CS2R c8, SR_CLOCKLO; 74 | 75 | ######################################################################################### 76 | 77 | --:-:-:-:3 CS2R c9, SR_CLOCKLO; 78 | 79 | LDG_32_to_reg = "--:-:0:-:1 LDG.E.STRONG.CTA v1, [input_lo+{:}];" 80 | 81 | SASS_CODE = [] 82 | for i in range(1024 * 4): 83 | pos = hex(i * 4) 84 | SASS_CODE += [LDG_32_to_reg.format(pos)] 85 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 86 | 87 | 01:-:-:-:6 CS2R c10, SR_CLOCKLO; 88 | 89 | --:-:-:-:3 CS2R c11, SR_CLOCKLO; 90 | 91 | 
// Detects cache line size by timing 256 dependent global loads with %clock.
// Pass 1 (ld.global.cg, stride 2 floats = 8 B): bypasses L1 so every latency
// jump marks an L2 line boundary. Pass 2 (ld.global.ca, stride 1 float = 4 B):
// hits L1 so jumps mark L1 line boundaries. Per-load cycle deltas go to
// clock[0..255] and clock[512..767]; `acc` is stored to output[] only to keep
// the loads from being dead-code eliminated.
// Launch: <<<1, 1>>>. `cinput` is unused here (kept for a uniform signature).
__global__ void linesizeDetectKernel(float* input, float* output, uint* clock, float* cinput){

    // BUG FIX: was `uint c[256]`, but the loops write c[i+1] for i = 0..255,
    // i.e. c[256] — one element past the end. 257 slots are required.
    uint c[257];
    float val = 0;

    float acc = 0;
    c[0] = getClock();
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        // .cg: cache global — skip L1, measure L2-line behaviour
        asm volatile(
            "ld.global.cg.b32 %0, [%1]; \n\t"
            :"=f"(val):"l"(input):"memory"
        );
        c[i+1] = getClock();
        acc += val;
        input += 2;   // 8-byte stride between timed loads
    }
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        clock[i] = c[i+1] - c[i];
    }
    output[0] = acc;

    /////////////////////////////////////////////////////////////////////////

    input += 1024;    // move to a fresh, untouched region
    clock += 512;     // second result window
    acc = 0;
    c[0] = getClock();
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        // .ca: cache all levels — measure L1-line behaviour
        asm volatile(
            "ld.global.ca.f32 %0, [%1]; \n\t"
            :"=f"(val):"l"(input):"memory"
        );
        c[i+1] = getClock();
        acc += val;
        input++;      // 4-byte stride between timed loads
    }
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        clock[i] = c[i+1] - c[i];
    }
    output[1] = acc;
}
// Infers a cache line size from a per-load latency trace.
//
// clock[i] is the cycle cost of the i-th sequential load; a load that starts a
// new cache line shows a latency jump of more than `gap` cycles over its
// predecessor. The index distance between the first two jumps, scaled by the
// 4-byte load stride, is the line size in bytes.
//
// NOTE(review): the *4 assumes consecutive timed loads are 4 bytes apart; the
// first pass of linesizeDetectKernel steps 8 bytes (input += 2) — confirm the
// caller compensates.
//
// Returns 0 when fewer than two jumps are found (the original returned a
// meaningless negative value in that case).
int detectCacheLinesize(uint* clock, int size, uint gap){
    uint last_cycle = clock[0];

    int first = 0;
    int second = 0;

    // formatArray(clock, 256, 16);
    for (int i = 1; i < size; ++i){
        // first condition guards the unsigned subtraction against wrap-around
        if (clock[i] > last_cycle && clock[i] - last_cycle > gap) {
            if (first == 0){
                first = i;
            } else {
                second = i;
                break;
            }
        }
        last_cycle = clock[i];
    }
    if (second == 0) {
        return 0;   // fewer than two latency jumps observed — no estimate
    }
    return (second - first) * 4;   // loads are sizeof(float) apart
}
// Entry point: runs the SASS-level cubin probe and the CUDA-C kernel, then
// derives line sizes from the recorded per-load latency traces.
int main(){
    float* input_h;
    float* input_d;
    float* output_h;
    float* output_d;
    uint* clock_h;
    uint* clock_d;

    int size = 4096;

    // input contents are irrelevant — only load latency is measured
    input_h = static_cast<float*>(malloc(sizeof(float) * size));
    output_h = static_cast<float*>(malloc(sizeof(float) * size));
    clock_h = static_cast<uint*>(malloc(sizeof(uint) * size));

    cudaMalloc(&input_d, sizeof(float) * size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint) * size);

    cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);

    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);

    // BUG FIX: the original passed `&cinput` — the host-side address of a
    // __constant__ symbol, which is not a valid device pointer. Resolve the
    // device address of the symbol instead.
    float* cinput_d = nullptr;
    cudaGetSymbolAddress(reinterpret_cast<void**>(&cinput_d), cinput);

    void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput_d};
    const char* cubin_name = "../sass_cubin/cache_linesize.cubin";
    const char* kernel_name = "cacheLinesize";

    launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args);
    cudaDeviceSynchronize();   // ensure the kernel finished before reading results
    // BUG FIX: was sizeof(float) for a uint array (same width, but misleading)
    cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
    printf(">>> SASS Level Cache Linesize Result\n");
    // %3d: detectCacheLinesize returns int (was printed with %3u)
    printf("    Global   L2 LineSize \t= %3d B\n", detectCacheLinesize(clock_h, 512, 40));
    printf("    Global   L1 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 512, 512, 10));
    printf("    Constant L2 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 1024, 512, 100));
    printf("    Constant L1 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 1536, 512, 10));


    linesizeDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput_d);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
    printf("\n");
    printf(">>> CUDA-C Level Cache Linesize Result\n");
    printf("    Global   L2 LineSize \t= %3d B\n", detectCacheLinesize(clock_h, 512, 40));
    printf("    Global   L1 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 512, 512, 10));

    // release resources (the original leaked everything)
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(input_h);
    free(output_h);
    free(clock_h);
    return 0;
}
######################################################################################### 38 | 39 | 08:-:-:-:3 CS2R c1, SR_CLOCKLO; 40 | 41 | LDG_128_to_reg = "--:-:0:-:1 LDG.E.128.STRONG.CTA v1, [input_lo_x_4+{:}];" 42 | 43 | SASS_CODE = [] 44 | for i in range(128): 45 | pos = hex(i * 16 * 256) 46 | SASS_CODE += [LDG_128_to_reg.format(pos)] 47 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 48 | 49 | 50 | 01:-:-:-:6 CS2R c2, SR_CLOCKLO; 51 | 52 | --:-:-:-:3 CS2R c3, SR_CLOCKLO; 53 | 54 | 55 | LDS_128_to_reg = "--:-:1:-:1 LDS.128 v1, [tid_x_4+{:}];" 56 | 57 | SASS_CODE = [] 58 | for i in range(8): 59 | pos = hex(i * 16 * 256) 60 | SASS_CODE += [LDS_128_to_reg.format(pos)] 61 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 62 | 63 | 02:-:-:-:6 CS2R c4, SR_CLOCKLO; 64 | 65 | ######################################################################################### 66 | 67 | --:-:-:-:3 CS2R c5, SR_CLOCKLO; 68 | 69 | LDG_64_to_reg = "--:-:0:-:1 LDG.E.64.STRONG.CTA v1, [input_lo_x_2+{:}];" 70 | 71 | SASS_CODE = [] 72 | for i in range(256): 73 | pos = hex(i * 8 * 256) 74 | SASS_CODE += [LDG_64_to_reg.format(pos)] 75 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 76 | 77 | 78 | 01:-:-:-:6 CS2R c6, SR_CLOCKLO; 79 | 80 | --:-:-:-:3 CS2R c7, SR_CLOCKLO; 81 | 82 | 83 | LDS_64_to_reg = "--:-:1:-:1 LDS.64 v1, [tid_x_2+{:}];" 84 | 85 | SASS_CODE = [] 86 | for i in range(16): 87 | pos = hex(i * 8 * 256) 88 | SASS_CODE += [LDS_64_to_reg.format(pos)] 89 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 90 | 91 | 02:-:-:-:6 CS2R c8, SR_CLOCKLO; 92 | 93 | ######################################################################################### 94 | 95 | --:-:-:-:3 CS2R c9, SR_CLOCKLO; 96 | 97 | LDG_32_to_reg = "--:-:0:-:1 LDG.E.STRONG.CTA v1, [input_lo+{:}];" 98 | 99 | SASS_CODE = [] 100 | for i in range(512): 101 | pos = hex(i * 4 * 256) 102 | SASS_CODE += [LDG_32_to_reg.format(pos)] 103 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 104 | 105 | 106 | 01:-:-:-:6 CS2R c10, SR_CLOCKLO; 107 | 108 | 
--:-:-:-:3 CS2R c11, SR_CLOCKLO; 109 | 110 | 111 | LDS_32_to_reg = "--:-:1:-:1 LDS v1, [tid+{:}];" 112 | 113 | SASS_CODE = [] 114 | for i in range(32): 115 | pos = hex(i * 4 * 256) 116 | SASS_CODE += [LDS_32_to_reg.format(pos)] 117 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 118 | 119 | 02:-:-:-:6 CS2R c12, SR_CLOCKLO; 120 | 121 | ######################################################################################### 122 | 123 | --:-:-:-:6 IMAD.WIDE clock_lo, tid, 0x6, clock_lo; 124 | 125 | --:-:-:-:2 IADD3 e1, c2, -c1, RZ; 126 | --:-:-:-:2 IADD3 e2, c4, -c3, RZ; 127 | --:-:-:-:2 IADD3 e3, c6, -c5, RZ; 128 | --:-:-:-:2 IADD3 e4, c8, -c7, RZ; 129 | --:-:-:-:2 IADD3 e5, c10, -c9, RZ; 130 | --:-:-:-:2 IADD3 e6, c12, -c11, RZ; 131 | 132 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 133 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2; 134 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3; 135 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e4; 136 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e5; 137 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e6; 138 | 139 | --:-:-:-:2 EXIT; 140 | 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPU Arch Microbenchmark 2 | 3 | 4 | ## Prerequisites 5 | 1. install `turingas` compiler 6 | > `git clone --recursive git@github.com:sjfeng1999/gpu-arch-microbenchmark.git` 7 | > `cd turingas` 8 | > `python setup.py install` 9 | 10 | ## Usage 11 | 1. `mkdir build && cd build` 12 | 2. `cmake .. && make` 13 | 3. `python ../compile_sass.py -arch=(70|75|80)` 14 | 4. `./(memory_latency|reg_bankconflict|...)` 15 | 16 | ## Microbenchmark 17 | 18 | ### 1. 
Memory Latency 19 | 20 | |Device |Latency |Turing RTX-2070 (TU104)| 21 | |:--------------------------:|:---------:|:---------------------:| 22 | |Global Latency |cycle | 1000 ~ 1200 | 23 | |TLB Latency |cycle | 472 | 24 | |L2 Latency |cycle | 236 | 25 | |L1 Latency |cycle | 32 | 26 | |Shared Latency |cycle | 23 | 27 | |Constant Latency |cycle | 448 | 28 | |Constant L2 Latency |cycle | 62 | 29 | |Constant L1 Latency |cycle | 4 | 30 | 31 | - const L1-cache is as fast as register. 32 | 33 | ### 2. Memory Bandwidth 34 | 35 | 1. memory bandwidth within one thread 36 | 37 | |Device | Bandwidth | Turing RTX-2070 | 38 | |:--------------:|:-----------:|:---------------:| 39 | |Global LDG.128 | GB/s |194.12 | 40 | |Global LDG.64 | GB/s |140.77 | 41 | |Global LDG.32 | GB/s |54.18 | 42 | |Shared LDS.128 | GB/s |152.96 | 43 | |Shared LDS.64 | GB/s |30.58 | 44 | |Shared LDS.32 | GB/s |13.32 | 45 | 46 | 1. global memory bandwidth within (64 block * 256 thread) 47 | 48 | |Device | Bandwidth | Turing RTX-2070 | 49 | |:--------------------------:|:-----------:|:---------------:| 50 | |LDG.32 | GB/s |246.65 | 51 | |LDG.32 Group1 Stride1 | GB/s |118.73(2X) | 52 | |LDG.32 Group2 Stride2 | GB/s |119.08(2X) | 53 | |LDG.32 Group4 Stride4 | GB/s |117.11(2X) | 54 | |LDG.32 Group8 Stride8 | GB/s |336.27 | 55 | |LDG.64 | GB/s |379.24 | 56 | |LDG.64 Group1 Stride1 | GB/s |126.40(2X) | 57 | |LDG.64 Group2 Stride2 | GB/s |124.51(2X) | 58 | |LDG.64 Group4 Stride4 | GB/s |398.84 | 59 | |LDG.64 Group8 Stride8 | GB/s |371.28 | 60 | |LDG.128 | GB/s |391.83 | 61 | |LDG.128 Group1 Stride1 | GB/s |125.25(2X) | 62 | |LDG.128 Group2 Stride2 | GB/s |402.55 | 63 | |LDG.128 Group4 Stride4 | GB/s |394.22 | 64 | |LDG.128 Group8 Stride8 | GB/s |396.10 | 65 | 66 | ### 3. 
Cache Linesize 67 | 68 | |Device | Linesize | Turing RTX-2070(TU104)| 69 | |:--------------------------:|:---------:|:---------------------:| 70 | |L2 Linesise |bytes | 64 | 71 | |L1 Linesize |bytes | 32 | 72 | |Constant L2 Linesise |bytes | 256 | 73 | |Constant L1 Linesize |bytes | 32 | 74 | 75 | ### 4. Reg Bankconflict 76 | 77 | | Instruction |CPI | conflict | without conflict | reg reuse | double reuse | 78 | |:-----------:|:-------:|:--------:|:----------------:|:---------:|:------------:| 79 | |FFMA | cycle | 3.516 | 2.969 | 2.938 | 2.938 | 80 | |IADD3 | cycle | 3.031 | 2.062 | 2.031 | 2.031 | 81 | 82 | 83 | ### 5. Shared Bankconflict 84 | 85 | | Memory Load | Latency | Turing RTX-2070 (TU104)| 86 | |:----------------------:|:---------:|:----------------------:| 87 | | Single | cycle | 23 | 88 | | Vector2 X 2 | cycle | 27 | 89 | | Conflict Strided | cycle | 41 | 90 | | Conlict-Free Strided | cycle | 32 | 91 | 92 | 93 | ## Instruction Efficiency 94 | 95 | 96 | ## Roadmap 97 | 98 | - [ ] warp schedule 99 | - [ ] L1/L2 cache n-way k-set 100 | 101 | # Citation 102 | - Jia, Zhe, et al. "Dissecting the NVIDIA volta GPU architecture via microbenchmarking." arXiv preprint arXiv:1804.06826 (2018). 103 | - Jia, Zhe, et al. "Dissecting the NVidia Turing T4 GPU via microbenchmarking." arXiv preprint arXiv:1903.07486 (2019). 104 | - Yan, Da, Wei Wang, and Xiaowen Chu. "Optimizing batched winograd convolution on GPUs." Proceedings of the 25th ACM SIGPLAN symposium on principles and practice of parallel programming. 2020. 
// Device memory clock used to convert cycle counts into time.
const float kMemoryFrequency_MHz = 5000.0f;    // 5000 MHz

// Converts an elapsed cycle count and transfer size into a bandwidth figure.
// cycles / MHz gives microseconds; bytes / 1024 gives KiB; KiB per us is
// reported as "GB/s" (binary/decimal mismatch of ~2.4% is accepted here).
float calculateBandWidth(uint elapsed_cycle, const int data_bytes) {
    float second_x_1024_x_1024 = static_cast<float>(elapsed_cycle) / kMemoryFrequency_MHz;
    float data_KBytes = static_cast<float>(data_bytes) / 1024;
    return data_KBytes / second_x_1024_x_1024;
}

// Averages `thread_group_size` samples taken every `stride_size` entries of
// `data` (each SASS kernel writes 6 interleaved counters per thread, so
// stride_size is 6 at the call sites). Restored the `template <typename T>`
// header and cast argument that were garbled in this copy of the file.
template <typename T>
uint getAvgElapsedCycle(int thread_group_size, int stride_size, T* data) {
    T acc = 0;
    for (int i = 0; i < thread_group_size; ++i) {
        acc += data[i * stride_size];
    }
    return static_cast<uint>(acc) / thread_group_size;
}
// Prints the six bandwidth rows for one measurement section.
// `group` is the number of per-thread counters to average (1 for the
// single-thread kernel, 256 for the block kernel); counters are interleaved
// with stride 6 in clock_h, slots: 0=LDG.128, 2=LDG.64, 4=LDG.32,
// 1=LDS.128, 3=LDS.64, 5=LDS.32.
static void printBandwidthSection(uint32_t* clock_h, int group,
                                  int global_load_bytes, int shared_load_bytes) {
    struct Row { const char* label; int slot; int bytes; };
    const Row rows[6] = {
        {"LDG.128", 0, global_load_bytes},
        {"LDG.64",  2, global_load_bytes},
        {"LDG.32",  4, global_load_bytes},
        {"LDS.128", 1, shared_load_bytes},
        {"LDS.64",  3, shared_load_bytes},
        {"LDS.32",  5, shared_load_bytes},
    };
    for (const Row& r : rows) {
        uint cycle = getAvgElapsedCycle(group, 6, clock_h + r.slot);
        // typo fix: "Elaped" -> "Elapsed"
        printf("    %-8s Elapsed Cycle \t=%8u cycle    Bandwidth =\t %5.2f GB/s\n",
               r.label, cycle, calculateBandWidth(cycle, r.bytes));
    }
}

// Entry point: runs the single-thread and whole-block SASS bandwidth kernels
// and reports per-width load bandwidth.
int main(){
    float* input_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int global_size = 4 * 1024 * 1024;
    int shared_size = 32 * 1024;

    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * global_size));

    cudaMalloc(&input_d, sizeof(float) * global_size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * global_size);

    void* kernel_args[] = {&input_d, &clock_d};

    dim3 gDim1(1, 1, 1);
    dim3 bDim1(1, 1, 1);
    int global_load_bytes = 512 * 1024;
    int shared_load_bytes = 32 * 1024;

    const char* cubin_name1 = "../sass_cubin/memory_bandwidth_thread.cubin";
    const char* kernel_name1 = "memoryBandwidthThread";
    launchSassKernel(cubin_name1, kernel_name1, gDim1, bDim1, shared_size, kernel_args);
    cudaDeviceSynchronize();   // finish before reading results
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * global_size, cudaMemcpyDeviceToHost);

    printf(">>> SASS Level Memory BandWidth Result\n");
    printf("    Global Memory Load %9d Bytes\n", global_load_bytes);
    printf("    Shared Memory Load %9d Bytes\n", shared_load_bytes);
    printf("    Within Thread Result\n");
    printBandwidthSection(clock_h, 1, global_load_bytes, shared_load_bytes);
    printf("\n");

    dim3 gDim2(1, 1, 1);
    dim3 bDim2(256, 1, 1);
    const char* cubin_name2 = "../sass_cubin/memory_bandwidth_block.cubin";
    const char* kernel_name2 = "memoryBandwidthBlock";
    launchSassKernel(cubin_name2, kernel_name2, gDim2, bDim2, shared_size, kernel_args);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * global_size, cudaMemcpyDeviceToHost);

    printf("    Thread Average Result Within Block\n");
    printBandwidthSection(clock_h, 256, global_load_bytes, shared_load_bytes);
    printf("\n");

    cudaFree(input_d);
    cudaFree(clock_d);
    free(clock_h);
    return 0;
}
// Measures shared-memory load latency with %clock for four access patterns:
//   clock[0]: one scalar load
//   clock[1]: four scalar loads 0x80 (= 32 words) apart — all map to the same
//             bank on a 32-bank/4-byte-stride shared memory, the "conflict"
//             case labelled by main()
//   clock[2]: four scalar loads offset by one extra word each (0x84, 0x108,
//             0x18c) — distinct banks, the "conflict-free" case
//   clock[3]: two v2 vector loads of adjacent words
// Loaded values are stored to `output` so the loads stay live.
// NOTE(review): operand %0 (`input`) is never referenced inside the asm —
// presumably kept only so the signature matches the SASS cubin variant.
// Launch: <<<1, 1>>> (single thread; per-warp conflict behaviour at the
// SASS level is covered by the cubin version).
__global__ void sharedBankconflictKernel(float* input, float* output, uint32_t* clock){

    asm volatile (
        ".reg.f32 val1, val2, val3, val4; \n\t"
        ".reg.u32 c_1, c_2; \n\t"
        ".reg.u32 e_1; \n\t"
        ".shared.b32 smem[1024]; \n\t"

        // --- pattern 1: single load, result -> clock[0] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem + 0x100]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"
        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2], e_1; \n\t"
        "st.global.f32 [%1] , val1; \n\t"

        //////////////////////////////////////////////////////////////////

        // --- pattern 2: stride 0x80 (same bank), result -> clock[1] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem]; \n\t"
        "ld.shared.f32 val2, [smem + 0x80]; \n\t"
        "ld.shared.f32 val3, [smem + 0x100]; \n\t"
        "ld.shared.f32 val4, [smem + 0x180]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0x4], e_1; \n\t"
        "st.global.f32 [%1 + 0x10], val1; \n\t"
        "st.global.f32 [%1 + 0x20], val2; \n\t"
        "st.global.f32 [%1 + 0x30], val3; \n\t"
        "st.global.f32 [%1 + 0x40], val4; \n\t"

        //////////////////////////////////////////////////////////////////

        // --- pattern 3: stride 0x84 (bank-staggered), result -> clock[2] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem]; \n\t"
        "ld.shared.f32 val2, [smem + 0x84]; \n\t"
        "ld.shared.f32 val3, [smem + 0x108]; \n\t"
        "ld.shared.f32 val4, [smem + 0x18c]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0x8], e_1; \n\t"
        "st.global.f32 [%1 + 0x44], val1; \n\t"
        "st.global.f32 [%1 + 0x14], val2; \n\t"
        "st.global.f32 [%1 + 0x24], val3; \n\t"
        "st.global.f32 [%1 + 0x34], val4; \n\t"

        //////////////////////////////////////////////////////////////////

        // --- pattern 4: two 64-bit vector loads, result -> clock[3] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.v2.f32 {val1, val2}, [smem]; \n\t"
        "ld.shared.v2.f32 {val3, val4}, [smem + 0x8]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0xc], e_1; \n\t"
        "st.global.f32 [%1 + 0x48] , val1; \n\t"
        "st.global.f32 [%1 + 0x18], val2; \n\t"
        "st.global.f32 [%1 + 0x28], val3; \n\t"
        "st.global.f32 [%1 + 0x38], val4; \n\t"

        //////////////////////////////////////////////////////////////////
        ::"l"(input),"l"(output),"l"(clock):"memory"
    );
}
// Entry point: runs the SASS cubin and CUDA-C shared-memory bank-conflict
// probes and prints the four measured latencies.
int main(){

    float* input_h;
    float* input_d;
    float* output_h;
    float* output_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int size = 1024;

    input_h = static_cast<float*>(malloc(sizeof(float) * size));
    output_h = static_cast<float*>(malloc(sizeof(float) * size));
    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));

    cudaMalloc(&input_d, sizeof(float) * size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * size);

    cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);

    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);

    void* kernel_args[] = {&input_d, &output_d, &clock_d};

    const char* cubin_name = "../sass_cubin/shared_bankconflict.cubin";
    const char* kernel_name = "sharedBankconflict";
    launchSassKernel(cubin_name, kernel_name, gDim, bDim, size * sizeof(float), kernel_args);
    cudaDeviceSynchronize();   // finish before reading results
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);

    printf(">>> SASS Level Shared Load BankConflict Result\n");
    printf("    Single          Load [0x100]                 Elapsed \t%3u cycle\n", clock_h[0]);
    printf("    Vector          Load [0x0, 0x4 , 0x8 , 0xc ] Elapsed \t%3u cycle\n", clock_h[1]);
    printf("    WithConflict    Load [0x0, 0x80, 0x100, 0x180] Elapsed \t%3u cycle\n", clock_h[2]);
    printf("    WithoutConflict Load [0x0, 0x84, 0x108, 0x18c] Elapsed \t%3u cycle\n", clock_h[3]);


    sharedBankconflictKernel<<<gDim, bDim>>>(input_d, output_d, clock_d);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);
    printf("\n");
    printf(">>> CUDA-C Level Shared Load BankConflict Result\n");
    // index mapping differs from above: the CUDA-C kernel writes the vector
    // pattern to slot 3, conflict to slot 1, conflict-free to slot 2
    printf("    Single          Load [0x100]                 Elapsed \t%3u cycle\n", clock_h[0]);
    printf("    Vector          Load [0x0, 0x4 , 0x8 , 0xc ] Elapsed \t%3u cycle\n", clock_h[3]);
    printf("    WithConflict    Load [0x0, 0x80, 0x100, 0x180] Elapsed \t%3u cycle\n", clock_h[1]);
    printf("    WithoutConflict Load [0x0, 0x84, 0x108, 0x18c] Elapsed \t%3u cycle\n", clock_h[2]);

    // release resources (the original leaked everything)
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(input_h);
    free(output_h);
    free(clock_h);
    return 0;
}
// Measures load latency for global, constant, and shared memory with %clock.
// The three-load chains exploit caching: the first load (.cg) misses and pays
// full memory latency; the second (.ca, +0x4, same line) is then served by L2;
// the third (+0x8) by L1 — matching the labels printed by main():
//   clock[0..2]: global memory / L2 / L1
//   clock[3..5]: constant memory / const L2 / const L1
//   clock[6]:    shared memory
// Loaded values are accumulated and stored so the loads stay live.
// NOTE(review): the `cinput` parameter shadows the file-scope __constant__
// array of the same name; the ld.const below uses the pointer argument.
// NOTE(review): `.cg`/`.ca` qualifiers on ld.const are unusual PTX — confirm
// the toolchain accepts them.
// Launch: <<<1, 1>>>.
__global__ void latencyDetectKernel(float* input, float* output, uint32_t* clock, float* cinput){

    // jump 512 MB into the (1500 MB) buffer so the first access is cold
    input += 1024 * 1024 * 1024 / sizeof(float) / 2;
    cinput += 512;

    asm volatile (
        ".reg.f32 val1, val2, val3; \n\t"
        ".reg.u32 c_1, c_2, c_3, c_4; \n\t"
        ".reg.u32 e_1, e_2, e_3; \n\t"
        ".reg.u32 e_4, e_5, e_6; \n\t"
        ".shared.b8 smem[32]; \n\t"


        // --- global chain: miss, then L2 hit, then L1 hit ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.global.cg.f32 val1, [%0]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"
        "ld.global.ca.f32 val2, [%0 + 0x4]; \n\t"
        "mov.u32 c_3, %%clock; \n\t"
        "ld.global.ca.f32 val3, [%0 + 0x8]; \n\t"
        "mov.u32 c_4, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "sub.u32 e_2, c_3, c_2; \n\t"
        "sub.u32 e_3, c_4, c_3; \n\t"

        // keep the loads live
        "add.f32 val1, val1, val2; \n\t"
        "add.f32 val1, val1, val3; \n\t"

        "st.global.u32 [%2], e_1; \n\t"
        "st.global.u32 [%2 + 0x4], e_2; \n\t"
        "st.global.u32 [%2 + 0x8], e_3; \n\t"

        "st.global.f32 [%1], val1; \n\t"
        "st.global.f32 [%1 + 0x4], val2; \n\t"
        "st.global.f32 [%1 + 0x8], val3; \n\t"

        ///////////////////////////////////////////////////////////////////

        // fence between experiments so timings do not overlap
        "bar.sync 0; \n\t"

        ///////////////////////////////////////////////////////////////////

        // --- constant chain: miss, const L2 hit, const L1 hit ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.const.cg.f32 val1, [%3]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"
        "ld.const.ca.f32 val2, [%3 + 0x4]; \n\t"
        "mov.u32 c_3, %%clock; \n\t"
        "ld.const.ca.f32 val3, [%3 + 0x8]; \n\t"
        "mov.u32 c_4, %%clock; \n\t"

        "sub.u32 e_4, c_2, c_1; \n\t"
        "sub.u32 e_5, c_3, c_2; \n\t"
        "sub.u32 e_6, c_4, c_3; \n\t"

        "add.f32 val1, val1, val2; \n\t"
        "add.f32 val1, val1, val3; \n\t"

        "st.global.u32 [%2 + 0xc], e_4; \n\t"
        "st.global.u32 [%2 + 0x10], e_5; \n\t"
        "st.global.u32 [%2 + 0x14], e_6; \n\t"

        "st.global.f32 [%1 + 0xc], val1; \n\t"
        "st.global.f32 [%1 + 0x10], val2; \n\t"
        "st.global.f32 [%1 + 0x14], val3; \n\t"

        /////////////////////////////////////////////////////////////////////////

        "bar.sync 0; \n\t"

        ///////////////////////////////////////////////////////////////////

        // --- shared memory: single load -> clock[6] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_4, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0x18], e_4; \n\t"
        "st.global.f32 [%1 + 0x18], val1; \n\t"

        ::"l"(input),"l"(output),"l"(clock),"l"(cinput):"memory"
    );

}
// Entry point: initializes the __constant__ test data, runs the SASS cubin
// and CUDA-C latency probes, and prints the per-level latencies.
// Note: clock_h[7] (TLB latency) is only written by the SASS kernel, which is
// why it is printed in the SASS section only.
int main(){
    float* input_d;
    float* output_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int size = 1024;
    int large_size = 1500 * 1024 * 1024;

    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));

    cudaMalloc(&input_d, large_size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * size);

    // BUG FIX: the original assigned `cinput[i] = i;` directly in host code.
    // `cinput` is a __constant__ *device* symbol — host writes must go through
    // cudaMemcpyToSymbol. Stage in a host buffer, then copy.
    float cinput_h[128];
    for (int i = 0; i < 128; ++i){
        cinput_h[i] = i;
    }
    cudaMemcpyToSymbol(cinput, cinput_h, sizeof(float) * 128);

    // BUG FIX: `&cinput` on the host is not a valid device pointer for a
    // kernel argument; resolve the symbol's device address instead.
    float* cinput_d = nullptr;
    cudaGetSymbolAddress(reinterpret_cast<void**>(&cinput_d), cinput);

    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);

    void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput_d};
    const char* cubin_name = "../sass_cubin/memory_latency.cubin";
    const char* kernel_name = "memoryLatency";

    launchSassKernel(cubin_name, kernel_name, gDim, bDim, size, kernel_args);
    cudaDeviceSynchronize();   // finish before reading results
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);

    printf(">>> SASS Level Memory Latency Result\n");
    printf("    Global   Memory   Latency \t= %4u cycle\n", clock_h[0]);
    printf("    Global   TLB      Latency \t= %4u cycle\n", clock_h[7]);
    printf("    Global   L2-Cache Latency \t= %4u cycle\n", clock_h[1]);
    printf("    Global   L1-Cache Latency \t= %4u cycle\n", clock_h[2]);
    printf("    Shared   Memory   Latency \t= %4u cycle\n", clock_h[6]);
    printf("    Constant Memory   Latency \t= %4u cycle\n", clock_h[3]);
    printf("    Constant L2-Cache Latency \t= %4u cycle\n", clock_h[4]);
    printf("    Constant L1-Cache Latency \t= %4u cycle\n", clock_h[5]);


    latencyDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput_d);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);
    printf("\n");
    printf(">>> CUDA-C Level Memory Latency Result\n");
    printf("    Global   Memory   Latency \t= %4u cycle\n", clock_h[0]);
    printf("    Global   L2-Cache Latency \t= %4u cycle\n", clock_h[1]);
    printf("    Global   L1-Cache Latency \t= %4u cycle\n", clock_h[2]);
    printf("    Shared   Memory   Latency \t= %4u cycle\n", clock_h[6]);
    printf("    Constant Memory   Latency \t= %4u cycle\n", clock_h[3]);
    printf("    Constant L2-Cache Latency \t= %4u cycle\n", clock_h[4]);
    printf("    Constant L1-Cache Latency \t= %4u cycle\n", clock_h[5]);

    // release resources (the original leaked everything)
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(clock_h);
    return 0;
}
// Launch/workload geometry shared by all copy kernels in this file.
constexpr int kGridDimX = 64;
constexpr int kBlockDimX = 256;
constexpr int kWarpCount = kBlockDimX / kWarpSize;
constexpr int kLoopSize = 4 * 1024;                 // 32-bit elements copied per thread
constexpr size_t kGlobalSize = 256 * 1024 * 1024;   // elements allocated per buffer
constexpr float kCopySize = (float)kLoopSize * kGridDimX * kBlockDimX * sizeof(float);

// Grid-wide strided copy using 32-bit accesses.
// GroupSize/StrideSize shape the intra-warp access pattern: each group of
// GroupSize lanes is shifted StrideSize extra elements, so <1,0> is fully
// coalesced and larger strides scatter the warp across more segments.
// (The `template <...>` header was garbled in this copy of the file; restored
// from the `copyGroup32bKernel<1, 0>` call sites in main.)
// Expects launch <<<kGridDimX, kBlockDimX>>>.
template <int GroupSize, int StrideSize>
__global__
void copyGroup32bKernel(float* input, float* output) {
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;   // grid-wide stride per iteration

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float* thread_input = input + offset;
    float* thread_output = output + offset;

    for (int i = 0; i < kLoopSize; ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}
// Grid-wide strided copy using 64-bit (float2) accesses; same group/stride
// pattern as copyGroup32bKernel but half the iterations since each access
// moves two elements. (`template <...>` header restored from the
// `copyGroup64bKernel<1, 0>` call sites in main.)
// Expects launch <<<kGridDimX, kBlockDimX>>>.
template <int GroupSize, int StrideSize>
__global__
void copyGroup64bKernel(float2* input, float2* output) {
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;   // grid-wide stride per iteration

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float2* thread_input = input + offset;
    float2* thread_output = output + offset;

    for (int i = 0; i < (kLoopSize / 2); ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}
// Grid-wide strided copy using 128-bit (float4) accesses; quarter of the
// iterations of the 32-bit variant. (`template <...>` header restored from
// the `copyGroup128bKernel<1, 0>` call sites in main.)
// Expects launch <<<kGridDimX, kBlockDimX>>>.
template <int GroupSize, int StrideSize>
__global__
void copyGroup128bKernel(float4* input, float4* output) {
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;   // grid-wide stride per iteration

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float4* thread_input = input + offset;
    float4* thread_output = output + offset;

    for (int i = 0; i < (kLoopSize / 4); ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}

// Times `fn` (a kernel-launching callable) with CUDA events and converts the
// elapsed milliseconds into the bandwidth figure printed by main():
// bytes / ms / 2^20 == MiB per ms, reported as "GB/s" (~2.4% off decimal GB/s).
// cudaDeviceSynchronize() guarantees the stop event has been recorded before
// cudaEventElapsedTime is read.
// (`template <typename Func>` header restored — it was garbled in this copy.)
template <typename Func>
float getElapsed(Func fn, cudaEvent_t start, cudaEvent_t stop) {
    float elapsed = 0;
    cudaEventRecord(start);
    fn();
    cudaEventRecord(stop);
    cudaDeviceSynchronize();
    cudaEventElapsedTime(&elapsed, start, stop);
    return kCopySize / elapsed / 1024 / 1024;
}
// Entry point: times each group/stride access pattern at 32/64/128-bit widths
// and prints the achieved copy bandwidth. Launch chevrons (<<<gDim, bDim>>>)
// were garbled in this copy of the file and are restored here.
int main() {
    float* input_d;
    float* output_d;

    cudaMalloc(&input_d, sizeof(float) * kGlobalSize);
    cudaMalloc(&output_d, sizeof(float) * kGlobalSize);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 gDim(kGridDimX);
    dim3 bDim(kBlockDimX);

    printf("    Different access pattern on Global Memory\n");

    auto fn1 = [=]() { copyGroup32bKernel<1, 0><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32       \t%.2f GB/s\n", getElapsed(fn1, start, stop));

    auto fn2 = [=]() { copyGroup32bKernel<1, 1><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g1s1  \t%.2f GB/s\n", getElapsed(fn2, start, stop));

    auto fn9 = [=]() { copyGroup32bKernel<2, 2><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g2s2  \t%.2f GB/s\n", getElapsed(fn9, start, stop));

    auto fn10 = [=]() { copyGroup32bKernel<4, 4><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g4s4  \t%.2f GB/s\n", getElapsed(fn10, start, stop));

    auto fn8 = [=]() { copyGroup32bKernel<8, 8><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g8s8  \t%.2f GB/s\n", getElapsed(fn8, start, stop));

    ///////////////////////////////////////////////////////////////////////////////////////////////////////

    auto fn3 = [=]() { copyGroup64bKernel<1, 0><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64       \t%.2f GB/s\n", getElapsed(fn3, start, stop));

    auto fn4 = [=]() { copyGroup64bKernel<1, 1><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g1s1  \t%.2f GB/s\n", getElapsed(fn4, start, stop));

    auto fn11 = [=]() { copyGroup64bKernel<2, 2><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g2s2  \t%.2f GB/s\n", getElapsed(fn11, start, stop));

    auto fn12 = [=]() { copyGroup64bKernel<4, 4><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g4s4  \t%.2f GB/s\n", getElapsed(fn12, start, stop));

    auto fn13 = [=]() { copyGroup64bKernel<8, 8><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g8s8  \t%.2f GB/s\n", getElapsed(fn13, start, stop));

    ///////////////////////////////////////////////////////////////////////////////////////////////////////

    auto fn5 = [=]() { copyGroup128bKernel<1, 0><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128      \t%.2f GB/s\n", getElapsed(fn5, start, stop));

    auto fn6 = [=]() { copyGroup128bKernel<1, 1><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g1s1 \t%.2f GB/s\n", getElapsed(fn6, start, stop));

    auto fn7 = [=]() { copyGroup128bKernel<2, 2><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g2s2 \t%.2f GB/s\n", getElapsed(fn7, start, stop));

    auto fn14 = [=]() { copyGroup128bKernel<4, 4><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g4s4 \t%.2f GB/s\n", getElapsed(fn14, start, stop));

    auto fn15 = [=]() { copyGroup128bKernel<8, 8><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g8s8 \t%.2f GB/s\n", getElapsed(fn15, start, stop));

    // release resources (events were previously leaked; main lacked a return)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}