├── .gitignore ├── .gitmodules ├── utils ├── macro.cuh ├── utils.cuh ├── sass_kernel.cuh ├── format_print.cuh └── ptx_export.cuh ├── CMakeLists.txt ├── sass_cubin ├── reg_reuse_bankconflict.sass ├── reg_reuse_double.sass ├── reg_without_bankconflict.sass ├── reg_with_bankconflict.sass ├── warp_schedule.sass ├── shared_bankconflict.sass ├── memory_latency.sass ├── cache_linesize.sass ├── memory_bandwidth_thread.sass └── memory_bandwidth_block.sass ├── compile_sass.py ├── schedule ├── block_schedule.cu └── warp_schedule.cu ├── miscellany ├── reg_bankconflict.cu └── shared_bankconflict.cu ├── memory ├── cache_linesize.cu ├── memory_bandwidth.cu ├── memory_latency.cu └── global_memory_bandwidth.cu └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .vscode/ 3 | 4 | build/ 5 | bin/ 6 | 7 | *.out 8 | *.cubin -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "turingas"] 2 | path = turingas 3 | url = git@github.com:daadaada/turingas.git 4 | -------------------------------------------------------------------------------- /utils/macro.cuh: -------------------------------------------------------------------------------- 1 | // 2 | // CUDA 3 | // Created by sjfeng 4 | // 5 | 6 | #pragma once 7 | 8 | #define UPPER_DIV(x, y) ((x + y - 1) / y) 9 | 10 | constexpr int kWarpSize = 32; 11 | -------------------------------------------------------------------------------- /utils/utils.cuh: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 4 | // 5 | 6 | #pragma once 7 | 8 | #include "./format_print.cuh" 9 | #include "./macro.cuh" 10 | #include "./ptx_export.cuh" 11 | #include "./sass_kernel.cuh" 12 | -------------------------------------------------------------------------------- /utils/sass_kernel.cuh: 
--------------------------------------------------------------------------------
 1 | //
 2 | //
 3 | //
 4 | //
 5 |
 6 | #pragma once
 7 |
 8 | #include "cuda.h"
 9 | #include "cuda_runtime.h"
10 |
11 | // Loads `kernel_name` from the cubin at `cubin_name` via the driver API and launches it
12 | // with the given grid/block dims, dynamic shared memory size, and argument array.
13 | // Returns the sticky runtime error state (cudaPeekAtLastError does not clear it).
14 | cudaError_t launchSassKernel(const char* cubin_name, const char* kernel_name, const dim3& gDim, const dim3& bDim, const int shared_bytes, void** args){
15 |     CUmodule module;
16 |     CUfunction kernel;
17 |
18 |     if (cuModuleLoad(&module, cubin_name) != CUDA_SUCCESS) return cudaErrorUnknown;            // bad path / bad cubin: fail loudly instead of launching garbage
19 |     if (cuModuleGetFunction(&kernel, module, kernel_name) != CUDA_SUCCESS) return cudaErrorUnknown;  // kernel name mismatch
20 |
21 |     // NOTE(review): module is never cuModuleUnload-ed; acceptable for these one-shot benchmarks.
22 |     cuLaunchKernel(kernel,
23 |                    gDim.x, gDim.y, gDim.z,
24 |                    bDim.x, bDim.y, bDim.z,
25 |                    shared_bytes,    // SharedMem Bytes
26 |                    0,               // Stream
27 |                    args, 0);
28 |
29 |     return cudaPeekAtLastError();
30 | }
31 |
--------------------------------------------------------------------------------
/utils/format_print.cuh:
--------------------------------------------------------------------------------
 1 | //
 2 | // C++
 3 | // Created by sjfeng
 4 | //
 5 | //
 6 | #pragma once
 7 |
 8 | #include "stdio.h"
 9 |
10 | // Print `size` floats, `newline` values per row.
11 | void formatArray(float* array, int size, int newline=10){
12 |     for (int i = 0; i < size; ++i){
13 |         printf("%.3f, ", array[i]);
14 |         if (i % newline == newline - 1){
15 |             printf("\n");
16 |         }
17 |     }
18 |     printf("\n\t");
19 | }
20 |
21 | // Print `size` unsigned ints; rows are separated by a "====" rule.
22 | void formatArray(uint* array, int size, int newline=10){
23 |     for (int i = 0; i < size; ++i){
24 |         printf("%3u, ", array[i]);
25 |         if (i % newline == newline - 1){
26 |             printf("=====================\n");
27 |         }
28 |     }
29 |     printf("\n\t");
30 | }
31 |
32 | // Print `size` signed ints; rows are separated by a "====" rule.
33 | void formatArray(int* array, int size, int newline=10){
34 |     for (int i = 0; i < size; ++i){
35 |         printf("%3d, ", array[i]);   // was %3u: wrong printf conversion for signed int
36 |         if (i % newline == newline - 1){
37 |             printf("=====================\n");
38 |         }
39 |     }
40 |     printf("\n\t");
41 | }
42 |
--------------------------------------------------------------------------------
/utils/ptx_export.cuh:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #pragma once
7 |
8 | #include
"cuda.h" 9 | #include "cuda_runtime.h" 10 | #include "./macro.cuh" 11 | 12 | __forceinline__ __device__ uint32_t getClock(){ 13 | uint32_t clock; 14 | asm volatile( 15 | "mov.u32 %0, %%clock; \n\t" 16 | :"=r"(clock)::"memory" 17 | ); 18 | return clock; 19 | } 20 | 21 | __forceinline__ __device__ uint32_t getSmid(){ 22 | uint32_t smid; 23 | asm volatile( 24 | "mov.u32 %0, %%smid; \n\t" 25 | :"=r"(smid)::"memory" 26 | ); 27 | return smid; 28 | } 29 | 30 | __forceinline__ __device__ uint32_t getWarpid(){ 31 | uint32_t warpid; 32 | asm volatile( 33 | "mov.u32 %0, %%warpid; \n\t" 34 | :"=r"(warpid)::"memory" 35 | ); 36 | return warpid; 37 | } 38 | 39 | __forceinline__ __device__ uint32_t getLaneid(){ 40 | uint32_t laneid; 41 | asm volatile( 42 | "mov.u32 %0, %%laneid; \n\t" 43 | :"=r"(laneid)::"memory" 44 | ); 45 | return laneid; 46 | } 47 | 48 | __forceinline__ __device__ void barSync(){ 49 | asm volatile( 50 | "bar.sync 0; \n\t" 51 | ); 52 | } 53 | 54 | __forceinline__ __device__ void ptxExit(){ 55 | asm volatile( 56 | "exit; \n\t" 57 | ); 58 | } 59 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | 3 | project(gpu-arch-microbenchmark 4 | LANGUAGES CXX CUDA) 5 | 6 | enable_language(CUDA) 7 | 8 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin/) 9 | 10 | set(TARGET_ARCH "-gencode arch=compute_80,code=sm_80 \ 11 | -gencode arch=compute_75,code=sm_75 \ 12 | -gencode arch=compute_70,code=sm_70") 13 | 14 | set(CMAKE_CUDA_FLAGS "${CMAKE_NVCC_FLAGS} ${TARGET_ARCH}") 15 | 16 | set(MICROBENCHMARK_SRC memory/memory_latency.cu 17 | memory/memory_bandwidth.cu 18 | memory/cache_linesize.cu 19 | memory/global_memory_bandwidth.cu 20 | miscellany/reg_bankconflict.cu 21 | miscellany/shared_bankconflict.cu 22 | schedule/warp_schedule.cu) 23 | 24 | 25 | message(STATUS ">>> GPU Microbenchmark") 26 | 27 
| foreach(benchmark ${MICROBENCHMARK_SRC}) 28 | get_filename_component(benchmark_exec ${benchmark} NAME_WE) 29 | message(STATUS "Benchmark: ${benchmark_exec}") 30 | add_executable(${benchmark_exec} ${benchmark}) 31 | target_include_directories(${benchmark_exec} PUBLIC ${PROJECT_SOURCE_DIR}/utils) 32 | target_link_libraries(${benchmark_exec} cuda) 33 | endforeach() 34 | 35 | message(STATUS "<<<") 36 | -------------------------------------------------------------------------------- /sass_cubin/reg_reuse_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | --:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2.reuse, v{:}, v0;" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = 4 + i * 2 33 | SASS_CODE += [REG_FFMA.format(reg)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2.reuse, v{:}, v0;" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = 4 + i * 2 55 | SASS_CODE += [REG_IADD3.format(reg)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- 
/sass_cubin/reg_reuse_double.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | --:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2.reuse, v1.reuse, v{:};" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = 4 + i * 2 33 | SASS_CODE += [REG_FFMA.format(reg)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2.reuse, v1.reuse, v{:};" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = 4 + i * 2 55 | SASS_CODE += [REG_IADD3.format(reg)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /sass_cubin/reg_without_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | 
--:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:4 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v{:}, v{:}, v0;" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = i * 2 33 | SASS_CODE += [REG_FFMA.format(reg, reg + 1)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 47 | --:-:-:-:4 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v{:}, v{:}, v0;" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = i * 2 55 | SASS_CODE += [REG_IADD3.format(reg, reg + 1)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /sass_cubin/reg_with_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6-7: c1, c2 15 | 8-9: e1, e2 16 | 10-224 ~ v<0-192> 17 | 18 | 19 | 20 | --:-:-:-:2 MOV input_lo, input[0]; 21 | --:-:-:-:2 MOV input_hi, input[1]; 22 | --:-:-:-:2 MOV clock_lo, clock[0]; 23 | --:-:-:-:4 MOV clock_hi, clock[1]; 24 | 25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 26 | 27 | 28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2, v{:}, v0;" 29 | 30 | SASS_CODE = [] 31 | for i in range(64): 32 | reg = 4 + i * 2 33 | SASS_CODE += [REG_FFMA.format(reg)] 34 | 35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 36 | 37 | 38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 39 | --:-:-:-:2 BAR.SYNC 0x0; 40 | 41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 43 | 44 | 45 | 46 | 
47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 48 | 49 | 50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2, v{:}, v0;" 51 | 52 | SASS_CODE = [] 53 | for i in range(64): 54 | reg = 4 + i * 2 55 | SASS_CODE += [REG_IADD3.format(reg)] 56 | 57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 58 | 59 | 60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO; 61 | --:-:-:-:2 BAR.SYNC 0x0; 62 | 63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ; 64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1; 65 | 66 | 67 | 68 | 69 | --:-:-:-:2 EXIT; -------------------------------------------------------------------------------- /compile_sass.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | def camel_to_snake(name): 6 | token_list = name.split("_") 7 | camel_name = "" 8 | for i, token in enumerate(token_list): 9 | if i == 0: 10 | camel_name += token 11 | else: 12 | camel_name += token.capitalize() 13 | return camel_name 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-arch", type=int, default=75) 19 | 20 | args = parser.parse_args() 21 | 22 | ARCH_LIST = [70, 75, 80] 23 | KERNEL_LIST = ["memory_latency", 24 | "memory_bandwidth_thread", "memory_bandwidth_block", 25 | "cache_linesize", 26 | "reg_reuse_double", "reg_reuse_bankconflict", "reg_with_bankconflict", "reg_without_bankconflict", 27 | "shared_bankconflict", 28 | "warp_schedule"] 29 | 30 | if args.arch not in ARCH_LIST: 31 | print("Unsupported Gpu Arch: ", args.arch) 32 | exit() 33 | 34 | print(">>>") 35 | for kernel in KERNEL_LIST: 36 | source_sass = f"{kernel}.sass" 37 | target_cubin = f"{kernel}.cubin" 38 | target_kernel = camel_to_snake(kernel) 39 | compile_command = f"python3 -m turingas.main -i ../sass_cubin/{source_sass} -o ../sass_cubin/{target_cubin} -arch {args.arch} -name {target_kernel}" 40 | 41 | print(f" compile kernel: {target_kernel}") 42 | os.system(compile_command) 43 | print("<<<") 44 | 
-------------------------------------------------------------------------------- /sass_cubin/warp_schedule.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | run_warp, 4 6 | 7 | 8 | 9 | 0: input_lo 10 | 1: input_hi 11 | 2: output_lo 12 | 3: output_hi 13 | 4: clock_lo 14 | 5: clock_hi 15 | 6-7: c1, c2 16 | 8-9: e1, e2 17 | 10-19 ~ clock_offset_lo, clock_offset_hi, tid, warpid, laneid, warp_offset, warpid32 18 | 20-150 ~ v<0-128> 19 | 20 | 21 | 22 | --:-:0:-:4 S2R tid, SR_TID.X; 23 | 24 | --:-:-:-:2 MOV input_lo, input[0]; 25 | --:-:-:-:2 MOV input_hi, input[1]; 26 | --:-:-:-:2 MOV clock_lo, clock[0]; 27 | --:-:-:-:2 MOV clock_hi, clock[1]; 28 | 29 | 01:-:-:-:4 SHF.R.S32.HI warpid, RZ, 0x5, tid; 30 | 31 | --:-:-:-:4 ISETP.NE.AND P0, PT, warpid, run_warp, PT; 32 | --:-:-:-:5 ISETP.EQ.OR P0, PT, warpid, RZ, !P0; 33 | 34 | --:-:-:-:5 @!P0 EXIT; 35 | 36 | --:-:-:-:4 SHF.L.S32.HI warpid32, RZ, 0x5, warpid; 37 | --:-:-:-:5 IADD3 laneid, tid, -warpid32, RZ; 38 | 39 | --:-:-:-:5 IMAD.WIDE clock_offset_lo, laneid, 0x4, clock_lo; 40 | --:-:-:-:5 ISETP.EQ.AND P1, PT, warpid, RZ, PT; 41 | --:-:-:-:5 @P1 IADD3 clock_offset_lo, clock_offset_lo, 0x80, RZ; 42 | --:-:-:-:4 MOV clock_offset_hi, clock_hi; 43 | 44 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 45 | 46 | REG_FFMA = "--:-:-:-:1 FFMA v0, v{:}, v{:}, v0;" 47 | 48 | SASS_CODE = [] 49 | for i in range(64): 50 | reg = i * 2 51 | SASS_CODE += [REG_FFMA.format(i, i + 1)] 52 | 53 | SASS_CODE += ["--:-:-:-:4 FFMA v0, v1, v2, v0;"] 54 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 55 | 56 | --:-:-:-:6 CS2R c2, SR_CLOCKLO; 57 | 58 | --:-:-:-:6 IADD3 e1, c2, -c1, RZ; 59 | --:-:-:-:4 STG.E.SYS [clock_offset_lo], e1; 60 | 61 | --:-:-:-:5 EXIT; 62 | -------------------------------------------------------------------------------- /sass_cubin/shared_bankconflict.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | 
output, 8
 4 | clock, 8
 5 |
 6 |
 7 |
 8 | 0: input_lo
 9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-9: c<1-4>
15 | 10-13: vA, vB, vC, vD
16 | 14-17: e1, e2, e3, e4
17 | 18-20: x1, x2, x3
18 | 24-27: v<0-3>
19 |
20 |
21 |
22 | --:-:-:-:2      MOV input_lo, input[0];
23 | --:-:-:-:2      MOV input_hi, input[1];
24 | --:-:-:-:2      MOV clock_lo, clock[0];
25 | --:-:-:-:4      MOV clock_hi, clock[1];
26 |
27 |
28 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
29 | --:-:0:-:2      LDS vA, [RZ+0x100];
30 | 01:-:-:-:4      CS2R c2, SR_CLOCKLO;
31 | --:-:-:-:5      IADD3 e1, c2, -c1, RZ;
32 | --:-:-:-:4      STG.E.SYS [clock_lo], e1;
33 |
34 |
35 |
36 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
37 | --:-:0:-:2      LDS.64 v0, [RZ];
38 | --:-:1:-:2      LDS.64 v2, [RZ + 0x8];
39 | 03:-:-:-:4      CS2R c2, SR_CLOCKLO;
40 | --:-:-:-:5      IADD3 e2, c2, -c1, RZ;
41 | --:-:-:-:4      STG.E.SYS [clock_lo + 0x4], e2;
42 |
43 |
44 |
45 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
46 | --:-:0:-:2      LDS vA, [RZ+0x0];
47 | --:-:1:-:2      LDS vB, [RZ+0x80];
48 | --:-:2:-:2      LDS vC, [RZ+0x100];
49 | --:-:3:-:2      LDS vD, [RZ+0x180];
50 | 15:-:-:-:4      CS2R c2, SR_CLOCKLO;
51 | --:-:-:-:5      IADD3 e3, c2, -c1, RZ;
52 | --:-:-:-:4      STG.E.SYS [clock_lo + 0x8], e3;
53 |
54 |
55 |
56 | --:-:-:-:2      CS2R c1, SR_CLOCKLO;
57 | --:-:0:-:2      LDS vA, [RZ+0x0];
58 | --:-:1:-:2      LDS vB, [RZ+0x84];
59 | --:-:2:-:2      LDS vC, [RZ+0x108];
60 | --:-:3:-:2      LDS vD, [RZ+0x18c];
61 | 15:-:-:-:4      CS2R c2, SR_CLOCKLO;
62 | --:-:-:-:5      IADD3 e4, c2, -c1, RZ;
63 | --:-:-:-:4      STG.E.SYS [clock_lo + 0xc], e4;
64 |
65 | --:-:-:-:2      EXIT;
66 |
--------------------------------------------------------------------------------
/schedule/block_schedule.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdio>   // NOTE(review): header names were stripped in this export; printf needs <cstdio> — confirm against upstream
 2 | #include <cstdint>  // uint32_t
 3 |
 4 | // Each thread streams 128 floats of A through the L1 cache (inline PTX ld.global.ca)
 5 | // and accumulates them so the loads cannot be dead-code-eliminated.
 6 | __global__ void block_workload(float *A, float *B){
 7 |     int tid = threadIdx.x;
 8 |     // NOTE(review): removed a call to undefined get_global_warpid(); its result was never used
 9 |
10 |     float dummy = 0;
11 |     float vA[4], vB[4], vC[4], vD[4];
12 |     float *ptr;
13 |     ptrdiff_t offset = 0;
14 |
15 |     #pragma unroll
16 |     for (int i = 0; i < 32; ++i){
17 |         offset = i * 4;
18 |         ptr = A + offset;
19 |
20 |         asm volatile(
21 |             "ld.global.ca.f32 %0, [%4]; \n\t"
22 |             "ld.global.ca.f32 %1, [%4+4]; \n\t"
23 |             "ld.global.ca.f32 %2, [%4+8]; \n\t"
24 |             "ld.global.ca.f32 %3, [%4+12]; \n\t"
25 |             :"=f"(vA[0]),"=f"(vB[0]),"=f"(vC[0]),"=f"(vD[0])
26 |             :"l"(ptr):"memory"
27 |         );
28 |         dummy += vA[0];
29 |         dummy += vB[0];
30 |         dummy += vC[0];
31 |         dummy += vD[0];
32 |     }
33 |     B[tid] = dummy;
34 | }
35 |
36 | int main() {
37 |     size_t width = 512;
38 |     size_t bytes = 4 * width;  // sizeof(float) * width
39 |
40 |     // Single block dim: the original declared `dim3 bDim1` twice (32 then 128), which does not compile.
41 |     dim3 bDim(128);
42 |     dim3 gDim(80);
43 |
44 |     float *A;
45 |     float *B;
46 |     uint32_t *cost;
47 |
48 |     cudaMallocManaged(&A, bytes);
49 |     cudaMallocManaged(&B, bytes);
50 |     cudaMallocManaged(&cost, bytes);
51 |
52 |     for (int i = 0; i < width; ++i) {
53 |         A[i] = i;  // was h_A[i]: h_A was never declared; A is managed memory, host-writable directly
54 |     }
55 |
56 |     float totalElapsed;
57 |     cudaEvent_t start_t, stop_t;
58 |     cudaEventCreate(&start_t);
59 |     cudaEventCreate(&stop_t);
60 |
61 |     // Managed memory: no explicit host->device cudaMemcpy is required before the launch.
62 |     cudaEventRecord(start_t, 0);
63 |
64 |     block_workload<<<gDim, bDim>>>(A, B);  // was `warp_workload<0, 1><<>>(d_A, d_B)`: wrong kernel name, mangled launch syntax, undeclared pointers
65 |     printf("%s\n", cudaGetErrorString(cudaGetLastError()));  // never pass a non-literal as the printf format string
66 |
67 |     cudaEventRecord(stop_t, 0);
68 |     cudaEventSynchronize(stop_t);
69 |     // B is managed and the event sync above already ordered the kernel; the old cudaMemcpy(h_B, d_B, ...) used undeclared pointers.
70 |     cudaEventElapsedTime(&totalElapsed, start_t, stop_t);
71 |     printf("\nHost Time Elapsed %f ms", totalElapsed);
72 | }
73 |
--------------------------------------------------------------------------------
/sass_cubin/memory_latency.sass:
--------------------------------------------------------------------------------
 1 |
 2 | input, 8
 3 | output, 8
 4 | clock, 8
 5 |
 6 |
 7 |
 8 | 0: input_lo
 9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-9: c<1-4>
15 | 10-13: vA, vB, vC, vD
16 | 14-17: e1, e2, e3, e4
17 | 18-20: x1, x2, x3
18 | 21-25 ~ tid, warpid, tid32
19 | 32-35 ~ a0, a1, a2, a3
20 | 36-42 ~ smem1,
smem2, smem3, e_s1, e_s2 21 | 43-63 ~ c<5-10> 22 | 64-79 ~ e<5-10> 23 | 24 | 25 | 26 | const_a, 8 27 | 28 | 29 | --:-:-:-:2 MOV input_lo, input[0]; 30 | --:-:-:-:2 MOV input_hi, input[1]; 31 | --:-:-:-:2 MOV clock_lo, clock[0]; 32 | --:-:-:-:4 MOV clock_hi, clock[1]; 33 | 34 | 35 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 36 | --:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo]; 37 | 01:-:-:-:2 CS2R c2, SR_CLOCKLO; 38 | --:-:0:-:2 LDG.E.STRONG.CTA vB, [input_lo+0x4]; 39 | 01:-:-:-:2 CS2R c3, SR_CLOCKLO; 40 | --:-:0:-:2 LDG.E.STRONG.CTA vC, [input_lo+0x8]; 41 | 01:-:-:-:2 CS2R c4, SR_CLOCKLO; 42 | --:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+0x10000]; 43 | 01:-:-:-:2 CS2R c5, SR_CLOCKLO; 44 | 45 | --:-:-:-:4 IADD3 e1, c2, -c1, RZ; 46 | --:-:-:-:4 IADD3 e2, c3, -c2, RZ; 47 | --:-:-:-:4 IADD3 e3, c4, -c3, RZ; 48 | --:-:-:-:4 IADD3 e4, c5, -c4, RZ; 49 | 50 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 51 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2; 52 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3; 53 | --:-:-:-:4 STG.E.SYS [clock_lo+0x1c], e4; 54 | 55 | --:-:-:-:6 NOP; 56 | 57 | --:-:-:-:2 CS2R c1, SR_CLOCKLO; 58 | --:-:1:-:2 LDC.E x1, const_a[0]; 59 | 02:-:-:-:2 CS2R c2, SR_CLOCKLO; 60 | --:-:1:-:2 MOV x2, const_a[1]; 61 | 02:-:-:-:2 CS2R c3, SR_CLOCKLO; 62 | --:-:1:-:2 MOV x3, const_a[2]; 63 | 02:-:-:-:4 CS2R c4, SR_CLOCKLO; 64 | 65 | --:-:-:-:4 IADD3 e1, c2, -c1, RZ; 66 | --:-:-:-:4 IADD3 e2, c3, -c2, RZ; 67 | --:-:-:-:4 IADD3 e3, c4, -c3, RZ; 68 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e1; 69 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e2; 70 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e3; 71 | 72 | --:-:-:-:6 NOP; 73 | 74 | --:-:-:-:2 CS2R smem1, SR_CLOCKLO; 75 | --:-:0:-:2 LDS x1, [RZ+0x0]; 76 | 01:-:-:-:4 CS2R smem2, SR_CLOCKLO; 77 | 78 | --:-:-:-:5 IADD3 e_s1, smem2, -smem1, RZ; 79 | --:-:-:-:4 STG.E.SYS [clock_lo+0x18], e_s1; 80 | 81 | --:-:-:-:2 EXIT; 82 | -------------------------------------------------------------------------------- /schedule/warp_schedule.cu: 
-------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 4 | // 5 | 6 | #include "cuda.h" 7 | #include "utils.cuh" 8 | 9 | 10 | __global__ void warpScheduleKernel(float* input, float* output, uint* clock, const int run_warp){ 11 | int tid = threadIdx.x; 12 | int laneid = tid & 0x1f; 13 | int warpid = tid >> 5; 14 | 15 | if (warpid != 0 and warpid != run_warp){ 16 | ptxExit(); 17 | } 18 | 19 | input += tid; 20 | clock += 32 * warpid / run_warp; 21 | 22 | 23 | float array[128]; 24 | float acc = 0; 25 | for (int i = 0; i < 128; ++i){ 26 | array[i] = input[i]; 27 | } 28 | 29 | uint c1 = getClock(); 30 | #pragma unroll 31 | for (int i = 0; i < 128; ++i){ 32 | acc += array[i] * array[i] + 1.0f; 33 | } 34 | uint c2 = getClock(); 35 | 36 | clock[laneid] = c2 - c1; 37 | output[laneid] = acc; 38 | } 39 | 40 | uint sumArray(uint* array, int size){ 41 | uint acc = 0; 42 | for (int i = 0; i < size; ++i){ 43 | acc += array[i]; 44 | } 45 | return acc; 46 | } 47 | 48 | 49 | int main(){ 50 | 51 | float* input_h; 52 | float* input_d; 53 | float* output_h; 54 | float* output_d; 55 | uint32_t* clock_h; 56 | uint32_t* clock_d; 57 | 58 | int size = 4096; 59 | 60 | input_h = static_cast(malloc(sizeof(float) * size)); 61 | output_h = static_cast(malloc(sizeof(float) * size)); 62 | clock_h = static_cast(malloc(sizeof(uint32_t) * size)); 63 | 64 | 65 | cudaMalloc(&input_d, sizeof(float) * size); 66 | cudaMalloc(&output_d, sizeof(float) * size); 67 | cudaMalloc(&clock_d, sizeof(uint32_t) * size); 68 | 69 | cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice); 70 | 71 | 72 | dim3 gDim(1, 1, 1); 73 | dim3 bDim(256, 1, 1); 74 | 75 | const char* cubin_name = "../sass_cubin/warp_schedule.cubin"; 76 | const char* kernel_name = "warpSchedule"; 77 | 78 | printf(">>> SASS Level Warp Scedule Detect\n"); 79 | for (int i = 1; i < 8; ++i){ 80 | void* kernel_args[4] = {&input_d, &output_d, &clock_d, &i}; 81 | cudaMemset(clock_d, 0, 
sizeof(uint) * size); 82 | launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args); 83 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 84 | 85 | printf(" Run Warp <0, %d> Elapsed \t%6u cycle\n", i, sumArray(clock_h, 64)); 86 | cudaDeviceSynchronize(); 87 | } 88 | 89 | 90 | 91 | printf("\n"); 92 | printf(">>> CUDA-C Level Warp Schedule Detect\n"); 93 | for (int i = 1; i < 8; ++i){ 94 | cudaMemset(clock_d, 0, sizeof(uint) * size); 95 | warpScheduleKernel<<>>(input_d, output_d, clock_d, i); 96 | cudaMemcpy(clock_h, clock_d, sizeof(float) * size, cudaMemcpyDeviceToHost); 97 | 98 | printf(" Run Warp <0, %d> Elapsed \t%6u cycle\n", i, sumArray(clock_h, 64)); 99 | cudaDeviceSynchronize(); 100 | } 101 | 102 | return 0; 103 | } -------------------------------------------------------------------------------- /miscellany/reg_bankconflict.cu: -------------------------------------------------------------------------------- 1 | // 2 | // 3 | // 4 | // 5 | 6 | #include "cuda.h" 7 | #include "utils.cuh" 8 | 9 | 10 | int main(){ 11 | 12 | float* input_h; 13 | float* input_d; 14 | float* output_h; 15 | float* output_d; 16 | uint32_t* clock_h; 17 | uint32_t* clock_d; 18 | 19 | int size = 1024; 20 | 21 | input_h = static_cast(malloc(sizeof(float) * size)); 22 | output_h = static_cast(malloc(sizeof(float) * size)); 23 | clock_h = static_cast(malloc(sizeof(uint32_t) * size)); 24 | 25 | 26 | cudaMalloc(&input_d, sizeof(float) * size); 27 | cudaMalloc(&output_d, sizeof(float) * size); 28 | cudaMalloc(&clock_d, sizeof(uint32_t) * size); 29 | 30 | cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice); 31 | 32 | 33 | dim3 gDim(1, 1, 1); 34 | dim3 bDim(1, 1, 1); 35 | 36 | void* kernel_args[] = {&input_d, &output_d, &clock_d}; 37 | 38 | 39 | const char* cubin_name1 = "../sass_cubin/reg_with_bankconflict.cubin"; 40 | const char* kernel_name1 = "regWithBankconflict"; 41 | launchSassKernel(cubin_name1, kernel_name1, gDim, bDim, 0, 
kernel_args); 42 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 43 | cudaDeviceSynchronize(); 44 | printf(">>> SASS Level Reg With BankConflict IPC Result\n"); 45 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 46 | printf(" IADD3 per \t%.3f cycle\n", static_cast(clock_h[1]) / 64); 47 | 48 | 49 | 50 | 51 | const char* cubin_name2 = "../sass_cubin/reg_without_bankconflict.cubin"; 52 | const char* kernel_name2 = "regWithoutBankconflict"; 53 | launchSassKernel(cubin_name2, kernel_name2, gDim, bDim, 0, kernel_args); 54 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 55 | cudaDeviceSynchronize(); 56 | printf("\n"); 57 | printf(">>> SASS Level Reg Without BankConflict IPC Result\n"); 58 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 59 | printf(" IADD3 per \t%.3f cycle\n", static_cast(clock_h[1]) / 64); 60 | 61 | 62 | 63 | const char* cubin_name3 = "../sass_cubin/reg_reuse_bankconflict.cubin"; 64 | const char* kernel_name3 = "regReuseBankconflict"; 65 | launchSassKernel(cubin_name3, kernel_name3, gDim, bDim, 0, kernel_args); 66 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 67 | cudaDeviceSynchronize(); 68 | printf("\n"); 69 | printf(">>> SASS Level Reg Reuse BankConflict IPC Result\n"); 70 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 71 | printf(" IADD3 per \t%.3f cycle\n", static_cast(clock_h[1]) / 64); 72 | 73 | 74 | 75 | const char* cubin_name4 = "../sass_cubin/reg_reuse_double.cubin"; 76 | const char* kernel_name4 = "regReuseDouble"; 77 | launchSassKernel(cubin_name4, kernel_name4, gDim, bDim, 0, kernel_args); 78 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost); 79 | cudaDeviceSynchronize(); 80 | printf("\n"); 81 | printf(">>> SASS Level Reg Reuse Double IPC Result\n"); 82 | printf(" FFMA per \t%.3f cycle\n", static_cast(clock_h[0]) / 64); 83 | printf(" IADD3 per \t%.3f cycle\n", 
static_cast(clock_h[1]) / 64); 84 | 85 | return 0; 86 | } -------------------------------------------------------------------------------- /sass_cubin/cache_linesize.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | output, 8 4 | clock, 8 5 | 6 | 7 | 8 | 0: input_lo 9 | 1: input_hi 10 | 2: output_lo 11 | 3: output_hi 12 | 4: clock_lo 13 | 5: clock_hi 14 | 6: vA 15 | 7: e 16 | 8-240 ~ c<0-200> 17 | 18 | 19 | 20 | 21 | const_a, 1024 22 | 23 | 24 | 25 | 26 | --:-:-:-:2 MOV input_lo, input[0]; 27 | --:-:-:-:2 MOV input_hi, input[1]; 28 | --:-:-:-:2 MOV output_lo, output[0]; 29 | --:-:-:-:2 MOV output_hi, output[1]; 30 | --:-:-:-:2 MOV clock_lo, clock[0]; 31 | --:-:-:-:4 MOV clock_hi, clock[1]; 32 | 33 | 34 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 35 | 36 | SASS_CODE = [] 37 | loop_size = 200 38 | 39 | LDG = "--:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo+{:}];" 40 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 41 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 42 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 43 | 44 | for i in range(loop_size): 45 | SASS_CODE += [LDG.format(hex(i * 4))] 46 | SASS_CODE += [CS2R.format(i+1)] 47 | 48 | for i in range(loop_size): 49 | SASS_CODE += [IADD.format(i+1, i)] 50 | SASS_CODE += [STG.format(hex(i*4), i)] 51 | 52 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ; 61 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 62 | 63 | 64 | SASS_CODE = [] 65 | loop_size = 200 66 | 67 | LDG = "--:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+{:}];" 68 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 69 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 70 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 71 | 72 | for i in range(loop_size): 73 | SASS_CODE += [LDG.format(hex(i * 4))] 74 | SASS_CODE += [CS2R.format(i+1)] 75 | 76 | for i in range(loop_size): 77 | SASS_CODE += [IADD.format(i+1, i)] 78 | SASS_CODE += [STG.format(hex(i*4), i)] 79 | 80 | 
out_ = "\n" + "\n".join(SASS_CODE) + "\n" 81 | 82 | 83 | 84 | 85 | 86 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ; 87 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 88 | 89 | SASS_CODE = [] 90 | loop_size = 200 91 | 92 | LDC = "--:-:0:-:2 LDC.E vA, const_a[{:}];" 93 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 94 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 95 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 96 | 97 | for i in range(loop_size): 98 | SASS_CODE += [LDC.format(i)] 99 | SASS_CODE += [CS2R.format(i+1)] 100 | 101 | for i in range(loop_size): 102 | SASS_CODE += [IADD.format(i+1, i)] 103 | SASS_CODE += [STG.format(hex(i*4), i)] 104 | 105 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 106 | 107 | 108 | 109 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ; 110 | --:-:-:-:2 CS2R c0, SR_CLOCKLO; 111 | 112 | SASS_CODE = [] 113 | loop_size = 200 114 | 115 | LDC = "--:-:0:-:2 MOV vA, const_a[{:}];" 116 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;" 117 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;" 118 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;" 119 | 120 | for i in range(loop_size): 121 | SASS_CODE += [LDC.format(i)] 122 | SASS_CODE += [CS2R.format(i+1)] 123 | 124 | for i in range(loop_size): 125 | SASS_CODE += [IADD.format(i+1, i)] 126 | SASS_CODE += [STG.format(hex(i*4), i)] 127 | 128 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 129 | 130 | 131 | --:-:-:-:2 EXIT; 132 | -------------------------------------------------------------------------------- /sass_cubin/memory_bandwidth_thread.sass: -------------------------------------------------------------------------------- 1 | 2 | input, 8 3 | clock, 8 4 | 5 | 6 | 7 | 0: input_lo 8 | 1: input_hi 9 | 4: clock_lo 10 | 5: clock_hi 11 | 8-11: v<1-4> 12 | 12-30 ~ c<1-12> 13 | 31-40 ~ e<1-6> 14 | 15 | 16 | --:-:-:-:2 MOV input_lo, input[0]; 17 | --:-:-:-:2 MOV input_hi, input[1]; 18 | --:-:-:-:2 MOV clock_lo, clock[0]; 19 | --:-:-:-:4 MOV clock_hi, clock[1]; 20 | 21 | --:-:3:-:1 LDG.E.128.STRONG.CTA v1, [input_lo]; // 
warmup 22 | 23 | ######################################################################################### 24 | 25 | 08:-:-:-:3 CS2R c1, SR_CLOCKLO; 26 | 27 | LDG_128_to_reg = "--:-:0:-:1 LDG.E.128.STRONG.CTA v1, [input_lo+{:}];" 28 | 29 | SASS_CODE = [] 30 | for i in range(1024): 31 | pos = hex(i * 16) 32 | SASS_CODE += [LDG_128_to_reg.format(pos)] 33 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 34 | 35 | 01:-:-:-:6 CS2R c2, SR_CLOCKLO; 36 | 37 | --:-:-:-:3 CS2R c3, SR_CLOCKLO; 38 | 39 | LDS_128_to_reg = "--:-:1:-:1 LDS.128 v1, [RZ+{:}];" 40 | 41 | SASS_CODE = [] 42 | for i in range(256): 43 | pos = hex(i * 16) 44 | SASS_CODE += [LDS_128_to_reg.format(pos)] 45 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 46 | 47 | 02:-:-:-:6 CS2R c4, SR_CLOCKLO; 48 | 49 | ######################################################################################### 50 | 51 | --:-:-:-:3 CS2R c5, SR_CLOCKLO; 52 | 53 | LDG_64_to_reg = "--:-:0:-:1 LDG.E.64.STRONG.CTA v1, [input_lo+{:}];" 54 | 55 | SASS_CODE = [] 56 | for i in range(1024 * 2): 57 | pos = hex(i * 8) 58 | SASS_CODE += [LDG_64_to_reg.format(pos)] 59 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 60 | 61 | 01:-:-:-:6 CS2R c6, SR_CLOCKLO; 62 | 63 | --:-:-:-:3 CS2R c7, SR_CLOCKLO; 64 | 65 | LDS_64_to_reg = "--:-:1:-:1 LDS.64 v1, [RZ+{:}];" 66 | 67 | SASS_CODE = [] 68 | for i in range(256 * 2): 69 | pos = hex(i * 8) 70 | SASS_CODE += [LDS_64_to_reg.format(pos)] 71 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 72 | 73 | 02:-:-:-:6 CS2R c8, SR_CLOCKLO; 74 | 75 | ######################################################################################### 76 | 77 | --:-:-:-:3 CS2R c9, SR_CLOCKLO; 78 | 79 | LDG_32_to_reg = "--:-:0:-:1 LDG.E.STRONG.CTA v1, [input_lo+{:}];" 80 | 81 | SASS_CODE = [] 82 | for i in range(1024 * 4): 83 | pos = hex(i * 4) 84 | SASS_CODE += [LDG_32_to_reg.format(pos)] 85 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 86 | 87 | 01:-:-:-:6 CS2R c10, SR_CLOCKLO; 88 | 89 | --:-:-:-:3 CS2R c11, SR_CLOCKLO; 90 | 91 | 
// Detects cache line size by timing 256 dependent global loads with %clock.
// Pass 1 (ld.global.cg, stride 2 floats = 8 B): bypasses L1 so every latency
// jump marks an L2 line boundary. Pass 2 (ld.global.ca, stride 1 float = 4 B):
// hits L1 so jumps mark L1 line boundaries. Per-load cycle deltas go to
// clock[0..255] and clock[512..767]; `acc` is stored to output[] only to keep
// the loads from being dead-code eliminated.
// Launch: <<<1, 1>>>. `cinput` is unused here (kept for a uniform signature).
__global__ void linesizeDetectKernel(float* input, float* output, uint* clock, float* cinput){

    // BUG FIX: was `uint c[256]`, but the loops write c[i+1] for i = 0..255,
    // i.e. c[256] — one element past the end. 257 slots are required.
    uint c[257];
    float val = 0;

    float acc = 0;
    c[0] = getClock();
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        // .cg: cache global — skip L1, measure L2-line behaviour
        asm volatile(
            "ld.global.cg.b32 %0, [%1]; \n\t"
            :"=f"(val):"l"(input):"memory"
        );
        c[i+1] = getClock();
        acc += val;
        input += 2;   // 8-byte stride between timed loads
    }
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        clock[i] = c[i+1] - c[i];
    }
    output[0] = acc;

    /////////////////////////////////////////////////////////////////////////

    input += 1024;    // move to a fresh, untouched region
    clock += 512;     // second result window
    acc = 0;
    c[0] = getClock();
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        // .ca: cache all levels — measure L1-line behaviour
        asm volatile(
            "ld.global.ca.f32 %0, [%1]; \n\t"
            :"=f"(val):"l"(input):"memory"
        );
        c[i+1] = getClock();
        acc += val;
        input++;      // 4-byte stride between timed loads
    }
    #pragma unroll
    for (int i = 0; i < 256; ++i){
        clock[i] = c[i+1] - c[i];
    }
    output[1] = acc;
}
// Infers a cache line size from a per-load latency trace.
//
// clock[i] is the cycle cost of the i-th sequential load; a load that starts a
// new cache line shows a latency jump of more than `gap` cycles over its
// predecessor. The index distance between the first two jumps, scaled by the
// 4-byte load stride, is the line size in bytes.
//
// NOTE(review): the *4 assumes consecutive timed loads are 4 bytes apart; the
// first pass of linesizeDetectKernel steps 8 bytes (input += 2) — confirm the
// caller compensates.
//
// Returns 0 when fewer than two jumps are found (the original returned a
// meaningless negative value in that case).
int detectCacheLinesize(uint* clock, int size, uint gap){
    uint last_cycle = clock[0];

    int first = 0;
    int second = 0;

    // formatArray(clock, 256, 16);
    for (int i = 1; i < size; ++i){
        // first condition guards the unsigned subtraction against wrap-around
        if (clock[i] > last_cycle && clock[i] - last_cycle > gap) {
            if (first == 0){
                first = i;
            } else {
                second = i;
                break;
            }
        }
        last_cycle = clock[i];
    }
    if (second == 0) {
        return 0;   // fewer than two latency jumps observed — no estimate
    }
    return (second - first) * 4;   // loads are sizeof(float) apart
}
// Entry point: runs the SASS-level cubin probe and the CUDA-C kernel, then
// derives line sizes from the recorded per-load latency traces.
int main(){
    float* input_h;
    float* input_d;
    float* output_h;
    float* output_d;
    uint* clock_h;
    uint* clock_d;

    int size = 4096;

    // input contents are irrelevant — only load latency is measured
    input_h = static_cast<float*>(malloc(sizeof(float) * size));
    output_h = static_cast<float*>(malloc(sizeof(float) * size));
    clock_h = static_cast<uint*>(malloc(sizeof(uint) * size));

    cudaMalloc(&input_d, sizeof(float) * size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint) * size);

    cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);

    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);

    // BUG FIX: the original passed `&cinput` — the host-side address of a
    // __constant__ symbol, which is not a valid device pointer. Resolve the
    // device address of the symbol instead.
    float* cinput_d = nullptr;
    cudaGetSymbolAddress(reinterpret_cast<void**>(&cinput_d), cinput);

    void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput_d};
    const char* cubin_name = "../sass_cubin/cache_linesize.cubin";
    const char* kernel_name = "cacheLinesize";

    launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args);
    cudaDeviceSynchronize();   // ensure the kernel finished before reading results
    // BUG FIX: was sizeof(float) for a uint array (same width, but misleading)
    cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
    printf(">>> SASS Level Cache Linesize Result\n");
    // %3d: detectCacheLinesize returns int (was printed with %3u)
    printf("    Global   L2 LineSize \t= %3d B\n", detectCacheLinesize(clock_h, 512, 40));
    printf("    Global   L1 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 512, 512, 10));
    printf("    Constant L2 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 1024, 512, 100));
    printf("    Constant L1 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 1536, 512, 10));


    linesizeDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput_d);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
    printf("\n");
    printf(">>> CUDA-C Level Cache Linesize Result\n");
    printf("    Global   L2 LineSize \t= %3d B\n", detectCacheLinesize(clock_h, 512, 40));
    printf("    Global   L1 LineSize \t= %3d B\n", detectCacheLinesize(clock_h + 512, 512, 10));

    // release resources (the original leaked everything)
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(input_h);
    free(output_h);
    free(clock_h);
    return 0;
}
######################################################################################### 38 | 39 | 08:-:-:-:3 CS2R c1, SR_CLOCKLO; 40 | 41 | LDG_128_to_reg = "--:-:0:-:1 LDG.E.128.STRONG.CTA v1, [input_lo_x_4+{:}];" 42 | 43 | SASS_CODE = [] 44 | for i in range(128): 45 | pos = hex(i * 16 * 256) 46 | SASS_CODE += [LDG_128_to_reg.format(pos)] 47 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 48 | 49 | 50 | 01:-:-:-:6 CS2R c2, SR_CLOCKLO; 51 | 52 | --:-:-:-:3 CS2R c3, SR_CLOCKLO; 53 | 54 | 55 | LDS_128_to_reg = "--:-:1:-:1 LDS.128 v1, [tid_x_4+{:}];" 56 | 57 | SASS_CODE = [] 58 | for i in range(8): 59 | pos = hex(i * 16 * 256) 60 | SASS_CODE += [LDS_128_to_reg.format(pos)] 61 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 62 | 63 | 02:-:-:-:6 CS2R c4, SR_CLOCKLO; 64 | 65 | ######################################################################################### 66 | 67 | --:-:-:-:3 CS2R c5, SR_CLOCKLO; 68 | 69 | LDG_64_to_reg = "--:-:0:-:1 LDG.E.64.STRONG.CTA v1, [input_lo_x_2+{:}];" 70 | 71 | SASS_CODE = [] 72 | for i in range(256): 73 | pos = hex(i * 8 * 256) 74 | SASS_CODE += [LDG_64_to_reg.format(pos)] 75 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 76 | 77 | 78 | 01:-:-:-:6 CS2R c6, SR_CLOCKLO; 79 | 80 | --:-:-:-:3 CS2R c7, SR_CLOCKLO; 81 | 82 | 83 | LDS_64_to_reg = "--:-:1:-:1 LDS.64 v1, [tid_x_2+{:}];" 84 | 85 | SASS_CODE = [] 86 | for i in range(16): 87 | pos = hex(i * 8 * 256) 88 | SASS_CODE += [LDS_64_to_reg.format(pos)] 89 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 90 | 91 | 02:-:-:-:6 CS2R c8, SR_CLOCKLO; 92 | 93 | ######################################################################################### 94 | 95 | --:-:-:-:3 CS2R c9, SR_CLOCKLO; 96 | 97 | LDG_32_to_reg = "--:-:0:-:1 LDG.E.STRONG.CTA v1, [input_lo+{:}];" 98 | 99 | SASS_CODE = [] 100 | for i in range(512): 101 | pos = hex(i * 4 * 256) 102 | SASS_CODE += [LDG_32_to_reg.format(pos)] 103 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 104 | 105 | 106 | 01:-:-:-:6 CS2R c10, SR_CLOCKLO; 107 | 108 | 
--:-:-:-:3 CS2R c11, SR_CLOCKLO; 109 | 110 | 111 | LDS_32_to_reg = "--:-:1:-:1 LDS v1, [tid+{:}];" 112 | 113 | SASS_CODE = [] 114 | for i in range(32): 115 | pos = hex(i * 4 * 256) 116 | SASS_CODE += [LDS_32_to_reg.format(pos)] 117 | out_ = "\n" + "\n".join(SASS_CODE) + "\n" 118 | 119 | 02:-:-:-:6 CS2R c12, SR_CLOCKLO; 120 | 121 | ######################################################################################### 122 | 123 | --:-:-:-:6 IMAD.WIDE clock_lo, tid, 0x6, clock_lo; 124 | 125 | --:-:-:-:2 IADD3 e1, c2, -c1, RZ; 126 | --:-:-:-:2 IADD3 e2, c4, -c3, RZ; 127 | --:-:-:-:2 IADD3 e3, c6, -c5, RZ; 128 | --:-:-:-:2 IADD3 e4, c8, -c7, RZ; 129 | --:-:-:-:2 IADD3 e5, c10, -c9, RZ; 130 | --:-:-:-:2 IADD3 e6, c12, -c11, RZ; 131 | 132 | --:-:-:-:4 STG.E.SYS [clock_lo], e1; 133 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2; 134 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3; 135 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e4; 136 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e5; 137 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e6; 138 | 139 | --:-:-:-:2 EXIT; 140 | 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPU Arch Microbenchmark 2 | 3 | 4 | ## Prerequisites 5 | 1. install `turingas` compiler 6 | > `git clone --recursive git@github.com:sjfeng1999/gpu-arch-microbenchmark.git` 7 | > `cd turingas` 8 | > `python setup.py install` 9 | 10 | ## Usage 11 | 1. `mkdir build && cd build` 12 | 2. `cmake .. && make` 13 | 3. `python ../compile_sass.py -arch=(70|75|80)` 14 | 4. `./(memory_latency|reg_bankconflict|...)` 15 | 16 | ## Microbenchmark 17 | 18 | ### 1. 
Memory Latency 19 | 20 | |Device |Latency |Turing RTX-2070 (TU104)| 21 | |:--------------------------:|:---------:|:---------------------:| 22 | |Global Latency |cycle | 1000 ~ 1200 | 23 | |TLB Latency |cycle | 472 | 24 | |L2 Latency |cycle | 236 | 25 | |L1 Latency |cycle | 32 | 26 | |Shared Latency |cycle | 23 | 27 | |Constant Latency |cycle | 448 | 28 | |Constant L2 Latency |cycle | 62 | 29 | |Constant L1 Latency |cycle | 4 | 30 | 31 | - const L1-cache is as fast as register. 32 | 33 | ### 2. Memory Bandwidth 34 | 35 | 1. memory bandwidth within one thread 36 | 37 | |Device | Bandwidth | Turing RTX-2070 | 38 | |:--------------:|:-----------:|:---------------:| 39 | |Global LDG.128 | GB/s |194.12 | 40 | |Global LDG.64 | GB/s |140.77 | 41 | |Global LDG.32 | GB/s |54.18 | 42 | |Shared LDS.128 | GB/s |152.96 | 43 | |Shared LDS.64 | GB/s |30.58 | 44 | |Shared LDS.32 | GB/s |13.32 | 45 | 46 | 1. global memory bandwidth within (64 block * 256 thread) 47 | 48 | |Device | Bandwidth | Turing RTX-2070 | 49 | |:--------------------------:|:-----------:|:---------------:| 50 | |LDG.32 | GB/s |246.65 | 51 | |LDG.32 Group1 Stride1 | GB/s |118.73(2X) | 52 | |LDG.32 Group2 Stride2 | GB/s |119.08(2X) | 53 | |LDG.32 Group4 Stride4 | GB/s |117.11(2X) | 54 | |LDG.32 Group8 Stride8 | GB/s |336.27 | 55 | |LDG.64 | GB/s |379.24 | 56 | |LDG.64 Group1 Stride1 | GB/s |126.40(2X) | 57 | |LDG.64 Group2 Stride2 | GB/s |124.51(2X) | 58 | |LDG.64 Group4 Stride4 | GB/s |398.84 | 59 | |LDG.64 Group8 Stride8 | GB/s |371.28 | 60 | |LDG.128 | GB/s |391.83 | 61 | |LDG.128 Group1 Stride1 | GB/s |125.25(2X) | 62 | |LDG.128 Group2 Stride2 | GB/s |402.55 | 63 | |LDG.128 Group4 Stride4 | GB/s |394.22 | 64 | |LDG.128 Group8 Stride8 | GB/s |396.10 | 65 | 66 | ### 3. 
Cache Linesize 67 | 68 | |Device | Linesize | Turing RTX-2070(TU104)| 69 | |:--------------------------:|:---------:|:---------------------:| 70 | |L2 Linesise |bytes | 64 | 71 | |L1 Linesize |bytes | 32 | 72 | |Constant L2 Linesise |bytes | 256 | 73 | |Constant L1 Linesize |bytes | 32 | 74 | 75 | ### 4. Reg Bankconflict 76 | 77 | | Instruction |CPI | conflict | without conflict | reg reuse | double reuse | 78 | |:-----------:|:-------:|:--------:|:----------------:|:---------:|:------------:| 79 | |FFMA | cycle | 3.516 | 2.969 | 2.938 | 2.938 | 80 | |IADD3 | cycle | 3.031 | 2.062 | 2.031 | 2.031 | 81 | 82 | 83 | ### 5. Shared Bankconflict 84 | 85 | | Memory Load | Latency | Turing RTX-2070 (TU104)| 86 | |:----------------------:|:---------:|:----------------------:| 87 | | Single | cycle | 23 | 88 | | Vector2 X 2 | cycle | 27 | 89 | | Conflict Strided | cycle | 41 | 90 | | Conlict-Free Strided | cycle | 32 | 91 | 92 | 93 | ## Instruction Efficiency 94 | 95 | 96 | ## Roadmap 97 | 98 | - [ ] warp schedule 99 | - [ ] L1/L2 cache n-way k-set 100 | 101 | # Citation 102 | - Jia, Zhe, et al. "Dissecting the NVIDIA volta GPU architecture via microbenchmarking." arXiv preprint arXiv:1804.06826 (2018). 103 | - Jia, Zhe, et al. "Dissecting the NVidia Turing T4 GPU via microbenchmarking." arXiv preprint arXiv:1903.07486 (2019). 104 | - Yan, Da, Wei Wang, and Xiaowen Chu. "Optimizing batched winograd convolution on GPUs." Proceedings of the 25th ACM SIGPLAN symposium on principles and practice of parallel programming. 2020. 
// Device memory clock used to convert cycle counts into time.
const float kMemoryFrequency_MHz = 5000.0f;    // 5000 MHz

// Converts an elapsed cycle count and transfer size into a bandwidth figure.
// cycles / MHz gives microseconds; bytes / 1024 gives KiB; KiB per us is
// reported as "GB/s" (binary/decimal mismatch of ~2.4% is accepted here).
float calculateBandWidth(uint elapsed_cycle, const int data_bytes) {
    float second_x_1024_x_1024 = static_cast<float>(elapsed_cycle) / kMemoryFrequency_MHz;
    float data_KBytes = static_cast<float>(data_bytes) / 1024;
    return data_KBytes / second_x_1024_x_1024;
}

// Averages `thread_group_size` samples taken every `stride_size` entries of
// `data` (each SASS kernel writes 6 interleaved counters per thread, so
// stride_size is 6 at the call sites). Restored the `template <typename T>`
// header and cast argument that were garbled in this copy of the file.
template <typename T>
uint getAvgElapsedCycle(int thread_group_size, int stride_size, T* data) {
    T acc = 0;
    for (int i = 0; i < thread_group_size; ++i) {
        acc += data[i * stride_size];
    }
    return static_cast<uint>(acc) / thread_group_size;
}
// Prints the six bandwidth rows for one measurement section.
// `group` is the number of per-thread counters to average (1 for the
// single-thread kernel, 256 for the block kernel); counters are interleaved
// with stride 6 in clock_h, slots: 0=LDG.128, 2=LDG.64, 4=LDG.32,
// 1=LDS.128, 3=LDS.64, 5=LDS.32.
static void printBandwidthSection(uint32_t* clock_h, int group,
                                  int global_load_bytes, int shared_load_bytes) {
    struct Row { const char* label; int slot; int bytes; };
    const Row rows[6] = {
        {"LDG.128", 0, global_load_bytes},
        {"LDG.64",  2, global_load_bytes},
        {"LDG.32",  4, global_load_bytes},
        {"LDS.128", 1, shared_load_bytes},
        {"LDS.64",  3, shared_load_bytes},
        {"LDS.32",  5, shared_load_bytes},
    };
    for (const Row& r : rows) {
        uint cycle = getAvgElapsedCycle(group, 6, clock_h + r.slot);
        // typo fix: "Elaped" -> "Elapsed"
        printf("    %-8s Elapsed Cycle \t=%8u cycle    Bandwidth =\t %5.2f GB/s\n",
               r.label, cycle, calculateBandWidth(cycle, r.bytes));
    }
}

// Entry point: runs the single-thread and whole-block SASS bandwidth kernels
// and reports per-width load bandwidth.
int main(){
    float* input_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int global_size = 4 * 1024 * 1024;
    int shared_size = 32 * 1024;

    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * global_size));

    cudaMalloc(&input_d, sizeof(float) * global_size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * global_size);

    void* kernel_args[] = {&input_d, &clock_d};

    dim3 gDim1(1, 1, 1);
    dim3 bDim1(1, 1, 1);
    int global_load_bytes = 512 * 1024;
    int shared_load_bytes = 32 * 1024;

    const char* cubin_name1 = "../sass_cubin/memory_bandwidth_thread.cubin";
    const char* kernel_name1 = "memoryBandwidthThread";
    launchSassKernel(cubin_name1, kernel_name1, gDim1, bDim1, shared_size, kernel_args);
    cudaDeviceSynchronize();   // finish before reading results
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * global_size, cudaMemcpyDeviceToHost);

    printf(">>> SASS Level Memory BandWidth Result\n");
    printf("    Global Memory Load %9d Bytes\n", global_load_bytes);
    printf("    Shared Memory Load %9d Bytes\n", shared_load_bytes);
    printf("    Within Thread Result\n");
    printBandwidthSection(clock_h, 1, global_load_bytes, shared_load_bytes);
    printf("\n");

    dim3 gDim2(1, 1, 1);
    dim3 bDim2(256, 1, 1);
    const char* cubin_name2 = "../sass_cubin/memory_bandwidth_block.cubin";
    const char* kernel_name2 = "memoryBandwidthBlock";
    launchSassKernel(cubin_name2, kernel_name2, gDim2, bDim2, shared_size, kernel_args);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * global_size, cudaMemcpyDeviceToHost);

    printf("    Thread Average Result Within Block\n");
    printBandwidthSection(clock_h, 256, global_load_bytes, shared_load_bytes);
    printf("\n");

    cudaFree(input_d);
    cudaFree(clock_d);
    free(clock_h);
    return 0;
}
// Measures shared-memory load latency with %clock for four access patterns:
//   clock[0]: one scalar load
//   clock[1]: four scalar loads 0x80 (= 32 words) apart — all map to the same
//             bank on a 32-bank/4-byte-stride shared memory, the "conflict"
//             case labelled by main()
//   clock[2]: four scalar loads offset by one extra word each (0x84, 0x108,
//             0x18c) — distinct banks, the "conflict-free" case
//   clock[3]: two v2 vector loads of adjacent words
// Loaded values are stored to `output` so the loads stay live.
// NOTE(review): operand %0 (`input`) is never referenced inside the asm —
// presumably kept only so the signature matches the SASS cubin variant.
// Launch: <<<1, 1>>> (single thread; per-warp conflict behaviour at the
// SASS level is covered by the cubin version).
__global__ void sharedBankconflictKernel(float* input, float* output, uint32_t* clock){

    asm volatile (
        ".reg.f32 val1, val2, val3, val4; \n\t"
        ".reg.u32 c_1, c_2; \n\t"
        ".reg.u32 e_1; \n\t"
        ".shared.b32 smem[1024]; \n\t"

        // --- pattern 1: single load, result -> clock[0] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem + 0x100]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"
        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2], e_1; \n\t"
        "st.global.f32 [%1] , val1; \n\t"

        //////////////////////////////////////////////////////////////////

        // --- pattern 2: stride 0x80 (same bank), result -> clock[1] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem]; \n\t"
        "ld.shared.f32 val2, [smem + 0x80]; \n\t"
        "ld.shared.f32 val3, [smem + 0x100]; \n\t"
        "ld.shared.f32 val4, [smem + 0x180]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0x4], e_1; \n\t"
        "st.global.f32 [%1 + 0x10], val1; \n\t"
        "st.global.f32 [%1 + 0x20], val2; \n\t"
        "st.global.f32 [%1 + 0x30], val3; \n\t"
        "st.global.f32 [%1 + 0x40], val4; \n\t"

        //////////////////////////////////////////////////////////////////

        // --- pattern 3: stride 0x84 (bank-staggered), result -> clock[2] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem]; \n\t"
        "ld.shared.f32 val2, [smem + 0x84]; \n\t"
        "ld.shared.f32 val3, [smem + 0x108]; \n\t"
        "ld.shared.f32 val4, [smem + 0x18c]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0x8], e_1; \n\t"
        "st.global.f32 [%1 + 0x44], val1; \n\t"
        "st.global.f32 [%1 + 0x14], val2; \n\t"
        "st.global.f32 [%1 + 0x24], val3; \n\t"
        "st.global.f32 [%1 + 0x34], val4; \n\t"

        //////////////////////////////////////////////////////////////////

        // --- pattern 4: two 64-bit vector loads, result -> clock[3] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.v2.f32 {val1, val2}, [smem]; \n\t"
        "ld.shared.v2.f32 {val3, val4}, [smem + 0x8]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0xc], e_1; \n\t"
        "st.global.f32 [%1 + 0x48] , val1; \n\t"
        "st.global.f32 [%1 + 0x18], val2; \n\t"
        "st.global.f32 [%1 + 0x28], val3; \n\t"
        "st.global.f32 [%1 + 0x38], val4; \n\t"

        //////////////////////////////////////////////////////////////////
        ::"l"(input),"l"(output),"l"(clock):"memory"
    );
}
// Entry point: runs the SASS cubin and CUDA-C shared-memory bank-conflict
// probes and prints the four measured latencies.
int main(){

    float* input_h;
    float* input_d;
    float* output_h;
    float* output_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int size = 1024;

    input_h = static_cast<float*>(malloc(sizeof(float) * size));
    output_h = static_cast<float*>(malloc(sizeof(float) * size));
    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));

    cudaMalloc(&input_d, sizeof(float) * size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * size);

    cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);

    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);

    void* kernel_args[] = {&input_d, &output_d, &clock_d};

    const char* cubin_name = "../sass_cubin/shared_bankconflict.cubin";
    const char* kernel_name = "sharedBankconflict";
    launchSassKernel(cubin_name, kernel_name, gDim, bDim, size * sizeof(float), kernel_args);
    cudaDeviceSynchronize();   // finish before reading results
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);

    printf(">>> SASS Level Shared Load BankConflict Result\n");
    printf("    Single          Load [0x100]                 Elapsed \t%3u cycle\n", clock_h[0]);
    printf("    Vector          Load [0x0, 0x4 , 0x8 , 0xc ] Elapsed \t%3u cycle\n", clock_h[1]);
    printf("    WithConflict    Load [0x0, 0x80, 0x100, 0x180] Elapsed \t%3u cycle\n", clock_h[2]);
    printf("    WithoutConflict Load [0x0, 0x84, 0x108, 0x18c] Elapsed \t%3u cycle\n", clock_h[3]);


    sharedBankconflictKernel<<<gDim, bDim>>>(input_d, output_d, clock_d);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);
    printf("\n");
    printf(">>> CUDA-C Level Shared Load BankConflict Result\n");
    // index mapping differs from above: the CUDA-C kernel writes the vector
    // pattern to slot 3, conflict to slot 1, conflict-free to slot 2
    printf("    Single          Load [0x100]                 Elapsed \t%3u cycle\n", clock_h[0]);
    printf("    Vector          Load [0x0, 0x4 , 0x8 , 0xc ] Elapsed \t%3u cycle\n", clock_h[3]);
    printf("    WithConflict    Load [0x0, 0x80, 0x100, 0x180] Elapsed \t%3u cycle\n", clock_h[1]);
    printf("    WithoutConflict Load [0x0, 0x84, 0x108, 0x18c] Elapsed \t%3u cycle\n", clock_h[2]);

    // release resources (the original leaked everything)
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(input_h);
    free(output_h);
    free(clock_h);
    return 0;
}
// Measures load latency for global, constant, and shared memory with %clock.
// The three-load chains exploit caching: the first load (.cg) misses and pays
// full memory latency; the second (.ca, +0x4, same line) is then served by L2;
// the third (+0x8) by L1 — matching the labels printed by main():
//   clock[0..2]: global memory / L2 / L1
//   clock[3..5]: constant memory / const L2 / const L1
//   clock[6]:    shared memory
// Loaded values are accumulated and stored so the loads stay live.
// NOTE(review): the `cinput` parameter shadows the file-scope __constant__
// array of the same name; the ld.const below uses the pointer argument.
// NOTE(review): `.cg`/`.ca` qualifiers on ld.const are unusual PTX — confirm
// the toolchain accepts them.
// Launch: <<<1, 1>>>.
__global__ void latencyDetectKernel(float* input, float* output, uint32_t* clock, float* cinput){

    // jump 512 MB into the (1500 MB) buffer so the first access is cold
    input += 1024 * 1024 * 1024 / sizeof(float) / 2;
    cinput += 512;

    asm volatile (
        ".reg.f32 val1, val2, val3; \n\t"
        ".reg.u32 c_1, c_2, c_3, c_4; \n\t"
        ".reg.u32 e_1, e_2, e_3; \n\t"
        ".reg.u32 e_4, e_5, e_6; \n\t"
        ".shared.b8 smem[32]; \n\t"


        // --- global chain: miss, then L2 hit, then L1 hit ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.global.cg.f32 val1, [%0]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"
        "ld.global.ca.f32 val2, [%0 + 0x4]; \n\t"
        "mov.u32 c_3, %%clock; \n\t"
        "ld.global.ca.f32 val3, [%0 + 0x8]; \n\t"
        "mov.u32 c_4, %%clock; \n\t"

        "sub.u32 e_1, c_2, c_1; \n\t"
        "sub.u32 e_2, c_3, c_2; \n\t"
        "sub.u32 e_3, c_4, c_3; \n\t"

        // keep the loads live
        "add.f32 val1, val1, val2; \n\t"
        "add.f32 val1, val1, val3; \n\t"

        "st.global.u32 [%2], e_1; \n\t"
        "st.global.u32 [%2 + 0x4], e_2; \n\t"
        "st.global.u32 [%2 + 0x8], e_3; \n\t"

        "st.global.f32 [%1], val1; \n\t"
        "st.global.f32 [%1 + 0x4], val2; \n\t"
        "st.global.f32 [%1 + 0x8], val3; \n\t"

        ///////////////////////////////////////////////////////////////////

        // fence between experiments so timings do not overlap
        "bar.sync 0; \n\t"

        ///////////////////////////////////////////////////////////////////

        // --- constant chain: miss, const L2 hit, const L1 hit ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.const.cg.f32 val1, [%3]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"
        "ld.const.ca.f32 val2, [%3 + 0x4]; \n\t"
        "mov.u32 c_3, %%clock; \n\t"
        "ld.const.ca.f32 val3, [%3 + 0x8]; \n\t"
        "mov.u32 c_4, %%clock; \n\t"

        "sub.u32 e_4, c_2, c_1; \n\t"
        "sub.u32 e_5, c_3, c_2; \n\t"
        "sub.u32 e_6, c_4, c_3; \n\t"

        "add.f32 val1, val1, val2; \n\t"
        "add.f32 val1, val1, val3; \n\t"

        "st.global.u32 [%2 + 0xc], e_4; \n\t"
        "st.global.u32 [%2 + 0x10], e_5; \n\t"
        "st.global.u32 [%2 + 0x14], e_6; \n\t"

        "st.global.f32 [%1 + 0xc], val1; \n\t"
        "st.global.f32 [%1 + 0x10], val2; \n\t"
        "st.global.f32 [%1 + 0x14], val3; \n\t"

        /////////////////////////////////////////////////////////////////////////

        "bar.sync 0; \n\t"

        ///////////////////////////////////////////////////////////////////

        // --- shared memory: single load -> clock[6] ---
        "mov.u32 c_1, %%clock; \n\t"
        "ld.shared.f32 val1, [smem]; \n\t"
        "mov.u32 c_2, %%clock; \n\t"

        "sub.u32 e_4, c_2, c_1; \n\t"
        "st.global.u32 [%2 + 0x18], e_4; \n\t"
        "st.global.f32 [%1 + 0x18], val1; \n\t"

        ::"l"(input),"l"(output),"l"(clock),"l"(cinput):"memory"
    );

}
// Entry point: initializes the __constant__ test data, runs the SASS cubin
// and CUDA-C latency probes, and prints the per-level latencies.
// Note: clock_h[7] (TLB latency) is only written by the SASS kernel, which is
// why it is printed in the SASS section only.
int main(){
    float* input_d;
    float* output_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int size = 1024;
    int large_size = 1500 * 1024 * 1024;

    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));

    cudaMalloc(&input_d, large_size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * size);

    // BUG FIX: the original assigned `cinput[i] = i;` directly in host code.
    // `cinput` is a __constant__ *device* symbol — host writes must go through
    // cudaMemcpyToSymbol. Stage in a host buffer, then copy.
    float cinput_h[128];
    for (int i = 0; i < 128; ++i){
        cinput_h[i] = i;
    }
    cudaMemcpyToSymbol(cinput, cinput_h, sizeof(float) * 128);

    // BUG FIX: `&cinput` on the host is not a valid device pointer for a
    // kernel argument; resolve the symbol's device address instead.
    float* cinput_d = nullptr;
    cudaGetSymbolAddress(reinterpret_cast<void**>(&cinput_d), cinput);

    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);

    void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput_d};
    const char* cubin_name = "../sass_cubin/memory_latency.cubin";
    const char* kernel_name = "memoryLatency";

    launchSassKernel(cubin_name, kernel_name, gDim, bDim, size, kernel_args);
    cudaDeviceSynchronize();   // finish before reading results
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);

    printf(">>> SASS Level Memory Latency Result\n");
    printf("    Global   Memory   Latency \t= %4u cycle\n", clock_h[0]);
    printf("    Global   TLB      Latency \t= %4u cycle\n", clock_h[7]);
    printf("    Global   L2-Cache Latency \t= %4u cycle\n", clock_h[1]);
    printf("    Global   L1-Cache Latency \t= %4u cycle\n", clock_h[2]);
    printf("    Shared   Memory   Latency \t= %4u cycle\n", clock_h[6]);
    printf("    Constant Memory   Latency \t= %4u cycle\n", clock_h[3]);
    printf("    Constant L2-Cache Latency \t= %4u cycle\n", clock_h[4]);
    printf("    Constant L1-Cache Latency \t= %4u cycle\n", clock_h[5]);


    latencyDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput_d);
    cudaDeviceSynchronize();
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);
    printf("\n");
    printf(">>> CUDA-C Level Memory Latency Result\n");
    printf("    Global   Memory   Latency \t= %4u cycle\n", clock_h[0]);
    printf("    Global   L2-Cache Latency \t= %4u cycle\n", clock_h[1]);
    printf("    Global   L1-Cache Latency \t= %4u cycle\n", clock_h[2]);
    printf("    Shared   Memory   Latency \t= %4u cycle\n", clock_h[6]);
    printf("    Constant Memory   Latency \t= %4u cycle\n", clock_h[3]);
    printf("    Constant L2-Cache Latency \t= %4u cycle\n", clock_h[4]);
    printf("    Constant L1-Cache Latency \t= %4u cycle\n", clock_h[5]);

    // release resources (the original leaked everything)
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(clock_h);
    return 0;
}
// Launch/workload geometry shared by all copy kernels in this file.
constexpr int kGridDimX = 64;
constexpr int kBlockDimX = 256;
constexpr int kWarpCount = kBlockDimX / kWarpSize;
constexpr int kLoopSize = 4 * 1024;                 // 32-bit elements copied per thread
constexpr size_t kGlobalSize = 256 * 1024 * 1024;   // elements allocated per buffer
constexpr float kCopySize = (float)kLoopSize * kGridDimX * kBlockDimX * sizeof(float);

// Grid-wide strided copy using 32-bit accesses.
// GroupSize/StrideSize shape the intra-warp access pattern: each group of
// GroupSize lanes is shifted StrideSize extra elements, so <1,0> is fully
// coalesced and larger strides scatter the warp across more segments.
// (The `template <...>` header was garbled in this copy of the file; restored
// from the `copyGroup32bKernel<1, 0>` call sites in main.)
// Expects launch <<<kGridDimX, kBlockDimX>>>.
template <int GroupSize, int StrideSize>
__global__
void copyGroup32bKernel(float* input, float* output) {
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;   // grid-wide stride per iteration

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float* thread_input = input + offset;
    float* thread_output = output + offset;

    for (int i = 0; i < kLoopSize; ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}
// Grid-wide strided copy using 64-bit (float2) accesses; same group/stride
// pattern as copyGroup32bKernel but half the iterations since each access
// moves two elements. (`template <...>` header restored from the
// `copyGroup64bKernel<1, 0>` call sites in main.)
// Expects launch <<<kGridDimX, kBlockDimX>>>.
template <int GroupSize, int StrideSize>
__global__
void copyGroup64bKernel(float2* input, float2* output) {
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;   // grid-wide stride per iteration

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float2* thread_input = input + offset;
    float2* thread_output = output + offset;

    for (int i = 0; i < (kLoopSize / 2); ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}
// Grid-wide strided copy using 128-bit (float4) accesses; quarter of the
// iterations of the 32-bit variant. (`template <...>` header restored from
// the `copyGroup128bKernel<1, 0>` call sites in main.)
// Expects launch <<<kGridDimX, kBlockDimX>>>.
template <int GroupSize, int StrideSize>
__global__
void copyGroup128bKernel(float4* input, float4* output) {
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;   // grid-wide stride per iteration

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float4* thread_input = input + offset;
    float4* thread_output = output + offset;

    for (int i = 0; i < (kLoopSize / 4); ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}

// Times `fn` (a kernel-launching callable) with CUDA events and converts the
// elapsed milliseconds into the bandwidth figure printed by main():
// bytes / ms / 2^20 == MiB per ms, reported as "GB/s" (~2.4% off decimal GB/s).
// cudaDeviceSynchronize() guarantees the stop event has been recorded before
// cudaEventElapsedTime is read.
// (`template <typename Func>` header restored — it was garbled in this copy.)
template <typename Func>
float getElapsed(Func fn, cudaEvent_t start, cudaEvent_t stop) {
    float elapsed = 0;
    cudaEventRecord(start);
    fn();
    cudaEventRecord(stop);
    cudaDeviceSynchronize();
    cudaEventElapsedTime(&elapsed, start, stop);
    return kCopySize / elapsed / 1024 / 1024;
}
// Entry point: times each group/stride access pattern at 32/64/128-bit widths
// and prints the achieved copy bandwidth. Launch chevrons (<<<gDim, bDim>>>)
// were garbled in this copy of the file and are restored here.
int main() {
    float* input_d;
    float* output_d;

    cudaMalloc(&input_d, sizeof(float) * kGlobalSize);
    cudaMalloc(&output_d, sizeof(float) * kGlobalSize);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 gDim(kGridDimX);
    dim3 bDim(kBlockDimX);

    printf("    Different access pattern on Global Memory\n");

    auto fn1 = [=]() { copyGroup32bKernel<1, 0><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32       \t%.2f GB/s\n", getElapsed(fn1, start, stop));

    auto fn2 = [=]() { copyGroup32bKernel<1, 1><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g1s1  \t%.2f GB/s\n", getElapsed(fn2, start, stop));

    auto fn9 = [=]() { copyGroup32bKernel<2, 2><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g2s2  \t%.2f GB/s\n", getElapsed(fn9, start, stop));

    auto fn10 = [=]() { copyGroup32bKernel<4, 4><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g4s4  \t%.2f GB/s\n", getElapsed(fn10, start, stop));

    auto fn8 = [=]() { copyGroup32bKernel<8, 8><<<gDim, bDim>>>(input_d, output_d);};
    printf("    LDG.32 g8s8  \t%.2f GB/s\n", getElapsed(fn8, start, stop));

    ///////////////////////////////////////////////////////////////////////////////////////////////////////

    auto fn3 = [=]() { copyGroup64bKernel<1, 0><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64       \t%.2f GB/s\n", getElapsed(fn3, start, stop));

    auto fn4 = [=]() { copyGroup64bKernel<1, 1><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g1s1  \t%.2f GB/s\n", getElapsed(fn4, start, stop));

    auto fn11 = [=]() { copyGroup64bKernel<2, 2><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g2s2  \t%.2f GB/s\n", getElapsed(fn11, start, stop));

    auto fn12 = [=]() { copyGroup64bKernel<4, 4><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g4s4  \t%.2f GB/s\n", getElapsed(fn12, start, stop));

    auto fn13 = [=]() { copyGroup64bKernel<8, 8><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d);};
    printf("    LDG.64 g8s8  \t%.2f GB/s\n", getElapsed(fn13, start, stop));

    ///////////////////////////////////////////////////////////////////////////////////////////////////////

    auto fn5 = [=]() { copyGroup128bKernel<1, 0><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128      \t%.2f GB/s\n", getElapsed(fn5, start, stop));

    auto fn6 = [=]() { copyGroup128bKernel<1, 1><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g1s1 \t%.2f GB/s\n", getElapsed(fn6, start, stop));

    auto fn7 = [=]() { copyGroup128bKernel<2, 2><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g2s2 \t%.2f GB/s\n", getElapsed(fn7, start, stop));

    auto fn14 = [=]() { copyGroup128bKernel<4, 4><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g4s4 \t%.2f GB/s\n", getElapsed(fn14, start, stop));

    auto fn15 = [=]() { copyGroup128bKernel<8, 8><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d);};
    printf("    LDG.128 g8s8 \t%.2f GB/s\n", getElapsed(fn15, start, stop));

    // release resources (events were previously leaked; main lacked a return)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}