├── .gitignore
├── .gitmodules
├── utils
├── macro.cuh
├── utils.cuh
├── sass_kernel.cuh
├── format_print.cuh
└── ptx_export.cuh
├── CMakeLists.txt
├── sass_cubin
├── reg_reuse_bankconflict.sass
├── reg_reuse_double.sass
├── reg_without_bankconflict.sass
├── reg_with_bankconflict.sass
├── warp_schedule.sass
├── shared_bankconflict.sass
├── memory_latency.sass
├── cache_linesize.sass
├── memory_bandwidth_thread.sass
└── memory_bandwidth_block.sass
├── compile_sass.py
├── schedule
├── block_schedule.cu
└── warp_schedule.cu
├── miscellany
├── reg_bankconflict.cu
└── shared_bankconflict.cu
├── memory
├── cache_linesize.cu
├── memory_bandwidth.cu
├── memory_latency.cu
└── global_memory_bandwidth.cu
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .vscode/
3 |
4 | build/
5 | bin/
6 |
7 | *.out
8 | *.cubin
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "turingas"]
2 | path = turingas
3 | url = git@github.com:daadaada/turingas.git
4 |
--------------------------------------------------------------------------------
/utils/macro.cuh:
--------------------------------------------------------------------------------
1 | //
2 | // CUDA
3 | // Created by sjfeng
4 | //
5 |
6 | #pragma once
7 |
// Ceil-division: smallest integer count of y-sized chunks covering x.
// Fix: arguments are now fully parenthesized so expression arguments
// (e.g. UPPER_DIV(a + b, c)) expand correctly.  Note y is evaluated twice;
// avoid side-effecting arguments.
#define UPPER_DIV(x, y) (((x) + (y) - 1) / (y))
9 |
10 | constexpr int kWarpSize = 32;
11 |
--------------------------------------------------------------------------------
/utils/utils.cuh:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #pragma once
7 |
8 | #include "./format_print.cuh"
9 | #include "./macro.cuh"
10 | #include "./ptx_export.cuh"
11 | #include "./sass_kernel.cuh"
12 |
--------------------------------------------------------------------------------
/utils/sass_kernel.cuh:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #pragma once
7 |
8 | #include "cuda.h"
9 | #include "cuda_runtime.h"
10 |
// Loads a cubin via the driver API, resolves `kernel_name`, launches it on the
// default stream with the given grid/block dims and dynamic shared memory, then
// synchronizes and unloads the module.
//
// Fixes vs. original: driver-API return codes were ignored (a missing cubin
// produced garbage results downstream), and the CUmodule was leaked on every
// call.  Driver errors are mapped to cudaErrorUnknown so callers keep a single
// cudaError_t domain.
cudaError_t launchSassKernel(const char* cubin_name, const char* kernel_name, const dim3& gDim, const dim3& bDim, const int shared_bytes, void** args){
    CUmodule module;
    CUfunction kernel;

    if (cuModuleLoad(&module, cubin_name) != CUDA_SUCCESS){
        return cudaErrorUnknown;
    }
    if (cuModuleGetFunction(&kernel, module, kernel_name) != CUDA_SUCCESS){
        cuModuleUnload(module);
        return cudaErrorUnknown;
    }

    CUresult launch_status = cuLaunchKernel(kernel,
                   gDim.x, gDim.y, gDim.z,
                   bDim.x, bDim.y, bDim.z,
                   shared_bytes, // dynamic shared memory bytes
                   0,            // default stream
                   args, 0);

    // The kernel must finish before the module that owns its code is unloaded.
    cuCtxSynchronize();
    cuModuleUnload(module);   // fix: original leaked the module on every call

    if (launch_status != CUDA_SUCCESS){
        return cudaErrorUnknown;
    }
    return cudaPeekAtLastError();
}
27 |
--------------------------------------------------------------------------------
/utils/format_print.cuh:
--------------------------------------------------------------------------------
1 | //
2 | // C++
3 | // Created by sjfeng
4 | //
5 | //
6 | #pragma once
7 |
8 | #include "stdio.h"
9 |
// Print `size` floats from `array`, `newline` values per row ("%.3f, " each).
void formatArray(float* array, int size, int newline=10){
    for (int idx = 0; idx < size; ++idx){
        printf("%.3f, ", array[idx]);
        if ((idx + 1) % newline == 0){
            printf("\n");
        }
    }
    printf("\n\t");
}
19 |
// Print `size` unsigned ints ("%3u, " each); after every `newline` values a
// "=" separator row is emitted (unlike the float overload, which prints a
// bare newline).
void formatArray(uint* array, int size, int newline=10){
    int col = 0;
    for (int idx = 0; idx < size; ++idx){
        printf("%3u, ", array[idx]);
        if (++col == newline){
            printf("=====================\n");
            col = 0;
        }
    }
    printf("\n\t");
}
29 |
// Print `size` ints ("%3d, " each), with a "=" separator row after every
// `newline` values.
// Fix: the original used "%3u" for a signed int argument, which renders
// negative values as huge unsigned numbers (and is formally undefined).
void formatArray(int* array, int size, int newline=10){
    for (int i = 0; i < size; ++i){
        printf("%3d, ", array[i]);
        if (i % newline == newline - 1){
            printf("=====================\n");
        }
    }
    printf("\n\t");
}
39 |
--------------------------------------------------------------------------------
/utils/ptx_export.cuh:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #pragma once
7 |
8 | #include "cuda.h"
9 | #include "cuda_runtime.h"
10 | #include "./macro.cuh"
11 |
// Read the %clock special register: the SM's 32-bit cycle counter.
// The "memory" clobber (plus volatile) keeps the read ordered relative to
// surrounding memory operations, so it brackets timed regions reliably.
__forceinline__ __device__ uint32_t getClock(){
    uint32_t clock;
    asm volatile(
        "mov.u32 %0, %%clock; \n\t"
        :"=r"(clock)::"memory"
    );
    return clock;
}
20 |
// Read the %smid special register: the ID of the SM this thread is
// currently resident on.
__forceinline__ __device__ uint32_t getSmid(){
    uint32_t smid;
    asm volatile(
        "mov.u32 %0, %%smid; \n\t"
        :"=r"(smid)::"memory"
    );
    return smid;
}
29 |
// Read the %warpid special register: the warp's hardware slot on the SM.
// NOTE(review): this may differ from the logical warp index (tid / 32) that
// the CUDA-C benchmarks compute themselves — confirm which one callers want.
__forceinline__ __device__ uint32_t getWarpid(){
    uint32_t warpid;
    asm volatile(
        "mov.u32 %0, %%warpid; \n\t"
        :"=r"(warpid)::"memory"
    );
    return warpid;
}
38 |
// Read the %laneid special register: this thread's lane (0..31) within its warp.
__forceinline__ __device__ uint32_t getLaneid(){
    uint32_t laneid;
    asm volatile(
        "mov.u32 %0, %%laneid; \n\t"
        :"=r"(laneid)::"memory"
    );
    return laneid;
}
47 |
// Block-wide barrier via PTX "bar.sync 0" (barrier resource 0).
// NOTE(review): unlike the register readers above, this asm has no "memory"
// clobber, so the compiler is free to move memory accesses across the
// barrier — confirm that is intended for these benchmarks.
__forceinline__ __device__ void barSync(){
    asm volatile(
        "bar.sync 0; \n\t"
    );
}
53 |
// Terminate the calling thread immediately via the PTX "exit" instruction
// (used to retire whole warps early in the scheduling benchmarks).
__forceinline__ __device__ void ptxExit(){
    asm volatile(
        "exit; \n\t"
    );
}
59 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.14)

project(gpu-arch-microbenchmark
        LANGUAGES CXX CUDA)

enable_language(CUDA)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin/)

# Fat binary covering Volta (70), Turing (75) and Ampere (80).
set(TARGET_ARCH "-gencode arch=compute_80,code=sm_80 \
                 -gencode arch=compute_75,code=sm_75 \
                 -gencode arch=compute_70,code=sm_70")

# Fix: the original read the undefined variable CMAKE_NVCC_FLAGS here,
# silently discarding any CUDA flags supplied by the user or a toolchain
# file; append to CMAKE_CUDA_FLAGS itself instead.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${TARGET_ARCH}")

set(MICROBENCHMARK_SRC memory/memory_latency.cu
                       memory/memory_bandwidth.cu
                       memory/cache_linesize.cu
                       memory/global_memory_bandwidth.cu
                       miscellany/reg_bankconflict.cu
                       miscellany/shared_bankconflict.cu
                       schedule/warp_schedule.cu)


message(STATUS ">>> GPU Microbenchmark")

# One executable per benchmark source; each links the CUDA driver library
# because utils/sass_kernel.cuh uses cuModuleLoad / cuLaunchKernel.
foreach(benchmark ${MICROBENCHMARK_SRC})
    get_filename_component(benchmark_exec ${benchmark} NAME_WE)
    message(STATUS "Benchmark: ${benchmark_exec}")
    add_executable(${benchmark_exec} ${benchmark})
    target_include_directories(${benchmark_exec} PUBLIC ${PROJECT_SOURCE_DIR}/utils)
    target_link_libraries(${benchmark_exec} cuda)
endforeach()

message(STATUS "<<<")
--------------------------------------------------------------------------------
/sass_cubin/reg_reuse_bankconflict.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-7: c1, c2
15 | 8-9: e1, e2
16 | 10-224 ~ v<0-192>
17 |
18 |
19 |
20 | --:-:-:-:2 MOV input_lo, input[0];
21 | --:-:-:-:2 MOV input_hi, input[1];
22 | --:-:-:-:2 MOV clock_lo, clock[0];
23 | --:-:-:-:4 MOV clock_hi, clock[1];
24 |
25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
26 |
27 |
28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2.reuse, v{:}, v0;"
29 |
30 | SASS_CODE = []
31 | for i in range(64):
32 | reg = 4 + i * 2
33 | SASS_CODE += [REG_FFMA.format(reg)]
34 |
35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
36 |
37 |
38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
39 | --:-:-:-:2 BAR.SYNC 0x0;
40 |
41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
43 |
44 |
45 |
46 |
47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
48 |
49 |
50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2.reuse, v{:}, v0;"
51 |
52 | SASS_CODE = []
53 | for i in range(64):
54 | reg = 4 + i * 2
55 | SASS_CODE += [REG_IADD3.format(reg)]
56 |
57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
58 |
59 |
60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
61 | --:-:-:-:2 BAR.SYNC 0x0;
62 |
63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1;
65 | --:-:-:-:2 EXIT;
--------------------------------------------------------------------------------
/sass_cubin/reg_reuse_double.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-7: c1, c2
15 | 8-9: e1, e2
16 | 10-224 ~ v<0-192>
17 |
18 |
19 |
20 | --:-:-:-:2 MOV input_lo, input[0];
21 | --:-:-:-:2 MOV input_hi, input[1];
22 | --:-:-:-:2 MOV clock_lo, clock[0];
23 | --:-:-:-:4 MOV clock_hi, clock[1];
24 |
25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
26 |
27 |
28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2.reuse, v1.reuse, v{:};"
29 |
30 | SASS_CODE = []
31 | for i in range(64):
32 | reg = 4 + i * 2
33 | SASS_CODE += [REG_FFMA.format(reg)]
34 |
35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
36 |
37 |
38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
39 | --:-:-:-:2 BAR.SYNC 0x0;
40 |
41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
43 |
44 |
45 |
46 |
47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
48 |
49 |
50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2.reuse, v1.reuse, v{:};"
51 |
52 | SASS_CODE = []
53 | for i in range(64):
54 | reg = 4 + i * 2
55 | SASS_CODE += [REG_IADD3.format(reg)]
56 |
57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
58 |
59 |
60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
61 | --:-:-:-:2 BAR.SYNC 0x0;
62 |
63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1;
65 | --:-:-:-:2 EXIT;
--------------------------------------------------------------------------------
/sass_cubin/reg_without_bankconflict.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-7: c1, c2
15 | 8-9: e1, e2
16 | 10-224 ~ v<0-192>
17 |
18 |
19 |
20 | --:-:-:-:2 MOV input_lo, input[0];
21 | --:-:-:-:2 MOV input_hi, input[1];
22 | --:-:-:-:2 MOV clock_lo, clock[0];
23 | --:-:-:-:4 MOV clock_hi, clock[1];
24 |
25 | --:-:-:-:4 CS2R c1, SR_CLOCKLO;
26 |
27 |
28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v{:}, v{:}, v0;"
29 |
30 | SASS_CODE = []
31 | for i in range(64):
32 | reg = i * 2
33 | SASS_CODE += [REG_FFMA.format(reg, reg + 1)]
34 |
35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
36 |
37 |
38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
39 | --:-:-:-:2 BAR.SYNC 0x0;
40 |
41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
43 |
44 |
45 |
46 |
47 | --:-:-:-:4 CS2R c1, SR_CLOCKLO;
48 |
49 |
50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v{:}, v{:}, v0;"
51 |
52 | SASS_CODE = []
53 | for i in range(64):
54 | reg = i * 2
55 | SASS_CODE += [REG_IADD3.format(reg, reg + 1)]
56 |
57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
58 |
59 |
60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
61 | --:-:-:-:2 BAR.SYNC 0x0;
62 |
63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1;
65 | --:-:-:-:2 EXIT;
--------------------------------------------------------------------------------
/sass_cubin/reg_with_bankconflict.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-7: c1, c2
15 | 8-9: e1, e2
16 | 10-224 ~ v<0-192>
17 |
18 |
19 |
20 | --:-:-:-:2 MOV input_lo, input[0];
21 | --:-:-:-:2 MOV input_hi, input[1];
22 | --:-:-:-:2 MOV clock_lo, clock[0];
23 | --:-:-:-:4 MOV clock_hi, clock[1];
24 |
25 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
26 |
27 |
28 | REG_FFMA = "--:-:-:-:1 FFMA v0, v2, v{:}, v0;"
29 |
30 | SASS_CODE = []
31 | for i in range(64):
32 | reg = 4 + i * 2
33 | SASS_CODE += [REG_FFMA.format(reg)]
34 |
35 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
36 |
37 |
38 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
39 | --:-:-:-:2 BAR.SYNC 0x0;
40 |
41 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
42 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
43 |
44 |
45 |
46 |
47 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
48 |
49 |
50 | REG_IADD3 = "--:-:-:-:1 IADD3 v0, v2, v{:}, v0;"
51 |
52 | SASS_CODE = []
53 | for i in range(64):
54 | reg = 4 + i * 2
55 | SASS_CODE += [REG_IADD3.format(reg)]
56 |
57 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
58 |
59 |
60 | --:-:0:-:4 CS2R c2, SR_CLOCKLO;
61 | --:-:-:-:2 BAR.SYNC 0x0;
62 |
63 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
64 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e1;
65 |
66 |
67 |
68 |
69 | --:-:-:-:2 EXIT;
--------------------------------------------------------------------------------
/compile_sass.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 |
4 |
def camel_to_snake(name):
    """Convert a snake_case kernel name to its camelCase cubin entry name.

    NOTE(review): despite its name, this function converts snake_case ->
    camelCase (e.g. "memory_latency" -> "memoryLatency"), matching the
    kernel symbol names used inside the generated cubins.

    Args:
        name: snake_case identifier (may be a single token or empty).

    Returns:
        The camelCase form: first token unchanged, remaining tokens
        capitalized and concatenated.
    """
    tokens = name.split("_")
    return tokens[0] + "".join(token.capitalize() for token in tokens[1:])
14 |
15 |
if __name__ == "__main__":
    # Assemble every SASS template into a cubin with turingas for the
    # requested SM architecture (70 / 75 / 80).
    parser = argparse.ArgumentParser()
    parser.add_argument("-arch", type=int, default=75)

    args = parser.parse_args()

    ARCH_LIST = [70, 75, 80]
    KERNEL_LIST = ["memory_latency",
                   "memory_bandwidth_thread", "memory_bandwidth_block",
                   "cache_linesize",
                   "reg_reuse_double", "reg_reuse_bankconflict", "reg_with_bankconflict", "reg_without_bankconflict",
                   "shared_bankconflict",
                   "warp_schedule"]

    if args.arch not in ARCH_LIST:
        print("Unsupported Gpu Arch: ", args.arch)
        # Fix: bare exit() reported success (status 0) on this failure path.
        raise SystemExit(1)

    print(">>>")
    for kernel in KERNEL_LIST:
        source_sass = f"{kernel}.sass"
        target_cubin = f"{kernel}.cubin"
        # Kernel symbol inside the cubin is the camelCase form of the file name.
        target_kernel = camel_to_snake(kernel)
        compile_command = f"python3 -m turingas.main -i ../sass_cubin/{source_sass} -o ../sass_cubin/{target_cubin} -arch {args.arch} -name {target_kernel}"

        print(f"    compile kernel: {target_kernel}")
        # Fix: assembler failures were silently ignored; at least report them.
        if os.system(compile_command) != 0:
            print(f"    FAILED: {target_kernel}")
    print("<<<")
44 |
--------------------------------------------------------------------------------
/sass_cubin/warp_schedule.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 | run_warp, 4
6 |
7 |
8 |
9 | 0: input_lo
10 | 1: input_hi
11 | 2: output_lo
12 | 3: output_hi
13 | 4: clock_lo
14 | 5: clock_hi
15 | 6-7: c1, c2
16 | 8-9: e1, e2
17 | 10-19 ~ clock_offset_lo, clock_offset_hi, tid, warpid, laneid, warp_offset, warpid32
18 | 20-150 ~ v<0-128>
19 |
20 |
21 |
22 | --:-:0:-:4 S2R tid, SR_TID.X;
23 |
24 | --:-:-:-:2 MOV input_lo, input[0];
25 | --:-:-:-:2 MOV input_hi, input[1];
26 | --:-:-:-:2 MOV clock_lo, clock[0];
27 | --:-:-:-:2 MOV clock_hi, clock[1];
28 |
29 | 01:-:-:-:4 SHF.R.S32.HI warpid, RZ, 0x5, tid;
30 |
31 | --:-:-:-:4 ISETP.NE.AND P0, PT, warpid, run_warp, PT;
32 | --:-:-:-:5 ISETP.EQ.OR P0, PT, warpid, RZ, !P0;
33 |
34 | --:-:-:-:5 @!P0 EXIT;
35 |
36 | --:-:-:-:4 SHF.L.S32.HI warpid32, RZ, 0x5, warpid;
37 | --:-:-:-:5 IADD3 laneid, tid, -warpid32, RZ;
38 |
39 | --:-:-:-:5 IMAD.WIDE clock_offset_lo, laneid, 0x4, clock_lo;
40 | --:-:-:-:5 ISETP.EQ.AND P1, PT, warpid, RZ, PT;
41 | --:-:-:-:5 @P1 IADD3 clock_offset_lo, clock_offset_lo, 0x80, RZ;
42 | --:-:-:-:4 MOV clock_offset_hi, clock_hi;
43 |
44 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
45 |
46 | REG_FFMA = "--:-:-:-:1 FFMA v0, v{:}, v{:}, v0;"
47 |
48 | SASS_CODE = []
49 | for i in range(64):
50 | reg = i * 2
51 | SASS_CODE += [REG_FFMA.format(reg, reg + 1)]
52 |
53 | SASS_CODE += ["--:-:-:-:4 FFMA v0, v1, v2, v0;"]
54 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
55 |
56 | --:-:-:-:6 CS2R c2, SR_CLOCKLO;
57 |
58 | --:-:-:-:6 IADD3 e1, c2, -c1, RZ;
59 | --:-:-:-:4 STG.E.SYS [clock_offset_lo], e1;
60 |
61 | --:-:-:-:5 EXIT;
62 |
--------------------------------------------------------------------------------
/sass_cubin/shared_bankconflict.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-9: c<1-4>
15 | 10-13: vA, vB, vC, vD
16 | 14-17: e1, e2, e3, e4
17 | 18-20: x1, x2, x3
18 | 24-27: v<0-3>
19 |
20 |
21 |
22 | --:-:-:-:2 MOV input_lo, input[0];
23 | --:-:-:-:2 MOV input_hi, input[1];
24 | --:-:-:-:2 MOV clock_lo, clock[0];
25 | --:-:-:-:4 MOV clock_hi, clock[1];
26 |
27 |
28 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
29 | --:-:0:-:2 LDS vA, [RZ+0x100];
30 | 01:-:-:-:4 CS2R c2, SR_CLOCKLO;
31 | --:-:-:-:5 IADD3 e1, c2, -c1, RZ;
32 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
33 |
34 |
35 |
36 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
37 | --:-:0:-:2 LDS.64 v0, [RZ];
38 | --:-:1:-:2 LDS.64 v2, [RZ + 0x8];
39 | 03:-:-:-:4 CS2R c2, SR_CLOCKLO;
40 | --:-:-:-:5 IADD3 e2, c2, -c1, RZ;
41 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x4], e2;
42 |
43 |
44 |
45 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
46 | --:-:0:-:2 LDS vA, [RZ+0x0];
47 | --:-:1:-:2 LDS vB, [RZ+0x80];
48 | --:-:2:-:2 LDS vC, [RZ+0x100];
49 | --:-:3:-:2 LDS vD, [RZ+0x180];
50 | 15:-:-:-:4 CS2R c2, SR_CLOCKLO;
51 | --:-:-:-:5 IADD3 e3, c2, -c1, RZ;
52 | --:-:-:-:4 STG.E.SYS [clock_lo + 0x8], e3;
53 |
54 |
55 |
56 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
57 | --:-:0:-:2 LDS vA, [RZ+0x0];
58 | --:-:1:-:2 LDS vB, [RZ+0x84];
59 | --:-:2:-:2 LDS vC, [RZ+0x108];
60 | --:-:3:-:2 LDS vD, [RZ+0x18c];
61 | 15:-:-:-:4 CS2R c2, SR_CLOCKLO;
62 | --:-:-:-:5 IADD3 e4, c2, -c1, RZ;
63 | --:-:-:-:4 STG.E.SYS [clock_lo + 0xc], e4;
64 |
65 | --:-:-:-:2 EXIT;
66 |
--------------------------------------------------------------------------------
/schedule/block_schedule.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
// Streams 128 floats (32 iterations x 4 loads) from A with ld.global.ca
// (cache-at-all-levels) and accumulates them so the traffic cannot be
// optimized away; each thread writes one accumulator to B.
//
// Fixes vs. original: removed the unused local that called the undefined
// helper get_global_warpid(), and replaced the four 4-element arrays (only
// element [0] of each was ever touched) with scalars.
__global__ void block_workload(float *A, float *B){
    int tid = threadIdx.x;

    float dummy = 0;
    float vA, vB, vC, vD;

    #pragma unroll
    for (int i = 0; i < 32; ++i){
        const float *ptr = A + i * 4;

        // Four cached loads from consecutive addresses; outputs are tied to
        // the accumulator below so the asm is not dead-code eliminated.
        asm volatile(
            "ld.global.ca.f32 %0, [%4]; \n\t"
            "ld.global.ca.f32 %1, [%4+4]; \n\t"
            "ld.global.ca.f32 %2, [%4+8]; \n\t"
            "ld.global.ca.f32 %3, [%4+12]; \n\t"
            :"=f"(vA),"=f"(vB),"=f"(vC),"=f"(vD)
            :"l"(ptr):"memory"
        );
        dummy += vA;
        dummy += vB;
        dummy += vC;
        dummy += vD;
    }
    B[tid] = dummy;
}
33 |
// NOTE(review): the original body was dump-garbled and did not compile:
// `bDim1` was declared twice, it referenced undefined h_A/d_A/h_B/d_B and an
// undefined kernel `warp_workload<0, 1>`, and the <<<...>>> launch syntax was
// eaten.  Reconstructed minimally around the managed A/B buffers it actually
// allocates and the block_workload kernel defined above.
int main() {
    size_t width = 512;
    size_t bytes = 4 * width;   // sizeof(float) * width

    dim3 bDim1(32);
    dim3 bDim2(128);            // alternate block size kept from the original
    dim3 gDim(80);
    (void)bDim2;                // unused in this reconstruction

    float *A;
    float *B;

    cudaMallocManaged(&A, bytes);
    cudaMallocManaged(&B, bytes);

    // Managed memory: initialize directly on the host, no explicit memcpy.
    for (size_t i = 0; i < width; ++i) {
        A[i] = static_cast<float>(i);
    }

    float totalElapsed;
    cudaEvent_t start_t, stop_t;
    cudaEventCreate(&start_t);
    cudaEventCreate(&stop_t);

    cudaEventRecord(start_t, 0);
    block_workload<<<gDim, bDim1>>>(A, B);
    // Fix: never pass a runtime string as the printf format.
    printf("%s", cudaGetErrorString(cudaGetLastError()));
    cudaEventRecord(stop_t, 0);
    cudaEventSynchronize(stop_t);
    cudaEventElapsedTime(&totalElapsed, start_t, stop_t);
    printf("\nHost Time Elapsed %f ms\n", totalElapsed);

    cudaEventDestroy(start_t);
    cudaEventDestroy(stop_t);
    cudaFree(A);
    cudaFree(B);
    return 0;
}
72 |
--------------------------------------------------------------------------------
/sass_cubin/memory_latency.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6-9: c<1-4>
15 | 10-13: vA, vB, vC, vD
16 | 14-17: e1, e2, e3, e4
17 | 18-20: x1, x2, x3
18 | 21-25 ~ tid, warpid, tid32
19 | 32-35 ~ a0, a1, a2, a3
20 | 36-42 ~ smem1, smem2, smem3, e_s1, e_s2
21 | 43-63 ~ c<5-10>
22 | 64-79 ~ e<5-10>
23 |
24 |
25 |
26 | const_a, 8
27 |
28 |
29 | --:-:-:-:2 MOV input_lo, input[0];
30 | --:-:-:-:2 MOV input_hi, input[1];
31 | --:-:-:-:2 MOV clock_lo, clock[0];
32 | --:-:-:-:4 MOV clock_hi, clock[1];
33 |
34 |
35 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
36 | --:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo];
37 | 01:-:-:-:2 CS2R c2, SR_CLOCKLO;
38 | --:-:0:-:2 LDG.E.STRONG.CTA vB, [input_lo+0x4];
39 | 01:-:-:-:2 CS2R c3, SR_CLOCKLO;
40 | --:-:0:-:2 LDG.E.STRONG.CTA vC, [input_lo+0x8];
41 | 01:-:-:-:2 CS2R c4, SR_CLOCKLO;
42 | --:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+0x10000];
43 | 01:-:-:-:2 CS2R c5, SR_CLOCKLO;
44 |
45 | --:-:-:-:4 IADD3 e1, c2, -c1, RZ;
46 | --:-:-:-:4 IADD3 e2, c3, -c2, RZ;
47 | --:-:-:-:4 IADD3 e3, c4, -c3, RZ;
48 | --:-:-:-:4 IADD3 e4, c5, -c4, RZ;
49 |
50 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
51 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2;
52 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3;
53 | --:-:-:-:4 STG.E.SYS [clock_lo+0x1c], e4;
54 |
55 | --:-:-:-:6 NOP;
56 |
57 | --:-:-:-:2 CS2R c1, SR_CLOCKLO;
58 | --:-:1:-:2 LDC.E x1, const_a[0];
59 | 02:-:-:-:2 CS2R c2, SR_CLOCKLO;
60 | --:-:1:-:2 MOV x2, const_a[1];
61 | 02:-:-:-:2 CS2R c3, SR_CLOCKLO;
62 | --:-:1:-:2 MOV x3, const_a[2];
63 | 02:-:-:-:4 CS2R c4, SR_CLOCKLO;
64 |
65 | --:-:-:-:4 IADD3 e1, c2, -c1, RZ;
66 | --:-:-:-:4 IADD3 e2, c3, -c2, RZ;
67 | --:-:-:-:4 IADD3 e3, c4, -c3, RZ;
68 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e1;
69 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e2;
70 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e3;
71 |
72 | --:-:-:-:6 NOP;
73 |
74 | --:-:-:-:2 CS2R smem1, SR_CLOCKLO;
75 | --:-:0:-:2 LDS x1, [RZ+0x0];
76 | 01:-:-:-:4 CS2R smem2, SR_CLOCKLO;
77 |
78 | --:-:-:-:5 IADD3 e_s1, smem2, -smem1, RZ;
79 | --:-:-:-:4 STG.E.SYS [clock_lo+0x18], e_s1;
80 |
81 | --:-:-:-:2 EXIT;
82 |
--------------------------------------------------------------------------------
/schedule/warp_schedule.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 |
// Times an FMA loop on exactly two co-resident warps — warp 0 and warp
// `run_warp` — while all other warps exit immediately, to expose how the
// warp scheduler interleaves them.  Per-lane cycle counts land in
// clock[0..31] (warp 0) and clock[32..63] (warp run_warp).
__global__ void warpScheduleKernel(float* input, float* output, uint* clock, const int run_warp){
    int tid = threadIdx.x;
    int laneid = tid & 0x1f;    // lane within the warp (tid % 32)
    int warpid = tid >> 5;      // logical warp index within the block (tid / 32)

    // Retire every warp except 0 and run_warp ("and" is the ISO alternative
    // token for &&).
    if (warpid != 0 and warpid != run_warp){
        ptxExit();
    }

    input += tid;
    // For the two surviving warps this is 0 or 32: warp 0 writes
    // clock[0..31], warp run_warp writes clock[32..63].
    clock += 32 * warpid / run_warp;


    // Stage the inputs in thread-local storage so the timed loop below
    // performs no global memory accesses.
    float array[128];
    float acc = 0;
    for (int i = 0; i < 128; ++i){
        array[i] = input[i];
    }

    uint c1 = getClock();
    #pragma unroll
    for (int i = 0; i < 128; ++i){
        acc += array[i] * array[i] + 1.0f;
    }
    uint c2 = getClock();

    clock[laneid] = c2 - c1;    // elapsed cycles for this lane's FMA loop
    output[laneid] = acc;       // keep acc live so the loop isn't optimized out
}
39 |
// Sum of the first `size` elements of `array` (host-side helper for
// aggregating per-lane cycle counts).
uint sumArray(uint* array, int size){
    uint total = 0;
    int idx = 0;
    while (idx < size){
        total += array[idx];
        ++idx;
    }
    return total;
}
47 |
48 |
// Runs the warp-schedule benchmark twice — once from the hand-written SASS
// cubin, once from the CUDA-C kernel above — for each choice of second warp.
//
// Fixes vs. original: restored the static_cast<T*> template arguments and the
// <<<gDim, bDim>>> launch syntax that the dump had eaten; the input buffer is
// now initialized before the host->device copy; the uint32 clock copy no
// longer says sizeof(float); allocations are released; "Scedule" typo fixed.
int main(){

    float* input_h;
    float* input_d;
    float* output_h;
    float* output_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int size = 4096;

    input_h = static_cast<float*>(malloc(sizeof(float) * size));
    output_h = static_cast<float*>(malloc(sizeof(float) * size));
    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));

    // Fix: input_h was copied to the device uninitialized.
    for (int i = 0; i < size; ++i){
        input_h[i] = static_cast<float>(i);
    }

    cudaMalloc(&input_d, sizeof(float) * size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * size);

    cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);


    dim3 gDim(1, 1, 1);
    dim3 bDim(256, 1, 1);   // 8 warps; kernel keeps warp 0 plus one chosen warp

    const char* cubin_name = "../sass_cubin/warp_schedule.cubin";
    const char* kernel_name = "warpSchedule";

    printf(">>> SASS Level Warp Schedule Detect\n");
    for (int i = 1; i < 8; ++i){
        void* kernel_args[4] = {&input_d, &output_d, &clock_d, &i};
        cudaMemset(clock_d, 0, sizeof(uint32_t) * size);
        launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args);
        // Blocking copy on the default stream also synchronizes the launch.
        cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);

        printf("    Run Warp <0, %d> Elapsed \t%6u cycle\n", i, sumArray(clock_h, 64));
    }


    printf("\n");
    printf(">>> CUDA-C Level Warp Schedule Detect\n");
    for (int i = 1; i < 8; ++i){
        cudaMemset(clock_d, 0, sizeof(uint32_t) * size);
        warpScheduleKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, i);
        // Fix: was sizeof(float) — same width as uint32_t, but misleading.
        cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);

        printf("    Run Warp <0, %d> Elapsed \t%6u cycle\n", i, sumArray(clock_h, 64));
    }

    // Fix: original leaked all six allocations.
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(input_h);
    free(output_h);
    free(clock_h);

    return 0;
}
--------------------------------------------------------------------------------
/miscellany/reg_bankconflict.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 |
// Launches one register-bank-conflict cubin and prints the per-instruction
// cycle cost derived from the two clock deltas the kernel stores
// (clock[0]: 64-FFMA loop, clock[1]: 64-IADD3 loop).
static void runRegBenchmark(const char* cubin_name, const char* kernel_name,
                            const char* title, void** kernel_args,
                            uint32_t* clock_h, uint32_t* clock_d, int size){
    dim3 gDim(1, 1, 1);
    dim3 bDim(1, 1, 1);   // single thread: pure issue-latency measurement

    launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args);
    cudaMemcpy(clock_h, clock_d, sizeof(uint32_t) * size, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    printf("%s\n", title);
    printf("    FFMA  per \t%.3f cycle\n", static_cast<float>(clock_h[0]) / 64);
    printf("    IADD3 per \t%.3f cycle\n", static_cast<float>(clock_h[1]) / 64);
    printf("\n");
}

// Fixes vs. original: restored the static_cast<T*> template arguments the
// dump had eaten; input is initialized before the host->device copy; the four
// copy-pasted launch/print stanzas are factored into runRegBenchmark; the
// unused output_h allocation is dropped; all allocations are released.
int main(){

    float* input_h;
    float* input_d;
    float* output_d;
    uint32_t* clock_h;
    uint32_t* clock_d;

    int size = 1024;

    input_h = static_cast<float*>(malloc(sizeof(float) * size));
    clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));

    // Fix: input_h was copied to the device uninitialized.
    for (int i = 0; i < size; ++i){
        input_h[i] = static_cast<float>(i);
    }

    cudaMalloc(&input_d, sizeof(float) * size);
    cudaMalloc(&output_d, sizeof(float) * size);
    cudaMalloc(&clock_d, sizeof(uint32_t) * size);

    cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);

    void* kernel_args[] = {&input_d, &output_d, &clock_d};

    runRegBenchmark("../sass_cubin/reg_with_bankconflict.cubin", "regWithBankconflict",
                    ">>> SASS Level Reg With BankConflict IPC Result",
                    kernel_args, clock_h, clock_d, size);

    runRegBenchmark("../sass_cubin/reg_without_bankconflict.cubin", "regWithoutBankconflict",
                    ">>> SASS Level Reg Without BankConflict IPC Result",
                    kernel_args, clock_h, clock_d, size);

    runRegBenchmark("../sass_cubin/reg_reuse_bankconflict.cubin", "regReuseBankconflict",
                    ">>> SASS Level Reg Reuse BankConflict IPC Result",
                    kernel_args, clock_h, clock_d, size);

    runRegBenchmark("../sass_cubin/reg_reuse_double.cubin", "regReuseDouble",
                    ">>> SASS Level Reg Reuse Double IPC Result",
                    kernel_args, clock_h, clock_d, size);

    // Fix: original leaked every allocation.
    cudaFree(input_d);
    cudaFree(output_d);
    cudaFree(clock_d);
    free(input_h);
    free(clock_h);

    return 0;
}
--------------------------------------------------------------------------------
/sass_cubin/cache_linesize.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | output, 8
4 | clock, 8
5 |
6 |
7 |
8 | 0: input_lo
9 | 1: input_hi
10 | 2: output_lo
11 | 3: output_hi
12 | 4: clock_lo
13 | 5: clock_hi
14 | 6: vA
15 | 7: e
16 | 8-240 ~ c<0-200>
17 |
18 |
19 |
20 |
21 | const_a, 1024
22 |
23 |
24 |
25 |
26 | --:-:-:-:2 MOV input_lo, input[0];
27 | --:-:-:-:2 MOV input_hi, input[1];
28 | --:-:-:-:2 MOV output_lo, output[0];
29 | --:-:-:-:2 MOV output_hi, output[1];
30 | --:-:-:-:2 MOV clock_lo, clock[0];
31 | --:-:-:-:4 MOV clock_hi, clock[1];
32 |
33 |
34 | --:-:-:-:2 CS2R c0, SR_CLOCKLO;
35 |
36 | SASS_CODE = []
37 | loop_size = 200
38 |
39 | LDG = "--:-:0:-:2 LDG.E.STRONG.GPU vA, [input_lo+{:}];"
40 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
41 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
42 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
43 |
44 | for i in range(loop_size):
45 | SASS_CODE += [LDG.format(hex(i * 4))]
46 | SASS_CODE += [CS2R.format(i+1)]
47 |
48 | for i in range(loop_size):
49 | SASS_CODE += [IADD.format(i+1, i)]
50 | SASS_CODE += [STG.format(hex(i*4), i)]
51 |
52 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ;
61 | --:-:-:-:2 CS2R c0, SR_CLOCKLO;
62 |
63 |
64 | SASS_CODE = []
65 | loop_size = 200
66 |
67 | LDG = "--:-:0:-:2 LDG.E.STRONG.CTA vA, [input_lo+{:}];"
68 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
69 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
70 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
71 |
72 | for i in range(loop_size):
73 | SASS_CODE += [LDG.format(hex(i * 4))]
74 | SASS_CODE += [CS2R.format(i+1)]
75 |
76 | for i in range(loop_size):
77 | SASS_CODE += [IADD.format(i+1, i)]
78 | SASS_CODE += [STG.format(hex(i*4), i)]
79 |
80 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
81 |
82 |
83 |
84 |
85 |
86 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ;
87 | --:-:-:-:2 CS2R c0, SR_CLOCKLO;
88 |
89 | SASS_CODE = []
90 | loop_size = 200
91 |
92 | LDC = "--:-:0:-:2 LDC.E vA, const_a[{:}];"
93 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
94 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
95 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
96 |
97 | for i in range(loop_size):
98 | SASS_CODE += [LDC.format(i)]
99 | SASS_CODE += [CS2R.format(i+1)]
100 |
101 | for i in range(loop_size):
102 | SASS_CODE += [IADD.format(i+1, i)]
103 | SASS_CODE += [STG.format(hex(i*4), i)]
104 |
105 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
106 |
107 |
108 |
109 | --:-:-:-:2 IADD3 clock_lo, clock_lo, 0x800, RZ;
110 | --:-:-:-:2 CS2R c0, SR_CLOCKLO;
111 |
112 | SASS_CODE = []
113 | loop_size = 200
114 |
115 | LDC = "--:-:0:-:2 MOV vA, const_a[{:}];"
116 | CS2R = "01:-:-:-:4 CS2R c{:}, SR_CLOCKLO;"
117 | IADD = "--:-:-:-:5 IADD3 e, c{:}, -c{:}, RZ;"
118 | STG = "--:-:-:-:4 STG.E.SYS [clock_lo+{:}], e;"
119 |
120 | for i in range(loop_size):
121 | SASS_CODE += [LDC.format(i)]
122 | SASS_CODE += [CS2R.format(i+1)]
123 |
124 | for i in range(loop_size):
125 | SASS_CODE += [IADD.format(i+1, i)]
126 | SASS_CODE += [STG.format(hex(i*4), i)]
127 |
128 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
129 |
130 |
131 | --:-:-:-:2 EXIT;
132 |
--------------------------------------------------------------------------------
/sass_cubin/memory_bandwidth_thread.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | clock, 8
4 |
5 |
6 |
7 | 0: input_lo
8 | 1: input_hi
9 | 4: clock_lo
10 | 5: clock_hi
11 | 8-11: v<1-4>
12 | 12-30 ~ c<1-12>
13 | 31-40 ~ e<1-6>
14 |
15 |
16 | --:-:-:-:2 MOV input_lo, input[0];
17 | --:-:-:-:2 MOV input_hi, input[1];
18 | --:-:-:-:2 MOV clock_lo, clock[0];
19 | --:-:-:-:4 MOV clock_hi, clock[1];
20 |
21 | --:-:3:-:1 LDG.E.128.STRONG.CTA v1, [input_lo]; // warmup
22 |
23 | #########################################################################################
24 |
25 | 08:-:-:-:3 CS2R c1, SR_CLOCKLO;
26 |
27 | LDG_128_to_reg = "--:-:0:-:1 LDG.E.128.STRONG.CTA v1, [input_lo+{:}];"
28 |
29 | SASS_CODE = []
30 | for i in range(1024):
31 | pos = hex(i * 16)
32 | SASS_CODE += [LDG_128_to_reg.format(pos)]
33 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
34 |
35 | 01:-:-:-:6 CS2R c2, SR_CLOCKLO;
36 |
37 | --:-:-:-:3 CS2R c3, SR_CLOCKLO;
38 |
39 | LDS_128_to_reg = "--:-:1:-:1 LDS.128 v1, [RZ+{:}];"
40 |
41 | SASS_CODE = []
42 | for i in range(256):
43 | pos = hex(i * 16)
44 | SASS_CODE += [LDS_128_to_reg.format(pos)]
45 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
46 |
47 | 02:-:-:-:6 CS2R c4, SR_CLOCKLO;
48 |
49 | #########################################################################################
50 |
51 | --:-:-:-:3 CS2R c5, SR_CLOCKLO;
52 |
53 | LDG_64_to_reg = "--:-:0:-:1 LDG.E.64.STRONG.CTA v1, [input_lo+{:}];"
54 |
55 | SASS_CODE = []
56 | for i in range(1024 * 2):
57 | pos = hex(i * 8)
58 | SASS_CODE += [LDG_64_to_reg.format(pos)]
59 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
60 |
61 | 01:-:-:-:6 CS2R c6, SR_CLOCKLO;
62 |
63 | --:-:-:-:3 CS2R c7, SR_CLOCKLO;
64 |
65 | LDS_64_to_reg = "--:-:1:-:1 LDS.64 v1, [RZ+{:}];"
66 |
67 | SASS_CODE = []
68 | for i in range(256 * 2):
69 | pos = hex(i * 8)
70 | SASS_CODE += [LDS_64_to_reg.format(pos)]
71 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
72 |
73 | 02:-:-:-:6 CS2R c8, SR_CLOCKLO;
74 |
75 | #########################################################################################
76 |
77 | --:-:-:-:3 CS2R c9, SR_CLOCKLO;
78 |
79 | LDG_32_to_reg = "--:-:0:-:1 LDG.E.STRONG.CTA v1, [input_lo+{:}];"
80 |
81 | SASS_CODE = []
82 | for i in range(1024 * 4):
83 | pos = hex(i * 4)
84 | SASS_CODE += [LDG_32_to_reg.format(pos)]
85 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
86 |
87 | 01:-:-:-:6 CS2R c10, SR_CLOCKLO;
88 |
89 | --:-:-:-:3 CS2R c11, SR_CLOCKLO;
90 |
91 | LDS_32_to_reg = "--:-:1:-:1 LDS v1, [RZ+{:}];"
92 |
93 | SASS_CODE = []
94 | for i in range(256 * 4):
95 | pos = hex(i * 4)
96 | SASS_CODE += [LDS_32_to_reg.format(pos)]
97 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
98 |
99 | 02:-:-:-:6 CS2R c12, SR_CLOCKLO;
100 |
101 | #########################################################################################
102 |
103 | --:-:-:-:2 IADD3 e1, c2, -c1, RZ;
104 | --:-:-:-:2 IADD3 e2, c4, -c3, RZ;
105 | --:-:-:-:2 IADD3 e3, c6, -c5, RZ;
106 | --:-:-:-:2 IADD3 e4, c8, -c7, RZ;
107 | --:-:-:-:2 IADD3 e5, c10, -c9, RZ;
108 | --:-:-:-:2 IADD3 e6, c12, -c11, RZ;
109 |
110 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
111 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2;
112 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3;
113 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e4;
114 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e5;
115 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e6;
116 |
117 | --:-:-:-:2 EXIT;
118 |
119 |
--------------------------------------------------------------------------------
/memory/cache_linesize.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 |
10 | __constant__ float cinput[1024];
11 |
12 | __global__ void linesizeDetectKernel(float* input, float* output, uint* clock, float* cinput){
13 |
14 | uint c[256];
15 | float val = 0;
16 |
17 | float acc = 0;
18 | c[0] = getClock();
19 | #pragma unroll
20 | for (int i = 0; i < 256; ++i){
21 | asm volatile(
22 | "ld.global.cg.b32 %0, [%1]; \n\t"
23 | :"=f"(val):"l"(input):"memory"
24 | );
25 | c[i+1] = getClock();
26 | acc += val;
27 | input += 2;
28 | }
29 | #pragma unroll
30 | for (int i = 0; i < 256; ++i){
31 | clock[i] = c[i+1] - c[i];
32 | }
33 | output[0] = acc;
34 |
35 | /////////////////////////////////////////////////////////////////////////
36 |
37 | input += 1024;
38 | clock += 512;
39 | acc = 0;
40 | c[0] = getClock();
41 | #pragma unroll
42 | for (int i = 0; i < 256; ++i){
43 | asm volatile(
44 | "ld.global.ca.f32 %0, [%1]; \n\t"
45 | :"=f"(val):"l"(input):"memory"
46 | );
47 | c[i+1] = getClock();
48 | acc += val;
49 | input++;
50 | }
51 | #pragma unroll
52 | for (int i = 0; i < 256; ++i){
53 | clock[i] = c[i+1] - c[i];
54 | }
55 | output[1] = acc;
56 | }
57 |
58 |
59 | int detectCacheLinesize(uint* clock, int size, uint gap){
60 | int linesize = 0;
61 | uint last_cycle = clock[0];
62 |
63 | int first = 0;
64 | int second = 0;
65 |
66 | // formatArray(clock, 256, 16);
67 | for (int i = 1; i < size; ++i){
68 | if (clock[i] > last_cycle and clock[i] - last_cycle > gap) {
69 | if (first == 0){
70 | first = i;
71 | } else {
72 | second = i;
73 | break;
74 | }
75 | }
76 | last_cycle = clock[i];
77 | }
78 | return (second - first) * 4;
79 | }
80 |
81 |
82 | int main(){
83 | float* input_h;
84 | float* input_d;
85 | float* output_h;
86 | float* output_d;
87 | uint* clock_h;
88 | uint* clock_d;
89 |
90 | int size = 4096;
91 |
92 | input_h = static_cast<float*>(malloc(sizeof(float) * size));
93 | output_h = static_cast<float*>(malloc(sizeof(float) * size));
94 | clock_h = static_cast<uint*>(malloc(sizeof(uint) * size));
95 |
96 | cudaMalloc(&input_d, sizeof(float) * size);
97 | cudaMalloc(&output_d, sizeof(float) * size);
98 | cudaMalloc(&clock_d, sizeof(uint) * size);
99 |
100 | cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);
101 |
102 | dim3 gDim(1, 1, 1);
103 | dim3 bDim(1, 1, 1);
104 |
105 | void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput};
106 | const char* cubin_name = "../sass_cubin/cache_linesize.cubin";
107 | const char* kernel_name = "cacheLinesize";
108 |
109 | launchSassKernel(cubin_name, kernel_name, gDim, bDim, 0, kernel_args);
110 | cudaMemcpy(clock_h, clock_d, sizeof(float) * size, cudaMemcpyDeviceToHost);
111 | cudaDeviceSynchronize();
112 | printf(">>> SASS Level Cache Linesize Result\n");
113 | printf(" Global L2 LineSize \t= %3u B\n", detectCacheLinesize(clock_h, 512, 40));
114 | printf(" Global L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 512, 512, 10));
115 | printf(" Constant L2 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 1024, 512, 100));
116 | printf(" Constant L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 1536, 512, 10));
117 |
118 |
119 |
120 | linesizeDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput);
121 | cudaMemcpy(clock_h, clock_d, sizeof(float) * size, cudaMemcpyDeviceToHost);
122 | cudaDeviceSynchronize();
123 | printf("\n");
124 | printf(">>> CUDA-C Level Cache Linesize Result\n");
125 | printf(" Global L2 LineSize \t= %3u B\n", detectCacheLinesize(clock_h, 512, 40));
126 | printf(" Global L1 LineSize \t= %3u B\n", detectCacheLinesize(clock_h + 512, 512, 10));
127 | return 0;
128 | }
--------------------------------------------------------------------------------
/sass_cubin/memory_bandwidth_block.sass:
--------------------------------------------------------------------------------
1 |
2 | input, 8
3 | clock, 8
4 |
5 |
6 |
7 | 0: input_lo
8 | 1: input_hi
9 | 4: clock_lo
10 | 5: clock_hi
11 | 8-11: v<1-4>
12 | 12-30 ~ c<1-12>
13 | 31-40 ~ e<1-6>
14 | 41-44 ~ tid, tid_x_2, tid_x_4, clock_set
15 | 46-47: input_lo_x_4, input_hi_x_4
16 | 48-49: input_lo_x_2, input_hi_x_2
17 |
18 |
19 | --:-:-:-:2 MOV input_lo, input[0];
20 | --:-:-:-:2 MOV input_hi, input[1];
21 | --:-:-:-:2 MOV input_hi_x_4, input[1];
22 | --:-:-:-:2 MOV input_hi_x_2, input[1];
23 | --:-:-:-:2 MOV clock_lo, clock[0];
24 | --:-:-:-:4 MOV clock_hi, clock[1];
25 |
26 | --:-:3:-:1 LDG.E.32.STRONG.CTA v1, [input_lo]; // warmup
27 |
28 | --:-:0:-:5 S2R tid, SR_TID.X; // tid = threadIdx.x
29 | 01:-:-:-:6 SHF.L.S32.HI tid, RZ, 0x2, tid; // sizeof(T) == 4
30 |
31 | --:-:-:-:5 SHF.L.S32.HI tid_x_4, RZ, 0x2, tid;
32 | --:-:-:-:5 SHF.L.S32.HI tid_x_2, RZ, 0x1, tid;
33 |
34 | --:-:-:-:5 IADD3 input_lo_x_4, input_lo, tid_x_4, RZ;
35 | --:-:-:-:5 IADD3 input_lo_x_2, input_lo, tid_x_2, RZ;
36 |
37 | #########################################################################################
38 |
39 | 08:-:-:-:3 CS2R c1, SR_CLOCKLO;
40 |
41 | LDG_128_to_reg = "--:-:0:-:1 LDG.E.128.STRONG.CTA v1, [input_lo_x_4+{:}];"
42 |
43 | SASS_CODE = []
44 | for i in range(128):
45 | pos = hex(i * 16 * 256)
46 | SASS_CODE += [LDG_128_to_reg.format(pos)]
47 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
48 |
49 |
50 | 01:-:-:-:6 CS2R c2, SR_CLOCKLO;
51 |
52 | --:-:-:-:3 CS2R c3, SR_CLOCKLO;
53 |
54 |
55 | LDS_128_to_reg = "--:-:1:-:1 LDS.128 v1, [tid_x_4+{:}];"
56 |
57 | SASS_CODE = []
58 | for i in range(8):
59 | pos = hex(i * 16 * 256)
60 | SASS_CODE += [LDS_128_to_reg.format(pos)]
61 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
62 |
63 | 02:-:-:-:6 CS2R c4, SR_CLOCKLO;
64 |
65 | #########################################################################################
66 |
67 | --:-:-:-:3 CS2R c5, SR_CLOCKLO;
68 |
69 | LDG_64_to_reg = "--:-:0:-:1 LDG.E.64.STRONG.CTA v1, [input_lo_x_2+{:}];"
70 |
71 | SASS_CODE = []
72 | for i in range(256):
73 | pos = hex(i * 8 * 256)
74 | SASS_CODE += [LDG_64_to_reg.format(pos)]
75 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
76 |
77 |
78 | 01:-:-:-:6 CS2R c6, SR_CLOCKLO;
79 |
80 | --:-:-:-:3 CS2R c7, SR_CLOCKLO;
81 |
82 |
83 | LDS_64_to_reg = "--:-:1:-:1 LDS.64 v1, [tid_x_2+{:}];"
84 |
85 | SASS_CODE = []
86 | for i in range(16):
87 | pos = hex(i * 8 * 256)
88 | SASS_CODE += [LDS_64_to_reg.format(pos)]
89 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
90 |
91 | 02:-:-:-:6 CS2R c8, SR_CLOCKLO;
92 |
93 | #########################################################################################
94 |
95 | --:-:-:-:3 CS2R c9, SR_CLOCKLO;
96 |
97 | LDG_32_to_reg = "--:-:0:-:1 LDG.E.STRONG.CTA v1, [input_lo+{:}];"
98 |
99 | SASS_CODE = []
100 | for i in range(512):
101 | pos = hex(i * 4 * 256)
102 | SASS_CODE += [LDG_32_to_reg.format(pos)]
103 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
104 |
105 |
106 | 01:-:-:-:6 CS2R c10, SR_CLOCKLO;
107 |
108 | --:-:-:-:3 CS2R c11, SR_CLOCKLO;
109 |
110 |
111 | LDS_32_to_reg = "--:-:1:-:1 LDS v1, [tid+{:}];"
112 |
113 | SASS_CODE = []
114 | for i in range(32):
115 | pos = hex(i * 4 * 256)
116 | SASS_CODE += [LDS_32_to_reg.format(pos)]
117 | out_ = "\n" + "\n".join(SASS_CODE) + "\n"
118 |
119 | 02:-:-:-:6 CS2R c12, SR_CLOCKLO;
120 |
121 | #########################################################################################
122 |
123 | --:-:-:-:6 IMAD.WIDE clock_lo, tid, 0x6, clock_lo;
124 |
125 | --:-:-:-:2 IADD3 e1, c2, -c1, RZ;
126 | --:-:-:-:2 IADD3 e2, c4, -c3, RZ;
127 | --:-:-:-:2 IADD3 e3, c6, -c5, RZ;
128 | --:-:-:-:2 IADD3 e4, c8, -c7, RZ;
129 | --:-:-:-:2 IADD3 e5, c10, -c9, RZ;
130 | --:-:-:-:2 IADD3 e6, c12, -c11, RZ;
131 |
132 | --:-:-:-:4 STG.E.SYS [clock_lo], e1;
133 | --:-:-:-:4 STG.E.SYS [clock_lo+0x4], e2;
134 | --:-:-:-:4 STG.E.SYS [clock_lo+0x8], e3;
135 | --:-:-:-:4 STG.E.SYS [clock_lo+0xc], e4;
136 | --:-:-:-:4 STG.E.SYS [clock_lo+0x10], e5;
137 | --:-:-:-:4 STG.E.SYS [clock_lo+0x14], e6;
138 |
139 | --:-:-:-:2 EXIT;
140 |
141 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GPU Arch Microbenchmark
2 |
3 |
4 | ## Prerequisites
5 | 1. install `turingas` compiler
6 | > `git clone --recursive git@github.com:sjfeng1999/gpu-arch-microbenchmark.git`
7 | > `cd turingas`
8 | > `python setup.py install`
9 |
10 | ## Usage
11 | 1. `mkdir build && cd build`
12 | 2. `cmake .. && make`
13 | 3. `python ../compile_sass.py -arch=(70|75|80)`
14 | 4. `./(memory_latency|reg_bankconflict|...)`
15 |
16 | ## Microbenchmark
17 |
18 | ### 1. Memory Latency
19 |
20 | |Device |Latency |Turing RTX-2070 (TU104)|
21 | |:--------------------------:|:---------:|:---------------------:|
22 | |Global Latency |cycle | 1000 ~ 1200 |
23 | |TLB Latency |cycle | 472 |
24 | |L2 Latency |cycle | 236 |
25 | |L1 Latency |cycle | 32 |
26 | |Shared Latency |cycle | 23 |
27 | |Constant Latency |cycle | 448 |
28 | |Constant L2 Latency |cycle | 62 |
29 | |Constant L1 Latency |cycle | 4 |
30 |
31 | - const L1-cache is as fast as register.
32 |
33 | ### 2. Memory Bandwidth
34 |
35 | 1. memory bandwidth within one thread
36 |
37 | |Device | Bandwidth | Turing RTX-2070 |
38 | |:--------------:|:-----------:|:---------------:|
39 | |Global LDG.128 | GB/s |194.12 |
40 | |Global LDG.64 | GB/s |140.77 |
41 | |Global LDG.32 | GB/s |54.18 |
42 | |Shared LDS.128 | GB/s |152.96 |
43 | |Shared LDS.64 | GB/s |30.58 |
44 | |Shared LDS.32 | GB/s |13.32 |
45 |
46 | 1. global memory bandwidth within (64 block * 256 thread)
47 |
48 | |Device | Bandwidth | Turing RTX-2070 |
49 | |:--------------------------:|:-----------:|:---------------:|
50 | |LDG.32 | GB/s |246.65 |
51 | |LDG.32 Group1 Stride1 | GB/s |118.73(2X) |
52 | |LDG.32 Group2 Stride2 | GB/s |119.08(2X) |
53 | |LDG.32 Group4 Stride4 | GB/s |117.11(2X) |
54 | |LDG.32 Group8 Stride8 | GB/s |336.27 |
55 | |LDG.64 | GB/s |379.24 |
56 | |LDG.64 Group1 Stride1 | GB/s |126.40(2X) |
57 | |LDG.64 Group2 Stride2 | GB/s |124.51(2X) |
58 | |LDG.64 Group4 Stride4 | GB/s |398.84 |
59 | |LDG.64 Group8 Stride8 | GB/s |371.28 |
60 | |LDG.128 | GB/s |391.83 |
61 | |LDG.128 Group1 Stride1 | GB/s |125.25(2X) |
62 | |LDG.128 Group2 Stride2 | GB/s |402.55 |
63 | |LDG.128 Group4 Stride4 | GB/s |394.22 |
64 | |LDG.128 Group8 Stride8 | GB/s |396.10 |
65 |
66 | ### 3. Cache Linesize
67 |
68 | |Device | Linesize | Turing RTX-2070(TU104)|
69 | |:--------------------------:|:---------:|:---------------------:|
70 | |L2 Linesize |bytes | 64 |
71 | |L1 Linesize |bytes | 32 |
72 | |Constant L2 Linesize |bytes | 256 |
73 | |Constant L1 Linesize |bytes | 32 |
74 |
75 | ### 4. Reg Bankconflict
76 |
77 | | Instruction |CPI | conflict | without conflict | reg reuse | double reuse |
78 | |:-----------:|:-------:|:--------:|:----------------:|:---------:|:------------:|
79 | |FFMA | cycle | 3.516 | 2.969 | 2.938 | 2.938 |
80 | |IADD3 | cycle | 3.031 | 2.062 | 2.031 | 2.031 |
81 |
82 |
83 | ### 5. Shared Bankconflict
84 |
85 | | Memory Load | Latency | Turing RTX-2070 (TU104)|
86 | |:----------------------:|:---------:|:----------------------:|
87 | | Single | cycle | 23 |
88 | | Vector2 X 2 | cycle | 27 |
89 | | Conflict Strided | cycle | 41 |
90 | | Conflict-Free Strided | cycle | 32 |
91 |
92 |
93 | ## Instruction Efficiency
94 |
95 |
96 | ## Roadmap
97 |
98 | - [ ] warp schedule
99 | - [ ] L1/L2 cache n-way k-set
100 |
101 | # Citation
102 | - Jia, Zhe, et al. "Dissecting the NVIDIA volta GPU architecture via microbenchmarking." arXiv preprint arXiv:1804.06826 (2018).
103 | - Jia, Zhe, et al. "Dissecting the NVidia Turing T4 GPU via microbenchmarking." arXiv preprint arXiv:1903.07486 (2019).
104 | - Yan, Da, Wei Wang, and Xiaowen Chu. "Optimizing batched winograd convolution on GPUs." Proceedings of the 25th ACM SIGPLAN symposium on principles and practice of parallel programming. 2020. [**(turingas)**](https://github.com/daadaada/turingas)
105 |
--------------------------------------------------------------------------------
/memory/memory_bandwidth.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 | const float kMemoryFrequency_MHz = 5000.0f; // 5000MHz
10 |
11 | float calculateBandWidth(uint elapsed_cycle, const int data_bytes) {
12 | float second_x_1024_x_1024 = static_cast<float>(elapsed_cycle) / kMemoryFrequency_MHz;
13 | float data_KBytes = static_cast<float>(data_bytes) / 1024;
14 | return data_KBytes / second_x_1024_x_1024;
15 | }
16 |
17 | template <typename T>
18 | uint getAvgElapsedCycle(int thread_group_size, int stride_size, T* data) {
19 | T acc = 0;
20 | for (int i = 0; i < thread_group_size; ++i) {
21 | acc += data[i * stride_size];
22 | }
23 | return static_cast<uint>(acc) / thread_group_size;
24 | }
25 |
26 |
27 | int main(){
28 | float* input_d;
29 | uint32_t* clock_h;
30 | uint32_t* clock_d;
31 |
32 | int global_size = 4 * 1024 * 1024;
33 | int shared_size = 32 * 1024;
34 |
35 | clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * global_size));
36 |
37 | cudaMalloc(&input_d, sizeof(float) * global_size);
38 | cudaMalloc(&clock_d, sizeof(uint32_t) * global_size);
39 |
40 | void* kernel_args[] = {&input_d, &clock_d};
41 |
42 |
43 | dim3 gDim1(1, 1, 1);
44 | dim3 bDim1(1, 1, 1);
45 | int global_load_bytes = 512 * 1024;
46 | int shared_load_bytes = 32 * 1024;
47 |
48 | const char* cubin_name1 = "../sass_cubin/memory_bandwidth_thread.cubin";
49 | const char* kernel_name1 = "memoryBandwidthThread";
50 | launchSassKernel(cubin_name1, kernel_name1, gDim1, bDim1, shared_size, kernel_args);
51 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * global_size, cudaMemcpyDeviceToHost);
52 | cudaDeviceSynchronize();
53 |
54 | printf(">>> SASS Level Memory BandWidth Result\n");
55 | printf(" Global Memory Load %9d Bytes\n", global_load_bytes);
56 | printf(" Shared Memory Load %9d Bytes\n", shared_load_bytes);
57 | printf(" Within Thread Result\n");
58 | printf(" LDG.128 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
59 | getAvgElapsedCycle(1, 6, clock_h + 0), calculateBandWidth(getAvgElapsedCycle(1, 6, clock_h + 0), global_load_bytes));
60 | printf(" LDG.64 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
61 | getAvgElapsedCycle(1, 6, clock_h + 2), calculateBandWidth(getAvgElapsedCycle(1, 6, clock_h + 2), global_load_bytes));
62 | printf(" LDG.32 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
63 | getAvgElapsedCycle(1, 6, clock_h + 4), calculateBandWidth(getAvgElapsedCycle(1, 6, clock_h + 4), global_load_bytes));
64 | printf(" LDS.128 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
65 | getAvgElapsedCycle(1, 6, clock_h + 1), calculateBandWidth(getAvgElapsedCycle(1, 6, clock_h + 1), shared_load_bytes));
66 | printf(" LDS.64 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
67 | getAvgElapsedCycle(1, 6, clock_h + 3), calculateBandWidth(getAvgElapsedCycle(1, 6, clock_h + 3), shared_load_bytes));
68 | printf(" LDS.32 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
69 | getAvgElapsedCycle(1, 6, clock_h + 5), calculateBandWidth(getAvgElapsedCycle(1, 6, clock_h + 5), shared_load_bytes));
70 | printf("\n");
71 |
72 |
73 | dim3 gDim2(1, 1, 1);
74 | dim3 bDim2(256, 1, 1);
75 | const char* cubin_name2 = "../sass_cubin/memory_bandwidth_block.cubin";
76 | const char* kernel_name2 = "memoryBandwidthBlock";
77 | launchSassKernel(cubin_name2, kernel_name2, gDim2, bDim2, shared_size, kernel_args);
78 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * global_size, cudaMemcpyDeviceToHost);
79 | cudaDeviceSynchronize();
80 |
81 | printf(" Thread Average Result Within Block\n");
82 | printf(" LDG.128 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
83 | getAvgElapsedCycle(256, 6, clock_h + 0), calculateBandWidth(getAvgElapsedCycle(256, 6, clock_h + 0), global_load_bytes));
84 | printf(" LDG.64 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
85 | getAvgElapsedCycle(256, 6, clock_h + 2), calculateBandWidth(getAvgElapsedCycle(256, 6, clock_h + 2), global_load_bytes));
86 | printf(" LDG.32 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
87 | getAvgElapsedCycle(256, 6, clock_h + 4), calculateBandWidth(getAvgElapsedCycle(256, 6, clock_h + 4), global_load_bytes));
88 | printf(" LDS.128 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
89 | getAvgElapsedCycle(256, 6, clock_h + 1), calculateBandWidth(getAvgElapsedCycle(256, 6, clock_h + 1), shared_load_bytes));
90 | printf(" LDS.64 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
91 | getAvgElapsedCycle(256, 6, clock_h + 3), calculateBandWidth(getAvgElapsedCycle(256, 6, clock_h + 3), shared_load_bytes));
92 | printf(" LDS.32 Elaped Cycle \t=%8u cycle Bandwidth =\t %5.2f GB/s\n",
93 | getAvgElapsedCycle(256, 6, clock_h + 5), calculateBandWidth(getAvgElapsedCycle(256, 6, clock_h + 5), shared_load_bytes));
94 | printf("\n");
95 | return 0;
96 | }
--------------------------------------------------------------------------------
/miscellany/shared_bankconflict.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 |
10 | __global__ void sharedBankconflictKernel(float* input, float* output, uint32_t* clock){
11 |
12 | asm volatile (
13 | ".reg.f32 val1, val2, val3, val4; \n\t"
14 | ".reg.u32 c_1, c_2; \n\t"
15 | ".reg.u32 e_1; \n\t"
16 | ".shared.b32 smem[1024]; \n\t"
17 |
18 | "mov.u32 c_1, %%clock; \n\t"
19 | "ld.shared.f32 val1, [smem + 0x100]; \n\t"
20 | "mov.u32 c_2, %%clock; \n\t"
21 | "sub.u32 e_1, c_2, c_1; \n\t"
22 | "st.global.u32 [%2], e_1; \n\t"
23 | "st.global.f32 [%1] , val1; \n\t"
24 |
25 | //////////////////////////////////////////////////////////////////
26 |
27 | "mov.u32 c_1, %%clock; \n\t"
28 | "ld.shared.f32 val1, [smem]; \n\t"
29 | "ld.shared.f32 val2, [smem + 0x80]; \n\t"
30 | "ld.shared.f32 val3, [smem + 0x100]; \n\t"
31 | "ld.shared.f32 val4, [smem + 0x180]; \n\t"
32 | "mov.u32 c_2, %%clock; \n\t"
33 |
34 | "sub.u32 e_1, c_2, c_1; \n\t"
35 | "st.global.u32 [%2 + 0x4], e_1; \n\t"
36 | "st.global.f32 [%1 + 0x10], val1; \n\t"
37 | "st.global.f32 [%1 + 0x20], val2; \n\t"
38 | "st.global.f32 [%1 + 0x30], val3; \n\t"
39 | "st.global.f32 [%1 + 0x40], val4; \n\t"
40 |
41 | //////////////////////////////////////////////////////////////////
42 |
43 | "mov.u32 c_1, %%clock; \n\t"
44 | "ld.shared.f32 val1, [smem]; \n\t"
45 | "ld.shared.f32 val2, [smem + 0x84]; \n\t"
46 | "ld.shared.f32 val3, [smem + 0x108]; \n\t"
47 | "ld.shared.f32 val4, [smem + 0x18c]; \n\t"
48 | "mov.u32 c_2, %%clock; \n\t"
49 |
50 | "sub.u32 e_1, c_2, c_1; \n\t"
51 | "st.global.u32 [%2 + 0x8], e_1; \n\t"
52 | "st.global.f32 [%1 + 0x44], val1; \n\t"
53 | "st.global.f32 [%1 + 0x14], val2; \n\t"
54 | "st.global.f32 [%1 + 0x24], val3; \n\t"
55 | "st.global.f32 [%1 + 0x34], val4; \n\t"
56 |
57 | //////////////////////////////////////////////////////////////////
58 |
59 | "mov.u32 c_1, %%clock; \n\t"
60 | "ld.shared.v2.f32 {val1, val2}, [smem]; \n\t"
61 | "ld.shared.v2.f32 {val3, val4}, [smem + 0x8]; \n\t"
62 | "mov.u32 c_2, %%clock; \n\t"
63 |
64 | "sub.u32 e_1, c_2, c_1; \n\t"
65 | "st.global.u32 [%2 + 0xc], e_1; \n\t"
66 | "st.global.f32 [%1 + 0x48] , val1; \n\t"
67 | "st.global.f32 [%1 + 0x18], val2; \n\t"
68 | "st.global.f32 [%1 + 0x28], val3; \n\t"
69 | "st.global.f32 [%1 + 0x38], val4; \n\t"
70 |
71 | //////////////////////////////////////////////////////////////////
72 | ::"l"(input),"l"(output),"l"(clock):"memory"
73 | );
74 | }
75 |
76 | int main(){
77 |
78 | float* input_h;
79 | float* input_d;
80 | float* output_h;
81 | float* output_d;
82 | uint32_t* clock_h;
83 | uint32_t* clock_d;
84 |
85 | int size = 1024;
86 |
87 | input_h = static_cast<float*>(malloc(sizeof(float) * size));
88 | output_h = static_cast<float*>(malloc(sizeof(float) * size));
89 | clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
90 |
91 |
92 | cudaMalloc(&input_d, sizeof(float) * size);
93 | cudaMalloc(&output_d, sizeof(float) * size);
94 | cudaMalloc(&clock_d, sizeof(uint32_t) * size);
95 |
96 | cudaMemcpy(input_d, input_h, sizeof(float) * size, cudaMemcpyHostToDevice);
97 |
98 |
99 | dim3 gDim(1, 1, 1);
100 | dim3 bDim(1, 1, 1);
101 |
102 | void* kernel_args[] = {&input_d, &output_d, &clock_d};
103 |
104 |
105 | const char* cubin_name = "../sass_cubin/shared_bankconflict.cubin";
106 | const char* kernel_name = "sharedBankconflict";
107 | launchSassKernel(cubin_name, kernel_name, gDim, bDim, size * sizeof(float), kernel_args);
108 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
109 | cudaDeviceSynchronize();
110 |
111 | printf(">>> SASS Level Shared Load BankConflict Result\n");
112 | printf(" Single Load [0x100] Elapsed \t%3u cycle\n", clock_h[0]);
113 | printf(" Vector Load [0x0, 0x4 , 0x8 , 0xc ] Elapsed \t%3u cycle\n", clock_h[1]);
114 | printf(" WithConflict Load [0x0, 0x80, 0x100, 0x180] Elapsed \t%3u cycle\n", clock_h[2]);
115 | printf(" WithoutConflict Load [0x0, 0x84, 0x108, 0x18c] Elapsed \t%3u cycle\n", clock_h[3]);
116 |
117 |
118 | sharedBankconflictKernel<<<gDim, bDim>>>(input_d, output_d, clock_d);
119 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
120 | cudaDeviceSynchronize();
121 | printf("\n");
122 | printf(">>> CUDA-C Level Shared Load BankConflict Result\n");
123 | printf(" Single Load [0x100] Elapsed \t%3u cycle\n", clock_h[0]);
124 | printf(" Vector Load [0x0, 0x4 , 0x8 , 0xc ] Elapsed \t%3u cycle\n", clock_h[3]);
125 | printf(" WithConflict Load [0x0, 0x80, 0x100, 0x180] Elapsed \t%3u cycle\n", clock_h[1]);
126 | printf(" WithoutConflict Load [0x0, 0x84, 0x108, 0x18c] Elapsed \t%3u cycle\n", clock_h[2]);
127 |
128 | return 0;
129 | }
--------------------------------------------------------------------------------
/memory/memory_latency.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 |
10 | __constant__ float cinput[1024];
11 |
12 | __global__ void latencyDetectKernel(float* input, float* output, uint32_t* clock, float* cinput){
13 |
14 | input += 1024 * 1024 * 1024 / sizeof(float) / 2;
15 | cinput += 512;
16 |
17 | asm volatile (
18 | ".reg.f32 val1, val2, val3; \n\t"
19 | ".reg.u32 c_1, c_2, c_3, c_4; \n\t"
20 | ".reg.u32 e_1, e_2, e_3; \n\t"
21 | ".reg.u32 e_4, e_5, e_6; \n\t"
22 | ".shared.b8 smem[32]; \n\t"
23 |
24 |
25 | "mov.u32 c_1, %%clock; \n\t"
26 | "ld.global.cg.f32 val1, [%0]; \n\t"
27 | "mov.u32 c_2, %%clock; \n\t"
28 | "ld.global.ca.f32 val2, [%0 + 0x4]; \n\t"
29 | "mov.u32 c_3, %%clock; \n\t"
30 | "ld.global.ca.f32 val3, [%0 + 0x8]; \n\t"
31 | "mov.u32 c_4, %%clock; \n\t"
32 |
33 | "sub.u32 e_1, c_2, c_1; \n\t"
34 | "sub.u32 e_2, c_3, c_2; \n\t"
35 | "sub.u32 e_3, c_4, c_3; \n\t"
36 |
37 | "add.f32 val1, val1, val2; \n\t"
38 | "add.f32 val1, val1, val3; \n\t"
39 |
40 | "st.global.u32 [%2], e_1; \n\t"
41 | "st.global.u32 [%2 + 0x4], e_2; \n\t"
42 | "st.global.u32 [%2 + 0x8], e_3; \n\t"
43 |
44 | "st.global.f32 [%1], val1; \n\t"
45 | "st.global.f32 [%1 + 0x4], val2; \n\t"
46 | "st.global.f32 [%1 + 0x8], val3; \n\t"
47 |
48 | ///////////////////////////////////////////////////////////////////
49 |
50 | "bar.sync 0; \n\t"
51 |
52 | ///////////////////////////////////////////////////////////////////
53 |
54 | "mov.u32 c_1, %%clock; \n\t"
55 | "ld.const.cg.f32 val1, [%3]; \n\t"
56 | "mov.u32 c_2, %%clock; \n\t"
57 | "ld.const.ca.f32 val2, [%3 + 0x4]; \n\t"
58 | "mov.u32 c_3, %%clock; \n\t"
59 | "ld.const.ca.f32 val3, [%3 + 0x8]; \n\t"
60 | "mov.u32 c_4, %%clock; \n\t"
61 |
62 | "sub.u32 e_4, c_2, c_1; \n\t"
63 | "sub.u32 e_5, c_3, c_2; \n\t"
64 | "sub.u32 e_6, c_4, c_3; \n\t"
65 |
66 | "add.f32 val1, val1, val2; \n\t"
67 | "add.f32 val1, val1, val3; \n\t"
68 |
69 | "st.global.u32 [%2 + 0xc], e_4; \n\t"
70 | "st.global.u32 [%2 + 0x10], e_5; \n\t"
71 | "st.global.u32 [%2 + 0x14], e_6; \n\t"
72 |
73 | "st.global.f32 [%1 + 0xc], val1; \n\t"
74 | "st.global.f32 [%1 + 0x10], val2; \n\t"
75 | "st.global.f32 [%1 + 0x14], val3; \n\t"
76 |
77 | /////////////////////////////////////////////////////////////////////////
78 |
79 | "bar.sync 0; \n\t"
80 |
81 | ///////////////////////////////////////////////////////////////////
82 |
83 | "mov.u32 c_1, %%clock; \n\t"
84 | "ld.shared.f32 val1, [smem]; \n\t"
85 | "mov.u32 c_2, %%clock; \n\t"
86 |
87 | "sub.u32 e_4, c_2, c_1; \n\t"
88 | "st.global.u32 [%2 + 0x18], e_4; \n\t"
89 | "st.global.f32 [%1 + 0x18], val1; \n\t"
90 |
91 | ::"l"(input),"l"(output),"l"(clock),"l"(cinput):"memory"
92 | );
93 |
94 | }
95 |
96 |
97 | int main(){
98 | float* input_d;
99 | float* output_d;
100 | uint32_t* clock_h;
101 | uint32_t* clock_d;
102 |
103 | int size = 1024;
104 | int large_size = 1500 * 1024 * 1024;
105 |
106 | clock_h = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
107 |
108 | cudaMalloc(&input_d, large_size);
109 | cudaMalloc(&output_d, sizeof(float) * size);
110 | cudaMalloc(&clock_d, sizeof(uint32_t) * size);
111 |
112 | for (int i = 0; i < 128; ++i){
113 | cinput[i] = i;
114 | }
115 |
116 | dim3 gDim(1, 1, 1);
117 | dim3 bDim(1, 1, 1);
118 |
119 | void* kernel_args[] = {&input_d, &output_d, &clock_d, &cinput};
120 | const char* cubin_name = "../sass_cubin/memory_latency.cubin";
121 | const char* kernel_name = "memoryLatency";
122 |
123 | launchSassKernel(cubin_name, kernel_name, gDim, bDim, size, kernel_args);
124 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
125 | cudaDeviceSynchronize();
126 |
127 | printf(">>> SASS Level Memory Latency Result\n");
128 | printf(" Global Memory Latency \t= %4u cycle\n", clock_h[0]);
129 | printf(" Global TLB Latency \t= %4u cycle\n", clock_h[7]);
130 | printf(" Global L2-Cache Latency \t= %4u cycle\n", clock_h[1]);
131 | printf(" Global L1-Cache Latency \t= %4u cycle\n", clock_h[2]);
132 | printf(" Shared Memory Latency \t= %4u cycle\n", clock_h[6]);
133 | printf(" Constant Memory Latency \t= %4u cycle\n", clock_h[3]);
134 | printf(" Constant L2-Cache Latency \t= %4u cycle\n", clock_h[4]);
135 | printf(" Constant L1-Cache Latency \t= %4u cycle\n", clock_h[5]);
136 |
137 |
138 |
139 | latencyDetectKernel<<<gDim, bDim>>>(input_d, output_d, clock_d, cinput);
140 | cudaMemcpy(clock_h, clock_d, sizeof(uint) * size, cudaMemcpyDeviceToHost);
141 | cudaDeviceSynchronize();
142 | printf("\n");
143 | printf(">>> CUDA-C Level Memory Latency Result\n");
144 | printf(" Global Memory Latency \t= %4u cycle\n", clock_h[0]);
145 | printf(" Global L2-Cache Latency \t= %4u cycle\n", clock_h[1]);
146 | printf(" Global L1-Cache Latency \t= %4u cycle\n", clock_h[2]);
147 | printf(" Shared Memory Latency \t= %4u cycle\n", clock_h[6]);
148 | printf(" Constant Memory Latency \t= %4u cycle\n", clock_h[3]);
149 | printf(" Constant L2-Cache Latency \t= %4u cycle\n", clock_h[4]);
150 | printf(" Constant L1-Cache Latency \t= %4u cycle\n", clock_h[5]);
151 | return 0;
152 | }
--------------------------------------------------------------------------------
/memory/global_memory_bandwidth.cu:
--------------------------------------------------------------------------------
1 | //
2 | //
3 | //
4 | //
5 |
6 | #include "cuda.h"
7 | #include "utils.cuh"
8 |
9 | constexpr int kGridDimX = 64;
10 | constexpr int kBlockDimX = 256;
11 | constexpr int kWarpCount = kBlockDimX / kWarpSize;
12 | constexpr int kLoopSize = 4 * 1024;
13 | constexpr size_t kGlobalSize = 256 * 1024 * 1024;
14 | constexpr float kCopySize = (float)kLoopSize * kGridDimX * kBlockDimX * sizeof(float);
15 |
16 | template <int GroupSize, int StrideSize>
17 | __global__
18 | void copyGroup32bKernel(float* input, float* output) {
19 | const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
20 | const int kBlockWorkload = kWarpCount * kWarpWorkload;
21 | const int kLine = kGridDimX * kBlockWorkload;
22 |
23 | int ctaid = blockIdx.x;
24 | int tid = threadIdx.x;
25 | int warpid = tid / 32;
26 | int laneid = tid % 32;
27 | int groupid = laneid / GroupSize;
28 | int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;
29 |
30 | float* thread_input = input + offset;
31 | float* thread_output = output + offset;
32 |
33 | for (int i = 0; i < kLoopSize; ++i) {
34 | *thread_output = *thread_input;
35 | thread_input += kLine;
36 | thread_output += kLine;
37 | }
38 | }
39 |
40 |
41 | template <int GroupSize, int StrideSize>
42 | __global__
43 | void copyGroup64bKernel(float2* input, float2* output) {
44 | const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
45 | const int kBlockWorkload = kWarpCount * kWarpWorkload;
46 | const int kLine = kGridDimX * kBlockWorkload;
47 |
48 | int ctaid = blockIdx.x;
49 | int tid = threadIdx.x;
50 | int warpid = tid / 32;
51 | int laneid = tid % 32;
52 | int groupid = laneid / GroupSize;
53 | int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;
54 |
55 | float2* thread_input = input + offset;
56 | float2* thread_output = output + offset;
57 |
58 | for (int i = 0; i < (kLoopSize / 2); ++i) {
59 | *thread_output = *thread_input;
60 | thread_input += kLine;
61 | thread_output += kLine;
62 | }
63 | }
64 |
65 |
// Same access-pattern benchmark as copyGroup32bKernel but with 128-bit
// vector loads/stores (float4 -> LDG.128). The loop trip count is quartered
// so the total bytes moved match the 32-bit variant.
//
// Template parameters:
//   GroupSize  - consecutive lanes sharing one offset group (must divide 32)
//   StrideSize - extra float4 elements inserted between successive groups
// Expected launch: <<<kGridDimX, kBlockDimX>>>. Buffers must be 16-byte
// aligned (guaranteed by cudaMalloc).
template <int GroupSize, int StrideSize>
__global__
void copyGroup128bKernel(float4* input, float4* output) {
    // Per-warp footprint in float4 elements, including inserted gaps.
    const int kWarpWorkload = 32 + kWarpSize / GroupSize * StrideSize;
    const int kBlockWorkload = kWarpCount * kWarpWorkload;
    const int kLine = kGridDimX * kBlockWorkload;  // whole-grid footprint

    int ctaid = blockIdx.x;
    int tid = threadIdx.x;
    int warpid = tid / 32;
    int laneid = tid % 32;
    int groupid = laneid / GroupSize;
    int offset = ctaid * kBlockWorkload + warpid * kWarpWorkload + laneid + groupid * StrideSize;

    float4* thread_input = input + offset;
    float4* thread_output = output + offset;

    // kLoopSize / 4 iterations of 16-byte copies == kLoopSize * 4 bytes.
    for (int i = 0; i < (kLoopSize / 4); ++i) {
        *thread_output = *thread_input;
        thread_input += kLine;
        thread_output += kLine;
    }
}
89 |
// Times one invocation of `fn` (a kernel-launching lambda) with CUDA events
// and returns the achieved one-direction copy bandwidth in GB/s.
//
//   fn          - callable that launches exactly one benchmark kernel
//   start, stop - pre-created events, reused across calls
//
// cudaEventElapsedTime reports milliseconds, so GB/s = bytes / ms / 1e6.
// (The previous /1024/1024 yielded MiB per ms, ~4.6% below the "GB/s"
// label the callers print.)
template <typename Func>
float getElapsed(Func fn, cudaEvent_t start, cudaEvent_t stop) {
    float elapsed_ms = 0;
    cudaEventRecord(start);
    fn();
    cudaEventRecord(stop);
    // Wait until `stop` has been recorded before reading the timer.
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_ms, start, stop);
    return kCopySize / elapsed_ms / 1.0e6f;  // bytes per ms -> GB/s
}
100 |
// Benchmarks global-memory copy bandwidth for 32/64/128-bit accesses under
// several group/stride patterns and prints the achieved GB/s for each.
// (Execution configurations were garbled to `<<>>` in the checked-in copy;
// restored to `<<<gDim, bDim>>>` here.)
int main() {
    float* input_d;
    float* output_d;

    // 1 GiB per buffer; the kernels touch at most half of each.
    cudaMalloc(&input_d, sizeof(float) * kGlobalSize);
    cudaMalloc(&output_d, sizeof(float) * kGlobalSize);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 gDim(kGridDimX);
    dim3 bDim(kBlockDimX);

    printf(" Different access pattern on Global Memory\n");

    auto fn1 = [=]() { copyGroup32bKernel<1, 0><<<gDim, bDim>>>(input_d, output_d); };
    printf(" LDG.32 \t%.2f GB/s\n", getElapsed(fn1, start, stop));

    auto fn2 = [=]() { copyGroup32bKernel<1, 1><<<gDim, bDim>>>(input_d, output_d); };
    printf(" LDG.32 g1s1 \t%.2f GB/s\n", getElapsed(fn2, start, stop));

    auto fn9 = [=]() { copyGroup32bKernel<2, 2><<<gDim, bDim>>>(input_d, output_d); };
    printf(" LDG.32 g2s2 \t%.2f GB/s\n", getElapsed(fn9, start, stop));

    auto fn10 = [=]() { copyGroup32bKernel<4, 4><<<gDim, bDim>>>(input_d, output_d); };
    printf(" LDG.32 g4s4 \t%.2f GB/s\n", getElapsed(fn10, start, stop));

    auto fn8 = [=]() { copyGroup32bKernel<8, 8><<<gDim, bDim>>>(input_d, output_d); };
    printf(" LDG.32 g8s8 \t%.2f GB/s\n", getElapsed(fn8, start, stop));

    ///////////////////////////////////////////////////////////////////////////////////////////////////////

    auto fn3 = [=]() { copyGroup64bKernel<1, 0><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d); };
    printf(" LDG.64 \t%.2f GB/s\n", getElapsed(fn3, start, stop));

    auto fn4 = [=]() { copyGroup64bKernel<1, 1><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d); };
    printf(" LDG.64 g1s1 \t%.2f GB/s\n", getElapsed(fn4, start, stop));

    auto fn11 = [=]() { copyGroup64bKernel<2, 2><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d); };
    printf(" LDG.64 g2s2 \t%.2f GB/s\n", getElapsed(fn11, start, stop));

    auto fn12 = [=]() { copyGroup64bKernel<4, 4><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d); };
    printf(" LDG.64 g4s4 \t%.2f GB/s\n", getElapsed(fn12, start, stop));

    auto fn13 = [=]() { copyGroup64bKernel<8, 8><<<gDim, bDim>>>((float2*)input_d, (float2*)output_d); };
    printf(" LDG.64 g8s8 \t%.2f GB/s\n", getElapsed(fn13, start, stop));

    ///////////////////////////////////////////////////////////////////////////////////////////////////////

    auto fn5 = [=]() { copyGroup128bKernel<1, 0><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d); };
    printf(" LDG.128 \t%.2f GB/s\n", getElapsed(fn5, start, stop));

    auto fn6 = [=]() { copyGroup128bKernel<1, 1><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d); };
    printf(" LDG.128 g1s1 \t%.2f GB/s\n", getElapsed(fn6, start, stop));

    auto fn7 = [=]() { copyGroup128bKernel<2, 2><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d); };
    printf(" LDG.128 g2s2 \t%.2f GB/s\n", getElapsed(fn7, start, stop));

    auto fn14 = [=]() { copyGroup128bKernel<4, 4><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d); };
    printf(" LDG.128 g4s4 \t%.2f GB/s\n", getElapsed(fn14, start, stop));

    auto fn15 = [=]() { copyGroup128bKernel<8, 8><<<gDim, bDim>>>((float4*)input_d, (float4*)output_d); };
    printf(" LDG.128 g8s8 \t%.2f GB/s\n", getElapsed(fn15, start, stop));

    // Release timing events and device buffers before exit.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}
--------------------------------------------------------------------------------