├── .gitignore ├── README.md └── microbench ├── .gitignore ├── Makefile ├── appendix ├── Makefile ├── mma_baseline.cu ├── mma_permuted.cu └── mma_pipeline.cu ├── bin └── README.md ├── common └── common.mk ├── hw_def ├── common │ ├── common.h │ └── deviceQuery.h └── hw_def.h ├── numericbench ├── bf16numeric │ ├── bf16_numeric │ │ ├── Makefile │ │ ├── bf16_chain_matmul.cu │ │ └── bf16_numeric.cu │ ├── bf16add │ │ ├── Makefile │ │ └── bf16add.cu │ ├── bf16mul │ │ ├── Makefile │ │ └── bf16mul.cu │ ├── m16n8k16 │ │ ├── Makefile │ │ └── m16n8k16_bf16.cu │ └── m16n8k8 │ │ ├── Makefile │ │ └── m16n8k8_bf16.cu ├── cpu_base.h ├── cpu_int_base.h ├── fp16numeric │ ├── fp16_numeric │ │ ├── Makefile │ │ └── fp16_chain_matmul.cu │ ├── fp16add │ │ ├── Makefile │ │ └── fp16add.cu │ └── fp16mul │ │ ├── Makefile │ │ └── fp16mul.cu ├── int8numeric │ ├── int8add │ │ ├── Makefile │ │ └── int8add.cu │ └── s8numeric │ │ └── s8numeric.cu └── tf32numeric │ ├── m16n8k4 │ ├── Makefile │ └── m16n8k4_tf32.cu │ ├── m16n8k8 │ ├── Makefile │ └── m16n8k8_tf32.cu │ ├── tf32_numeric │ ├── Makefile │ ├── tf32_chain_matmul.cu │ └── tf32_numeric.cu │ ├── tf32add │ ├── Makefile │ └── tf32add.cu │ └── tf32mul │ ├── Makefile │ └── tf32mul.cu ├── run_all.sh └── ubench ├── ldmatrix ├── ldmatrix_ILP │ ├── Makefile │ └── ldmatrix_ilp.cu ├── ldmatrix_lat │ ├── Makefile │ └── ldmatrix_lat.cu ├── ldmatrix_x2_lat │ ├── Makefile │ └── ldmatrix_x2_lat.cu ├── ldmatrix_x4_lat │ ├── Makefile │ └── ldmatrix_x4_lat.cu ├── shared_bw │ ├── Makefile │ └── shared_bw.cu ├── shared_bw_64 │ ├── Makefile │ └── shared_bw_64.cu ├── shared_lat │ ├── Makefile │ └── shared_lat.cu ├── shared_x2_lat │ ├── Makefile │ └── shared_x2_lat.cu ├── shared_x4_lat │ ├── Makefile │ └── shared_x4_lat.cu ├── shared_x8 │ ├── Makefile │ └── shared_x8.cu └── shd_config │ ├── Makefile │ └── shd_config.cu ├── mma ├── mma_m16n8k128_int1 │ ├── Makefile │ └── mma_m16n8k128_int1.cu ├── mma_m16n8k16_bf16fp32 │ ├── Makefile │ └── mma_m16n8k16_bf16fp32.cu ├── mma_m16n8k16_fp │ ├── Makefile │ └── mma_m16n8k16_fp32.cu ├── mma_m16n8k16_half │ ├── Makefile │ └── mma_m16n8k16_half.cu ├── mma_m16n8k16_int │ ├── Makefile │ └── mma_m16n8k16_int.cu ├── mma_m16n8k256_int1 │ ├── Makefile │ └── mma_m16n8k256_int1.cu ├── mma_m16n8k32_fp8 │ ├── Makefile │ └── mma_m16n8k32_fp8.cu ├── mma_m16n8k32_int │ ├── Makefile │ └── mma_m16n8k32_int.cu ├── mma_m16n8k32_int4 │ ├── Makefile │ └── mma_m16n8k32_int4.cu ├── mma_m16n8k4_tf32 │ ├── Makefile │ └── mma_m16n8k4_tf32.cu ├── mma_m16n8k64_int4 │ ├── Makefile │ └── mma_m16n8k64_int4.cu ├── mma_m16n8k8_bf16fp32 │ ├── Makefile │ └── mma_m16n8k8_bf16fp32.cu ├── mma_m16n8k8_fp │ ├── Makefile │ └── mma_m16n8k8_fp32.cu ├── mma_m16n8k8_half │ ├── Makefile │ └── mma_m16n8k8_half.cu ├── mma_m16n8k8_tf32 │ ├── Makefile │ └── mma_m16n8k8_tf32.cu ├── mma_m8n8k16_int │ ├── Makefile │ └── mma_m8n8k16_int8.cu └── mma_m8n8k4_fp16fp32 │ ├── Makefile │ └── mma_m8n8k4_fp16fp32.cu ├── mmasp ├── mmasp_m16n8k16_fp │ ├── Makefile │ └── mmasp_m16n8k16_fp32.cu ├── mmasp_m16n8k16_fp16fp16 │ ├── Makefile │ └── mmasp_m16n8k16_fp16fp16.cu ├── mmasp_m16n8k16_tf32 │ ├── Makefile │ └── mmasp_m16n8k16_tf32.cu ├── mmasp_m16n8k32_fp │ ├── Makefile │ └── mmasp_m16n8k32_fp32.cu ├── mmasp_m16n8k32_fp16fp16 │ ├── Makefile │ └── mmasp_m16n8k32_fp16fp16.cu ├── mmasp_m16n8k32_int │ ├── Makefile │ └── mmasp_m16n8k32_int.cu ├── mmasp_m16n8k64_fp8 │ ├── Makefile │ └── mmasp_m16n8k64_fp8.cu ├── mmasp_m16n8k64_int │ ├── Makefile │ └── mmasp_m16n8k64_int.cu └── mmasp_m16n8k8_tf32 │ ├── Makefile │ └── 
mmasp_m16n8k8_tf32.cu └── wmma_load ├── loadbf16 ├── Makefile └── load_bf16.cu └── loadfp16 ├── Makefile └── load_fp16.cu /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | 35 | microbench/ubench/core/* 36 | *.app 37 | *.txt 38 | *.log 39 | 40 | microbench/logs/* 41 | microbench/revisionlogs/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dissecting Tensor Cores via Microbenchmarks: Latency, Throughput and Numeric Behaviors 2 | 3 | Code repo for Dissecting Tensor Cores via Microbenchmarks: Latency, Throughput and Numeric Behaviors. 4 | 5 | * [preprint](https://arxiv.org/abs/2206.02874) 6 | * [IEEE TPDS](https://ieeexplore.ieee.org/document/9931992) 7 | 8 | TODO: Provide better instructions and explanations 9 | 10 | # Setup 11 | 12 | ## Add the CUDA path to the environment, e.g. 13 | 14 | ``` 15 | export CUDA_PATH=/usr/local/cuda-11.0 16 | export PATH=$CUDA_PATH/bin:$PATH 17 | export CUDACXX=$CUDA_PATH/bin/nvcc 18 | ``` 19 | 20 | ## Configure the compiler target Arch/SM 21 | 22 | ```export TargetSM=80 ``` // for A100 23 | 24 | ```export TargetSM=70 ``` // for V100 25 | 26 | ```export TargetSM=75 ``` // for Turing 27 | 28 | 29 | ## Run script 30 | 31 | ``` 32 | cd microbench 33 | sh run_all.sh 34 | ``` 35 | 36 | You are expected to get xxx-ILPx.log files. 37 | 38 | Note: there will be static_assert error messages when running the scripts, because some codes use static_assert() to reject larger ILP configurations. These error messages can be ignored.
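A single microbenchmark can also be built and run on its own, without run_all.sh. The sketch below assumes the same CUDA_PATH/TargetSM environment as above; `ILP=4` is only an illustrative value, which common/common.mk forwards to the compiler as the `ILPconfig` macro.

```
cd microbench/ubench/ldmatrix/ldmatrix_ILP
make ILP=4        # builds ldmatrix_ilp.app and copies it to microbench/bin
make run          # prints the measured ldmatrix latency and shared-memory throughput
```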
39 | 40 | 41 | ------------------------------------- 42 | 43 | ## References 44 | Some codes are borrowed from [Accel-Sim](https://github.com/accel-sim/accel-sim-framework) 45 | 46 | ## citations 47 | ``` 48 | @ARTICLE{9931992, 49 | author={Sun, Wei and Li, Ang and Geng, Tong and Stuijk, Sander and Corporaal, Henk}, 50 | journal={IEEE Transactions on Parallel and Distributed Systems}, 51 | title={Dissecting Tensor Cores via Microbenchmarks: Latency, Throughput and Numeric Behaviors}, 52 | year={2023}, 53 | volume={34}, 54 | number={1}, 55 | pages={246-261}, 56 | doi={10.1109/TPDS.2022.3217824}} 57 | ``` 58 | -------------------------------------------------------------------------------- /microbench/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.csv 3 | ubench/atomics/Atomic_add_bw/atomic_add_bw 4 | ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict 5 | ubench/atomics/Atomic_add_lat/atomic_add_lat 6 | ubench/core/MaxFlops_double/MaxFlops_double 7 | ubench/core/MaxFlops_float/MaxFlops_float 8 | ubench/core/MaxFlops_half/MaxFlops_half 9 | ubench/core/MaxFlops_int32/MaxFlops_int32 10 | ubench/core/config_dpu/config_dpu 11 | ubench/core/config_fpu/config_fpu 12 | ubench/core/config_int/config_int 13 | ubench/core/config_sfu/config_sfu 14 | ubench/core/config_tensor/config_tensor 15 | ubench/core/config_udp/config_udp 16 | ubench/core/core_config/core_config 17 | ubench/core/lat_double/lat_double 18 | ubench/core/lat_float/lat_float 19 | ubench/core/lat_half/lat_half 20 | ubench/core/lat_int32/lat_int32 21 | ubench/core/regfile_bw/regfile_bw 22 | ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt 23 | ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt 24 | ubench/core/tensor_bw_half/tensor_bw_half 25 | ubench/core/tensor_lat_half/tensor_lat_half 26 | ubench/l1_cache/l1_access_grain/l1_access_grain 27 | ubench/l1_cache/l1_adaptive/l1_adaptive 28 | ubench/l1_cache/l1_associativity/l1_associativity 29 | ubench/l1_cache/l1_banks/l1_banks 30 | ubench/l1_cache/l1_bw_128/l1_bw_128 31 | ubench/l1_cache/l1_bw_32f/l1_bw_32f 32 | ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll 33 | ubench/l1_cache/l1_bw_64f/l1_bw_64f 34 | ubench/l1_cache/l1_bw_64v/l1_bw_64v 35 | ubench/l1_cache/l1_config/l1_config 36 | ubench/l1_cache/l1_lat/l1_lat 37 | ubench/l1_cache/l1_mshr/l1_mshr 38 | ubench/l1_cache/l1_sector/l1_sector 39 | ubench/l1_cache/l1_shared_bw/l1_shared_bw 40 | ubench/l1_cache/l1_write_policy/l1_write_policy 41 | ubench/l2_cache/l2_access_grain/l2_access_grain 42 | ubench/l2_cache/l2_bw_128/l2_bw_128 43 | ubench/l2_cache/l2_bw_32f/l2_bw_32f 44 | ubench/l2_cache/l2_bw_64f/l2_bw_64f 45 | ubench/l2_cache/l2_config/l2_config 46 | ubench/l2_cache/l2_copy_engine/l2_copy_engine 47 | ubench/l2_cache/l2_lat/l2_lat 48 | ubench/l2_cache/l2_write_policy/l2_write_policy 49 | ubench/mem/mem_atom_size/mem_atom_size 50 | ubench/mem/mem_bw/mem_bw 51 | ubench/mem/mem_config/mem_config 52 | ubench/mem/mem_lat/mem_lat 53 | ubench/shd/shared_bw/shared_bw 54 | ubench/shd/shared_bw_64/shared_bw_64 55 | ubench/shd/shared_lat/shared_lat 56 | ubench/shd/shd_config/shd_config 57 | ubench/system/deviceQuery/deviceQuery 58 | ubench/system/kernel_lat/kernel_lat 59 | ubench/system/system_config/system_config 60 | ubench/system/list_devices/list_devices -------------------------------------------------------------------------------- /microbench/Makefile: -------------------------------------------------------------------------------- 1 | 2 | BASE_DIR := $(shell pwd) 3 | BIN_DIR := 
$(BASE_DIR)/bin 4 | SUB_DIRS = $(wildcard ubench/*/*/) 5 | # SUB_DIRS = $(wildcard ubench/ldmatrix/*/) 6 | # SUB_DIRS = $(wildcard ubench/mma/*/) 7 | SUB_DIRS_ALL = $(SUB_DIRS:%=all-%) 8 | SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%) 9 | 10 | all: create_dir $(SUB_DIRS_ALL) 11 | 12 | clean: delete_dir $(SUB_DIRS_CLEAN) 13 | 14 | $(SUB_DIRS_ALL): 15 | $(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%) 16 | 17 | $(SUB_DIRS_CLEAN): 18 | $(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean 19 | 20 | create_dir: 21 | mkdir -p $(BIN_DIR) 22 | 23 | delete_dir: 24 | cd $(BIN_DIR); rm -f *.app 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /microbench/appendix/Makefile: -------------------------------------------------------------------------------- 1 | CC = nvcc 2 | FLAG =-gencode=arch=compute_86,code=\"sm_86,compute_86\" -lcudart 3 | 4 | all: pipeline_mma \ 5 | baseline_mma \ 6 | permuted_mma 7 | 8 | 9 | baseline_mma: ./mma_baseline.cu 10 | $(CC) $(FLAG) -o $@.out $^ 11 | 12 | 13 | pipeline_mma: ./mma_pipeline.cu 14 | $(CC) $(FLAG) -o $@.out $^ 15 | 16 | 17 | permuted_mma: ./mma_permuted.cu 18 | $(CC) $(FLAG) -o $@.out $^ 19 | 20 | 21 | clean: 22 | rm -rf *.out -------------------------------------------------------------------------------- /microbench/bin/README.md: -------------------------------------------------------------------------------- 1 | #Programs -------------------------------------------------------------------------------- /microbench/common/common.mk: -------------------------------------------------------------------------------- 1 | BASE_DIR := $(shell pwd) 2 | BIN_DIR := $(BASE_DIR)/../../../bin/ 3 | 4 | GENCODE_SM70 ?= -gencode=arch=compute_70,code=\"sm_70,compute_70\" # V100 5 | GENCODE_SM75 ?= -gencode=arch=compute_75,code=\"sm_75,compute_75\" # Turing 6 | GENCODE_SM80 ?= -gencode=arch=compute_80,code=\"sm_80,compute_80\" # A100 7 | GENCODE_SM86 ?= -gencode=arch=compute_86,code=\"sm_86,compute_86\" # RTX30 8 | GENCODE_SM89 ?= -gencode=arch=compute_86,code=\"sm_89,compute_89\" # RTX30 9 | 10 | 11 | TargetSM ?= 89 12 | GENCODE_SM = -gencode=arch=compute_${TargetSM},code=\"sm_${TargetSM},compute_${TargetSM}\" 13 | CUOPTS = $(GENCODE_ARCH) $(GENCODE_SM) 14 | 15 | CC := nvcc 16 | 17 | CUDA_PATH ?= /usr/local/cuda/ 18 | INCLUDE := $(CUDA_PATH)/samples/common/inc/ 19 | LIB := 20 | ILP ?= 1 21 | ITERS ?= 999 22 | MEAN ?= 0.0 23 | STDDEV ?= 1.0 24 | release: 25 | $(CC) $(NVCC_FLGAS) --define-macro ILPconfig=$(ILP),ITERS=$(ITERS),MEAN=$(MEAN),STDDEV=$(STDDEV) $(CUOPTS) $(SRC) -o $(EXE) -I $(INCLUDE) -L $(LIB) -lcudart 26 | cp $(EXE) $(BIN_DIR) 27 | 28 | # clean: 29 | # rm -f *.o; rm -f $(EXE) 30 | 31 | clean: 32 | rm -f *.app *.txt 33 | 34 | run: 35 | ./$(EXE) 36 | 37 | profile: 38 | nv-nsight-cu-cli --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.per_second,smsp__sass_average_data_bytes_per_wavefront_mem_shared.pct,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active ./$(EXE) 39 | 40 | profile_bank: 41 | nv-nsight-cu-cli --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,sm__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,sm__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ldsm.sum,smsp__sass_average_data_bytes_per_wavefront_mem_shared ./$(EXE) 42 | 43 | profile_lsu_mio: 44 | nv-nsight-cu-cli --metrics 
smsp__average_warp_latency_issue_stalled_lg_throttle.ratio,smsp__average_warp_latency_issue_stalled_mio_throttle.ratio,smsp__average_warp_latency_issue_stalled_short_scoreboard.ratio ./$(EXE) 45 | 46 | 47 | # smsp__average_warps_issue_stalled_mio_throttle_per_issue_active.ratio 48 | # sm__inst_executed_pipe_lsu.sum 49 | # smsp__average_inst_executed_pipe_lsu_per_warp.ratio 50 | 51 | profile_smem: 52 | nv-nsight-cu-cli --metrics l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,sm__sass_data_bytes_mem_shared_op_ld.sum,sm__inst_executed_pipe_lsu.sum,sm__sass_l1tex_pipe_lsu_wavefronts_mem_shared.sum,sm__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ld.sum ./$(EXE) 53 | 54 | events: 55 | nvprof --events elapsed_cycles_sm ./$(EXE) 56 | 57 | profileall: 58 | nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE) 59 | 60 | nvsight: 61 | nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv 62 | 63 | ptx: 64 | cuobjdump -ptx ./$(EXE) tee ptx.txt 65 | 66 | sass: 67 | cuobjdump -sass ./$(EXE) tee sass.txt 68 | -------------------------------------------------------------------------------- /microbench/hw_def/common/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #define ACCEL_SIM_MODE 1 11 | 12 | enum issue_model { single = 1, dual = 2 }; 13 | 14 | static const char *issue_model_str[] = {"none", "single", "dual"}; 15 | 16 | enum core_model { shared = 0, subcore = 1 }; 17 | 18 | static const char *core_model_str[] = {"none", "shared", "subcore"}; 19 | 20 | enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 }; 21 | 22 | // GPU error check 23 | #define gpuErrchk(ans) \ 24 | { gpuAssert((ans), __FILE__, __LINE__); } 25 | inline void gpuAssert(cudaError_t code, const char *file, int line, 26 | bool abort = true) { 27 | if (code != cudaSuccess) { 28 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, 29 | line); 30 | if (abort) 31 | exit(code); 32 | } 33 | } 34 | 35 | // source: 36 | // https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2 37 | unsigned round_up_2n(unsigned v) { 38 | v--; 39 | v |= v >> 1; 40 | v |= v >> 2; 41 | v |= v >> 4; 42 | v |= v >> 8; 43 | v |= v >> 16; 44 | v++; 45 | 46 | return v; 47 | } 48 | 49 | unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); } 50 | 51 | bool isPowerOfTwo(int n) { 52 | if (n == 0) 53 | return false; 54 | 55 | return (ceil(log2(n)) == floor(log2(n))); 56 | } 57 | 58 | static const char *dram_model_str[] = {"none", "GDDR5", "GDDR5X", "GDDR6", 59 | "HBM"}; 60 | static const unsigned dram_model_bus_width[] = {0, 32, 32, 16, 128}; // in bits 61 | 
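// Worked example of the two helper formulas defined further down in this header,
// using the dram_model_* tables (values below are derived from those tables, not new data):
//   atom size = (bus_width / 8) * mem_per_ctrlr * burst_length
//     GDDR5: (32/8)*1*8 = 32 B    GDDR6: (16/8)*1*16 = 32 B    HBM: (128/8)*1*2 = 32 B
//   adjusted CCD = burst_length / freq_ratio
//     GDDR5: 8/4 = 2              HBM: 2/2 = 1   (matching the CCD fields of the timing structs below)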
static const unsigned dram_model_mem_per_ctrlr[] = {0, 1, 1, 1, 1}; 62 | static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2}; 63 | static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2}; 64 | // atom size = 65 | // dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length 66 | unsigned get_atom_size_inByte(enum dram_model model) { 67 | return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] * 68 | dram_model_burst_length[model]; 69 | } 70 | // CCD = dram_model_burst_length/dram_model_freq_ratio 71 | unsigned get_adjusted_CCD(enum dram_model model) { 72 | assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0); 73 | return dram_model_burst_length[model] / dram_model_freq_ratio[model]; 74 | } 75 | 76 | unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { 77 | unsigned channel_width = 78 | dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model]; 79 | assert(total_memory_width % channel_width == 0); 80 | return total_memory_width / channel_width; 81 | } 82 | 83 | // DDR timing struct 84 | struct DDR_Timing { 85 | unsigned freq; 86 | unsigned nbk; 87 | unsigned CCD; 88 | unsigned RRD; 89 | unsigned RCD; 90 | unsigned RAS; 91 | unsigned RP; 92 | unsigned RC; 93 | unsigned CL; 94 | unsigned WL; 95 | unsigned CDLR; 96 | unsigned WR; 97 | unsigned nbkgrp; 98 | unsigned CCDL; 99 | unsigned RTPL; 100 | 101 | DDR_Timing(unsigned mfreq, unsigned n_bk, unsigned tCCD, unsigned tRRD, 102 | unsigned tRCD, unsigned tRAS, unsigned tRP, unsigned tRC, 103 | unsigned tCL, unsigned tWL, unsigned tCDLR, unsigned tWR, 104 | unsigned n_bkgrp, unsigned tCCDL, unsigned tRTPL) { 105 | freq = mfreq; 106 | nbk = n_bk; 107 | CCD = tCCD; 108 | RRD = tRRD; 109 | RCD = tRCD; 110 | RAS = tRAS; 111 | RP = tRP; 112 | RC = tRC; 113 | CL = tCL; 114 | WL = tWL; 115 | CDLR = tCDLR; 116 | WR = tWR; 117 | nbkgrp = n_bkgrp; 118 | CCDL = tCCDL; 119 | RTPL = tRTPL; 120 | } 121 | 122 | void scale_timing_for_new_freq(float newfreq) { 123 | float freq_scale = freq / newfreq; 124 | RRD = ceil(RRD / freq_scale); 125 | RCD = ceil(RCD / freq_scale); 126 | RAS = ceil(RAS / freq_scale); 127 | RP = ceil(RP / freq_scale); 128 | RC = ceil(RC / freq_scale); 129 | CL = ceil(CL / freq_scale); 130 | WL = ceil(WL / freq_scale); 131 | CDLR = ceil(CDLR / freq_scale); 132 | WR = ceil(WR / freq_scale); 133 | CCDL = ceil(CCDL / freq_scale); 134 | RTPL = ceil(RTPL / freq_scale); 135 | } 136 | }; 137 | 138 | // GDDR5 timing from hynix H5GQ1H24AFR 139 | //-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: 140 | // CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2" 141 | 142 | static const DDR_Timing GDDR5_Timing_1800MHZ(1800, 16, 2, 6, 12, 28, 12, 40, 12, 143 | 4, 5, 12, 4, 3, 2); 144 | 145 | // HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 146 | // paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) 147 | // Timing for 1 GHZ: 148 | //-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: 149 | // CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" 150 | 151 | static const DDR_Timing HBM_Timing_1000MHZ(1000, 16, 1, 4, 14, 33, 14, 47, 14, 152 | 2, 3, 12, 4, 2, 4); 153 | 154 | #endif -------------------------------------------------------------------------------- /microbench/hw_def/common/deviceQuery.h: -------------------------------------------------------------------------------- 1 | #ifndef DEVICE_QUERY_H 2 | #define DEVICE_QUERY_H 3 | 4 | #include 5 | unsigned CLK_FREQUENCY; 6 | unsigned 
SM_NUMBER; // number of SMs 7 | unsigned WARP_SIZE; // max threads per warp 8 | unsigned MAX_THREADS_PER_SM; // max threads / sm 9 | unsigned MAX_SHARED_MEM_SIZE; // Max configerable shared memory size in bytes 10 | unsigned MAX_WARPS_PER_SM; // max warps / sm 11 | unsigned MAX_REG_PER_SM; // max warps / sm 12 | 13 | unsigned MAX_THREAD_BLOCK_SIZE; // max threads per threadblock 14 | unsigned MAX_SHARED_MEM_SIZE_PER_BLOCK; // Max configerable shared memory size 15 | // per block in bytes 16 | unsigned 17 | MAX_REG_PER_BLOCK; // Max configerable shared memory size per block in bytes 18 | 19 | size_t L2_SIZE; // L2 size in bytes 20 | 21 | size_t MEM_SIZE; // Memory size in bytes 22 | unsigned MEM_CLK_FREQUENCY; // Memory clock freq in MHZ 23 | unsigned MEM_BITWIDTH; // Memory bit width 24 | 25 | // launched threadblocks 26 | unsigned THREADS_PER_BLOCK; 27 | unsigned BLOCKS_PER_SM; 28 | unsigned THREADS_PER_SM; 29 | unsigned BLOCKS_NUM; 30 | unsigned TOTAL_THREADS; 31 | 32 | cudaDeviceProp deviceProp; 33 | 34 | unsigned intilizeDeviceProp(unsigned deviceID) { 35 | cudaSetDevice(deviceID); 36 | cudaGetDeviceProperties(&deviceProp, deviceID); 37 | 38 | CLK_FREQUENCY = deviceProp.clockRate; 39 | // core stats 40 | SM_NUMBER = deviceProp.multiProcessorCount; 41 | MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor; 42 | MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor; 43 | WARP_SIZE = deviceProp.warpSize; 44 | MAX_WARPS_PER_SM = 45 | deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize; 46 | MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor; 47 | 48 | // threadblock stats 49 | MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock; 50 | MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock; 51 | MAX_REG_PER_BLOCK = deviceProp.regsPerBlock; 52 | 53 | // launched thread blocks to ensure GPU is fully occupied as much as possible 54 | THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock; 55 | BLOCKS_PER_SM = 56 | deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock; 57 | THREADS_PER_SM = BLOCKS_PER_SM * THREADS_PER_BLOCK; 58 | BLOCKS_NUM = BLOCKS_PER_SM * SM_NUMBER; 59 | TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 60 | 61 | // L2 cache 62 | L2_SIZE = deviceProp.l2CacheSize; 63 | 64 | // memory 65 | MEM_SIZE = deviceProp.totalGlobalMem; 66 | MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f; 67 | MEM_BITWIDTH = deviceProp.memoryBusWidth; 68 | 69 | return 1; 70 | } 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /microbench/hw_def/hw_def.h: -------------------------------------------------------------------------------- 1 | #ifndef HW_DEF_H 2 | #define HW_DEF_H 3 | 4 | #include 5 | #include 6 | #include "./common/common.h" 7 | #include "./common/deviceQuery.h" 8 | 9 | // note this is just fake meta data, used for performance microbenchmarking 10 | void initialize_fake_metadata_2_4(uint32_t* metadata, int row_size, int col_size) { 11 | int range = 6; 12 | uint32_t FourToTwoMeta[6] = { 0x4, 0x8, 0x9, 0xc, 0xd, 0xe }; 13 | for (int i = 0; i < row_size * col_size / 16; i++) { // 32 bit can represent 16 indexes , each index has 2 bit 14 | uint32_t result = 0x0; 15 | for (int n = 0; n < 32 / 4; ++n) { 16 | double rnd = double(std::rand()) / double(RAND_MAX); 17 | rnd = range * rnd; 18 | uint32_t meta = FourToTwoMeta[(int)rnd]; 19 | 20 | result = (uint32_t)(result | ((uint32_t)(meta << (i * 4)))); 21 | } 22 | metadata[i] = result; 23 | } 24 | } 25 | 26 | __forceinline__ __device__ unsigned 
lane_id() 27 | { 28 | unsigned ret; 29 | asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 30 | return ret; 31 | } 32 | 33 | __forceinline__ __device__ unsigned warp_id() 34 | { 35 | // this is not equal to threadIdx.x / 32 36 | unsigned ret; 37 | asm volatile ("mov.u32 %0, %warpid;" : "=r"(ret)); 38 | return ret; 39 | } 40 | 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/bf16_numeric/Makefile: -------------------------------------------------------------------------------- 1 | SRC = bf16_chain_matmul.cu 2 | 3 | EXE = bf16_chain_matmul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/bf16add/Makefile: -------------------------------------------------------------------------------- 1 | SRC = bf16add.cu 2 | 3 | EXE = bf16add.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/bf16mul/Makefile: -------------------------------------------------------------------------------- 1 | SRC = bf16mul.cu 2 | 3 | EXE = bf16mul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/m16n8k16/Makefile: -------------------------------------------------------------------------------- 1 | SRC = m16n8k16_bf16.cu 2 | 3 | EXE = m16n8k16_bf16.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/m16n8k16/m16n8k16_bf16.cu: -------------------------------------------------------------------------------- 1 | // simple gemm using bf16/half data types 2 | // we do not target on optimal overall performance, so we will not use software pipepline 3 | // pipepline or asychronous copy can speed up gemm further with cost of extra shared memory storage 4 | // CUTLASS provides good examples of how to implement pipeline for gemm 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../../../hw_def/hw_def.h" 15 | #include "../../cpu_base.h" 16 | 17 | typedef __nv_bfloat16 op_AB; 18 | typedef float op_CD; 19 | 20 | 21 | #ifndef ITERS 22 | #define ITERS (1024 ) 23 | #endif 24 | 25 | #define ROUNDS (ITERS*10 ) 26 | 27 | const int inst_m = 16; 28 | const int inst_n = 8; 29 | const int inst_k = 16; 30 | 31 | __forceinline__ __device__ unsigned lane_id_() 32 | { 33 | unsigned ret; 34 | asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 35 | return ret; 36 | } 37 | 38 | 39 | 40 | __global__ void gemm_m16n8k16_kernel(op_AB* MatA,op_AB* MatB,op_CD* MatC, op_CD* MatD ){ 41 | //uint32_t tid = threadIdx.x; 42 | //uint32_t gid = blockIdx.x * blockDim.x + tid;//global at this block 43 | //uint32_t warpid = gid / warpSize; 44 | uint32_t lane_id = lane_id_(); 45 | // four threads per group, group id 46 | uint32_t group_id = lane_id >>2; 47 | uint32_t tid_in_group = lane_id % 4; 48 | 49 | // m16 n8 k16 50 | op_AB frag_A[8]; // 16 * 16 / 32 = 8 * bf16 51 | op_AB frag_B[4]; // 8 * 16 / 32 52 | op_CD frag_D[4]; // float , 
16*8 /32 = 4*float 53 | // load operand fragA 54 | #pragma unroll 55 | for(int i =0; i < 8; i++){ 56 | uint32_t row_a = 0; 57 | if( (i>=0 && i<2) || (i>=4 && i<6) ){ 58 | row_a = group_id; 59 | }else{ 60 | row_a = group_id + 8; 61 | } 62 | 63 | uint32_t col_a = 0; 64 | if(i<4){ 65 | col_a = (tid_in_group * 2) + (i & 0x1); 66 | }else{ 67 | col_a = (tid_in_group * 2) + (i & 0x1) + 8; 68 | } 69 | // row major 70 | frag_A[i] = MatA[inst_k*row_a + col_a]; 71 | 72 | } 73 | // for(int i =0; i < 8; i++){ 74 | // printf("laneId = %d, fragA[%d] = %f \n", lane_id, i, float(frag_A[i])); 75 | // } 76 | 77 | // load operand fragB, MatB has to be col-major 78 | #pragma unroll 79 | for(int i =0; i < 4; i++){ 80 | uint32_t row_b = 0; 81 | if( i < 2 ){ 82 | row_b = (tid_in_group * 2) + (i & 0x1); 83 | }else{ 84 | row_b = (tid_in_group * 2) + (i & 0x1)+8; 85 | } 86 | uint32_t col_b = group_id; 87 | // row-major B 88 | frag_B[i] = MatB[row_b*inst_n + col_b]; 89 | } 90 | 91 | // for(int i =0; i < 4; i++){ 92 | // printf("laneId = %d, fragB[%d] = %f \n", lane_id, i, float(frag_B[i])); 93 | // } 94 | 95 | // load operand fragC, MatC has to be row-major 96 | #pragma unroll 97 | for(int i =0; i < 4; i++){ 98 | uint32_t row_c = 0; 99 | if( i < 2 ){ 100 | row_c = group_id; 101 | }else{ 102 | row_c = group_id + 8; 103 | } 104 | uint32_t col_c = (tid_in_group * 2) + (i & 0x1); 105 | // row-major 106 | frag_D[i] = MatC[inst_n*row_c + col_c]; 107 | } 108 | 109 | // printf("\n\n"); 110 | // for(int i =0; i < 4; i++){ 111 | // printf("laneId = %d, fragC[%d] = %f \n", lane_id, i, float(frag_D[i])); 112 | // } 113 | 114 | //step 1: load data 115 | // MatA => frag_A, MatB => frag_B, MatC => frag_C 116 | 117 | 118 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 119 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 120 | float *C = reinterpret_cast(&frag_D[0]); 121 | float *D = C; // D = A*B + D. 
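// Register mapping for the mma.sync.m16n8k16 instruction below (per thread in the warp):
//   A: 8 bf16 values packed into 4 x 32-bit registers -> operands %4-%7
//   B: 4 bf16 values packed into 2 x 32-bit registers -> operands %8-%9
//   C/D: 4 fp32 accumulators -> inputs %10-%13, outputs %0-%3 (D aliases C here)
// The instruction computes D = A*B + C with fp32 accumulation.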
122 | 123 | asm volatile( 124 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 125 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 126 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 127 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 128 | "r"(B[0]), "r"(B[1]), 129 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 130 | ); 131 | 132 | __syncwarp(); 133 | 134 | // store back result 135 | // printf("\n\n"); 136 | // for(int i =0; i < 4; i++){ 137 | // printf("laneId = %d, fragD[%d] = %f \n", lane_id, i, float(frag_D[i])); 138 | // } 139 | #pragma unroll 140 | for(int i =0; i < 4; i++){ 141 | uint32_t row_d = 0; 142 | if( i < 2 ){ 143 | row_d = group_id; 144 | }else{ 145 | row_d = group_id + 8; 146 | } 147 | uint32_t col_d = (tid_in_group * 2) + (i & 0x1); 148 | // row-major 149 | MatD[inst_n*row_d + col_d] = frag_D[i]; 150 | } 151 | 152 | } 153 | 154 | 155 | std::vector gemm_m16n8k16_bf16(){ 156 | int BLOCKS_NUM = 1; 157 | int nwarps = 1; 158 | int warp_size = 32; 159 | 160 | 161 | unsigned total_A_SIZE = inst_m*inst_k*nwarps; 162 | unsigned total_B_SIZE = inst_k*inst_n*nwarps; 163 | unsigned total_C_SIZE = inst_m*inst_n*nwarps; 164 | 165 | 166 | op_AB *host_matA = (op_AB *)malloc(total_A_SIZE * sizeof(op_AB)); 167 | op_AB *host_matB = (op_AB *)malloc(total_B_SIZE * sizeof(op_AB)); 168 | 169 | op_CD *host_matC = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 170 | op_CD *host_matD = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 171 | std::random_device rd{}; 172 | std::mt19937 gen{rd()}; 173 | std::normal_distribution<> random_gen{-1.0,1.0}; 174 | // initialize A, row-major 175 | float *host_matA_cpu = (float *)malloc(total_A_SIZE * sizeof(float)); 176 | float *host_matB_cpu = (float *)malloc(total_B_SIZE * sizeof(float)); 177 | for(int r = 0; r < inst_m; r ++){ 178 | for(int c = 0; c < inst_k; c ++){ 179 | //float rnd = (float)(r*inst_k+c); 180 | float rnd = (float)random_gen(gen); 181 | host_matA_cpu[r*inst_k+c] = rnd; 182 | host_matA[r*inst_k+c] = (op_AB)rnd; 183 | } 184 | } 185 | // std::cout<<"print MatA" <>>(dev_matA,dev_matB,dev_matC,dev_matD); 234 | gpuErrchk(cudaPeekAtLastError()); 235 | 236 | gpuErrchk(cudaMemcpy(host_matD, dev_matD, total_C_SIZE * sizeof(op_CD), cudaMemcpyDeviceToHost)); 237 | 238 | //check errors 239 | double l1_norm = 0.0; 240 | double abs_err = 0.0; 241 | double l2_relative_err = 0.0; 242 | compute_diff_l1_norm(cpu_res_baseline,host_matD,inst_m,inst_n,abs_err,l1_norm); 243 | compute_diff_l2_norm(cpu_res_baseline,host_matD,inst_m,inst_n,l2_relative_err); 244 | std::vector errors{abs_err,l1_norm,abs_err/inst_k,l1_norm/inst_k,l2_relative_err}; 245 | 246 | return errors; 247 | } 248 | 249 | 250 | 251 | 252 | int main(){ 253 | std::cout<<"***********************************"< errors = gemm_m16n8k16_bf16(); 264 | avg_abs_err += errors[0]; 265 | avg_l1_norm += errors[1]; 266 | avg_abs_err_FMA += errors[2]; 267 | avg_l1_norm_FMA += errors[3]; 268 | l2_relative += errors[4]; 269 | } 270 | 271 | 272 | // std::cout<<"element-wise avg_abs_err = " << avg_abs_err/ITERS < 99 | // struct gemmCPU 100 | // { 101 | // /* data */ 102 | // }; 103 | 104 | 105 | -------------------------------------------------------------------------------- /microbench/numericbench/cpu_int_base.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #pragma once 6 | // check the difference of two matrix 7 | void compute_diff_l1_norm(int* cpu_base, int* gpu_res, int rows, int cols,double& 
abs_err, double& l1_norm){ 8 | 9 | // l1 norm : |gpu_res[i] - cpu_res[i]|/|gpu_res[i| 10 | // double l1_norm = 0.0; 11 | // double abs = 0.0; 12 | for(int row =0; row< rows; row ++){ 13 | for(int col =0; col < cols; col ++){ 14 | int gid = col + row*cols; 15 | abs_err += std::abs(gpu_res[gid] - cpu_base[gid]); 16 | l1_norm += abs_err/std::abs(gpu_res[gid]); 17 | } 18 | } 19 | // l1_norm = l1_norm/(rows*cols); 20 | // abs_err = abs_err/(rows*cols); 21 | //return l1_norm/(rows*cols); 22 | }; 23 | 24 | 25 | 26 | void compute_diff_l2_norm(int* cpu_base, int* gpu_res, int rows, int cols, double& l2_norm){ 27 | 28 | // l1 norm : |gpu_res[i] - cpu_res[i]|/|gpu_res[i| 29 | // double l1_norm = 0.0; 30 | // double abs = 0.0; 31 | double tensor_diff_norm = 0.0; 32 | double tensor_gpu_norm = 0.0; 33 | for(int row =0; row< rows; row ++){ 34 | for(int col =0; col < cols; col ++){ 35 | int gid = col + row*cols; 36 | tensor_diff_norm += std::pow((double(gpu_res[gid]) - double(cpu_base[gid])) ,2 ) ; 37 | tensor_gpu_norm += std::pow(double(gpu_res[gid]),2 ); 38 | } 39 | } 40 | l2_norm = std::sqrt(tensor_diff_norm)/std::sqrt(tensor_gpu_norm); 41 | //l1_norm = l1_norm/(rows*cols); 42 | // abs_err = abs_err/(rows*cols); 43 | //return l1_norm/(rows*cols); 44 | }; 45 | 46 | 47 | template 48 | void gemm_mnk_cpu(datatypeIN* MatA,datatypeIN* MatB,datatypeOut* MatC, datatypeOut* MatD, int M, int N, int K){ 49 | 50 | // Matd = MatA * MatB + MatC 51 | for(int row=0; row < M; row++){ 52 | for(int col=0; col < N; col++){ 53 | int gid = col + row*N; 54 | datatypeOut tmp = 0; 55 | for(int inner=0; inner < K; inner++) 56 | { 57 | tmp += MatA[inner + row*K] * MatB[col + inner*N]; 58 | } 59 | MatD[gid] = tmp+ MatC[gid]; 60 | } 61 | } 62 | 63 | }; 64 | 65 | void print_mat(int* Mat, int rows, int cols){ 66 | 67 | for(int row = 0; row < rows; row++){ 68 | for(int col =0;col < cols; col++){ 69 | printf("%8d ", Mat[col + row*cols]); 70 | } 71 | std::cout< 99 | // struct gemmCPU 100 | // { 101 | // /* data */ 102 | // }; 103 | 104 | 105 | -------------------------------------------------------------------------------- /microbench/numericbench/fp16numeric/fp16_numeric/Makefile: -------------------------------------------------------------------------------- 1 | SRC = fp16_chain_matmul.cu 2 | 3 | EXE = fp16_chain_matmul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/fp16numeric/fp16add/Makefile: -------------------------------------------------------------------------------- 1 | SRC = fp16add.cu 2 | 3 | EXE = fp16add.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/fp16numeric/fp16mul/Makefile: -------------------------------------------------------------------------------- 1 | SRC = fp16mul.cu 2 | 3 | EXE = fp16mul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/int8numeric/int8add/Makefile: -------------------------------------------------------------------------------- 1 | SRC = int8add.cu 2 | 3 | EXE = int8add.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk 
-------------------------------------------------------------------------------- /microbench/numericbench/tf32numeric/m16n8k4/Makefile: -------------------------------------------------------------------------------- 1 | SRC = m16n8k4_tf32.cu 2 | 3 | EXE = m16n8k4_tf32.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/tf32numeric/m16n8k4/m16n8k4_tf32.cu: -------------------------------------------------------------------------------- 1 | // simple gemm using bf16/half data types 2 | // we do not target on optimal overall performance, so we will not use software pipepline 3 | // pipepline or asychronous copy can speed up gemm further with cost of extra shared memory storage 4 | // CUTLASS provides good examples of how to implement pipeline for gemm 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "../../../hw_def/hw_def.h" 16 | #include "../../cpu_base.h" 17 | 18 | typedef float op_AB; 19 | typedef float op_CD; 20 | 21 | 22 | #ifndef ITERS 23 | #define ITERS (1024 ) 24 | #endif 25 | 26 | #define ROUNDS (ITERS*10 ) 27 | 28 | const int inst_m = 16; 29 | const int inst_n = 8; 30 | const int inst_k = 4; 31 | 32 | // we want to know the numeric precision of PTX instruction - the lowerst programming interface. 33 | // Since higher-level applications are based on the PTX instruction, the numeric errors/differences higher-level applications are based on the ptx instruction. 34 | // __forceinline__ __device__ unsigned lane_id() 35 | // { 36 | // unsigned ret; 37 | // asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 38 | // return ret; 39 | // } 40 | 41 | __forceinline__ __device__ unsigned lane_id_() 42 | { 43 | unsigned ret; 44 | asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 45 | return ret; 46 | } 47 | 48 | 49 | 50 | __global__ void gemm_m16n8k4_kernel(op_AB* MatA,op_AB* MatB,op_CD* MatC, op_CD* MatD ){ 51 | uint32_t lane_id = lane_id_(); 52 | // four threads per group, group id 53 | uint32_t group_id = lane_id >>2; 54 | uint32_t tid_in_group = lane_id % 4; 55 | 56 | // m16 n8 k16 57 | uint32_t frag_A[2]; // 16 * 16 / 32 = 8 * bf16 58 | uint32_t frag_B[1]; // 8 * 16 / 32 59 | op_CD frag_D[4]; // float , 16*8 /32 = 4*float 60 | // load operand fragA 61 | #pragma unroll 62 | for(int i =0; i < 2; i++){ 63 | uint32_t row_a = 0; 64 | uint32_t col_a = 0; 65 | if( i==0 ){ 66 | row_a = group_id; 67 | }else{ 68 | row_a = group_id + 8; 69 | } 70 | col_a = tid_in_group; 71 | // row major 72 | // Cvt Float - TF32 73 | asm("cvt.rna.tf32.f32 %0, %1;\n" : "=r"(frag_A[i]) : "f"(MatA[inst_k*row_a + col_a])); 74 | } 75 | #pragma unroll 76 | for(int i =0; i < 1; i++){ 77 | uint32_t row_b = tid_in_group ; 78 | uint32_t col_b = group_id; 79 | // row-major B 80 | asm("cvt.rna.tf32.f32 %0, %1;\n" : "=r"(frag_B[i]) : "f"(MatB[row_b*inst_n + col_b])); 81 | //frag_B[i] = (MatB[row_b*inst_n + col_b]); 82 | } 83 | 84 | #pragma unroll 85 | for(int i =0; i < 4; i++){ 86 | uint32_t row_c = 0; 87 | if( i < 2 ){ 88 | row_c = group_id; 89 | }else{ 90 | row_c = group_id + 8; 91 | } 92 | uint32_t col_c = (tid_in_group * 2) + (i & 0x1); 93 | // row-major 94 | frag_D[i] = MatC[inst_n*row_c + col_c]; 95 | } 96 | 97 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 98 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 
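// A and B were converted from fp32 to tf32 with cvt.rna.tf32.f32 above and stay packed in
// 32-bit registers: 2 registers per thread for the 16x4 A tile and 1 for the 4x8 B tile.
// The m16n8k4 mma below multiplies them and accumulates into the 4 fp32 values of frag_D.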
99 | float *C = reinterpret_cast(&frag_D[0]); 100 | float *D = C; // D = A*B + D. 101 | 102 | asm volatile( 103 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 104 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 105 | : "r"(A[0]), "r"(A[1]), 106 | "r"(B[0]), 107 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 108 | ); 109 | 110 | __syncwarp(); 111 | 112 | #pragma unroll 113 | for(int i =0; i < 4; i++){ 114 | uint32_t row_d = 0; 115 | if( i < 2 ){ 116 | row_d = group_id; 117 | }else{ 118 | row_d = group_id + 8; 119 | } 120 | uint32_t col_d = (tid_in_group * 2) + (i & 0x1); 121 | // row-major 122 | MatD[inst_n*row_d + col_d] = frag_D[i]; 123 | } 124 | } 125 | 126 | 127 | 128 | 129 | 130 | std::vector gemm_m16n8k8_bf16(){ 131 | int BLOCKS_NUM = 1; 132 | int nwarps = 1; 133 | int warp_size = 32; 134 | 135 | 136 | unsigned total_A_SIZE = inst_m*inst_k*nwarps; 137 | unsigned total_B_SIZE = inst_k*inst_n*nwarps; 138 | unsigned total_C_SIZE = inst_m*inst_n*nwarps; 139 | 140 | 141 | op_AB *host_matA = (op_AB *)malloc(total_A_SIZE * sizeof(op_AB)); 142 | op_AB *host_matB = (op_AB *)malloc(total_B_SIZE * sizeof(op_AB)); 143 | 144 | op_CD *host_matC = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 145 | op_CD *host_matD = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 146 | std::random_device rd{}; 147 | std::mt19937 gen{rd()}; 148 | std::normal_distribution<> random_gen{-1.0,1.0}; 149 | // initialize A, row-major 150 | float *host_matA_cpu = (float *)malloc(total_A_SIZE * sizeof(float)); 151 | float *host_matB_cpu = (float *)malloc(total_B_SIZE * sizeof(float)); 152 | for(int r = 0; r < inst_m; r ++){ 153 | for(int c = 0; c < inst_k; c ++){ 154 | //float rnd = (float)(r*inst_k+c); 155 | float rnd = (float)random_gen(gen); 156 | host_matA_cpu[r*inst_k+c] = rnd; 157 | host_matA[r*inst_k+c] = (op_AB)rnd; 158 | } 159 | } 160 | // std::cout<<"print MatA" <>>(dev_matA,dev_matB,dev_matC,dev_matD); 209 | gpuErrchk(cudaPeekAtLastError()); 210 | 211 | gpuErrchk(cudaMemcpy(host_matD, dev_matD, total_C_SIZE * sizeof(op_CD), cudaMemcpyDeviceToHost)); 212 | 213 | //check errors 214 | double l1_norm = 0.0; 215 | double abs_err = 0.0; 216 | double l2_relative_err = 0.0; 217 | compute_diff_l1_norm(cpu_res_baseline,host_matD,inst_m,inst_n,abs_err,l1_norm); 218 | compute_diff_l2_norm(cpu_res_baseline,host_matD,inst_m,inst_n,l2_relative_err); 219 | 220 | // std::cout<<"print cpu_res_baseline" < errors = gemm_m16n8k8_bf16(); 253 | avg_abs_err += errors[0]; 254 | avg_l1_norm += errors[1]; 255 | avg_abs_err_FMA += errors[2]; 256 | avg_l1_norm_FMA += errors[3]; 257 | l2_relative += errors[4]; 258 | } 259 | 260 | // std::cout<<"element-wise error :"<> ${SCRIPT_DIR}/A100-ILP"${ILPconfig}".log 20 | echo "/////////////////////////////////" 21 | done 22 | done 23 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/ldmatrix_ILP/Makefile: -------------------------------------------------------------------------------- 1 | SRC = ldmatrix_ilp.cu 2 | 3 | EXE = ldmatrix_ilp.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/ldmatrix_ILP/ldmatrix_ilp.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (48 * 
1024 / 4) // 32 KB 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | //#define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (999 ) 15 | #endif 16 | 17 | // #define ILPconfig 2 // Moved to nvcc flags 18 | 19 | #ifndef ILPconfig 20 | #define ILPconfig 1 21 | #endif 22 | 23 | static_assert(ILPconfig<=8,"ILP > 8 is not supported\n"); 24 | 25 | 26 | typedef uint32_t shared_m; 27 | // Measure latency of ITERS ldmatrix.x1 28 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 29 | shared_m *dsink, uint32_t stride) { 30 | 31 | // thread index 32 | uint32_t tid = threadIdx.x; 33 | uint32_t bid = blockIdx.x; 34 | uint32_t uid = bid * blockDim.x + tid; 35 | uint32_t n_threads = blockDim.x * gridDim.x; 36 | 37 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 38 | 39 | // one thread to initialize the pointer-chasing array 40 | if(uid == 0){ 41 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 42 | s[i] = (i )*16 % (1024); // s[i] is multiple of 16, because addree is aligned with 4 bytes 43 | } 44 | 45 | asm volatile("bar.sync 0;"); 46 | 47 | // if(uid == 0){ 48 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 49 | // printf("s[%d] = %d \t", i, s[i]); 50 | 51 | // } 52 | // printf("\n"); 53 | // } 54 | //if (uid == 0) { 55 | // initalize pointer chaser 56 | //unsigned x = threadIdx.x*4; 57 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 58 | 59 | //#if ILPconfig == 2 60 | 61 | 62 | 63 | 64 | unsigned addr2 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32) *4])); 65 | unsigned addr3 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 64) *4])); 66 | unsigned addr4 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 96) *4])); 67 | 68 | unsigned addr5 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*4) *4])); 69 | unsigned addr6 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*5) *4])); 70 | unsigned addr7 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*6) *4])); 71 | unsigned addr8 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*7) *4])); 72 | //printf("thread %d , addr = %d \n", tid, addr); 73 | // start timing 74 | uint32_t start = 0; 75 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 76 | 77 | 78 | //#pragma unroll 79 | for (uint32_t i = 0; i < ITERS; ++i) { 80 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr) : "r"(addr)); 81 | #if ILPconfig >= 2 82 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr2) : "r"(addr2)); 83 | #endif 84 | #if ILPconfig >= 3 85 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr3) : "r"(addr3)); 86 | #endif 87 | #if ILPconfig >= 4 88 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr4) : "r"(addr4)); 89 | #endif 90 | #if ILPconfig >= 5 91 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr5) : "r"(addr5)); 92 | #endif 93 | #if ILPconfig >= 6 94 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr6) : "r"(addr6)); 95 | #endif 96 | #if ILPconfig >= 7 97 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr7) : "r"(addr7)); 98 | #endif 99 | #if ILPconfig >= 8 100 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr8) : "r"(addr8)); 101 | #endif 102 | __syncwarp(); 103 | } 
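// Each ldmatrix result is written back into its own address register, so the ITERS
// iterations form a dependent (pointer-chasing) chain per address stream. With
// ILPconfig=1, (stop - start)/ITERS is the ldmatrix.x1 latency; with ILPconfig>1 the
// independent chains overlap, and the host code reports throughput as
// warps * 8*8 * 2 bytes * ILP / latency.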
104 | uint32_t stop = 0; 105 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 106 | 107 | //printf("thread %d , x = %d \n", tid, addr); 108 | 109 | // write time and data back to memory 110 | startClk[uid] = start; 111 | stopClk[uid] = stop; 112 | dsink[uid] = addr + addr2 ; 113 | dsink[uid] += addr3 + addr4 + addr5 + addr6 + addr7 + addr8; 114 | // dsink[uid] += addr5 + addr6 + addr7 + addr8; 115 | 116 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 117 | // printf("Shared Memory Latency = %f cycles\n", lat); 118 | //} 119 | } 120 | 121 | 122 | void test_with_different_thread(int THREADS_NUM, int ILP){ 123 | 124 | BLOCKS_NUM = 1; 125 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 126 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 127 | 128 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 129 | 130 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 131 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 132 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 133 | 134 | uint32_t *startClk_g; 135 | uint32_t *stopClk_g; 136 | shared_m *dsink_g; 137 | 138 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 139 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 140 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 141 | 142 | shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 143 | gpuErrchk(cudaPeekAtLastError()); 144 | //printf("pass kenerl \n"); 145 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 146 | cudaMemcpyDeviceToHost)); 147 | gpuErrchk( 148 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 149 | gpuErrchk( 150 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 151 | 152 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 153 | 154 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x1 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 155 | 156 | long num_bytes = (THREADS_NUM/32) * 8 * 8 * 2 * 1 * ILP; 157 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,3,4,5,6,8,12,16,20,24,28,32}; 175 | //std::vector warps = {4,8,12,16}; 176 | intilizeDeviceProp(0); 177 | std::cout<<"***********************************"< 1 is not supported\n"); 23 | 24 | // two way bank conflict - > 23 latenct 25 | // bank-conflict-free -> 25 latency 26 | 27 | typedef uint32_t shared_m; 28 | // Measure latency of ITERS ldmatrix.x1 29 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 30 | shared_m *dsink, uint32_t stride) { 31 | 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t bid = blockIdx.x; 35 | uint32_t uid = bid * blockDim.x + tid; 36 | uint32_t n_threads = blockDim.x * gridDim.x; 37 | 38 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 39 | 40 | // one thread to initialize the pointer-chasing array 41 | if(uid == 0){ 42 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 43 | s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 44 | } 45 | 46 | asm volatile("bar.sync 0;"); 47 | 48 | // if(uid == 0){ 49 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 50 | // printf("s[%d] = %d \t", i, s[i]); 51 | 52 | // } 53 | // printf("\n"); 54 | // } 55 | //if (uid == 0) { 56 | // initalize pointer chaser 57 | //unsigned x = threadIdx.x*4; 58 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 59 | //printf("thread %d , addr = %d \n", tid, addr); 60 | // start timing 61 | 
uint32_t start = 0; 62 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 63 | 64 | // pointer-chasing ITERS times 65 | //#pragma unroll 66 | for (uint32_t i = 0; i < ITERS; ++i) { 67 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr) : "r"(addr)); // first 11 68 | __syncwarp(); 69 | } 70 | //asm volatile("bar.sync 0;"); 71 | 72 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 73 | // stop timing 74 | uint32_t stop = 0; 75 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 76 | addr ++; 77 | //printf("thread %d , x = %d \n", tid, addr); 78 | 79 | // write time and data back to memory 80 | if(uid == 0){ 81 | 82 | startClk[uid] = start; 83 | stopClk[uid] = stop; 84 | dsink[uid] = addr; 85 | } 86 | 87 | 88 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 89 | // printf("Shared Memory Latency = %f cycles\n", lat); 90 | //} 91 | } 92 | 93 | 94 | void test_with_different_thread(int THREADS_NUM){ 95 | 96 | BLOCKS_NUM = 1; 97 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 98 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 99 | 100 | assert(SHARED_MEM_SIZE * sizeof(shared_m) < MAX_SHARED_MEM_SIZE_PER_BLOCK); 101 | 102 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 103 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 104 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 105 | 106 | uint32_t *startClk_g; 107 | uint32_t *stopClk_g; 108 | shared_m *dsink_g; 109 | 110 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 111 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 112 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 113 | 114 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, 1); 115 | gpuErrchk(cudaPeekAtLastError()); 116 | //printf("pass kenerl \n"); 117 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 118 | cudaMemcpyDeviceToHost)); 119 | gpuErrchk( 120 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 121 | gpuErrchk( 122 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 123 | 124 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 125 | 126 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x1 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 127 | 128 | long num_bytes = (THREADS_NUM/32) * 8 * 8 * 2 * 1; 129 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,4,8,16,32}; 147 | std::cout << "ldmatrix.x1 microbenchmark " < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (48 * 1024 / 4) // 32 KB 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | //#define THREADS_NUM 128 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | #ifndef ILPconfig 18 | #define ILPconfig 1 19 | #endif 20 | 21 | static_assert(ILPconfig<=6,"ILP > 6 is not implemented\n"); 22 | // two way bank conflict - > 23 latenct 23 | // bank-conflict-free -> 25 latency 24 | 25 | typedef uint32_t shared_m; 26 | // Measure latency of ITERS ldmatrix.x1 27 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 28 | shared_m *dsink, uint32_t stride) { 29 | 30 | // thread index 31 | uint32_t tid = threadIdx.x; 32 | uint32_t bid = blockIdx.x; 33 | uint32_t uid = bid * blockDim.x + tid; 34 | uint32_t n_threads = blockDim.x * gridDim.x; 35 | 36 | __shared__ shared_m 
s[SHARED_MEM_SIZE]; // static shared memory 37 | 38 | // one thread to initialize the pointer-chasing array 39 | if(uid == 0){ 40 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 41 | s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 42 | } 43 | 44 | asm volatile("bar.sync 0;"); 45 | 46 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 47 | unsigned addr_1 = 0; 48 | 49 | unsigned addr2 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32) *4])); 50 | unsigned addr2_1 = 0; 51 | 52 | unsigned addr3 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 64) *4])); 53 | unsigned addr3_1 = 0; 54 | 55 | unsigned addr4 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 96) *4])); 56 | unsigned addr4_1 = 0; 57 | 58 | unsigned addr5 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*4) *4])); 59 | unsigned addr5_1 = 0; 60 | 61 | unsigned addr6 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*5) *4])); 62 | unsigned addr6_1 = 0; 63 | //printf("thread %d , addr = %d \n", tid, addr); 64 | // start timing 65 | uint32_t start = 0; 66 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 67 | 68 | // pointer-chasing ITERS times 69 | #pragma unroll 70 | for (uint32_t i = 0; i < ITERS; ++i) { 71 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr), "=r"(addr_1) : "r"(addr)); 72 | #if ILPconfig >= 2 73 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr2), "=r"(addr2_1) : "r"(addr2)); 74 | #endif 75 | #if ILPconfig >= 3 76 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr3), "=r"(addr3_1) : "r"(addr3)); 77 | #endif 78 | #if ILPconfig >= 4 79 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr4), "=r"(addr4_1) : "r"(addr4)); 80 | #endif 81 | 82 | #if ILPconfig >= 5 83 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr5), "=r"(addr5_1) : "r"(addr5)); 84 | #endif 85 | 86 | #if ILPconfig >= 6 87 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr6), "=r"(addr6_1) : "r"(addr6)); 88 | #endif 89 | 90 | 91 | 92 | __syncwarp(); 93 | } 94 | uint32_t stop = 0; 95 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 96 | 97 | //printf("thread %d , x = %d \n", tid, addr); 98 | 99 | // write time and data back to memory 100 | startClk[uid] = start; 101 | stopClk[uid] = stop; 102 | dsink[uid] = addr + addr_1; 103 | 104 | dsink[uid] += addr2 + addr2_1; 105 | 106 | dsink[uid] += addr3 + addr3_1; 107 | dsink[uid] += addr4 + addr4_1; 108 | dsink[uid] += addr5 + addr5_1; 109 | dsink[uid] += addr6 + addr6_1; 110 | 111 | 112 | } 113 | 114 | void test_with_different_thread(int THREADS_NUM, int ILP){ 115 | BLOCKS_NUM = 1; 116 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 117 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 118 | 119 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 120 | 121 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 122 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 123 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 124 | 125 | uint32_t *startClk_g; 126 | uint32_t *stopClk_g; 127 | shared_m *dsink_g; 128 | 129 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 130 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 131 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 132 | 133 | shared_lat<<<1, 
THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 134 | gpuErrchk(cudaPeekAtLastError()); 135 | //printf("pass kenerl \n"); 136 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 137 | cudaMemcpyDeviceToHost)); 138 | gpuErrchk( 139 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 140 | gpuErrchk( 141 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 142 | 143 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 144 | 145 | 146 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x2 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 147 | 148 | long num_bytes = (THREADS_NUM/32) * 8 * 8 * 2 * 2 * ILP; 149 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,4,6,8,12,16,20,24,28,32}; 167 | //std::vector warps = {4,8,12,16}; 168 | intilizeDeviceProp(0); 169 | //std::cout << "ldmatrix.x2 microbenchmark " < 8 is not implemented\n"); 23 | 24 | // two way bank conflict - > 23 latenct 25 | // bank-conflict-free -> 25 latency 26 | 27 | typedef uint32_t shared_m; 28 | // Measure latency of ITERS ldmatrix.x1 29 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 30 | shared_m *dsink, uint32_t stride) { 31 | 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t bid = blockIdx.x; 35 | uint32_t uid = bid * blockDim.x + tid; 36 | uint32_t n_threads = blockDim.x * gridDim.x; 37 | 38 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 39 | 40 | // one thread to initialize the pointer-chasing array 41 | if(uid == 0){ 42 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 43 | s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 44 | } 45 | 46 | 47 | asm volatile("bar.sync 0;"); 48 | 49 | // if(uid == 0){ 50 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 51 | // printf("s[%d] = %d \t", i, s[i]); 52 | 53 | // } 54 | // printf("\n"); 55 | // } 56 | //if (uid == 0) { 57 | // initalize pointer chaser 58 | //unsigned x = threadIdx.x*4; 59 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 60 | unsigned addr_1 = 0; 61 | unsigned addr_2 = 0; 62 | unsigned addr_3 = 0; 63 | 64 | unsigned addr2 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32) * 4])); 65 | unsigned addr2_1 = 0; 66 | unsigned addr2_2 = 0; 67 | unsigned addr2_3 = 0; 68 | 69 | 70 | unsigned addr3 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 64) * 4])); 71 | unsigned addr3_1 = 0; 72 | unsigned addr3_2 = 0; 73 | unsigned addr3_3 = 0; 74 | 75 | 76 | unsigned addr4 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 96) * 4])); 77 | unsigned addr4_1 = 0; 78 | unsigned addr4_2 = 0; 79 | unsigned addr4_3 = 0; 80 | 81 | unsigned addr5 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 128) * 4])); 82 | unsigned addr5_1 = 0; 83 | unsigned addr5_2 = 0; 84 | unsigned addr5_3 = 0; 85 | 86 | unsigned addr6 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 160) * 4])); 87 | unsigned addr6_1 = 0; 88 | unsigned addr6_2 = 0; 89 | unsigned addr6_3 = 0; 90 | 91 | unsigned addr7 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 192) * 4])); 92 | unsigned addr7_1 = 0; 93 | unsigned addr7_2 = 0; 94 | unsigned addr7_3 = 0; 95 | 96 | unsigned addr8 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 224) * 4])); 97 | unsigned addr8_1 = 0; 98 | unsigned addr8_2 = 0; 99 | unsigned addr8_3 = 0; 100 | 101 | 102 | //printf("thread %d , addr = %d \n", tid, addr); 103 | // start timing 104 | uint32_t 
start = 0; 105 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 106 | // pointer-chasing ITERS times 107 | // #pragma unroll 108 | for (uint32_t i = 0; i < ITERS; ++i) { 109 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr), "=r"(addr_1),"=r"(addr_2),"=r"(addr_3) : "r"(addr)); 110 | #if ILPconfig >= 2 111 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr2), "=r"(addr2_1),"=r"(addr2_2),"=r"(addr2_3) : "r"(addr2)); 112 | #endif 113 | #if ILPconfig >= 3 114 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr3), "=r"(addr3_1),"=r"(addr3_2),"=r"(addr3_3) : "r"(addr3)); 115 | #endif 116 | #if ILPconfig >= 4 117 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr4), "=r"(addr4_1),"=r"(addr4_2),"=r"(addr4_3) : "r"(addr4)); 118 | #endif 119 | 120 | #if ILPconfig >= 5 121 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr5), "=r"(addr5_1),"=r"(addr5_2),"=r"(addr5_3) : "r"(addr5)); 122 | #endif 123 | 124 | #if ILPconfig >= 6 125 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr6), "=r"(addr6_1),"=r"(addr6_2),"=r"(addr6_3) : "r"(addr6)); 126 | #endif 127 | 128 | #if ILPconfig >= 7 129 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr7), "=r"(addr7_1),"=r"(addr7_2),"=r"(addr7_3) : "r"(addr7)); 130 | #endif 131 | 132 | #if ILPconfig >= 8 133 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr8), "=r"(addr8_1),"=r"(addr8_2),"=r"(addr8_3) : "r"(addr8)); 134 | #endif 135 | __syncwarp(); 136 | } 137 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 138 | // stop timing 139 | uint32_t stop = 0; 140 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 141 | 142 | //printf("thread %d , x = %d \n", tid, addr); 143 | 144 | // write time and data back to memory 145 | startClk[uid] = start; 146 | stopClk[uid] = stop; 147 | dsink[uid] = addr + addr_1 + addr_2 + addr_3; 148 | dsink[uid] += addr2 + addr2_1 + addr2_2 + addr2_3; 149 | 150 | dsink[uid] += addr3 + addr3_1 + addr3_2 + addr3_3; 151 | dsink[uid] += addr4 + addr4_1 + addr4_2 + addr4_3; 152 | dsink[uid] += addr5 + addr5_1 + addr5_2 + addr5_3; 153 | dsink[uid] += addr6 + addr6_1 + addr6_2 + addr6_3; 154 | dsink[uid] += addr7 + addr7_1 + addr7_2 + addr7_3; 155 | dsink[uid] += addr8 + addr8_1 + addr8_2 + addr8_3; 156 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 157 | // printf("Shared Memory Latency = %f cycles\n", lat); 158 | //} 159 | } 160 | void test_with_different_thread(int THREADS_NUM, int ILP){ 161 | 162 | BLOCKS_NUM = 1; 163 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 164 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 165 | 166 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 167 | 168 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 169 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 170 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 171 | 172 | uint32_t *startClk_g; 173 | uint32_t *stopClk_g; 174 | shared_m *dsink_g; 175 | 176 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 177 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 178 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 179 | 180 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, 1); 
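// Worked example for the throughput math below (illustrative only; the real
// values come from the run): lat = (stopClk - startClk) / ITERS is the
// per-iteration latency of the dependent ldmatrix chain, and each
// ldmatrix.x4 loads four 8x8 tiles of 16-bit elements per warp, i.e.
// 4 * 8 * 8 * 2 = 512 bytes per instruction per warp. Hence
//   num_bytes = warps * 512 * ILP   and   throughput = num_bytes / lat.
// For instance, 4 warps at ILP = 1 with lat = 32 clk would give
// 4 * 512 / 32 = 64 bytes/clk.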
181 | gpuErrchk(cudaPeekAtLastError()); 182 | // printf("pass kenerl \n"); 183 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 184 | cudaMemcpyDeviceToHost)); 185 | gpuErrchk( 186 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 187 | gpuErrchk( 188 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 189 | 190 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 191 | 192 | long num_bytes = (TOTAL_THREADS/32) * 8 * 8 * 2 * 4 * ILP; 193 | 194 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x4 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 195 | 196 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,3,4,5,6,7,8,12,16,20}; 216 | //std::vector warps = {4,8,12,16}; 217 | intilizeDeviceProp(0); 218 | std::cout<<"***********************************"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../../hw_def/hw_def.h" 8 | 9 | #define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB 10 | #ifndef ITERS 11 | #define ITERS (1024 ) 12 | #endif 13 | 14 | #ifndef ILPconfig 15 | #define ILPconfig 1 16 | #endif 17 | 18 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 19 | 20 | __global__ void shared_bw(uint64_t *startClk, uint64_t *stopClk, 21 | uint32_t *dsink, uint32_t stride) { 22 | 23 | // thread index 24 | uint32_t tid = threadIdx.x; 25 | uint32_t bid = blockIdx.x; 26 | uint32_t uid = bid * blockDim.x + tid; 27 | uint32_t n_threads = blockDim.x * gridDim.x; 28 | 29 | // a register to avoid compiler optimization 30 | // uint32_t sink0 = 0; 31 | register uint32_t tmp = uid; 32 | 33 | uint64_t start = 0; 34 | uint64_t stop = 0; 35 | 36 | __shared__ uint32_t s[SHARED_MEM_SIZE]; // static shared memory 37 | // uint32_t s[SHARED_MEM_SIZE]; 38 | // one thread to initialize the pointer-chasing array 39 | for (uint32_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) 40 | s[i] = (i + stride) % SHARED_MEM_SIZE; 41 | 42 | // synchronize all threads 43 | asm volatile("bar.sync 0;"); 44 | 45 | // start timing 46 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 47 | 48 | // load data from shared memory 49 | for (uint32_t i = 0; i < ITERS; ++i) { 50 | tmp = s[tmp]; 51 | } 52 | 53 | // synchronize all threads 54 | asm volatile("bar.sync 0;"); 55 | 56 | // stop timing 57 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 58 | 59 | // sink0 = tmp; 60 | // write time and data back to memory 61 | startClk[uid] = start; 62 | stopClk[uid] = stop; 63 | dsink[uid] = tmp; 64 | } 65 | 66 | int main() { 67 | intilizeDeviceProp(0); 68 | 69 | BLOCKS_NUM = 1; 70 | TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 71 | THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; 72 | 73 | assert(SHARED_MEM_SIZE * sizeof(uint32_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); 74 | 75 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 76 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 77 | uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); 78 | 79 | uint64_t *startClk_g; 80 | uint64_t *stopClk_g; 81 | uint32_t *dsink_g; 82 | 83 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 84 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 85 | gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t))); 86 | 87 | shared_bw<<<1, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g, 88 | THREADS_PER_BLOCK); 89 | gpuErrchk(cudaPeekAtLastError()); 90 | 91 | 
gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 92 | cudaMemcpyDeviceToHost)); 93 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 94 | cudaMemcpyDeviceToHost)); 95 | gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t), 96 | cudaMemcpyDeviceToHost)); 97 | 98 | double bw, BW; 99 | uint64_t total_time = 100 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 101 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 102 | bw = 103 | (double)(ITERS * TOTAL_THREADS * sizeof(uint32_t)) / ((double)total_time); 104 | BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; 105 | std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW 106 | << "(GB/s/SM)\n"; 107 | std::cout << "Total Clk number = " << total_time << "\n"; 108 | 109 | return 1; 110 | } 111 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_bw_64/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SRC = shared_bw_64.cu 3 | 4 | EXE = shared_bw_64.app 5 | 6 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 7 | 8 | include ../../../common/common.mk 9 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_bw_64/shared_bw_64.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../../hw_def/hw_def.h" 8 | 9 | #define SHARED_MEM_SIZE (32 * 1024 / 8) // 32KB 10 | #ifndef ITERS 11 | #define ITERS (1024 ) 12 | #endif 13 | 14 | 15 | #ifndef ILPconfig 16 | #define ILPconfig 1 17 | #endif 18 | 19 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 20 | 21 | __global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk, 22 | uint64_t *dsink, uint32_t stride) { 23 | 24 | // thread index 25 | uint32_t tid = threadIdx.x; 26 | uint32_t bid = blockIdx.x; 27 | uint32_t uid = bid * blockDim.x + tid; 28 | uint32_t n_threads = blockDim.x * gridDim.x; 29 | 30 | // a register to avoid compiler optimization 31 | // uint32_t sink0 = 0; 32 | register uint64_t tmp = uid; 33 | 34 | uint32_t start = 0; 35 | uint32_t stop = 0; 36 | 37 | __shared__ uint64_t s[SHARED_MEM_SIZE]; // static shared memory 38 | // uint32_t s[SHARED_MEM_SIZE]; 39 | // one thread to initialize the pointer-chasing array 40 | for (uint64_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) 41 | s[i] = (i + stride) % SHARED_MEM_SIZE; 42 | 43 | // synchronize all threads 44 | asm volatile("bar.sync 0;"); 45 | 46 | // start timing 47 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 48 | 49 | // load data from shared memory 50 | for (uint32_t i = 0; i < ITERS; ++i) { 51 | tmp = s[tmp]; 52 | } 53 | 54 | // synchronize all threads 55 | asm volatile("bar.sync 0;"); 56 | 57 | // stop timing 58 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 59 | 60 | // sink0 = tmp; 61 | // write time and data back to memory 62 | startClk[uid] = start; 63 | stopClk[uid] = stop; 64 | dsink[uid] = tmp; 65 | } 66 | 67 | int main() { 68 | intilizeDeviceProp(0); 69 | 70 | BLOCKS_NUM = 1; 71 | TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 72 | THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; 73 | 74 | assert(SHARED_MEM_SIZE * sizeof(uint64_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); 75 | 76 | uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); 77 | uint32_t *stopClk = 
(uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); 78 | uint64_t *dsink = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 79 | 80 | uint32_t *startClk_g; 81 | uint32_t *stopClk_g; 82 | uint64_t *dsink_g; 83 | 84 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); 85 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); 86 | gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint64_t))); 87 | 88 | shared_bw<<>>(startClk_g, stopClk_g, dsink_g, 89 | THREADS_PER_BLOCK); 90 | gpuErrchk(cudaPeekAtLastError()); 91 | 92 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), 93 | cudaMemcpyDeviceToHost)); 94 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), 95 | cudaMemcpyDeviceToHost)); 96 | gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint64_t), 97 | cudaMemcpyDeviceToHost)); 98 | 99 | double bw, BW; 100 | uint64_t total_time = 101 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 102 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 103 | bw = 104 | (double)(ITERS * TOTAL_THREADS * sizeof(uint64_t)) / ((double)total_time); 105 | BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; 106 | std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW 107 | << "(GB/s/SM)\n"; 108 | std::cout << "Total Clk number = " << total_time << "\n"; 109 | 110 | return 1; 111 | } 112 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_lat/Makefile: -------------------------------------------------------------------------------- 1 | SRC = shared_lat.cu 2 | 3 | EXE = shared_lat.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk 8 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_lat/shared_lat.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (32 * 1024 ) // 32k 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | 18 | #ifndef ILPconfig 19 | #define ILPconfig 1 20 | #endif 21 | 22 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 23 | 24 | #define U32ACCESS 25 | 26 | // two way bank conflict - > 23 latenct 27 | // bank-conflict-free -> 25 latency 28 | 29 | #ifdef U32ACCESS 30 | typedef uint32_t shared_m; 31 | #else 32 | typedef uint64_t shared_m; 33 | #endif 34 | // two way bank conflict - > 23 latenct 35 | // bank-conflict-free -> 25 latency 36 | 37 | 38 | // Measure latency of ITERS reads. 
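// Reading aid for the kernel below: s[] is initialized so that
// s[i] = (i + stride) % size and each thread starts at
// p_chaser = threadIdx.x * stride, so every load's address is the value
// returned by the previous load. The ITERS loads therefore cannot overlap, and
//   latency ~= (stopClk - startClk) / ITERS   clocks per shared-memory load.
// The stride also sets the bank-conflict degree: with 32 banks of 4 bytes,
// words i and i + 32 share a bank, so stride = n (n = 1, 2, 4, ..., 32) makes
// n lanes of a warp hit the same bank each iteration, i.e. an n-way conflict.
// The serial dependence is simply
//   for (uint32_t i = 0; i < ITERS; ++i) p_chaser = s[p_chaser];
// exactly as written in the kernel.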
39 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 40 | shared_m *dsink, uint32_t stride) { 41 | 42 | // thread index 43 | uint32_t tid = threadIdx.x; 44 | uint32_t bid = blockIdx.x; 45 | uint32_t uid = bid * blockDim.x + tid; 46 | uint32_t n_threads = blockDim.x * gridDim.x; 47 | 48 | extern __shared__ int smem[]; // dynamic 49 | 50 | shared_m *s = (shared_m*)&smem[0]; 51 | 52 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 53 | 54 | if(uid == 0){ 55 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 56 | s[i] = (i + stride) % s_smem; //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 57 | } 58 | // one thread to initialize the pointer-chasing array 59 | // for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) 60 | // s[i] = (i + stride) % SHARED_MEM_SIZE; 61 | 62 | asm volatile("bar.sync 0;"); 63 | 64 | // if(uid == 0){ 65 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 66 | // printf("s[%d] = %d \t", i, s[i]); 67 | 68 | // } 69 | // printf("\n"); 70 | // } 71 | 72 | //if (uid == 0) { 73 | // initalize pointer chaser 74 | shared_m p_chaser = threadIdx.x * stride;; 75 | 76 | // start timing 77 | uint32_t start = 0; 78 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 79 | 80 | // pointer-chasing ITERS times 81 | for (uint32_t i = 0; i < ITERS; ++i) { 82 | p_chaser = s[p_chaser]; 83 | } 84 | 85 | // stop timing 86 | uint32_t stop = 0; 87 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 88 | 89 | // write time and data back to memory 90 | if(uid == 0){ 91 | startClk[uid] = start; 92 | stopClk[uid] = stop; 93 | dsink[uid] = p_chaser; 94 | } 95 | 96 | //} 97 | } 98 | 99 | 100 | // n-way bank conflict (n = 1,2,4,8...32) 101 | void bank_conflict_test(int n, int THREADS_NUM){ 102 | 103 | 104 | 105 | BLOCKS_NUM = 1; 106 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 107 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 108 | 109 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 110 | 111 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 112 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 113 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 114 | 115 | uint32_t *startClk_g; 116 | uint32_t *stopClk_g; 117 | shared_m *dsink_g; 118 | 119 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 120 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 121 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 122 | 123 | shared_lat<<<1, THREADS_NUM,SHARED_MEM_SIZE>>>(startClk_g, stopClk_g, dsink_g, n); 124 | gpuErrchk(cudaPeekAtLastError()); 125 | 126 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 127 | cudaMemcpyDeviceToHost)); 128 | gpuErrchk( 129 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 130 | gpuErrchk( 131 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 132 | 133 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 134 | 135 | //printf("Shared Memory Latency = %f cycles\n", lat); 136 | std::cout << n <<"-way bank conflict , " << THREADS_NUM/32 <<" warps, latency = " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 137 | 138 | long num_bytes = (THREADS_NUM) * 4; 139 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,4,8}; 161 | for(auto& e:warps){ 162 | bank_conflict_test(1, 32*e ); 163 | std::cout <<"***************************************"< 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | 
#define SHARED_MEM_SIZE (32 * 1024) // 32 KB in bytes 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 256 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | //#define U32ACCESS 18 | 19 | 20 | #ifndef ILPconfig 21 | #define ILPconfig 1 22 | #endif 23 | 24 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 25 | 26 | // two way bank conflict - > 23 latenct 27 | // bank-conflict-free -> 25 latency 28 | 29 | #ifdef U32ACCESS 30 | typedef uint32_t shared_m; 31 | #else 32 | typedef uint64_t shared_m; 33 | #endif 34 | 35 | // Measure latency of ITERS reads. 36 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 37 | shared_m *dsink, uint32_t stride) { 38 | 39 | // thread index 40 | uint32_t tid = threadIdx.x; 41 | uint32_t bid = blockIdx.x; 42 | uint32_t uid = bid * blockDim.x + tid; 43 | uint32_t n_threads = blockDim.x * gridDim.x; 44 | 45 | //__shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 46 | 47 | extern __shared__ int smem[]; // dynamic 48 | 49 | shared_m *s = (shared_m*)&smem[0]; 50 | 51 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 52 | 53 | // one thread to initialize the pointer-chasing array 54 | // for (uint32_t i = uid; i < (s_smem - stride); i += n_threads) 55 | // s[i] = (i + stride) % s_smem; 56 | 57 | if(uid == 0){ 58 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 59 | s[i] = (i + stride) % s_smem; //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 60 | } 61 | // 62 | asm volatile("bar.sync 0;"); 63 | 64 | // if(uid == 0){ 65 | // for(int i = 0; i < s_smem; i ++){ 66 | // printf("s[%d] = %d \t", i, int(s[i]) ); 67 | 68 | // } 69 | // printf("\n"); 70 | // } 71 | 72 | //if (uid == 0) { 73 | // initalize pointer chaser 74 | shared_m p_chaser = threadIdx.x*stride ; 75 | 76 | #ifdef U32ACCESS 77 | shared_m p_chaser_1 = threadIdx.x + 32; 78 | #endif 79 | // start timing 80 | uint32_t start = 0; 81 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 82 | 83 | // pointer-chasing ITERS times 84 | for (uint32_t i = 0; i < ITERS; ++i) { 85 | p_chaser = s[p_chaser]; 86 | 87 | #ifdef U32ACCESS 88 | p_chaser_1 =s[p_chaser_1]; 89 | #endif 90 | 91 | //p_chaser_1 =s[p_chaser_1]; 92 | //asm volatile("bar.sync 0;"); 93 | } 94 | 95 | // stop timing 96 | asm volatile("bar.sync 0;"); 97 | uint32_t stop = 0; 98 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 99 | 100 | // write time and data back to memory 101 | if(uid == 0){ 102 | startClk[uid] = start; 103 | stopClk[uid] = stop; 104 | dsink[uid] = p_chaser;// + p_chaser_1; 105 | #ifdef U32ACCESS 106 | dsink[uid] += p_chaser_1; 107 | #endif 108 | 109 | } 110 | 111 | //} 112 | } 113 | 114 | 115 | void test_with_different_thread(int stride, int THREADS_NUM){ 116 | //int n_warps = THREADS_NUM/32; 117 | BLOCKS_NUM = 1; 118 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 119 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 120 | 121 | // if( n_warps == 8 ){ 122 | // #define SHARED_MEM_SIZE (16 * 1024) 123 | // }else{ 124 | // #define SHARED_MEM_SIZE (32 * 1024) 125 | // } 126 | 127 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 128 | 129 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 130 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 131 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 132 | 133 | uint32_t *startClk_g; 134 | uint32_t *stopClk_g; 135 | shared_m 
*dsink_g; 136 | 137 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 138 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 139 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 140 | 141 | shared_lat<<<1, THREADS_NUM, SHARED_MEM_SIZE>>>(startClk_g, stopClk_g, dsink_g, stride); 142 | gpuErrchk(cudaPeekAtLastError()); 143 | 144 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 145 | cudaMemcpyDeviceToHost)); 146 | gpuErrchk( 147 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 148 | gpuErrchk( 149 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 150 | 151 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 152 | 153 | std::cout << THREADS_NUM/32 <<" warps Shared Memory read(8B/t) latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 154 | 155 | long num_bytes = (THREADS_NUM) * 8; 156 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (32*1024) //(32 * 1024 ) // in bytes 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | #ifndef ILPconfig 18 | #define ILPconfig 1 19 | #endif 20 | 21 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 22 | 23 | // two way bank conflict - > 23 latenct 24 | // bank-conflict-free -> 25 latency 25 | 26 | #define U32ACCESS 27 | 28 | // two way bank conflict - > 23 latenct 29 | // bank-conflict-free -> 25 latency 30 | 31 | #ifdef U32ACCESS 32 | typedef uint32_t shared_m; 33 | #else 34 | typedef uint64_t shared_m; 35 | #endif 36 | 37 | //typedef uint32_t shared_m; 38 | 39 | // typedef uint64_t shared_m; 40 | // Measure latency of ITERS reads. 
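// Variant note: unlike shared_lat, this kernel keeps four independent chase
// chains per thread (p_chaser .. p_chaser_3, spaced 32 elements apart), so up
// to four 4-byte shared loads per thread can be in flight each iteration,
// i.e. 16 bytes/thread/iteration -- matching num_bytes = THREADS_NUM * 16 in
// the host code. The per-iteration time therefore measures how well
// independent requests overlap in the load/store pipeline rather than a
// single load's raw latency.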
41 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 42 | shared_m *dsink, uint32_t stride) { 43 | 44 | // thread index 45 | uint32_t tid = threadIdx.x; 46 | uint32_t bid = blockIdx.x; 47 | uint32_t uid = bid * blockDim.x + tid; 48 | uint32_t n_threads = blockDim.x * gridDim.x; 49 | 50 | extern __shared__ int smem[]; // dynamic 51 | 52 | shared_m *s = (shared_m*)&smem[0]; 53 | 54 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 55 | 56 | // // one thread to initialize the pointer-chasing array 57 | // for (uint32_t i = uid; i < (s_smem - stride); i += n_threads) 58 | // s[i] = (i + stride) % s_smem; 59 | if(uid == 0){ 60 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 61 | s[i] = shared_m((i + stride) % s_smem); //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 62 | } 63 | // 64 | asm volatile("bar.sync 0;"); 65 | 66 | // if(uid == 0){ 67 | // for(int i = 0; i < s_smem; i ++){ 68 | // printf("s[%d] = %d \t", i, int(s[i]) ); 69 | 70 | // } 71 | // printf("\n"); 72 | // } 73 | 74 | //if (uid == 0) { 75 | // initalize pointer chaser 76 | shared_m p_chaser = threadIdx.x * stride; 77 | //p_chaser = static_cast(__cvta_generic_to_shared(&s[p_chaser])); 78 | shared_m p_chaser_1 = threadIdx.x * stride + 32; 79 | 80 | 81 | #ifdef U32ACCESS 82 | shared_m p_chaser_2 = threadIdx.x * stride + 64; 83 | shared_m p_chaser_3 = threadIdx.x * stride + 96; 84 | #endif 85 | 86 | 87 | // start timing 88 | uint32_t start = 0; 89 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 90 | 91 | // pointer-chasing ITERS times 92 | //#pragma unroll 93 | for (uint32_t i = 0; i < ITERS; ++i) { 94 | p_chaser = s[p_chaser]; // ld.shared.u64 %0, [%1];. 95 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser) : "r"(p_chaser*4) ); 96 | 97 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_1) : "r"(p_chaser_1*4) ); 98 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_2) : "r"(p_chaser_2*4) ); 99 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_3) : "r"(p_chaser_3*4) ); 100 | 101 | p_chaser_1 =s[p_chaser_1]; 102 | 103 | #ifdef U32ACCESS 104 | p_chaser_2 =s[p_chaser_2]; 105 | p_chaser_3 =s[p_chaser_3]; 106 | #endif 107 | // p_chaser_2 =s[p_chaser_2]; 108 | // p_chaser_3 =s[p_chaser_3]; 109 | 110 | 111 | 112 | 113 | // p_chaser_1 =s[p_chaser_1]; 114 | // p_chaser_2 =s[p_chaser_2]; 115 | // p_chaser_3 =s[p_chaser_3]; 116 | //asm volatile("bar.sync 0;"); 117 | } 118 | 119 | // stop timing 120 | asm volatile("bar.sync 0;"); 121 | uint32_t stop = 0; 122 | 123 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 124 | 125 | // write time and data back to memory 126 | if(uid == 0){ 127 | startClk[uid] = start; 128 | stopClk[uid] = stop; 129 | dsink[uid] = p_chaser + p_chaser_1; // + p_chaser_2 + p_chaser_3; 130 | 131 | #ifdef U32ACCESS 132 | dsink[uid] += (p_chaser_2 + p_chaser_3); 133 | #endif 134 | } 135 | 136 | //} 137 | } 138 | 139 | void test_with_different_thread(int stride, int THREADS_NUM){ 140 | BLOCKS_NUM = 1; 141 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 142 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 143 | 144 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 145 | 146 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 147 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 148 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 149 | 150 | uint32_t *startClk_g; 151 | uint32_t *stopClk_g; 152 | shared_m *dsink_g; 153 | 154 | gpuErrchk(cudaMalloc(&startClk_g, 
sizeof(uint32_t))); 155 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 156 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 157 | 158 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, stride); 159 | gpuErrchk(cudaPeekAtLastError()); 160 | 161 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 162 | cudaMemcpyDeviceToHost)); 163 | gpuErrchk( 164 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 165 | gpuErrchk( 166 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 167 | 168 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 169 | 170 | std::cout << THREADS_NUM/32 <<" warps Shared Memory read(16B/t) latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 171 | 172 | long num_bytes = (THREADS_NUM) * 16; 173 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (32*1024) //(32 * 1024 ) // in bytes 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | 18 | #ifndef ILPconfig 19 | #define ILPconfig 1 20 | #endif 21 | 22 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 23 | 24 | 25 | // two way bank conflict - > 23 latenct 26 | // bank-conflict-free -> 25 latency 27 | 28 | #define U32ACCESS 29 | 30 | // two way bank conflict - > 23 latenct 31 | // bank-conflict-free -> 25 latency 32 | 33 | #ifdef U32ACCESS 34 | typedef uint32_t shared_m; 35 | #else 36 | typedef uint64_t shared_m; 37 | #endif 38 | 39 | //typedef uint32_t shared_m; 40 | 41 | // typedef uint64_t shared_m; 42 | // Measure latency of ITERS reads. 
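// Variant note: shared_x8 extends the same idea to eight independent chains
// per thread (p_chaser .. p_chaser_7, again spaced 32 elements apart), so
// each iteration issues 8 * 4 = 32 bytes per thread; the host side computes
// throughput from num_bytes = THREADS_NUM * 32 accordingly (the "(16B/t)"
// text in the message printed later appears to be carried over from the x4
// variant, while the arithmetic uses 32 bytes per thread).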
43 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 44 | shared_m *dsink, uint32_t stride) { 45 | 46 | // thread index 47 | uint32_t tid = threadIdx.x; 48 | uint32_t bid = blockIdx.x; 49 | uint32_t uid = bid * blockDim.x + tid; 50 | uint32_t n_threads = blockDim.x * gridDim.x; 51 | 52 | extern __shared__ int smem[]; // dynamic 53 | 54 | shared_m *s = (shared_m*)&smem[0]; 55 | 56 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 57 | 58 | // // one thread to initialize the pointer-chasing array 59 | // for (uint32_t i = uid; i < (s_smem - stride); i += n_threads) 60 | // s[i] = (i + stride) % s_smem; 61 | if(uid == 0){ 62 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 63 | s[i] = shared_m((i + stride) % s_smem); //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 64 | } 65 | // 66 | asm volatile("bar.sync 0;"); 67 | 68 | // if(uid == 0){ 69 | // for(int i = 0; i < s_smem; i ++){ 70 | // printf("s[%d] = %d \t", i, int(s[i]) ); 71 | 72 | // } 73 | // printf("\n"); 74 | // } 75 | 76 | //if (uid == 0) { 77 | // initalize pointer chaser 78 | shared_m p_chaser = threadIdx.x * stride; 79 | //p_chaser = static_cast(__cvta_generic_to_shared(&s[p_chaser])); 80 | shared_m p_chaser_1 = threadIdx.x * stride + 32; 81 | 82 | 83 | 84 | shared_m p_chaser_2 = threadIdx.x * stride + 64; 85 | shared_m p_chaser_3 = threadIdx.x * stride + 96; 86 | 87 | 88 | 89 | #ifdef U32ACCESS 90 | shared_m p_chaser_4 = threadIdx.x * stride + 32*4; 91 | shared_m p_chaser_5 = threadIdx.x * stride + 32*5; 92 | shared_m p_chaser_6 = threadIdx.x * stride + 32*6; 93 | shared_m p_chaser_7 = threadIdx.x * stride + 32*7; 94 | #endif 95 | 96 | // start timing 97 | uint32_t start = 0; 98 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 99 | 100 | // pointer-chasing ITERS times 101 | //#pragma unroll 102 | for (uint32_t i = 0; i < ITERS; ++i) { 103 | p_chaser = s[p_chaser]; // ld.shared.u64 %0, [%1];. 
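// The commented-out asm lines below are an equivalent inline-PTX form of the
// same loads; they multiply the index by 4 because ld.shared takes a byte
// offset while p_chaser holds a 32-bit element index. With U32ACCESS defined,
// shared_m is uint32_t, so each chase here is a 32-bit shared-memory load.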
104 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser) : "r"(p_chaser*4) ); 105 | 106 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_1) : "r"(p_chaser_1*4) ); 107 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_2) : "r"(p_chaser_2*4) ); 108 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_3) : "r"(p_chaser_3*4) ); 109 | 110 | p_chaser_1 =s[p_chaser_1]; 111 | 112 | 113 | p_chaser_2 =s[p_chaser_2]; 114 | p_chaser_3 =s[p_chaser_3]; 115 | 116 | #ifdef U32ACCESS 117 | p_chaser_4 =s[p_chaser_4]; 118 | p_chaser_5 =s[p_chaser_5]; 119 | p_chaser_6 =s[p_chaser_6]; 120 | p_chaser_7 =s[p_chaser_7]; 121 | #endif 122 | // p_chaser_2 =s[p_chaser_2]; 123 | // p_chaser_3 =s[p_chaser_3]; 124 | 125 | 126 | 127 | 128 | // p_chaser_1 =s[p_chaser_1]; 129 | // p_chaser_2 =s[p_chaser_2]; 130 | // p_chaser_3 =s[p_chaser_3]; 131 | //asm volatile("bar.sync 0;"); 132 | } 133 | 134 | // stop timing 135 | asm volatile("bar.sync 0;"); 136 | uint32_t stop = 0; 137 | 138 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 139 | 140 | // write time and data back to memory 141 | if(uid == 0){ 142 | startClk[uid] = start; 143 | stopClk[uid] = stop; 144 | dsink[uid] = p_chaser + p_chaser_1 +p_chaser_2 + p_chaser_3 ; // + p_chaser_2 + p_chaser_3; 145 | 146 | #ifdef U32ACCESS 147 | dsink[uid] += (p_chaser_4 + p_chaser_5 +p_chaser_6 + p_chaser_7); 148 | #endif 149 | } 150 | 151 | //} 152 | } 153 | 154 | void test_with_different_thread(int stride, int THREADS_NUM){ 155 | BLOCKS_NUM = 1; 156 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 157 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 158 | 159 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 160 | 161 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 162 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 163 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 164 | 165 | uint32_t *startClk_g; 166 | uint32_t *stopClk_g; 167 | shared_m *dsink_g; 168 | 169 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 170 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 171 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 172 | 173 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, stride); 174 | gpuErrchk(cudaPeekAtLastError()); 175 | 176 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 177 | cudaMemcpyDeviceToHost)); 178 | gpuErrchk( 179 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 180 | gpuErrchk( 181 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 182 | 183 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 184 | 185 | std::cout << THREADS_NUM/32 <<" warps Shared Memory read(16B/t) latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 186 | 187 | long num_bytes = (THREADS_NUM) * 32; 188 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < 2 | using namespace std; 3 | 4 | #include "../../../hw_def/hw_def.h" 5 | 6 | 7 | 8 | int main() { 9 | intilizeDeviceProp(0); 10 | 11 | printf("Shared memory per multiprocessor = %lu bytes\n", 12 | deviceProp.sharedMemPerMultiprocessor); 13 | 14 | printf("Shared memory per block = %lu bytes\n", deviceProp.sharedMemPerBlock); 15 | 16 | if (ACCEL_SIM_MODE) { 17 | 18 | //std::cout << "\n//Accel_Sim config: \n"; 19 | std::cout << " deviceProp.maxThreadsPerBlock = " << deviceProp.maxThreadsPerBlock< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../../hw_def/hw_def.h" 9 | 10 | // #define 
SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB 11 | // Launch only one thread to calcaulte the latency using a pointer-chasing 12 | // array technique 13 | //#define THREADS_NUM 32 14 | // iterate over the array ITERS times 15 | #ifndef ITERS 16 | #define ITERS (1024 ) 17 | #endif 18 | 19 | 20 | 21 | #ifndef ILPconfig 22 | #define ILPconfig 1 23 | #endif 24 | 25 | 26 | static_assert(ILPconfig<=5," ILP>5 is not implemented\n"); 27 | 28 | 29 | 30 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, float *res, 31 | uint32_t strid) { // strid set to 0 used to prevent optimization 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t gid = blockIdx.x * blockDim.x + tid; 35 | uint32_t warpid = gid / warpSize; 36 | 37 | a = a + warpid * 16*16; // m*k = 16*16 38 | b = b + warpid * 8*16; // n*k = 8*16 39 | res = res + warpid * 16*8;// m*n = 16*16 40 | 41 | /** step 1: create register for each thread **/ 42 | __nv_bfloat16 frag_A[8*ILPconfig]; // two .f16x2 registers, 8 half elements, 43 | __nv_bfloat16 frag_B[4*ILPconfig]; // one .f16x2 registers, 4 half elements 44 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 45 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 46 | for(int i = 0;i<8 * ILPconfig;i++){ 47 | frag_A[i] = a[i + lane_id()*8]; 48 | 49 | } 50 | for(int i =0;i<4 * ILPconfig;i++){ 51 | frag_B[i] = b[i + lane_id()*4]; 52 | frag_D[i] = 0.0f; 53 | } 54 | 55 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 56 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 57 | float *C = reinterpret_cast(&frag_D[0]); 58 | float *D = C; // D = A*B + D. 59 | 60 | // float fpuA = frag_A[0]; 61 | // float fpuB = frag_B[0]; 62 | float fpuC = frag_D[0]; 63 | 64 | // int intA = threadIdx.x; 65 | // int intB = threadIdx.x + 1; 66 | int intC = threadIdx.x + 2; 67 | 68 | uint64_t start = 0; 69 | uint64_t stop = 0; 70 | // synchronize all threads 71 | asm volatile("bar.sync 0;"); 72 | // start timing 73 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 74 | //#pragma unroll 75 | for (int j = 0; j < ITERS; ++j) { 76 | asm volatile( 77 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 78 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 79 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 80 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 81 | "r"(B[0]), "r"(B[1]), 82 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 83 | ); 84 | #if ILPconfig >= 2 85 | asm volatile( 86 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 87 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 88 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 89 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 90 | "r"(B[2]), "r"(B[3]), 91 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 92 | ); 93 | #endif 94 | #if ILPconfig >= 3 95 | asm volatile( 96 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 97 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 98 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 99 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 100 | "r"(B[4]), "r"(B[5]), 101 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 102 | ); 103 | #endif 104 | #if ILPconfig >= 4 105 | asm volatile( 106 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 107 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 108 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 109 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 110 | "r"(B[6]), "r"(B[7]), 111 | 
"f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 112 | ); 113 | #endif 114 | #if ILPconfig >= 5 115 | asm volatile( 116 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 117 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 118 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 119 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 120 | "r"(B[8]), "r"(B[9]), 121 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 122 | ); 123 | #endif 124 | __syncwarp(); 125 | } 126 | // stop timing 127 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 128 | for(int i=0; i < 4*ILPconfig;i++){ 129 | res[i] += frag_D[i]; 130 | 131 | res[i] += fpuC; 132 | res[i] += intC; 133 | } 134 | 135 | //res[0] += fpuC; 136 | startClk[gid] = start; 137 | stopClk[gid] = stop; 138 | } 139 | 140 | 141 | template 142 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 143 | intilizeDeviceProp(0); 144 | 145 | int BLOCKS_NUM = 1; 146 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 147 | int WARP_SIZE = 32; 148 | 149 | unsigned total_A_SIZE = 150 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 151 | unsigned total_B_SIZE = 152 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 153 | unsigned total_R_SIZE = 154 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 155 | 156 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 157 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 158 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 159 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 160 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 161 | 162 | uint64_t *startClk_g; 163 | uint64_t *stopClk_g; 164 | T *data1_g; 165 | T *data2_g; 166 | R *res_g; 167 | 168 | for (uint32_t i = 0; i < 16*8; i++) { 169 | data1[i] = (T)i; 170 | } 171 | 172 | for (uint32_t i = 0; i < 8*8; i++) { 173 | data2[i] = (T)i; 174 | } 175 | 176 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 177 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 178 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 179 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 180 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 181 | 182 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 183 | cudaMemcpyHostToDevice)); 184 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 185 | cudaMemcpyHostToDevice)); 186 | 187 | mma_ubench<<>>( 188 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 189 | gpuErrchk(cudaPeekAtLastError()); 190 | 191 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 192 | cudaMemcpyDeviceToHost)); 193 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 194 | cudaMemcpyDeviceToHost)); 195 | gpuErrchk( 196 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 197 | 198 | float mma_bw, fma_bw; 199 | uint64_t total_time = 200 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 201 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 202 | 203 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 204 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 205 | 206 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 207 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 208 | // (float)total_time; 209 | fma_bw = ((float)(ITERS * 16 * 8 * 16 * 
ILPconfig * //0 * 210 | (TOTAL_THREADS / WARP_SIZE))) / 211 | (float)total_time; 212 | 213 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 214 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 215 | std::cout << "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 216 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 217 | 218 | std::cout << "Total Clk number = " << total_time << "\n"; 219 | 220 | if (report_fma_bw) 221 | return fma_bw; 222 | else 223 | return mma_bw; 224 | } 225 | 226 | int main() { 227 | std::vector warps = {1,2,4,6,8,12,16,32}; 228 | intilizeDeviceProp(0); 229 | std::cout<<"***********************************"<5 is not implemented\n"); 27 | 28 | 29 | 30 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, half *a, half *b, float *res, 31 | uint32_t strid) { // strid set to 0 used to prevent optimization 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t gid = blockIdx.x * blockDim.x + tid; 35 | uint32_t warpid = gid / warpSize; 36 | 37 | a = a + warpid * 16*16; // m*k = 16*16 38 | b = b + warpid * 8*16; // n*k = 8*16 39 | res = res + warpid * 16*8;// m*n = 16*16 40 | 41 | /** step 1: create register for each thread **/ 42 | half frag_A[8*ILPconfig]; // two .f16x2 registers, 8 half elements, 43 | half frag_B[4*ILPconfig]; // one .f16x2 registers, 4 half elements 44 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 45 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 46 | for(int i = 0;i<8 * ILPconfig;i++){ 47 | frag_A[i] = a[i + lane_id()*8]; 48 | 49 | } 50 | for(int i =0;i<4 * ILPconfig;i++){ 51 | frag_B[i] = b[i + lane_id()*4]; 52 | frag_D[i] = 0.0f; 53 | } 54 | 55 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 56 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 57 | float *C = reinterpret_cast(&frag_D[0]); 58 | float *D = C; // D = A*B + D. 
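// Fragment layout for this mma.m16n8k16 (f16 inputs, f32 accumulate), per
// thread of the warp: A holds 8 halves in four .b32 registers, B holds 4
// halves in two .b32 registers, and C/D are four .f32 registers. Because D
// aliases C, each iteration's mma consumes the previous result, so the ITERS
// instructions of one chain serialize and latency ~= total_time / ITERS; the
// extra chains enabled by ILPconfig use disjoint registers and can issue
// back to back, turning the same loop into a throughput test. The host-side
// FMA bandwidth assumes 16 * 8 * 16 = 2048 multiply-adds per mma per warp.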
59 | 60 | // float fpuA = frag_A[0]; 61 | // float fpuB = frag_B[0]; 62 | float fpuC = frag_D[0]; 63 | 64 | // int intA = threadIdx.x; 65 | // int intB = threadIdx.x + 1; 66 | int intC = threadIdx.x + 2; 67 | 68 | uint64_t start = 0; 69 | uint64_t stop = 0; 70 | // synchronize all threads 71 | asm volatile("bar.sync 0;"); 72 | // start timing 73 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 74 | //#pragma unroll 75 | for (int j = 0; j < ITERS; ++j) { 76 | asm volatile( 77 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 78 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 79 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 80 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 81 | "r"(B[0]), "r"(B[1]), 82 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 83 | ); 84 | #if ILPconfig >= 2 85 | asm volatile( 86 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 87 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 88 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 89 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 90 | "r"(B[2]), "r"(B[3]), 91 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 92 | ); 93 | #endif 94 | #if ILPconfig >= 3 95 | asm volatile( 96 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 97 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 98 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 99 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 100 | "r"(B[4]), "r"(B[5]), 101 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 102 | ); 103 | #endif 104 | #if ILPconfig >= 4 105 | asm volatile( 106 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 107 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 108 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 109 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 110 | "r"(B[6]), "r"(B[7]), 111 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 112 | ); 113 | #endif 114 | #if ILPconfig >= 5 115 | asm volatile( 116 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 117 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 118 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 119 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 120 | "r"(B[8]), "r"(B[9]), 121 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 122 | ); 123 | #endif 124 | __syncwarp(); 125 | } 126 | // stop timing 127 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 128 | for(int i=0; i < 4*ILPconfig;i++){ 129 | res[i] += frag_D[i]; 130 | 131 | res[i] += fpuC; 132 | res[i] += intC; 133 | } 134 | 135 | //res[0] += fpuC; 136 | startClk[gid] = start; 137 | stopClk[gid] = stop; 138 | } 139 | 140 | 141 | template 142 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 143 | intilizeDeviceProp(0); 144 | 145 | int BLOCKS_NUM = 1; 146 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 147 | int WARP_SIZE = 32; 148 | 149 | unsigned total_A_SIZE = 150 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 151 | unsigned total_B_SIZE = 152 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 153 | unsigned total_R_SIZE = 154 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 155 | 156 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 157 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 158 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 159 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 160 | R *res = (R 
*)malloc(total_R_SIZE * sizeof(R)); 161 | 162 | uint64_t *startClk_g; 163 | uint64_t *stopClk_g; 164 | T *data1_g; 165 | T *data2_g; 166 | R *res_g; 167 | 168 | for (uint32_t i = 0; i < 16*8; i++) { 169 | data1[i] = (T)i; 170 | } 171 | 172 | for (uint32_t i = 0; i < 8*8; i++) { 173 | data2[i] = (T)i; 174 | } 175 | 176 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 177 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 178 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 179 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 180 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 181 | 182 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 183 | cudaMemcpyHostToDevice)); 184 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 185 | cudaMemcpyHostToDevice)); 186 | 187 | mma_ubench<<>>( 188 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 189 | gpuErrchk(cudaPeekAtLastError()); 190 | 191 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 192 | cudaMemcpyDeviceToHost)); 193 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 194 | cudaMemcpyDeviceToHost)); 195 | gpuErrchk( 196 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 197 | 198 | float mma_bw, fma_bw; 199 | uint64_t total_time = 200 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 201 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 202 | 203 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 204 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 205 | 206 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 207 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 208 | // (float)total_time; 209 | fma_bw = ((float)(ITERS * 16 * 8 * 16 * ILPconfig * //0 * 210 | (TOTAL_THREADS / WARP_SIZE))) / 211 | (float)total_time; 212 | 213 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 214 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 215 | std::cout << "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 216 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 217 | 218 | std::cout << "Total Clk number = " << total_time << "\n"; 219 | 220 | if (report_fma_bw) 221 | return fma_bw; 222 | else 223 | return mma_bw; 224 | } 225 | 226 | int main() { 227 | std::vector warps = {1,2,4,6,8,12,16,32}; 228 | intilizeDeviceProp(0); 229 | std::cout<<"***********************************"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../../hw_def/hw_def.h" 9 | 10 | // #define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB 11 | // Launch only one thread to calcaulte the latency using a pointer-chasing 12 | // array technique 13 | //#define THREADS_NUM 32 14 | // iterate over the array ITERS times 15 | #ifndef ITERS 16 | #define ITERS (1024 ) 17 | #endif 18 | 19 | 20 | 21 | 22 | #ifndef ILPconfig 23 | #define ILPconfig 1 24 | #endif 25 | 26 | 27 | static_assert(ILPconfig<=6, "ILP>6 is not implemented\n"); 28 | 29 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, half *a, half *b, half *res, 30 | uint32_t strid) { // strid set to 0 used to prevent optimization 31 | // thread index 32 | uint32_t tid = threadIdx.x; 33 | uint32_t gid = blockIdx.x * blockDim.x + tid; 34 | uint32_t warpid = gid 
/ warpSize; 35 | 36 | a = a + warpid * 16*16; // m*k = 16*16 37 | b = b + warpid * 8*16; // n*k = 8*16 38 | res = res + warpid * 16*8;// m*n = 16*16 39 | 40 | /** step 1: create register for each thread **/ 41 | half frag_A[8*ILPconfig]; // two .f16x2 registers, 8 half elements, 42 | half frag_B[4*ILPconfig]; // one .f16x2 registers, 4 half elements 43 | half frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 44 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 45 | for(int i = 0;i<8 *ILPconfig ;i++){ 46 | frag_A[i] = a[i + lane_id()*8]; 47 | 48 | } 49 | for(int i =0;i<4 *ILPconfig ;i++){ 50 | frag_B[i] = b[i + lane_id()*4]; 51 | frag_D[i] = 0.0; 52 | } 53 | 54 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 55 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 56 | uint32_t *C = reinterpret_cast(&frag_D[0]); 57 | uint32_t *D = C; // D = A*B + D. 58 | 59 | float fpuA = frag_A[0]; 60 | float fpuB = frag_B[0]; 61 | float fpuC = frag_D[0]; 62 | 63 | int intA = threadIdx.x; 64 | int intB = threadIdx.x + 1; 65 | int intC = threadIdx.x + 2; 66 | 67 | uint64_t start = 0; 68 | uint64_t stop = 0; 69 | // synchronize all threads 70 | asm volatile("bar.sync 0;"); 71 | // start timing 72 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 73 | //#pragma unroll 74 | for (int j = 0; j < ITERS; ++j) { 75 | asm volatile( 76 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 77 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 78 | : "=r"(D[0]), "=r"(D[1]) 79 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 80 | "r"(B[0]), "r"(B[1]), 81 | "r"(C[0]), "r"(C[1]) 82 | ); // input C operand will use output operand D. 83 | #if ILPconfig >= 2 84 | asm volatile( 85 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 86 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 87 | : "=r"(D[2]), "=r"(D[3]) 88 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 89 | "r"(B[2]), "r"(B[3]), 90 | "r"(C[2]), "r"(C[3]) 91 | ); // input C operand will use output operand D. 92 | #endif 93 | 94 | #if ILPconfig >= 3 95 | asm volatile( 96 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 97 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 98 | : "=r"(D[4]), "=r"(D[5]) 99 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 100 | "r"(B[4]), "r"(B[5]), 101 | "r"(C[4]), "r"(C[5]) 102 | ); // input C operand will use output operand D. 103 | #endif 104 | #if ILPconfig >= 4 105 | asm volatile( 106 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 107 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 108 | : "=r"(D[6]), "=r"(D[7]) 109 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 110 | "r"(B[6]), "r"(B[7]), 111 | "r"(C[6]), "r"(C[7]) 112 | ); // input C operand will use output operand D. 113 | #endif 114 | 115 | #if ILPconfig >= 5 116 | asm volatile( 117 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 118 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 119 | : "=r"(D[8]), "=r"(D[9]) 120 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 121 | "r"(B[8]), "r"(B[9]), 122 | "r"(C[8]), "r"(C[9]) 123 | ); // input C operand will use output operand D. 124 | #endif 125 | #if ILPconfig >= 6 126 | asm volatile( 127 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 128 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 129 | : "=r"(D[10]), "=r"(D[11]) 130 | : "r"(A[20]), "r"(A[21]), "r"(A[22]), "r"(A[23]), 131 | "r"(B[10]), "r"(B[11]), 132 | "r"(C[10]), "r"(C[11]) 133 | ); // input C operand will use output operand D. 
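// In this all-f16 variant the accumulator is also half precision, so C and D
// occupy two .b32 registers (four packed halves) per thread instead of the
// four .f32 registers used by the f32-accumulate kernels; the dependency
// chain through D and the 16 * 8 * 16 FMA-per-warp accounting are otherwise
// the same.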
134 | #endif 135 | __syncwarp(); 136 | } 137 | // synchronize all threads 138 | asm volatile("bar.sync 0;"); 139 | // stop timing 140 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 141 | // avoid undeserable optimization 142 | for(int i=0; i < 4 * ILPconfig;i++){ 143 | res[i] = frag_D[i]; 144 | 145 | res[i] += fpuC; 146 | res[i] += intC; 147 | } 148 | 149 | //res[0] += fpuC; 150 | startClk[gid] = start; 151 | stopClk[gid] = stop; 152 | } 153 | 154 | 155 | template 156 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 157 | intilizeDeviceProp(0); 158 | 159 | int BLOCKS_NUM = 1; 160 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 161 | int WARP_SIZE = 32; 162 | 163 | unsigned total_A_SIZE = 164 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 165 | unsigned total_B_SIZE = 166 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 167 | unsigned total_R_SIZE = 168 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 169 | 170 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 171 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 172 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 173 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 174 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 175 | 176 | uint64_t *startClk_g; 177 | uint64_t *stopClk_g; 178 | T *data1_g; 179 | T *data2_g; 180 | R *res_g; 181 | 182 | for (uint32_t i = 0; i < 16*8; i++) { 183 | data1[i] = (T)i; 184 | } 185 | 186 | for (uint32_t i = 0; i < 8*8; i++) { 187 | data2[i] = (T)i; 188 | } 189 | 190 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 191 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 192 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 193 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 194 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 195 | 196 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 197 | cudaMemcpyHostToDevice)); 198 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 199 | cudaMemcpyHostToDevice)); 200 | 201 | mma_ubench<<>>( 202 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 203 | gpuErrchk(cudaPeekAtLastError()); 204 | 205 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 206 | cudaMemcpyDeviceToHost)); 207 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 208 | cudaMemcpyDeviceToHost)); 209 | gpuErrchk( 210 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 211 | 212 | float mma_bw, fma_bw; 213 | uint64_t total_time = 214 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 215 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 216 | 217 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 218 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 219 | 220 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 221 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 222 | // (float)total_time; 223 | fma_bw = ((float)(ITERS * 16 * 8 * 16 * ILPconfig * //0 * 224 | (TOTAL_THREADS / WARP_SIZE))) / 225 | (float)total_time; 226 | 227 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 228 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 229 | std::cout << "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 latency " << 
(float)total_time/(float)ITERS << " cycles\n"; 230 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 231 | 232 | std::cout << "Total Clk number = " << total_time << "\n"; 233 | 234 | if (report_fma_bw) 235 | return fma_bw; 236 | else 237 | return mma_bw; 238 | } 239 | 240 | int main() { 241 | intilizeDeviceProp(0); 242 | std::cout<<"***********************************"< 8 is not implemented\n"); 25 | 26 | 27 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, char *a, char *b, int *res, 28 | uint32_t strid) { // strid set to 0 used to prevent optimization 29 | // thread index 30 | uint32_t tid = threadIdx.x; 31 | uint32_t gid = blockIdx.x * blockDim.x + tid; 32 | uint32_t warpid = gid / warpSize; 33 | 34 | a = a + warpid * 16*32; // m*k = 16*32 35 | b = b + warpid * 8*32; // n*k = 8*32 36 | res = res + warpid * 16*8;// m*n = 16*8 37 | 38 | /** step 1: create register for each thread **/ 39 | char frag_A[16*ILPconfig]; // four int8 registers, 40 | char frag_B[8*ILPconfig]; // one .f16x2 registers, 2 half elements 41 | int frag_D[4*ILPconfig]; //result(fp32) 2 f32 registers 42 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 43 | for(int i = 0;i<16*ILPconfig;i++){ 44 | frag_A[i] = a[i + lane_id()*16]; 45 | 46 | } 47 | for(int i =0;i<8*ILPconfig;i++){ 48 | frag_B[i] = b[i + lane_id()*8]; 49 | //frag_D[i] = 0.0f; 50 | } 51 | for(int i =0;i<4*ILPconfig;i++){ 52 | //frag_B[i] = b[i + lane_id()*4]; 53 | frag_D[i] = 0; 54 | } 55 | 56 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 57 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 58 | int *C = reinterpret_cast(&frag_D[0]); 59 | int *D = C; // D = A*B + D. 60 | 61 | 62 | uint64_t start = 0; 63 | uint64_t stop = 0; 64 | // synchronize all threads 65 | asm volatile("bar.sync 0;"); 66 | // start timing 67 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 68 | //#pragma unroll 69 | for (int j = 0; j < ITERS; ++j) { 70 | asm volatile( 71 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 72 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 73 | : "=r"(D[0]), "=r"(D[1]) , "=r"(D[2]), "=r"(D[3]) 74 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 75 | "r"(B[0]), "r"(B[1]), 76 | "r"(C[0]), "r"(C[1]) ,"r"(C[2]), "r"(C[3]) 77 | ); 78 | 79 | #if ILPconfig >= 2 80 | asm volatile( 81 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 82 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 83 | : "=r"(D[4]), "=r"(D[5]) , "=r"(D[6]), "=r"(D[7]) 84 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 85 | "r"(B[2]), "r"(B[3]), 86 | "r"(C[4]), "r"(C[5]) ,"r"(C[6]), "r"(C[7]) 87 | ); 88 | #endif 89 | 90 | #if ILPconfig >= 3 91 | asm volatile( 92 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 93 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 94 | : "=r"(D[8]), "=r"(D[9]) , "=r"(D[10]), "=r"(D[11]) 95 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 96 | "r"(B[4]), "r"(B[5]), 97 | "r"(C[8]), "r"(C[9]) ,"r"(C[10]), "r"(C[11]) 98 | ); 99 | #endif 100 | #if ILPconfig >= 4 101 | asm volatile( 102 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 103 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 104 | : "=r"(D[12]), "=r"(D[13]) , "=r"(D[14]), "=r"(D[15]) 105 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 106 | "r"(B[6]), "r"(B[7]), 107 | "r"(C[12]), "r"(C[13]) ,"r"(C[14]), "r"(C[15]) 108 | ); 109 | #endif 110 | 111 | #if ILPconfig >= 5 112 | asm volatile( 113 | 
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 114 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 115 | : "=r"(D[16]), "=r"(D[17]) , "=r"(D[18]), "=r"(D[19]) 116 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 117 | "r"(B[8]), "r"(B[9]), 118 | "r"(C[16]), "r"(C[17]) ,"r"(C[18]), "r"(C[19]) 119 | ); 120 | #endif 121 | 122 | #if ILPconfig >= 6 123 | asm volatile( 124 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 125 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 126 | : "=r"(D[20]), "=r"(D[21]) , "=r"(D[22]), "=r"(D[23]) 127 | : "r"(A[20]), "r"(A[21]), "r"(A[22]), "r"(A[23]), 128 | "r"(B[10]), "r"(B[11]), 129 | "r"(C[20]), "r"(C[21]) ,"r"(C[22]), "r"(C[23]) 130 | ); 131 | #endif 132 | #if ILPconfig >= 7 133 | asm volatile( 134 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 135 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 136 | : "=r"(D[24]), "=r"(D[25]) , "=r"(D[26]), "=r"(D[27]) 137 | : "r"(A[24]), "r"(A[25]), "r"(A[26]), "r"(A[27]), 138 | "r"(B[12]), "r"(B[13]), 139 | "r"(C[24]), "r"(C[25]) ,"r"(C[26]), "r"(C[27]) 140 | ); 141 | #endif 142 | #if ILPconfig >= 8 143 | asm volatile( 144 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 145 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 146 | : "=r"(D[28]), "=r"(D[29]) , "=r"(D[30]), "=r"(D[31]) 147 | : "r"(A[28]), "r"(A[28]), "r"(A[30]), "r"(A[31]), 148 | "r"(B[14]), "r"(B[15]), 149 | "r"(C[28]), "r"(C[29]) ,"r"(C[30]), "r"(C[31]) 150 | ); 151 | #endif 152 | 153 | __syncwarp(); 154 | 155 | } 156 | 157 | // synchronize warps 158 | 159 | // stop timing 160 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");//around 1 cycle overhead 161 | for(int i=0; i < 4*ILPconfig;i++){ 162 | res[i] = frag_D[i]; 163 | 164 | } 165 | 166 | //res[0] += fpuC; 167 | startClk[gid] = start; 168 | stopClk[gid] = stop; 169 | } 170 | 171 | 172 | template 173 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 174 | intilizeDeviceProp(0); 175 | 176 | int BLOCKS_NUM = 1; 177 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 178 | int WARP_SIZE = 32; 179 | 180 | unsigned total_A_SIZE = 181 | 16*32 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 182 | unsigned total_B_SIZE = 183 | 8*32 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 184 | unsigned total_R_SIZE = 185 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 186 | 187 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 188 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 189 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 190 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 191 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 192 | 193 | uint64_t *startClk_g; 194 | uint64_t *stopClk_g; 195 | T *data1_g; 196 | T *data2_g; 197 | R *res_g; 198 | 199 | for (uint32_t i = 0; i < 16*32; i++) { 200 | data1[i] = (T)i; 201 | } 202 | 203 | for (uint32_t i = 0; i < 8*32; i++) { 204 | data2[i] = (T)i; 205 | } 206 | 207 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 208 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 209 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 210 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 211 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 212 | 213 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 214 | cudaMemcpyHostToDevice)); 215 | gpuErrchk(cudaMemcpy(data2_g, 
data2, total_B_SIZE * sizeof(T), 216 | cudaMemcpyHostToDevice)); 217 | 218 | mma_ubench<<>>( 219 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 220 | gpuErrchk(cudaPeekAtLastError()); 221 | 222 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 223 | cudaMemcpyDeviceToHost)); 224 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 225 | cudaMemcpyDeviceToHost)); 226 | gpuErrchk( 227 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 228 | 229 | float mma_bw, fma_bw; 230 | uint64_t total_time = 231 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 232 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 233 | 234 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 235 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 236 | 237 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 238 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 239 | // (float)total_time; 240 | fma_bw = ((float)(ITERS * 16 * 8 * 32 * ILPconfig * //0 * 241 | (TOTAL_THREADS / WARP_SIZE))) / 242 | (float)total_time; 243 | 244 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 245 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 246 | std::cout << "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 247 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 248 | 249 | std::cout << "Total Clk number = " << total_time << "\n"; 250 | 251 | if (report_fma_bw) 252 | return fma_bw; 253 | else 254 | return mma_bw; 255 | } 256 | 257 | int main() { 258 | intilizeDeviceProp(0); 259 | std::cout<<"***********************************"< 6 28 | static_assert(0,"ILP > 6 is not supported\n"); 29 | #endif 30 | 31 | 32 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, float *res, 33 | uint32_t strid) { // strid set to 0 used to prevent optimization 34 | // thread index 35 | uint32_t tid = threadIdx.x; 36 | uint32_t gid = blockIdx.x * blockDim.x + tid; 37 | uint32_t warpid = gid / warpSize; 38 | 39 | a = a + warpid * 16*4; // m*k = 16*16 40 | b = b + warpid * 8*4; // n*k = 8*16 41 | res = res + warpid * 16*8;// m*n = 16*16 42 | 43 | /** step 1: create register for each thread **/ 44 | float frag_A[2*ILPconfig]; // two .f16x2 registers, 8 half elements, 45 | float frag_B[1*ILPconfig]; // one .f16x2 registers, 4 half elements 46 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 47 | 48 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 49 | for(int i = 0;i<2*ILPconfig;i++){ 50 | frag_A[i] = a[i + lane_id()*4]; 51 | //frag_A_ILP2[i] = a[i + lane_id()*4] + 1; 52 | 53 | } 54 | for(int i =0;i<1*ILPconfig;i++){ 55 | frag_B[i] = b[i + lane_id()*1]; 56 | //frag_B_ILP2[i] = b[i + lane_id()*1] + 1; 57 | } 58 | 59 | 60 | for(int i =0;i<4*ILPconfig;i++){ 61 | //frag_B[i] = b[i + lane_id()*4]; 62 | frag_D[i] = 0.0f; 63 | //frag_D_ILP2[i] = 0.0f; 64 | } 65 | 66 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 67 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 68 | float *C = reinterpret_cast(&frag_D[0]); 69 | float *D = C; // D = A*B + D. 
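// Register view: frag_A/frag_B hold tf32 values in float storage and are re-viewed as
// 32-bit registers to satisfy the "r" constraints of the A/B operands, while the f32
// accumulator keeps its "f" constraints. Per thread, each m16n8k4 tf32 mma consumes
// 2 A registers, 1 B register and 4 C/D registers, which is why ILP level n below uses
// A[2n-2..2n-1], B[n-1] and D[4n-4..4n-1].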
70 | 71 | 72 | // float fpuA = frag_A[0]; 73 | // float fpuB = frag_B[0]; 74 | float fpuC = frag_D[0]; 75 | 76 | // int intA = threadIdx.x; 77 | // int intB = threadIdx.x + 1; 78 | int intC = threadIdx.x + 2; 79 | 80 | uint64_t start = 0; 81 | uint64_t stop = 0; 82 | // synchronize all threads 83 | asm volatile("bar.sync 0;"); 84 | // start timing 85 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 86 | //#pragma unroll 87 | for (int j = 0; j < ITERS; ++j) { 88 | asm volatile( 89 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 90 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 91 | : "r"(A[0]), "r"(A[1]), 92 | "r"(B[0]), 93 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 94 | ); 95 | 96 | #if ILPconfig >= 2 97 | asm volatile( 98 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 99 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 100 | : "r"(A[2]), "r"(A[3]), 101 | "r"(B[1]), 102 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 103 | ); 104 | #endif 105 | 106 | #if ILPconfig >= 3 107 | asm volatile( 108 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 109 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 110 | : "r"(A[4]), "r"(A[5]), 111 | "r"(B[2]), 112 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 113 | ); 114 | #endif 115 | 116 | #if ILPconfig >= 4 117 | asm volatile( 118 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 119 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 120 | : "r"(A[6]), "r"(A[7]), 121 | "r"(B[3]), 122 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 123 | ); 124 | #endif 125 | 126 | #if ILPconfig >= 5 127 | asm volatile( 128 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 129 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 130 | : "r"(A[8]), "r"(A[9]), 131 | "r"(B[4]), 132 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 133 | ); 134 | #endif 135 | 136 | #if ILPconfig >= 6 137 | asm volatile( 138 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 139 | : "=f"(D[20]), "=f"(D[21]), "=f"(D[22]), "=f"(D[23]) 140 | : "r"(A[10]), "r"(A[11]), 141 | "r"(B[5]), 142 | "f"(C[20]), "f"(C[21]), "f"(C[22]), "f"(C[23]) 143 | ); 144 | #endif 145 | __syncwarp(); 146 | 147 | } 148 | // stop timing 149 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 150 | for(int i=0; i < 4*ILPconfig;i++){ 151 | res[i] = frag_D[i]; 152 | //res[i] += frag_D_ILP2[i + lane_id()*4]; 153 | res[i] += fpuC; 154 | res[i] += intC; 155 | } 156 | 157 | //res[0] += fpuC; 158 | startClk[gid] = start; 159 | stopClk[gid] = stop; 160 | } 161 | 162 | 163 | template 164 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 165 | intilizeDeviceProp(0); 166 | 167 | int BLOCKS_NUM = 1; 168 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 169 | int WARP_SIZE = 32; 170 | 171 | unsigned total_A_SIZE = 172 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 173 | unsigned total_B_SIZE = 174 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 175 | unsigned total_R_SIZE = 176 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 177 | 178 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 179 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * 
sizeof(uint64_t)); 180 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 181 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 182 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 183 | 184 | uint64_t *startClk_g; 185 | uint64_t *stopClk_g; 186 | T *data1_g; 187 | T *data2_g; 188 | R *res_g; 189 | 190 | for (uint32_t i = 0; i < 16*4; i++) { 191 | data1[i] = (T)i; 192 | } 193 | 194 | for (uint32_t i = 0; i < 4*8; i++) { 195 | data2[i] = (T)i; 196 | } 197 | 198 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 199 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 200 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 201 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 202 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 203 | 204 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 205 | cudaMemcpyHostToDevice)); 206 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 207 | cudaMemcpyHostToDevice)); 208 | 209 | mma_ubench<<>>( 210 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 211 | gpuErrchk(cudaPeekAtLastError()); 212 | 213 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 214 | cudaMemcpyDeviceToHost)); 215 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 216 | cudaMemcpyDeviceToHost)); 217 | gpuErrchk( 218 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 219 | 220 | float mma_bw, fma_bw; 221 | uint64_t total_time = 222 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 223 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 224 | 225 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 226 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 227 | 228 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 229 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 230 | // (float)total_time; 231 | fma_bw = ((float)(ITERS * 16 * 8 * 4 * ILPconfig * //0 * 232 | (TOTAL_THREADS / WARP_SIZE))) / 233 | (float)total_time; 234 | 235 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 236 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 237 | std::cout << "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 238 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 239 | 240 | std::cout << "Total Clk number = " << total_time << "\n"; 241 | 242 | if (report_fma_bw) 243 | return fma_bw; 244 | else 245 | return mma_bw; 246 | } 247 | 248 | int main() { 249 | intilizeDeviceProp(0); 250 | // std::cout << "mma1688 FP16 operand, FP32 accumalte:\n"; 251 | std::cout<<"***********************************"< 8 is not supported\n"); 26 | 27 | 28 | __global__ void tensr1688_flops(uint64_t *startClk, uint64_t *stopClk, half *a, half *b, float *res, 29 | uint32_t strid) { // strid set to 0 used to prevent optimization 30 | // thread index 31 | uint32_t tid = threadIdx.x; 32 | uint32_t gid = blockIdx.x * blockDim.x + tid; 33 | uint32_t warpid = gid / warpSize; 34 | 35 | a = a + warpid * 16*8; // m*k = 16*16 36 | b = b + warpid * 8*8; // n*k = 8*16 37 | res = res + warpid * 16*8;// m*n = 16*16 38 | 39 | /** step 1: create register for each thread **/ 40 | half frag_A[4*ILPconfig]; // two .f16x2 registers, 8 half elements, 41 | half frag_B[2*ILPconfig]; // one .f16x2 registers, 4 half elements 42 | half 
frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 43 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 44 | for(int i = 0;i<4*ILPconfig;i++){ 45 | frag_A[i] = a[i + lane_id()*4]; 46 | frag_D[i] = 0.0f; 47 | } 48 | for(int i =0;i<2*ILPconfig;i++){ 49 | frag_B[i] = b[i + lane_id()*2]; 50 | } 51 | 52 | //TODO: cast half to 53 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 54 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 55 | uint32_t *C = reinterpret_cast(&frag_D[0]); 56 | uint32_t *D = C; 57 | 58 | // float fpuA = frag_A[0]; 59 | // float fpuB = frag_B[0]; 60 | float fpuC = frag_D[0]; 61 | 62 | 63 | 64 | // int intA = threadIdx.x; 65 | // int intB = threadIdx.x + 1; 66 | int intC = threadIdx.x + 2; 67 | 68 | uint64_t start = 0; 69 | uint64_t stop = 0; 70 | // synchronize all threads 71 | asm volatile("bar.sync 0;"); 72 | // start timing 73 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 74 | #pragma unroll 75 | for (int j = 0; j < ITERS; ++j) { 76 | asm volatile( 77 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 78 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 79 | : "=r"(D[0]), "=r"(D[1]) 80 | : "r"(A[0]), "r"(A[1]), 81 | "r"(B[0]), 82 | "r"(C[0]), "r"(C[1]) 83 | ); 84 | 85 | #if ILPconfig >= 2 86 | asm volatile( 87 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 88 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 89 | : "=r"(D[2]), "=r"(D[3]) 90 | : "r"(A[2]), "r"(A[3]), 91 | "r"(B[1]), 92 | "r"(C[2]), "r"(C[3]) 93 | ); 94 | #endif 95 | 96 | #if ILPconfig >= 3 97 | asm volatile( 98 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 99 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 100 | : "=r"(D[4]), "=r"(D[5]) 101 | : "r"(A[4]), "r"(A[5]), 102 | "r"(B[2]), 103 | "r"(C[4]), "r"(C[5]) 104 | ); 105 | #endif 106 | #if ILPconfig >= 4 107 | asm volatile( 108 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 109 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 110 | : "=r"(D[6]), "=r"(D[7]) 111 | : "r"(A[6]), "r"(A[7]), 112 | "r"(B[3]), 113 | "r"(C[6]), "r"(C[7]) 114 | ); 115 | #endif 116 | 117 | #if ILPconfig >= 5 118 | asm volatile( 119 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 120 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 121 | : "=r"(D[8]), "=r"(D[9]) 122 | : "r"(A[8]), "r"(A[9]), 123 | "r"(B[4]), 124 | "r"(C[8]), "r"(C[9]) 125 | ); 126 | #endif 127 | 128 | #if ILPconfig >= 6 129 | asm volatile( 130 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 131 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 132 | : "=r"(D[10]), "=r"(D[11]) 133 | : "r"(A[10]), "r"(A[11]), 134 | "r"(B[5]), 135 | "r"(C[10]), "r"(C[11]) 136 | ); 137 | #endif 138 | 139 | #if ILPconfig >= 7 140 | asm volatile( 141 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 142 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 143 | : "=r"(D[12]), "=r"(D[13]) 144 | : "r"(A[12]), "r"(A[13]), 145 | "r"(B[6]), 146 | "r"(C[12]), "r"(C[13]) 147 | ); 148 | #endif 149 | 150 | #if ILPconfig >= 8 151 | asm volatile( 152 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 153 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 154 | : "=r"(D[14]), "=r"(D[15]) 155 | : "r"(A[14]), "r"(A[15]), 156 | "r"(B[7]), 157 | "r"(C[14]), "r"(C[15]) 158 | ); 159 | #endif 160 | __syncwarp(); 161 | 162 | } 163 | // synchronize all threads 164 | //asm volatile("bar.sync 0;"); 165 | // stop timing 166 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 167 | for(int i=0; i < 4*ILPconfig;i++){ 168 | res[i] = frag_D[i]; 169 | 170 | res[i] += float(fpuC); 171 | res[i] += intC; 172 | } 173 | 174 | 
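// The stores above (together with the fpuC/intC summands) make the accumulator values
// observable in global memory, so the compiler cannot dead-code-eliminate the timed
// mma chain. Each thread then records its own clock64 samples below; the host derives
// the latency from the span between the earliest start and the latest stop.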
//res[0] += fpuC; 175 | startClk[gid] = start; 176 | stopClk[gid] = stop; 177 | } 178 | 179 | 180 | template 181 | float tensor1688_max_flops(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 182 | intilizeDeviceProp(0); 183 | 184 | int BLOCKS_NUM = 1; 185 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 186 | int WARP_SIZE = 32; 187 | 188 | unsigned total_A_SIZE = 189 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 190 | unsigned total_B_SIZE = 191 | 8*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 192 | unsigned total_R_SIZE = 193 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 194 | 195 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 196 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 197 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 198 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 199 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 200 | 201 | uint64_t *startClk_g; 202 | uint64_t *stopClk_g; 203 | T *data1_g; 204 | T *data2_g; 205 | R *res_g; 206 | 207 | for (uint32_t i = 0; i < 16*8; i++) { 208 | data1[i] = (T)i; 209 | } 210 | 211 | for (uint32_t i = 0; i < 8*8; i++) { 212 | data2[i] = (T)i; 213 | } 214 | 215 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 216 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 217 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 218 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 219 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 220 | 221 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 222 | cudaMemcpyHostToDevice)); 223 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 224 | cudaMemcpyHostToDevice)); 225 | 226 | tensr1688_flops<<>>( 227 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 228 | gpuErrchk(cudaPeekAtLastError()); 229 | 230 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 231 | cudaMemcpyDeviceToHost)); 232 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 233 | cudaMemcpyDeviceToHost)); 234 | gpuErrchk( 235 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 236 | 237 | float mma_bw, fma_bw; 238 | uint64_t total_time = 239 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 240 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 241 | 242 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0) / 243 | ((float)total_time); 244 | 245 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 246 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 247 | // (float)total_time; 248 | fma_bw = ((float)(ITERS * 16 * 8 * 8 * ILPconfig * 249 | (TOTAL_THREADS / WARP_SIZE))) / 250 | (float)total_time; 251 | 252 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 253 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 254 | std::cout << "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 latency " << (float)total_time/(float)ITERS << " cycles\n"; 255 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 256 | 257 | std::cout << "Total Clk number = " << total_time << "\n"; 258 | 259 | if (report_fma_bw) 260 | return fma_bw; 261 | else 262 | return mma_bw; 263 | } 264 | 265 | int main() { 266 | intilizeDeviceProp(0); 267 | std::cout<<"***********************************"<8 
is not supported\n"); 26 | 27 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, float *res, 28 | uint32_t strid) { // strid set to 0 used to prevent optimization 29 | // thread index 30 | uint32_t tid = threadIdx.x; 31 | uint32_t gid = blockIdx.x * blockDim.x + tid; 32 | uint32_t warpid = gid / warpSize; 33 | 34 | a = a + warpid * 16*8; // m*k = 16*16 35 | b = b + warpid * 8*8; // n*k = 8*16 36 | res = res + warpid * 16*8;// m*n = 16*16 37 | 38 | /** step 1: create register for each thread **/ 39 | float frag_A[4 * ILPconfig]; // two .f16x2 registers, 8 half elements, 40 | float frag_B[2 * ILPconfig]; // one .f16x2 registers, 4 half elements 41 | float frag_D[4 * ILPconfig]; //result(fp32) 4 f32 registers 42 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 43 | for(int i = 0;i<4 * ILPconfig;i++){ 44 | frag_A[i] = a[i + lane_id()*4]; 45 | 46 | } 47 | for(int i =0;i<2 * ILPconfig;i++){ 48 | frag_B[i] = b[i + lane_id()*1]; 49 | 50 | } 51 | 52 | 53 | for(int i =0;i<4 * ILPconfig;i++){ 54 | //frag_B[i] = b[i + lane_id()*4]; 55 | frag_D[i] = 0.0f; 56 | } 57 | 58 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 59 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 60 | float *C = reinterpret_cast(&frag_D[0]); 61 | float *D = C; // D = A*B + D. 62 | 63 | // float fpuA = frag_A[0]; 64 | // float fpuB = frag_B[0]; 65 | float fpuC = frag_D[0]; 66 | 67 | // int intA = threadIdx.x; 68 | // int intB = threadIdx.x + 1; 69 | int intC = threadIdx.x + 2; 70 | 71 | uint64_t start = 0; 72 | uint64_t stop = 0; 73 | // synchronize all threads 74 | asm volatile("bar.sync 0;"); 75 | // start timing 76 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 77 | //#pragma unroll 78 | for (int j = 0; j < ITERS; ++j) { 79 | 80 | asm volatile( 81 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 82 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 83 | : 84 | "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 85 | "r"(B[0]), "r"(B[1]), 86 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 87 | ); 88 | 89 | #if ILPconfig >= 2 90 | asm volatile( 91 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 92 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 93 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 94 | "r"(B[2]), "r"(B[3]), 95 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 96 | ); 97 | #endif 98 | #if ILPconfig >= 3 99 | asm volatile( 100 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 101 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 102 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 103 | "r"(B[4]), "r"(B[5]), 104 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 105 | ); 106 | #endif 107 | #if ILPconfig >= 4 108 | asm volatile( 109 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 110 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 111 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 112 | "r"(B[6]), "r"(B[7]), 113 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 114 | ); 115 | #endif 116 | #if ILPconfig >= 5 117 | asm volatile( 118 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 119 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 120 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), 
"r"(A[19]), 121 | "r"(B[8]), "r"(B[9]), 122 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 123 | ); 124 | #endif 125 | #if ILPconfig >= 6 126 | asm volatile( 127 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 128 | : "=f"(D[20]), "=f"(D[21]), "=f"(D[22]), "=f"(D[23]) 129 | : "r"(A[20]), "r"(A[21]), "r"(A[22]), "r"(A[23]), 130 | "r"(B[10]), "r"(B[11]), 131 | "f"(C[20]), "f"(C[21]), "f"(C[22]), "f"(C[23]) 132 | ); 133 | #endif 134 | __syncwarp(); 135 | } 136 | // synchronize all threads 137 | // asm volatile("bar.sync 0;"); 138 | // stop timing 139 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 140 | for(int i=0; i < 4*ILPconfig;i++){ 141 | res[i] += frag_D[i]; 142 | 143 | res[i] += fpuC; 144 | res[i] += intC; 145 | } 146 | 147 | //res[0] += fpuC; 148 | startClk[gid] = start; 149 | stopClk[gid] = stop; 150 | } 151 | 152 | 153 | template 154 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 155 | intilizeDeviceProp(0); 156 | 157 | int BLOCKS_NUM = 1; 158 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 159 | int WARP_SIZE = 32; 160 | 161 | unsigned total_A_SIZE = 162 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 163 | unsigned total_B_SIZE = 164 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 165 | unsigned total_R_SIZE = 166 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 167 | 168 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 169 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 170 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 171 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 172 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 173 | 174 | uint64_t *startClk_g; 175 | uint64_t *stopClk_g; 176 | T *data1_g; 177 | T *data2_g; 178 | R *res_g; 179 | 180 | for (uint32_t i = 0; i < 16*4; i++) { 181 | data1[i] = (T)i; 182 | } 183 | 184 | for (uint32_t i = 0; i < 4*8; i++) { 185 | data2[i] = (T)i; 186 | } 187 | 188 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 189 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 190 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 191 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 192 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 193 | 194 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 195 | cudaMemcpyHostToDevice)); 196 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 197 | cudaMemcpyHostToDevice)); 198 | 199 | mma_ubench<<>>( 200 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 201 | gpuErrchk(cudaPeekAtLastError()); 202 | 203 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 204 | cudaMemcpyDeviceToHost)); 205 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 206 | cudaMemcpyDeviceToHost)); 207 | gpuErrchk( 208 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 209 | 210 | float mma_bw, fma_bw; 211 | uint64_t total_time = 212 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 213 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 214 | 215 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 216 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 217 | 218 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 219 | // hmma_bw = ((float)(REPEAT_TIMES * 
TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 220 | // (float)total_time; 221 | fma_bw = ((float)(ITERS * 16 * 8 * 8 * ILPconfig * //0 * 222 | (TOTAL_THREADS / WARP_SIZE))) / 223 | (float)total_time; 224 | 225 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 226 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 227 | std::cout << "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 228 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 229 | 230 | std::cout << "Total Clk number = " << total_time << "\n"; 231 | 232 | if (report_fma_bw) 233 | return fma_bw; 234 | else 235 | return mma_bw; 236 | } 237 | 238 | int main() { 239 | intilizeDeviceProp(0); 240 | std::cout<<"***********************************"<8 27 | static_assert(0,"ILP > 8 is not supported\n"); 28 | #endif 29 | 30 | 31 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, int *a, int *b, float *res, 32 | uint32_t strid) { // strid set to 0 used to prevent optimization 33 | // thread index 34 | uint32_t tid = threadIdx.x; 35 | uint32_t gid = blockIdx.x * blockDim.x + tid; 36 | uint32_t warpid = gid / warpSize; 37 | 38 | a = a + warpid * 8*16; // m*k = 8*16 39 | b = b + warpid * 8*16; // n*k = 8*16 40 | res = res + warpid * 8*8;// m*n = 8*8 41 | 42 | 43 | char frag_A[4 * ILPconfig]; // four int8 registers, 44 | char frag_B[4 * ILPconfig]; // one .f16x2 registers, 2 half elements 45 | int frag_D[2 * ILPconfig]; //result(fp32) 2 f32 registers 46 | 47 | for(int i = 0;i<4*ILPconfig;i++){ 48 | frag_A[i] = a[i + lane_id()]; 49 | } 50 | for(int i =0;i<4*ILPconfig;i++){ 51 | frag_B[i] = b[i + lane_id()]; 52 | } 53 | for(int i =0;i<2*ILPconfig;i++){ 54 | frag_D[i] = 0.0f; 55 | } 56 | 57 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 58 | uint32_t const *B = reinterpret_cast(&frag_B[0]); 59 | int *C = reinterpret_cast(&frag_D[0]); 60 | int *D = C; // D = A*B + D. 61 | 62 | 63 | 64 | 65 | 66 | float fpuA = frag_A[0]; 67 | float fpuB = frag_B[0]; 68 | float fpuC = frag_D[0]; 69 | 70 | int intA = threadIdx.x; 71 | int intB = threadIdx.x + 1; 72 | int intC = threadIdx.x + 2; 73 | 74 | uint64_t start = 0; 75 | uint64_t stop = 0; 76 | // synchronize all threads 77 | asm volatile("bar.sync 0;"); 78 | // start timing 79 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 80 | //#pragma unroll 81 | for (int j = 0; j < ITERS; ++j) { 82 | asm volatile( 83 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 84 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 85 | : "=r"(D[0]), "=r"(D[1]) 86 | : "r"(A[0]), 87 | "r"(B[0]), 88 | "r"(C[0]), "r"(C[1]) 89 | ); // input C operand will use output operand D. 90 | #if ILPconfig >= 2 91 | asm volatile( 92 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 93 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 94 | : "=r"(D[2]), "=r"(D[3]) 95 | : "r"(A[1]), 96 | "r"(B[1]), 97 | "r"(C[2]), "r"(C[3]) 98 | ); // input C operand will use output operand D. 
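// Operand footprint for m8n8k16.s8: each .b32 register packs four int8 values, so a
// single mma needs only one A register, one B register and two s32 accumulators per
// thread. ILP level n therefore uses A[n-1], B[n-1] and D[2n-2..2n-1], keeping the
// instructions within one iteration fully independent.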
99 | #endif 100 | #if ILPconfig >= 3 101 | asm volatile( 102 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 103 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 104 | : "=r"(D[4]), "=r"(D[5]) 105 | : "r"(A[2]), 106 | "r"(B[2]), 107 | "r"(C[4]), "r"(C[5]) 108 | ); 109 | #endif 110 | #if ILPconfig >= 4 111 | asm volatile( 112 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 113 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 114 | : "=r"(D[6]), "=r"(D[7]) 115 | : "r"(A[3]), 116 | "r"(B[3]), 117 | "r"(C[6]), "r"(C[7]) 118 | ); 119 | #endif 120 | 121 | #if ILPconfig >= 5 122 | asm volatile( 123 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 124 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 125 | : "=r"(D[8]), "=r"(D[9]) 126 | : "r"(A[4]), 127 | "r"(B[4]), 128 | "r"(C[8]), "r"(C[9]) 129 | ); 130 | #endif 131 | #if ILPconfig >= 6 132 | asm volatile( 133 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 134 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 135 | : "=r"(D[10]), "=r"(D[11]) 136 | : "r"(A[5]), 137 | "r"(B[5]), 138 | "r"(C[10]), "r"(C[11]) 139 | ); 140 | #endif 141 | #if ILPconfig >= 7 142 | asm volatile( 143 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 144 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 145 | : "=r"(D[12]), "=r"(D[13]) 146 | : "r"(A[6]), 147 | "r"(B[6]), 148 | "r"(C[12]), "r"(C[13]) 149 | ); 150 | #endif 151 | #if ILPconfig >= 8 152 | asm volatile( 153 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 154 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 155 | : "=r"(D[14]), "=r"(D[15]) 156 | : "r"(A[7]), 157 | "r"(B[7]), 158 | "r"(C[14]), "r"(C[15]) 159 | ); 160 | #endif 161 | 162 | 163 | 164 | __syncwarp(); 165 | 166 | } 167 | // synchronize all threads 168 | // asm volatile("bar.sync 0;"); 169 | // stop timing 170 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 171 | for(int i=0; i < 2*ILPconfig;i++){ 172 | res[i] += D[i]; 173 | 174 | res[i] += fpuC; 175 | res[i] += intC; 176 | } 177 | 178 | //res[0] += fpuC; 179 | startClk[gid] = start; 180 | stopClk[gid] = stop; 181 | } 182 | 183 | 184 | template 185 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 186 | intilizeDeviceProp(0); 187 | 188 | int BLOCKS_NUM = 1; 189 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 190 | int WARP_SIZE = 32; 191 | 192 | unsigned total_A_SIZE = 193 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 194 | unsigned total_B_SIZE = 195 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 196 | unsigned total_R_SIZE = 197 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 198 | 199 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 200 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 201 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 202 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 203 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 204 | 205 | uint64_t *startClk_g; 206 | uint64_t *stopClk_g; 207 | T *data1_g; 208 | T *data2_g; 209 | R *res_g; 210 | 211 | for (uint32_t i = 0; i < 16*8; i++) { 212 | data1[i] = (T)i; 213 | } 214 | 215 | for (uint32_t i = 0; i < 8*8; i++) { 216 | data2[i] = (T)i; 217 | } 218 | 219 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 220 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 221 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 222 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 223 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 224 | 225 | 
gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 226 | cudaMemcpyHostToDevice)); 227 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 228 | cudaMemcpyHostToDevice)); 229 | 230 | mma_ubench<<>>( 231 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 232 | gpuErrchk(cudaPeekAtLastError()); 233 | 234 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 235 | cudaMemcpyDeviceToHost)); 236 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 237 | cudaMemcpyDeviceToHost)); 238 | gpuErrchk( 239 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 240 | 241 | float mma_bw, fma_bw; 242 | uint64_t total_time = 243 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 244 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 245 | 246 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 247 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 248 | 249 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 250 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 251 | // (float)total_time; 252 | fma_bw = ((float)(ITERS * 8 * 8 * 16 * ILPconfig * //0 * 253 | (TOTAL_THREADS / WARP_SIZE))) / 254 | (float)total_time; 255 | 256 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 257 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 258 | std::cout << "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 259 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 260 | 261 | std::cout << "Total Clk number = " << total_time << "\n"; 262 | 263 | if (report_fma_bw) 264 | return fma_bw; 265 | else 266 | return mma_bw; 267 | } 268 | 269 | int main() { 270 | intilizeDeviceProp(0); 271 | // std::cout << "mma1688 FP16 operand, FP32 accumalte:\n"; 272 | std::cout<<"***********************************"< 4 is not spported"); 30 | 31 | 32 | __global__ void mmasp_1688(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, uint32_t* meteE, float *res, 33 | uint32_t strid) { // strid set to 0 used to prevent optimization 34 | // thread index 35 | uint32_t tid = threadIdx.x; 36 | uint32_t gid = blockIdx.x * blockDim.x + tid; 37 | uint32_t warpid = gid / warpSize; 38 | 39 | a = a + warpid * 16*4; // m*k/2 = 16*4 40 | b = b + warpid * 8*8; // n*k = 8*8 41 | res = res + warpid * 16*8;// m*n = 16*8 42 | 43 | /** step 1: create register for each thread **/ 44 | float frag_A[2*ILPconfig]; // two b32 registrs, 4 half non-zero elements, 16 dense 45 | float frag_B[2*ILPconfig]; // two f16x2 registers, 8 half dense elements 46 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 47 | uint32_t frag_E[1*ILPconfig]; // A .b32 register containing 16 2-bit vectors to for indexing non-zero of A 48 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 49 | for(int i = 0;i<2*ILPconfig;i++){ 50 | frag_A[i] = a[i ]; 51 | frag_B[i] = b[i]; 52 | } 53 | for(int i =0;i<4*ILPconfig;i++){ 54 | 55 | frag_D[i] = 0.0f; 56 | } 57 | for(int i =0; i < 1*ILPconfig ; i ++){ 58 | frag_E[i] = meteE[i]; 59 | } 60 | //TODO: cast half to 61 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 62 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 
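// Sparse-mma operands: frag_A keeps only the non-zero half of the 16x8 tf32 A tile
// (compressed to 16x4, i.e. k/2 columns), frag_B is the dense 8x8 operand, and frag_E
// carries the 2-bit metadata recording which 2 of every 4 A elements were kept
// (2:4 structured sparsity, generated on the host by initialize_fake_metadata_2_4()).
// The metadata register is passed as the trailing %12 operand with sparsity selector 0x0.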
63 | float *C = reinterpret_cast(&frag_D[0]); 64 | float *D = C; 65 | uint32_t const *E = reinterpret_cast(&frag_E[0]); ; 66 | 67 | float fpuA = frag_A[0]; 68 | float fpuB = frag_B[0]; 69 | float fpuC = frag_D[0]; 70 | 71 | 72 | 73 | // int intA = threadIdx.x; 74 | // int intB = threadIdx.x + 1; 75 | int intC = threadIdx.x + 2; 76 | 77 | uint64_t start = 0; 78 | uint64_t stop = 0; 79 | // synchronize all threads 80 | asm volatile("bar.sync 0;"); 81 | // start timing 82 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 83 | //#pragma unroll 84 | for (int j = 0; j < ITERS; ++j) { 85 | asm volatile( 86 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 87 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 88 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 89 | : "r"(A[0]), "r"(A[1]), 90 | "r"(B[0]), "r"(B[1]), 91 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), 92 | "r"(E[0]) 93 | ); 94 | 95 | #if ILPconfig >= 2 96 | asm volatile( 97 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 98 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 99 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 100 | : "r"(A[2]), "r"(A[3]), 101 | "r"(B[2]), "r"(B[3]), 102 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]), 103 | "r"(E[1]) 104 | ); 105 | #endif 106 | 107 | #if ILPconfig >= 3 108 | asm volatile( 109 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 110 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 111 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 112 | : "r"(A[4]), "r"(A[5]), 113 | "r"(B[4]), "r"(B[5]), 114 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]), 115 | "r"(E[2]) 116 | ); 117 | #endif 118 | 119 | #if ILPconfig >= 4 120 | asm volatile( 121 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 122 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 123 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 124 | : "r"(A[6]), "r"(A[7]), 125 | "r"(B[6]), "r"(B[7]), 126 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]), 127 | "r"(E[3]) 128 | ); 129 | #endif 130 | __syncwarp(); 131 | } 132 | // synchronize all threads 133 | 134 | // stop timing 135 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 136 | // avoid compiler optimization 137 | for(int i=0; i < 4*ILPconfig;i++){ 138 | res[i] = frag_D[i]; 139 | 140 | res[i] += float(fpuC); 141 | res[i] += intC; 142 | } 143 | 144 | //res[0] += fpuC; 145 | startClk[gid] = start; 146 | stopClk[gid] = stop; 147 | } 148 | 149 | 150 | template 151 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 152 | intilizeDeviceProp(0); 153 | 154 | int BLOCKS_NUM = 1; 155 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 156 | int WARP_SIZE = 32; 157 | 158 | int nwarps = THREADS_PER_BLOCK/WARP_SIZE; 159 | 160 | int mma_m = 16; 161 | int mma_n = 8; 162 | int mma_k = 8; 163 | 164 | 165 | // T *data1 = (T *)malloc(mma_m*mma_k/2 * sizeof(T)); 166 | // T *data2 = (T *)malloc(mma_n*mma_k * sizeof(T)); 167 | // R *res = (R *)malloc(mma_m*mma_n * sizeof(R)); 168 | // uint32_t *meta_e = (uint32_t *)malloc(mma_m*mma_k/16 *sizeof(uint32_t) ); 169 | 170 | unsigned total_A_SIZE = 171 | mma_m*mma_k/2 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 172 | unsigned total_B_SIZE = 173 | mma_n*mma_k * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 174 | unsigned total_R_SIZE = 175 | mma_m*mma_n * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 176 | 177 | 178 | unsigned total_E_SIZE = 179 | 
mma_m*mma_k/16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 180 | 181 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 182 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 183 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 184 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 185 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 186 | 187 | uint32_t *meta_e = (uint32_t *)malloc(total_E_SIZE *sizeof(uint32_t) ); 188 | //uint32_t *meta_p = meta_e; 189 | for(int i=0; i < nwarps; i++){ 190 | 191 | initialize_fake_metadata_2_4(&meta_e[mma_m*mma_k/16 * i] ,mma_m,mma_k); 192 | 193 | } 194 | 195 | 196 | uint64_t *startClk_g; 197 | uint64_t *stopClk_g; 198 | T *data1_g; 199 | T *data2_g; 200 | R *res_g; 201 | uint32_t *meta_e_g; 202 | 203 | for (uint32_t i = 0; i < mma_m*mma_k/2; i++) { 204 | data1[i] = (T)i; 205 | } 206 | 207 | for (uint32_t i = 0; i < mma_k*mma_n; i++) { 208 | data2[i] = (T)i; 209 | } 210 | 211 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 212 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 213 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 214 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 215 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 216 | gpuErrchk(cudaMalloc(&meta_e_g, total_E_SIZE *sizeof(uint32_t))); 217 | 218 | 219 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 220 | cudaMemcpyHostToDevice)); 221 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 222 | cudaMemcpyHostToDevice)); 223 | 224 | gpuErrchk(cudaMemcpy(meta_e_g, meta_e, total_E_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice)); 225 | 226 | mmasp_1688<<>>( 227 | startClk_g, stopClk_g, data1_g, data2_g,meta_e_g, res_g, 0); 228 | gpuErrchk(cudaPeekAtLastError()); 229 | 230 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 231 | cudaMemcpyDeviceToHost)); 232 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 233 | cudaMemcpyDeviceToHost)); 234 | gpuErrchk( 235 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 236 | 237 | float mma_bw, fma_bw; 238 | uint64_t total_time = 239 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 240 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 241 | 242 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0) / 243 | ((float)total_time); 244 | 245 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 246 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 247 | // (float)total_time; 248 | fma_bw = ((float)(ITERS * mma_m * mma_n * mma_k * ILPconfig * 249 | (TOTAL_THREADS / WARP_SIZE))) / 250 | (float)total_time; 251 | 252 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 253 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 254 | std::cout << "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 255 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 256 | 257 | std::cout << "Total Clk number = " << total_time << "\n"; 258 | 259 | if (report_fma_bw) 260 | return fma_bw; 261 | else 262 | return mma_bw; 263 | } 264 | 265 | int main() { 266 | intilizeDeviceProp(0); 267 | std::cout<<"***********************************"< 1 is not supported\n"); 29 | // two way bank conflict - > 23 latenct 30 
| // bank-conflict-free -> 25 latency 31 | 32 | typedef uint32_t shared_m; 33 | // Measure latency of ITERS ldmatrix.x1 34 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 35 | shared_m *dsink, uint32_t stride) { 36 | 37 | // thread index 38 | uint32_t tid = threadIdx.x; 39 | uint32_t bid = blockIdx.x; 40 | uint32_t uid = bid * blockDim.x + tid; 41 | uint32_t n_threads = blockDim.x * gridDim.x; 42 | 43 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 44 | 45 | // one thread to initialize the pointer-chasing array 46 | for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) 47 | s[i] = (i )*16 % 512; 48 | 49 | asm volatile("bar.sync 0;"); 50 | 51 | // if(uid == 0){ 52 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 53 | // printf("s[%d] = %d \t", i, s[i]); 54 | 55 | // } 56 | // printf("\n"); 57 | // } 58 | //if (uid == 0) { 59 | // initalize pointer chaser 60 | //unsigned x = threadIdx.x*4; 61 | //unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 62 | //unsigned addr2 = 0; 63 | // unsigned addr3 = 0; 64 | // unsigned addr4 = 0; 65 | // unsigned addr4 = 0; 66 | // unsigned addr4 = 0; 67 | unsigned frag[4]; 68 | frag[0] = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 69 | //printf("thread %d , addr = %d \n", tid, addr); 70 | // start timing 71 | uint32_t start = 0; 72 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 73 | 74 | // pointer-chasing ITERS times 75 | #pragma unroll 76 | for (uint32_t i = 0; i < ITERS; ++i) { 77 | //asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr), "=r"(addr2),"=r"(addr3),"=r"(addr4) : "r"(addr)); // first 11 78 | asm volatile ("wmma.load.a.sync.aligned.row.m16n16k16.shared.bf16 {%0,%1,%2,%3}, [%4];" 79 | : "=r"(frag[0]), "=r"(frag[1]),"=r"(frag[2]),"=r"(frag[3]) 80 | : "r"(frag[0]) ); // first 11 81 | 82 | 83 | //asm volatile ("ldmatrix.sync.aligne.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); // first 9 84 | //x = x++; 85 | } 86 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 87 | // stop timing 88 | uint32_t stop = 0; 89 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 90 | 91 | //printf("thread %d , x = %d \n", tid, addr); 92 | 93 | // write time and data back to memory 94 | startClk[uid] = start; 95 | stopClk[uid] = stop; 96 | dsink[uid] = frag[0] + frag[1] + frag[2] + frag[3]; 97 | 98 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 99 | // printf("Shared Memory Latency = %f cycles\n", lat); 100 | //} 101 | } 102 | void test_with_different_thread(int THREADS_NUM){ 103 | 104 | BLOCKS_NUM = 1; 105 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 106 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 107 | 108 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 109 | 110 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 111 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 112 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 113 | 114 | uint32_t *startClk_g; 115 | uint32_t *stopClk_g; 116 | shared_m *dsink_g; 117 | 118 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 119 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 120 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 121 | 122 | shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 123 | gpuErrchk(cudaPeekAtLastError()); 124 | // printf("pass kenerl \n"); 125 | gpuErrchk(cudaMemcpy(startClk, startClk_g, 
sizeof(uint32_t), 126 | cudaMemcpyDeviceToHost)); 127 | gpuErrchk( 128 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 129 | gpuErrchk( 130 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 131 | 132 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 133 | 134 | std::cout << THREADS_NUM/32 <<" warps, wmma.load.a.sync.aligned.row.m16n16k16.shared.bf16 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 135 | std::cout << "Total Clk number " << stopClk[0] - startClk[0] < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define WMMA_M 16 9 | #define WMMA_N 16 10 | #define WMMA_K 16 11 | 12 | 13 | 14 | #define SHARED_MEM_SIZE (48 * 1024 / 4) // 32 KB 15 | // Launch only one thread to calcaulte the latency using a pointer-chasing 16 | // array technique 17 | //#define THREADS_NUM 32 18 | // iterate over the array ITERS times 19 | #ifndef ITERS 20 | #define ITERS (1024 ) 21 | #endif 22 | 23 | 24 | #ifndef ILPconfig 25 | #define ILPconfig 1 26 | #endif 27 | 28 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 29 | // two way bank conflict - > 23 latenct 30 | // bank-conflict-free -> 25 latency 31 | 32 | typedef uint32_t shared_m; 33 | // Measure latency of ITERS ldmatrix.x1 34 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 35 | shared_m *dsink, uint32_t stride) { 36 | 37 | // thread index 38 | uint32_t tid = threadIdx.x; 39 | uint32_t bid = blockIdx.x; 40 | uint32_t uid = bid * blockDim.x + tid; 41 | uint32_t n_threads = blockDim.x * gridDim.x; 42 | 43 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 44 | 45 | // one thread to initialize the pointer-chasing array 46 | for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) 47 | s[i] = (i )*16 % 512; 48 | 49 | asm volatile("bar.sync 0;"); 50 | unsigned frag[8]; 51 | frag[0] = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 52 | //printf("thread %d , addr = %d \n", tid, addr); 53 | // start timing 54 | uint32_t start = 0; 55 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 56 | 57 | // pointer-chasing ITERS times 58 | //#pragma unroll 59 | for (uint32_t i = 0; i < ITERS; ++i) { 60 | //asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr), "=r"(addr2),"=r"(addr3),"=r"(addr4) : "r"(addr)); // first 11 61 | asm volatile ("wmma.load.a.sync.aligned.row.m16n16k16.shared.f16 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" 62 | : "=r"(frag[0]), "=r"(frag[1]),"=r"(frag[2]),"=r"(frag[3]), "=r"(frag[4]), "=r"(frag[5]),"=r"(frag[6]),"=r"(frag[7]) 63 | : "r"(frag[0]) ); // first 11 64 | } 65 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 66 | // stop timing 67 | uint32_t stop = 0; 68 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 69 | 70 | //printf("thread %d , x = %d \n", tid, addr); 71 | 72 | // write time and data back to memory 73 | startClk[uid] = start; 74 | stopClk[uid] = stop; 75 | dsink[uid] = frag[0] + frag[1] + frag[2] + frag[3] + frag[4] + frag[5] + frag[6] + frag[7]; 76 | } 77 | void test_with_different_thread(int THREADS_NUM){ 78 | 79 | BLOCKS_NUM = 1; 80 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 81 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 82 | 83 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 84 | 85 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 86 | uint32_t *stopClk = (uint32_t 
*)malloc(sizeof(uint32_t)); 87 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 88 | 89 | uint32_t *startClk_g; 90 | uint32_t *stopClk_g; 91 | shared_m *dsink_g; 92 | 93 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 94 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 95 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 96 | 97 | shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 98 | gpuErrchk(cudaPeekAtLastError()); 99 | // printf("pass kenerl \n"); 100 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 101 | cudaMemcpyDeviceToHost)); 102 | gpuErrchk( 103 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 104 | gpuErrchk( 105 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 106 | 107 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 108 | 109 | std::cout << THREADS_NUM/32 <<" warps, wmma.load.a.sync.aligned.m16n16k16.row.f16 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 110 | std::cout << "Total Clk number " << stopClk[0] - startClk[0] <