├── .gitignore ├── README.md └── microbench ├── .gitignore ├── Makefile ├── appendix ├── Makefile ├── mma_baseline.cu ├── mma_permuted.cu └── mma_pipeline.cu ├── bin └── README.md ├── common └── common.mk ├── hw_def ├── common │ ├── common.h │ └── deviceQuery.h └── hw_def.h ├── numericbench ├── bf16numeric │ ├── bf16_numeric │ │ ├── Makefile │ │ ├── bf16_chain_matmul.cu │ │ └── bf16_numeric.cu │ ├── bf16add │ │ ├── Makefile │ │ └── bf16add.cu │ ├── bf16mul │ │ ├── Makefile │ │ └── bf16mul.cu │ ├── m16n8k16 │ │ ├── Makefile │ │ └── m16n8k16_bf16.cu │ └── m16n8k8 │ │ ├── Makefile │ │ └── m16n8k8_bf16.cu ├── cpu_base.h ├── cpu_int_base.h ├── fp16numeric │ ├── fp16_numeric │ │ ├── Makefile │ │ └── fp16_chain_matmul.cu │ ├── fp16add │ │ ├── Makefile │ │ └── fp16add.cu │ └── fp16mul │ │ ├── Makefile │ │ └── fp16mul.cu ├── int8numeric │ ├── int8add │ │ ├── Makefile │ │ └── int8add.cu │ └── s8numeric │ │ └── s8numeric.cu └── tf32numeric │ ├── m16n8k4 │ ├── Makefile │ └── m16n8k4_tf32.cu │ ├── m16n8k8 │ ├── Makefile │ └── m16n8k8_tf32.cu │ ├── tf32_numeric │ ├── Makefile │ ├── tf32_chain_matmul.cu │ └── tf32_numeric.cu │ ├── tf32add │ ├── Makefile │ └── tf32add.cu │ └── tf32mul │ ├── Makefile │ └── tf32mul.cu ├── run_all.sh └── ubench ├── ldmatrix ├── ldmatrix_ILP │ ├── Makefile │ └── ldmatrix_ilp.cu ├── ldmatrix_lat │ ├── Makefile │ └── ldmatrix_lat.cu ├── ldmatrix_x2_lat │ ├── Makefile │ └── ldmatrix_x2_lat.cu ├── ldmatrix_x4_lat │ ├── Makefile │ └── ldmatrix_x4_lat.cu ├── shared_bw │ ├── Makefile │ └── shared_bw.cu ├── shared_bw_64 │ ├── Makefile │ └── shared_bw_64.cu ├── shared_lat │ ├── Makefile │ └── shared_lat.cu ├── shared_x2_lat │ ├── Makefile │ └── shared_x2_lat.cu ├── shared_x4_lat │ ├── Makefile │ └── shared_x4_lat.cu ├── shared_x8 │ ├── Makefile │ └── shared_x8.cu └── shd_config │ ├── Makefile │ └── shd_config.cu ├── mma ├── mma_m16n8k128_int1 │ ├── Makefile │ └── mma_m16n8k128_int1.cu ├── mma_m16n8k16_bf16fp32 │ ├── Makefile │ └── mma_m16n8k16_bf16fp32.cu ├── mma_m16n8k16_fp │ ├── Makefile │ └── mma_m16n8k16_fp32.cu ├── mma_m16n8k16_half │ ├── Makefile │ └── mma_m16n8k16_half.cu ├── mma_m16n8k16_int │ ├── Makefile │ └── mma_m16n8k16_int.cu ├── mma_m16n8k256_int1 │ ├── Makefile │ └── mma_m16n8k256_int1.cu ├── mma_m16n8k32_fp8 │ ├── Makefile │ └── mma_m16n8k32_fp8.cu ├── mma_m16n8k32_int │ ├── Makefile │ └── mma_m16n8k32_int.cu ├── mma_m16n8k32_int4 │ ├── Makefile │ └── mma_m16n8k32_int4.cu ├── mma_m16n8k4_tf32 │ ├── Makefile │ └── mma_m16n8k4_tf32.cu ├── mma_m16n8k64_int4 │ ├── Makefile │ └── mma_m16n8k64_int4.cu ├── mma_m16n8k8_bf16fp32 │ ├── Makefile │ └── mma_m16n8k8_bf16fp32.cu ├── mma_m16n8k8_fp │ ├── Makefile │ └── mma_m16n8k8_fp32.cu ├── mma_m16n8k8_half │ ├── Makefile │ └── mma_m16n8k8_half.cu ├── mma_m16n8k8_tf32 │ ├── Makefile │ └── mma_m16n8k8_tf32.cu ├── mma_m8n8k16_int │ ├── Makefile │ └── mma_m8n8k16_int8.cu └── mma_m8n8k4_fp16fp32 │ ├── Makefile │ └── mma_m8n8k4_fp16fp32.cu ├── mmasp ├── mmasp_m16n8k16_fp │ ├── Makefile │ └── mmasp_m16n8k16_fp32.cu ├── mmasp_m16n8k16_fp16fp16 │ ├── Makefile │ └── mmasp_m16n8k16_fp16fp16.cu ├── mmasp_m16n8k16_tf32 │ ├── Makefile │ └── mmasp_m16n8k16_tf32.cu ├── mmasp_m16n8k32_fp │ ├── Makefile │ └── mmasp_m16n8k32_fp32.cu ├── mmasp_m16n8k32_fp16fp16 │ ├── Makefile │ └── mmasp_m16n8k32_fp16fp16.cu ├── mmasp_m16n8k32_int │ ├── Makefile │ └── mmasp_m16n8k32_int.cu ├── mmasp_m16n8k64_fp8 │ ├── Makefile │ └── mmasp_m16n8k64_fp8.cu ├── mmasp_m16n8k64_int │ ├── Makefile │ └── mmasp_m16n8k64_int.cu └── mmasp_m16n8k8_tf32 │ ├── Makefile │ └── 
mmasp_m16n8k8_tf32.cu └── wmma_load ├── loadbf16 ├── Makefile └── load_bf16.cu └── loadfp16 ├── Makefile └── load_fp16.cu /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | 35 | microbench/ubench/core/* 36 | *.app 37 | *.txt 38 | *.log 39 | 40 | microbench/logs/* 41 | microbench/revisionlogs/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dissecting Tensor Cores via Microbenchmarks: Latency, Throughput and Numeric Behaviors 2 | 3 | Code repo for Dissecting Tensor Cores via Microbenchmarks: Latency, Throughput and Numeric Behaviors. 4 | 5 | * [preprint](https://arxiv.org/abs/2206.02874) 6 | * [IEEE TPDS](https://ieeexplore.ieee.org/document/9931992) 7 | 8 | TODO: Provide better instructions and explanations 9 | 10 | # Setup 11 | 12 | ## Add the CUDA path to the environment, e.g. 13 | 14 | ``` 15 | export CUDA_PATH=/usr/local/cuda-11.0 16 | export PATH=$CUDA_PATH/bin:$PATH 17 | export CUDACXX=$CUDA_PATH/bin/nvcc 18 | ``` 19 | 20 | ## Configure the compiler target Arch/SM 21 | 22 | ```export TargetSM=80 ``` // for A100 23 | 24 | ```export TargetSM=70 ``` // for V100 25 | 26 | ```export TargetSM=75 ``` // for Turing 27 | 28 | 29 | ## Run script 30 | 31 | ``` 32 | cd microbench 33 | sh run_all.sh 34 | ``` 35 | 36 | You are expected to get xxx-ILPx.log files. 37 | 38 | Note: there will be static_assert error messages when running the scripts, because some codes use static_assert() to reject larger ILP configurations. These error messages can be ignored.
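A single microbenchmark can also be built and run on its own, without run_all.sh. The sketch below assumes the same CUDA_PATH/TargetSM environment as above; `ILP=4` is only an illustrative value, which common/common.mk forwards to the compiler as the `ILPconfig` macro.

```
cd microbench/ubench/ldmatrix/ldmatrix_ILP
make ILP=4        # builds ldmatrix_ilp.app and copies it to microbench/bin
make run          # prints the measured ldmatrix latency and shared-memory throughput
```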
39 | 40 | 41 | ------------------------------------- 42 | 43 | ## References 44 | Some codes are borrowed from [Accel-Sim](https://github.com/accel-sim/accel-sim-framework) 45 | 46 | ## citations 47 | ``` 48 | @ARTICLE{9931992, 49 | author={Sun, Wei and Li, Ang and Geng, Tong and Stuijk, Sander and Corporaal, Henk}, 50 | journal={IEEE Transactions on Parallel and Distributed Systems}, 51 | title={Dissecting Tensor Cores via Microbenchmarks: Latency, Throughput and Numeric Behaviors}, 52 | year={2023}, 53 | volume={34}, 54 | number={1}, 55 | pages={246-261}, 56 | doi={10.1109/TPDS.2022.3217824}} 57 | ``` 58 | -------------------------------------------------------------------------------- /microbench/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.csv 3 | ubench/atomics/Atomic_add_bw/atomic_add_bw 4 | ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict 5 | ubench/atomics/Atomic_add_lat/atomic_add_lat 6 | ubench/core/MaxFlops_double/MaxFlops_double 7 | ubench/core/MaxFlops_float/MaxFlops_float 8 | ubench/core/MaxFlops_half/MaxFlops_half 9 | ubench/core/MaxFlops_int32/MaxFlops_int32 10 | ubench/core/config_dpu/config_dpu 11 | ubench/core/config_fpu/config_fpu 12 | ubench/core/config_int/config_int 13 | ubench/core/config_sfu/config_sfu 14 | ubench/core/config_tensor/config_tensor 15 | ubench/core/config_udp/config_udp 16 | ubench/core/core_config/core_config 17 | ubench/core/lat_double/lat_double 18 | ubench/core/lat_float/lat_float 19 | ubench/core/lat_half/lat_half 20 | ubench/core/lat_int32/lat_int32 21 | ubench/core/regfile_bw/regfile_bw 22 | ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt 23 | ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt 24 | ubench/core/tensor_bw_half/tensor_bw_half 25 | ubench/core/tensor_lat_half/tensor_lat_half 26 | ubench/l1_cache/l1_access_grain/l1_access_grain 27 | ubench/l1_cache/l1_adaptive/l1_adaptive 28 | ubench/l1_cache/l1_associativity/l1_associativity 29 | ubench/l1_cache/l1_banks/l1_banks 30 | ubench/l1_cache/l1_bw_128/l1_bw_128 31 | ubench/l1_cache/l1_bw_32f/l1_bw_32f 32 | ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll 33 | ubench/l1_cache/l1_bw_64f/l1_bw_64f 34 | ubench/l1_cache/l1_bw_64v/l1_bw_64v 35 | ubench/l1_cache/l1_config/l1_config 36 | ubench/l1_cache/l1_lat/l1_lat 37 | ubench/l1_cache/l1_mshr/l1_mshr 38 | ubench/l1_cache/l1_sector/l1_sector 39 | ubench/l1_cache/l1_shared_bw/l1_shared_bw 40 | ubench/l1_cache/l1_write_policy/l1_write_policy 41 | ubench/l2_cache/l2_access_grain/l2_access_grain 42 | ubench/l2_cache/l2_bw_128/l2_bw_128 43 | ubench/l2_cache/l2_bw_32f/l2_bw_32f 44 | ubench/l2_cache/l2_bw_64f/l2_bw_64f 45 | ubench/l2_cache/l2_config/l2_config 46 | ubench/l2_cache/l2_copy_engine/l2_copy_engine 47 | ubench/l2_cache/l2_lat/l2_lat 48 | ubench/l2_cache/l2_write_policy/l2_write_policy 49 | ubench/mem/mem_atom_size/mem_atom_size 50 | ubench/mem/mem_bw/mem_bw 51 | ubench/mem/mem_config/mem_config 52 | ubench/mem/mem_lat/mem_lat 53 | ubench/shd/shared_bw/shared_bw 54 | ubench/shd/shared_bw_64/shared_bw_64 55 | ubench/shd/shared_lat/shared_lat 56 | ubench/shd/shd_config/shd_config 57 | ubench/system/deviceQuery/deviceQuery 58 | ubench/system/kernel_lat/kernel_lat 59 | ubench/system/system_config/system_config 60 | ubench/system/list_devices/list_devices -------------------------------------------------------------------------------- /microbench/Makefile: -------------------------------------------------------------------------------- 1 | 2 | BASE_DIR := $(shell pwd) 3 | BIN_DIR := 
$(BASE_DIR)/bin 4 | SUB_DIRS = $(wildcard ubench/*/*/) 5 | # SUB_DIRS = $(wildcard ubench/ldmatrix/*/) 6 | # SUB_DIRS = $(wildcard ubench/mma/*/) 7 | SUB_DIRS_ALL = $(SUB_DIRS:%=all-%) 8 | SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%) 9 | 10 | all: create_dir $(SUB_DIRS_ALL) 11 | 12 | clean: delete_dir $(SUB_DIRS_CLEAN) 13 | 14 | $(SUB_DIRS_ALL): 15 | $(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%) 16 | 17 | $(SUB_DIRS_CLEAN): 18 | $(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean 19 | 20 | create_dir: 21 | mkdir -p $(BIN_DIR) 22 | 23 | delete_dir: 24 | cd $(BIN_DIR); rm -f *.app 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /microbench/appendix/Makefile: -------------------------------------------------------------------------------- 1 | CC = nvcc 2 | FLAG =-gencode=arch=compute_86,code=\"sm_86,compute_86\" -lcudart 3 | 4 | all: pipeline_mma \ 5 | baseline_mma \ 6 | permuted_mma 7 | 8 | 9 | baseline_mma: ./mma_baseline.cu 10 | $(CC) $(FLAG) -o $@.out $^ 11 | 12 | 13 | pipeline_mma: ./mma_pipeline.cu 14 | $(CC) $(FLAG) -o $@.out $^ 15 | 16 | 17 | permuted_mma: ./mma_permuted.cu 18 | $(CC) $(FLAG) -o $@.out $^ 19 | 20 | 21 | clean: 22 | rm -rf *.out -------------------------------------------------------------------------------- /microbench/bin/README.md: -------------------------------------------------------------------------------- 1 | #Programs -------------------------------------------------------------------------------- /microbench/common/common.mk: -------------------------------------------------------------------------------- 1 | BASE_DIR := $(shell pwd) 2 | BIN_DIR := $(BASE_DIR)/../../../bin/ 3 | 4 | GENCODE_SM70 ?= -gencode=arch=compute_70,code=\"sm_70,compute_70\" # V100 5 | GENCODE_SM75 ?= -gencode=arch=compute_75,code=\"sm_75,compute_75\" # Turing 6 | GENCODE_SM80 ?= -gencode=arch=compute_80,code=\"sm_80,compute_80\" # A100 7 | GENCODE_SM86 ?= -gencode=arch=compute_86,code=\"sm_86,compute_86\" # RTX30 8 | GENCODE_SM89 ?= -gencode=arch=compute_86,code=\"sm_89,compute_89\" # RTX30 9 | 10 | 11 | TargetSM ?= 89 12 | GENCODE_SM = -gencode=arch=compute_${TargetSM},code=\"sm_${TargetSM},compute_${TargetSM}\" 13 | CUOPTS = $(GENCODE_ARCH) $(GENCODE_SM) 14 | 15 | CC := nvcc 16 | 17 | CUDA_PATH ?= /usr/local/cuda/ 18 | INCLUDE := $(CUDA_PATH)/samples/common/inc/ 19 | LIB := 20 | ILP ?= 1 21 | ITERS ?= 999 22 | MEAN ?= 0.0 23 | STDDEV ?= 1.0 24 | release: 25 | $(CC) $(NVCC_FLGAS) --define-macro ILPconfig=$(ILP),ITERS=$(ITERS),MEAN=$(MEAN),STDDEV=$(STDDEV) $(CUOPTS) $(SRC) -o $(EXE) -I $(INCLUDE) -L $(LIB) -lcudart 26 | cp $(EXE) $(BIN_DIR) 27 | 28 | # clean: 29 | # rm -f *.o; rm -f $(EXE) 30 | 31 | clean: 32 | rm -f *.app *.txt 33 | 34 | run: 35 | ./$(EXE) 36 | 37 | profile: 38 | nv-nsight-cu-cli --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.per_second,smsp__sass_average_data_bytes_per_wavefront_mem_shared.pct,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active ./$(EXE) 39 | 40 | profile_bank: 41 | nv-nsight-cu-cli --metrics l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,sm__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,sm__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ldsm.sum,smsp__sass_average_data_bytes_per_wavefront_mem_shared ./$(EXE) 42 | 43 | profile_lsu_mio: 44 | nv-nsight-cu-cli --metrics 
smsp__average_warp_latency_issue_stalled_lg_throttle.ratio,smsp__average_warp_latency_issue_stalled_mio_throttle.ratio,smsp__average_warp_latency_issue_stalled_short_scoreboard.ratio ./$(EXE) 45 | 46 | 47 | # smsp__average_warps_issue_stalled_mio_throttle_per_issue_active.ratio 48 | # sm__inst_executed_pipe_lsu.sum 49 | # smsp__average_inst_executed_pipe_lsu_per_warp.ratio 50 | 51 | profile_smem: 52 | nv-nsight-cu-cli --metrics l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,sm__sass_data_bytes_mem_shared_op_ld.sum,sm__inst_executed_pipe_lsu.sum,sm__sass_l1tex_pipe_lsu_wavefronts_mem_shared.sum,sm__sass_l1tex_data_pipe_lsu_wavefronts_mem_shared_op_ld.sum ./$(EXE) 53 | 54 | events: 55 | nvprof --events elapsed_cycles_sm ./$(EXE) 56 | 57 | profileall: 58 | nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE) 59 | 60 | nvsight: 61 | nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv 62 | 63 | ptx: 64 | cuobjdump -ptx ./$(EXE) tee ptx.txt 65 | 66 | sass: 67 | cuobjdump -sass ./$(EXE) tee sass.txt 68 | -------------------------------------------------------------------------------- /microbench/hw_def/common/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #define ACCEL_SIM_MODE 1 11 | 12 | enum issue_model { single = 1, dual = 2 }; 13 | 14 | static const char *issue_model_str[] = {"none", "single", "dual"}; 15 | 16 | enum core_model { shared = 0, subcore = 1 }; 17 | 18 | static const char *core_model_str[] = {"none", "shared", "subcore"}; 19 | 20 | enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 }; 21 | 22 | // GPU error check 23 | #define gpuErrchk(ans) \ 24 | { gpuAssert((ans), __FILE__, __LINE__); } 25 | inline void gpuAssert(cudaError_t code, const char *file, int line, 26 | bool abort = true) { 27 | if (code != cudaSuccess) { 28 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, 29 | line); 30 | if (abort) 31 | exit(code); 32 | } 33 | } 34 | 35 | // source: 36 | // https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2 37 | unsigned round_up_2n(unsigned v) { 38 | v--; 39 | v |= v >> 1; 40 | v |= v >> 2; 41 | v |= v >> 4; 42 | v |= v >> 8; 43 | v |= v >> 16; 44 | v++; 45 | 46 | return v; 47 | } 48 | 49 | unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); } 50 | 51 | bool isPowerOfTwo(int n) { 52 | if (n == 0) 53 | return false; 54 | 55 | return (ceil(log2(n)) == floor(log2(n))); 56 | } 57 | 58 | static const char *dram_model_str[] = {"none", "GDDR5", "GDDR5X", "GDDR6", 59 | "HBM"}; 60 | static const unsigned dram_model_bus_width[] = {0, 32, 32, 16, 128}; // in bits 61 | 
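// Worked example of the two helper formulas defined further down in this header,
// using the dram_model_* tables (values below are derived from those tables, not new data):
//   atom size = (bus_width / 8) * mem_per_ctrlr * burst_length
//     GDDR5: (32/8)*1*8 = 32 B    GDDR6: (16/8)*1*16 = 32 B    HBM: (128/8)*1*2 = 32 B
//   adjusted CCD = burst_length / freq_ratio
//     GDDR5: 8/4 = 2              HBM: 2/2 = 1   (matching the CCD fields of the timing structs below)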
static const unsigned dram_model_mem_per_ctrlr[] = {0, 1, 1, 1, 1}; 62 | static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2}; 63 | static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2}; 64 | // atom size = 65 | // dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length 66 | unsigned get_atom_size_inByte(enum dram_model model) { 67 | return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] * 68 | dram_model_burst_length[model]; 69 | } 70 | // CCD = dram_model_burst_length/dram_model_freq_ratio 71 | unsigned get_adjusted_CCD(enum dram_model model) { 72 | assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0); 73 | return dram_model_burst_length[model] / dram_model_freq_ratio[model]; 74 | } 75 | 76 | unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { 77 | unsigned channel_width = 78 | dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model]; 79 | assert(total_memory_width % channel_width == 0); 80 | return total_memory_width / channel_width; 81 | } 82 | 83 | // DDR timing struct 84 | struct DDR_Timing { 85 | unsigned freq; 86 | unsigned nbk; 87 | unsigned CCD; 88 | unsigned RRD; 89 | unsigned RCD; 90 | unsigned RAS; 91 | unsigned RP; 92 | unsigned RC; 93 | unsigned CL; 94 | unsigned WL; 95 | unsigned CDLR; 96 | unsigned WR; 97 | unsigned nbkgrp; 98 | unsigned CCDL; 99 | unsigned RTPL; 100 | 101 | DDR_Timing(unsigned mfreq, unsigned n_bk, unsigned tCCD, unsigned tRRD, 102 | unsigned tRCD, unsigned tRAS, unsigned tRP, unsigned tRC, 103 | unsigned tCL, unsigned tWL, unsigned tCDLR, unsigned tWR, 104 | unsigned n_bkgrp, unsigned tCCDL, unsigned tRTPL) { 105 | freq = mfreq; 106 | nbk = n_bk; 107 | CCD = tCCD; 108 | RRD = tRRD; 109 | RCD = tRCD; 110 | RAS = tRAS; 111 | RP = tRP; 112 | RC = tRC; 113 | CL = tCL; 114 | WL = tWL; 115 | CDLR = tCDLR; 116 | WR = tWR; 117 | nbkgrp = n_bkgrp; 118 | CCDL = tCCDL; 119 | RTPL = tRTPL; 120 | } 121 | 122 | void scale_timing_for_new_freq(float newfreq) { 123 | float freq_scale = freq / newfreq; 124 | RRD = ceil(RRD / freq_scale); 125 | RCD = ceil(RCD / freq_scale); 126 | RAS = ceil(RAS / freq_scale); 127 | RP = ceil(RP / freq_scale); 128 | RC = ceil(RC / freq_scale); 129 | CL = ceil(CL / freq_scale); 130 | WL = ceil(WL / freq_scale); 131 | CDLR = ceil(CDLR / freq_scale); 132 | WR = ceil(WR / freq_scale); 133 | CCDL = ceil(CCDL / freq_scale); 134 | RTPL = ceil(RTPL / freq_scale); 135 | } 136 | }; 137 | 138 | // GDDR5 timing from hynix H5GQ1H24AFR 139 | //-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40: 140 | // CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2" 141 | 142 | static const DDR_Timing GDDR5_Timing_1800MHZ(1800, 16, 2, 6, 12, 28, 12, 40, 12, 143 | 4, 5, 12, 4, 3, 2); 144 | 145 | // HBM timing are adopted from hynix JESD235 standered and nVidia HPCA 2017 146 | // paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) 147 | // Timing for 1 GHZ: 148 | //-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: 149 | // CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" 150 | 151 | static const DDR_Timing HBM_Timing_1000MHZ(1000, 16, 1, 4, 14, 33, 14, 47, 14, 152 | 2, 3, 12, 4, 2, 4); 153 | 154 | #endif -------------------------------------------------------------------------------- /microbench/hw_def/common/deviceQuery.h: -------------------------------------------------------------------------------- 1 | #ifndef DEVICE_QUERY_H 2 | #define DEVICE_QUERY_H 3 | 4 | #include 5 | unsigned CLK_FREQUENCY; 6 | unsigned 
SM_NUMBER; // number of SMs 7 | unsigned WARP_SIZE; // max threads per warp 8 | unsigned MAX_THREADS_PER_SM; // max threads / sm 9 | unsigned MAX_SHARED_MEM_SIZE; // Max configerable shared memory size in bytes 10 | unsigned MAX_WARPS_PER_SM; // max warps / sm 11 | unsigned MAX_REG_PER_SM; // max warps / sm 12 | 13 | unsigned MAX_THREAD_BLOCK_SIZE; // max threads per threadblock 14 | unsigned MAX_SHARED_MEM_SIZE_PER_BLOCK; // Max configerable shared memory size 15 | // per block in bytes 16 | unsigned 17 | MAX_REG_PER_BLOCK; // Max configerable shared memory size per block in bytes 18 | 19 | size_t L2_SIZE; // L2 size in bytes 20 | 21 | size_t MEM_SIZE; // Memory size in bytes 22 | unsigned MEM_CLK_FREQUENCY; // Memory clock freq in MHZ 23 | unsigned MEM_BITWIDTH; // Memory bit width 24 | 25 | // launched threadblocks 26 | unsigned THREADS_PER_BLOCK; 27 | unsigned BLOCKS_PER_SM; 28 | unsigned THREADS_PER_SM; 29 | unsigned BLOCKS_NUM; 30 | unsigned TOTAL_THREADS; 31 | 32 | cudaDeviceProp deviceProp; 33 | 34 | unsigned intilizeDeviceProp(unsigned deviceID) { 35 | cudaSetDevice(deviceID); 36 | cudaGetDeviceProperties(&deviceProp, deviceID); 37 | 38 | CLK_FREQUENCY = deviceProp.clockRate; 39 | // core stats 40 | SM_NUMBER = deviceProp.multiProcessorCount; 41 | MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor; 42 | MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor; 43 | WARP_SIZE = deviceProp.warpSize; 44 | MAX_WARPS_PER_SM = 45 | deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize; 46 | MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor; 47 | 48 | // threadblock stats 49 | MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock; 50 | MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock; 51 | MAX_REG_PER_BLOCK = deviceProp.regsPerBlock; 52 | 53 | // launched thread blocks to ensure GPU is fully occupied as much as possible 54 | THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock; 55 | BLOCKS_PER_SM = 56 | deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock; 57 | THREADS_PER_SM = BLOCKS_PER_SM * THREADS_PER_BLOCK; 58 | BLOCKS_NUM = BLOCKS_PER_SM * SM_NUMBER; 59 | TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 60 | 61 | // L2 cache 62 | L2_SIZE = deviceProp.l2CacheSize; 63 | 64 | // memory 65 | MEM_SIZE = deviceProp.totalGlobalMem; 66 | MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f; 67 | MEM_BITWIDTH = deviceProp.memoryBusWidth; 68 | 69 | return 1; 70 | } 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /microbench/hw_def/hw_def.h: -------------------------------------------------------------------------------- 1 | #ifndef HW_DEF_H 2 | #define HW_DEF_H 3 | 4 | #include 5 | #include 6 | #include "./common/common.h" 7 | #include "./common/deviceQuery.h" 8 | 9 | // note this is just fake meta data, used for performance microbenchmarking 10 | void initialize_fake_metadata_2_4(uint32_t* metadata, int row_size, int col_size) { 11 | int range = 6; 12 | uint32_t FourToTwoMeta[6] = { 0x4, 0x8, 0x9, 0xc, 0xd, 0xe }; 13 | for (int i = 0; i < row_size * col_size / 16; i++) { // 32 bit can represent 16 indexes , each index has 2 bit 14 | uint32_t result = 0x0; 15 | for (int n = 0; n < 32 / 4; ++n) { 16 | double rnd = double(std::rand()) / double(RAND_MAX); 17 | rnd = range * rnd; 18 | uint32_t meta = FourToTwoMeta[(int)rnd]; 19 | 20 | result = (uint32_t)(result | ((uint32_t)(meta << (i * 4)))); 21 | } 22 | metadata[i] = result; 23 | } 24 | } 25 | 26 | __forceinline__ __device__ unsigned 
lane_id() 27 | { 28 | unsigned ret; 29 | asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 30 | return ret; 31 | } 32 | 33 | __forceinline__ __device__ unsigned warp_id() 34 | { 35 | // this is not equal to threadIdx.x / 32 36 | unsigned ret; 37 | asm volatile ("mov.u32 %0, %warpid;" : "=r"(ret)); 38 | return ret; 39 | } 40 | 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/bf16_numeric/Makefile: -------------------------------------------------------------------------------- 1 | SRC = bf16_chain_matmul.cu 2 | 3 | EXE = bf16_chain_matmul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/bf16add/Makefile: -------------------------------------------------------------------------------- 1 | SRC = bf16add.cu 2 | 3 | EXE = bf16add.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/bf16mul/Makefile: -------------------------------------------------------------------------------- 1 | SRC = bf16mul.cu 2 | 3 | EXE = bf16mul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/m16n8k16/Makefile: -------------------------------------------------------------------------------- 1 | SRC = m16n8k16_bf16.cu 2 | 3 | EXE = m16n8k16_bf16.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/bf16numeric/m16n8k16/m16n8k16_bf16.cu: -------------------------------------------------------------------------------- 1 | // simple gemm using bf16/half data types 2 | // we do not target on optimal overall performance, so we will not use software pipepline 3 | // pipepline or asychronous copy can speed up gemm further with cost of extra shared memory storage 4 | // CUTLASS provides good examples of how to implement pipeline for gemm 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../../../hw_def/hw_def.h" 15 | #include "../../cpu_base.h" 16 | 17 | typedef __nv_bfloat16 op_AB; 18 | typedef float op_CD; 19 | 20 | 21 | #ifndef ITERS 22 | #define ITERS (1024 ) 23 | #endif 24 | 25 | #define ROUNDS (ITERS*10 ) 26 | 27 | const int inst_m = 16; 28 | const int inst_n = 8; 29 | const int inst_k = 16; 30 | 31 | __forceinline__ __device__ unsigned lane_id_() 32 | { 33 | unsigned ret; 34 | asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 35 | return ret; 36 | } 37 | 38 | 39 | 40 | __global__ void gemm_m16n8k16_kernel(op_AB* MatA,op_AB* MatB,op_CD* MatC, op_CD* MatD ){ 41 | //uint32_t tid = threadIdx.x; 42 | //uint32_t gid = blockIdx.x * blockDim.x + tid;//global at this block 43 | //uint32_t warpid = gid / warpSize; 44 | uint32_t lane_id = lane_id_(); 45 | // four threads per group, group id 46 | uint32_t group_id = lane_id >>2; 47 | uint32_t tid_in_group = lane_id % 4; 48 | 49 | // m16 n8 k16 50 | op_AB frag_A[8]; // 16 * 16 / 32 = 8 * bf16 51 | op_AB frag_B[4]; // 8 * 16 / 32 52 | op_CD frag_D[4]; // float , 
16*8 /32 = 4*float 53 | // load operand fragA 54 | #pragma unroll 55 | for(int i =0; i < 8; i++){ 56 | uint32_t row_a = 0; 57 | if( (i>=0 && i<2) || (i>=4 && i<6) ){ 58 | row_a = group_id; 59 | }else{ 60 | row_a = group_id + 8; 61 | } 62 | 63 | uint32_t col_a = 0; 64 | if(i<4){ 65 | col_a = (tid_in_group * 2) + (i & 0x1); 66 | }else{ 67 | col_a = (tid_in_group * 2) + (i & 0x1) + 8; 68 | } 69 | // row major 70 | frag_A[i] = MatA[inst_k*row_a + col_a]; 71 | 72 | } 73 | // for(int i =0; i < 8; i++){ 74 | // printf("laneId = %d, fragA[%d] = %f \n", lane_id, i, float(frag_A[i])); 75 | // } 76 | 77 | // load operand fragB, MatB has to be col-major 78 | #pragma unroll 79 | for(int i =0; i < 4; i++){ 80 | uint32_t row_b = 0; 81 | if( i < 2 ){ 82 | row_b = (tid_in_group * 2) + (i & 0x1); 83 | }else{ 84 | row_b = (tid_in_group * 2) + (i & 0x1)+8; 85 | } 86 | uint32_t col_b = group_id; 87 | // row-major B 88 | frag_B[i] = MatB[row_b*inst_n + col_b]; 89 | } 90 | 91 | // for(int i =0; i < 4; i++){ 92 | // printf("laneId = %d, fragB[%d] = %f \n", lane_id, i, float(frag_B[i])); 93 | // } 94 | 95 | // load operand fragC, MatC has to be row-major 96 | #pragma unroll 97 | for(int i =0; i < 4; i++){ 98 | uint32_t row_c = 0; 99 | if( i < 2 ){ 100 | row_c = group_id; 101 | }else{ 102 | row_c = group_id + 8; 103 | } 104 | uint32_t col_c = (tid_in_group * 2) + (i & 0x1); 105 | // row-major 106 | frag_D[i] = MatC[inst_n*row_c + col_c]; 107 | } 108 | 109 | // printf("\n\n"); 110 | // for(int i =0; i < 4; i++){ 111 | // printf("laneId = %d, fragC[%d] = %f \n", lane_id, i, float(frag_D[i])); 112 | // } 113 | 114 | //step 1: load data 115 | // MatA => frag_A, MatB => frag_B, MatC => frag_C 116 | 117 | 118 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 119 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 120 | float *C = reinterpret_cast(&frag_D[0]); 121 | float *D = C; // D = A*B + D. 
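// Register mapping for the mma.sync.m16n8k16 instruction below (per thread in the warp):
//   A: 8 bf16 values packed into 4 x 32-bit registers -> operands %4-%7
//   B: 4 bf16 values packed into 2 x 32-bit registers -> operands %8-%9
//   C/D: 4 fp32 accumulators -> inputs %10-%13, outputs %0-%3 (D aliases C here)
// The instruction computes D = A*B + C with fp32 accumulation.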
122 | 123 | asm volatile( 124 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 125 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 126 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 127 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 128 | "r"(B[0]), "r"(B[1]), 129 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 130 | ); 131 | 132 | __syncwarp(); 133 | 134 | // store back result 135 | // printf("\n\n"); 136 | // for(int i =0; i < 4; i++){ 137 | // printf("laneId = %d, fragD[%d] = %f \n", lane_id, i, float(frag_D[i])); 138 | // } 139 | #pragma unroll 140 | for(int i =0; i < 4; i++){ 141 | uint32_t row_d = 0; 142 | if( i < 2 ){ 143 | row_d = group_id; 144 | }else{ 145 | row_d = group_id + 8; 146 | } 147 | uint32_t col_d = (tid_in_group * 2) + (i & 0x1); 148 | // row-major 149 | MatD[inst_n*row_d + col_d] = frag_D[i]; 150 | } 151 | 152 | } 153 | 154 | 155 | std::vector gemm_m16n8k16_bf16(){ 156 | int BLOCKS_NUM = 1; 157 | int nwarps = 1; 158 | int warp_size = 32; 159 | 160 | 161 | unsigned total_A_SIZE = inst_m*inst_k*nwarps; 162 | unsigned total_B_SIZE = inst_k*inst_n*nwarps; 163 | unsigned total_C_SIZE = inst_m*inst_n*nwarps; 164 | 165 | 166 | op_AB *host_matA = (op_AB *)malloc(total_A_SIZE * sizeof(op_AB)); 167 | op_AB *host_matB = (op_AB *)malloc(total_B_SIZE * sizeof(op_AB)); 168 | 169 | op_CD *host_matC = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 170 | op_CD *host_matD = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 171 | std::random_device rd{}; 172 | std::mt19937 gen{rd()}; 173 | std::normal_distribution<> random_gen{-1.0,1.0}; 174 | // initialize A, row-major 175 | float *host_matA_cpu = (float *)malloc(total_A_SIZE * sizeof(float)); 176 | float *host_matB_cpu = (float *)malloc(total_B_SIZE * sizeof(float)); 177 | for(int r = 0; r < inst_m; r ++){ 178 | for(int c = 0; c < inst_k; c ++){ 179 | //float rnd = (float)(r*inst_k+c); 180 | float rnd = (float)random_gen(gen); 181 | host_matA_cpu[r*inst_k+c] = rnd; 182 | host_matA[r*inst_k+c] = (op_AB)rnd; 183 | } 184 | } 185 | // std::cout<<"print MatA" <>>(dev_matA,dev_matB,dev_matC,dev_matD); 234 | gpuErrchk(cudaPeekAtLastError()); 235 | 236 | gpuErrchk(cudaMemcpy(host_matD, dev_matD, total_C_SIZE * sizeof(op_CD), cudaMemcpyDeviceToHost)); 237 | 238 | //check errors 239 | double l1_norm = 0.0; 240 | double abs_err = 0.0; 241 | double l2_relative_err = 0.0; 242 | compute_diff_l1_norm(cpu_res_baseline,host_matD,inst_m,inst_n,abs_err,l1_norm); 243 | compute_diff_l2_norm(cpu_res_baseline,host_matD,inst_m,inst_n,l2_relative_err); 244 | std::vector errors{abs_err,l1_norm,abs_err/inst_k,l1_norm/inst_k,l2_relative_err}; 245 | 246 | return errors; 247 | } 248 | 249 | 250 | 251 | 252 | int main(){ 253 | std::cout<<"***********************************"< errors = gemm_m16n8k16_bf16(); 264 | avg_abs_err += errors[0]; 265 | avg_l1_norm += errors[1]; 266 | avg_abs_err_FMA += errors[2]; 267 | avg_l1_norm_FMA += errors[3]; 268 | l2_relative += errors[4]; 269 | } 270 | 271 | 272 | // std::cout<<"element-wise avg_abs_err = " << avg_abs_err/ITERS < 99 | // struct gemmCPU 100 | // { 101 | // /* data */ 102 | // }; 103 | 104 | 105 | -------------------------------------------------------------------------------- /microbench/numericbench/cpu_int_base.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #pragma once 6 | // check the difference of two matrix 7 | void compute_diff_l1_norm(int* cpu_base, int* gpu_res, int rows, int cols,double& 
abs_err, double& l1_norm){ 8 | 9 | // l1 norm : |gpu_res[i] - cpu_res[i]|/|gpu_res[i| 10 | // double l1_norm = 0.0; 11 | // double abs = 0.0; 12 | for(int row =0; row< rows; row ++){ 13 | for(int col =0; col < cols; col ++){ 14 | int gid = col + row*cols; 15 | abs_err += std::abs(gpu_res[gid] - cpu_base[gid]); 16 | l1_norm += abs_err/std::abs(gpu_res[gid]); 17 | } 18 | } 19 | // l1_norm = l1_norm/(rows*cols); 20 | // abs_err = abs_err/(rows*cols); 21 | //return l1_norm/(rows*cols); 22 | }; 23 | 24 | 25 | 26 | void compute_diff_l2_norm(int* cpu_base, int* gpu_res, int rows, int cols, double& l2_norm){ 27 | 28 | // l1 norm : |gpu_res[i] - cpu_res[i]|/|gpu_res[i| 29 | // double l1_norm = 0.0; 30 | // double abs = 0.0; 31 | double tensor_diff_norm = 0.0; 32 | double tensor_gpu_norm = 0.0; 33 | for(int row =0; row< rows; row ++){ 34 | for(int col =0; col < cols; col ++){ 35 | int gid = col + row*cols; 36 | tensor_diff_norm += std::pow((double(gpu_res[gid]) - double(cpu_base[gid])) ,2 ) ; 37 | tensor_gpu_norm += std::pow(double(gpu_res[gid]),2 ); 38 | } 39 | } 40 | l2_norm = std::sqrt(tensor_diff_norm)/std::sqrt(tensor_gpu_norm); 41 | //l1_norm = l1_norm/(rows*cols); 42 | // abs_err = abs_err/(rows*cols); 43 | //return l1_norm/(rows*cols); 44 | }; 45 | 46 | 47 | template 48 | void gemm_mnk_cpu(datatypeIN* MatA,datatypeIN* MatB,datatypeOut* MatC, datatypeOut* MatD, int M, int N, int K){ 49 | 50 | // Matd = MatA * MatB + MatC 51 | for(int row=0; row < M; row++){ 52 | for(int col=0; col < N; col++){ 53 | int gid = col + row*N; 54 | datatypeOut tmp = 0; 55 | for(int inner=0; inner < K; inner++) 56 | { 57 | tmp += MatA[inner + row*K] * MatB[col + inner*N]; 58 | } 59 | MatD[gid] = tmp+ MatC[gid]; 60 | } 61 | } 62 | 63 | }; 64 | 65 | void print_mat(int* Mat, int rows, int cols){ 66 | 67 | for(int row = 0; row < rows; row++){ 68 | for(int col =0;col < cols; col++){ 69 | printf("%8d ", Mat[col + row*cols]); 70 | } 71 | std::cout< 99 | // struct gemmCPU 100 | // { 101 | // /* data */ 102 | // }; 103 | 104 | 105 | -------------------------------------------------------------------------------- /microbench/numericbench/fp16numeric/fp16_numeric/Makefile: -------------------------------------------------------------------------------- 1 | SRC = fp16_chain_matmul.cu 2 | 3 | EXE = fp16_chain_matmul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/fp16numeric/fp16add/Makefile: -------------------------------------------------------------------------------- 1 | SRC = fp16add.cu 2 | 3 | EXE = fp16add.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/fp16numeric/fp16mul/Makefile: -------------------------------------------------------------------------------- 1 | SRC = fp16mul.cu 2 | 3 | EXE = fp16mul.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/int8numeric/int8add/Makefile: -------------------------------------------------------------------------------- 1 | SRC = int8add.cu 2 | 3 | EXE = int8add.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk 
-------------------------------------------------------------------------------- /microbench/numericbench/tf32numeric/m16n8k4/Makefile: -------------------------------------------------------------------------------- 1 | SRC = m16n8k4_tf32.cu 2 | 3 | EXE = m16n8k4_tf32.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/numericbench/tf32numeric/m16n8k4/m16n8k4_tf32.cu: -------------------------------------------------------------------------------- 1 | // simple gemm using bf16/half data types 2 | // we do not target on optimal overall performance, so we will not use software pipepline 3 | // pipepline or asychronous copy can speed up gemm further with cost of extra shared memory storage 4 | // CUTLASS provides good examples of how to implement pipeline for gemm 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "../../../hw_def/hw_def.h" 16 | #include "../../cpu_base.h" 17 | 18 | typedef float op_AB; 19 | typedef float op_CD; 20 | 21 | 22 | #ifndef ITERS 23 | #define ITERS (1024 ) 24 | #endif 25 | 26 | #define ROUNDS (ITERS*10 ) 27 | 28 | const int inst_m = 16; 29 | const int inst_n = 8; 30 | const int inst_k = 4; 31 | 32 | // we want to know the numeric precision of PTX instruction - the lowerst programming interface. 33 | // Since higher-level applications are based on the PTX instruction, the numeric errors/differences higher-level applications are based on the ptx instruction. 34 | // __forceinline__ __device__ unsigned lane_id() 35 | // { 36 | // unsigned ret; 37 | // asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 38 | // return ret; 39 | // } 40 | 41 | __forceinline__ __device__ unsigned lane_id_() 42 | { 43 | unsigned ret; 44 | asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret)); 45 | return ret; 46 | } 47 | 48 | 49 | 50 | __global__ void gemm_m16n8k4_kernel(op_AB* MatA,op_AB* MatB,op_CD* MatC, op_CD* MatD ){ 51 | uint32_t lane_id = lane_id_(); 52 | // four threads per group, group id 53 | uint32_t group_id = lane_id >>2; 54 | uint32_t tid_in_group = lane_id % 4; 55 | 56 | // m16 n8 k16 57 | uint32_t frag_A[2]; // 16 * 16 / 32 = 8 * bf16 58 | uint32_t frag_B[1]; // 8 * 16 / 32 59 | op_CD frag_D[4]; // float , 16*8 /32 = 4*float 60 | // load operand fragA 61 | #pragma unroll 62 | for(int i =0; i < 2; i++){ 63 | uint32_t row_a = 0; 64 | uint32_t col_a = 0; 65 | if( i==0 ){ 66 | row_a = group_id; 67 | }else{ 68 | row_a = group_id + 8; 69 | } 70 | col_a = tid_in_group; 71 | // row major 72 | // Cvt Float - TF32 73 | asm("cvt.rna.tf32.f32 %0, %1;\n" : "=r"(frag_A[i]) : "f"(MatA[inst_k*row_a + col_a])); 74 | } 75 | #pragma unroll 76 | for(int i =0; i < 1; i++){ 77 | uint32_t row_b = tid_in_group ; 78 | uint32_t col_b = group_id; 79 | // row-major B 80 | asm("cvt.rna.tf32.f32 %0, %1;\n" : "=r"(frag_B[i]) : "f"(MatB[row_b*inst_n + col_b])); 81 | //frag_B[i] = (MatB[row_b*inst_n + col_b]); 82 | } 83 | 84 | #pragma unroll 85 | for(int i =0; i < 4; i++){ 86 | uint32_t row_c = 0; 87 | if( i < 2 ){ 88 | row_c = group_id; 89 | }else{ 90 | row_c = group_id + 8; 91 | } 92 | uint32_t col_c = (tid_in_group * 2) + (i & 0x1); 93 | // row-major 94 | frag_D[i] = MatC[inst_n*row_c + col_c]; 95 | } 96 | 97 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 98 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 
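// A and B were converted from fp32 to tf32 with cvt.rna.tf32.f32 above and stay packed in
// 32-bit registers: 2 registers per thread for the 16x4 A tile and 1 for the 4x8 B tile.
// The m16n8k4 mma below multiplies them and accumulates into the 4 fp32 values of frag_D.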
99 | float *C = reinterpret_cast(&frag_D[0]); 100 | float *D = C; // D = A*B + D. 101 | 102 | asm volatile( 103 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 104 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 105 | : "r"(A[0]), "r"(A[1]), 106 | "r"(B[0]), 107 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 108 | ); 109 | 110 | __syncwarp(); 111 | 112 | #pragma unroll 113 | for(int i =0; i < 4; i++){ 114 | uint32_t row_d = 0; 115 | if( i < 2 ){ 116 | row_d = group_id; 117 | }else{ 118 | row_d = group_id + 8; 119 | } 120 | uint32_t col_d = (tid_in_group * 2) + (i & 0x1); 121 | // row-major 122 | MatD[inst_n*row_d + col_d] = frag_D[i]; 123 | } 124 | } 125 | 126 | 127 | 128 | 129 | 130 | std::vector gemm_m16n8k8_bf16(){ 131 | int BLOCKS_NUM = 1; 132 | int nwarps = 1; 133 | int warp_size = 32; 134 | 135 | 136 | unsigned total_A_SIZE = inst_m*inst_k*nwarps; 137 | unsigned total_B_SIZE = inst_k*inst_n*nwarps; 138 | unsigned total_C_SIZE = inst_m*inst_n*nwarps; 139 | 140 | 141 | op_AB *host_matA = (op_AB *)malloc(total_A_SIZE * sizeof(op_AB)); 142 | op_AB *host_matB = (op_AB *)malloc(total_B_SIZE * sizeof(op_AB)); 143 | 144 | op_CD *host_matC = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 145 | op_CD *host_matD = (op_CD *)malloc(total_C_SIZE * sizeof(op_CD)); 146 | std::random_device rd{}; 147 | std::mt19937 gen{rd()}; 148 | std::normal_distribution<> random_gen{-1.0,1.0}; 149 | // initialize A, row-major 150 | float *host_matA_cpu = (float *)malloc(total_A_SIZE * sizeof(float)); 151 | float *host_matB_cpu = (float *)malloc(total_B_SIZE * sizeof(float)); 152 | for(int r = 0; r < inst_m; r ++){ 153 | for(int c = 0; c < inst_k; c ++){ 154 | //float rnd = (float)(r*inst_k+c); 155 | float rnd = (float)random_gen(gen); 156 | host_matA_cpu[r*inst_k+c] = rnd; 157 | host_matA[r*inst_k+c] = (op_AB)rnd; 158 | } 159 | } 160 | // std::cout<<"print MatA" <>>(dev_matA,dev_matB,dev_matC,dev_matD); 209 | gpuErrchk(cudaPeekAtLastError()); 210 | 211 | gpuErrchk(cudaMemcpy(host_matD, dev_matD, total_C_SIZE * sizeof(op_CD), cudaMemcpyDeviceToHost)); 212 | 213 | //check errors 214 | double l1_norm = 0.0; 215 | double abs_err = 0.0; 216 | double l2_relative_err = 0.0; 217 | compute_diff_l1_norm(cpu_res_baseline,host_matD,inst_m,inst_n,abs_err,l1_norm); 218 | compute_diff_l2_norm(cpu_res_baseline,host_matD,inst_m,inst_n,l2_relative_err); 219 | 220 | // std::cout<<"print cpu_res_baseline" < errors = gemm_m16n8k8_bf16(); 253 | avg_abs_err += errors[0]; 254 | avg_l1_norm += errors[1]; 255 | avg_abs_err_FMA += errors[2]; 256 | avg_l1_norm_FMA += errors[3]; 257 | l2_relative += errors[4]; 258 | } 259 | 260 | // std::cout<<"element-wise error :"<> ${SCRIPT_DIR}/A100-ILP"${ILPconfig}".log 20 | echo "/////////////////////////////////" 21 | done 22 | done 23 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/ldmatrix_ILP/Makefile: -------------------------------------------------------------------------------- 1 | SRC = ldmatrix_ilp.cu 2 | 3 | EXE = ldmatrix_ilp.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/ldmatrix_ILP/ldmatrix_ilp.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (48 * 
1024 / 4) // 32 KB 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | //#define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (999 ) 15 | #endif 16 | 17 | // #define ILPconfig 2 // Moved to nvcc flags 18 | 19 | #ifndef ILPconfig 20 | #define ILPconfig 1 21 | #endif 22 | 23 | static_assert(ILPconfig<=8,"ILP > 8 is not supported\n"); 24 | 25 | 26 | typedef uint32_t shared_m; 27 | // Measure latency of ITERS ldmatrix.x1 28 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 29 | shared_m *dsink, uint32_t stride) { 30 | 31 | // thread index 32 | uint32_t tid = threadIdx.x; 33 | uint32_t bid = blockIdx.x; 34 | uint32_t uid = bid * blockDim.x + tid; 35 | uint32_t n_threads = blockDim.x * gridDim.x; 36 | 37 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 38 | 39 | // one thread to initialize the pointer-chasing array 40 | if(uid == 0){ 41 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 42 | s[i] = (i )*16 % (1024); // s[i] is multiple of 16, because addree is aligned with 4 bytes 43 | } 44 | 45 | asm volatile("bar.sync 0;"); 46 | 47 | // if(uid == 0){ 48 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 49 | // printf("s[%d] = %d \t", i, s[i]); 50 | 51 | // } 52 | // printf("\n"); 53 | // } 54 | //if (uid == 0) { 55 | // initalize pointer chaser 56 | //unsigned x = threadIdx.x*4; 57 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 58 | 59 | //#if ILPconfig == 2 60 | 61 | 62 | 63 | 64 | unsigned addr2 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32) *4])); 65 | unsigned addr3 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 64) *4])); 66 | unsigned addr4 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 96) *4])); 67 | 68 | unsigned addr5 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*4) *4])); 69 | unsigned addr6 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*5) *4])); 70 | unsigned addr7 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*6) *4])); 71 | unsigned addr8 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*7) *4])); 72 | //printf("thread %d , addr = %d \n", tid, addr); 73 | // start timing 74 | uint32_t start = 0; 75 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 76 | 77 | 78 | //#pragma unroll 79 | for (uint32_t i = 0; i < ITERS; ++i) { 80 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr) : "r"(addr)); 81 | #if ILPconfig >= 2 82 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr2) : "r"(addr2)); 83 | #endif 84 | #if ILPconfig >= 3 85 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr3) : "r"(addr3)); 86 | #endif 87 | #if ILPconfig >= 4 88 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr4) : "r"(addr4)); 89 | #endif 90 | #if ILPconfig >= 5 91 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr5) : "r"(addr5)); 92 | #endif 93 | #if ILPconfig >= 6 94 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr6) : "r"(addr6)); 95 | #endif 96 | #if ILPconfig >= 7 97 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr7) : "r"(addr7)); 98 | #endif 99 | #if ILPconfig >= 8 100 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr8) : "r"(addr8)); 101 | #endif 102 | __syncwarp(); 103 | } 
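// Each ldmatrix result is written back into its own address register, so the ITERS
// iterations form a dependent (pointer-chasing) chain per address stream. With
// ILPconfig=1, (stop - start)/ITERS is the ldmatrix.x1 latency; with ILPconfig>1 the
// independent chains overlap, and the host code reports throughput as
// warps * 8*8 * 2 bytes * ILP / latency.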
104 | uint32_t stop = 0; 105 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 106 | 107 | //printf("thread %d , x = %d \n", tid, addr); 108 | 109 | // write time and data back to memory 110 | startClk[uid] = start; 111 | stopClk[uid] = stop; 112 | dsink[uid] = addr + addr2 ; 113 | dsink[uid] += addr3 + addr4 + addr5 + addr6 + addr7 + addr8; 114 | // dsink[uid] += addr5 + addr6 + addr7 + addr8; 115 | 116 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 117 | // printf("Shared Memory Latency = %f cycles\n", lat); 118 | //} 119 | } 120 | 121 | 122 | void test_with_different_thread(int THREADS_NUM, int ILP){ 123 | 124 | BLOCKS_NUM = 1; 125 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 126 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 127 | 128 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 129 | 130 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 131 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 132 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 133 | 134 | uint32_t *startClk_g; 135 | uint32_t *stopClk_g; 136 | shared_m *dsink_g; 137 | 138 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 139 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 140 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 141 | 142 | shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 143 | gpuErrchk(cudaPeekAtLastError()); 144 | //printf("pass kenerl \n"); 145 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 146 | cudaMemcpyDeviceToHost)); 147 | gpuErrchk( 148 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 149 | gpuErrchk( 150 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 151 | 152 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 153 | 154 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x1 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 155 | 156 | long num_bytes = (THREADS_NUM/32) * 8 * 8 * 2 * 1 * ILP; 157 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,3,4,5,6,8,12,16,20,24,28,32}; 175 | //std::vector warps = {4,8,12,16}; 176 | intilizeDeviceProp(0); 177 | std::cout<<"***********************************"< 1 is not supported\n"); 23 | 24 | // two way bank conflict - > 23 latenct 25 | // bank-conflict-free -> 25 latency 26 | 27 | typedef uint32_t shared_m; 28 | // Measure latency of ITERS ldmatrix.x1 29 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 30 | shared_m *dsink, uint32_t stride) { 31 | 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t bid = blockIdx.x; 35 | uint32_t uid = bid * blockDim.x + tid; 36 | uint32_t n_threads = blockDim.x * gridDim.x; 37 | 38 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 39 | 40 | // one thread to initialize the pointer-chasing array 41 | if(uid == 0){ 42 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 43 | s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 44 | } 45 | 46 | asm volatile("bar.sync 0;"); 47 | 48 | // if(uid == 0){ 49 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 50 | // printf("s[%d] = %d \t", i, s[i]); 51 | 52 | // } 53 | // printf("\n"); 54 | // } 55 | //if (uid == 0) { 56 | // initalize pointer chaser 57 | //unsigned x = threadIdx.x*4; 58 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 59 | //printf("thread %d , addr = %d \n", tid, addr); 60 | // start timing 61 | 
uint32_t start = 0; 62 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 63 | 64 | // pointer-chasing ITERS times 65 | //#pragma unroll 66 | for (uint32_t i = 0; i < ITERS; ++i) { 67 | asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(addr) : "r"(addr)); // first 11 68 | __syncwarp(); 69 | } 70 | //asm volatile("bar.sync 0;"); 71 | 72 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 73 | // stop timing 74 | uint32_t stop = 0; 75 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 76 | addr ++; 77 | //printf("thread %d , x = %d \n", tid, addr); 78 | 79 | // write time and data back to memory 80 | if(uid == 0){ 81 | 82 | startClk[uid] = start; 83 | stopClk[uid] = stop; 84 | dsink[uid] = addr; 85 | } 86 | 87 | 88 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 89 | // printf("Shared Memory Latency = %f cycles\n", lat); 90 | //} 91 | } 92 | 93 | 94 | void test_with_different_thread(int THREADS_NUM){ 95 | 96 | BLOCKS_NUM = 1; 97 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 98 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 99 | 100 | assert(SHARED_MEM_SIZE * sizeof(shared_m) < MAX_SHARED_MEM_SIZE_PER_BLOCK); 101 | 102 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 103 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 104 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 105 | 106 | uint32_t *startClk_g; 107 | uint32_t *stopClk_g; 108 | shared_m *dsink_g; 109 | 110 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 111 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 112 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 113 | 114 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, 1); 115 | gpuErrchk(cudaPeekAtLastError()); 116 | //printf("pass kenerl \n"); 117 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 118 | cudaMemcpyDeviceToHost)); 119 | gpuErrchk( 120 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 121 | gpuErrchk( 122 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 123 | 124 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 125 | 126 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x1 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 127 | 128 | long num_bytes = (THREADS_NUM/32) * 8 * 8 * 2 * 1; 129 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,4,8,16,32}; 147 | std::cout << "ldmatrix.x1 microbenchmark " < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (48 * 1024 / 4) // 32 KB 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | //#define THREADS_NUM 128 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | #ifndef ILPconfig 18 | #define ILPconfig 1 19 | #endif 20 | 21 | static_assert(ILPconfig<=6,"ILP > 6 is not implemented\n"); 22 | // two way bank conflict - > 23 latenct 23 | // bank-conflict-free -> 25 latency 24 | 25 | typedef uint32_t shared_m; 26 | // Measure latency of ITERS ldmatrix.x1 27 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 28 | shared_m *dsink, uint32_t stride) { 29 | 30 | // thread index 31 | uint32_t tid = threadIdx.x; 32 | uint32_t bid = blockIdx.x; 33 | uint32_t uid = bid * blockDim.x + tid; 34 | uint32_t n_threads = blockDim.x * gridDim.x; 35 | 36 | __shared__ shared_m 
s[SHARED_MEM_SIZE]; // static shared memory 37 | 38 | // one thread to initialize the pointer-chasing array 39 | if(uid == 0){ 40 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 41 | s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 42 | } 43 | 44 | asm volatile("bar.sync 0;"); 45 | 46 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 47 | unsigned addr_1 = 0; 48 | 49 | unsigned addr2 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32) *4])); 50 | unsigned addr2_1 = 0; 51 | 52 | unsigned addr3 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 64) *4])); 53 | unsigned addr3_1 = 0; 54 | 55 | unsigned addr4 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 96) *4])); 56 | unsigned addr4_1 = 0; 57 | 58 | unsigned addr5 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*4) *4])); 59 | unsigned addr5_1 = 0; 60 | 61 | unsigned addr6 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32*5) *4])); 62 | unsigned addr6_1 = 0; 63 | //printf("thread %d , addr = %d \n", tid, addr); 64 | // start timing 65 | uint32_t start = 0; 66 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 67 | 68 | // pointer-chasing ITERS times 69 | #pragma unroll 70 | for (uint32_t i = 0; i < ITERS; ++i) { 71 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr), "=r"(addr_1) : "r"(addr)); 72 | #if ILPconfig >= 2 73 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr2), "=r"(addr2_1) : "r"(addr2)); 74 | #endif 75 | #if ILPconfig >= 3 76 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr3), "=r"(addr3_1) : "r"(addr3)); 77 | #endif 78 | #if ILPconfig >= 4 79 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr4), "=r"(addr4_1) : "r"(addr4)); 80 | #endif 81 | 82 | #if ILPconfig >= 5 83 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr5), "=r"(addr5_1) : "r"(addr5)); 84 | #endif 85 | 86 | #if ILPconfig >= 6 87 | asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(addr6), "=r"(addr6_1) : "r"(addr6)); 88 | #endif 89 | 90 | 91 | 92 | __syncwarp(); 93 | } 94 | uint32_t stop = 0; 95 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 96 | 97 | //printf("thread %d , x = %d \n", tid, addr); 98 | 99 | // write time and data back to memory 100 | startClk[uid] = start; 101 | stopClk[uid] = stop; 102 | dsink[uid] = addr + addr_1; 103 | 104 | dsink[uid] += addr2 + addr2_1; 105 | 106 | dsink[uid] += addr3 + addr3_1; 107 | dsink[uid] += addr4 + addr4_1; 108 | dsink[uid] += addr5 + addr5_1; 109 | dsink[uid] += addr6 + addr6_1; 110 | 111 | 112 | } 113 | 114 | void test_with_different_thread(int THREADS_NUM, int ILP){ 115 | BLOCKS_NUM = 1; 116 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 117 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 118 | 119 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 120 | 121 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 122 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 123 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 124 | 125 | uint32_t *startClk_g; 126 | uint32_t *stopClk_g; 127 | shared_m *dsink_g; 128 | 129 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 130 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 131 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 132 | 133 | shared_lat<<<1, 
THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 134 | gpuErrchk(cudaPeekAtLastError()); 135 | //printf("pass kenerl \n"); 136 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 137 | cudaMemcpyDeviceToHost)); 138 | gpuErrchk( 139 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 140 | gpuErrchk( 141 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 142 | 143 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 144 | 145 | 146 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x2 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 147 | 148 | long num_bytes = (THREADS_NUM/32) * 8 * 8 * 2 * 2 * ILP; 149 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,4,6,8,12,16,20,24,28,32}; 167 | //std::vector warps = {4,8,12,16}; 168 | intilizeDeviceProp(0); 169 | //std::cout << "ldmatrix.x2 microbenchmark " < 8 is not implemented\n"); 23 | 24 | // two way bank conflict - > 23 latenct 25 | // bank-conflict-free -> 25 latency 26 | 27 | typedef uint32_t shared_m; 28 | // Measure latency of ITERS ldmatrix.x1 29 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 30 | shared_m *dsink, uint32_t stride) { 31 | 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t bid = blockIdx.x; 35 | uint32_t uid = bid * blockDim.x + tid; 36 | uint32_t n_threads = blockDim.x * gridDim.x; 37 | 38 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 39 | 40 | // one thread to initialize the pointer-chasing array 41 | if(uid == 0){ 42 | for (uint32_t i = 0; i < (SHARED_MEM_SIZE - stride); i ++) 43 | s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 44 | } 45 | 46 | 47 | asm volatile("bar.sync 0;"); 48 | 49 | // if(uid == 0){ 50 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 51 | // printf("s[%d] = %d \t", i, s[i]); 52 | 53 | // } 54 | // printf("\n"); 55 | // } 56 | //if (uid == 0) { 57 | // initalize pointer chaser 58 | //unsigned x = threadIdx.x*4; 59 | unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 60 | unsigned addr_1 = 0; 61 | unsigned addr_2 = 0; 62 | unsigned addr_3 = 0; 63 | 64 | unsigned addr2 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 32) * 4])); 65 | unsigned addr2_1 = 0; 66 | unsigned addr2_2 = 0; 67 | unsigned addr2_3 = 0; 68 | 69 | 70 | unsigned addr3 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 64) * 4])); 71 | unsigned addr3_1 = 0; 72 | unsigned addr3_2 = 0; 73 | unsigned addr3_3 = 0; 74 | 75 | 76 | unsigned addr4 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 96) * 4])); 77 | unsigned addr4_1 = 0; 78 | unsigned addr4_2 = 0; 79 | unsigned addr4_3 = 0; 80 | 81 | unsigned addr5 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 128) * 4])); 82 | unsigned addr5_1 = 0; 83 | unsigned addr5_2 = 0; 84 | unsigned addr5_3 = 0; 85 | 86 | unsigned addr6 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 160) * 4])); 87 | unsigned addr6_1 = 0; 88 | unsigned addr6_2 = 0; 89 | unsigned addr6_3 = 0; 90 | 91 | unsigned addr7 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 192) * 4])); 92 | unsigned addr7_1 = 0; 93 | unsigned addr7_2 = 0; 94 | unsigned addr7_3 = 0; 95 | 96 | unsigned addr8 = static_cast(__cvta_generic_to_shared(&s[(threadIdx.x + 224) * 4])); 97 | unsigned addr8_1 = 0; 98 | unsigned addr8_2 = 0; 99 | unsigned addr8_3 = 0; 100 | 101 | 102 | //printf("thread %d , addr = %d \n", tid, addr); 103 | // start timing 104 | uint32_t 
start = 0; 105 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 106 | // pointer-chasing ITERS times 107 | // #pragma unroll 108 | for (uint32_t i = 0; i < ITERS; ++i) { 109 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr), "=r"(addr_1),"=r"(addr_2),"=r"(addr_3) : "r"(addr)); 110 | #if ILPconfig >= 2 111 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr2), "=r"(addr2_1),"=r"(addr2_2),"=r"(addr2_3) : "r"(addr2)); 112 | #endif 113 | #if ILPconfig >= 3 114 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr3), "=r"(addr3_1),"=r"(addr3_2),"=r"(addr3_3) : "r"(addr3)); 115 | #endif 116 | #if ILPconfig >= 4 117 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr4), "=r"(addr4_1),"=r"(addr4_2),"=r"(addr4_3) : "r"(addr4)); 118 | #endif 119 | 120 | #if ILPconfig >= 5 121 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr5), "=r"(addr5_1),"=r"(addr5_2),"=r"(addr5_3) : "r"(addr5)); 122 | #endif 123 | 124 | #if ILPconfig >= 6 125 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr6), "=r"(addr6_1),"=r"(addr6_2),"=r"(addr6_3) : "r"(addr6)); 126 | #endif 127 | 128 | #if ILPconfig >= 7 129 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr7), "=r"(addr7_1),"=r"(addr7_2),"=r"(addr7_3) : "r"(addr7)); 130 | #endif 131 | 132 | #if ILPconfig >= 8 133 | asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr8), "=r"(addr8_1),"=r"(addr8_2),"=r"(addr8_3) : "r"(addr8)); 134 | #endif 135 | __syncwarp(); 136 | } 137 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 138 | // stop timing 139 | uint32_t stop = 0; 140 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 141 | 142 | //printf("thread %d , x = %d \n", tid, addr); 143 | 144 | // write time and data back to memory 145 | startClk[uid] = start; 146 | stopClk[uid] = stop; 147 | dsink[uid] = addr + addr_1 + addr_2 + addr_3; 148 | dsink[uid] += addr2 + addr2_1 + addr2_2 + addr2_3; 149 | 150 | dsink[uid] += addr3 + addr3_1 + addr3_2 + addr3_3; 151 | dsink[uid] += addr4 + addr4_1 + addr4_2 + addr4_3; 152 | dsink[uid] += addr5 + addr5_1 + addr5_2 + addr5_3; 153 | dsink[uid] += addr6 + addr6_1 + addr6_2 + addr6_3; 154 | dsink[uid] += addr7 + addr7_1 + addr7_2 + addr7_3; 155 | dsink[uid] += addr8 + addr8_1 + addr8_2 + addr8_3; 156 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 157 | // printf("Shared Memory Latency = %f cycles\n", lat); 158 | //} 159 | } 160 | void test_with_different_thread(int THREADS_NUM, int ILP){ 161 | 162 | BLOCKS_NUM = 1; 163 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 164 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 165 | 166 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 167 | 168 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 169 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 170 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 171 | 172 | uint32_t *startClk_g; 173 | uint32_t *stopClk_g; 174 | shared_m *dsink_g; 175 | 176 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 177 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 178 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 179 | 180 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, 1); 
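// Worked example for the throughput math below (illustrative only; the real
// values come from the run): lat = (stopClk - startClk) / ITERS is the
// per-iteration latency of the dependent ldmatrix chain, and each
// ldmatrix.x4 loads four 8x8 tiles of 16-bit elements per warp, i.e.
// 4 * 8 * 8 * 2 = 512 bytes per instruction per warp. Hence
//   num_bytes = warps * 512 * ILP   and   throughput = num_bytes / lat.
// For instance, 4 warps at ILP = 1 with lat = 32 clk would give
// 4 * 512 / 32 = 64 bytes/clk.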
181 | gpuErrchk(cudaPeekAtLastError()); 182 | // printf("pass kenerl \n"); 183 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 184 | cudaMemcpyDeviceToHost)); 185 | gpuErrchk( 186 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 187 | gpuErrchk( 188 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 189 | 190 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 191 | 192 | long num_bytes = (TOTAL_THREADS/32) * 8 * 8 * 2 * 4 * ILP; 193 | 194 | std::cout << THREADS_NUM/32 <<" warps ldmatrix.x4 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 195 | 196 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,2,3,4,5,6,7,8,12,16,20}; 216 | //std::vector warps = {4,8,12,16}; 217 | intilizeDeviceProp(0); 218 | std::cout<<"***********************************"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../../hw_def/hw_def.h" 8 | 9 | #define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB 10 | #ifndef ITERS 11 | #define ITERS (1024 ) 12 | #endif 13 | 14 | #ifndef ILPconfig 15 | #define ILPconfig 1 16 | #endif 17 | 18 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 19 | 20 | __global__ void shared_bw(uint64_t *startClk, uint64_t *stopClk, 21 | uint32_t *dsink, uint32_t stride) { 22 | 23 | // thread index 24 | uint32_t tid = threadIdx.x; 25 | uint32_t bid = blockIdx.x; 26 | uint32_t uid = bid * blockDim.x + tid; 27 | uint32_t n_threads = blockDim.x * gridDim.x; 28 | 29 | // a register to avoid compiler optimization 30 | // uint32_t sink0 = 0; 31 | register uint32_t tmp = uid; 32 | 33 | uint64_t start = 0; 34 | uint64_t stop = 0; 35 | 36 | __shared__ uint32_t s[SHARED_MEM_SIZE]; // static shared memory 37 | // uint32_t s[SHARED_MEM_SIZE]; 38 | // one thread to initialize the pointer-chasing array 39 | for (uint32_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) 40 | s[i] = (i + stride) % SHARED_MEM_SIZE; 41 | 42 | // synchronize all threads 43 | asm volatile("bar.sync 0;"); 44 | 45 | // start timing 46 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 47 | 48 | // load data from shared memory 49 | for (uint32_t i = 0; i < ITERS; ++i) { 50 | tmp = s[tmp]; 51 | } 52 | 53 | // synchronize all threads 54 | asm volatile("bar.sync 0;"); 55 | 56 | // stop timing 57 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 58 | 59 | // sink0 = tmp; 60 | // write time and data back to memory 61 | startClk[uid] = start; 62 | stopClk[uid] = stop; 63 | dsink[uid] = tmp; 64 | } 65 | 66 | int main() { 67 | intilizeDeviceProp(0); 68 | 69 | BLOCKS_NUM = 1; 70 | TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 71 | THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; 72 | 73 | assert(SHARED_MEM_SIZE * sizeof(uint32_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); 74 | 75 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 76 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 77 | uint32_t *dsink = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); 78 | 79 | uint64_t *startClk_g; 80 | uint64_t *stopClk_g; 81 | uint32_t *dsink_g; 82 | 83 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 84 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 85 | gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint32_t))); 86 | 87 | shared_bw<<<1, THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g, 88 | THREADS_PER_BLOCK); 89 | gpuErrchk(cudaPeekAtLastError()); 90 | 91 | 
gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 92 | cudaMemcpyDeviceToHost)); 93 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 94 | cudaMemcpyDeviceToHost)); 95 | gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint32_t), 96 | cudaMemcpyDeviceToHost)); 97 | 98 | double bw, BW; 99 | uint64_t total_time = 100 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 101 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 102 | bw = 103 | (double)(ITERS * TOTAL_THREADS * sizeof(uint32_t)) / ((double)total_time); 104 | BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; 105 | std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW 106 | << "(GB/s/SM)\n"; 107 | std::cout << "Total Clk number = " << total_time << "\n"; 108 | 109 | return 1; 110 | } 111 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_bw_64/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SRC = shared_bw_64.cu 3 | 4 | EXE = shared_bw_64.app 5 | 6 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 7 | 8 | include ../../../common/common.mk 9 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_bw_64/shared_bw_64.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../../hw_def/hw_def.h" 8 | 9 | #define SHARED_MEM_SIZE (32 * 1024 / 8) // 32KB 10 | #ifndef ITERS 11 | #define ITERS (1024 ) 12 | #endif 13 | 14 | 15 | #ifndef ILPconfig 16 | #define ILPconfig 1 17 | #endif 18 | 19 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 20 | 21 | __global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk, 22 | uint64_t *dsink, uint32_t stride) { 23 | 24 | // thread index 25 | uint32_t tid = threadIdx.x; 26 | uint32_t bid = blockIdx.x; 27 | uint32_t uid = bid * blockDim.x + tid; 28 | uint32_t n_threads = blockDim.x * gridDim.x; 29 | 30 | // a register to avoid compiler optimization 31 | // uint32_t sink0 = 0; 32 | register uint64_t tmp = uid; 33 | 34 | uint32_t start = 0; 35 | uint32_t stop = 0; 36 | 37 | __shared__ uint64_t s[SHARED_MEM_SIZE]; // static shared memory 38 | // uint32_t s[SHARED_MEM_SIZE]; 39 | // one thread to initialize the pointer-chasing array 40 | for (uint64_t i = uid; i < (SHARED_MEM_SIZE); i += n_threads) 41 | s[i] = (i + stride) % SHARED_MEM_SIZE; 42 | 43 | // synchronize all threads 44 | asm volatile("bar.sync 0;"); 45 | 46 | // start timing 47 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 48 | 49 | // load data from shared memory 50 | for (uint32_t i = 0; i < ITERS; ++i) { 51 | tmp = s[tmp]; 52 | } 53 | 54 | // synchronize all threads 55 | asm volatile("bar.sync 0;"); 56 | 57 | // stop timing 58 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 59 | 60 | // sink0 = tmp; 61 | // write time and data back to memory 62 | startClk[uid] = start; 63 | stopClk[uid] = stop; 64 | dsink[uid] = tmp; 65 | } 66 | 67 | int main() { 68 | intilizeDeviceProp(0); 69 | 70 | BLOCKS_NUM = 1; 71 | TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 72 | THREADS_PER_SM = THREADS_PER_BLOCK * BLOCKS_NUM; 73 | 74 | assert(SHARED_MEM_SIZE * sizeof(uint64_t) < MAX_SHARED_MEM_SIZE_PER_BLOCK); 75 | 76 | uint32_t *startClk = (uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); 77 | uint32_t *stopClk = 
(uint32_t *)malloc(TOTAL_THREADS * sizeof(uint32_t)); 78 | uint64_t *dsink = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 79 | 80 | uint32_t *startClk_g; 81 | uint32_t *stopClk_g; 82 | uint64_t *dsink_g; 83 | 84 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint32_t))); 85 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint32_t))); 86 | gpuErrchk(cudaMalloc(&dsink_g, TOTAL_THREADS * sizeof(uint64_t))); 87 | 88 | shared_bw<<>>(startClk_g, stopClk_g, dsink_g, 89 | THREADS_PER_BLOCK); 90 | gpuErrchk(cudaPeekAtLastError()); 91 | 92 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint32_t), 93 | cudaMemcpyDeviceToHost)); 94 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint32_t), 95 | cudaMemcpyDeviceToHost)); 96 | gpuErrchk(cudaMemcpy(dsink, dsink_g, TOTAL_THREADS * sizeof(uint64_t), 97 | cudaMemcpyDeviceToHost)); 98 | 99 | double bw, BW; 100 | uint64_t total_time = 101 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 102 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 103 | bw = 104 | (double)(ITERS * TOTAL_THREADS * sizeof(uint64_t)) / ((double)total_time); 105 | BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; 106 | std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW 107 | << "(GB/s/SM)\n"; 108 | std::cout << "Total Clk number = " << total_time << "\n"; 109 | 110 | return 1; 111 | } 112 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_lat/Makefile: -------------------------------------------------------------------------------- 1 | SRC = shared_lat.cu 2 | 3 | EXE = shared_lat.app 4 | 5 | NVCC_FLGAS = -Xptxas -dlcm=cv -Xptxas -dscm=wt 6 | 7 | include ../../../common/common.mk 8 | -------------------------------------------------------------------------------- /microbench/ubench/ldmatrix/shared_lat/shared_lat.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (32 * 1024 ) // 32k 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | 18 | #ifndef ILPconfig 19 | #define ILPconfig 1 20 | #endif 21 | 22 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 23 | 24 | #define U32ACCESS 25 | 26 | // two way bank conflict - > 23 latenct 27 | // bank-conflict-free -> 25 latency 28 | 29 | #ifdef U32ACCESS 30 | typedef uint32_t shared_m; 31 | #else 32 | typedef uint64_t shared_m; 33 | #endif 34 | // two way bank conflict - > 23 latenct 35 | // bank-conflict-free -> 25 latency 36 | 37 | 38 | // Measure latency of ITERS reads. 
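// Reading aid for the kernel below: s[] is initialized so that
// s[i] = (i + stride) % size and each thread starts at
// p_chaser = threadIdx.x * stride, so every load's address is the value
// returned by the previous load. The ITERS loads therefore cannot overlap, and
//   latency ~= (stopClk - startClk) / ITERS   clocks per shared-memory load.
// The stride also sets the bank-conflict degree: with 32 banks of 4 bytes,
// words i and i + 32 share a bank, so stride = n (n = 1, 2, 4, ..., 32) makes
// n lanes of a warp hit the same bank each iteration, i.e. an n-way conflict.
// The serial dependence is simply
//   for (uint32_t i = 0; i < ITERS; ++i) p_chaser = s[p_chaser];
// exactly as written in the kernel.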
39 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 40 | shared_m *dsink, uint32_t stride) { 41 | 42 | // thread index 43 | uint32_t tid = threadIdx.x; 44 | uint32_t bid = blockIdx.x; 45 | uint32_t uid = bid * blockDim.x + tid; 46 | uint32_t n_threads = blockDim.x * gridDim.x; 47 | 48 | extern __shared__ int smem[]; // dynamic 49 | 50 | shared_m *s = (shared_m*)&smem[0]; 51 | 52 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 53 | 54 | if(uid == 0){ 55 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 56 | s[i] = (i + stride) % s_smem; //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 57 | } 58 | // one thread to initialize the pointer-chasing array 59 | // for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) 60 | // s[i] = (i + stride) % SHARED_MEM_SIZE; 61 | 62 | asm volatile("bar.sync 0;"); 63 | 64 | // if(uid == 0){ 65 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 66 | // printf("s[%d] = %d \t", i, s[i]); 67 | 68 | // } 69 | // printf("\n"); 70 | // } 71 | 72 | //if (uid == 0) { 73 | // initalize pointer chaser 74 | shared_m p_chaser = threadIdx.x * stride;; 75 | 76 | // start timing 77 | uint32_t start = 0; 78 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 79 | 80 | // pointer-chasing ITERS times 81 | for (uint32_t i = 0; i < ITERS; ++i) { 82 | p_chaser = s[p_chaser]; 83 | } 84 | 85 | // stop timing 86 | uint32_t stop = 0; 87 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 88 | 89 | // write time and data back to memory 90 | if(uid == 0){ 91 | startClk[uid] = start; 92 | stopClk[uid] = stop; 93 | dsink[uid] = p_chaser; 94 | } 95 | 96 | //} 97 | } 98 | 99 | 100 | // n-way bank conflict (n = 1,2,4,8...32) 101 | void bank_conflict_test(int n, int THREADS_NUM){ 102 | 103 | 104 | 105 | BLOCKS_NUM = 1; 106 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 107 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 108 | 109 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 110 | 111 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 112 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 113 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 114 | 115 | uint32_t *startClk_g; 116 | uint32_t *stopClk_g; 117 | shared_m *dsink_g; 118 | 119 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 120 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 121 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 122 | 123 | shared_lat<<<1, THREADS_NUM,SHARED_MEM_SIZE>>>(startClk_g, stopClk_g, dsink_g, n); 124 | gpuErrchk(cudaPeekAtLastError()); 125 | 126 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 127 | cudaMemcpyDeviceToHost)); 128 | gpuErrchk( 129 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 130 | gpuErrchk( 131 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 132 | 133 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 134 | 135 | //printf("Shared Memory Latency = %f cycles\n", lat); 136 | std::cout << n <<"-way bank conflict , " << THREADS_NUM/32 <<" warps, latency = " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 137 | 138 | long num_bytes = (THREADS_NUM) * 4; 139 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < warps = {1,4,8}; 161 | for(auto& e:warps){ 162 | bank_conflict_test(1, 32*e ); 163 | std::cout <<"***************************************"< 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | 
#define SHARED_MEM_SIZE (32 * 1024) // 32 KB in bytes 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 256 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | //#define U32ACCESS 18 | 19 | 20 | #ifndef ILPconfig 21 | #define ILPconfig 1 22 | #endif 23 | 24 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 25 | 26 | // two way bank conflict - > 23 latenct 27 | // bank-conflict-free -> 25 latency 28 | 29 | #ifdef U32ACCESS 30 | typedef uint32_t shared_m; 31 | #else 32 | typedef uint64_t shared_m; 33 | #endif 34 | 35 | // Measure latency of ITERS reads. 36 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 37 | shared_m *dsink, uint32_t stride) { 38 | 39 | // thread index 40 | uint32_t tid = threadIdx.x; 41 | uint32_t bid = blockIdx.x; 42 | uint32_t uid = bid * blockDim.x + tid; 43 | uint32_t n_threads = blockDim.x * gridDim.x; 44 | 45 | //__shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 46 | 47 | extern __shared__ int smem[]; // dynamic 48 | 49 | shared_m *s = (shared_m*)&smem[0]; 50 | 51 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 52 | 53 | // one thread to initialize the pointer-chasing array 54 | // for (uint32_t i = uid; i < (s_smem - stride); i += n_threads) 55 | // s[i] = (i + stride) % s_smem; 56 | 57 | if(uid == 0){ 58 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 59 | s[i] = (i + stride) % s_smem; //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 60 | } 61 | // 62 | asm volatile("bar.sync 0;"); 63 | 64 | // if(uid == 0){ 65 | // for(int i = 0; i < s_smem; i ++){ 66 | // printf("s[%d] = %d \t", i, int(s[i]) ); 67 | 68 | // } 69 | // printf("\n"); 70 | // } 71 | 72 | //if (uid == 0) { 73 | // initalize pointer chaser 74 | shared_m p_chaser = threadIdx.x*stride ; 75 | 76 | #ifdef U32ACCESS 77 | shared_m p_chaser_1 = threadIdx.x + 32; 78 | #endif 79 | // start timing 80 | uint32_t start = 0; 81 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 82 | 83 | // pointer-chasing ITERS times 84 | for (uint32_t i = 0; i < ITERS; ++i) { 85 | p_chaser = s[p_chaser]; 86 | 87 | #ifdef U32ACCESS 88 | p_chaser_1 =s[p_chaser_1]; 89 | #endif 90 | 91 | //p_chaser_1 =s[p_chaser_1]; 92 | //asm volatile("bar.sync 0;"); 93 | } 94 | 95 | // stop timing 96 | asm volatile("bar.sync 0;"); 97 | uint32_t stop = 0; 98 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 99 | 100 | // write time and data back to memory 101 | if(uid == 0){ 102 | startClk[uid] = start; 103 | stopClk[uid] = stop; 104 | dsink[uid] = p_chaser;// + p_chaser_1; 105 | #ifdef U32ACCESS 106 | dsink[uid] += p_chaser_1; 107 | #endif 108 | 109 | } 110 | 111 | //} 112 | } 113 | 114 | 115 | void test_with_different_thread(int stride, int THREADS_NUM){ 116 | //int n_warps = THREADS_NUM/32; 117 | BLOCKS_NUM = 1; 118 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 119 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 120 | 121 | // if( n_warps == 8 ){ 122 | // #define SHARED_MEM_SIZE (16 * 1024) 123 | // }else{ 124 | // #define SHARED_MEM_SIZE (32 * 1024) 125 | // } 126 | 127 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 128 | 129 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 130 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 131 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 132 | 133 | uint32_t *startClk_g; 134 | uint32_t *stopClk_g; 135 | shared_m 
*dsink_g; 136 | 137 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 138 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 139 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 140 | 141 | shared_lat<<<1, THREADS_NUM, SHARED_MEM_SIZE>>>(startClk_g, stopClk_g, dsink_g, stride); 142 | gpuErrchk(cudaPeekAtLastError()); 143 | 144 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 145 | cudaMemcpyDeviceToHost)); 146 | gpuErrchk( 147 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 148 | gpuErrchk( 149 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 150 | 151 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 152 | 153 | std::cout << THREADS_NUM/32 <<" warps Shared Memory read(8B/t) latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 154 | 155 | long num_bytes = (THREADS_NUM) * 8; 156 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (32*1024) //(32 * 1024 ) // in bytes 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | #ifndef ILPconfig 18 | #define ILPconfig 1 19 | #endif 20 | 21 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 22 | 23 | // two way bank conflict - > 23 latenct 24 | // bank-conflict-free -> 25 latency 25 | 26 | #define U32ACCESS 27 | 28 | // two way bank conflict - > 23 latenct 29 | // bank-conflict-free -> 25 latency 30 | 31 | #ifdef U32ACCESS 32 | typedef uint32_t shared_m; 33 | #else 34 | typedef uint64_t shared_m; 35 | #endif 36 | 37 | //typedef uint32_t shared_m; 38 | 39 | // typedef uint64_t shared_m; 40 | // Measure latency of ITERS reads. 
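// Variant note: unlike shared_lat, this kernel keeps four independent chase
// chains per thread (p_chaser .. p_chaser_3, spaced 32 elements apart), so up
// to four 4-byte shared loads per thread can be in flight each iteration,
// i.e. 16 bytes/thread/iteration -- matching num_bytes = THREADS_NUM * 16 in
// the host code. The per-iteration time therefore measures how well
// independent requests overlap in the load/store pipeline rather than a
// single load's raw latency.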
41 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 42 | shared_m *dsink, uint32_t stride) { 43 | 44 | // thread index 45 | uint32_t tid = threadIdx.x; 46 | uint32_t bid = blockIdx.x; 47 | uint32_t uid = bid * blockDim.x + tid; 48 | uint32_t n_threads = blockDim.x * gridDim.x; 49 | 50 | extern __shared__ int smem[]; // dynamic 51 | 52 | shared_m *s = (shared_m*)&smem[0]; 53 | 54 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 55 | 56 | // // one thread to initialize the pointer-chasing array 57 | // for (uint32_t i = uid; i < (s_smem - stride); i += n_threads) 58 | // s[i] = (i + stride) % s_smem; 59 | if(uid == 0){ 60 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 61 | s[i] = shared_m((i + stride) % s_smem); //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 62 | } 63 | // 64 | asm volatile("bar.sync 0;"); 65 | 66 | // if(uid == 0){ 67 | // for(int i = 0; i < s_smem; i ++){ 68 | // printf("s[%d] = %d \t", i, int(s[i]) ); 69 | 70 | // } 71 | // printf("\n"); 72 | // } 73 | 74 | //if (uid == 0) { 75 | // initalize pointer chaser 76 | shared_m p_chaser = threadIdx.x * stride; 77 | //p_chaser = static_cast(__cvta_generic_to_shared(&s[p_chaser])); 78 | shared_m p_chaser_1 = threadIdx.x * stride + 32; 79 | 80 | 81 | #ifdef U32ACCESS 82 | shared_m p_chaser_2 = threadIdx.x * stride + 64; 83 | shared_m p_chaser_3 = threadIdx.x * stride + 96; 84 | #endif 85 | 86 | 87 | // start timing 88 | uint32_t start = 0; 89 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 90 | 91 | // pointer-chasing ITERS times 92 | //#pragma unroll 93 | for (uint32_t i = 0; i < ITERS; ++i) { 94 | p_chaser = s[p_chaser]; // ld.shared.u64 %0, [%1];. 95 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser) : "r"(p_chaser*4) ); 96 | 97 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_1) : "r"(p_chaser_1*4) ); 98 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_2) : "r"(p_chaser_2*4) ); 99 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_3) : "r"(p_chaser_3*4) ); 100 | 101 | p_chaser_1 =s[p_chaser_1]; 102 | 103 | #ifdef U32ACCESS 104 | p_chaser_2 =s[p_chaser_2]; 105 | p_chaser_3 =s[p_chaser_3]; 106 | #endif 107 | // p_chaser_2 =s[p_chaser_2]; 108 | // p_chaser_3 =s[p_chaser_3]; 109 | 110 | 111 | 112 | 113 | // p_chaser_1 =s[p_chaser_1]; 114 | // p_chaser_2 =s[p_chaser_2]; 115 | // p_chaser_3 =s[p_chaser_3]; 116 | //asm volatile("bar.sync 0;"); 117 | } 118 | 119 | // stop timing 120 | asm volatile("bar.sync 0;"); 121 | uint32_t stop = 0; 122 | 123 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 124 | 125 | // write time and data back to memory 126 | if(uid == 0){ 127 | startClk[uid] = start; 128 | stopClk[uid] = stop; 129 | dsink[uid] = p_chaser + p_chaser_1; // + p_chaser_2 + p_chaser_3; 130 | 131 | #ifdef U32ACCESS 132 | dsink[uid] += (p_chaser_2 + p_chaser_3); 133 | #endif 134 | } 135 | 136 | //} 137 | } 138 | 139 | void test_with_different_thread(int stride, int THREADS_NUM){ 140 | BLOCKS_NUM = 1; 141 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 142 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 143 | 144 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 145 | 146 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 147 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 148 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 149 | 150 | uint32_t *startClk_g; 151 | uint32_t *stopClk_g; 152 | shared_m *dsink_g; 153 | 154 | gpuErrchk(cudaMalloc(&startClk_g, 
sizeof(uint32_t))); 155 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 156 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 157 | 158 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, stride); 159 | gpuErrchk(cudaPeekAtLastError()); 160 | 161 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 162 | cudaMemcpyDeviceToHost)); 163 | gpuErrchk( 164 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 165 | gpuErrchk( 166 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 167 | 168 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 169 | 170 | std::cout << THREADS_NUM/32 <<" warps Shared Memory read(16B/t) latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 171 | 172 | long num_bytes = (THREADS_NUM) * 16; 173 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define SHARED_MEM_SIZE (32*1024) //(32 * 1024 ) // in bytes 9 | // Launch only one thread to calcaulte the latency using a pointer-chasing 10 | // array technique 11 | // #define THREADS_NUM 32 12 | // iterate over the array ITERS times 13 | #ifndef ITERS 14 | #define ITERS (1024 ) 15 | #endif 16 | 17 | 18 | #ifndef ILPconfig 19 | #define ILPconfig 1 20 | #endif 21 | 22 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 23 | 24 | 25 | // two way bank conflict - > 23 latenct 26 | // bank-conflict-free -> 25 latency 27 | 28 | #define U32ACCESS 29 | 30 | // two way bank conflict - > 23 latenct 31 | // bank-conflict-free -> 25 latency 32 | 33 | #ifdef U32ACCESS 34 | typedef uint32_t shared_m; 35 | #else 36 | typedef uint64_t shared_m; 37 | #endif 38 | 39 | //typedef uint32_t shared_m; 40 | 41 | // typedef uint64_t shared_m; 42 | // Measure latency of ITERS reads. 
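// Variant note: shared_x8 extends the same idea to eight independent chains
// per thread (p_chaser .. p_chaser_7, again spaced 32 elements apart), so
// each iteration issues 8 * 4 = 32 bytes per thread; the host side computes
// throughput from num_bytes = THREADS_NUM * 32 accordingly (the "(16B/t)"
// text in the message printed later appears to be carried over from the x4
// variant, while the arithmetic uses 32 bytes per thread).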
43 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 44 | shared_m *dsink, uint32_t stride) { 45 | 46 | // thread index 47 | uint32_t tid = threadIdx.x; 48 | uint32_t bid = blockIdx.x; 49 | uint32_t uid = bid * blockDim.x + tid; 50 | uint32_t n_threads = blockDim.x * gridDim.x; 51 | 52 | extern __shared__ int smem[]; // dynamic 53 | 54 | shared_m *s = (shared_m*)&smem[0]; 55 | 56 | int s_smem = SHARED_MEM_SIZE/sizeof(shared_m); 57 | 58 | // // one thread to initialize the pointer-chasing array 59 | // for (uint32_t i = uid; i < (s_smem - stride); i += n_threads) 60 | // s[i] = (i + stride) % s_smem; 61 | if(uid == 0){ 62 | for (uint32_t i = 0; i < (s_smem - stride); i ++) 63 | s[i] = shared_m((i + stride) % s_smem); //s[i] = (i )*16 % 2048; // s[i] is multiple of 16, because addree is aligned with 4 bytes 64 | } 65 | // 66 | asm volatile("bar.sync 0;"); 67 | 68 | // if(uid == 0){ 69 | // for(int i = 0; i < s_smem; i ++){ 70 | // printf("s[%d] = %d \t", i, int(s[i]) ); 71 | 72 | // } 73 | // printf("\n"); 74 | // } 75 | 76 | //if (uid == 0) { 77 | // initalize pointer chaser 78 | shared_m p_chaser = threadIdx.x * stride; 79 | //p_chaser = static_cast(__cvta_generic_to_shared(&s[p_chaser])); 80 | shared_m p_chaser_1 = threadIdx.x * stride + 32; 81 | 82 | 83 | 84 | shared_m p_chaser_2 = threadIdx.x * stride + 64; 85 | shared_m p_chaser_3 = threadIdx.x * stride + 96; 86 | 87 | 88 | 89 | #ifdef U32ACCESS 90 | shared_m p_chaser_4 = threadIdx.x * stride + 32*4; 91 | shared_m p_chaser_5 = threadIdx.x * stride + 32*5; 92 | shared_m p_chaser_6 = threadIdx.x * stride + 32*6; 93 | shared_m p_chaser_7 = threadIdx.x * stride + 32*7; 94 | #endif 95 | 96 | // start timing 97 | uint32_t start = 0; 98 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 99 | 100 | // pointer-chasing ITERS times 101 | //#pragma unroll 102 | for (uint32_t i = 0; i < ITERS; ++i) { 103 | p_chaser = s[p_chaser]; // ld.shared.u64 %0, [%1];. 
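// The commented-out asm lines below are an equivalent inline-PTX form of the
// same loads; they multiply the index by 4 because ld.shared takes a byte
// offset while p_chaser holds a 32-bit element index. With U32ACCESS defined,
// shared_m is uint32_t, so each chase here is a 32-bit shared-memory load.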
104 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser) : "r"(p_chaser*4) ); 105 | 106 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_1) : "r"(p_chaser_1*4) ); 107 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_2) : "r"(p_chaser_2*4) ); 108 | // asm volatile("ld.shared.u32 %0, [%1];" : "=r"(p_chaser_3) : "r"(p_chaser_3*4) ); 109 | 110 | p_chaser_1 =s[p_chaser_1]; 111 | 112 | 113 | p_chaser_2 =s[p_chaser_2]; 114 | p_chaser_3 =s[p_chaser_3]; 115 | 116 | #ifdef U32ACCESS 117 | p_chaser_4 =s[p_chaser_4]; 118 | p_chaser_5 =s[p_chaser_5]; 119 | p_chaser_6 =s[p_chaser_6]; 120 | p_chaser_7 =s[p_chaser_7]; 121 | #endif 122 | // p_chaser_2 =s[p_chaser_2]; 123 | // p_chaser_3 =s[p_chaser_3]; 124 | 125 | 126 | 127 | 128 | // p_chaser_1 =s[p_chaser_1]; 129 | // p_chaser_2 =s[p_chaser_2]; 130 | // p_chaser_3 =s[p_chaser_3]; 131 | //asm volatile("bar.sync 0;"); 132 | } 133 | 134 | // stop timing 135 | asm volatile("bar.sync 0;"); 136 | uint32_t stop = 0; 137 | 138 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 139 | 140 | // write time and data back to memory 141 | if(uid == 0){ 142 | startClk[uid] = start; 143 | stopClk[uid] = stop; 144 | dsink[uid] = p_chaser + p_chaser_1 +p_chaser_2 + p_chaser_3 ; // + p_chaser_2 + p_chaser_3; 145 | 146 | #ifdef U32ACCESS 147 | dsink[uid] += (p_chaser_4 + p_chaser_5 +p_chaser_6 + p_chaser_7); 148 | #endif 149 | } 150 | 151 | //} 152 | } 153 | 154 | void test_with_different_thread(int stride, int THREADS_NUM){ 155 | BLOCKS_NUM = 1; 156 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 157 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 158 | 159 | assert(SHARED_MEM_SIZE <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 160 | 161 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 162 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 163 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 164 | 165 | uint32_t *startClk_g; 166 | uint32_t *stopClk_g; 167 | shared_m *dsink_g; 168 | 169 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 170 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 171 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 172 | 173 | shared_lat<<>>(startClk_g, stopClk_g, dsink_g, stride); 174 | gpuErrchk(cudaPeekAtLastError()); 175 | 176 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 177 | cudaMemcpyDeviceToHost)); 178 | gpuErrchk( 179 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 180 | gpuErrchk( 181 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 182 | 183 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 184 | 185 | std::cout << THREADS_NUM/32 <<" warps Shared Memory read(16B/t) latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 186 | 187 | long num_bytes = (THREADS_NUM) * 32; 188 | std::cout << "Shared mem throughput = " << num_bytes / lat << " bytes/clk " < 2 | using namespace std; 3 | 4 | #include "../../../hw_def/hw_def.h" 5 | 6 | 7 | 8 | int main() { 9 | intilizeDeviceProp(0); 10 | 11 | printf("Shared memory per multiprocessor = %lu bytes\n", 12 | deviceProp.sharedMemPerMultiprocessor); 13 | 14 | printf("Shared memory per block = %lu bytes\n", deviceProp.sharedMemPerBlock); 15 | 16 | if (ACCEL_SIM_MODE) { 17 | 18 | //std::cout << "\n//Accel_Sim config: \n"; 19 | std::cout << " deviceProp.maxThreadsPerBlock = " << deviceProp.maxThreadsPerBlock< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../../hw_def/hw_def.h" 9 | 10 | // #define 
SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB 11 | // Launch only one thread to calcaulte the latency using a pointer-chasing 12 | // array technique 13 | //#define THREADS_NUM 32 14 | // iterate over the array ITERS times 15 | #ifndef ITERS 16 | #define ITERS (1024 ) 17 | #endif 18 | 19 | 20 | 21 | #ifndef ILPconfig 22 | #define ILPconfig 1 23 | #endif 24 | 25 | 26 | static_assert(ILPconfig<=5," ILP>5 is not implemented\n"); 27 | 28 | 29 | 30 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, float *res, 31 | uint32_t strid) { // strid set to 0 used to prevent optimization 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t gid = blockIdx.x * blockDim.x + tid; 35 | uint32_t warpid = gid / warpSize; 36 | 37 | a = a + warpid * 16*16; // m*k = 16*16 38 | b = b + warpid * 8*16; // n*k = 8*16 39 | res = res + warpid * 16*8;// m*n = 16*16 40 | 41 | /** step 1: create register for each thread **/ 42 | __nv_bfloat16 frag_A[8*ILPconfig]; // two .f16x2 registers, 8 half elements, 43 | __nv_bfloat16 frag_B[4*ILPconfig]; // one .f16x2 registers, 4 half elements 44 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 45 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 46 | for(int i = 0;i<8 * ILPconfig;i++){ 47 | frag_A[i] = a[i + lane_id()*8]; 48 | 49 | } 50 | for(int i =0;i<4 * ILPconfig;i++){ 51 | frag_B[i] = b[i + lane_id()*4]; 52 | frag_D[i] = 0.0f; 53 | } 54 | 55 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 56 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 57 | float *C = reinterpret_cast(&frag_D[0]); 58 | float *D = C; // D = A*B + D. 59 | 60 | // float fpuA = frag_A[0]; 61 | // float fpuB = frag_B[0]; 62 | float fpuC = frag_D[0]; 63 | 64 | // int intA = threadIdx.x; 65 | // int intB = threadIdx.x + 1; 66 | int intC = threadIdx.x + 2; 67 | 68 | uint64_t start = 0; 69 | uint64_t stop = 0; 70 | // synchronize all threads 71 | asm volatile("bar.sync 0;"); 72 | // start timing 73 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 74 | //#pragma unroll 75 | for (int j = 0; j < ITERS; ++j) { 76 | asm volatile( 77 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 78 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 79 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 80 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 81 | "r"(B[0]), "r"(B[1]), 82 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 83 | ); 84 | #if ILPconfig >= 2 85 | asm volatile( 86 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 87 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 88 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 89 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 90 | "r"(B[2]), "r"(B[3]), 91 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 92 | ); 93 | #endif 94 | #if ILPconfig >= 3 95 | asm volatile( 96 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 97 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 98 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 99 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 100 | "r"(B[4]), "r"(B[5]), 101 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 102 | ); 103 | #endif 104 | #if ILPconfig >= 4 105 | asm volatile( 106 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 107 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 108 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 109 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 110 | "r"(B[6]), "r"(B[7]), 111 | 
"f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 112 | ); 113 | #endif 114 | #if ILPconfig >= 5 115 | asm volatile( 116 | "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " 117 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 118 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 119 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 120 | "r"(B[8]), "r"(B[9]), 121 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 122 | ); 123 | #endif 124 | __syncwarp(); 125 | } 126 | // stop timing 127 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 128 | for(int i=0; i < 4*ILPconfig;i++){ 129 | res[i] += frag_D[i]; 130 | 131 | res[i] += fpuC; 132 | res[i] += intC; 133 | } 134 | 135 | //res[0] += fpuC; 136 | startClk[gid] = start; 137 | stopClk[gid] = stop; 138 | } 139 | 140 | 141 | template 142 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 143 | intilizeDeviceProp(0); 144 | 145 | int BLOCKS_NUM = 1; 146 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 147 | int WARP_SIZE = 32; 148 | 149 | unsigned total_A_SIZE = 150 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 151 | unsigned total_B_SIZE = 152 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 153 | unsigned total_R_SIZE = 154 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 155 | 156 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 157 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 158 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 159 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 160 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 161 | 162 | uint64_t *startClk_g; 163 | uint64_t *stopClk_g; 164 | T *data1_g; 165 | T *data2_g; 166 | R *res_g; 167 | 168 | for (uint32_t i = 0; i < 16*8; i++) { 169 | data1[i] = (T)i; 170 | } 171 | 172 | for (uint32_t i = 0; i < 8*8; i++) { 173 | data2[i] = (T)i; 174 | } 175 | 176 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 177 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 178 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 179 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 180 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 181 | 182 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 183 | cudaMemcpyHostToDevice)); 184 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 185 | cudaMemcpyHostToDevice)); 186 | 187 | mma_ubench<<>>( 188 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 189 | gpuErrchk(cudaPeekAtLastError()); 190 | 191 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 192 | cudaMemcpyDeviceToHost)); 193 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 194 | cudaMemcpyDeviceToHost)); 195 | gpuErrchk( 196 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 197 | 198 | float mma_bw, fma_bw; 199 | uint64_t total_time = 200 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 201 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 202 | 203 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 204 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 205 | 206 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 207 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 208 | // (float)total_time; 209 | fma_bw = ((float)(ITERS * 16 * 8 * 16 * 
ILPconfig * //0 * 210 | (TOTAL_THREADS / WARP_SIZE))) / 211 | (float)total_time; 212 | 213 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 214 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 215 | std::cout << "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 216 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 217 | 218 | std::cout << "Total Clk number = " << total_time << "\n"; 219 | 220 | if (report_fma_bw) 221 | return fma_bw; 222 | else 223 | return mma_bw; 224 | } 225 | 226 | int main() { 227 | std::vector warps = {1,2,4,6,8,12,16,32}; 228 | intilizeDeviceProp(0); 229 | std::cout<<"***********************************"<5 is not implemented\n"); 27 | 28 | 29 | 30 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, half *a, half *b, float *res, 31 | uint32_t strid) { // strid set to 0 used to prevent optimization 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t gid = blockIdx.x * blockDim.x + tid; 35 | uint32_t warpid = gid / warpSize; 36 | 37 | a = a + warpid * 16*16; // m*k = 16*16 38 | b = b + warpid * 8*16; // n*k = 8*16 39 | res = res + warpid * 16*8;// m*n = 16*16 40 | 41 | /** step 1: create register for each thread **/ 42 | half frag_A[8*ILPconfig]; // two .f16x2 registers, 8 half elements, 43 | half frag_B[4*ILPconfig]; // one .f16x2 registers, 4 half elements 44 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 45 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 46 | for(int i = 0;i<8 * ILPconfig;i++){ 47 | frag_A[i] = a[i + lane_id()*8]; 48 | 49 | } 50 | for(int i =0;i<4 * ILPconfig;i++){ 51 | frag_B[i] = b[i + lane_id()*4]; 52 | frag_D[i] = 0.0f; 53 | } 54 | 55 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 56 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 57 | float *C = reinterpret_cast(&frag_D[0]); 58 | float *D = C; // D = A*B + D. 
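// Fragment layout for this mma.m16n8k16 (f16 inputs, f32 accumulate), per
// thread of the warp: A holds 8 halves in four .b32 registers, B holds 4
// halves in two .b32 registers, and C/D are four .f32 registers. Because D
// aliases C, each iteration's mma consumes the previous result, so the ITERS
// instructions of one chain serialize and latency ~= total_time / ITERS; the
// extra chains enabled by ILPconfig use disjoint registers and can issue
// back to back, turning the same loop into a throughput test. The host-side
// FMA bandwidth assumes 16 * 8 * 16 = 2048 multiply-adds per mma per warp.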
59 | 60 | // float fpuA = frag_A[0]; 61 | // float fpuB = frag_B[0]; 62 | float fpuC = frag_D[0]; 63 | 64 | // int intA = threadIdx.x; 65 | // int intB = threadIdx.x + 1; 66 | int intC = threadIdx.x + 2; 67 | 68 | uint64_t start = 0; 69 | uint64_t stop = 0; 70 | // synchronize all threads 71 | asm volatile("bar.sync 0;"); 72 | // start timing 73 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 74 | //#pragma unroll 75 | for (int j = 0; j < ITERS; ++j) { 76 | asm volatile( 77 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 78 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 79 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 80 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 81 | "r"(B[0]), "r"(B[1]), 82 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 83 | ); 84 | #if ILPconfig >= 2 85 | asm volatile( 86 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 87 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 88 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 89 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 90 | "r"(B[2]), "r"(B[3]), 91 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 92 | ); 93 | #endif 94 | #if ILPconfig >= 3 95 | asm volatile( 96 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 97 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 98 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 99 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 100 | "r"(B[4]), "r"(B[5]), 101 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 102 | ); 103 | #endif 104 | #if ILPconfig >= 4 105 | asm volatile( 106 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 107 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 108 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 109 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 110 | "r"(B[6]), "r"(B[7]), 111 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 112 | ); 113 | #endif 114 | #if ILPconfig >= 5 115 | asm volatile( 116 | "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " 117 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 118 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 119 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 120 | "r"(B[8]), "r"(B[9]), 121 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 122 | ); 123 | #endif 124 | __syncwarp(); 125 | } 126 | // stop timing 127 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 128 | for(int i=0; i < 4*ILPconfig;i++){ 129 | res[i] += frag_D[i]; 130 | 131 | res[i] += fpuC; 132 | res[i] += intC; 133 | } 134 | 135 | //res[0] += fpuC; 136 | startClk[gid] = start; 137 | stopClk[gid] = stop; 138 | } 139 | 140 | 141 | template 142 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 143 | intilizeDeviceProp(0); 144 | 145 | int BLOCKS_NUM = 1; 146 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 147 | int WARP_SIZE = 32; 148 | 149 | unsigned total_A_SIZE = 150 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 151 | unsigned total_B_SIZE = 152 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 153 | unsigned total_R_SIZE = 154 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 155 | 156 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 157 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 158 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 159 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 160 | R *res = (R 
*)malloc(total_R_SIZE * sizeof(R)); 161 | 162 | uint64_t *startClk_g; 163 | uint64_t *stopClk_g; 164 | T *data1_g; 165 | T *data2_g; 166 | R *res_g; 167 | 168 | for (uint32_t i = 0; i < 16*8; i++) { 169 | data1[i] = (T)i; 170 | } 171 | 172 | for (uint32_t i = 0; i < 8*8; i++) { 173 | data2[i] = (T)i; 174 | } 175 | 176 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 177 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 178 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 179 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 180 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 181 | 182 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 183 | cudaMemcpyHostToDevice)); 184 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 185 | cudaMemcpyHostToDevice)); 186 | 187 | mma_ubench<<>>( 188 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 189 | gpuErrchk(cudaPeekAtLastError()); 190 | 191 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 192 | cudaMemcpyDeviceToHost)); 193 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 194 | cudaMemcpyDeviceToHost)); 195 | gpuErrchk( 196 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 197 | 198 | float mma_bw, fma_bw; 199 | uint64_t total_time = 200 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 201 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 202 | 203 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 204 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 205 | 206 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 207 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 208 | // (float)total_time; 209 | fma_bw = ((float)(ITERS * 16 * 8 * 16 * ILPconfig * //0 * 210 | (TOTAL_THREADS / WARP_SIZE))) / 211 | (float)total_time; 212 | 213 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 214 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 215 | std::cout << "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 216 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 217 | 218 | std::cout << "Total Clk number = " << total_time << "\n"; 219 | 220 | if (report_fma_bw) 221 | return fma_bw; 222 | else 223 | return mma_bw; 224 | } 225 | 226 | int main() { 227 | std::vector warps = {1,2,4,6,8,12,16,32}; 228 | intilizeDeviceProp(0); 229 | std::cout<<"***********************************"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../../hw_def/hw_def.h" 9 | 10 | // #define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB 11 | // Launch only one thread to calcaulte the latency using a pointer-chasing 12 | // array technique 13 | //#define THREADS_NUM 32 14 | // iterate over the array ITERS times 15 | #ifndef ITERS 16 | #define ITERS (1024 ) 17 | #endif 18 | 19 | 20 | 21 | 22 | #ifndef ILPconfig 23 | #define ILPconfig 1 24 | #endif 25 | 26 | 27 | static_assert(ILPconfig<=6, "ILP>6 is not implemented\n"); 28 | 29 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, half *a, half *b, half *res, 30 | uint32_t strid) { // strid set to 0 used to prevent optimization 31 | // thread index 32 | uint32_t tid = threadIdx.x; 33 | uint32_t gid = blockIdx.x * blockDim.x + tid; 34 | uint32_t warpid = gid 
/ warpSize; 35 | 36 | a = a + warpid * 16*16; // m*k = 16*16 37 | b = b + warpid * 8*16; // n*k = 8*16 38 | res = res + warpid * 16*8;// m*n = 16*16 39 | 40 | /** step 1: create register for each thread **/ 41 | half frag_A[8*ILPconfig]; // two .f16x2 registers, 8 half elements, 42 | half frag_B[4*ILPconfig]; // one .f16x2 registers, 4 half elements 43 | half frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 44 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 45 | for(int i = 0;i<8 *ILPconfig ;i++){ 46 | frag_A[i] = a[i + lane_id()*8]; 47 | 48 | } 49 | for(int i =0;i<4 *ILPconfig ;i++){ 50 | frag_B[i] = b[i + lane_id()*4]; 51 | frag_D[i] = 0.0; 52 | } 53 | 54 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 55 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 56 | uint32_t *C = reinterpret_cast(&frag_D[0]); 57 | uint32_t *D = C; // D = A*B + D. 58 | 59 | float fpuA = frag_A[0]; 60 | float fpuB = frag_B[0]; 61 | float fpuC = frag_D[0]; 62 | 63 | int intA = threadIdx.x; 64 | int intB = threadIdx.x + 1; 65 | int intC = threadIdx.x + 2; 66 | 67 | uint64_t start = 0; 68 | uint64_t stop = 0; 69 | // synchronize all threads 70 | asm volatile("bar.sync 0;"); 71 | // start timing 72 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 73 | //#pragma unroll 74 | for (int j = 0; j < ITERS; ++j) { 75 | asm volatile( 76 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 77 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 78 | : "=r"(D[0]), "=r"(D[1]) 79 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 80 | "r"(B[0]), "r"(B[1]), 81 | "r"(C[0]), "r"(C[1]) 82 | ); // input C operand will use output operand D. 83 | #if ILPconfig >= 2 84 | asm volatile( 85 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 86 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 87 | : "=r"(D[2]), "=r"(D[3]) 88 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 89 | "r"(B[2]), "r"(B[3]), 90 | "r"(C[2]), "r"(C[3]) 91 | ); // input C operand will use output operand D. 92 | #endif 93 | 94 | #if ILPconfig >= 3 95 | asm volatile( 96 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 97 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 98 | : "=r"(D[4]), "=r"(D[5]) 99 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 100 | "r"(B[4]), "r"(B[5]), 101 | "r"(C[4]), "r"(C[5]) 102 | ); // input C operand will use output operand D. 103 | #endif 104 | #if ILPconfig >= 4 105 | asm volatile( 106 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 107 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 108 | : "=r"(D[6]), "=r"(D[7]) 109 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 110 | "r"(B[6]), "r"(B[7]), 111 | "r"(C[6]), "r"(C[7]) 112 | ); // input C operand will use output operand D. 113 | #endif 114 | 115 | #if ILPconfig >= 5 116 | asm volatile( 117 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 118 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 119 | : "=r"(D[8]), "=r"(D[9]) 120 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 121 | "r"(B[8]), "r"(B[9]), 122 | "r"(C[8]), "r"(C[9]) 123 | ); // input C operand will use output operand D. 124 | #endif 125 | #if ILPconfig >= 6 126 | asm volatile( 127 | "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " 128 | "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" 129 | : "=r"(D[10]), "=r"(D[11]) 130 | : "r"(A[20]), "r"(A[21]), "r"(A[22]), "r"(A[23]), 131 | "r"(B[10]), "r"(B[11]), 132 | "r"(C[10]), "r"(C[11]) 133 | ); // input C operand will use output operand D. 
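// In this all-f16 variant the accumulator is also half precision, so C and D
// occupy two .b32 registers (four packed halves) per thread instead of the
// four .f32 registers used by the f32-accumulate kernels; the dependency
// chain through D and the 16 * 8 * 16 FMA-per-warp accounting are otherwise
// the same.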
134 | #endif 135 | __syncwarp(); 136 | } 137 | // synchronize all threads 138 | asm volatile("bar.sync 0;"); 139 | // stop timing 140 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 141 | // avoid undeserable optimization 142 | for(int i=0; i < 4 * ILPconfig;i++){ 143 | res[i] = frag_D[i]; 144 | 145 | res[i] += fpuC; 146 | res[i] += intC; 147 | } 148 | 149 | //res[0] += fpuC; 150 | startClk[gid] = start; 151 | stopClk[gid] = stop; 152 | } 153 | 154 | 155 | template 156 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 157 | intilizeDeviceProp(0); 158 | 159 | int BLOCKS_NUM = 1; 160 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 161 | int WARP_SIZE = 32; 162 | 163 | unsigned total_A_SIZE = 164 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 165 | unsigned total_B_SIZE = 166 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 167 | unsigned total_R_SIZE = 168 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 169 | 170 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 171 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 172 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 173 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 174 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 175 | 176 | uint64_t *startClk_g; 177 | uint64_t *stopClk_g; 178 | T *data1_g; 179 | T *data2_g; 180 | R *res_g; 181 | 182 | for (uint32_t i = 0; i < 16*8; i++) { 183 | data1[i] = (T)i; 184 | } 185 | 186 | for (uint32_t i = 0; i < 8*8; i++) { 187 | data2[i] = (T)i; 188 | } 189 | 190 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 191 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 192 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 193 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 194 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 195 | 196 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 197 | cudaMemcpyHostToDevice)); 198 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 199 | cudaMemcpyHostToDevice)); 200 | 201 | mma_ubench<<>>( 202 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 203 | gpuErrchk(cudaPeekAtLastError()); 204 | 205 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 206 | cudaMemcpyDeviceToHost)); 207 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 208 | cudaMemcpyDeviceToHost)); 209 | gpuErrchk( 210 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 211 | 212 | float mma_bw, fma_bw; 213 | uint64_t total_time = 214 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 215 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 216 | 217 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 218 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 219 | 220 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 221 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 222 | // (float)total_time; 223 | fma_bw = ((float)(ITERS * 16 * 8 * 16 * ILPconfig * //0 * 224 | (TOTAL_THREADS / WARP_SIZE))) / 225 | (float)total_time; 226 | 227 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 228 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 229 | std::cout << "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 latency " << 
(float)total_time/(float)ITERS << " cycles\n"; 230 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 231 | 232 | std::cout << "Total Clk number = " << total_time << "\n"; 233 | 234 | if (report_fma_bw) 235 | return fma_bw; 236 | else 237 | return mma_bw; 238 | } 239 | 240 | int main() { 241 | intilizeDeviceProp(0); 242 | std::cout<<"***********************************"< 8 is not implemented\n"); 25 | 26 | 27 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, char *a, char *b, int *res, 28 | uint32_t strid) { // strid set to 0 used to prevent optimization 29 | // thread index 30 | uint32_t tid = threadIdx.x; 31 | uint32_t gid = blockIdx.x * blockDim.x + tid; 32 | uint32_t warpid = gid / warpSize; 33 | 34 | a = a + warpid * 16*32; // m*k = 16*32 35 | b = b + warpid * 8*32; // n*k = 8*32 36 | res = res + warpid * 16*8;// m*n = 16*8 37 | 38 | /** step 1: create register for each thread **/ 39 | char frag_A[16*ILPconfig]; // four int8 registers, 40 | char frag_B[8*ILPconfig]; // one .f16x2 registers, 2 half elements 41 | int frag_D[4*ILPconfig]; //result(fp32) 2 f32 registers 42 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 43 | for(int i = 0;i<16*ILPconfig;i++){ 44 | frag_A[i] = a[i + lane_id()*16]; 45 | 46 | } 47 | for(int i =0;i<8*ILPconfig;i++){ 48 | frag_B[i] = b[i + lane_id()*8]; 49 | //frag_D[i] = 0.0f; 50 | } 51 | for(int i =0;i<4*ILPconfig;i++){ 52 | //frag_B[i] = b[i + lane_id()*4]; 53 | frag_D[i] = 0; 54 | } 55 | 56 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 57 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 58 | int *C = reinterpret_cast(&frag_D[0]); 59 | int *D = C; // D = A*B + D. 60 | 61 | 62 | uint64_t start = 0; 63 | uint64_t stop = 0; 64 | // synchronize all threads 65 | asm volatile("bar.sync 0;"); 66 | // start timing 67 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 68 | //#pragma unroll 69 | for (int j = 0; j < ITERS; ++j) { 70 | asm volatile( 71 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 72 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 73 | : "=r"(D[0]), "=r"(D[1]) , "=r"(D[2]), "=r"(D[3]) 74 | : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 75 | "r"(B[0]), "r"(B[1]), 76 | "r"(C[0]), "r"(C[1]) ,"r"(C[2]), "r"(C[3]) 77 | ); 78 | 79 | #if ILPconfig >= 2 80 | asm volatile( 81 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 82 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 83 | : "=r"(D[4]), "=r"(D[5]) , "=r"(D[6]), "=r"(D[7]) 84 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 85 | "r"(B[2]), "r"(B[3]), 86 | "r"(C[4]), "r"(C[5]) ,"r"(C[6]), "r"(C[7]) 87 | ); 88 | #endif 89 | 90 | #if ILPconfig >= 3 91 | asm volatile( 92 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 93 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 94 | : "=r"(D[8]), "=r"(D[9]) , "=r"(D[10]), "=r"(D[11]) 95 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 96 | "r"(B[4]), "r"(B[5]), 97 | "r"(C[8]), "r"(C[9]) ,"r"(C[10]), "r"(C[11]) 98 | ); 99 | #endif 100 | #if ILPconfig >= 4 101 | asm volatile( 102 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 103 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 104 | : "=r"(D[12]), "=r"(D[13]) , "=r"(D[14]), "=r"(D[15]) 105 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 106 | "r"(B[6]), "r"(B[7]), 107 | "r"(C[12]), "r"(C[13]) ,"r"(C[14]), "r"(C[15]) 108 | ); 109 | #endif 110 | 111 | #if ILPconfig >= 5 112 | asm volatile( 113 | 
"mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 114 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 115 | : "=r"(D[16]), "=r"(D[17]) , "=r"(D[18]), "=r"(D[19]) 116 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), "r"(A[19]), 117 | "r"(B[8]), "r"(B[9]), 118 | "r"(C[16]), "r"(C[17]) ,"r"(C[18]), "r"(C[19]) 119 | ); 120 | #endif 121 | 122 | #if ILPconfig >= 6 123 | asm volatile( 124 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 125 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 126 | : "=r"(D[20]), "=r"(D[21]) , "=r"(D[22]), "=r"(D[23]) 127 | : "r"(A[20]), "r"(A[21]), "r"(A[22]), "r"(A[23]), 128 | "r"(B[10]), "r"(B[11]), 129 | "r"(C[20]), "r"(C[21]) ,"r"(C[22]), "r"(C[23]) 130 | ); 131 | #endif 132 | #if ILPconfig >= 7 133 | asm volatile( 134 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 135 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 136 | : "=r"(D[24]), "=r"(D[25]) , "=r"(D[26]), "=r"(D[27]) 137 | : "r"(A[24]), "r"(A[25]), "r"(A[26]), "r"(A[27]), 138 | "r"(B[12]), "r"(B[13]), 139 | "r"(C[24]), "r"(C[25]) ,"r"(C[26]), "r"(C[27]) 140 | ); 141 | #endif 142 | #if ILPconfig >= 8 143 | asm volatile( 144 | "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 " 145 | "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" 146 | : "=r"(D[28]), "=r"(D[29]) , "=r"(D[30]), "=r"(D[31]) 147 | : "r"(A[28]), "r"(A[28]), "r"(A[30]), "r"(A[31]), 148 | "r"(B[14]), "r"(B[15]), 149 | "r"(C[28]), "r"(C[29]) ,"r"(C[30]), "r"(C[31]) 150 | ); 151 | #endif 152 | 153 | __syncwarp(); 154 | 155 | } 156 | 157 | // synchronize warps 158 | 159 | // stop timing 160 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory");//around 1 cycle overhead 161 | for(int i=0; i < 4*ILPconfig;i++){ 162 | res[i] = frag_D[i]; 163 | 164 | } 165 | 166 | //res[0] += fpuC; 167 | startClk[gid] = start; 168 | stopClk[gid] = stop; 169 | } 170 | 171 | 172 | template 173 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 174 | intilizeDeviceProp(0); 175 | 176 | int BLOCKS_NUM = 1; 177 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 178 | int WARP_SIZE = 32; 179 | 180 | unsigned total_A_SIZE = 181 | 16*32 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 182 | unsigned total_B_SIZE = 183 | 8*32 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 184 | unsigned total_R_SIZE = 185 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 186 | 187 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 188 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 189 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 190 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 191 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 192 | 193 | uint64_t *startClk_g; 194 | uint64_t *stopClk_g; 195 | T *data1_g; 196 | T *data2_g; 197 | R *res_g; 198 | 199 | for (uint32_t i = 0; i < 16*32; i++) { 200 | data1[i] = (T)i; 201 | } 202 | 203 | for (uint32_t i = 0; i < 8*32; i++) { 204 | data2[i] = (T)i; 205 | } 206 | 207 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 208 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 209 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 210 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 211 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 212 | 213 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 214 | cudaMemcpyHostToDevice)); 215 | gpuErrchk(cudaMemcpy(data2_g, 
data2, total_B_SIZE * sizeof(T), 216 | cudaMemcpyHostToDevice)); 217 | 218 | mma_ubench<<>>( 219 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 220 | gpuErrchk(cudaPeekAtLastError()); 221 | 222 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 223 | cudaMemcpyDeviceToHost)); 224 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 225 | cudaMemcpyDeviceToHost)); 226 | gpuErrchk( 227 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 228 | 229 | float mma_bw, fma_bw; 230 | uint64_t total_time = 231 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 232 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 233 | 234 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 235 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 236 | 237 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 238 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 239 | // (float)total_time; 240 | fma_bw = ((float)(ITERS * 16 * 8 * 32 * ILPconfig * //0 * 241 | (TOTAL_THREADS / WARP_SIZE))) / 242 | (float)total_time; 243 | 244 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 245 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 246 | std::cout << "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 247 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 248 | 249 | std::cout << "Total Clk number = " << total_time << "\n"; 250 | 251 | if (report_fma_bw) 252 | return fma_bw; 253 | else 254 | return mma_bw; 255 | } 256 | 257 | int main() { 258 | intilizeDeviceProp(0); 259 | std::cout<<"***********************************"< 6 28 | static_assert(0,"ILP > 6 is not supported\n"); 29 | #endif 30 | 31 | 32 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, float *res, 33 | uint32_t strid) { // strid set to 0 used to prevent optimization 34 | // thread index 35 | uint32_t tid = threadIdx.x; 36 | uint32_t gid = blockIdx.x * blockDim.x + tid; 37 | uint32_t warpid = gid / warpSize; 38 | 39 | a = a + warpid * 16*4; // m*k = 16*16 40 | b = b + warpid * 8*4; // n*k = 8*16 41 | res = res + warpid * 16*8;// m*n = 16*16 42 | 43 | /** step 1: create register for each thread **/ 44 | float frag_A[2*ILPconfig]; // two .f16x2 registers, 8 half elements, 45 | float frag_B[1*ILPconfig]; // one .f16x2 registers, 4 half elements 46 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 47 | 48 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 49 | for(int i = 0;i<2*ILPconfig;i++){ 50 | frag_A[i] = a[i + lane_id()*4]; 51 | //frag_A_ILP2[i] = a[i + lane_id()*4] + 1; 52 | 53 | } 54 | for(int i =0;i<1*ILPconfig;i++){ 55 | frag_B[i] = b[i + lane_id()*1]; 56 | //frag_B_ILP2[i] = b[i + lane_id()*1] + 1; 57 | } 58 | 59 | 60 | for(int i =0;i<4*ILPconfig;i++){ 61 | //frag_B[i] = b[i + lane_id()*4]; 62 | frag_D[i] = 0.0f; 63 | //frag_D_ILP2[i] = 0.0f; 64 | } 65 | 66 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 67 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 68 | float *C = reinterpret_cast(&frag_D[0]); 69 | float *D = C; // D = A*B + D. 
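// Register view: frag_A/frag_B hold tf32 values in float storage and are re-viewed as
// 32-bit registers to satisfy the "r" constraints of the A/B operands, while the f32
// accumulator keeps its "f" constraints. Per thread, each m16n8k4 tf32 mma consumes
// 2 A registers, 1 B register and 4 C/D registers, which is why ILP level n below uses
// A[2n-2..2n-1], B[n-1] and D[4n-4..4n-1].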
70 | 71 | 72 | // float fpuA = frag_A[0]; 73 | // float fpuB = frag_B[0]; 74 | float fpuC = frag_D[0]; 75 | 76 | // int intA = threadIdx.x; 77 | // int intB = threadIdx.x + 1; 78 | int intC = threadIdx.x + 2; 79 | 80 | uint64_t start = 0; 81 | uint64_t stop = 0; 82 | // synchronize all threads 83 | asm volatile("bar.sync 0;"); 84 | // start timing 85 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 86 | //#pragma unroll 87 | for (int j = 0; j < ITERS; ++j) { 88 | asm volatile( 89 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 90 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 91 | : "r"(A[0]), "r"(A[1]), 92 | "r"(B[0]), 93 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 94 | ); 95 | 96 | #if ILPconfig >= 2 97 | asm volatile( 98 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 99 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 100 | : "r"(A[2]), "r"(A[3]), 101 | "r"(B[1]), 102 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 103 | ); 104 | #endif 105 | 106 | #if ILPconfig >= 3 107 | asm volatile( 108 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 109 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 110 | : "r"(A[4]), "r"(A[5]), 111 | "r"(B[2]), 112 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 113 | ); 114 | #endif 115 | 116 | #if ILPconfig >= 4 117 | asm volatile( 118 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 119 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 120 | : "r"(A[6]), "r"(A[7]), 121 | "r"(B[3]), 122 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 123 | ); 124 | #endif 125 | 126 | #if ILPconfig >= 5 127 | asm volatile( 128 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 129 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 130 | : "r"(A[8]), "r"(A[9]), 131 | "r"(B[4]), 132 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 133 | ); 134 | #endif 135 | 136 | #if ILPconfig >= 6 137 | asm volatile( 138 | "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" 139 | : "=f"(D[20]), "=f"(D[21]), "=f"(D[22]), "=f"(D[23]) 140 | : "r"(A[10]), "r"(A[11]), 141 | "r"(B[5]), 142 | "f"(C[20]), "f"(C[21]), "f"(C[22]), "f"(C[23]) 143 | ); 144 | #endif 145 | __syncwarp(); 146 | 147 | } 148 | // stop timing 149 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 150 | for(int i=0; i < 4*ILPconfig;i++){ 151 | res[i] = frag_D[i]; 152 | //res[i] += frag_D_ILP2[i + lane_id()*4]; 153 | res[i] += fpuC; 154 | res[i] += intC; 155 | } 156 | 157 | //res[0] += fpuC; 158 | startClk[gid] = start; 159 | stopClk[gid] = stop; 160 | } 161 | 162 | 163 | template 164 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 165 | intilizeDeviceProp(0); 166 | 167 | int BLOCKS_NUM = 1; 168 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 169 | int WARP_SIZE = 32; 170 | 171 | unsigned total_A_SIZE = 172 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 173 | unsigned total_B_SIZE = 174 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 175 | unsigned total_R_SIZE = 176 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 177 | 178 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 179 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * 
sizeof(uint64_t)); 180 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 181 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 182 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 183 | 184 | uint64_t *startClk_g; 185 | uint64_t *stopClk_g; 186 | T *data1_g; 187 | T *data2_g; 188 | R *res_g; 189 | 190 | for (uint32_t i = 0; i < 16*4; i++) { 191 | data1[i] = (T)i; 192 | } 193 | 194 | for (uint32_t i = 0; i < 4*8; i++) { 195 | data2[i] = (T)i; 196 | } 197 | 198 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 199 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 200 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 201 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 202 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 203 | 204 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 205 | cudaMemcpyHostToDevice)); 206 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 207 | cudaMemcpyHostToDevice)); 208 | 209 | mma_ubench<<>>( 210 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 211 | gpuErrchk(cudaPeekAtLastError()); 212 | 213 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 214 | cudaMemcpyDeviceToHost)); 215 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 216 | cudaMemcpyDeviceToHost)); 217 | gpuErrchk( 218 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 219 | 220 | float mma_bw, fma_bw; 221 | uint64_t total_time = 222 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 223 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 224 | 225 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 226 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 227 | 228 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 229 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 230 | // (float)total_time; 231 | fma_bw = ((float)(ITERS * 16 * 8 * 4 * ILPconfig * //0 * 232 | (TOTAL_THREADS / WARP_SIZE))) / 233 | (float)total_time; 234 | 235 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 236 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 237 | std::cout << "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 238 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 239 | 240 | std::cout << "Total Clk number = " << total_time << "\n"; 241 | 242 | if (report_fma_bw) 243 | return fma_bw; 244 | else 245 | return mma_bw; 246 | } 247 | 248 | int main() { 249 | intilizeDeviceProp(0); 250 | // std::cout << "mma1688 FP16 operand, FP32 accumalte:\n"; 251 | std::cout<<"***********************************"< 8 is not supported\n"); 26 | 27 | 28 | __global__ void tensr1688_flops(uint64_t *startClk, uint64_t *stopClk, half *a, half *b, float *res, 29 | uint32_t strid) { // strid set to 0 used to prevent optimization 30 | // thread index 31 | uint32_t tid = threadIdx.x; 32 | uint32_t gid = blockIdx.x * blockDim.x + tid; 33 | uint32_t warpid = gid / warpSize; 34 | 35 | a = a + warpid * 16*8; // m*k = 16*16 36 | b = b + warpid * 8*8; // n*k = 8*16 37 | res = res + warpid * 16*8;// m*n = 16*16 38 | 39 | /** step 1: create register for each thread **/ 40 | half frag_A[4*ILPconfig]; // two .f16x2 registers, 8 half elements, 41 | half frag_B[2*ILPconfig]; // one .f16x2 registers, 4 half elements 42 | half 
frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 43 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 44 | for(int i = 0;i<4*ILPconfig;i++){ 45 | frag_A[i] = a[i + lane_id()*4]; 46 | frag_D[i] = 0.0f; 47 | } 48 | for(int i =0;i<2*ILPconfig;i++){ 49 | frag_B[i] = b[i + lane_id()*2]; 50 | } 51 | 52 | //TODO: cast half to 53 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 54 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 55 | uint32_t *C = reinterpret_cast(&frag_D[0]); 56 | uint32_t *D = C; 57 | 58 | // float fpuA = frag_A[0]; 59 | // float fpuB = frag_B[0]; 60 | float fpuC = frag_D[0]; 61 | 62 | 63 | 64 | // int intA = threadIdx.x; 65 | // int intB = threadIdx.x + 1; 66 | int intC = threadIdx.x + 2; 67 | 68 | uint64_t start = 0; 69 | uint64_t stop = 0; 70 | // synchronize all threads 71 | asm volatile("bar.sync 0;"); 72 | // start timing 73 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 74 | #pragma unroll 75 | for (int j = 0; j < ITERS; ++j) { 76 | asm volatile( 77 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 78 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 79 | : "=r"(D[0]), "=r"(D[1]) 80 | : "r"(A[0]), "r"(A[1]), 81 | "r"(B[0]), 82 | "r"(C[0]), "r"(C[1]) 83 | ); 84 | 85 | #if ILPconfig >= 2 86 | asm volatile( 87 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 88 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 89 | : "=r"(D[2]), "=r"(D[3]) 90 | : "r"(A[2]), "r"(A[3]), 91 | "r"(B[1]), 92 | "r"(C[2]), "r"(C[3]) 93 | ); 94 | #endif 95 | 96 | #if ILPconfig >= 3 97 | asm volatile( 98 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 99 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 100 | : "=r"(D[4]), "=r"(D[5]) 101 | : "r"(A[4]), "r"(A[5]), 102 | "r"(B[2]), 103 | "r"(C[4]), "r"(C[5]) 104 | ); 105 | #endif 106 | #if ILPconfig >= 4 107 | asm volatile( 108 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 109 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 110 | : "=r"(D[6]), "=r"(D[7]) 111 | : "r"(A[6]), "r"(A[7]), 112 | "r"(B[3]), 113 | "r"(C[6]), "r"(C[7]) 114 | ); 115 | #endif 116 | 117 | #if ILPconfig >= 5 118 | asm volatile( 119 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 120 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 121 | : "=r"(D[8]), "=r"(D[9]) 122 | : "r"(A[8]), "r"(A[9]), 123 | "r"(B[4]), 124 | "r"(C[8]), "r"(C[9]) 125 | ); 126 | #endif 127 | 128 | #if ILPconfig >= 6 129 | asm volatile( 130 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 131 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 132 | : "=r"(D[10]), "=r"(D[11]) 133 | : "r"(A[10]), "r"(A[11]), 134 | "r"(B[5]), 135 | "r"(C[10]), "r"(C[11]) 136 | ); 137 | #endif 138 | 139 | #if ILPconfig >= 7 140 | asm volatile( 141 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 142 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 143 | : "=r"(D[12]), "=r"(D[13]) 144 | : "r"(A[12]), "r"(A[13]), 145 | "r"(B[6]), 146 | "r"(C[12]), "r"(C[13]) 147 | ); 148 | #endif 149 | 150 | #if ILPconfig >= 8 151 | asm volatile( 152 | "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " 153 | "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" 154 | : "=r"(D[14]), "=r"(D[15]) 155 | : "r"(A[14]), "r"(A[15]), 156 | "r"(B[7]), 157 | "r"(C[14]), "r"(C[15]) 158 | ); 159 | #endif 160 | __syncwarp(); 161 | 162 | } 163 | // synchronize all threads 164 | //asm volatile("bar.sync 0;"); 165 | // stop timing 166 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 167 | for(int i=0; i < 4*ILPconfig;i++){ 168 | res[i] = frag_D[i]; 169 | 170 | res[i] += float(fpuC); 171 | res[i] += intC; 172 | } 173 | 174 | 
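// The stores above (together with the fpuC/intC summands) make the accumulator values
// observable in global memory, so the compiler cannot dead-code-eliminate the timed
// mma chain. Each thread then records its own clock64 samples below; the host derives
// the latency from the span between the earliest start and the latest stop.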
//res[0] += fpuC; 175 | startClk[gid] = start; 176 | stopClk[gid] = stop; 177 | } 178 | 179 | 180 | template 181 | float tensor1688_max_flops(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 182 | intilizeDeviceProp(0); 183 | 184 | int BLOCKS_NUM = 1; 185 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 186 | int WARP_SIZE = 32; 187 | 188 | unsigned total_A_SIZE = 189 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 190 | unsigned total_B_SIZE = 191 | 8*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 192 | unsigned total_R_SIZE = 193 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 194 | 195 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 196 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 197 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 198 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 199 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 200 | 201 | uint64_t *startClk_g; 202 | uint64_t *stopClk_g; 203 | T *data1_g; 204 | T *data2_g; 205 | R *res_g; 206 | 207 | for (uint32_t i = 0; i < 16*8; i++) { 208 | data1[i] = (T)i; 209 | } 210 | 211 | for (uint32_t i = 0; i < 8*8; i++) { 212 | data2[i] = (T)i; 213 | } 214 | 215 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 216 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 217 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 218 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 219 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 220 | 221 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 222 | cudaMemcpyHostToDevice)); 223 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 224 | cudaMemcpyHostToDevice)); 225 | 226 | tensr1688_flops<<>>( 227 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 228 | gpuErrchk(cudaPeekAtLastError()); 229 | 230 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 231 | cudaMemcpyDeviceToHost)); 232 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 233 | cudaMemcpyDeviceToHost)); 234 | gpuErrchk( 235 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 236 | 237 | float mma_bw, fma_bw; 238 | uint64_t total_time = 239 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 240 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 241 | 242 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0) / 243 | ((float)total_time); 244 | 245 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 246 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 247 | // (float)total_time; 248 | fma_bw = ((float)(ITERS * 16 * 8 * 8 * ILPconfig * 249 | (TOTAL_THREADS / WARP_SIZE))) / 250 | (float)total_time; 251 | 252 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 253 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 254 | std::cout << "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 latency " << (float)total_time/(float)ITERS << " cycles\n"; 255 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 256 | 257 | std::cout << "Total Clk number = " << total_time << "\n"; 258 | 259 | if (report_fma_bw) 260 | return fma_bw; 261 | else 262 | return mma_bw; 263 | } 264 | 265 | int main() { 266 | intilizeDeviceProp(0); 267 | std::cout<<"***********************************"<8 
is not supported\n"); 26 | 27 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, float *res, 28 | uint32_t strid) { // strid set to 0 used to prevent optimization 29 | // thread index 30 | uint32_t tid = threadIdx.x; 31 | uint32_t gid = blockIdx.x * blockDim.x + tid; 32 | uint32_t warpid = gid / warpSize; 33 | 34 | a = a + warpid * 16*8; // m*k = 16*16 35 | b = b + warpid * 8*8; // n*k = 8*16 36 | res = res + warpid * 16*8;// m*n = 16*16 37 | 38 | /** step 1: create register for each thread **/ 39 | float frag_A[4 * ILPconfig]; // two .f16x2 registers, 8 half elements, 40 | float frag_B[2 * ILPconfig]; // one .f16x2 registers, 4 half elements 41 | float frag_D[4 * ILPconfig]; //result(fp32) 4 f32 registers 42 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 43 | for(int i = 0;i<4 * ILPconfig;i++){ 44 | frag_A[i] = a[i + lane_id()*4]; 45 | 46 | } 47 | for(int i =0;i<2 * ILPconfig;i++){ 48 | frag_B[i] = b[i + lane_id()*1]; 49 | 50 | } 51 | 52 | 53 | for(int i =0;i<4 * ILPconfig;i++){ 54 | //frag_B[i] = b[i + lane_id()*4]; 55 | frag_D[i] = 0.0f; 56 | } 57 | 58 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 59 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 60 | float *C = reinterpret_cast(&frag_D[0]); 61 | float *D = C; // D = A*B + D. 62 | 63 | // float fpuA = frag_A[0]; 64 | // float fpuB = frag_B[0]; 65 | float fpuC = frag_D[0]; 66 | 67 | // int intA = threadIdx.x; 68 | // int intB = threadIdx.x + 1; 69 | int intC = threadIdx.x + 2; 70 | 71 | uint64_t start = 0; 72 | uint64_t stop = 0; 73 | // synchronize all threads 74 | asm volatile("bar.sync 0;"); 75 | // start timing 76 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 77 | //#pragma unroll 78 | for (int j = 0; j < ITERS; ++j) { 79 | 80 | asm volatile( 81 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 82 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 83 | : 84 | "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 85 | "r"(B[0]), "r"(B[1]), 86 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) 87 | ); 88 | 89 | #if ILPconfig >= 2 90 | asm volatile( 91 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 92 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 93 | : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]), 94 | "r"(B[2]), "r"(B[3]), 95 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]) 96 | ); 97 | #endif 98 | #if ILPconfig >= 3 99 | asm volatile( 100 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 101 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 102 | : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]), 103 | "r"(B[4]), "r"(B[5]), 104 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]) 105 | ); 106 | #endif 107 | #if ILPconfig >= 4 108 | asm volatile( 109 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 110 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 111 | : "r"(A[12]), "r"(A[13]), "r"(A[14]), "r"(A[15]), 112 | "r"(B[6]), "r"(B[7]), 113 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]) 114 | ); 115 | #endif 116 | #if ILPconfig >= 5 117 | asm volatile( 118 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 119 | : "=f"(D[16]), "=f"(D[17]), "=f"(D[18]), "=f"(D[19]) 120 | : "r"(A[16]), "r"(A[17]), "r"(A[18]), 
"r"(A[19]), 121 | "r"(B[8]), "r"(B[9]), 122 | "f"(C[16]), "f"(C[17]), "f"(C[18]), "f"(C[19]) 123 | ); 124 | #endif 125 | #if ILPconfig >= 6 126 | asm volatile( 127 | "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5, %6, %7}, {%8,%9}, {%10,%11,%12,%13};\n" 128 | : "=f"(D[20]), "=f"(D[21]), "=f"(D[22]), "=f"(D[23]) 129 | : "r"(A[20]), "r"(A[21]), "r"(A[22]), "r"(A[23]), 130 | "r"(B[10]), "r"(B[11]), 131 | "f"(C[20]), "f"(C[21]), "f"(C[22]), "f"(C[23]) 132 | ); 133 | #endif 134 | __syncwarp(); 135 | } 136 | // synchronize all threads 137 | // asm volatile("bar.sync 0;"); 138 | // stop timing 139 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 140 | for(int i=0; i < 4*ILPconfig;i++){ 141 | res[i] += frag_D[i]; 142 | 143 | res[i] += fpuC; 144 | res[i] += intC; 145 | } 146 | 147 | //res[0] += fpuC; 148 | startClk[gid] = start; 149 | stopClk[gid] = stop; 150 | } 151 | 152 | 153 | template 154 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 155 | intilizeDeviceProp(0); 156 | 157 | int BLOCKS_NUM = 1; 158 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 159 | int WARP_SIZE = 32; 160 | 161 | unsigned total_A_SIZE = 162 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 163 | unsigned total_B_SIZE = 164 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 165 | unsigned total_R_SIZE = 166 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 167 | 168 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 169 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 170 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 171 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 172 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 173 | 174 | uint64_t *startClk_g; 175 | uint64_t *stopClk_g; 176 | T *data1_g; 177 | T *data2_g; 178 | R *res_g; 179 | 180 | for (uint32_t i = 0; i < 16*4; i++) { 181 | data1[i] = (T)i; 182 | } 183 | 184 | for (uint32_t i = 0; i < 4*8; i++) { 185 | data2[i] = (T)i; 186 | } 187 | 188 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 189 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 190 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 191 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 192 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 193 | 194 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 195 | cudaMemcpyHostToDevice)); 196 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 197 | cudaMemcpyHostToDevice)); 198 | 199 | mma_ubench<<>>( 200 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 201 | gpuErrchk(cudaPeekAtLastError()); 202 | 203 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 204 | cudaMemcpyDeviceToHost)); 205 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 206 | cudaMemcpyDeviceToHost)); 207 | gpuErrchk( 208 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 209 | 210 | float mma_bw, fma_bw; 211 | uint64_t total_time = 212 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 213 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 214 | 215 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 216 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 217 | 218 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 219 | // hmma_bw = ((float)(REPEAT_TIMES * 
TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 220 | // (float)total_time; 221 | fma_bw = ((float)(ITERS * 16 * 8 * 8 * ILPconfig * //0 * 222 | (TOTAL_THREADS / WARP_SIZE))) / 223 | (float)total_time; 224 | 225 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 226 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 227 | std::cout << "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 228 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 229 | 230 | std::cout << "Total Clk number = " << total_time << "\n"; 231 | 232 | if (report_fma_bw) 233 | return fma_bw; 234 | else 235 | return mma_bw; 236 | } 237 | 238 | int main() { 239 | intilizeDeviceProp(0); 240 | std::cout<<"***********************************"<8 27 | static_assert(0,"ILP > 8 is not supported\n"); 28 | #endif 29 | 30 | 31 | __global__ void mma_ubench(uint64_t *startClk, uint64_t *stopClk, int *a, int *b, float *res, 32 | uint32_t strid) { // strid set to 0 used to prevent optimization 33 | // thread index 34 | uint32_t tid = threadIdx.x; 35 | uint32_t gid = blockIdx.x * blockDim.x + tid; 36 | uint32_t warpid = gid / warpSize; 37 | 38 | a = a + warpid * 8*16; // m*k = 8*16 39 | b = b + warpid * 8*16; // n*k = 8*16 40 | res = res + warpid * 8*8;// m*n = 8*8 41 | 42 | 43 | char frag_A[4 * ILPconfig]; // four int8 registers, 44 | char frag_B[4 * ILPconfig]; // one .f16x2 registers, 2 half elements 45 | int frag_D[2 * ILPconfig]; //result(fp32) 2 f32 registers 46 | 47 | for(int i = 0;i<4*ILPconfig;i++){ 48 | frag_A[i] = a[i + lane_id()]; 49 | } 50 | for(int i =0;i<4*ILPconfig;i++){ 51 | frag_B[i] = b[i + lane_id()]; 52 | } 53 | for(int i =0;i<2*ILPconfig;i++){ 54 | frag_D[i] = 0.0f; 55 | } 56 | 57 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 58 | uint32_t const *B = reinterpret_cast(&frag_B[0]); 59 | int *C = reinterpret_cast(&frag_D[0]); 60 | int *D = C; // D = A*B + D. 61 | 62 | 63 | 64 | 65 | 66 | float fpuA = frag_A[0]; 67 | float fpuB = frag_B[0]; 68 | float fpuC = frag_D[0]; 69 | 70 | int intA = threadIdx.x; 71 | int intB = threadIdx.x + 1; 72 | int intC = threadIdx.x + 2; 73 | 74 | uint64_t start = 0; 75 | uint64_t stop = 0; 76 | // synchronize all threads 77 | asm volatile("bar.sync 0;"); 78 | // start timing 79 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 80 | //#pragma unroll 81 | for (int j = 0; j < ITERS; ++j) { 82 | asm volatile( 83 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 84 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 85 | : "=r"(D[0]), "=r"(D[1]) 86 | : "r"(A[0]), 87 | "r"(B[0]), 88 | "r"(C[0]), "r"(C[1]) 89 | ); // input C operand will use output operand D. 90 | #if ILPconfig >= 2 91 | asm volatile( 92 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 93 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 94 | : "=r"(D[2]), "=r"(D[3]) 95 | : "r"(A[1]), 96 | "r"(B[1]), 97 | "r"(C[2]), "r"(C[3]) 98 | ); // input C operand will use output operand D. 
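// Operand footprint for m8n8k16.s8: each .b32 register packs four int8 values, so a
// single mma needs only one A register, one B register and two s32 accumulators per
// thread. ILP level n therefore uses A[n-1], B[n-1] and D[2n-2..2n-1], keeping the
// instructions within one iteration fully independent.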
99 | #endif 100 | #if ILPconfig >= 3 101 | asm volatile( 102 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 103 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 104 | : "=r"(D[4]), "=r"(D[5]) 105 | : "r"(A[2]), 106 | "r"(B[2]), 107 | "r"(C[4]), "r"(C[5]) 108 | ); 109 | #endif 110 | #if ILPconfig >= 4 111 | asm volatile( 112 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 113 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 114 | : "=r"(D[6]), "=r"(D[7]) 115 | : "r"(A[3]), 116 | "r"(B[3]), 117 | "r"(C[6]), "r"(C[7]) 118 | ); 119 | #endif 120 | 121 | #if ILPconfig >= 5 122 | asm volatile( 123 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 124 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 125 | : "=r"(D[8]), "=r"(D[9]) 126 | : "r"(A[4]), 127 | "r"(B[4]), 128 | "r"(C[8]), "r"(C[9]) 129 | ); 130 | #endif 131 | #if ILPconfig >= 6 132 | asm volatile( 133 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 134 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 135 | : "=r"(D[10]), "=r"(D[11]) 136 | : "r"(A[5]), 137 | "r"(B[5]), 138 | "r"(C[10]), "r"(C[11]) 139 | ); 140 | #endif 141 | #if ILPconfig >= 7 142 | asm volatile( 143 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 144 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 145 | : "=r"(D[12]), "=r"(D[13]) 146 | : "r"(A[6]), 147 | "r"(B[6]), 148 | "r"(C[12]), "r"(C[13]) 149 | ); 150 | #endif 151 | #if ILPconfig >= 8 152 | asm volatile( 153 | "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 " 154 | "{%0,%1}, {%2}, {%3}, {%4,%5};\n" 155 | : "=r"(D[14]), "=r"(D[15]) 156 | : "r"(A[7]), 157 | "r"(B[7]), 158 | "r"(C[14]), "r"(C[15]) 159 | ); 160 | #endif 161 | 162 | 163 | 164 | __syncwarp(); 165 | 166 | } 167 | // synchronize all threads 168 | // asm volatile("bar.sync 0;"); 169 | // stop timing 170 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 171 | for(int i=0; i < 2*ILPconfig;i++){ 172 | res[i] += D[i]; 173 | 174 | res[i] += fpuC; 175 | res[i] += intC; 176 | } 177 | 178 | //res[0] += fpuC; 179 | startClk[gid] = start; 180 | stopClk[gid] = stop; 181 | } 182 | 183 | 184 | template 185 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 186 | intilizeDeviceProp(0); 187 | 188 | int BLOCKS_NUM = 1; 189 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 190 | int WARP_SIZE = 32; 191 | 192 | unsigned total_A_SIZE = 193 | 16*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 194 | unsigned total_B_SIZE = 195 | 8*16 * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 196 | unsigned total_R_SIZE = 197 | 16*8 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 198 | 199 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 200 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 201 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 202 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 203 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 204 | 205 | uint64_t *startClk_g; 206 | uint64_t *stopClk_g; 207 | T *data1_g; 208 | T *data2_g; 209 | R *res_g; 210 | 211 | for (uint32_t i = 0; i < 16*8; i++) { 212 | data1[i] = (T)i; 213 | } 214 | 215 | for (uint32_t i = 0; i < 8*8; i++) { 216 | data2[i] = (T)i; 217 | } 218 | 219 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 220 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 221 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 222 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 223 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 224 | 225 | 
gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 226 | cudaMemcpyHostToDevice)); 227 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 228 | cudaMemcpyHostToDevice)); 229 | 230 | mma_ubench<<>>( 231 | startClk_g, stopClk_g, data1_g, data2_g, res_g, 0); 232 | gpuErrchk(cudaPeekAtLastError()); 233 | 234 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 235 | cudaMemcpyDeviceToHost)); 236 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 237 | cudaMemcpyDeviceToHost)); 238 | gpuErrchk( 239 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 240 | 241 | float mma_bw, fma_bw; 242 | uint64_t total_time = 243 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 244 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 245 | 246 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0 ) / 247 | ((float)total_time); // max 64FMA/clk/SM on RTX3070Ti 248 | 249 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 250 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 251 | // (float)total_time; 252 | fma_bw = ((float)(ITERS * 8 * 8 * 16 * ILPconfig * //0 * 253 | (TOTAL_THREADS / WARP_SIZE))) / 254 | (float)total_time; 255 | 256 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 257 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 258 | std::cout << "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 259 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 260 | 261 | std::cout << "Total Clk number = " << total_time << "\n"; 262 | 263 | if (report_fma_bw) 264 | return fma_bw; 265 | else 266 | return mma_bw; 267 | } 268 | 269 | int main() { 270 | intilizeDeviceProp(0); 271 | // std::cout << "mma1688 FP16 operand, FP32 accumalte:\n"; 272 | std::cout<<"***********************************"< 4 is not spported"); 30 | 31 | 32 | __global__ void mmasp_1688(uint64_t *startClk, uint64_t *stopClk, float *a, float *b, uint32_t* meteE, float *res, 33 | uint32_t strid) { // strid set to 0 used to prevent optimization 34 | // thread index 35 | uint32_t tid = threadIdx.x; 36 | uint32_t gid = blockIdx.x * blockDim.x + tid; 37 | uint32_t warpid = gid / warpSize; 38 | 39 | a = a + warpid * 16*4; // m*k/2 = 16*4 40 | b = b + warpid * 8*8; // n*k = 8*8 41 | res = res + warpid * 16*8;// m*n = 16*8 42 | 43 | /** step 1: create register for each thread **/ 44 | float frag_A[2*ILPconfig]; // two b32 registrs, 4 half non-zero elements, 16 dense 45 | float frag_B[2*ILPconfig]; // two f16x2 registers, 8 half dense elements 46 | float frag_D[4*ILPconfig]; //result(fp32) 4 f32 registers 47 | uint32_t frag_E[1*ILPconfig]; // A .b32 register containing 16 2-bit vectors to for indexing non-zero of A 48 | // fake load, we are focusing on mma latency/throughput. So no need to care about loading 49 | for(int i = 0;i<2*ILPconfig;i++){ 50 | frag_A[i] = a[i ]; 51 | frag_B[i] = b[i]; 52 | } 53 | for(int i =0;i<4*ILPconfig;i++){ 54 | 55 | frag_D[i] = 0.0f; 56 | } 57 | for(int i =0; i < 1*ILPconfig ; i ++){ 58 | frag_E[i] = meteE[i]; 59 | } 60 | //TODO: cast half to 61 | uint32_t const *A = reinterpret_cast(&frag_A[0]); 62 | uint32_t const *B = reinterpret_cast(&frag_B[0]);//? 
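// Sparse-mma operands: frag_A keeps only the non-zero half of the 16x8 tf32 A tile
// (compressed to 16x4, i.e. k/2 columns), frag_B is the dense 8x8 operand, and frag_E
// carries the 2-bit metadata recording which 2 of every 4 A elements were kept
// (2:4 structured sparsity, generated on the host by initialize_fake_metadata_2_4()).
// The metadata register is passed as the trailing %12 operand with sparsity selector 0x0.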
63 | float *C = reinterpret_cast(&frag_D[0]); 64 | float *D = C; 65 | uint32_t const *E = reinterpret_cast(&frag_E[0]); ; 66 | 67 | float fpuA = frag_A[0]; 68 | float fpuB = frag_B[0]; 69 | float fpuC = frag_D[0]; 70 | 71 | 72 | 73 | // int intA = threadIdx.x; 74 | // int intB = threadIdx.x + 1; 75 | int intC = threadIdx.x + 2; 76 | 77 | uint64_t start = 0; 78 | uint64_t stop = 0; 79 | // synchronize all threads 80 | asm volatile("bar.sync 0;"); 81 | // start timing 82 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); 83 | //#pragma unroll 84 | for (int j = 0; j < ITERS; ++j) { 85 | asm volatile( 86 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 87 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 88 | : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) 89 | : "r"(A[0]), "r"(A[1]), 90 | "r"(B[0]), "r"(B[1]), 91 | "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), 92 | "r"(E[0]) 93 | ); 94 | 95 | #if ILPconfig >= 2 96 | asm volatile( 97 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 98 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 99 | : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7]) 100 | : "r"(A[2]), "r"(A[3]), 101 | "r"(B[2]), "r"(B[3]), 102 | "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7]), 103 | "r"(E[1]) 104 | ); 105 | #endif 106 | 107 | #if ILPconfig >= 3 108 | asm volatile( 109 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 110 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 111 | : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11]) 112 | : "r"(A[4]), "r"(A[5]), 113 | "r"(B[4]), "r"(B[5]), 114 | "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11]), 115 | "r"(E[2]) 116 | ); 117 | #endif 118 | 119 | #if ILPconfig >= 4 120 | asm volatile( 121 | "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " 122 | "{%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11}, %12, 0x0;\n" 123 | : "=f"(D[12]), "=f"(D[13]), "=f"(D[14]), "=f"(D[15]) 124 | : "r"(A[6]), "r"(A[7]), 125 | "r"(B[6]), "r"(B[7]), 126 | "f"(C[12]), "f"(C[13]), "f"(C[14]), "f"(C[15]), 127 | "r"(E[3]) 128 | ); 129 | #endif 130 | __syncwarp(); 131 | } 132 | // synchronize all threads 133 | 134 | // stop timing 135 | asm volatile("mov.u64 %0, %%clock64;" : "=l"(stop)::"memory"); 136 | // avoid compiler optimization 137 | for(int i=0; i < 4*ILPconfig;i++){ 138 | res[i] = frag_D[i]; 139 | 140 | res[i] += float(fpuC); 141 | res[i] += intC; 142 | } 143 | 144 | //res[0] += fpuC; 145 | startClk[gid] = start; 146 | stopClk[gid] = stop; 147 | } 148 | 149 | 150 | template 151 | float run(int THREADS_PER_BLOCK, bool report_fma_bw = false) { 152 | intilizeDeviceProp(0); 153 | 154 | int BLOCKS_NUM = 1; 155 | int TOTAL_THREADS = THREADS_PER_BLOCK * BLOCKS_NUM; 156 | int WARP_SIZE = 32; 157 | 158 | int nwarps = THREADS_PER_BLOCK/WARP_SIZE; 159 | 160 | int mma_m = 16; 161 | int mma_n = 8; 162 | int mma_k = 8; 163 | 164 | 165 | // T *data1 = (T *)malloc(mma_m*mma_k/2 * sizeof(T)); 166 | // T *data2 = (T *)malloc(mma_n*mma_k * sizeof(T)); 167 | // R *res = (R *)malloc(mma_m*mma_n * sizeof(R)); 168 | // uint32_t *meta_e = (uint32_t *)malloc(mma_m*mma_k/16 *sizeof(uint32_t) ); 169 | 170 | unsigned total_A_SIZE = 171 | mma_m*mma_k/2 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x8 matrix per warp 172 | unsigned total_B_SIZE = 173 | mma_n*mma_k * (TOTAL_THREADS / WARP_SIZE); // asume one 8*8 matrix per warp 174 | unsigned total_R_SIZE = 175 | mma_m*mma_n * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 176 | 177 | 178 | unsigned total_E_SIZE = 179 | 
mma_m*mma_k/16 * (TOTAL_THREADS / WARP_SIZE); // asume one 16x16 matrix per warp 180 | 181 | uint64_t *startClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 182 | uint64_t *stopClk = (uint64_t *)malloc(TOTAL_THREADS * sizeof(uint64_t)); 183 | T *data1 = (T *)malloc(total_A_SIZE * sizeof(T)); 184 | T *data2 = (T *)malloc(total_B_SIZE * sizeof(T)); 185 | R *res = (R *)malloc(total_R_SIZE * sizeof(R)); 186 | 187 | uint32_t *meta_e = (uint32_t *)malloc(total_E_SIZE *sizeof(uint32_t) ); 188 | //uint32_t *meta_p = meta_e; 189 | for(int i=0; i < nwarps; i++){ 190 | 191 | initialize_fake_metadata_2_4(&meta_e[mma_m*mma_k/16 * i] ,mma_m,mma_k); 192 | 193 | } 194 | 195 | 196 | uint64_t *startClk_g; 197 | uint64_t *stopClk_g; 198 | T *data1_g; 199 | T *data2_g; 200 | R *res_g; 201 | uint32_t *meta_e_g; 202 | 203 | for (uint32_t i = 0; i < mma_m*mma_k/2; i++) { 204 | data1[i] = (T)i; 205 | } 206 | 207 | for (uint32_t i = 0; i < mma_k*mma_n; i++) { 208 | data2[i] = (T)i; 209 | } 210 | 211 | gpuErrchk(cudaMalloc(&startClk_g, TOTAL_THREADS * sizeof(uint64_t))); 212 | gpuErrchk(cudaMalloc(&stopClk_g, TOTAL_THREADS * sizeof(uint64_t))); 213 | gpuErrchk(cudaMalloc(&data1_g, total_A_SIZE * sizeof(T))); 214 | gpuErrchk(cudaMalloc(&data2_g, total_B_SIZE * sizeof(T))); 215 | gpuErrchk(cudaMalloc(&res_g, total_R_SIZE * sizeof(R))); 216 | gpuErrchk(cudaMalloc(&meta_e_g, total_E_SIZE *sizeof(uint32_t))); 217 | 218 | 219 | gpuErrchk(cudaMemcpy(data1_g, data1, total_A_SIZE * sizeof(T), 220 | cudaMemcpyHostToDevice)); 221 | gpuErrchk(cudaMemcpy(data2_g, data2, total_B_SIZE * sizeof(T), 222 | cudaMemcpyHostToDevice)); 223 | 224 | gpuErrchk(cudaMemcpy(meta_e_g, meta_e, total_E_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice)); 225 | 226 | mmasp_1688<<>>( 227 | startClk_g, stopClk_g, data1_g, data2_g,meta_e_g, res_g, 0); 228 | gpuErrchk(cudaPeekAtLastError()); 229 | 230 | gpuErrchk(cudaMemcpy(startClk, startClk_g, TOTAL_THREADS * sizeof(uint64_t), 231 | cudaMemcpyDeviceToHost)); 232 | gpuErrchk(cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS * sizeof(uint64_t), 233 | cudaMemcpyDeviceToHost)); 234 | gpuErrchk( 235 | cudaMemcpy(res, res_g, total_R_SIZE * sizeof(R), cudaMemcpyDeviceToHost)); 236 | 237 | float mma_bw, fma_bw; 238 | uint64_t total_time = 239 | *std::max_element(&stopClk[0], &stopClk[TOTAL_THREADS]) - 240 | *std::min_element(&startClk[0], &startClk[TOTAL_THREADS]); 241 | 242 | float fpuFMA = (float)(ITERS * TOTAL_THREADS * 1 * 1 * 1 * 0) / 243 | ((float)total_time); 244 | 245 | mma_bw = ((float)(ITERS * TOTAL_THREADS)) / (float)total_time; 246 | // hmma_bw = ((float)(REPEAT_TIMES * TOTAL_THREADS * SASS_hmma_per_PTX_wmma)) / 247 | // (float)total_time; 248 | fma_bw = ((float)(ITERS * mma_m * mma_n * mma_k * ILPconfig * 249 | (TOTAL_THREADS / WARP_SIZE))) / 250 | (float)total_time; 251 | 252 | // std::cout << "wmma PTX issue bandwidth = " << wmma_bw << "(thread/clk/SM) \n"; 253 | //std::cout << "mma issue bandwidth = " << mma_bw << "(thread/clk/SM)\n"; 254 | std::cout << "mma.sp.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 latency " << (float)total_time/(float)ITERS << " cycles\n"; 255 | std::cout << "FMA tensor bandwidth = " << fma_bw + fpuFMA << "(FMA/clk/SM)\n"; 256 | 257 | std::cout << "Total Clk number = " << total_time << "\n"; 258 | 259 | if (report_fma_bw) 260 | return fma_bw; 261 | else 262 | return mma_bw; 263 | } 264 | 265 | int main() { 266 | intilizeDeviceProp(0); 267 | std::cout<<"***********************************"< 1 is not supported\n"); 29 | // two way bank conflict - > 23 latenct 30 
| // bank-conflict-free -> 25 latency 31 | 32 | typedef uint32_t shared_m; 33 | // Measure latency of ITERS ldmatrix.x1 34 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 35 | shared_m *dsink, uint32_t stride) { 36 | 37 | // thread index 38 | uint32_t tid = threadIdx.x; 39 | uint32_t bid = blockIdx.x; 40 | uint32_t uid = bid * blockDim.x + tid; 41 | uint32_t n_threads = blockDim.x * gridDim.x; 42 | 43 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 44 | 45 | // one thread to initialize the pointer-chasing array 46 | for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) 47 | s[i] = (i )*16 % 512; 48 | 49 | asm volatile("bar.sync 0;"); 50 | 51 | // if(uid == 0){ 52 | // for(int i = 0; i < SHARED_MEM_SIZE; i ++){ 53 | // printf("s[%d] = %d \t", i, s[i]); 54 | 55 | // } 56 | // printf("\n"); 57 | // } 58 | //if (uid == 0) { 59 | // initalize pointer chaser 60 | //unsigned x = threadIdx.x*4; 61 | //unsigned addr = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 62 | //unsigned addr2 = 0; 63 | // unsigned addr3 = 0; 64 | // unsigned addr4 = 0; 65 | // unsigned addr4 = 0; 66 | // unsigned addr4 = 0; 67 | unsigned frag[4]; 68 | frag[0] = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 69 | //printf("thread %d , addr = %d \n", tid, addr); 70 | // start timing 71 | uint32_t start = 0; 72 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 73 | 74 | // pointer-chasing ITERS times 75 | #pragma unroll 76 | for (uint32_t i = 0; i < ITERS; ++i) { 77 | //asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr), "=r"(addr2),"=r"(addr3),"=r"(addr4) : "r"(addr)); // first 11 78 | asm volatile ("wmma.load.a.sync.aligned.row.m16n16k16.shared.bf16 {%0,%1,%2,%3}, [%4];" 79 | : "=r"(frag[0]), "=r"(frag[1]),"=r"(frag[2]),"=r"(frag[3]) 80 | : "r"(frag[0]) ); // first 11 81 | 82 | 83 | //asm volatile ("ldmatrix.sync.aligne.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); // first 9 84 | //x = x++; 85 | } 86 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 87 | // stop timing 88 | uint32_t stop = 0; 89 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 90 | 91 | //printf("thread %d , x = %d \n", tid, addr); 92 | 93 | // write time and data back to memory 94 | startClk[uid] = start; 95 | stopClk[uid] = stop; 96 | dsink[uid] = frag[0] + frag[1] + frag[2] + frag[3]; 97 | 98 | // float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 99 | // printf("Shared Memory Latency = %f cycles\n", lat); 100 | //} 101 | } 102 | void test_with_different_thread(int THREADS_NUM){ 103 | 104 | BLOCKS_NUM = 1; 105 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 106 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 107 | 108 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 109 | 110 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 111 | uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); 112 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 113 | 114 | uint32_t *startClk_g; 115 | uint32_t *stopClk_g; 116 | shared_m *dsink_g; 117 | 118 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 119 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 120 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 121 | 122 | shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 123 | gpuErrchk(cudaPeekAtLastError()); 124 | // printf("pass kenerl \n"); 125 | gpuErrchk(cudaMemcpy(startClk, startClk_g, 
sizeof(uint32_t), 126 | cudaMemcpyDeviceToHost)); 127 | gpuErrchk( 128 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 129 | gpuErrchk( 130 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 131 | 132 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 133 | 134 | std::cout << THREADS_NUM/32 <<" warps, wmma.load.a.sync.aligned.row.m16n16k16.shared.bf16 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 135 | std::cout << "Total Clk number " << stopClk[0] - startClk[0] < 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../../hw_def/hw_def.h" 7 | 8 | #define WMMA_M 16 9 | #define WMMA_N 16 10 | #define WMMA_K 16 11 | 12 | 13 | 14 | #define SHARED_MEM_SIZE (48 * 1024 / 4) // 32 KB 15 | // Launch only one thread to calcaulte the latency using a pointer-chasing 16 | // array technique 17 | //#define THREADS_NUM 32 18 | // iterate over the array ITERS times 19 | #ifndef ITERS 20 | #define ITERS (1024 ) 21 | #endif 22 | 23 | 24 | #ifndef ILPconfig 25 | #define ILPconfig 1 26 | #endif 27 | 28 | static_assert(ILPconfig<=1,"ILP > 1 is not supported\n"); 29 | // two way bank conflict - > 23 latenct 30 | // bank-conflict-free -> 25 latency 31 | 32 | typedef uint32_t shared_m; 33 | // Measure latency of ITERS ldmatrix.x1 34 | __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, 35 | shared_m *dsink, uint32_t stride) { 36 | 37 | // thread index 38 | uint32_t tid = threadIdx.x; 39 | uint32_t bid = blockIdx.x; 40 | uint32_t uid = bid * blockDim.x + tid; 41 | uint32_t n_threads = blockDim.x * gridDim.x; 42 | 43 | __shared__ shared_m s[SHARED_MEM_SIZE]; // static shared memory 44 | 45 | // one thread to initialize the pointer-chasing array 46 | for (uint32_t i = uid; i < (SHARED_MEM_SIZE - stride); i += n_threads) 47 | s[i] = (i )*16 % 512; 48 | 49 | asm volatile("bar.sync 0;"); 50 | unsigned frag[8]; 51 | frag[0] = static_cast(__cvta_generic_to_shared(&s[threadIdx.x*4])); 52 | //printf("thread %d , addr = %d \n", tid, addr); 53 | // start timing 54 | uint32_t start = 0; 55 | asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); 56 | 57 | // pointer-chasing ITERS times 58 | //#pragma unroll 59 | for (uint32_t i = 0; i < ITERS; ++i) { 60 | //asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(addr), "=r"(addr2),"=r"(addr3),"=r"(addr4) : "r"(addr)); // first 11 61 | asm volatile ("wmma.load.a.sync.aligned.row.m16n16k16.shared.f16 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" 62 | : "=r"(frag[0]), "=r"(frag[1]),"=r"(frag[2]),"=r"(frag[3]), "=r"(frag[4]), "=r"(frag[5]),"=r"(frag[6]),"=r"(frag[7]) 63 | : "r"(frag[0]) ); // first 11 64 | } 65 | //asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr)); 66 | // stop timing 67 | uint32_t stop = 0; 68 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop)::"memory"); 69 | 70 | //printf("thread %d , x = %d \n", tid, addr); 71 | 72 | // write time and data back to memory 73 | startClk[uid] = start; 74 | stopClk[uid] = stop; 75 | dsink[uid] = frag[0] + frag[1] + frag[2] + frag[3] + frag[4] + frag[5] + frag[6] + frag[7]; 76 | } 77 | void test_with_different_thread(int THREADS_NUM){ 78 | 79 | BLOCKS_NUM = 1; 80 | TOTAL_THREADS = THREADS_NUM * BLOCKS_NUM; 81 | THREADS_PER_SM = THREADS_NUM * BLOCKS_NUM; 82 | 83 | assert(SHARED_MEM_SIZE * sizeof(shared_m) <= MAX_SHARED_MEM_SIZE_PER_BLOCK); 84 | 85 | uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); 86 | uint32_t *stopClk = (uint32_t 
*)malloc(sizeof(uint32_t)); 87 | shared_m *dsink = (shared_m *)malloc(sizeof(shared_m)); 88 | 89 | uint32_t *startClk_g; 90 | uint32_t *stopClk_g; 91 | shared_m *dsink_g; 92 | 93 | gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); 94 | gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); 95 | gpuErrchk(cudaMalloc(&dsink_g, sizeof(shared_m))); 96 | 97 | shared_lat<<<1, THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, 1); 98 | gpuErrchk(cudaPeekAtLastError()); 99 | // printf("pass kenerl \n"); 100 | gpuErrchk(cudaMemcpy(startClk, startClk_g, sizeof(uint32_t), 101 | cudaMemcpyDeviceToHost)); 102 | gpuErrchk( 103 | cudaMemcpy(stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 104 | gpuErrchk( 105 | cudaMemcpy(dsink, dsink_g, sizeof(shared_m), cudaMemcpyDeviceToHost)); 106 | 107 | float lat = (float)(stopClk[0] - startClk[0]) / ITERS; 108 | 109 | std::cout << THREADS_NUM/32 <<" warps, wmma.load.a.sync.aligned.m16n16k16.row.f16 latency " << lat <<" ( " <<(unsigned)(lat) << " ) " << std::endl; 110 | std::cout << "Total Clk number " << stopClk[0] - startClk[0] <