├── GPU_Microbenchmarks
│   ├── MaxFlops
│   │   ├── Makefile
│   │   ├── MaxFlops
│   │   └── MaxFlops.cu
│   ├── bw_measurements.txt
│   ├── l1_bw_64f
│   │   ├── Makefile
│   │   ├── l1_bw_64f
│   │   ├── l1_bw_64f.cu
│   │   └── l1_bw_64f_unroll.cu
│   ├── l2_bw_64f
│   │   ├── Makefile
│   │   ├── l2_bw_64f
│   │   └── l2_bw_64f.cu
│   ├── measure_bw.sh
│   └── mem_bw
│       ├── Makefile
│       ├── mem_bw
│       └── mem_bw.cu
├── README.md
├── hierarchical
│   ├── events_transpose.csv
│   ├── metrics_transpose.csv
│   ├── profile_nvp.sh
│   ├── roofline_transpose.png
│   ├── timing_transpose.csv
│   └── transpose
├── multi_kernels
│   ├── events.csv
│   ├── metrics.csv
│   ├── profile_ncu.sh
│   ├── roofline_kernels.png
│   └── timing.csv
└── roofline_tool.py
/GPU_Microbenchmarks/MaxFlops/Makefile: --------------------------------------------------------------------------------
1 | GENCODE_SM30 := -gencode=arch=compute_30,code=\"sm_30,compute_30\" 2 | GENCODE_SM35 := -gencode=arch=compute_35,code=\"sm_35,compute_35\" 3 | GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\" 4 | GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\" 5 | GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\" 6 | 7 | CUOPTS = $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70) 8 | 9 | 10 | CC := nvcc 11 | 12 | INCLUDE := 13 | LIB := 14 | 15 | SRC = MaxFlops.cu 16 | 17 | EXE = MaxFlops 18 | 19 | release: 20 | $(CC) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart 21 | 22 | clean: 23 | rm -f *.o; rm -f $(EXE) 24 | 25 | run: 26 | ./$(EXE) 27 | 28 | profile: 29 | nvprof ./$(EXE) 30 | 31 | events: 32 | nvprof --events elapsed_cycles_sm ./$(EXE) 33 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/MaxFlops/MaxFlops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/GPU_Microbenchmarks/MaxFlops/MaxFlops --------------------------------------------------------------------------------
/GPU_Microbenchmarks/MaxFlops/MaxFlops.cu: --------------------------------------------------------------------------------
1 | #include <stdio.h> 2 | #include <stdint.h> 3 | #include <cuda.h> 4 | 5 | #define THREADS_PER_BLOCK 1024 6 | #define THREADS_PER_SM 1024 7 | #define BLOCKS_NUM 1 8 | #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) 9 | #define WARP_SIZE 32 10 | #define REPEAT_TIMES 1024 11 | 12 | // GPU error check 13 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 14 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ 15 | if (code != cudaSuccess) { 16 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 17 | if (abort) exit(code); 18 | } 19 | } 20 | 21 | 22 | template <class T> 23 | __global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, T *data2, T *res) { 24 | int gid = blockIdx.x*blockDim.x + threadIdx.x; 25 | register T s1 = data1[gid]; 26 | register T s2 = data2[gid]; 27 | register T result = 0; 28 | 29 | // synchronize all threads 30 | asm volatile ("bar.sync 0;"); 31 | 32 | // start timing 33 | uint32_t start = 0; 34 | asm volatile ("mov.u32 %0, %%clock;" : "=r"(start) :: "memory"); 35 | 36 | for (int j=0 ; j<REPEAT_TIMES ; ++j) { 88 | max_flops<float><<<BLOCKS_NUM,THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, data1_g, data2_g, res_g); 89 | gpuErrchk( cudaPeekAtLastError() ); 90 | 91 | gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 92 | gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 93 | gpuErrchk( cudaMemcpy(res,
res_g, TOTAL_THREADS*sizeof(float), cudaMemcpyDeviceToHost) ); 94 | 95 | float flops; 96 | flops = (float)(REPEAT_TIMES*THREADS_PER_SM*8)/((float)(stopClk[0]-startClk[0])); // 8 FLOPs per thread per iteration 97 | printf("FLOP per SM = %f (flop/clk/SM)\n", flops); 98 | printf("Total Clk number = %u \n", stopClk[0]-startClk[0]); 99 | 100 | return 0; 101 | } 102 | 103 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/bw_measurements.txt: --------------------------------------------------------------------------------
1 | Running GPU Benchmarks 2 | 3 | L1 bandwidth = 123.449023 (byte/clk/SM) 4 | Total Clk number = 33976 5 | 6 | L2 bandwidth = 1793.795898 (byte/cycle) 7 | Total Clk number = 1496466 8 | 9 | Mem BW= 733.951599 (Byte/Clk) 10 | Mem BW= 858.081856 (GB/sec) 11 | Total Clk number = 274305 12 | 13 | FLOP per SM = 126.637703 (flop/clk/SM) 14 | Total Clk number = 66241 15 | 16 | Final Measurements After Calculations 17 | 18 | L1 bandwidth = 12295.522690 (GB/sec) 19 | L2 bandwidth = 2233.275893 (GB/sec) 20 | HBM bandwidth = 858.081856 (GB/sec) 21 | Max Flops = 12613.115218 (GFLOPS) 22 | 23 | Measurements normalized to 32B transaction size 24 | 25 | L1 bandwidth = 384.235084 (GTXN/s) 26 | L2 bandwidth = 69.789871 (GTXN/s) 27 | HBM bandwidth = 26.815058 (GTXN/s) 28 | Max warp-based Instructions = 394.159850 (GIPS) --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l1_bw_64f/Makefile: --------------------------------------------------------------------------------
1 | GENCODE_SM30 := -gencode=arch=compute_30,code=\"sm_30,compute_30\" 2 | GENCODE_SM35 := -gencode=arch=compute_35,code=\"sm_35,compute_35\" 3 | GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\" 4 | GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\" 5 | GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\" 6 | 7 | CUOPTS = $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70) 8 | 9 | 10 | CC := nvcc 11 | 12 | INCLUDE := 13 | LIB := 14 | 15 | SRC = l1_bw_64f.cu 16 | 17 | EXE = l1_bw_64f 18 | 19 | release: 20 | $(CC) --ptxas-options=-v $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart 21 | 22 | clean: 23 | rm -f *.o; rm -f $(EXE) 24 | 25 | run: 26 | ./$(EXE) 27 | 28 | profile: 29 | nvprof ./$(EXE) 30 | 31 | events: 32 | nvprof --events elapsed_cycles_sm ./$(EXE) 33 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l1_bw_64f/l1_bw_64f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/GPU_Microbenchmarks/l1_bw_64f/l1_bw_64f --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l1_bw_64f/l1_bw_64f.cu: --------------------------------------------------------------------------------
1 | //This code is a modification of the L1 cache benchmark from 2 | //"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": https://arxiv.org/pdf/1804.06826.pdf 3 | 4 | //This benchmark measures the maximum read bandwidth of the L1 cache for 64-bit reads 5 | 6 | //This code has been tested on the Volta V100 architecture 7 | 8 | #include <stdio.h> 9 | #include <stdint.h> 10 | #include <cuda.h> 11 | 12 | #define THREADS_PER_BLOCK 1024 13 | #define THREADS_PER_SM 1024 14 | #define BLOCKS_NUM 1 15 | #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) 16 | #define WARP_SIZE 32 17 | #define REPEAT_TIMES 256 18 | #define
ARRAY_SIZE 8192 //ARRAY_SIZE has to be less than L1_SIZE 19 | #define L1_SIZE 16384 //L1 size in 64-bit words. The Volta L1 is 128KB, i.e. 16K 64-bit words 20 | 21 | // GPU error check 22 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 23 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ 24 | if (code != cudaSuccess) { 25 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 26 | if (abort) exit(code); 27 | } 28 | } 29 | 30 | __global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink, double *posArray){ 31 | 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t uid = blockIdx.x * blockDim.x + tid; 35 | 36 | // a register to avoid compiler optimization 37 | double sink0 = 0; 38 | double sink1 = 0; 39 | 40 | // populate l1 cache to warm up 41 | for (uint32_t i = tid; i<ARRAY_SIZE; i+=THREADS_PER_SM) { 63 | ".reg .f64 data<2>;\n\t" 64 | "ld.global.ca.f64 data0, [%2+0];\n\t" 65 | "ld.global.ca.f64 data1, [%2+256];\n\t" 66 | "add.f64 %0, data0, %0;\n\t" 67 | "add.f64 %1, data1, %1;\n\t" 68 | "}" : "+d"(sink0),"+d"(sink1) : "l"(ptr) : "memory" 69 | ); 70 | } 71 | 72 | // synchronize all threads 73 | asm volatile("bar.sync 0;"); 74 | 75 | // stop timing 76 | uint32_t stop = 0; 77 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory"); 78 | 79 | // write time and data back to memory 80 | startClk[uid] = start; 81 | stopClk[uid] = stop; 82 | dsink[uid] = sink0+sink1; 83 | } 84 | 85 | int main(){ 86 | uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t)); 87 | uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t)); 88 | double *posArray = (double*) malloc(ARRAY_SIZE*sizeof(double)); 89 | double *dsink = (double*) malloc(TOTAL_THREADS*sizeof(double)); 90 | 91 | uint32_t *startClk_g; 92 | uint32_t *stopClk_g; 93 | double *posArray_g; 94 | double *dsink_g; 95 | 96 | for (uint32_t i=0; i<ARRAY_SIZE; i++) 106 | l1_bw<<<BLOCKS_NUM,THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g, posArray_g); 107 | gpuErrchk( cudaPeekAtLastError() ); 108 | 109 | gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 110 | gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 111 | gpuErrchk( cudaMemcpy(dsink, dsink_g, TOTAL_THREADS*sizeof(double), cudaMemcpyDeviceToHost) ); 112 | 113 | double bw; 114 | bw = (double)(REPEAT_TIMES*THREADS_PER_SM*8*2)/((double)(stopClk[0]-startClk[0])); // 2 loads x 8 bytes per thread per iteration 115 | printf("L1 bandwidth = %f (byte/clk/SM)\n", bw); 116 | printf("Total Clk number = %u \n", stopClk[0]-startClk[0]); 117 | 118 | return 0; 119 | } 120 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l1_bw_64f/l1_bw_64f_unroll.cu: --------------------------------------------------------------------------------
1 | //This code is a modification of the L1 cache benchmark from 2 | //"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": https://arxiv.org/pdf/1804.06826.pdf 3 | 4 | //This benchmark measures the maximum read bandwidth of the L1 cache for 64-bit reads 5 | 6 | //This code has been tested on the Volta V100 architecture 7 | 8 | #include <stdio.h> 9 | #include <stdint.h> 10 | #include <cuda.h> 11 | 12 | #define THREADS_PER_BLOCK 1024 13 | #define THREADS_PER_SM 1024 14 | #define BLOCKS_NUM 1 15 | #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) 16 | #define WARP_SIZE 32 17 | #define REPEAT_TIMES 256 18 | #define ARRAY_SIZE 8192 //(THREADS_PER_SM*4+REPEAT_TIMES*WARP_SIZE*4) //ARRAY_SIZE has to be less than L1_SIZE 19 | #define L1_SIZE 16384 //L1 size in 64-bit words.
The Volta L1 is 128KB, i.e. 16K 64-bit words 20 | 21 | // GPU error check 22 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 23 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ 24 | if (code != cudaSuccess) { 25 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 26 | if (abort) exit(code); 27 | } 28 | } 29 | 30 | __global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink, double *posArray){ 31 | 32 | // thread index 33 | uint32_t tid = threadIdx.x; 34 | uint32_t uid = blockIdx.x * blockDim.x + tid; 35 | 36 | // a register to avoid compiler optimization 37 | double sink0 = 0; 38 | double sink1 = 0; 39 | double sink2 = 0; 40 | double sink3 = 0; 41 | 42 | // populate l1 cache to warm up 43 | for (uint32_t i = tid; i<ARRAY_SIZE; i+=THREADS_PER_SM) { 66 | ".reg .f64 data<4>;\n\t" 67 | "ld.global.ca.f64 data0, [%4+0];\n\t" 68 | "ld.global.ca.f64 data1, [%4+256];\n\t" 69 | "ld.global.ca.f64 data2, [%4+512];\n\t" 70 | "ld.global.ca.f64 data3, [%4+768];\n\t" 71 | "add.f64 %0, data0, %0;\n\t" 72 | "add.f64 %1, data1, %1;\n\t" 73 | "add.f64 %2, data2, %2;\n\t" 74 | "add.f64 %3, data3, %3;\n\t" 75 | "}" : "+d"(sink0),"+d"(sink1),"+d"(sink2),"+d"(sink3) : "l"(ptr) : "memory" 76 | ); 77 | //} 78 | } 79 | 80 | // synchronize all threads 81 | asm volatile("bar.sync 0;"); 82 | 83 | // stop timing 84 | uint32_t stop = 0; 85 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory"); 86 | 87 | // write time and data back to memory 88 | startClk[uid] = start; 89 | stopClk[uid] = stop; 90 | dsink[uid] = sink0+sink1+sink2+sink3; 91 | } 92 | 93 | int main(){ 94 | uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t)); 95 | uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t)); 96 | double *posArray = (double*) malloc(ARRAY_SIZE*sizeof(double)); 97 | double *dsink = (double*) malloc(TOTAL_THREADS*sizeof(double)); 98 | 99 | uint32_t *startClk_g; 100 | uint32_t *stopClk_g; 101 | double *posArray_g; 102 | double *dsink_g; 103 | 104 | for (uint32_t i=0; i<ARRAY_SIZE; i++) 114 | l1_bw<<<BLOCKS_NUM,THREADS_PER_BLOCK>>>(startClk_g, stopClk_g, dsink_g, posArray_g); 115 | gpuErrchk( cudaPeekAtLastError() ); 116 | 117 | gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 118 | gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 119 | gpuErrchk( cudaMemcpy(dsink, dsink_g, TOTAL_THREADS*sizeof(double), cudaMemcpyDeviceToHost) ); 120 | 121 | double bw; 122 | bw = (double)(REPEAT_TIMES*THREADS_PER_SM*8*4)/((double)(stopClk[0]-startClk[0])); // 4 loads x 8 bytes per thread per iteration 123 | printf("L1 bandwidth = %f (byte/clk/SM)\n", bw); 124 | printf("Total Clk number = %u \n", stopClk[0]-startClk[0]); 125 | 126 | return 0; 127 | } 128 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l2_bw_64f/Makefile: --------------------------------------------------------------------------------
1 | GENCODE_SM30 := -gencode=arch=compute_30,code=\"sm_30,compute_30\" 2 | GENCODE_SM35 := -gencode=arch=compute_35,code=\"sm_35,compute_35\" 3 | GENCODE_SM50 := -gencode=arch=compute_50,code=\"sm_50,compute_50\" 4 | GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\" 5 | GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\" 6 | 7 | CUOPTS = $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM61) $(GENCODE_SM70) 8 | 9 | 10 | CC := nvcc 11 | 12 | INCLUDE := 13 | LIB := 14 | 15 | SRC = l2_bw_64f.cu 16 | 17 | EXE = l2_bw_64f 18 | 19 | NVCC_FLAGS = -Xptxas -dlcm=cg -Xptxas -dscm=wt 20 | 21 | release:
22 | $(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart 23 | 24 | clean: 25 | rm -f *.o; rm -f $(EXE) 26 | 27 | run: 28 | ./$(EXE) 29 | 30 | profile: 31 | nvprof ./$(EXE) 32 | 33 | events: 34 | nvprof --events elapsed_cycles_sm ./$(EXE) 35 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l2_bw_64f/l2_bw_64f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/GPU_Microbenchmarks/l2_bw_64f/l2_bw_64f --------------------------------------------------------------------------------
/GPU_Microbenchmarks/l2_bw_64f/l2_bw_64f.cu: --------------------------------------------------------------------------------
1 | //This code is a modification of the L2 cache benchmark from 2 | //"Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking": https://arxiv.org/pdf/1804.06826.pdf 3 | 4 | //This benchmark measures the maximum read bandwidth of the L2 cache for 64-bit reads 5 | //Compile this file using the following command to disable the L1 cache: 6 | // nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt l2_bw.cu 7 | 8 | //This code has been tested on the Volta V100 architecture 9 | 10 | #include <stdio.h> 11 | #include <stdint.h> 12 | #include <cuda.h> 13 | 14 | 15 | #define BLOCKS_NUM 160 16 | #define THREADS_NUM 1024 //thread number/block 17 | #define TOTAL_THREADS (BLOCKS_NUM * THREADS_NUM) 18 | #define REPEAT_TIMES 2048 19 | #define WARP_SIZE 32 20 | #define ARRAY_SIZE (TOTAL_THREADS + REPEAT_TIMES*WARP_SIZE) //Array size must not exceed L2 size 21 | #define L2_SIZE 786432 //number of doubles L2 can store 22 | 23 | // GPU error check 24 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 25 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ 26 | if (code != cudaSuccess) { 27 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 28 | if (abort) exit(code); 29 | } 30 | } 31 | 32 | /* 33 | L2 cache is warmed up by loading posArray and adding sink 34 | Start timing after warming up 35 | Load posArray and add sink to generate read traffic 36 | Repeat the previous step while offsetting posArray by one each iteration 37 | Stop timing and store data 38 | */ 39 | 40 | __global__ void l2_bw (uint32_t *startClk, uint32_t *stopClk, double *dsink, double *posArray){ 41 | // block and thread index 42 | uint32_t tid = threadIdx.x; 43 | uint32_t bid = blockIdx.x; 44 | uint32_t uid = bid * blockDim.x + tid; 45 | 46 | // a register to avoid compiler optimization 47 | double sink = 0; 48 | 49 | // warm up l2 cache 50 | for(uint32_t i = uid; i<ARRAY_SIZE; i+=TOTAL_THREADS){ 114 | l2_bw<<<BLOCKS_NUM,THREADS_NUM>>>(startClk_g, stopClk_g, dsink_g, posArray_g); 115 | gpuErrchk( cudaPeekAtLastError() ); 116 | 117 | gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 118 | gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 119 | gpuErrchk( cudaMemcpy(dsink, dsink_g, TOTAL_THREADS*sizeof(double), cudaMemcpyDeviceToHost) ); 120 | 121 | float bw; 122 | unsigned long long data = (unsigned long long)TOTAL_THREADS*REPEAT_TIMES*8; // 8 bytes (one double) per thread per iteration 123 | bw = (float)(data)/((float)(stopClk[0]-startClk[0])); 124 | printf("L2 bandwidth = %f (byte/cycle)\n", bw); 125 | printf("Total Clk number = %u \n", stopClk[0]-startClk[0]); 126 | 127 | return 0; 128 | } 129 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/measure_bw.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | # Use the GPU_Microbenchmarks from github.com/accel-sim/gpu-app-collection/tree/release/src/cuda/GPU_Microbenchmark 4 | # to measure the memory bandwidth of the L1 cache, the L2 cache and HBM, as well as the maximum GPU IPS. 5 | # To calculate the GPU frequency we use nvidia-smi 6 | 7 | # Make sure to provide all Makefiles with the appropriate GENCODE_SMxx, where xx is the GPU compute capability 8 | 9 | L1_path='l1_bw_64f' 10 | L1_bin='l1_bw_64f' 11 | 12 | L2_path='l2_bw_64f' 13 | L2_bin='l2_bw_64f' 14 | 15 | HBM_path='mem_bw' 16 | HBM_bin='mem_bw' 17 | 18 | Flops_path='MaxFlops' 19 | Flops_bin='MaxFlops' 20 | 21 | # Execute Makefiles 22 | make -s -C $L1_path 23 | make -s -C $L2_path 24 | make -s -C $HBM_path 25 | make -s -C $Flops_path 26 | 27 | echo -e "Running GPU Benchmarks \n" > bw_measurements.txt 28 | 29 | # Calculate bandwidths 30 | ./$L1_path/$L1_bin >> bw_measurements.txt 31 | echo "" >> bw_measurements.txt # leave blank line 32 | ./$L2_path/$L2_bin >> bw_measurements.txt 33 | echo "" >> bw_measurements.txt 34 | ./$HBM_path/$HBM_bin >> bw_measurements.txt 35 | echo "" >> bw_measurements.txt 36 | ./$Flops_path/$Flops_bin >> bw_measurements.txt 37 | 38 | # L1 BW 39 | L1_bw=$(grep -E 'L1 bandwidth' bw_measurements.txt | cut -d '=' -f 2 | cut -d '(' -f 1) 40 | ## multiply by the number of SMs; the V100 has 80 SMs 41 | SM=80 42 | L1_bw=$(echo ${L1_bw} \* ${SM} | bc) 43 | 44 | # L2 BW 45 | L2_bw=$(grep -E 'L2 bandwidth' bw_measurements.txt | cut -d '=' -f 2 | cut -d '(' -f 1) 46 | 47 | # HBM BW 48 | HBM_bw=$(grep -E 'GB/sec' bw_measurements.txt | cut -d '=' -f 2 | cut -d '(' -f 1) 49 | 50 | # Max Flops 51 | MaxFlops=$(grep -E 'FLOP per SM' bw_measurements.txt | cut -d '=' -f 2 | cut -d '(' -f 1) 52 | MaxFlops=$(echo ${MaxFlops} \* ${SM} | bc) 53 | 54 | # Obtain the max GPU SM frequency 55 | frequency=$(nvidia-smi -q -d CLOCK | sed -n '/SM Clock Samples/, /Avg/p' | grep Max | cut -d ':' -f 2 | cut -d 'M' -f 1) 56 | 57 | echo -e "\nFinal Measurements After Calculations\n" >> bw_measurements.txt 58 | 59 | # Multiply the L1/L2 BW and MaxFlops by the frequency -> GB/sec and GFLOPS 60 | frequency=$(echo ${frequency} \* 0.001 | bc) # transform MHz to GHz 61 | L1_bw=$(echo ${L1_bw} \* ${frequency} | bc) 62 | L2_bw=$(echo ${L2_bw} \* ${frequency} | bc) 63 | MaxFlops=$(echo ${MaxFlops} \* ${frequency} | bc) 64 | 65 | echo "L1 bandwidth = ${L1_bw} (GB/sec)" >> bw_measurements.txt 66 | echo "L2 bandwidth = ${L2_bw} (GB/sec)" >> bw_measurements.txt 67 | echo "HBM bandwidth = ${HBM_bw} (GB/sec)" >> bw_measurements.txt 68 | echo "Max Flops = ${MaxFlops} (GFLOPS)" >> bw_measurements.txt 69 | 70 | echo -e "\nMeasurements normalized to 32B transaction size\n" >> bw_measurements.txt 71 | 72 | # multiply by 1/32 = 0.03125 to convert GB/sec to GTXN/s 73 | L1_bw=$(echo ${L1_bw} \* 0.03125 | bc) 74 | L2_bw=$(echo ${L2_bw} \* 0.03125 | bc) 75 | HBM_bw=$(echo ${HBM_bw} \* 0.03125 | bc) 76 | MaxFlops=$(echo ${MaxFlops} \* 0.03125 | bc) 77 | 78 | echo "L1 bandwidth = ${L1_bw} (GTXN/s)" >> bw_measurements.txt 79 | echo "L2 bandwidth = ${L2_bw} (GTXN/s)" >> bw_measurements.txt 80 | echo "HBM bandwidth = ${HBM_bw} (GTXN/s)" >> bw_measurements.txt 81 | echo "Max warp-based Instructions = ${MaxFlops} (GIPS)" >> bw_measurements.txt --------------------------------------------------------------------------------
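The conversions above can be sanity-checked against the shipped bw_measurements.txt. A minimal Python rendering of the same arithmetic; the 1.245 GHz SM clock is inferred from those V100S numbers rather than read from the repo:

```python
# Reproduce the measure_bw.sh arithmetic with the raw V100S numbers
# from bw_measurements.txt.
SM = 80                  # number of SMs on a V100/V100S
freq_ghz = 1.245         # max SM clock from nvidia-smi, converted to GHz

l1_per_sm = 123.449023   # byte/clk/SM, printed by l1_bw_64f
l2_total = 1793.795898   # byte/cycle, printed by l2_bw_64f (device-wide)

l1_gbs = l1_per_sm * SM * freq_ghz   # -> 12295.52 GB/sec
l2_gbs = l2_total * freq_ghz         # -> 2233.28 GB/sec

# normalize to the 32B transaction size used by the roofline ceilings
l1_gtxn = l1_gbs / 32                # -> 384.24 GTXN/s
l2_gtxn = l2_gbs / 32                # -> 69.79 GTXN/s
print(f"L1 {l1_gtxn:.2f} GTXN/s, L2 {l2_gtxn:.2f} GTXN/s")
```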
/GPU_Microbenchmarks/mem_bw/Makefile: --------------------------------------------------------------------------------
1 | GENCODE_SM30 := -gencode=arch=compute_30,code=\"sm_30,compute_30\" 2 | GENCODE_SM35 := -gencode=arch=compute_35,code=\"sm_35,compute_35\" 3 | GENCODE_SM61 := -gencode=arch=compute_61,code=\"sm_61,compute_61\" 4 | GENCODE_SM70 := -gencode=arch=compute_70,code=\"sm_70,compute_70\" 5 | 6 | CUOPTS = $(GENCODE_SM35) $(GENCODE_SM61) $(GENCODE_SM70) 7 | 8 | 9 | CC := nvcc 10 | 11 | INCLUDE := 12 | LIB := 13 | 14 | SRC = mem_bw.cu 15 | 16 | EXE = mem_bw 17 | 18 | NVCC_FLAGS = -Xptxas -dlcm=cg -Xptxas -dscm=wt 19 | 20 | release: 21 | $(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart 22 | 23 | run: 24 | ./$(EXE) 25 | 26 | profile: 27 | nvprof ./$(EXE) 28 | 29 | profileall: 30 | nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE) 31 | 32 | events: 33 | nvprof --events elapsed_cycles_sm ./$(EXE) 34 | --------------------------------------------------------------------------------
/GPU_Microbenchmarks/mem_bw/mem_bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/GPU_Microbenchmarks/mem_bw/mem_bw --------------------------------------------------------------------------------
/GPU_Microbenchmarks/mem_bw/mem_bw.cu: --------------------------------------------------------------------------------
1 | 2 | //This benchmark measures the maximum read bandwidth of GPU memory 3 | //Compile this file using the following command to disable the L1 cache: 4 | // nvcc -Xptxas -dlcm=cg -Xptxas -dscm=wt mem_bw.cu 5 | 6 | //This code has been tested on the Volta V100 architecture 7 | //You can check the mem BW with nvprof (dram_read_throughput + dram_write_throughput) 8 | 9 | #include <stdio.h> 10 | #include <stdint.h> 11 | #include <cuda.h> 12 | 13 | #define BLOCKS_NUM 160 14 | #define THREADS_NUM 1024 //thread number/block 15 | #define TOTAL_THREADS (BLOCKS_NUM*THREADS_NUM) 16 | #define ARRAY_SIZE 8388608 //Array size has to exceed L2 size to avoid L2 cache residence 17 | #define WARP_SIZE 32 18 | #define L2_SIZE 1572864 //number of floats L2 can store 19 | #define clock_freq_MHZ 1132 20 | 21 | // GPU error check 22 | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 23 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ 24 | if (code != cudaSuccess) { 25 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 26 | if (abort) exit(code); 27 | } 28 | } 29 | 30 | /* 31 | Vector addition over several float4 arrays 32 | Send as many float4 read requests in flight as possible to increase the DRAM row-buffer locality and hit the max BW 33 | */ 34 | 35 | __global__ void mem_bw (float* A, float* B, float* C, float* D, float* E, float* F, uint32_t *startClk, uint32_t *stopClk){ 36 | // block and thread index 37 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 38 | 39 | // synchronize all threads 40 | asm volatile ("bar.sync 0;"); 41 | 42 | // start timing 43 | uint32_t start = 0; 44 | asm volatile ("mov.u32 %0, %%clock;" : "=r"(start) :: "memory"); 45 | 46 | for(int i = idx; i < ARRAY_SIZE/4; i += blockDim.x * gridDim.x) { 47 | float4 a1 = reinterpret_cast<float4*>(A)[i]; 48 | float4 b1 = reinterpret_cast<float4*>(B)[i]; 49 | float4 d1 = reinterpret_cast<float4*>(D)[i]; 50 | float4 e1 = reinterpret_cast<float4*>(E)[i]; 51 | float4 f1 = reinterpret_cast<float4*>(F)[i]; 52 | float4 c1; 53 | 54 | c1.x = a1.x + b1.x + d1.x + e1.x + f1.x; 55 | c1.y = a1.y + b1.y + d1.y + e1.y + f1.y; 56 | c1.z = a1.z + b1.z + d1.z + e1.z + f1.z; 57 | c1.w = a1.w + b1.w + d1.w + e1.w + f1.w; 58 | 59 |
reinterpret_cast<float4*>(C)[i] = c1; 60 | } 61 | 62 | // synchronize all threads 63 | 64 | 65 | asm volatile ("bar.sync 0;"); 66 | 67 | // stop timing 68 | uint32_t stop = 0; 69 | asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory"); 70 | 71 | // write time and data back to memory 72 | startClk[idx] = start; 73 | stopClk[idx] = stop; 74 | } 75 | 76 | int main(){ 77 | uint32_t *startClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t)); 78 | uint32_t *stopClk = (uint32_t*) malloc(TOTAL_THREADS*sizeof(uint32_t)); 79 | float *A = (float*) malloc(ARRAY_SIZE*sizeof(float)); 80 | float *B = (float*) malloc(ARRAY_SIZE*sizeof(float)); 81 | float *C = (float*) malloc(ARRAY_SIZE*sizeof(float)); 82 | float *D = (float*) malloc(ARRAY_SIZE*sizeof(float)); 83 | float *E = (float*) malloc(ARRAY_SIZE*sizeof(float)); 84 | float *F = (float*) malloc(ARRAY_SIZE*sizeof(float)); 85 | 86 | 87 | uint32_t *startClk_g; 88 | uint32_t *stopClk_g; 89 | float *A_g; 90 | float *B_g; 91 | float *C_g; 92 | float *D_g; 93 | float *E_g; 94 | float *F_g; 95 | 96 | 97 | for (uint32_t i=0; i<ARRAY_SIZE; i++) { 127 | mem_bw<<<BLOCKS_NUM,THREADS_NUM>>>(A_g, B_g, C_g, D_g, E_g, F_g, startClk_g, stopClk_g); 128 | cudaEventRecord(stop); 129 | cudaEventSynchronize(stop); 130 | 131 | gpuErrchk( cudaPeekAtLastError() ); 132 | 133 | gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 134 | gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); 135 | gpuErrchk( cudaMemcpy(C, C_g, ARRAY_SIZE*sizeof(float), cudaMemcpyDeviceToHost) ); 136 | 137 | float mem_bw; 138 | float milliseconds = 0; 139 | cudaEventElapsedTime(&milliseconds, start, stop); 140 | 141 | unsigned N = ARRAY_SIZE * 6 * 4; //6 float arrays, 4 bytes per element 142 | 143 | mem_bw = (float)(N)/((float)(stopClk[0]-startClk[0])); 144 | printf("Mem BW= %f (Byte/Clk)\n", mem_bw); 145 | printf("Mem BW= %f (GB/sec)\n", (float)N/milliseconds/1e6); 146 | printf("Total Clk number = %u \n", stopClk[0]-startClk[0]); 147 | } 148 | 149 | --------------------------------------------------------------------------------
/README.md: --------------------------------------------------------------------------------
1 | # Python Hierarchical Roofline Module 2 | 3 | ### Module to plot the hierarchical roofline model for GPU kernels. 4 | 5 | ### Methodology can be found in the following paper: 6 | ### [Ding, Nan & Williams, Samuel. (2019). An Instruction Roofline Model for GPUs.](https://crd.lbl.gov/assets/Uploads/InstructionRooflineModel-PMBS19-.pdf) 7 | 8 | ## Metrics Used 9 | 10 | - Number of instructions executed by the kernel 11 | 12 | - Total number of global transactions for L1 13 | 14 | - Total number of shared transactions for L1 15 | 16 | - Total number of L2 transactions 17 | 18 | - Total number of HBM transactions 19 | 20 | From the above we get the following results (a small worked example follows the list): 21 | 22 | - Instruction Intensity: (No. Instructions/32) / (No. Transactions), with instruction counts scaled to warp level 23 | 24 | - Performance: (No. Instructions/32) / (1e9 * run time in seconds), i.e. warp GIPS; the kernel_time read from the csv is converted to usecs
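As a small worked example, take the transpose kernel shipped under hierarchical/: events_transpose.csv gives the thread-level instruction count, timing_transpose.csv the average kernel time, and metrics_transpose.csv the L1 transactions. The following sketch computes both quantities the same way roofline_tool.py does:

```python
# Worked example with the transpose numbers from hierarchical/.
thread_inst = 16777216       # thread_inst_executed, avg per invocation
kernel_time_us = 26.445      # avg kernel time (0.026445 ms in the timing csv)

warp_inst = thread_inst / 32 # scale the thread-level count to warp level

# L1 transactions: global loads + global stores (shared ones are 0 here)
l1_txn = 131072 + 524288     # gld_transactions + gst_transactions

intensity = warp_inst / l1_txn               # -> 0.8 warp instructions per txn
gips = warp_inst / (1e3 * kernel_time_us)    # -> ~19.8 warp GIPS
print(f"L1 intensity {intensity:.2f}, performance {gips:.2f} warp GIPS")
```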
25 | 26 | For **nvprof** and **ncu** metric comparison, the NSight Compute [User Manual](https://docs.nvidia.com/nsight-compute/2020.1/pdf/NsightComputeCli.pdf) can be used. 27 | 28 | ## Plotting Requirements 29 | 30 | - [matplotlib](https://matplotlib.org/) 31 | 32 | - [pandas](https://pandas.pydata.org/) 33 | 34 | ## How to use the roofline tool 35 | 36 | ### 1. Collect metrics with an NVIDIA profiler 37 | 38 | Use **profile_nvp.sh** (for nvprof) or **profile_ncu.sh** (for ncu) 39 | 40 | ### 2. Modify roofline_tool.py 41 | 42 | - Edit the nvp and ncu dictionaries with the appropriate metric names if needed 43 | 44 | - Edit **create_graph()** with the peak GIPS and memory bandwidth values 45 | 46 | - Modify the parameters in the **main()** function: 47 | 48 | - Edit the colors, markers & labels dictionaries with the proper kernel names (the l1, l2 & hbm keys are used for the hierarchical roofline) 49 | 50 | - Define the metric files 51 | 52 | - Define the profiler used (nvp or ncu) 53 | 54 | - Define the kernel name(s) as shown in the csv files 55 | 56 | - Define the memory types used for the bw ceilings and for app characterization 57 | 58 | - Define the graph elements 59 | 60 | - Choose the roofline **mode**: 61 | 62 | - **0** : Hierarchical roofline for a single kernel 63 | 64 | 65 | - **1** : Roofline model for multiple kernels and the chosen memory type(s) 66 | 67 | 68 | 69 | --- 70 | 71 | ## Bandwidth Ceilings 72 | In the **roofline_tool.py** module, the ceilings for the L1 and L2 caches, HBM and the maximum (warp-based) IPS of the NVIDIA V100S and A100 GPUs are sourced from the above-mentioned paper and from the [Ampere Whitepaper](https://images.nvidia.com/aem-dam/en-zz/Solutions/data-center/nvidia-ampere-architecture-whitepaper.pdf), respectively. Empirical bandwidths can be calculated by running ***measure_bw.sh*** under the GPU_Microbenchmarks folder (results for the V100S are provided in bw_measurements.txt); ***measure_bw.sh*** currently supports only GPU models with **nvprof** profiling capabilities. The microbenchmarks used can be found here: [accel-sim/gpu-app-collection](https://github.com/accel-sim/gpu-app-collection/tree/release/src/cuda/GPU_Microbenchmark) --------------------------------------------------------------------------------
/hierarchical/events_transpose.csv: --------------------------------------------------------------------------------
1 | "Device","Kernel","Invocations","Event Name","Min","Max","Avg","Total" 2 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"thread_inst_executed",16777216,16777216,16777216,1694498816 3 | --------------------------------------------------------------------------------
/hierarchical/metrics_transpose.csv: --------------------------------------------------------------------------------
1 | "Device","Kernel","Invocations","Metric Name","Metric Description","Min","Max","Avg" 2 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"inst_executed_global_loads","Warp level instructions for global loads",32768,32768,32768 3 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"gld_transactions","Global Load Transactions",131072,131072,131072 4 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"gst_transactions","Global Store Transactions",524288,524288,524288 5 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"shared_load_transactions","Shared Load Transactions",0,0,0 6 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"shared_store_transactions","Shared Store Transactions",0,0,0 7 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"l2_read_transactions","L2 Read Transactions",131168,131364,131172 8 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"l2_write_transactions","L2 Write Transactions",533906,550835,545629
(0)","transposeNaive(float*, float*, int, int)",101,"dram_read_transactions","Device Memory Read Transactions",131291,144058,136549 10 | "Tesla V100S-PCIE-32GB (0)","transposeNaive(float*, float*, int, int)",101,"dram_write_transactions","Device Memory Write Transactions",77553,176292,135811 11 | -------------------------------------------------------------------------------- /hierarchical/profile_nvp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script profiling a CUDA application (transpose) 4 | # A python application can be profiled likewise 5 | # the nvprof profiler is used 6 | 7 | exe="transpose" # modify appropriately 8 | 9 | # metrics for nvprof 10 | metrics='inst_executed_global_loads,gld_transactions,gst_transactions,shared_load_transactions,shared_store_transactions,l2_read_transactions,l2_write_transactions,dram_read_transactions,dram_write_transactions' 11 | 12 | events='thread_inst_executed' 13 | # if profiling python module it may need the following option 14 | # options='--openacc-profiling off' 15 | 16 | nvprof --csv --print-gpu-summary --log-file timing_${exe}.csv ./${exe} 17 | nvprof --csv --metrics $metrics --log-file metrics_${exe}.csv ./${exe} 18 | nvprof --csv --events $events --log-file events_${exe}.csv ./${exe} 19 | -------------------------------------------------------------------------------- /hierarchical/roofline_transpose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/hierarchical/roofline_transpose.png -------------------------------------------------------------------------------- /hierarchical/timing_transpose.csv: -------------------------------------------------------------------------------- 1 | "Type","Time(%)","Time","Calls","Avg","Min","Max","Name" 2 | ,%,ms,,ms,us,ms, 3 | "GPU activities",43.971243,2.670947,101,0.026445,25.983000,0.027104,"transposeNaive(float*, float*, int, int)" 4 | "GPU activities",43.555097,2.645669,2,1.322834,687.129000,1.958540,"[CUDA memcpy DtoH]" 5 | "GPU activities",12.473660,0.757688,1,0.757688,757.688000,0.757688,"[CUDA memcpy HtoD]" 6 | -------------------------------------------------------------------------------- /hierarchical/transpose: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/hierarchical/transpose -------------------------------------------------------------------------------- /multi_kernels/events.csv: -------------------------------------------------------------------------------- 1 | "Process ID","Process Name","Host Name","Kernel Name","Block Size","Grid Size","Device","CC","Invocations","Section Name","Metric Name","Metric Unit","Minimum","Maximum","Average" 2 | "6515","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","smsp__thread_inst_executed.sum","inst","74,784,015.00","74,784,015.00","74,784,015.00" 3 | "6515","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","smsp__thread_inst_executed.sum","inst","43,077,888.00","43,077,888.00","43,077,888.00" 4 | -------------------------------------------------------------------------------- /multi_kernels/metrics.csv: -------------------------------------------------------------------------------- 1 | 
"Process ID","Process Name","Host Name","Kernel Name","Block Size","Grid Size","Device","CC","Invocations","Section Name","Metric Name","Metric Unit","Minimum","Maximum","Average" 2 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","dram__sectors_read.sum","sector","402,316.00","402,364.00","402,339.36" 3 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","dram__sectors_write.sum","sector","0.00","0.00","0.00" 4 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum","","83,370.00","83,445.00","83,411.23" 5 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum","","82,944.00","82,944.00","82,944.00" 6 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum","sector","375,000.00","375,000.00","375,000.00" 7 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum","sector","0.00","0.00","0.00" 8 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_atom.sum","sector","0.00","0.00","0.00" 9 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_read.sum","sector","561,714.00","565,602.00","563,533.63" 10 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_red.sum","sector","2,706,852.00","2,706,852.00","2,706,852.00" 11 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_write.sum","sector","14.00","415.00","127.02" 12 | "1696","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","smsp__inst_executed_op_global_ld.sum","inst","46,875.00","46,875.00","46,875.00" 13 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","dram__sectors_read.sum","sector","806,228.00","806,604.00","806,261.76" 14 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","dram__sectors_write.sum","sector","12,412.00","241,456.00","15,106.60" 15 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum","","0.00","0.00","0.00" 16 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum","","0.00","0.00","0.00" 17 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum","sector","3,570,040.00","3,572,749.00","3,571,289.82" 18 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler 
metrics","l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum","sector","375,000.00","375,000.00","375,000.00" 19 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_atom.sum","sector","0.00","0.00","0.00" 20 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_read.sum","sector","3,977,858.00","3,980,093.00","3,978,598.37" 21 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_red.sum","sector","0.00","0.00","0.00" 22 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","lts__t_sectors_op_write.sum","sector","561,601.00","562,390.00","561,636.89" 23 | "1696","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","smsp__inst_executed_op_global_ld.sum","inst","201,324.00","201,324.00","201,324.00" 24 | -------------------------------------------------------------------------------- /multi_kernels/profile_ncu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Draft script profiling a python application 4 | # that utilizes a GPU model (e.g. with CuPy library) 5 | # the Nsight Compute Profiler (ncu) is used 6 | # exe file must be defined appropriately 7 | 8 | 9 | gld_transactions='l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum' 10 | gst_transactions='l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum' 11 | shared_load_transactions='l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum' 12 | shared_store_transactions='l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum' 13 | l2_transactions_2='lts__t_sectors_op_atom.sum' 14 | l2_transactions_3='lts__t_sectors_op_red.sum' 15 | l2_read_transactions_1='lts__t_sectors_op_read.sum' 16 | l2_read_transactions="${l2_read_transactions_1},${l2_transactions_2},${l2_transactions_3}" 17 | l2_write_transactions_1='lts__t_sectors_op_write.sum' 18 | l2_write_transactions="${l2_write_transactions_1},${l2_transactions_2},${l2_transactions_3}" 19 | dram_read_transactions='dram__sectors_read.sum' 20 | dram_write_transactions='dram__sectors_write.sum' 21 | 22 | metrics="${gld_transactions},${gst_transactions},${shared_load_transactions},${shared_store_transactions},${l2_read_transactions},${l2_write_transactions},${dram_read_transactions},${dram_write_transactions}" 23 | events='smsp__thread_inst_executed.sum' 24 | 25 | options='--csv --profile-from-start off --print-summary per-kernel' 26 | 27 | $ncu $options --metrics gpu__time_active.avg --log-file timing.csv $python $exe 28 | $ncu $options --metrics $metrics --log-file metrics.csv $python $exe 29 | $ncu $options --metrics $events --log-file events.csv $python $exe -------------------------------------------------------------------------------- /multi_kernels/roofline_kernels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Giotyp/GPU-Roofline-Python/8ca95c7ccf9a94c771ab54deff639dac81653458/multi_kernels/roofline_kernels.png -------------------------------------------------------------------------------- /multi_kernels/timing.csv: -------------------------------------------------------------------------------- 1 | "Process ID","Process Name","Host Name","Kernel Name","Block Size","Grid 
Size","Device","CC","Invocations","Section Name","Metric Name","Metric Unit","Minimum","Maximum","Average" 2 | "76","python3.8","127.0.0.1","kernel_B","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","gpu__time_active.avg","nsecond","90,016.00","95,136.00","93,188.16" 3 | "76","python3.8","127.0.0.1","kernel_A","(1024, 1, 1)","(216, 1, 1)","0","8.0","100","Command line profiler metrics","gpu__time_active.avg","nsecond","76,064.00","78,240.00","76,952.96" 4 | -------------------------------------------------------------------------------- /roofline_tool.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | 5 | ## Create dictionaries for ncu and nvprof ## 6 | ## with appropriate metric names ## 7 | 8 | nvp = { 9 | "time_kernel": "Name", 10 | "Average": "Avg", 11 | "metric_kernel": "Kernel", 12 | "gld":"gld_transactions", 13 | "gst":"gst_transactions", 14 | "shld":"shared_load_transactions", 15 | "shst":"shared_store_transactions", 16 | "l2rd":"l2_read_transactions", 17 | "l2wr":"l2_write_transactions", 18 | "drrd":"dram_read_transactions", 19 | "drwr":"dram_write_transactions", 20 | } 21 | 22 | ncu = { 23 | "time_kernel": "Kernel Name", 24 | "Average": "Average", 25 | "metric_kernel": "Kernel Name", 26 | "gld":"l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum", 27 | "gst":"l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum", 28 | "shld":"l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", 29 | "shst":"l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", 30 | "l2rd":"lts__t_sectors_op_read.sum", 31 | "l2wr":"lts__t_sectors_op_write.sum", 32 | "l2at":"lts__t_sectors_op_atom.sum", 33 | "l2red":"lts__t_sectors_op_red.sum", 34 | "drrd":"dram__sectors_read.sum", 35 | "drwr":"dram__sectors_write.sum", 36 | } 37 | 38 | 39 | # Function to return float value from row object 40 | # ncu stores numbers in ',' format e.g. 
750,000.0 41 | def float_val(x): 42 | return float(x.to_string().split(' ')[-1].replace(',','')) 43 | 44 | 45 | def create_graph(memories): 46 | ## Define Bandwidths ## 47 | 48 | #peak = 489.6 # theoretical peak for V100S in warp GIPS 49 | peak = 609.12 # theoretical peak for A100 50 | 51 | #l1_bw = 437.5 # theoretical bandwidth of L1 for V100S 52 | l1_bw = 1312.5 # theoretical bandwidth of L1 for A100 53 | l1_elbow = peak/l1_bw 54 | 55 | #l2_bw = 93.6 # theoretical bandwidth of L2 for V100S 56 | l2_bw = 215.3 # theoretical bandwidth of L2 for A100 57 | l2_elbow = peak/l2_bw 58 | 59 | #hbm_bw = 25.9 # theoretical bandwidth of HBM for V100S 60 | hbm_bw = 48.6 # theoretical bandwidth of HBM for A100 61 | hbm_elbow = peak/hbm_bw 62 | 63 | 64 | ## Plotting ## 65 | 66 | # Modify specified figure parameters according to needs 67 | 68 | fig = plt.figure(figsize=(8,4)) 69 | # (left,bottom,width,height) of the figure 70 | ax = plt.axes((0.1,0.1,0.8,0.8)) 71 | 72 | ax.set_xscale('log') 73 | ax.set_yscale('log') 74 | ax.set_xlabel('Instruction Intensity (Warp Instructions per Transaction)') 75 | ax.set_ylabel('Performance (warp GIPS)') 76 | 77 | xmin, xmax, ymin, ymax = -2, 2, -1, 3 78 | 79 | ax.set_xlim(10**xmin, 10**xmax) 80 | ax.set_ylim(10**ymin, 10**ymax) 81 | 82 | instr_min = 10**xmin 83 | instr_max = 10**xmax 84 | 85 | peak_x = np.asarray([l1_elbow, 10**xmax]) # performance ceiling 86 | peak_y = np.asarray([peak, peak]) 87 | 88 | l1_x = np.asarray([instr_min, l1_elbow ]) # L1 ceiling 89 | l1_y = np.asarray([l1_bw*instr_min, peak]) 90 | 91 | l2_x = np.asarray([instr_min, l2_elbow]) # L2 ceiling 92 | l2_y = np.asarray([l2_bw*instr_min, peak]) 93 | 94 | hbm_x = np.asarray([instr_min, hbm_elbow]) # HBM ceiling 95 | hbm_y = np.asarray([hbm_bw*instr_min, peak]) 96 | 97 | # add architectural characterization to figure 98 | 99 | l1, l2, hbm = memories 100 | 101 | ax.plot(peak_x, peak_y, color='0') # Performance ceiling 102 | 103 | if l1: 104 | ax.plot(l1_x, l1_y, color='r', label=f'L1 {l1_bw} GTXN/s') # L1 ceiling 105 | if l2: 106 | ax.plot(l2_x, l2_y, color='g', label=f'L2 {l2_bw} GTXN/s') # L2 ceiling 107 | if hbm: 108 | ax.plot(hbm_x, hbm_y, color='b', label=f'HBM {hbm_bw} GTXN/s') # HBM ceiling 109 | 110 | # text for peak performance 111 | ax.text(l1_elbow, peak+100, f'Theoretical Peak: {peak} warp GIPS') 112 | 113 | return ax,fig 114 | 115 | 116 | 117 | def timing(kernel_stats, kernel_name, time_file, profiler): 118 | # Format csv files to discard nvprof unwanted output 119 | # e.g. 
: ==5328== NVPROF is profiling process 5328, command: ./transpose 120 | 121 | with open(time_file,"r+") as f: 122 | new_f = f.readlines() 123 | f.seek(0) 124 | for line in new_f: 125 | if "==" not in line: 126 | f.write(line) 127 | f.truncate() 128 | 129 | # get csv database with pandas 130 | timing = pd.read_csv(time_file) 131 | 132 | ## Kernel Time ## 133 | time_row = timing.loc[(timing[profiler["time_kernel"]] == kernel_name)] 134 | 135 | kernel_time = time_row[profiler["Average"]] # average kernel time 136 | kernel_time = float_val(kernel_time) 137 | 138 | if profiler == nvp: 139 | unit_row = timing.loc[(timing['Time(%)'].str.match('%', na=False))] 140 | unit = unit_row.iloc[0]["Avg"] # unit used 141 | unit += "econd" # convert to ncu display 142 | elif profiler == ncu: 143 | unit = time_row["Metric Unit"].to_list()[0] # unit used 144 | 145 | # change to usecond 146 | if unit == 'msecond': 147 | kernel_time *= 1000 148 | elif unit == 'nsecond': 149 | kernel_time /= 1000 150 | elif unit == 'second': 151 | kernel_time *= 1000000 152 | 153 | kernel_stats['kernel_time'] = kernel_time 154 | 155 | 156 | 157 | def find_inst(kernel_stats, kernel_name, events_file, profiler): 158 | with open(events_file,"r+") as f: 159 | new_f = f.readlines() 160 | f.seek(0) 161 | for line in new_f: 162 | if "==" not in line: 163 | f.write(line) 164 | f.truncate() 165 | 166 | events = pd.read_csv(events_file) 167 | 168 | # Total Instructions 169 | instructions = events.loc[(events[profiler["metric_kernel"]] == kernel_name)] 170 | kernel_inst = instructions[profiler["Average"]] 171 | kernel_inst = float_val(kernel_inst) 172 | total_inst_nrml = kernel_inst 173 | 174 | total_inst_nrml /= 32 175 | 176 | kernel_stats['total_inst'] = total_inst_nrml 177 | 178 | 179 | 180 | def app_char(kernel_stats, kernel_name, metrics_file, graph, memories, profiler, labels, colors, markers, mode): 181 | with open(metrics_file,"r+") as f: 182 | new_f = f.readlines() 183 | f.seek(0) 184 | for line in new_f: 185 | if "==" not in line: 186 | f.write(line) 187 | f.truncate() 188 | 189 | metrics = pd.read_csv(metrics_file) 190 | kernel_metrics = metrics.loc[(metrics[profiler["metric_kernel"]] == kernel_name)] 191 | 192 | l1, l2, hbm = memories 193 | 194 | if l1: 195 | ## L1 stats ## 196 | 197 | gld_stats = kernel_metrics.loc[kernel_metrics['Metric Name'] == profiler["gld"]] 198 | gld_trans = int(float_val(gld_stats[profiler["Average"]])) 199 | 200 | gst_stats = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["gst"])] 201 | gst_trans = int(float_val(gst_stats[profiler["Average"]])) 202 | 203 | sld_stats = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["shld"])] 204 | sld_trans = int(float_val(sld_stats[profiler["Average"]])) 205 | 206 | sst_stats = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["shst"])] 207 | sst_trans = int(float_val(sst_stats[profiler["Average"]])) 208 | 209 | l1_total = gld_trans + gst_trans + sld_trans + sst_trans 210 | 211 | l1_intensity = kernel_stats['total_inst'] / l1_total 212 | l1_performance = kernel_stats['total_inst'] / (1000 * kernel_stats['kernel_time']) # performance in GIPS ( kernel_time in μsecs ) 213 | 214 | 215 | if mode == 0: 216 | color = colors["l1"] 217 | marker = markers["l1"] 218 | label = labels["l1"] 219 | elif mode == 1: 220 | color = colors[kernel_name] 221 | marker = markers[kernel_name] 222 | label = labels[kernel_name] 223 | 224 | graph.plot(l1_intensity, l1_performance, color=color, marker = marker, label=label) 225 | 226 | 227 | if l2: 228 | 
## L2 stats ## 229 | 230 | l2_at, l2_red = 0,0 231 | if profiler == ncu: 232 | l2_at = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["l2at"])] 233 | l2_at = int(float_val(l2_at[profiler["Average"]])) 234 | l2_red = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["l2red"])] 235 | l2_red = int(float_val(l2_red[profiler["Average"]])) 236 | 237 | l2_rd = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["l2rd"])] 238 | l2_rd = int(float_val(l2_rd[profiler["Average"]])) 239 | 240 | l2_read_trans = l2_rd + l2_red + l2_at 241 | 242 | l2_wr = kernel_metrics.loc[(kernel_metrics["Metric Name"] == profiler["l2wr"])] 243 | l2_wr = int(float_val(l2_wr[profiler["Average"]])) 244 | 245 | l2_write_trans = l2_wr + l2_red + l2_at 246 | 247 | l2_total = l2_read_trans + l2_write_trans 248 | 249 | l2_intensity = kernel_stats['total_inst'] / l2_total 250 | l2_performance = kernel_stats['total_inst'] / (1000 * kernel_stats['kernel_time']) # performance in GIPS ( kernel_time in μsecs ) 251 | 252 | 253 | if mode == 0: 254 | color = colors["l2"] 255 | marker = markers["l2"] 256 | label = labels["l2"] 257 | elif mode == 1: 258 | color = colors[kernel_name] 259 | marker = markers[kernel_name] 260 | label = labels[kernel_name] 261 | 262 | graph.plot(l2_intensity, l2_performance, color=color, marker = marker, label=label) 263 | 264 | 265 | if hbm: 266 | ## HBM stats ## 267 | 268 | dram_rd = kernel_metrics.loc[kernel_metrics['Metric Name'] == profiler["drrd"]] 269 | dram_rd = int(float_val(dram_rd[profiler["Average"]])) 270 | 271 | dram_wr = kernel_metrics.loc[kernel_metrics['Metric Name'] == profiler["drwr"]] 272 | dram_wr = int(float_val(dram_wr[profiler["Average"]])) 273 | 274 | dram_total = dram_rd + dram_wr 275 | 276 | hbm_intensity = kernel_stats['total_inst'] / dram_total 277 | hbm_performance = kernel_stats['total_inst'] / (1000 * kernel_stats['kernel_time']) # performance in GIPS ( kernel_time in μsecs ) 278 | 279 | 280 | if mode == 0: 281 | color = colors["hbm"] 282 | marker = markers["hbm"] 283 | label = labels["hbm"] 284 | elif mode == 1: 285 | color = colors[kernel_name] 286 | marker = markers[kernel_name] 287 | label = labels[kernel_name] 288 | 289 | graph.plot(hbm_intensity, hbm_performance, color=color, marker = marker, label=label) 290 | 291 | 292 | if __name__ == "__main__": 293 | 294 | ## Define dictionaries for colors, markers, labels ## 295 | 296 | colors = { 297 | "l1":"r", 298 | "l2":"g", 299 | "hbm":"b", 300 | 'kernel_A': 'y', 301 | "kernel_B":'m', 302 | } 303 | 304 | markers = { 305 | "l1":"s", 306 | "l2":"s", 307 | "hbm":"s", 308 | 'kernel_A': 's', 309 | "kernel_B":'s', 310 | } 311 | 312 | labels = { 313 | "l1":"L1 (tot_inst)", 314 | "l2":"L2 (tot_inst)", 315 | "hbm":"HBM (tot_inst)", 316 | 'kernel_A': "A_kernel", 317 | 'kernel_B': "B_kernel", 318 | } 319 | 320 | ## Define csv filenames ## 321 | 322 | timings = "multi_kernels/timing.csv" 323 | events = "multi_kernels/events.csv" 324 | metrics = "multi_kernels/metrics.csv" 325 | 326 | ## Define profiler (ncu or nvp) ## 327 | ## Dictionaries declared at the top ## 328 | 329 | profiler = ncu 330 | 331 | ## Define kernel name(s) (as shown in csv file) ## 332 | ## For hierarchical roofline provide single kernel ## 333 | ## otherwise provide multiple kernels ## 334 | 335 | kernels = ["kernel_A", "kernel_B"] 336 | 337 | ## Define Memories for ceilings ## 338 | 339 | l1, l2, hbm = True, True, True 340 | memories_ceil = [l1, l2, hbm] 341 | 342 | ## Define Memories for plotting ## 343 | 344 | l1, l2, hbm = True, 
False, False 345 | memories_plot = [l1, l2, hbm] 346 | 347 | ## Define graph elements ## 348 | 349 | title = f"Kernel L1 Performance (NVIDIA A100)" 350 | figname = f"multi_kernels/roofline_kernels.png" 351 | 352 | ## Define rooflinemode ## 353 | ## (0: hierarchical for 1 kernel ## 354 | ## 1: multiple kernels) ## 355 | 356 | mode = 1 357 | 358 | 359 | ax, fig = create_graph(memories_ceil) 360 | 361 | 362 | for kernel_name in kernels: 363 | kernel_stats = {} 364 | 365 | timing(kernel_stats, kernel_name, timings, profiler) 366 | 367 | find_inst(kernel_stats, kernel_name, events, profiler) 368 | 369 | app_char(kernel_stats, kernel_name, metrics, ax, memories_plot, profiler, labels, colors, markers, mode) 370 | 371 | # add legend 372 | ax.legend(loc='lower right', fontsize='10') 373 | 374 | 375 | ax.set_title(title) 376 | ax.grid(True) 377 | 378 | fig.savefig(figname, bbox_inches="tight") --------------------------------------------------------------------------------
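The shipped main() is configured for mode 1 (two kernels profiled with ncu). For the mode-0 hierarchical case shown under hierarchical/, the main() parameters would look roughly as follows; this is a sketch, with the values taken from the hierarchical CSVs:

```python
# Sketch of main() settings for a hierarchical (mode 0) roofline of the
# transpose kernel profiled with nvprof under hierarchical/.
timings = "hierarchical/timing_transpose.csv"
events = "hierarchical/events_transpose.csv"
metrics = "hierarchical/metrics_transpose.csv"

profiler = nvp  # the nvprof dictionary declared at the top of the module

kernels = ["transposeNaive(float*, float*, int, int)"]  # name as in the csv

memories_ceil = [True, True, True]  # draw the L1, L2 and HBM ceilings
memories_plot = [True, True, True]  # characterize the kernel on all three

# in create_graph(), switch to the commented-out V100S constants for this GPU
title = "Transpose Hierarchical Roofline (NVIDIA V100S)"
figname = "hierarchical/roofline_transpose.png"
mode = 0
```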