├── .gitignore ├── .gitmodules ├── 3rd_party └── nersc-roofline-master │ ├── ERT │ ├── Config │ │ ├── config.cori.nersc.gov.hsw │ │ ├── config.cori.nersc.gov.knl │ │ └── config.cori.nersc.gov.volta │ └── Kernels │ │ ├── kernel1.c │ │ ├── kernel1.h │ │ ├── kernel3.c │ │ ├── kernel3.h │ │ └── rep.h │ ├── GPP │ ├── KNL │ │ ├── CustomComplex.h │ │ ├── Makefile │ │ ├── batchjob.advisor │ │ ├── batchjob.likwid.survey │ │ ├── batchjob.notools.survey │ │ ├── batchjob.sde.survey │ │ ├── batchjob.vtune │ │ ├── compile.likwid.survey │ │ ├── compile.notools.survey │ │ ├── compile.tools.survey │ │ ├── gppCustomComplex.cpp │ │ ├── parse-sde2.sh │ │ ├── parse-vtune.py │ │ ├── parse-vtune.sh │ │ └── roofline.py │ └── Volta │ │ ├── GPUComplex.cu │ │ ├── GPUComplex.h │ │ ├── Makefile │ │ ├── compile.survey │ │ ├── gppKer_gpuComplex.cpp │ │ ├── ncu-section-files │ │ └── SpeedOfLight_HierarchicalDoubleRooflineChart.section │ │ └── run.survey │ ├── Plotting │ ├── plot_plot_roofline_py.eps │ ├── plot_plot_roofline_py.pdf │ ├── plot_plot_roofline_py.png │ ├── plot_roofline.py │ └── plot_roofline_hierarchical.py │ ├── README.md │ └── stream-ai-example │ ├── .gitignore │ ├── Makefile │ ├── READ.ME │ ├── README.md │ ├── fortran_itt_sde │ ├── Makefile │ ├── api_itt_sde.c │ ├── jacobi.f90 │ └── module_itt_sde.f90 │ ├── parse-sde.sh │ ├── parse-vtune2017.sh │ ├── parse-vtune2018.sh │ ├── stream-ai.sh │ └── stream_mpi.c ├── README.md ├── benchmark ├── cublas │ └── cublas.cu ├── cusparse │ ├── Makefile │ ├── Makefile.volta │ ├── cusparse.cu │ ├── run.sh │ └── vars.h ├── hpec │ └── 20-champions-1 │ │ └── run.sh └── sputnik │ ├── sim.cu │ └── spmm.cu ├── src ├── BF.cpp ├── Makefile ├── Makefile.multi ├── Makefile.multi.big ├── SNIG.cpp ├── cost.cpp ├── cuSparse.cpp ├── fuse.cpp ├── fuse │ ├── fuse.h │ └── header.h ├── gpu_lib │ ├── gpu_env.h │ ├── gpu_runtime.h │ └── header.h ├── inspector │ ├── code_gen.cpp │ ├── code_gen.h │ ├── code_gen_basic.h │ ├── cost_model.h │ ├── data_inspector.h │ ├── 
gpu_block.h │ ├── gpu_block_scheduler.h │ ├── gpu_run_config.h │ ├── gpu_wrap.h │ ├── header.h │ ├── matrix_block.h │ ├── matrix_block_container.h │ └── matrix_block_gen.h ├── main.cpp ├── mc_test.cpp ├── microbenchmark │ ├── 20-champion.cu │ ├── all_network.cu │ ├── bf.cu │ ├── bf_opt.cu │ ├── cusparse_spmm.cu │ ├── fuse.cu │ ├── fuse_cmp.cu │ ├── header.h │ ├── load-data.cu │ ├── matrix_transpose.cu │ ├── matrix_transpose_and_delete.cu │ ├── multi_gpu │ │ ├── header.h │ │ ├── multi_gpu.cu │ │ └── multi_gpu_big.cu │ ├── n16284-l1.cu │ ├── n16384-l11.cu │ ├── n16384-l2-l10.cu │ ├── out_memory.cu │ ├── random.h │ ├── rectangels.cu │ ├── row-succ-20-uiuc-transpose.cu │ ├── row-succ-20-uiuc.cu │ ├── row-succ-no-transpose.cu │ ├── row-succ-transpose-batch-parallel.cu │ ├── row-succ.cu │ └── snig.cu ├── multi_gpu.cpp ├── multi_gpu │ ├── add_mpi │ │ ├── add_singlegpu │ │ ├── add_singlegpu.cu │ │ ├── makefile │ │ ├── mpi_call.cpp │ │ ├── mpi_call.o │ │ ├── run_volta.sh │ │ ├── saxpy.cu │ │ ├── saxpy.o │ │ ├── test │ │ └── vars.h │ ├── add_omp.cu │ └── add_stream.cu ├── network.cpp ├── reorder │ ├── hash.h │ ├── header.h │ └── reorder.h ├── run_bf.sh └── utils │ ├── cpu_spmm.h │ ├── cpu_spmm_fuse.h │ ├── cpu_transpose.h │ ├── cpu_transpose_and_delete.h │ ├── debug.h │ ├── header.h │ ├── matrix.h │ ├── matrix_base.h │ ├── string.h │ └── type.h └── tools ├── 3d.png ├── 3d_plot.py ├── control_code_analysis.py ├── cost_model.py ├── edgedraw.py ├── get_dataset.sh ├── paper.cpp ├── plt_show.py ├── statistics.py └── tmp.png /.gitignore: -------------------------------------------------------------------------------- 1 | *.gpu 2 | *.ptx 3 | *.cubin 4 | *.fatbin 5 | *.sass 6 | 7 | *.bin 8 | *.out 9 | 10 | *.tar.gz 11 | *.tsv 12 | *.tmp 13 | *.txt 14 | 15 | data 16 | 17 | data_show 18 | 19 | 3rd_party/20-graphchallenge/SpDNN_Challenge2020/data -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "SC Sputnik"] 2 | path = 3rd_party/sputnik 3 | url = https://github.com/google-research/sputnik.git 4 | 5 | [submodule "PPoPP TuringAs"] 6 | path = 3rd_party/turingas 7 | url = https://github.com/daadaada/turingas.git 8 | 9 | [submodule "PPoPP GAS"] 10 | path = 3rd_party/gas 11 | url = https://github.com/daadaada/gas.git 12 | 13 | [submodule "CGO GPA"] 14 | path = 3rd_party/GPA 15 | url = https://github.com/Jokeren/GPA.git 16 | 17 | [submodule "PACT SparseRT"] 18 | path = 3rd_party/gpu-sparsert 19 | url = https://github.com/marsupialtail/gpu-sparsert.git 20 | 21 | 22 | [submodule "20 Champions UIUC/NVIDIA"] 23 | path = 3rd_party/20-graphchallenge/SpDNN_Challenge2020 24 | url = https://github.com/merthidayetoglu/SpDNN_Challenge2020.git 25 | 26 | 27 | [submodule "20 Champions Utah"] 28 | path = 3rd_party/20-graphchallenge/SNIG 29 | url = https://github.com/dian-lun-lin/SNIG.git 30 | 31 | 32 | [submodule "20 Innovation Pitt"] 33 | path = 3rd_party/20-graphchallenge/DistSparseDNN 34 | url = https://github.com/hmofrad/DistSparseDNN.git 35 | 36 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Config/config.cori.nersc.gov.hsw: -------------------------------------------------------------------------------- 1 | # Cori Haswell partition 2 | 3 | ERT_RESULTS Results.cori.nersc.gov.hsw 4 | 5 | ERT_DRIVER driver1 6 | ERT_KERNEL kernel1 7 | 8 | ERT_MPI True 9 | ERT_MPI_CFLAGS 10 | ERT_MPI_LDFLAGS 11 | 12 | ERT_OPENMP True 13 | ERT_OPENMP_CFLAGS -openmp 14 | ERT_OPENMP_LDFLAGS -openmp 15 | 16 | ERT_FLOPS 1,2,4,8,16 17 | ERT_ALIGN 32 18 | 19 | ERT_CC cc 20 | ERT_CFLAGS -O3 -fno-alias -fno-fnalias -xCORE-AVX2 -DERT_INTEL 21 | 22 | ERT_LD cc 23 | ERT_LDFLAGS 24 | ERT_LDLIBS 25 | 26 | ERT_RUN export OMP_NUM_THREADS=ERT_OPENMP_THREADS; export OMP_PLACES=threads; export OMP_PROC_BIND=spread; srun -n ERT_MPI_PROCS 
--cpu_bind=cores -c `expr 64 / ERT_MPI_PROCS` ./ERT_CODE 27 | 28 | ERT_PROCS_THREADS 32 29 | ERT_MPI_PROCS 2,4,8,16,32 30 | ERT_OPENMP_THREADS 1-32 31 | 32 | ERT_NUM_EXPERIMENTS 5 33 | 34 | ERT_MEMORY_MAX 1073741824 35 | 36 | ERT_WORKING_SET_MIN 1 37 | 38 | ERT_TRIALS_MIN 1 39 | 40 | ERT_GNUPLOT gnuplot 41 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Config/config.cori.nersc.gov.knl: -------------------------------------------------------------------------------- 1 | # Cori KNL partition 2 | 3 | ERT_RESULTS Results.cori.nersc.gov.knl 4 | 5 | ERT_DRIVER driver1 6 | ERT_KERNEL kernel1 7 | 8 | ERT_MPI True 9 | ERT_MPI_CFLAGS 10 | ERT_MPI_LDFLAGS 11 | 12 | ERT_OPENMP True 13 | ERT_OPENMP_CFLAGS -qopenmp 14 | ERT_OPENMP_LDFLAGS -qopenmp 15 | 16 | ERT_FLOPS 1,2,4,8,16,32,64 17 | ERT_ALIGN 64 18 | 19 | ERT_CC cc 20 | ERT_CFLAGS -O3 -fno-alias -fno-fnalias -xMIC-AVX512 -DERT_INTEL 21 | 22 | ERT_LD cc 23 | ERT_LDFLAGS 24 | ERT_LDLIBS 25 | 26 | ERT_RUN export SLURM_CORES=$(( 256 / ERT_MPI_PROCS )); export OMP_PLACES=threads; export OMP_PROC_BIND=spread; export OMP_NUM_THREADS=ERT_OPENMP_THREADS; srun -n ERT_MPI_PROCS -c $SLURM_CORES --cpu_bind=cores ./ERT_CODE 27 | 28 | ERT_PROCS_THREADS 256 29 | ERT_MPI_PROCS 1,4,16,64 30 | ERT_OPENMP_THREADS 1-256 31 | 32 | ERT_NUM_EXPERIMENTS 1 33 | 34 | ERT_STRIDE 100 35 | ERT_MEMORY_MAX 1073741824 36 | 37 | ERT_WORKING_SET_MIN 1 38 | 39 | ERT_TRIALS_MIN 1 40 | 41 | ERT_GNUPLOT gnuplot 42 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Config/config.cori.nersc.gov.volta: -------------------------------------------------------------------------------- 1 | # Cori Volta partition 2 | 3 | ERT_RESULTS Results.cori.nersc.gov.volta 4 | 5 | ERT_DRIVER driver1 6 | ERT_KERNEL kernel1 7 | 8 | ERT_GPU True 9 | ERT_GPU_CFLAGS -x cu 10 | ERT_GPU_LDFLAGS 11 | 12 | ERT_FLOPS 1,2,4,8,16,32,64,128,256 13 | 
ERT_ALIGN 32 14 | 15 | ERT_CC nvcc 16 | ERT_CFLAGS -O3 17 | 18 | ERT_LD nvcc 19 | ERT_LDFLAGS 20 | ERT_LDLIBS 21 | 22 | ERT_RUN ./ERT_CODE 23 | 24 | ERT_BLOCKS_THREADS 163840 25 | ERT_GPU_BLOCKS 80,160,320,640,1280,2560 26 | ERT_GPU_THREADS 64,128,256,512,1024 27 | 28 | ERT_NUM_EXPERIMENTS 1 29 | 30 | ERT_MEMORY_MAX 1073741824 31 | 32 | ERT_WORKING_SET_MIN 128 33 | 34 | ERT_TRIALS_MIN 1 35 | 36 | ERT_GNUPLOT gnuplot 37 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Kernels/kernel1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "rep.h" 6 | #include "kernel1.h" 7 | 8 | void initialize(uint64_t nsize, 9 | double* __restrict__ A, 10 | double value) 11 | { 12 | #ifdef ERT_INTEL 13 | __assume_aligned(A, ERT_ALIGN); 14 | #elif __xlC__ 15 | __alignx(ERT_ALIGN, A); 16 | #endif 17 | 18 | uint64_t i; 19 | for (i = 0; i < nsize; ++i) { 20 | A[i] = value; 21 | } 22 | } 23 | 24 | #ifdef ERT_GPU 25 | __global__ void block_stride(uint64_t ntrials, uint64_t nsize, double *A) 26 | { 27 | uint64_t total_thr = gridDim.x * blockDim.x; 28 | uint64_t elem_per_thr = (nsize + (total_thr-1)) / total_thr; 29 | uint64_t blockOffset = blockIdx.x * blockDim.x; 30 | 31 | uint64_t start_idx = blockOffset + threadIdx.x; 32 | uint64_t end_idx = start_idx + elem_per_thr * total_thr; 33 | uint64_t stride_idx = total_thr; 34 | 35 | if (start_idx > nsize) { 36 | start_idx = nsize; 37 | } 38 | 39 | if (end_idx > nsize) { 40 | end_idx = nsize; 41 | } 42 | 43 | double alpha = 0.5; 44 | uint64_t i, j; 45 | for (j = 0; j < ntrials; ++j) { 46 | for (i = start_idx; i < end_idx; i += stride_idx) { 47 | double beta = 0.8; 48 | #if (ERT_FLOP & 1) == 1 /* add 1 flop */ 49 | KERNEL1(beta,A[i],alpha); 50 | #endif 51 | #if (ERT_FLOP & 2) == 2 /* add 2 flops */ 52 | KERNEL2(beta,A[i],alpha); 53 | #endif 54 | #if (ERT_FLOP & 4) == 4 /* add 4 flops */ 55 | 
REP2(KERNEL2(beta,A[i],alpha)); 56 | #endif 57 | #if (ERT_FLOP & 8) == 8 /* add 8 flops */ 58 | REP4(KERNEL2(beta,A[i],alpha)); 59 | #endif 60 | #if (ERT_FLOP & 16) == 16 /* add 16 flops */ 61 | REP8(KERNEL2(beta,A[i],alpha)); 62 | #endif 63 | #if (ERT_FLOP & 32) == 32 /* add 32 flops */ 64 | REP16(KERNEL2(beta,A[i],alpha)); 65 | #endif 66 | #if (ERT_FLOP & 64) == 64 /* add 64 flops */ 67 | REP32(KERNEL2(beta,A[i],alpha)); 68 | #endif 69 | #if (ERT_FLOP & 128) == 128 /* add 128 flops */ 70 | REP64(KERNEL2(beta,A[i],alpha)); 71 | #endif 72 | #if (ERT_FLOP & 256) == 256 /* add 256 flops */ 73 | REP128(KERNEL2(beta,A[i],alpha)); 74 | #endif 75 | #if (ERT_FLOP & 512) == 512 /* add 512 flops */ 76 | REP256(KERNEL2(beta,A[i],alpha)); 77 | #endif 78 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */ 79 | REP512(KERNEL2(beta,A[i],alpha)); 80 | #endif 81 | 82 | A[i] = beta; 83 | } 84 | alpha = alpha * (1 - 1e-8); 85 | } 86 | } 87 | 88 | int gpu_blocks; 89 | int gpu_threads; 90 | 91 | void gpuKernel(uint64_t nsize, 92 | uint64_t ntrials, 93 | double* __restrict__ A, 94 | int* bytes_per_elem, 95 | int* mem_accesses_per_elem) 96 | { 97 | *bytes_per_elem = sizeof(*A); 98 | *mem_accesses_per_elem = 2; 99 | 100 | #ifdef ERT_INTEL 101 | __assume_aligned(A, ERT_ALIGN); 102 | #elif __xlC__ 103 | __alignx(ERT_ALIGN, A); 104 | #endif 105 | 106 | block_stride <<< gpu_blocks, gpu_threads>>> (ntrials, nsize, A); 107 | } 108 | #else 109 | void kernel(uint64_t nsize, 110 | uint64_t ntrials, 111 | double* __restrict__ A, 112 | int* bytes_per_elem, 113 | int* mem_accesses_per_elem) 114 | { 115 | *bytes_per_elem = sizeof(*A); 116 | *mem_accesses_per_elem = 2; 117 | 118 | #ifdef ERT_INTEL 119 | __assume_aligned(A, ERT_ALIGN); 120 | #elif __xlC__ 121 | __alignx(ERT_ALIGN, A); 122 | #endif 123 | 124 | double alpha = 0.5; 125 | uint64_t i, j; 126 | for (j = 0; j < ntrials; ++j) { 127 | #pragma unroll (8) 128 | for (i = 0; i < nsize; ++i) { 129 | double beta = 0.8; 130 | #if (ERT_FLOP & 1) == 1 
/* add 1 flop */ 131 | KERNEL1(beta,A[i],alpha); 132 | #endif 133 | #if (ERT_FLOP & 2) == 2 /* add 2 flops */ 134 | KERNEL2(beta,A[i],alpha); 135 | #endif 136 | #if (ERT_FLOP & 4) == 4 /* add 4 flops */ 137 | REP2(KERNEL2(beta,A[i],alpha)); 138 | #endif 139 | #if (ERT_FLOP & 8) == 8 /* add 8 flops */ 140 | REP4(KERNEL2(beta,A[i],alpha)); 141 | #endif 142 | #if (ERT_FLOP & 16) == 16 /* add 16 flops */ 143 | REP8(KERNEL2(beta,A[i],alpha)); 144 | #endif 145 | #if (ERT_FLOP & 32) == 32 /* add 32 flops */ 146 | REP16(KERNEL2(beta,A[i],alpha)); 147 | #endif 148 | #if (ERT_FLOP & 64) == 64 /* add 64 flops */ 149 | REP32(KERNEL2(beta,A[i],alpha)); 150 | #endif 151 | #if (ERT_FLOP & 128) == 128 /* add 128 flops */ 152 | REP64(KERNEL2(beta,A[i],alpha)); 153 | #endif 154 | #if (ERT_FLOP & 256) == 256 /* add 256 flops */ 155 | REP128(KERNEL2(beta,A[i],alpha)); 156 | #endif 157 | #if (ERT_FLOP & 512) == 512 /* add 512 flops */ 158 | REP256(KERNEL2(beta,A[i],alpha)); 159 | #endif 160 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */ 161 | REP512(KERNEL2(beta,A[i],alpha)); 162 | #endif 163 | 164 | A[i] = beta; 165 | } 166 | alpha = alpha * (1 - 1e-8); 167 | } 168 | } 169 | #endif 170 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Kernels/kernel1.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL1_H 2 | #define KERNEL1_H 3 | 4 | #ifdef ERT_GPU 5 | extern int gpu_blocks; 6 | extern int gpu_threads; 7 | #endif 8 | 9 | #define KERNEL1(a,b,c) ((a) = (b) + (c)) 10 | #define KERNEL2(a,b,c) ((a) = (a)*(b) + (c)) 11 | 12 | void initialize(uint64_t nsize, 13 | double* __restrict__ array, 14 | double value); 15 | 16 | #ifdef ERT_GPU 17 | void gpuKernel(uint64_t nsize, 18 | uint64_t ntrials, 19 | double* __restrict__ array, 20 | int* bytes_per_elem, 21 | int* mem_accesses_per_elem); 22 | #else 23 | void kernel(uint64_t nsize, 24 | uint64_t ntrials, 25 | double* 
__restrict__ array, 26 | int* bytes_per_elem, 27 | int* mem_accesses_per_elem); 28 | #endif 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Kernels/kernel3.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "rep.h" 6 | #include "kernel3.h" 7 | 8 | void initialize(uint64_t nsize, 9 | double* __restrict__ A, 10 | double value) 11 | { 12 | #ifdef ERT_INTEL 13 | __assume_aligned(A, ERT_ALIGN); 14 | #elif __xlC__ 15 | __alignx(ERT_ALIGN, A); 16 | #endif 17 | 18 | uint64_t i; 19 | for (i = 0; i < nsize; ++i) { 20 | A[i] = value; 21 | } 22 | } 23 | 24 | #ifdef ERT_GPU 25 | __global__ void block_stride(uint64_t ntrials, uint64_t nsize, double *A) 26 | { 27 | uint64_t total_thr = gridDim.x * blockDim.x; 28 | uint64_t elem_per_thr = (nsize + (total_thr-1)) / total_thr; 29 | uint64_t blockOffset = blockIdx.x * blockDim.x; 30 | 31 | uint64_t start_idx = blockOffset + threadIdx.x; 32 | uint64_t end_idx = start_idx + elem_per_thr * total_thr; 33 | uint64_t stride_idx = total_thr; 34 | 35 | if (start_idx > nsize) { 36 | start_idx = nsize; 37 | } 38 | 39 | if (end_idx > nsize) { 40 | end_idx = nsize; 41 | } 42 | 43 | double alpha = 0.5; 44 | uint64_t i, j; 45 | for (j = 0; j < ntrials; ++j) { 46 | for (i = start_idx; i < end_idx; i += stride_idx) { 47 | double beta = 0.8; 48 | #if (ERT_FLOP & 1) == 1 /* add 1 flop */ 49 | KERNEL1(beta,A[i],alpha); 50 | #endif 51 | #if (ERT_FLOP & 2) == 2 /* add 2 flops */ 52 | KERNEL2(beta,A[i],alpha); 53 | #endif 54 | #if (ERT_FLOP & 4) == 4 /* add 4 flops */ 55 | REP2(KERNEL2(beta,A[i],alpha)); 56 | #endif 57 | #if (ERT_FLOP & 8) == 8 /* add 8 flops */ 58 | REP4(KERNEL2(beta,A[i],alpha)); 59 | #endif 60 | #if (ERT_FLOP & 16) == 16 /* add 16 flops */ 61 | REP8(KERNEL2(beta,A[i],alpha)); 62 | #endif 63 | #if (ERT_FLOP & 32) == 32 /* add 32 flops */ 64 | 
REP16(KERNEL2(beta,A[i],alpha)); 65 | #endif 66 | #if (ERT_FLOP & 64) == 64 /* add 64 flops */ 67 | REP32(KERNEL2(beta,A[i],alpha)); 68 | #endif 69 | #if (ERT_FLOP & 128) == 128 /* add 128 flops */ 70 | REP64(KERNEL2(beta,A[i],alpha)); 71 | #endif 72 | #if (ERT_FLOP & 256) == 256 /* add 256 flops */ 73 | REP128(KERNEL2(beta,A[i],alpha)); 74 | #endif 75 | #if (ERT_FLOP & 512) == 512 /* add 512 flops */ 76 | REP256(KERNEL2(beta,A[i],alpha)); 77 | #endif 78 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */ 79 | REP512(KERNEL2(beta,A[i],alpha)); 80 | #endif 81 | 82 | A[i] = beta; 83 | } 84 | alpha = alpha * (1 - 1e-8); 85 | } 86 | } 87 | 88 | int gpu_blocks; 89 | int gpu_threads; 90 | 91 | void gpuKernel(uint64_t nsize, 92 | uint64_t ntrials, 93 | double* __restrict__ A, 94 | int* bytes_per_elem, 95 | int* mem_accesses_per_elem) 96 | { 97 | *bytes_per_elem = sizeof(*A); 98 | *mem_accesses_per_elem = 2; 99 | 100 | #ifdef ERT_INTEL 101 | __assume_aligned(A, ERT_ALIGN); 102 | #elif __xlC__ 103 | __alignx(ERT_ALIGN, A); 104 | #endif 105 | 106 | block_stride <<< gpu_blocks, gpu_threads>>> (ntrials, nsize, A); 107 | } 108 | #else 109 | void kernel(uint64_t nsize, 110 | uint64_t ntrials, 111 | double* __restrict__ A, 112 | int* bytes_per_elem, 113 | int* mem_accesses_per_elem) 114 | { 115 | *bytes_per_elem = sizeof(*A); 116 | *mem_accesses_per_elem = 2; 117 | 118 | #ifdef ERT_INTEL 119 | __assume_aligned(A, ERT_ALIGN); 120 | #elif __xlC__ 121 | __alignx(ERT_ALIGN, A); 122 | #endif 123 | 124 | double alpha = 0.5; 125 | uint64_t i, j; 126 | for (j = 0; j < ntrials; ++j) { 127 | #pragma unroll (8) 128 | for (i = 0; i < nsize; ++i) { 129 | double beta = 0.8; 130 | #if (ERT_FLOP & 1) == 1 /* add 1 flop */ 131 | KERNEL1(beta,A[i],alpha); 132 | #endif 133 | #if (ERT_FLOP & 2) == 2 /* add 2 flops */ 134 | KERNEL2(beta,A[i],alpha); 135 | #endif 136 | #if (ERT_FLOP & 4) == 4 /* add 4 flops */ 137 | REP2(KERNEL2(beta,A[i],alpha)); 138 | #endif 139 | #if (ERT_FLOP & 8) == 8 /* add 8 
flops */ 140 | REP4(KERNEL2(beta,A[i],alpha)); 141 | #endif 142 | #if (ERT_FLOP & 16) == 16 /* add 16 flops */ 143 | REP8(KERNEL2(beta,A[i],alpha)); 144 | #endif 145 | #if (ERT_FLOP & 32) == 32 /* add 32 flops */ 146 | REP16(KERNEL2(beta,A[i],alpha)); 147 | #endif 148 | #if (ERT_FLOP & 64) == 64 /* add 64 flops */ 149 | REP32(KERNEL2(beta,A[i],alpha)); 150 | #endif 151 | #if (ERT_FLOP & 128) == 128 /* add 128 flops */ 152 | REP64(KERNEL2(beta,A[i],alpha)); 153 | #endif 154 | #if (ERT_FLOP & 256) == 256 /* add 256 flops */ 155 | REP128(KERNEL2(beta,A[i],alpha)); 156 | #endif 157 | #if (ERT_FLOP & 512) == 512 /* add 512 flops */ 158 | REP256(KERNEL2(beta,A[i],alpha)); 159 | #endif 160 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */ 161 | REP512(KERNEL2(beta,A[i],alpha)); 162 | #endif 163 | 164 | A[i] = beta; 165 | } 166 | alpha = alpha * (1 - 1e-8); 167 | } 168 | } 169 | #endif 170 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Kernels/kernel3.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL3_H 2 | #define KERNEL3_H 3 | 4 | #ifdef ERT_GPU 5 | extern int gpu_blocks; 6 | extern int gpu_threads; 7 | #endif 8 | 9 | #define KERNEL1(a,b,c) ((a) = (b)/(a) ) 10 | #define KERNEL2(a,b,c) ((a) = (b)/(a) + (c)) 11 | 12 | 13 | void initialize(uint64_t nsize, 14 | double* __restrict__ array, 15 | double value); 16 | 17 | #ifdef ERT_GPU 18 | void gpuKernel(uint64_t nsize, 19 | uint64_t ntrials, 20 | double* __restrict__ array, 21 | int* bytes_per_elem, 22 | int* mem_accesses_per_elem); 23 | #else 24 | void kernel(uint64_t nsize, 25 | uint64_t ntrials, 26 | double* __restrict__ array, 27 | int* bytes_per_elem, 28 | int* mem_accesses_per_elem); 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/ERT/Kernels/rep.h: 
-------------------------------------------------------------------------------- 1 | #ifndef REP_H 2 | #define REP_H 3 | 4 | #define REP2(S) S ; S 5 | #define REP4(S) REP2(S); REP2(S) 6 | #define REP8(S) REP4(S); REP4(S) 7 | #define REP16(S) REP8(S); REP8(S) 8 | #define REP32(S) REP16(S); REP16(S) 9 | #define REP64(S) REP32(S); REP32(S) 10 | #define REP128(S) REP64(S); REP64(S) 11 | #define REP256(S) REP128(S); REP128(S) 12 | #define REP512(S) REP256(S); REP256(S) 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/CustomComplex.h: -------------------------------------------------------------------------------- 1 | /* 2 | Templated CustomComplex class that represents a complex number whose real and imaginary parts may be of any type. 3 | */ 4 | #ifndef __CustomComplex 5 | #define __CustomComplex 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | template 18 | 19 | class CustomComplex { 20 | 21 | private : 22 | re x; 23 | im y; 24 | 25 | public: 26 | explicit CustomComplex () { 27 | x = 0.00; 28 | y = 0.00; 29 | } 30 | 31 | 32 | explicit CustomComplex(const double& a, const double& b) { 33 | x = a; 34 | y = b; 35 | } 36 | 37 | CustomComplex(const CustomComplex& src) { 38 | x = src.x; 39 | y = src.y; 40 | } 41 | 42 | CustomComplex& operator =(const CustomComplex& src) { 43 | x = src.x; 44 | y = src.y; 45 | 46 | return *this; 47 | } 48 | 49 | CustomComplex& operator +=(const CustomComplex& src) { 50 | x = src.x + this->x; 51 | y = src.y + this->y; 52 | 53 | return *this; 54 | } 55 | /* In-place subtraction: *this = *this - src (component-wise). */ 56 | CustomComplex& operator -=(const CustomComplex& src) { 57 | x = this->x - src.x; 58 | y = this->y - src.y; 59 | 60 | return *this; 61 | } 62 | 63 | CustomComplex& operator -() { 64 | x = -this->x; 65 | y = -this->y; 66 | 67 | return *this; 68 | } 69 | 70 | CustomComplex& operator ~() { 71 | return *this; 72 | } 
73 | 74 | void print() const { 75 | printf("( %f, %f) ", this->x, this->y); 76 | printf("\n"); 77 | } 78 | 79 | double get_real() const 80 | { 81 | return this->x; 82 | } 83 | 84 | double get_imag() const 85 | { 86 | return this->y; 87 | } 88 | 89 | void set_real(double val) 90 | { 91 | this->x = val; 92 | } 93 | 94 | void set_imag(double val) 95 | { 96 | this->y = val; 97 | } 98 | 99 | // 6 flops 100 | template 101 | friend inline CustomComplex operator *(const CustomComplex &a, const CustomComplex &b) { 102 | real x_this = a.x * b.x - a.y*b.y ; 103 | imag y_this = a.x * b.y + a.y*b.x ; 104 | CustomComplex result(x_this, y_this); 105 | return (result); 106 | } 107 | 108 | //2 flops 109 | template 110 | friend inline CustomComplex operator *(const CustomComplex &a, const double &b) { 111 | CustomComplex result(a.x*b, a.y*b); 112 | return result; 113 | } 114 | 115 | //2 flops 116 | template 117 | friend inline CustomComplex operator -(const double &a, CustomComplex& src) { 118 | CustomComplex result(a - src.x, 0 - src.y); 119 | return result; 120 | } 121 | 122 | template 123 | friend inline CustomComplex operator +(const double &a, CustomComplex& src) { 124 | CustomComplex result(a + src.x, src.y); 125 | return result; 126 | } 127 | 128 | template 129 | friend inline CustomComplex CustomComplex_conj(const CustomComplex& src) ; 130 | 131 | template 132 | friend inline double CustomComplex_abs(const CustomComplex& src) ; 133 | 134 | template 135 | friend inline double CustomComplex_real( const CustomComplex& src) ; 136 | 137 | template 138 | friend inline double CustomComplex_imag( const CustomComplex& src) ; 139 | }; 140 | 141 | /* 142 | * Return the conjugate of a complex number 143 | 1flop 144 | */ 145 | template 146 | inline CustomComplex CustomComplex_conj(const CustomComplex& src) { 147 | 148 | re re_this = src.x; 149 | im im_this = -1 * src.y; 150 | 151 | CustomComplex result(re_this, im_this); 152 | return result; 153 | 154 | } 155 | 156 | /* 157 | * Return 
the absolute of a complex number 158 | */ 159 | template 160 | inline double CustomComplex_abs(const CustomComplex& src) { 161 | re re_this = src.x * src.x; 162 | im im_this = src.y * src.y; 163 | 164 | re result = sqrt(re_this+im_this); 165 | return result; 166 | } 167 | 168 | /* 169 | * Return the real part of a complex number 170 | */ 171 | template 172 | inline double CustomComplex_real( const CustomComplex& src) { 173 | return src.x; 174 | } 175 | 176 | /* 177 | * Return the imaginary part of a complex number 178 | */ 179 | template 180 | inline double CustomComplex_imag( const CustomComplex& src) { 181 | return src.y; 182 | } 183 | 184 | #endif 185 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/Makefile: -------------------------------------------------------------------------------- 1 | #EXE = gppKer_double.ex.likwid.div.dbl.nofma.iw6 2 | #SRC = gppKer_double.cpp 3 | EXE = gppCustomComplex.ex.survey.likwid.div.cmplx.fma.iw6 4 | SRC = gppCustomComplex.cpp 5 | 6 | #CXX = xlc++_r 7 | #CXX = g++ 8 | CXX = CC 9 | 10 | LINK = ${CXX} 11 | 12 | ifeq ($(CXX),CC) 13 | ##Intel compiler flag 14 | #CXXFLAGS= -g -O3 -qopenmp -qopt-report=5 -std=c++11 15 | CXXFLAGS= -g -O3 -qopenmp -std=c++11 16 | CXXFLAGS+=-fma #Fused multiply and add 17 | #CXXFLAGS+=-DTOOLS -I${VTUNE_DIR}/include 18 | #CXXFLAGS+=-I /usr/common/software/likwid/4.3.0/include/ -DLIKWID_PERFMON 19 | # #CXXFLAGS+=-I /usr/common/software/likwid/4.3.0/include/ -DUSE_VTUNE -I${VTUNE_DIR}/include -DLIKWID_PERFMON 20 | #CXXFLAGS+=-xCORE-AVX2 21 | CXXFLAGS+=-xMIC-AVX512 22 | LINKFLAGS=-qopenmp 23 | #LINKFLAGS+=-L /usr/common/software/likwid/4.3.0/lib -llikwid 24 | #LINKFLAGS+=-L ${VTUNE_DIR}/lib64 -littnotify 25 | 26 | #Cray compiler flag 27 | # CXXFLAGS= -hlist=a 28 | endif 29 | 30 | ifeq ($(CXX),g++) 31 | CXXFLAGS= -g -O3 -std=c++11 -fopenmp 32 | LINKFLAGS=-fopenmp 33 | endif 34 | 35 | ifeq ($(CXX),xlc++_r) 36 | CXXFLAGS=-O3 -std=gnu++11 
-g -qsmp 37 | LINKFLAGS=-qsmp 38 | endif 39 | 40 | ifeq ($(CXX),clang++) 41 | CXXFLAGS=-O3 -std=gnu++11 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME} 42 | LINKFLAGS=-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME} 43 | endif 44 | 45 | OBJ = $(SRC:.cpp=.o) 46 | 47 | $(EXE): $(OBJ) 48 | $(CXX) $(OBJ) -o $(EXE) $(LINKFLAGS) 49 | # Build the object file from the source with $(CXXFLAGS). 50 | $(OBJ): $(SRC) 51 | $(CXX) -c $(SRC) $(CXXFLAGS) 52 | 53 | clean: 54 | rm -f $(OBJ) $(EXE) 55 | 56 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/batchjob.advisor: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -n 1 3 | #SBATCH -t 03:00:00 4 | #SBATCH -A nstaff 5 | #SBATCH -C knl,quad,cache 6 | 7 | module swap craype-haswell craype-mic-knl 8 | 9 | export OMP_NUM_THREADS=64 10 | export OMP_PROC_BIND=spread 11 | export OMP_PLACES=threads 12 | 13 | module unload darshan 14 | module load advisor 15 | 16 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 17 | mkdir Results_adv_$SLURM_JOB_ID 18 | seqend=1 19 | 20 | label='survey.div.cmplx.fma' 21 | res=Results_adv_$SLURM_JOB_ID/results.$label 22 | touch $res 23 | for i in $(seq 1 $seqend) 24 | do 25 | 26 | srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 advixe-cl --collect=roofline --project-dir=Results_adv_$SLURM_JOB_ID/my_adv.knl -- ./gppCustomComplex.ex.$label.iw6 512 2 32768 20 0 >> $res 27 | 28 | done 29 | 30 | cd Results_adv_$SLURM_JOB_ID/ 31 | #advixe-cl -report roofline --project-dir Results_adv_$SLURM_JOB_ID/my_adv.knl 32 | advixe-cl -report roofline --project-dir my_adv.knl > adv.html 33 | 34 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/batchjob.likwid.survey: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -n 1 3 | #SBATCH -t 00:10:00 
4 | #SBATCH -A nstaff 5 | #SBATCH -C knl,quad,cache 6 | #SBATCH --perf=vtune 7 | 8 | module swap craype-haswell craype-mic-knl 9 | 10 | export OMP_NUM_THREADS=64 11 | export OMP_PROC_BIND=spread 12 | export OMP_PLACES=threads 13 | 14 | module unload darshan 15 | module load likwid 16 | 17 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 18 | mkdir Results_likwid_survey_$SLURM_JOB_ID 19 | seqend=1 20 | #arr=(1 2 3 4 5 6) 21 | arr=(6) 22 | gs=('FLOPS_DP' 'HBM_CACHE' 'L2' 'DATA') 23 | 24 | 25 | label='survey.likwid.div.cmplx.fma' 26 | for i in $(seq 1 $seqend) 27 | do 28 | for j in ${arr[@]} 29 | do 30 | for k in ${gs[@]} 31 | do 32 | kk=${k/_/.} 33 | echo $kk 34 | res=Results_likwid_survey_$SLURM_JOB_ID/result.likwid.$kk.txt 35 | touch $res 36 | 37 | srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 likwid-perfctr -c 0-271 -g $k ./gppCustomComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 38 | done 39 | 40 | done 41 | done 42 | 43 | cd Results_likwid_survey_$SLURM_JOB_ID/ 44 | 45 | echo --------------------------------------- 46 | echo "Likwid on KNL" 47 | echo --------------------------------------- 48 | # FLOPS 49 | #echo "Run with -g FLOPS_DP" 50 | mflopss=`grep AVX512 result.likwid.FLOPS.DP.txt | tail -n 1 | cut -d '|' -f 3` 51 | runtime=`grep RDTSC result.likwid.FLOPS.DP.txt | tail -n 1 | cut -d '|' -f 6` 52 | gflops=`python -c "print('{0:.3f}'.format($mflopss * $runtime/1000))"` 53 | gflopss=`python -c "print('{0:.3f}'.format($mflopss/1000))"` 54 | #echo "Runtime: $runtime" 55 | #echo "GFLOP/s: $gflopss" 56 | echo "GFLOPS: $gflops" 57 | #echo 58 | #BYTES - DDR and MCDRAM 59 | #echo "Run with -g HBM_CACHE" 60 | hbm_mbytess=`grep "MCDRAM Memory bandwidth" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3` 61 | hbm_gbytes=`grep "MCDRAM Memory data volume" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3` 62 | ddr_mbytess=`grep "DDR Memory bandwidth" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3` 63 | 
ddr_gbytes=`grep "DDR Memory data volume" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3` 64 | runtime=`grep RDTSC result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 6` 65 | hbm_gbytess=`python -c "print('{0:.3f}'.format($hbm_mbytess/1000))"` 66 | ddr_gbytess=`python -c "print('{0:.3f}'.format($ddr_mbytess/1000))"` 67 | #echo "Runtime: $runtime s" 68 | echo "MCDRAM Bytes: $hbm_gbytes" 69 | #echo "MCDRAM bandwidth: $hbm_gbytess GB/s" 70 | echo "DDR Bytes: $ddr_gbytes" 71 | #echo "DDR bandwidth: $ddr_gbytess GB/s" 72 | #echo 73 | #BYTES - L2 74 | #echo "Run with -g L2" 75 | l2_mbytess=`grep "L2 bandwidth" result.likwid.L2.txt | tail -n 1 | cut -d '|' -f 3` 76 | l2_mbytes=`grep "L2 data volume" result.likwid.L2.txt | tail -n 1 | cut -d '|' -f 3` #Likwid wrongly reports MB in GB 77 | l2_gbytes=`python -c "print('{0:.3f}'.format($l2_mbytes/1000))"` 78 | runtime=`grep RDTSC result.likwid.L2.txt | tail -n 1 | cut -d '|' -f 6` 79 | l2_gbytess=`python -c "print('{0:.3f}'.format($l2_mbytess/1000))"` 80 | #echo "Runtime: $runtime s" 81 | echo "L2 Bytes: $l2_gbytes" 82 | #echo "L2 bandwidth: $l2_gbytess GB/s" 83 | #echo 84 | #BYTES - L1 85 | #echo "Run with -g DATA (for L1 loads/stores uops)" 86 | uops_ld=`grep MEM_UOPS_RETIRED_ALL_LOADS result.likwid.DATA.txt | tail -n 1 | cut -d '|' -f 4` 87 | uops_st=`grep MEM_UOPS_RETIRED_ALL_STORES result.likwid.DATA.txt | tail -n 1 | cut -d '|' -f 4` 88 | runtime=`grep RDTSC result.likwid.DATA.txt | tail -n 1 | cut -d '|' -f 6` 89 | l1_gbytess=`python -c "print('{0:.3f}'.format(($uops_ld+$uops_st)/1000.0/1000/1000*64/$runtime))"` 90 | l1_gbytes=`python -c "print('{0:.3f}'.format(($uops_ld+$uops_st)/1000.0/1000/1000*64))"` 91 | #echo "Runtime: $runtime s" 92 | echo "L1 Bytes: $l1_gbytes" 93 | #echo "L1 bandwidth: $l1_gbytess GB/s" 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/batchjob.notools.survey: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -n 1 3 | #SBATCH -t 02:00:00 4 | #SBATCH -A nstaff 5 | #SBATCH -C knl,quad,cache 6 | 7 | module swap craype-haswell craype-mic-knl 8 | 9 | export OMP_NUM_THREADS=64 10 | export OMP_PROC_BIND=spread 11 | export OMP_PLACES=threads 12 | 13 | module unload darshan 14 | 15 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 16 | mkdir Results_notools_survey_$SLURM_JOB_ID 17 | seqend=1 18 | #arr=(1 2 3 4 5 6) 19 | arr=(6) 20 | 21 | label='survey.div.cmplx.fma' 22 | res=Results_notools_survey_$SLURM_JOB_ID/results.$label 23 | touch $res 24 | for i in $(seq 1 $seqend) 25 | do 26 | for j in ${arr[@]} 27 | do 28 | srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 ./gppCustomComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 29 | done 30 | done 31 | 32 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/batchjob.sde.survey: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -n 1 3 | #SBATCH -t 01:00:00 4 | #SBATCH -A nstaff 5 | #SBATCH -C knl,quad,cache 6 | #SBATCH --perf=vtune 7 | 8 | module swap craype-haswell craype-mic-knl 9 | 10 | export OMP_NUM_THREADS=64 11 | export OMP_PROC_BIND=spread 12 | export OMP_PLACES=threads 13 | 14 | module unload darshan 15 | export PATH=$PATH:/global/cfs/cdirs/nstaff/cjyang/P3HPC/Empirical_Roofline_Tool-1.1.0/new-GPP/BGW-Kernels/sde/sde-external-8.16.0-2018-01-30-lin/ 16 | 17 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 18 | mkdir Results_sde_survey_$SLURM_JOB_ID 19 | seqend=1 20 | #arr=(1 2 3 4 5 6) 21 | arr=(6) 22 | 23 | label='survey.tools.div.cmplx.fma' 24 | res=Results_sde_survey_$SLURM_JOB_ID/result.sde.out 25 | touch $res 26 | for i in $(seq 1 $seqend) 27 | do 28 | for j in ${arr[@]} 29 | do 30 | srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 
sde64 -knl -d -iform 1 -omix $res -global_region -start_ssc_mark 111:repeat -stop_ssc_mark 222:repeat -- ./gppCustomComplex.ex.$label.iw$j 512 2 32768 20 0 31 | done 32 | done 33 | 34 | cd Results_sde_survey_$SLURM_JOB_ID/ 35 | cp ../parse-sde2.sh . 36 | 37 | echo --------------------------------------- 38 | echo "SDE on KNL" 39 | echo --------------------------------------- 40 | ./parse-sde2.sh result.sde.out > result.sde.out.parse 41 | flops=`grep 'Total FLOPs = ' result.sde.out.parse | cut -d '=' -f 2` 42 | gflops=`python -c "print('{0:.3f}'.format($flops/1000.0/1000/1000))"` 43 | echo GFLOPS: $gflops 44 | bytes=`grep 'Total Bytes = ' result.sde.out.parse | cut -d '=' -f 2` 45 | gbytes=`python -c "print('{0:.3f}'.format($bytes/1000.0/1000/1000))"` 46 | echo L1 Bytes: $gbytes 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/batchjob.vtune: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -n 1 3 | #SBATCH -t 03:00:00 4 | #SBATCH -A nstaff 5 | #SBATCH -C knl,quad,cache 6 | #SBATCH --perf=vtune 7 | 8 | module swap craype-haswell craype-mic-knl 9 | 10 | export OMP_NUM_THREADS=64 11 | export OMP_PROC_BIND=spread 12 | export OMP_PLACES=threads 13 | 14 | module unload darshan 15 | module load vtune #/2018.up2 16 | 17 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 18 | mkdir Results_tools_$SLURM_JOB_ID 19 | seqend=1 20 | 21 | label='survey.tools.div.cmplx.fma' 22 | res=Results_tools_$SLURM_JOB_ID/results.$label 23 | touch $res 24 | for i in $(seq 1 $seqend) 25 | do 26 | srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 vtune -start-paused -r Results_tools_$SLURM_JOB_ID/my-vtune.knl -collect memory-access -finalization-mode=none -data-limit=0 -- ./gppCustomComplex.ex.$label.iw6 512 2 32768 20 0 >> $res 27 | done 28 | 29 | cd Results_tools_$SLURM_JOB_ID/ 30 | cp ../parse-vtune.sh . 
31 | cp ../parse-vtune.py . 32 | 33 | vtune -report hw-events -group-by=package -r my-vtune.knl/ -format csv -csv-delimiter comma > my-vtune.knl.summary 34 | 35 | 36 | echo --------------------------------------- 37 | echo "VTune on KNL" 38 | echo --------------------------------------- 39 | ./parse-vtune.sh my-vtune.knl.summary > my-vtune.knl.summary.parse 40 | ddr_bytes=`grep 'Total Bytes = ' my-vtune.knl.summary.parse | tail -n 2 | head -n 1 | cut -d '=' -f 2` 41 | ddr_gbytes=`python -c "print('{0:.3f}'.format($ddr_bytes/1000.0/1000/1000))"` 42 | echo DDR Bytes: $ddr_gbytes 43 | hbm_bytes=`grep 'Total Bytes = ' my-vtune.knl.summary.parse | tail -n 1 | cut -d '=' -f 2` 44 | hbm_gbytes=`python -c "print('{0:.3f}'.format($hbm_bytes/1000.0/1000/1000))"` 45 | echo MCDRAM Bytes: $hbm_gbytes 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/compile.likwid.survey: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | module swap craype-haswell craype-mic-knl 3 | module load likwid 4 | 5 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 6 | 7 | sed -i 's/#CXXFLAGS+=-I \/usr/CXXFLAGS+=-I \/usr/g' Makefile 8 | sed -i 's/#LINKFLAGS+=-L \/usr/LINKFLAGS+=-L \/usr/g' Makefile 9 | 10 | label='survey.likwid.div.cmplx.fma' 11 | sed -i 's/CXXFLAGS+=.*fma/CXXFLAGS+=-fma/g' Makefile 12 | arr=(1 2 3 4 5 6) 13 | for i in ${arr[@]} 14 | do 15 | sed -i "s/#define nend.*/#define nend $i/g" gppCustomComplex.cpp 16 | sed -i "s/gppCustomComplex.ex.*/gppCustomComplex.ex.$label.iw$i/g" Makefile 17 | make clean && make 18 | done 19 | 20 | 21 | sed -i 's/CXXFLAGS+=-I \/usr/#CXXFLAGS+=-I \/usr/g' Makefile 22 | sed -i 's/LINKFLAGS+=-L \/usr/#LINKFLAGS+=-L \/usr/g' Makefile 23 | 24 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/compile.notools.survey: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | module swap craype-haswell craype-mic-knl 3 | 4 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 5 | 6 | label='survey.div.cmplx.fma' 7 | sed -i 's/CXXFLAGS+=.*fma/CXXFLAGS+=-fma/g' Makefile 8 | 9 | arr=(1 2 3 4 5 6) 10 | for i in ${arr[@]} 11 | do 12 | sed -i "s/#define nend.*/#define nend $i/g" gppCustomComplex.cpp 13 | sed -i "s/gppCustomComplex.ex.*/gppCustomComplex.ex.$label.iw$i/g" Makefile 14 | make clean && make 15 | done 16 | 17 | 18 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/compile.tools.survey: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | module swap craype-haswell craype-mic-knl 3 | module load vtune 4 | 5 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL 6 | 7 | sed -i 's/#CXXFLAGS+=-DTOOLS/CXXFLAGS+=-DTOOLS/g' Makefile 8 | sed -i 's/#LINKFLAGS+=-L ${VTUNE/LINKFLAGS+=-L ${VTUNE/g' Makefile 9 | 10 | label='survey.tools.div.cmplx.fma' 11 | sed -i 's/CXXFLAGS+=.*fma/CXXFLAGS+=-fma/g' Makefile 12 | arr=(1 2 3 4 5 6) 13 | for i in ${arr[@]} 14 | do 15 | sed -i "s/#define nend.*/#define nend $i/g" gppCustomComplex.cpp 16 | sed -i "s/gppCustomComplex.ex.*/gppCustomComplex.ex.$label.iw$i/g" Makefile 17 | make clean && make 18 | done 19 | 20 | sed -i 's/CXXFLAGS+=-DTOOLS/#CXXFLAGS+=-DTOOLS/g' Makefile 21 | sed -i 's/LINKFLAGS+=-L ${VTUNE/#LINKFLAGS+=-L ${VTUNE/g' Makefile 22 | 23 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/parse-vtune.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | file='my-vtune.knl.summary' 3 | df=pd.read_csv(file) 4 | 5 | total_read=df.filter(regex='UNC_M_CAS_COUNT.RD').sum(axis=1)*64 6 | 
total_write=df.filter(regex='UNC_M_CAS_COUNT.WR').sum(axis=1)*64 7 | print('--->DDR Report') 8 | print('--->Total Bytes read = '+str(total_read[0])) 9 | print('--->Total Bytes written = '+str(total_write[0])) 10 | print('--->Total Bytes = '+str(total_read[0] + total_write[0] )) 11 | 12 | total_read=df.filter(regex='UNC_E_RPQ_INSERTS').sum(axis=1)*64 13 | total_write=df.filter(regex='UNC_E_WPQ_INSERTS').sum(axis=1)*64 14 | print('--->MCDRAM Report') 15 | print('--->Total Bytes read = '+str(total_read[0])) 16 | print('--->Total Bytes written = '+str(total_write[0])) 17 | print('--->Total Bytes = '+str(total_read[0] + total_write[0])) 18 | 19 | 20 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/KNL/parse-vtune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Parses a VTune summary report for uncore memory access counts 4 | 5 | module load python 6 | python ./parse-vtune.py 7 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/Volta/Makefile: -------------------------------------------------------------------------------- 1 | EXE = gppKer_gpuComplex.ex.survey.div.cmplx.fma.iw6 2 | SRC1 = gppKer_gpuComplex.cpp 3 | SRC2 = GPUComplex.cu 4 | #EXE = gppKer_double.ex 5 | #SRC1 = gppKer_double.cpp 6 | #SRC2 = GPUComplex_double.cu 7 | 8 | 9 | #CXX = xlc++ 10 | #CXX = CC 11 | #CXX = g++ 12 | #CXX = clang++ 13 | CXX = nvcc 14 | 15 | LINK = ${CXX} 16 | 17 | ifeq ($(CXX),nvcc) 18 | CXXFLAGS= -g -O3 -std=c++11 -Wno-deprecated-gpu-targets -arch=sm_70 19 | # CXXFLAGS+=-Xptxas -v --maxrregcount=150 #output usage of registers 20 | CXXFLAGS+=-fmad=true 21 | LINKFLAGS=-Wno-deprecated-gpu-targets 22 | endif 23 | 24 | ifeq ($(CXX),g++) 25 | CXXFLAGS= -g -O3 -std=c++11 -fopenmp -foffload="-lm" -foffload=nvptx-none 26 | LINKFLAGS=-fopenmp 27 | endif 28 | 29 | ifeq ($(CXX),xlc++) 30 | 
CXXFLAGS=-O3 -std=gnu++11 -g -qsmp=noauto:omp -qoffload #-Xptxas -v 31 | LINKFLAGS=-qsmp=noauto:omp -qoffload 32 | endif 33 | 34 | ifeq ($(CXX),clang++) 35 | CXXFLAGS=-O3 -std=gnu++11 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME} 36 | LINKFLAGS=-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME} 37 | endif 38 | 39 | ifeq ($(CXX),icc) 40 | CXXFLAGS=-O3 -qopenmp -qopt-report=5 41 | CXXFLAGS+=-xCORE-AVX2 42 | # CXXFLAGS+=-xMIC_AVX512 43 | LINKFLAGS=-qopenmp 44 | endif 45 | 46 | OBJ1 = $(SRC1:.cpp=.o) 47 | OBJ2 = $(SRC2:.cu=.o) 48 | 49 | $(EXE): $(OBJ1) $(OBJ2) 50 | $(CXX) $(OBJ1) $(OBJ2) -o $(EXE) $(LINKFLAGS) 51 | 52 | $(OBJ1): $(SRC1) 53 | $(CXX) -c $(SRC1) $(CXXFLAGS) 54 | 55 | $(OBJ2): $(SRC2) 56 | $(CXX) -c $(SRC2) $(CXXFLAGS) 57 | 58 | clean: 59 | rm -f $(OBJ1) $(OBJ2) $(EXE) 60 | 61 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/Volta/compile.survey: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/Volta 4 | 5 | module load cuda 6 | 7 | label='survey.div.cmplx.fma' 8 | sed -i 's/fmad=.*/fmad=true/g' Makefile 9 | 10 | #arr=(1 2 3 4 5 6) 11 | arr=(6) 12 | for i in ${arr[@]} 13 | do 14 | sed -i "s/#define nend.*/#define nend $i/g" GPUComplex.h 15 | sed -i "s/gppKer_gpuComplex.ex.*/gppKer_gpuComplex.ex.$label.iw$i/g" Makefile 16 | make clean && make 17 | done 18 | 19 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/GPP/Volta/run.survey: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #module load cuda 4 | export CUDA_VISIBLE_DEVICES=0 5 | 6 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/Volta 7 | mkdir Results_survey 8 | seqend=1 9 | #arr=(1 2 3 4 5 6) 10 | arr=(6) 11 | 12 | 
label='survey.div.cmplx.fma' 13 | res=Results_survey/results.$label 14 | touch $res 15 | for i in $(seq 1 $seqend) 16 | do 17 | for j in ${arr[@]} 18 | do 19 | ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 20 | done 21 | done 22 | # Profiler metric lists: metnv is passed to nvprof, metncu10 to nv-nsight-cu-cli; both cover double-, single- and half-precision FLOP counters plus memory traffic 23 | metnv='flop_count_dp,flop_count_sp,flop_count_hp,gld_transactions,gst_transactions,atomic_transactions,local_load_transactions,local_store_transactions,shared_load_transactions,shared_store_transactions,l2_read_transactions,l2_write_transactions,dram_read_transactions,dram_write_transactions,system_read_transactions,system_write_transactions' 24 | metncu10='sm__cycles_elapsed.avg,sm__cycles_elapsed.avg.per_second,sm__sass_thread_inst_executed_op_dadd_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_thread_inst_executed_op_fadd_pred_on.sum,sm__sass_thread_inst_executed_op_fmul_pred_on.sum,sm__sass_thread_inst_executed_op_ffma_pred_on.sum,sm__sass_thread_inst_executed_op_hadd_pred_on.sum,sm__sass_thread_inst_executed_op_hmul_pred_on.sum,sm__sass_thread_inst_executed_op_hfma_pred_on.sum,sm__inst_executed_pipe_tensor.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum,l1tex__t_set_accesses_pipe_lsu_mem_global_op_red.sum,l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom.sum,l1tex__t_set_accesses_pipe_tex_mem_surface_op_red.sum,l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum,lts__t_sectors_op_read.sum,lts__t_sectors_op_write.sum,lts__t_sectors_op_atom.sum,lts__t_sectors_op_red.sum,dram__sectors_read.sum,dram__sectors_write.sum' 25 | 
metncu11='sm__cycles_elapsed.avg,sm__cycles_elapsed.avg.per_second,sm__sass_thread_inst_executed_op_dadd_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_fadd_pred_on.sum,sm__sass_thread_inst_executed_op_ffma_pred_on.sum,sm__sass_thread_inst_executed_op_fmul_pred_on.sum,sm__sass_thread_inst_executed_op_hadd_pred_on.sum,sm__sass_thread_inst_executed_op_hfma_pred_on.sum,sm__sass_thread_inst_executed_op_hmul_pred_on.sum,sm__inst_executed_pipe_tensor.sum,l1tex__t_bytes.sum,lts__t_bytes.sum,dram__bytes.sum' 26 | 27 | 28 | 29 | label='survey.div.cmplx.fma' 30 | 31 | j=6 32 | module load cuda/10.2.89 33 | res=Results_survey/results.nvprof.$label 34 | touch $res 35 | which nvprof 36 | nvprof --version 37 | srun -n1 nvprof --kernels "NumBandNgpown_kernel" --metrics $metnv ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 38 | 39 | res=Results_survey/results.ncu10.$label 40 | touch $res 41 | which nv-nsight-cu-cli 42 | nv-nsight-cu-cli -v 43 | srun -n1 nv-nsight-cu-cli -k "NumBandNgpown_kernel" --metrics $metncu10 ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 44 | 45 | module load nsight-compute/2020.1.0 46 | res=Results_survey/results.ncu11.$label 47 | touch $res 48 | which ncu 49 | ncu -v 50 | srun -n1 ncu -k "NumBandNgpown_kernel" --metrics $metncu11 ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 51 | 52 | srun -n1 ncu -k "NumBandNgpown_kernel" -o ncu.prof --section-folder ./ncu-section-files --section SpeedOfLight_HierarchicalDoubleRooflineChart ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 53 | 54 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.pdf -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.png -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/Plotting/plot_roofline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | import sys 6 | import os 7 | import matplotlib.patches as mpatches 8 | font = { 'size' : 20} 9 | plt.rc('font', **font) 10 | 11 | filename = 'plot_' + sys.argv[0].replace('.','_') 12 | markersize = 16 13 | colors = ['b','g','r','y','m','c'] 14 | styles = ['o','s','v','^','D',">","<","*","h","H","+","1","2","3","4","8","p","d","|","_",".",","] 15 | 16 | f = open(sys.argv[1], "r") 17 | 18 | for line in f: 19 | if 'memroofs' in line: 20 | linesp = line.split() 21 | linesp = linesp[1:] 22 | smemroofs = [float(a) for a in linesp] 23 | print 'memroofs', smemroofs 24 | if 'mem_roof_names' in line: 25 | linesp = line.strip().split("\'") 26 | linesp = filter(lambda a: (a != ' ') and (a != ''), linesp) 27 | smem_roof_name = linesp[1:] 28 | print 'mem_roof_names', smem_roof_name 29 | if 'comproofs' in line: 30 | linesp = line.split() 31 | linesp = linesp[1:] 32 | scomproofs = [float(a) for a in linesp] 33 | print 'comproofs', scomproofs 34 | if 'comp_roof_names' in line: 35 | linesp = line.strip().split("\'") 36 | linesp = filter(lambda a: (a != ' ') and (a != ''), linesp) 37 | 
scomp_roof_name = linesp[1:] 38 | print 'comp_roof_names', scomp_roof_name 39 | if 'AI' in line: 40 | linesp = line.split() 41 | linesp = linesp[1:] 42 | AI = [float(a) for a in linesp] 43 | print 'AI', AI 44 | if 'FLOPS' in line: 45 | linesp = line.split() 46 | linesp = linesp[1:] 47 | FLOPS = [float(a) for a in linesp] 48 | print 'FLOPS', FLOPS 49 | if 'labels' in line: 50 | linesp=line.strip().split("\'") 51 | linesp = filter(lambda a: (a != ' ') and (a != ''), linesp) 52 | labels = linesp[1:] 53 | print 'labels', labels 54 | 55 | 56 | fig = plt.figure(1,figsize=(10.67,6.6)) 57 | plt.clf() 58 | ax = fig.gca() 59 | ax.set_xscale('log') 60 | ax.set_yscale('log') 61 | ax.set_xlabel('Arithmetic Intensity [FLOPs/Byte]') 62 | ax.set_ylabel('Performance [GFLOP/sec]') 63 | 64 | nx = 10000 65 | xmin = -1 66 | xmax = 2 67 | ymin = 50 68 | ymax = 30000 69 | 70 | ax.set_xlim(10**xmin, 10**xmax) 71 | ax.set_ylim(ymin, ymax) 72 | 73 | ixx = int(nx*0.02) 74 | xlim = ax.get_xlim() 75 | ylim = ax.get_ylim() 76 | 77 | scomp_x_elbow = [] 78 | scomp_ix_elbow = [] 79 | smem_x_elbow = [] 80 | smem_ix_elbow = [] 81 | 82 | x = np.logspace(xmin,xmax,nx) 83 | for roof in scomproofs: 84 | for ix in range(1,nx): 85 | if smemroofs[0] * x[ix] >= roof and smemroofs[0] * x[ix-1] < roof: 86 | scomp_x_elbow.append(x[ix-1]) 87 | scomp_ix_elbow.append(ix-1) 88 | break 89 | 90 | 91 | for roof in smemroofs: 92 | for ix in range(1,nx): 93 | if (scomproofs[0] <= roof * x[ix] and scomproofs[0] > roof * x[ix-1]): 94 | smem_x_elbow.append(x[ix-1]) 95 | smem_ix_elbow.append(ix-1) 96 | break 97 | 98 | for i in range(0,len(scomproofs)): 99 | y = np.ones(len(x)) * scomproofs[i] 100 | ax.plot(x[scomp_ix_elbow[i]:],y[scomp_ix_elbow[i]:],c='k',ls='-',lw='2') 101 | 102 | for i in range(0,len(smemroofs)): 103 | y = x * smemroofs[i] 104 | ax.plot(x[:smem_ix_elbow[i]+1],y[:smem_ix_elbow[i]+1],c='k',ls='-',lw='2') 105 | 106 | 107 | marker_handles = list() 108 | for i in range(0,len(AI)): 109 | 
ax.plot(float(AI[i]),float(FLOPS[i]),c=colors[i],marker=styles[i],linestyle='None',ms=markersize,label=labels[i]) 110 | marker_handles.append(ax.plot([],[],c=colors[i],marker=styles[i],linestyle='None',ms=markersize,label=labels[i])[0]) 111 | 112 | for roof in scomproofs: 113 | ax.text(x[-ixx],roof, 114 | scomp_roof_name[scomproofs.index(roof)] + ': ' + '{0:.1f}'.format(float(roof)) + ' GFLOP/s', 115 | horizontalalignment='right', 116 | verticalalignment='bottom') 117 | 118 | for roof in smemroofs: 119 | ang = np.arctan(np.log10(xlim[1]/xlim[0]) / np.log10(ylim[1]/ylim[0]) 120 | * fig.get_size_inches()[1]/fig.get_size_inches()[0] ) 121 | ax.text(x[ixx],x[ixx]*roof*(1+0.25*np.sin(ang)**2), 122 | smem_roof_name[smemroofs.index(roof)] + ': ' + '{0:.1f}'.format(float(roof)) + ' GB/s', 123 | horizontalalignment='left', 124 | verticalalignment='bottom', 125 | rotation=180/np.pi*ang) 126 | 127 | 128 | leg1 = plt.legend(handles = marker_handles,loc=4, ncol=1) 129 | ax.add_artist(leg1) 130 | 131 | plt.savefig(filename+'.png') 132 | plt.savefig(filename+'.eps') 133 | plt.savefig(filename+'.pdf') 134 | #plt.show() 135 | 136 | 137 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/README.md: -------------------------------------------------------------------------------- 1 | # nersc-roofline 2 | 3 | This repo contains files necessary to generate results here 4 | 5 | https://docs.nersc.gov/programming/performance-debugging-tools/roofline/. 6 | 7 | and for the following two papers 8 | 9 | C. Yang, R. Gayatri, T. Kurth, P. Basu, Z. Ronaghi, A. Adetokunbo, B. Friesen, B. Cook, D. Doerfler, L. Oliker et al., An Empirical Roofline Methodology for Quantitatively Assessing Performance Portability, in 2018 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). IEEE, 2018, pp. 14-23. 10 | 11 | C. 
Yang, Hierarchical Roofline Analysis: How to Collect Data using Performance Tools on Intel CPUs and NVIDIA GPUs, arXiv.org 12 | 13 | The data collection methodology for Roofline analysis on NVIDIA GPUs has been updated here 14 | 15 | https://gitlab.com/NERSC/roofline-on-nvidia-gpus 16 | 17 | 18 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/.gitignore: -------------------------------------------------------------------------------- 1 | stream_mpi.exe 2 | results* 3 | 4 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/Makefile: -------------------------------------------------------------------------------- 1 | MPICC = cc 2 | CFLAGS = -g -O3 -dynamic -qopenmp -restrict -qopt-streaming-stores always \ 3 | -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=50 \ 4 | -I$(VTUNE_AMPLIFIER_XE_2018_DIR)/include 5 | LDFLAGS = -L$(VTUNE_AMPLIFIER_XE_2018_DIR)/lib64 -littnotify 6 | 7 | stream_mpi.exe: stream_mpi.c Makefile 8 | $(MPICC) $(CFLAGS) stream_mpi.c -o stream_mpi.exe $(LDFLAGS) 9 | 10 | clean: 11 | rm -f stream_mpi.exe 12 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/READ.ME: -------------------------------------------------------------------------------- 1 | =============================================== 2 | 3 | STREAM is the de facto industry standard benchmark 4 | for measuring sustained memory bandwidth. 5 | 6 | Documentation for STREAM is on the web at: 7 | http://www.cs.virginia.edu/stream/ref.html 8 | 9 | =============================================== 10 | NEWS 11 | =============================================== 12 | UPDATE: October 28 2014: 13 | 14 | "stream_mpi.c" released in the Versions directory. 
15 | 16 | Based on Version 5.10 of stream.c, stream_mpi.c 17 | brings the following new features: 18 | * MPI implementation that *distributes* the arrays 19 | across all MPI ranks. (The older Fortran version 20 | of STREAM in MPI *replicates* the arrays across 21 | all MPI ranks.) 22 | * Data is allocated using "posix_memalign" 23 | rather than using static arrays. Different 24 | compiler flags may be needed for both portability 25 | and optimization. 26 | See the READ.ME file in the Versions directory 27 | for more details. 28 | * Error checking and timing done by all ranks and 29 | gathered by rank 0 for processing and output. 30 | * Timing code uses barriers to ensure correct 31 | operation even when multiple MPI ranks run on 32 | shared memory systems. 33 | 34 | NOTE: MPI is not a preferred implementation for 35 | STREAM, which is intended to measure memory 36 | bandwidth in shared-memory systems. In stream_mpi, 37 | the MPI calls are only used to properly synchronize 38 | the timers (using MPI_Barrier) and to gather 39 | timing and error data, so the performance should 40 | scale linearly with the size of the cluster. 41 | But it may be useful, and was an interesting 42 | exercise to develop and debug. 43 | 44 | =============================================== 45 | UPDATE: January 17 2013: 46 | 47 | Version 5.10 of stream.c is finally available! 48 | 49 | There are no changes to what is being measured, but 50 | a number of long-awaited improvements have been made: 51 | 52 | * Updated validation code does not suffer from 53 | accumulated roundoff error for large arrays. 54 | * Defining the preprocessor variable "VERBOSE" 55 | when compiling will (1) cause the code to print the 56 | measured average relative absolute error (rather than 57 | simply printing "Solution Validates", and (2) print 58 | the first 10 array entries with relative error exceeding 59 | the error tolerance. 
60 | * Array index variables have been upgraded from 61 | "int" to "ssize_t" to allow arrays with more 62 | than 2 billion elements on 64-bit systems. 63 | * Substantial improvements to the comments in 64 | the source on how to configure/compile/run the 65 | benchmark. 66 | * The preprocessor variable controlling the array 67 | size has been changed from "N" to "STREAM_ARRAY_SIZE". 68 | * A new preprocessor variable "STREAM_TYPE" can be 69 | used to override the data type from the default 70 | "double" to "float". 71 | This mechanism could also be used to change to 72 | non-floating-point types, but several "printf" 73 | statements would need to have their formats changed 74 | to accommodate the modified data type. 75 | * Some small changes in output, including printing 76 | array sizes in GiB as well as MiB. 77 | * Change to the default output format to print fewer 78 | decimals for the bandwidth and more decimals for 79 | the min/max/avg execution times. 80 | 81 | 82 | =============================================== 83 | UPDATE: February 19 2009: 84 | 85 | The most recent "official" versions have been renamed 86 | "stream.f" and "stream.c" -- all other versions have 87 | been moved to the "Versions" subdirectory and should be 88 | considered obsolete. 89 | 90 | The "official" timer (was "second_wall.c") has been 91 | renamed "mysecond.c". This is embedded in the C version 92 | ("stream.c"), but still needs to be externally linked to 93 | the FORTRAN version ("stream.f"). The new version defines 94 | entry points both with and without trailing underscores, 95 | so it *should* link automagically with any Fortran compiler. 96 | 97 | =============================================== 98 | 99 | STREAM is a project of "Dr. Bandwidth": 100 | John D. McCalpin, Ph.D. 
101 | john@mccalpin.com 102 | 103 | =============================================== 104 | 105 | The STREAM web and ftp sites are currently hosted at 106 | the Department of Computer Science at the University of 107 | Virginia under the generous sponsorship of Professor Bill 108 | Wulf and Professor Alan Batson. 109 | 110 | =============================================== 111 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/README.md: -------------------------------------------------------------------------------- 1 | # stream-ai-example 2 | 3 | This directory contains all the files necessary to illustrate calculating 4 | arithmetic intensity using Intel's SDE and VTune tools. 5 | 6 | For more information, see: 7 | 8 | https://docs.nersc.gov/performance/arithmetic_intensity/ 9 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/fortran_itt_sde/Makefile: -------------------------------------------------------------------------------- 1 | ## 2 | # module load vtune 3 | # module load sde 4 | # module unload darshan 5 | # 6 | FC=ftn 7 | CC=cc 8 | FFLAGS= -g -dynamic -O3 -openmp 9 | CFLAGS= -g -I $(VTUNE_AMPLIFIER_XE_2016_DIR)/include 10 | LDFLAGS=$(VTUNE_AMPLIFIER_XE_2016_DIR)/lib64/libittnotify.a 11 | EXE=jacobi.x 12 | COBJ= api_itt_sde.o 13 | F90OBJ= module_itt_sde.o 14 | default: jacobi 15 | clean: 16 | rm *.o *.mod $(EXE) 17 | %.o: %.c 18 | $(CC) $(CFLAGS) -c $< 19 | %.o: %.f90 20 | $(FC) $(FFLAGS) -c $< 21 | jacobi: $(COBJ) $(F90OBJ) jacobi.o 22 | $(FC) $(FFLAGS) -o jacobi.x $(COBJ) $(F90OBJ) jacobi.o $(LDFLAGS) 23 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/fortran_itt_sde/api_itt_sde.c: -------------------------------------------------------------------------------- 1 | #include "ittnotify.h" 2 | 3 | void 
fortran_sde_start() 4 | { 5 | __SSC_MARK(0x111); 6 | } 7 | 8 | void fortran_sde_stop() 9 | { 10 | __SSC_MARK(0x222); 11 | } 12 | 13 | void fortran_itt_resume() 14 | { 15 | __itt_resume(); 16 | } 17 | 18 | void fortran_itt_pause() 19 | { 20 | __itt_pause(); 21 | } 22 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/fortran_itt_sde/module_itt_sde.f90: -------------------------------------------------------------------------------- 1 | MODULE ITT_SDE_FORTRAN 2 | USE, INTRINSIC :: ISO_C_BINDING 3 | 4 | INTERFACE 5 | 6 | SUBROUTINE FORTRAN_ITT_RESUME() & 7 | BIND(C, NAME='fortran_itt_resume') 8 | END SUBROUTINE FORTRAN_ITT_RESUME 9 | 10 | SUBROUTINE FORTRAN_ITT_PAUSE() & 11 | BIND(C, NAME='fortran_itt_pause') 12 | END SUBROUTINE FORTRAN_ITT_PAUSE 13 | 14 | SUBROUTINE FORTRAN_SDE_START() & 15 | BIND(C, NAME='fortran_sde_start') 16 | END SUBROUTINE FORTRAN_SDE_START 17 | 18 | SUBROUTINE FORTRAN_SDE_STOP() & 19 | BIND(C, NAME='fortran_sde_stop') 20 | END SUBROUTINE FORTRAN_SDE_STOP 21 | END INTERFACE 22 | 23 | contains 24 | 25 | subroutine start_collection() 26 | call fortran_sde_start() 27 | call fortran_itt_resume() 28 | end subroutine start_collection 29 | 30 | subroutine stop_collection() 31 | call fortran_itt_pause() 32 | call fortran_sde_stop() 33 | end subroutine stop_collection 34 | 35 | END MODULE 36 | -------------------------------------------------------------------------------- /3rd_party/nersc-roofline-master/stream-ai-example/stream-ai.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=m888 # Your repo goes here 3 | #SBATCH --qos=debug 4 | #SBATCH --nodes=1 5 | #SBATCH --time=00:30:00 6 | #SBATCH --job-name=stream-ai 7 | 8 | # VTune SEP driver is required 9 | #SBATCH --perf=vtune 10 | 11 | # for Cori, set appropriately. 
Not required for Edison 12 | #SBATCH --constraint="knl" 13 | ##SBATCH --constraint="haswell" 14 | 15 | ### start of script configuration parameters 16 | 17 | # set to yes or no to select tests to run 18 | run_baseline=yes 19 | run_sde=yes 20 | run_vtune=yes 21 | # Host variables are quoted below so an unset value compares as an empty string instead of breaking the [ ] test 22 | # use -knl for Cori KNL, -hsw for Cori Haswell or -ivb for Edison 23 | if [ "$NERSC_HOST" == "cori" ]; then 24 | if [ "$SLURM_JOB_CPUS_PER_NODE" == "272" ]; then # KNL node 25 | myhost=cori_knl 26 | SDE='sde -knl' 27 | 28 | # determine the number of MPI ranks per node 29 | n=$SLURM_JOB_NUM_NODES 30 | 31 | # number of threads/rank and virtual cores (needed for srun's -c) 32 | t=64 33 | vcores=$(( t * 4 )) 34 | else # Haswell node 35 | myhost=cori_hsw 36 | SDE='sde -hsw' 37 | n=$(( 2 * $SLURM_JOB_NUM_NODES )) 38 | t=16 39 | vcores=$(( t * 2 )) 40 | fi 41 | 42 | elif [ "$NERSC_HOST" == "edison" ]; then 43 | myhost=edison 44 | SDE='sde -ivb' 45 | n=$(( 2 * $SLURM_JOB_NUM_NODES )) 46 | t=12 47 | vcores=$(( t * 2 )) 48 | fi 49 | 50 | #module load sde # requires version 8.4.0 or later 51 | #module load vtune # script setup for Vtune 2017 or later 52 | 53 | ### End of configuration parameters 54 | 55 | echo "Running with $n MPI ranks and $t threads" 56 | export OMP_NUM_THREADS=$t 57 | suffix=${n}p${t}t_${SLURM_JOB_ID} 58 | exe=./stream_mpi.exe 59 | 60 | if [ "$run_baseline" == "yes" ]; then 61 | echo "" 62 | echo "--------------------------------------------------" 63 | echo "----->> Running Stream w/o Instrumentation <<-----" 64 | echo "--------------------------------------------------" 65 | srun -n $n -c $vcores --cpu_bind=cores $exe 66 | fi 67 | 68 | if [ "$run_sde" == "yes" ]; then 69 | echo "" 70 | echo "--------------------------------------------------" 71 | echo "----->> Running w/SDE <<-----" 72 | echo "--------------------------------------------------" 73 | srun -n $n -c $vcores --cpu_bind=cores $SDE -d -iform 1 -omix sde_${suffix}.out -i -top_blocks 500 -global_region -start_ssc_mark 111:repeat 
-stop_ssc_mark 222:repeat -- $exe 74 | echo "----->> Generating SDE Report <<-----" 75 | echo "For performance, the SDE report is best done on an external login node" 76 | echo "Run the following command: " 77 | echo "\$ ./parse-sde.sh sde_${suffix}.out*" 78 | fi 79 | 80 | if [ "$run_vtune" == "yes" ]; then 81 | echo "" 82 | echo "--------------------------------------------------" 83 | echo "----->> Running w/Vtune <<-----" 84 | echo "--------------------------------------------------" 85 | srun -n $n -c $vcores --cpu_bind=cores amplxe-cl -start-paused -r vtbw_${suffix} -collect memory-access -finalization-mode=none -trace-mpi -- $exe 86 | echo "----->> Finalizing VTune and generating report <<-----" 87 | echo "For performance, the finalize and report are best done on an external login node" 88 | echo "Run the following commands: " 89 | echo "Note that if using Vtune version 2017 replace \"-report hw-events -group-by=package\" with \"-report summary\" " 90 | if [ $myhost == "cori_knl" ]; then 91 | echo "\$ amplxe-cl -report hw-events -group-by=package -r vtbw_${suffix} -column=UNC_M_CAS_COUNT,UNC_E_RPQ_INSERTS,UNC_E_WPQ_INSERTS -format=csv -csv-delimiter=comma > vtbw_${suffix}.summary" 92 | else 93 | echo "\$ amplxe-cl -report hw-events -group-by=package -r vtbw_${suffix} -column=UNC_M_CAS_COUNT -format=csv -csv-delimiter=comma > vtbw_${suffix}.summary" 94 | fi 95 | echo "\$ ./parse-vtune2018.sh vtbw_${suffix}.summary" 96 | fi 97 | 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpDNN Graph_challenge 2 | Source code for the Sparse Deep Neural Network Graph Challenge (more details: http://graphchallenge.mit.edu/challenges). 3 | 4 | 5 | ## Getting Started 6 | First, clone the project and download the dataset. 
7 | ``` 8 | git clone https://github.com/CGCL-codes/Graphchallenge21.git 9 | cd Graphchallenge21 10 | mkdir data/ 11 | wget https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/dnn/neuron1024.tar.gz 12 | wget https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/mnist/sparse-images-1024.tsv.gz 13 | tar -xzf neuron1024.tar.gz 14 | gunzip sparse-images-1024.tsv.gz 15 | ``` 16 | Then compile and run the single-GPU version. 17 | ``` 18 | cd src/ 19 | nvcc -std=c++11 -O3 -o single.out network.cpp ./microbenchmark/all_network.cu 20 | ./single.out 1024 6000 120 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /benchmark/cublas/cublas.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "cublas_v2.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include 10 | 11 | using namespace std; 12 | 13 | 14 | #define CHECK(call) \ 15 | { \ 16 | const cudaError_t error = call; \ 17 | if (error != cudaSuccess) \ 18 | { \ 19 | fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \ 20 | fprintf(stderr, "code: %d, reason: %s\n", error, \ 21 | cudaGetErrorString(error)); \ 22 | exit(1); \ 23 | } \ 24 | } 25 | 26 | inline double seconds() 27 | { 28 | struct timeval tp; 29 | struct timezone tzp; 30 | int i = gettimeofday(&tp, &tzp); 31 | return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6); 32 | } 33 | 34 | 35 | int main() 36 | { 37 | srand(time(0)); 38 | int M = 2000; //矩阵A的行,矩阵C的行 39 | int N = 16384; //矩阵A的列,矩阵B的行 40 | int K = 16384; //矩阵B的列,矩阵C的列 41 | 42 | float *h_A = (float*)malloc(sizeof(float)*M*N); 43 | float *h_B = (float*)malloc(sizeof(float)*N*K); 44 | float *h_C = (float*)malloc(sizeof(float)*M*K); 45 | 46 | for (int i = 0; i < M*N; i++) { 47 | h_A[i] = i; 48 | // cout << h_A[i] << " "; 49 | // if ((i + 1) % N == 0) 50 | // cout << endl; 51 | } 52 | // cout << endl; 53 | 54 | for (int i = 0; i <
N*K; i++) { 55 | h_B[i] =i; 56 | // cout << h_B[i] << " "; 57 | // if ((i + 1) % K == 0) 58 | // cout << endl; 59 | } 60 | cout << endl; 61 | 62 | double iStart, iElaps; 63 | 64 | float *d_A, *d_B, *d_C,*d_CT; 65 | cudaMalloc((void**)&d_A, sizeof(float)*M*N); 66 | cudaMalloc((void**)&d_B, sizeof(float)*N*K); 67 | cudaMalloc((void**)&d_C, sizeof(float)*M*K); 68 | cudaMemcpy(d_A, h_A, M*N * sizeof(float), cudaMemcpyHostToDevice); 69 | cudaMemcpy(d_B, h_B, N*K * sizeof(float), cudaMemcpyHostToDevice); 70 | 71 | float alpha = 1; 72 | float beta = 0; 73 | 74 | //C=A*B 75 | cublasHandle_t handle; 76 | 77 | cublasCreate(&handle); 78 | 79 | // clock_t start = clock();//MNK Bt*At 80 | 81 | iStart = seconds(); 82 | 83 | 84 | cudaEvent_t start, stop; 85 | cudaEventCreate(&start); 86 | cudaEventCreate(&stop); 87 | 88 | cudaEventRecord(start, 0); 89 | cublasSgemm(handle, 90 | CUBLAS_OP_N, 91 | CUBLAS_OP_N, 92 | K, //矩阵B的列数 93 | M, //矩阵A的行数 94 | N, //矩阵A的列数 95 | &alpha, 96 | d_B, 97 | K, 98 | d_A, 99 | N, 100 | &beta, 101 | d_C, 102 | K); 103 | 104 | CHECK(cudaGetLastError()) ; 105 | 106 | cudaEventRecord(stop,0); 107 | cudaEventSynchronize(stop); 108 | 109 | 110 | float elapsed; 111 | cudaEventElapsedTime(&elapsed, start, stop); 112 | elapsed /= 1000.0f; 113 | 114 | iElaps = seconds() - iStart; 115 | 116 | // clock_t end = clock(); 117 | // double sum_time = double(double(end - start)/CLOCKS_PER_SEC) * 1000; 118 | 119 | 120 | printf("time= %lf\n", elapsed); 121 | 122 | // cout<<"inference time: "<< sum_time < 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | //using namespace std; 9 | 10 | void readweights(); 11 | void readinput(); 12 | 13 | void setup_gpu(); 14 | void final_gpu(); 15 | void infer_gpu(int); 16 | 17 | //#define BALANCE 30 //BALANCE LAYER 0 FOR EVERY LAYER COMMENT OUT FOR TURN OFF 18 | //#define OUTOFCORE //COMMENT THIS OUT IF YOU HAVE ENOUGH MEMORY 19 | //#define OVERLAP //WORKS ONLY WHEN OUTOFCORE IS ENABLED 20 | #define INDPREC int 21 | #define 
VALPREC float 22 | #define FEATPREC float 23 | 24 | 25 | 26 | inline void checkCuda(cudaError_t result, const char *file, const int line, bool fatal=false) { 27 | if (result != cudaSuccess) { 28 | fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), 29 | cudaGetErrorString(result));\ 30 | if (fatal) { 31 | exit(EXIT_FAILURE); 32 | } 33 | } 34 | } 35 | 36 | #define OR_PRINT(stmt) checkCuda(stmt, __FILE__, __LINE__); 37 | #define OR_FATAL(stmt) checkCuda(stmt, __FILE__, __LINE__, true); 38 | 39 | #define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}} -------------------------------------------------------------------------------- /benchmark/hpec/20-champions-1/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/benchmark/hpec/20-champions-1/run.sh -------------------------------------------------------------------------------- /src/BF.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "gpu_lib/header.h" 5 | #include "microbenchmark/header.h" 6 | #include "fuse/header.h" 7 | 8 | #include 9 | #include 10 | using namespace ftxj; 11 | 12 | int main(int argc, char* argv[]) { 13 | 14 | std::cout << "begin" << std::endl; 15 | std::map stride_map = { 16 | {1, 16}, 17 | {2, 32}, 18 | {3, 64}, 19 | {4, 128}, 20 | {5, 256}, 21 | {6, 512}, 22 | {7, 1024}, 23 | {8, 2048}, 24 | {9, 4096}, 25 | {10, 8192} 26 | }; 27 | 28 | int neuron = atoi(argv[1]); 29 | int batch = atoi(argv[2]); 30 | int l = atoi(argv[3]); 31 | int hash_type = atoi(argv[4]); 32 | 33 | 34 | int TN = atoi(argv[5]); 35 | int blockx = atoi(argv[6]); 36 | int blocky = atoi(argv[7]); 37 | 38 | std::string 
file_name = "../data/neuron"+ 39 | std::to_string(neuron) + "/n" + std::to_string(neuron) +"-l" + std::to_string(l) + ".tsv"; 40 | 41 | COOMatrix coo(file_name, 1, false); 42 | COOMatrix coo_cpu(file_name, 1, false); 43 | std::cout << "read coo success" << std::endl; 44 | 45 | 46 | if(hash_type == 0) { 47 | } 48 | 49 | if(hash_type == 1) { 50 | HashReorder hash_reorder_t(64, neuron, REORDER::ROW_REORDER); 51 | coo.reorder(hash_reorder_t); 52 | coo_cpu.reorder(hash_reorder_t); 53 | } 54 | 55 | if(hash_type == 2) { 56 | HashReorder hash_reorder_t(64, neuron, REORDER::COL_REORDER); 57 | coo.reorder(hash_reorder_t); 58 | coo_cpu.reorder(hash_reorder_t); 59 | } 60 | 61 | if(hash_type == 3) { 62 | HashReorder hash_reorder_t(64, neuron, REORDER::ALL_REORDER); 63 | coo.reorder(hash_reorder_t); 64 | coo_cpu.reorder(hash_reorder_t); 65 | } 66 | 67 | std::cout << "reorder success" << std::endl; 68 | BFMatrix bf(coo, neuron, TN); 69 | std::cout << "BF success" << std::endl; 70 | 71 | GpuEnv env(0); 72 | // test_benchmark_succ_load_store(batch, neuron, env); 73 | // test_benchmark_matrix_transpose(batch, neuron, env); 74 | // test_benchmark_matrix_transpose_and_delete(batch, neuron, env); 75 | // return 0; 76 | 77 | test_benchmark_19_BF( 78 | coo, bf, 79 | neuron, batch, TN, 80 | blockx, blocky, 81 | env 82 | ); 83 | return 0; 84 | } -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | run_multi_gpu_big:run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o 2 | mpicxx run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o -L/usr/local/cuda/lib64 -lcudart -o run_multi_gpu_big 3 | run_multi_gpu_big_cu.o:./microbenchmark/multi_gpu/multi_gpu_big.cu 4 | nvcc -c ./microbenchmark/multi_gpu/multi_gpu_big.cu -o run_multi_gpu_big_cu.o 5 | run_multi_gpu_big_cpp.o:multi_gpu.cpp 6 | mpicxx -c multi_gpu.cpp -o run_multi_gpu_big_cpp.o 7 | clean: 8 | rm -f *.o 
-------------------------------------------------------------------------------- /src/Makefile.multi: -------------------------------------------------------------------------------- 1 | run_multi_gpu:multi_gpu_cu.o multi_gpu_cpp.o 2 | mpicxx multi_gpu_cu.o multi_gpu_cpp.o -L/usr/local/cuda/lib64 -lcudart -o run_multi_gpu 3 | multi_gpu_cu.o:./microbenchmark/multi_gpu/multi_gpu.cu 4 | nvcc -c ./microbenchmark/multi_gpu/multi_gpu.cu -o multi_gpu_cu.o 5 | multi_gpu_cpp.o:multi_gpu.cpp 6 | mpicxx -c multi_gpu.cpp -o multi_gpu_cpp.o 7 | clean: 8 | rm -f *.o -------------------------------------------------------------------------------- /src/Makefile.multi.big: -------------------------------------------------------------------------------- 1 | run_multi_gpu_big:run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o 2 | mpicxx run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o -L/usr/local/cuda/lib64 -lcudart -o run_multi_gpu_big 3 | run_multi_gpu_big_cu.o:./microbenchmark/multi_gpu/multi_gpu_big.cu 4 | nvcc -c ./microbenchmark/multi_gpu/multi_gpu_big.cu -o run_multi_gpu_big_cu.o 5 | run_multi_gpu_big_cpp.o:multi_gpu.cpp 6 | mpicxx -c multi_gpu.cpp -o run_multi_gpu_big_cpp.o 7 | clean: 8 | rm -f *.o -------------------------------------------------------------------------------- /src/SNIG.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "gpu_lib/header.h" 5 | #include "microbenchmark/header.h" 6 | #include "fuse/header.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | using namespace ftxj; 12 | 13 | 14 | 15 | 16 | 17 | size_t get_sec_size(const size_t num_neurons) { 18 | 19 | //only for the same GPUs 20 | // 21 | //get tuned shared memory size 22 | //num_neurons must be divisible by shared memory (a.k.a. 
sec_size) 23 | //only for double float 24 | cudaDeviceProp props; 25 | cudaGetDeviceProperties(&props, 0); 26 | size_t sec_size{0}; 27 | 28 | size_t max_num_per_block = props.sharedMemPerBlock / sizeof(float); 29 | if(num_neurons <= max_num_per_block) { 30 | sec_size = num_neurons; 31 | } 32 | else{ 33 | int max_divisor = 2; 34 | while((num_neurons % max_divisor != 0) || 35 | (max_num_per_block < (num_neurons / max_divisor))) { 36 | ++max_divisor; 37 | } 38 | sec_size = num_neurons / max_divisor; 39 | } 40 | return sec_size; 41 | } 42 | 43 | std::string get_weight_file_name(int neuron, int layer) { 44 | std::string weight_file_dir = "../data/neuron"; 45 | std::string neuron_str = std::to_string(neuron); 46 | weight_file_dir += neuron_str + "/n" + neuron_str + "-l" + std::to_string(layer + 1) + ".tsv"; 47 | return weight_file_dir; 48 | } 49 | 50 | void read_input(std::vector> &input, int neuron, int batch) { 51 | std::string input_file_name = "../data/sparse-images-"; 52 | input_file_name += std::to_string(neuron) + ".tsv"; 53 | std::ifstream input_file(input_file_name); 54 | if(!input_file){ 55 | std::cout << "FILE:" << input_file_name << " does not exists.\n"; 56 | exit(-1); 57 | } 58 | int b, n; 59 | float val; 60 | long read_num = 0; 61 | while(input_file >> b >> n >> val) { 62 | if(b <= batch) { 63 | read_num++; 64 | input[b - 1][n - 1] = val; 65 | if(val != 1.00) { 66 | printf("read input %d, %f\n", b, val); 67 | } 68 | } 69 | } 70 | std::cout << "Read Input success! 
read_numeber = " << read_num << std::endl; 71 | } 72 | 73 | int main(int argc, char* argv[]) { 74 | 75 | if(argc != 5) { 76 | std::cout << "Usage: exe neuron batch layer nnzs" << std::endl; 77 | return 0; 78 | } 79 | int neuron = atoi(argv[1]); 80 | int batch = atoi(argv[2]); 81 | int layer = atoi(argv[3]); 82 | int nnzs = atoi(argv[4]); 83 | int sec_size = get_sec_size(neuron); 84 | 85 | std::cout << "[Config] sec size = " << sec_size << std::endl; 86 | std::map bias_map = { 87 | {65536, -0.45}, 88 | {16384, -0.4}, 89 | {4096, -0.35}, 90 | {1024, -0.3} 91 | }; 92 | 93 | std::vector> input(batch, std::vector(neuron)); 94 | std::cout << "[BEGIN]..." << std::endl; 95 | read_input(input, neuron, batch); 96 | std::cout << "Read Input success!" << std::endl; 97 | std::vector weights; 98 | 99 | for(int l = 0; l < layer; ++l) { 100 | auto weight_file = get_weight_file_name(neuron, l); 101 | SNIGMatrix snig_weight(weight_file, 32 * neuron, sec_size, neuron); 102 | weights.push_back(snig_weight); 103 | std::cout << "["<< weight_file << "] to SNIG Matrix success!" << std::endl; 104 | } 105 | 106 | GpuEnv env(0); 107 | test_benchmark_SNIG(input, weights, batch, neuron, sec_size, nnzs, bias_map[neuron], env); 108 | 109 | std::cout << "[END]..." 
<< std::endl; 110 | return 0; 111 | } -------------------------------------------------------------------------------- /src/cost.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "fuse/header.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace ftxj; 12 | 13 | 14 | std::string get_weight_file_name(int neuron, int layer) { 15 | std::string weight_file_dir = "../data/neuron"; 16 | std::string neuron_str = std::to_string(neuron); 17 | weight_file_dir += neuron_str + "/n" + neuron_str + "-l" + std::to_string(layer + 1) + ".tsv"; 18 | return weight_file_dir; 19 | } 20 | 21 | int main(int argc, char* argv[]) { 22 | 23 | if(argc != 7) { 24 | std::cout << "Usage: exe neuron layer TB1 TN1 TB2 TN2" << std::endl; 25 | return 0; 26 | } 27 | int neuron = atoi(argv[1]); 28 | int layer = atoi(argv[2]); 29 | 30 | 31 | int TB1 = atoi(argv[3]); 32 | int TN1 = atoi(argv[4]); 33 | int TB2 = atoi(argv[5]); 34 | int TN2 = atoi(argv[6]); 35 | 36 | 37 | std::map hash_map = { 38 | {65536, 4096}, 39 | {16384, 1024}, 40 | {4096, 256}, 41 | {1024, 64} 42 | }; 43 | 44 | std::map type_1 = { 45 | {65536, 12}, 46 | {16384, 10}, 47 | {4096, 8}, 48 | {1024, 6} 49 | }; 50 | HashReorder hash_reorder_t(hash_map[neuron], neuron); 51 | 52 | std::cout << "[BEGIN]..." << std::endl; 53 | 54 | 55 | clock_t total = 0; 56 | 57 | for(int l = 0; l < layer; ++l) { 58 | auto weight_file = get_weight_file_name(neuron, l); 59 | COOMatrix coo(weight_file, 1, false); 60 | std::cout << "["<< weight_file << "] to COO success!" << std::endl; 61 | coo.reorder(hash_reorder_t); 62 | std::cout << "Reorder success!" 
<< std::endl; 63 | 64 | 65 | clock_t startTime,endTime; 66 | startTime = clock(); 67 | coo.cost_analysis(TB1, TN1, TB2, TN2); 68 | endTime = clock(); 69 | total += endTime - startTime; 70 | } 71 | 72 | std::cout << "time = " << (double)(total) / CLOCKS_PER_SEC << std::endl; 73 | return 0; 74 | } -------------------------------------------------------------------------------- /src/cuSparse.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "gpu_lib/header.h" 5 | #include "microbenchmark/header.h" 6 | #include "fuse/header.h" 7 | 8 | #include 9 | #include 10 | using namespace ftxj; 11 | 12 | int main(int argc, char* argv[]) { 13 | 14 | std::cout << "begin" << std::endl; 15 | std::map stride_map = { 16 | {1, 16}, 17 | {2, 32}, 18 | {3, 64}, 19 | {4, 128}, 20 | {5, 256}, 21 | {6, 512}, 22 | {7, 1024}, 23 | {8, 2048}, 24 | {9, 4096}, 25 | {10, 8192} 26 | }; 27 | 28 | int neuron = atoi(argv[1]); 29 | int batch = atoi(argv[2]); 30 | int l = atoi(argv[3]); 31 | int hash_type = atoi(argv[4]); 32 | 33 | std::string file_name = "../data/neuron"+ 34 | std::to_string(neuron) + "/n" + std::to_string(neuron) +"-l" + std::to_string(l) + ".tsv"; 35 | 36 | COOMatrix coo(file_name, 1, false); 37 | std::cout << "read coo success" << std::endl; 38 | 39 | 40 | if(hash_type == 0) { 41 | } 42 | 43 | if(hash_type == 1) { 44 | HashReorder hash_reorder_t(64, neuron, REORDER::ROW_REORDER); 45 | coo.reorder(hash_reorder_t); 46 | } 47 | 48 | if(hash_type == 2) { 49 | HashReorder hash_reorder_t(64, neuron, REORDER::COL_REORDER); 50 | coo.reorder(hash_reorder_t); 51 | } 52 | 53 | if(hash_type == 3) { 54 | HashReorder hash_reorder_t(64, neuron, REORDER::ALL_REORDER); 55 | coo.reorder(hash_reorder_t); 56 | } 57 | 58 | std::cout << "reorder success" << std::endl; 59 | cuSPARSEMatrix cusm(coo, neuron); 60 | std::cout << "BF success" << std::endl; 61 | 62 | 
GpuEnv env(0); 63 | 64 | test_benchmark_cusparse(coo, 65 | cusm, 66 | neuron, batch); 67 | 68 | return 0; 69 | } -------------------------------------------------------------------------------- /src/fuse.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "gpu_lib/header.h" 5 | #include "microbenchmark/header.h" 6 | #include "fuse/header.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | using namespace ftxj; 12 | 13 | 14 | std::string get_weight_file_name(int neuron, int layer) { 15 | std::string weight_file_dir = "../data/neuron"; 16 | std::string neuron_str = std::to_string(neuron); 17 | weight_file_dir += neuron_str + "/n" + neuron_str + "-l" + std::to_string(layer + 1) + ".tsv"; 18 | return weight_file_dir; 19 | } 20 | 21 | void dense_reorder(std::vector> &input, Reorder &reorder_class) { 22 | std::vector> old = input; 23 | for(int i = 0; i < input.size(); ++i) { 24 | for(int j = 0; j < input[i].size(); ++j) { 25 | auto new_j = reorder_class.reorder(j); 26 | input[i][new_j] = old[i][j]; 27 | } 28 | } 29 | } 30 | 31 | void read_input(std::vector> &input, int neuron, int batch) { 32 | std::string input_file_name = "../data/sparse-images-"; 33 | input_file_name += std::to_string(neuron) + ".tsv"; 34 | std::ifstream input_file(input_file_name); 35 | if(!input_file){ 36 | std::cout << "FILE:" << input_file_name << " does not exists.\n"; 37 | exit(-1); 38 | } 39 | int b, n; 40 | float val; 41 | while(input_file >> b >> n >> val) { 42 | if(b <= batch) { 43 | input[b - 1][n - 1] = val; 44 | } 45 | } 46 | } 47 | 48 | int main(int argc, char* argv[]) { 49 | 50 | if(argc != 4) { 51 | std::cout << "Usage: exe neuron batch layer" << std::endl; 52 | return 0; 53 | } 54 | int neuron = atoi(argv[1]); 55 | int batch = atoi(argv[2]); 56 | int layer = atoi(argv[3]); 57 | 58 | std::map hash_map = { 59 | {65536, 4096}, 60 | 
{16384, 1024}, 61 | {4096, 256}, 62 | {1024, 64} 63 | }; 64 | 65 | std::map bias_map = { 66 | {65536, -0.45}, 67 | {16384, -0.4}, 68 | {4096, -0.35}, 69 | {1024, -0.3} 70 | }; 71 | 72 | std::vector> input(batch, std::vector(neuron)); 73 | std::vector> weight; 74 | std::vector> row_access; 75 | 76 | std::cout << "[BEGIN]..." << std::endl; 77 | read_input(input, neuron, batch); 78 | std::cout << "Read Input success!" << std::endl; 79 | HashReorder hash_reorder_t(hash_map[neuron], neuron); 80 | dense_reorder(input, hash_reorder_t); 81 | 82 | std::vector coo_vec; 83 | 84 | 85 | for(int l = 0; l < layer; ++l) { 86 | auto weight_file = get_weight_file_name(neuron, l); 87 | COOMatrix coo(weight_file, 1, false); 88 | std::cout << "["<< weight_file << "] to COO success!" << std::endl; 89 | coo.reorder(hash_reorder_t); 90 | coo_vec.push_back(coo); 91 | std::cout << "Reorder success!" << std::endl; 92 | CSRCSCMatrix csr_csc(coo); 93 | csr_csc.transpose(); 94 | BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method); 95 | std::cout << "Structural Info success!" << std::endl; 96 | MaxInReuseBSchedule schedule(blocks); 97 | schedule.schedule_output_parallel(128, 1, false); 98 | std::cout << "Schedule succ" << std::endl; 99 | auto data = schedule.get_data(neuron); 100 | weight.push_back(data.value); 101 | row_access.push_back(data.row_access); 102 | } 103 | GpuEnv env(3); 104 | test_benchmark_fused_layer1024_0_1(input, coo_vec, weight, row_access, batch, neuron, bias_map[neuron], env); 105 | // test_benchmark_fuse_cmp_layer1024_0_1(input, weight, row_access, batch, neuron, bias_map[neuron], env); 106 | 107 | std::cout << "[END]..." 
<< std::endl; 108 | return 0; 109 | } -------------------------------------------------------------------------------- /src/fuse/fuse.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | 7 | namespace ftxj { 8 | class FuseLayer { 9 | int fuse_numbers_; 10 | std::vector> input_need_access_; 11 | std::vector fused_matrix_; 12 | public: 13 | FuseLayer(COOMatrix outer_matrix, std::vector> block_cols) { 14 | fuse_numbers_ = 1; 15 | input_need_access_ = std::vector>(block_cols.size(), std::set()); 16 | for(int i = 0; i < block_cols.size(); ++i) { 17 | for(int j = 0; j < block_cols[i].size(); ++j) { 18 | int need_access_col = block_cols[i][j]; 19 | for(auto x = outer_matrix.begin(); x != outer_matrix.end(); ++x) { 20 | if((*x).col == need_access_col) { 21 | input_need_access_[i].insert((*x).row); 22 | } 23 | } 24 | } 25 | } 26 | } 27 | 28 | void print_need_access() { 29 | for(int b = 0; b < input_need_access_.size(); ++b) { 30 | std::cout << "block b = " << b << ",size = "<< input_need_access_[b].size() <<" : "; 31 | for(auto x : input_need_access_[b]) { 32 | std::cout << x << ","; 33 | } 34 | std::cout << std::endl; 35 | } 36 | } 37 | 38 | void fuse(COOMatrix outer_matrix) { 39 | fuse_numbers_ += 1; 40 | fused_matrix_.push_back(outer_matrix); 41 | std::vector> old_access = input_need_access_; 42 | input_need_access_.clear(); 43 | input_need_access_ = std::vector>(old_access.size(), std::set()); 44 | for(int b = 0; b < old_access.size(); ++b) { 45 | for(auto row : old_access[b]) { 46 | for(auto x = outer_matrix.begin(); x != outer_matrix.end(); ++x) { 47 | if((*x).col == row) { 48 | input_need_access_[b].insert((*x).row); 49 | } 50 | } 51 | } 52 | } 53 | } 54 | }; 55 | } -------------------------------------------------------------------------------- /src/fuse/header.h: -------------------------------------------------------------------------------- 1 | 
#include "fuse.h" 2 | -------------------------------------------------------------------------------- /src/gpu_lib/gpu_env.h: --------------------------------------------------------------------------------
#pragma once

// NOTE(review): the angle-bracket include names and all template arguments were
// stripped from this dump; the ones below are reconstructed from usage --
// confirm against the repository before merging.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

#include <cuda_runtime.h>

#include "gpu_runtime.h"

namespace ftxj {

// Selects a CUDA device and keeps a set of named (stream, start event,
// stop event) triples used to time work submitted to those streams.
//
// NOTE(review): nothing in this class destroys the streams/events it creates;
// whether that is intentional (objects may be copied by callers) is not visible
// from this file, so no destructor is added here.
class GpuEnv {
    std::vector<cudaStream_t> streams;
    std::vector<cudaEvent_t> start_event;
    std::vector<cudaEvent_t> stop_event;
    std::vector<std::string> event_name;
    std::map<std::string, std::size_t> event_map;  // name -> index into the vectors above

    // Returns the index registered for `name`. The previous code used
    // event_map[name], which silently inserted index 0 for unknown names and
    // then indexed possibly empty vectors; an unknown name is now reported and
    // treated as fatal, like every failed Safe_Call in this class.
    std::size_t slot_of(const std::string &name) const {
        const auto it = event_map.find(name);
        if (it == event_map.end()) {
            fprintf(stderr, "GpuEnv: event \"%s\" was never registered with add_event()\n",
                    name.c_str());
            exit(EXIT_FAILURE);
        }
        return it->second;
    }

public:
    // Selects a single device.
    GpuEnv(int gpu_id, bool print_device_info = true) {
        set_up(gpu_id, print_device_info);
    }

    // Runs set_up for every id in order; the last id is the one left selected
    // by cudaSetDevice.
    GpuEnv(std::vector<int> gpu_id, bool print_device_info = true) {
        for (int id : gpu_id) {
            set_up(id, print_device_info);
        }
    }

    // Makes `gpu_id` the current device. With print_device_info set, the device
    // count and properties are queried (any failure is fatal), but the printf
    // calls that reported them are commented out upstream, so nothing is printed.
    void set_up(int gpu_id, bool print_device_info = true) {
        Safe_Call(cudaSetDevice(gpu_id));
        if (print_device_info) {
            int deviceCount;
            Safe_Call(cudaGetDeviceCount(&deviceCount));
            // printf("\n");
            // printf("Device Count: %d\n",deviceCount);

            cudaDeviceProp deviceProp;
            Safe_Call(cudaGetDeviceProperties(&deviceProp, gpu_id));
            // printf("Device %d name: %s\n",dev,deviceProp.name);
            // printf("Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor);
            // printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem);
            // printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem);
            // printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock);
            // printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
            // printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
            // printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock);
            // printf("Warp size: %d\n",deviceProp.warpSize);
            // printf("\n");
        }
    }

    // Creates a start event, a stop event and a stream and registers them under
    // `name`. Registering a name twice rebinds it to the newest triple; the
    // older triple stays in the vectors but is no longer reachable by name.
    void add_event(std::string name = "non") {
        cudaEvent_t start, stop;
        cudaStream_t stream;

        // Create the handles first so only valid ones are ever stored.
        Safe_Call(cudaEventCreate(&start));
        Safe_Call(cudaEventCreate(&stop));
        Safe_Call(cudaStreamCreate(&stream));

        streams.push_back(stream);
        start_event.push_back(start);
        stop_event.push_back(stop);
        event_name.push_back(name);

        event_map[name] = streams.size() - 1;
    }

    // Records the start event of `name` on its stream.
    void event_start_record(std::string name = "non") {
        const std::size_t slot = slot_of(name);
        Safe_Call(cudaEventRecord(start_event[slot], streams[slot]));
    }

    // Records the stop event of `name` on its stream.
    void event_stop_record(std::string name = "non") {
        const std::size_t slot = slot_of(name);
        Safe_Call(cudaEventRecord(stop_event[slot], streams[slot]));
    }

    // Waits for the stream of `name` to drain, then returns the value
    // cudaEventElapsedTime reports between its start and stop events.
    float get_event_time(std::string name = "non") {
        const std::size_t slot = slot_of(name);
        float res = 0.0;
        Safe_Call(cudaStreamSynchronize(streams[slot]));
        Safe_Call(cudaEventElapsedTime(&res, start_event[slot], stop_event[slot]));
        return res;
    }

    // Returns the stream registered under `name`.
    cudaStream_t get_stream(std::string name = "non") {
        return streams[slot_of(name)];
    }
};

}  // namespace ftxj
-------------------------------------------------------------------------------- /src/gpu_lib/gpu_runtime.h: --------------------------------------------------------------------------------
#pragma once

// NOTE(review): include names were stripped from this dump; reconstructed from usage.
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Reports a failed CUDA runtime call on stderr together with the call site.
// When `fatal` is true the process exits with EXIT_FAILURE; otherwise
// execution continues after the message.
inline void checkCuda(cudaError_t result, const char *file, const int line, bool fatal = false) {
    if (result != cudaSuccess) {
        fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result),
                cudaGetErrorString(result));
        if (fatal) {
            exit(EXIT_FAILURE);
        }
    }
}

// Safe_Call_Print only logs a failure; Safe_Call logs it and terminates.
#define Safe_Call_Print(stmt) checkCuda(stmt, __FILE__, __LINE__)
#define Safe_Call(stmt) checkCuda(stmt, __FILE__, __LINE__, true)
-------------------------------------------------------------------------------- /src/gpu_lib/header.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "gpu_runtime.h" 4 | #include "gpu_env.h" -------------------------------------------------------------------------------- /src/inspector/code_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "code_gen_basic.h" 2 | 3 | 4 | void generate_20champion_code { 5 | std::vector param_list_; 6 | 7 | VariableDecl nextfeat(f32, "nextfeat", Global, true); 8 | param_list_.push_back(&nextfeat); 9 | 10 | VariableDecl currfeat(f32, "currfeat", Global, true); 11 | param_list_.push_back(&currfeat); 12 | 13 | VariableDecl buffsize(i32, "buffsize", Global, false); 14 | param_list_.push_back(&buffsize); 15 | 16 | VariableDecl buffdispl(i32, "buffdispl", Global, true); 17 | param_list_.push_back(&buffdispl); 18 | 19 | VariableDecl mapdispl(i32, "mapdispl", Global, true); 20 | param_list_.push_back(&mapdispl); 21 | 22 | VariableDecl map(i16, "map", Global, true); 23 | param_list_.push_back(&map); 24 | 25 | 26 | VariableDecl displ(i32, "displ", Global, true); 27 | param_list_.push_back(&displ); 28 | 29 | VariableDecl index(i16, "index", Global, true); 30 | param_list_.push_back(&index); 31 | 32 | VariableDecl value(f32, "value", Global, true); 33 | param_list_.push_back(&value); 34 | 35 | VariableDecl bias(f32, "bias", Global, false); 36 | param_list_.push_back(&bias); 37 | 38 | VariableDecl neuron(i32, "neuron", Global, false); 39 | param_list_.push_back(&neuron); 40 | 41 | VariableDecl categories(i32, "categories", Global, true); 42 | param_list_.push_back(&categories); 43 | 44 | VariableDecl active(i32, "active", Global, true); 45 | param_list_.push_back(&active); 46 | 47 | GpuGlobalFunction dummy_kernel("dummy_kernel", param_list_, 1024, 1); 48 | dummy_kernel.emit_statement(); 49 | 50 | VariableDecl shared(f32, 
"shared", Shared, false); 51 | shared.set_extern(); 52 | 53 | VariableArrayDecl shared_array(shared, {}); 54 | shared_array.emit_statement(); 55 | 56 | VariableDecl wind(i32, "wind", Reg, false); 57 | ConstantVar WARPSIZE("WARPSIZE"); 58 | ConstantVar ThreadIdx_x("threadIdx.x"); 59 | Operation tmp = ThreadIdx_x % WARPSIZE; 60 | VaribaleInit wind_init_statement(wind, tmp); 61 | wind_init_statement.emit_statement(); 62 | 63 | 64 | ArrayAccess line95_1 = buffdispl[ThreadIdx_x]; 65 | ArrayAccess line95_2 = buffdispl[ThreadIdx_x + 1]; 66 | ConstantVar ConstOne(1); 67 | VariableDecl iter_var_1(i32, "buff", Global, false); 68 | Variable iter_var(iter_var_1); 69 | ForLoopScope forloop_1(line95_1, line95_2, ConstOne, iter_var); 70 | forloop_1.emit_statement(); 71 | 72 | 73 | 74 | ScopeEnd forloop_1_end; 75 | forloop_1_end.emit_statement(); 76 | 77 | 78 | ScopeEnd dummy_kernel_end; 79 | dummy_kernel_end.emit_statement(); 80 | } 81 | 82 | 83 | void generate_ramdom_block_code(Schedule &block_schedule) { 84 | VariableArrayDecl output_tile(f32, "output_tile", Reg, false, {8}); 85 | ConstantVar floatZero(0.0); 86 | 87 | VariableArrayInit output_tile_init(&output_tile, floatZero); 88 | output_tile_init.emit_statement(); 89 | 90 | VariableDecl dense_tile(f32, "dense_value", Reg, false); 91 | VaribaleInit dense_tile_init(dense_tile, floatZero); 92 | 93 | for(int b = 0; b < blockSize; ++b) { 94 | BlockScope block_scope(b); 95 | block_scope.emit_statement(); 96 | for(int t = 0; t < threadSize; ++t) { 97 | MatrixBlockBase* base_block = block_schedule.get_block(b, t); 98 | if(base_block->get_block_type() == "Random") { 99 | 100 | } 101 | else if() { 102 | 103 | } 104 | } 105 | ScopeEnd block_scope_end; 106 | block_scope_end.emit_statement(); 107 | } 108 | 109 | } -------------------------------------------------------------------------------- /src/inspector/cost_model.h: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/inspector/cost_model.h -------------------------------------------------------------------------------- /src/inspector/data_inspector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace ftxj { 5 | using namespace std; 6 | class Inspector { 7 | vector task_distribution; 8 | public: 9 | }; 10 | }; -------------------------------------------------------------------------------- /src/inspector/gpu_block.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "matrix_block_container.h" 3 | #include 4 | #include 5 | 6 | namespace ftxj { 7 | class GpuBlock { 8 | int block_idx_; 9 | int block_idy_; 10 | public: 11 | BlockContainer blocks_; 12 | GpuBlock(int x, int y, BlockContainer blocks) : blocks_(blocks) { 13 | block_idx_ = x; 14 | block_idy_ = y; 15 | } 16 | 17 | // std::vector 18 | void file_gen() { 19 | 20 | } 21 | 22 | void print() { 23 | std::cout << "("; 24 | if(block_idx_ == -1) { 25 | std::cout << "{...}, "; 26 | } 27 | else { 28 | std::cout << block_idx_ << ", "; 29 | } 30 | if(block_idy_ == -1) { 31 | std::cout << "{...})"; 32 | } 33 | else { 34 | std::cout << block_idy_ << ")"; 35 | } 36 | std::cout << "\n"; 37 | blocks_.print_unique(); 38 | } 39 | }; 40 | }; -------------------------------------------------------------------------------- /src/inspector/gpu_run_config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace ftxj { 4 | class GpuRunConfig { 5 | public: 6 | int block_num; 7 | int thread_num; 8 | int shared_memory_size; 9 | }; 10 | 11 | }; -------------------------------------------------------------------------------- /src/inspector/gpu_wrap.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 
3 | namespace ftxj { 4 | // AC = AB * BC 5 | // dense = dense * sparse 6 | class Wrap { 7 | private: 8 | int wrap_id_; 9 | int block_id_; 10 | 11 | int batch_dim_; 12 | int output_channel_dim_; 13 | int input_channel_dim_; 14 | 15 | int batch_offset_; 16 | int output_channel_offset_; 17 | int input_channel_offset_; 18 | 19 | int write_dst_; 20 | 21 | BlockContainer blocks_; 22 | 23 | public: 24 | 25 | static const int WRAP_SIZE = 32; 26 | }; 27 | }; -------------------------------------------------------------------------------- /src/inspector/header.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "matrix_block.h" 3 | #include "matrix_block_gen.h" 4 | #include "matrix_block_container.h" 5 | #include "gpu_block_scheduler.h" 6 | -------------------------------------------------------------------------------- /src/inspector/matrix_block_gen.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "matrix_block.h" 5 | 6 | #include "../utils/matrix.h" 7 | #include "../utils/string.h" 8 | 9 | namespace ftxj { 10 | 11 | class SparseMatrixBlockGen { 12 | 13 | static int row_line_succ_max(MatrixPos start_pos, CSRCSCMatrix &csr_csc) { 14 | int row_idx = start_pos.row_idx; 15 | int col_idx = start_pos.col_idx; 16 | int res = 0; 17 | for(auto iter = csr_csc.row_iter_begin_at(row_idx, col_idx); 18 | iter != csr_csc.row_iter_end_at(row_idx); ++iter) { 19 | if((*iter).col == col_idx + res) { 20 | res++; 21 | } 22 | else { 23 | return res; 24 | } 25 | } 26 | return res; 27 | } 28 | 29 | 30 | static int col_line_succ_max(MatrixPos start_pos, CSRCSCMatrix &csr_csc) { 31 | int row_idx = start_pos.row_idx; 32 | int col_idx = start_pos.col_idx; 33 | int res = 0; 34 | for(auto iter = csr_csc.col_iter_begin_at(row_idx, col_idx); 35 | iter != csr_csc.col_iter_end_at(col_idx); ++iter) { 36 | if((*iter).row == row_idx + res) { 37 | res++; 38 | } 39 | 
else { 40 | return res; 41 | } 42 | } 43 | return res; 44 | } 45 | 46 | 47 | static MatrixPos rectangels_max(MatrixPos start_pos, CSRCSCMatrix &csr_csc) { 48 | int row_max = row_line_succ_max(start_pos, csr_csc); 49 | if(row_max > 16) row_max = 16; 50 | 51 | // std::cout << "row max = " << row_max << std::endl; 52 | 53 | int now_max_row = 0; 54 | int now_max_col = 70000; 55 | int now_max = 0; 56 | 57 | int res_row = 0; 58 | int res_col = 0; 59 | 60 | for(int i = 0; i < row_max; ++i) { 61 | now_max_row = i + 1; 62 | int col_max = col_line_succ_max({start_pos.row_idx, start_pos.col_idx + i}, csr_csc); 63 | // std::cout << "col max = " << col_max << ", " << start_pos.row_idx << ", " << start_pos.col_idx + i << std::endl; 64 | now_max_col = std::min(col_max, now_max_col); 65 | int tmp_area = now_max_col * now_max_row; 66 | if(tmp_area > now_max) { 67 | now_max = tmp_area; 68 | res_row = now_max_row; 69 | res_col = now_max_col; 70 | } 71 | } 72 | return {start_pos.row_idx + res_col - 1, start_pos.col_idx + res_row - 1}; 73 | } 74 | 75 | public: 76 | 77 | static std::vector> naive_method(CSRCSCMatrix &csr_csc) { 78 | 79 | std::vector> res; 80 | 81 | int end_len = 0; 82 | int col_each_big_block = -1; 83 | 84 | int now_lookup_col = 0; 85 | int now_lookup_row = 0; 86 | 87 | auto col_iter = csr_csc.col_iter_begin_at(now_lookup_row, now_lookup_col); 88 | 89 | for(; col_iter != csr_csc.col_iter_end(); col_iter = col_iter.next_ncol(col_each_big_block)) { 90 | while(col_iter != csr_csc.col_iter_end_at(now_lookup_col)) { 91 | auto row_idx = (*col_iter).row; 92 | auto col_idx = (*col_iter).col; 93 | // std::cout << " row = " << row_idx << ", col = " << col_idx << std::endl; 94 | auto end_pos = rectangels_max(MatrixPos(row_idx, col_idx), csr_csc); 95 | // std::cout << "end at row = " << end_pos.row_idx << ", col = " << end_pos.col_idx << std::endl; 96 | 97 | int tmp_col_len = end_pos.col_idx - col_idx + 1; // 多少行长 98 | int tmp_row_len = end_pos.row_idx - row_idx + 1; // 多少列长 99 | 
if(col_each_big_block != -1 && col_each_big_block != tmp_col_len) { 100 | std::cout << "TODO Just support same len" << std::endl; 101 | exit(-1); 102 | } 103 | 104 | col_each_big_block = tmp_col_len; 105 | 106 | if(tmp_col_len != 0 || tmp_row_len != 0) { 107 | if(tmp_col_len != end_len && end_len != 0) { 108 | std::cout << "TODO fix this bug" << std::endl; 109 | exit(-1); 110 | } 111 | col_iter += tmp_row_len; 112 | res.push_back({MatrixPos(row_idx, col_idx), end_pos}); 113 | } 114 | else { 115 | std::cout << "TODO At least one point detected" << std::endl; 116 | exit(-1); 117 | } 118 | } 119 | now_lookup_col += col_each_big_block; 120 | } 121 | return res; 122 | } 123 | }; 124 | 125 | }; -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "gpu_lib/header.h" 5 | #include "microbenchmark/header.h" 6 | #include "fuse/header.h" 7 | 8 | #include 9 | #include 10 | using namespace ftxj; 11 | 12 | int main(int argc, char* argv[]) { 13 | 14 | std::cout << "begin" << std::endl; 15 | 16 | 17 | int neuron = 1024; 18 | int batch = 6000; 19 | 20 | 21 | std::map stride_map = { 22 | {1, 16}, 23 | {2, 32}, 24 | {3, 64}, 25 | {4, 128}, 26 | {5, 256}, 27 | {6, 512}, 28 | {7, 1024}, 29 | {8, 2048}, 30 | {9, 4096}, 31 | {10, 8192} 32 | }; 33 | int l = atoi(argv[1]); 34 | // int l = 5; 35 | std::string file_name = "../data/neuron1024/n1024-l" + std::to_string(l) + ".tsv"; 36 | COOMatrix coo(file_name, 1, false); 37 | // COOMatrix coo_2("../data/neuron16384/n16384-l119.tsv", 1, true); 38 | // std::cout << "read coo success" << std::endl; 39 | 40 | HashReorder hash_reorder_t(64, neuron); 41 | coo.reorder(hash_reorder_t); 42 | // std::cout << "reorder success" << std::endl; 43 | 44 | // coo_2.reorder(hash_reorder_t); 45 | // std::vector> block_cols(16384/16); 
46 | // for(int b = 0; b < 16384 / 16; ++b) { 47 | // for(int j = 0; j < 16; ++j) { 48 | // block_cols[b].push_back(b * 16 + j); 49 | // } 50 | // } 51 | // FuseLayer fuse(coo, block_cols); 52 | // fuse.print_need_access(); 53 | // fuse.fuse(coo_2); 54 | // fuse.print_need_access(); 55 | // return 0; 56 | 57 | CSRCSCMatrix csr_csc(coo); 58 | std::cout << "coo to csr_csc success" << std::endl; 59 | 60 | // UIUCMatrix uiuc(csr_csc, 256, neuron); 61 | // std::cout << "uiuc success" << std::endl; 62 | 63 | GpuEnv env(0); 64 | // test_benchmark_succ_load_store(batch, neuron, env); 65 | // test_benchmark_matrix_transpose(batch, neuron, env); 66 | // test_benchmark_matrix_transpose_and_delete(batch, neuron, env); 67 | // return 0; 68 | 69 | // test_benchmark_20_uiuc(coo, uiuc, batch, env); 70 | // return 0; 71 | 72 | // uiuc_test_benchmark(coo, uiuc, env); 73 | // uiuc.print_buffdispl(); 74 | // uiuc.print_mapdispl(); 75 | // uiuc.print_map(); 76 | // uiuc.print_warpdispl(); 77 | // uiuc.print_warpindex(); 78 | 79 | csr_csc.transpose(); 80 | BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method); 81 | std::cout << "block container success" << std::endl; 82 | // blocks.print(); 83 | 84 | MaxInReuseBSchedule schedule(blocks); 85 | 86 | // schedule.schedule_output_parallel(128, 1, false); 87 | schedule.schedule(128, 1); 88 | 89 | std::cout << "block schedule succ" << std::endl; 90 | 91 | // auto data = schedule.get_data2(neuron); 92 | auto data = schedule.get_data(neuron); 93 | 94 | 95 | // std::cout << "data size = " << data.value.size() << std::endl; 96 | // std::cout << "data access size = " << data.row_access.size() << std::endl; 97 | 98 | // std::cout << "data load idx len = "; 99 | // for(int i = 0; i < data.load_idx_row_len.size(); ++i) { 100 | // std::cout << data.load_idx_row_len[i] << ", "; 101 | // } 102 | // std::cout << std::endl; 103 | 104 | // std::cout << "data row access = "; 105 | // for(int i = 0; i < data.row_access.size(); ++i) { 106 | // 
std::cout << data.row_access[i] << ", "; 107 | // } 108 | // std::cout << std::endl; 109 | 110 | // std::cout << "data value access = "; 111 | // for(int i = 0; i < data.value_access.size(); ++i) { 112 | // std::cout << data.value_access[i] << ", "; 113 | // } 114 | // std::cout << std::endl; 115 | 116 | // schedule.print_schedule(); 117 | 118 | // test_benchmark_row_succ_20_uiuc(coo, data.value, data.row_access, batch, neuron, env); 119 | // test_benchmark_row_succ_20_uiuc_transpose(coo, data.value, data.row_access, batch, neuron, env); 120 | // test_benchmark_row_succ_20_uiuc_transpose_no_conflict(coo, data.value, data.row_access, batch, neuron, env); 121 | // test_benchmark_rectangels_batch_parallel_kernel(coo, data.value, data.row_access, batch, neuron, env); 122 | test_benchmark_n16384_l2_l10_kernel(coo, data.value, stride_map[l], batch, neuron, env); 123 | // test_benchmark_n16384_l11_kernel(coo, data.value, data.row_access, batch, neuron, env); 124 | 125 | 126 | 127 | // GpuEnv env(0); 128 | 129 | // vector4_load_data_benchmark(env); 130 | 131 | // test_shared_memory_mm(coo, data.value, data.row_access, env); 132 | 133 | return 0; 134 | } -------------------------------------------------------------------------------- /src/mc_test.cpp: -------------------------------------------------------------------------------- 1 | #include "utils/header.h" 2 | #include "reorder/header.h" 3 | #include "inspector/header.h" 4 | #include "gpu_lib/header.h" 5 | #include "microbenchmark/header.h" 6 | #include "fuse/header.h" 7 | 8 | #include 9 | #include 10 | using namespace ftxj; 11 | 12 | int main(int argc, char* argv[]) { 13 | 14 | std::cout << "begin" << std::endl; 15 | 16 | 17 | std::map hash_map = { 18 | {65536, 4096}, 19 | {16384, 1024}, 20 | {4096, 256}, 21 | {1024, 64} 22 | }; 23 | int neuron = atoi(argv[1]); 24 | int batch = atoi(argv[2]); 25 | int l = atoi(argv[3]); 26 | int hash_type = atoi(argv[4]); 27 | 28 | std::string file_name = "../data/neuron"+ 29 | 
std::to_string(neuron) + "/n" + std::to_string(neuron) +"-l" + std::to_string(l) + ".tsv"; 30 | COOMatrix coo(file_name, 1, false); 31 | std::cout << "read coo success" << std::endl; 32 | 33 | if(hash_type == 0) { 34 | } 35 | 36 | if(hash_type == 1) { 37 | HashReorder hash_reorder_t(hash_map[neuron], neuron, REORDER::ROW_REORDER); 38 | coo.reorder(hash_reorder_t); 39 | } 40 | 41 | if(hash_type == 2) { 42 | HashReorder hash_reorder_t(hash_map[neuron], neuron, REORDER::COL_REORDER); 43 | coo.reorder(hash_reorder_t); 44 | } 45 | 46 | if(hash_type == 3) { 47 | HashReorder hash_reorder_t(hash_map[neuron], neuron, REORDER::ALL_REORDER); 48 | coo.reorder(hash_reorder_t); 49 | } 50 | 51 | std::cout << "reorder success" << std::endl; 52 | 53 | CSRCSCMatrix csr_csc(coo); 54 | std::cout << "coo to csr_csc success" << std::endl; 55 | 56 | UIUCMatrix uiuc(csr_csc, 256, neuron); 57 | std::cout << "uiuc success" << std::endl; 58 | 59 | GpuEnv env(0); 60 | 61 | test_benchmark_20_uiuc(coo, uiuc, batch, env); 62 | return 0; 63 | 64 | 65 | return 0; 66 | } -------------------------------------------------------------------------------- /src/microbenchmark/bf.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace ftxj { 10 | 11 | __global__ void bf_spmm( 12 | 13 | float* Y0, // input 14 | float* Y1, 15 | 16 | int* roffW, // len neuron * N_SLAB - 1 17 | int* colsW, // index 32 * neuron 18 | float* valsW, // all 32 * neuron 0.0625 19 | 20 | int COL_BLK, // TN, shared memory size = TN 21 | int N_SLAB, // neuron / TN 22 | int neuron // neuron 23 | 24 | 25 | ) { 26 | 27 | extern __shared__ float shRow[]; 28 | 29 | int tid = threadIdx.y * blockDim.x + threadIdx.x; 30 | int rid = blockIdx.x; 31 | 32 | __syncthreads(); 33 | 34 | for(int i = 0; i < N_SLAB; i++) { 35 | __syncthreads(); 36 | for(int j = threadIdx.x; j 
< COL_BLK; j++) { 37 | shRow[j] = 0; 38 | } 39 | __syncthreads(); 40 | for(int j = threadIdx.y; j < neuron; j += blockDim.y) { 41 | float valY = Y0[rid * neuron + j]; 42 | // if(valY == 0) { 43 | // continue; 44 | // } 45 | 46 | int begOffW = roffW[i * neuron + j] + threadIdx.x; 47 | int endOffW = roffW[i * neuron + j + 1]; 48 | 49 | for(int k = begOffW; k < endOffW; k += blockDim.x) { 50 | int colW = colsW[k]; 51 | float valW = valsW[k]; 52 | // if(colW - i * COL_BLK < 0 || colW - i * COL_BLK >= 1024) { 53 | // printf("bugs %d %d %d %d\n", k, i, colW, colW - i * COL_BLK); 54 | // } 55 | atomicAdd(&shRow[colW - i * COL_BLK], valY * valW); 56 | } 57 | } 58 | __syncthreads(); 59 | int count = 0; 60 | for(size_t j = 0; j < COL_BLK; j += blockDim.x * blockDim.y) { 61 | // float v = j + tid < COL_BLK ? shRow[j + tid] + bias : -1; 62 | // count += __syncthreads_count(v > 0); 63 | if(j + tid < COL_BLK) { 64 | Y1[rid * neuron + i * COL_BLK + j + tid] = shRow[j + tid]; 65 | // min(T(32), max(T(0), v)); 66 | } 67 | } 68 | } 69 | } 70 | 71 | void test_benchmark_19_BF(COOMatrix &coo, BFMatrix &matrix, 72 | int neuron, int batch, int TN, 73 | int blockx, int blocky, 74 | GpuEnv &env) { 75 | 76 | float *nextfeat; 77 | float *currfeat; 78 | 79 | int *rowoff; 80 | 81 | int off_size = neuron * (neuron / TN + 1) + 1; 82 | 83 | int *rowindex; 84 | 85 | int weight_nnz = 32 * neuron; 86 | 87 | float *value; 88 | 89 | float bias = 0; 90 | int mybatch = batch; 91 | 92 | // std::vector> input(mybatch, std::vector(neuron, 0.0)); 93 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 94 | memset(input, 0, sizeof(float) * neuron * mybatch); 95 | 96 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 97 | memset(output, 0, sizeof(float) * neuron * mybatch); 98 | 99 | 100 | srand (static_cast (time(0))); 101 | for(int i = 0; i < mybatch; ++i) { 102 | for(int j = 0; j < neuron; ++j) { 103 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 104 | 
input[i * neuron + j] = r2; 105 | } 106 | } 107 | 108 | Safe_Call(cudaMalloc((void**)&rowoff, sizeof(int) * off_size)); 109 | Safe_Call(cudaMemcpy(rowoff, &matrix.rowoff[0], sizeof(int) * off_size, cudaMemcpyHostToDevice)); 110 | 111 | Safe_Call(cudaMalloc((void**)&rowindex, sizeof(int) * weight_nnz)); 112 | Safe_Call(cudaMemcpy(rowindex, &matrix.rowindex[0], sizeof(int) * weight_nnz, cudaMemcpyHostToDevice)); 113 | 114 | Safe_Call(cudaMalloc((void**)&value, sizeof(float) * weight_nnz)); 115 | Safe_Call(cudaMemcpy(value, &matrix.val[0], sizeof(float) * weight_nnz, cudaMemcpyHostToDevice)); 116 | 117 | Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * neuron * mybatch)); 118 | Safe_Call(cudaMemcpy(currfeat, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 119 | 120 | Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * neuron * mybatch)); 121 | Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * neuron * mybatch)); 122 | 123 | std::cout << "begin inference..." << std::endl; 124 | env.add_event("uiuc_kernel_timer"); 125 | env.event_start_record("uiuc_kernel_timer"); 126 | 127 | dim3 block(blockx, blocky); 128 | dim3 grid(batch); 129 | bf_spmm<<>>( 130 | currfeat, nextfeat, rowoff, rowindex, value, TN, neuron / TN, neuron 131 | ); 132 | 133 | env.event_stop_record("uiuc_kernel_timer"); 134 | float time = env.get_event_time("uiuc_kernel_timer"); 135 | 136 | Safe_Call(cudaMemcpy(output, nextfeat, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 137 | 138 | std::cout << "Kernel Exec Time [19-BF] = " << time << "ms"<< std::endl; 139 | std::cout << "Flops [19-BF] = " << float(2 * batch * neuron * 32) / time * 1000 /1e12 << "TFLOPS"<< std::endl; 140 | 141 | CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, true, true); 142 | } 143 | 144 | } 145 | -------------------------------------------------------------------------------- /src/microbenchmark/bf_opt.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace ftxj { 10 | 11 | __global__ void bf_spmm( 12 | 13 | float* Y0, // input 14 | float* Y1, 15 | 16 | int* roffW, // len neuron * N_SLAB - 1 17 | int* colsW, // index 32 * neuron 18 | float* valsW, // all 32 * neuron 0.0625 19 | 20 | int* no_name_len, 21 | int* no_name_idx, 22 | 23 | int COL_BLK, // TN, shared memory size = TN 24 | int N_SLAB, // neuron / TN 25 | int neuron // neuron 26 | 27 | 28 | ) { 29 | 30 | extern __shared__ float shRow[]; 31 | 32 | int tid = threadIdx.y * blockDim.x + threadIdx.x; 33 | int rid = blockIdx.x; 34 | 35 | __syncthreads(); 36 | 37 | for(int i = 0; i < N_SLAB; i++) { 38 | __syncthreads(); 39 | for(int j = threadIdx.x; j < COL_BLK; j++) { 40 | shRow[j] = 0; 41 | } 42 | __syncthreads(); 43 | int no_name_len_beg = no_name_len[i] + threadIdx.y; 44 | int no_name_len_end = no_name_len[i + 1]; 45 | for(int j = no_name_len_beg; j < no_name_len_end; j += blockDim.y) { 46 | int real_j = no_name_idx[j] 47 | float valY = Y0[rid * neuron + real_j]; 48 | // if(valY == 0) { 49 | // continue; 50 | // } 51 | 52 | int begOffW = roffW[i * neuron + real_j] + threadIdx.x; 53 | int endOffW = roffW[i * neuron + real_j + 1]; 54 | 55 | for(int k = begOffW; k < endOffW; k += blockDim.x) { 56 | int colW = colsW[k]; 57 | float valW = valsW[k]; 58 | // if(colW - i * COL_BLK < 0 || colW - i * COL_BLK >= 1024) { 59 | // printf("bugs %d %d %d %d\n", k, i, colW, colW - i * COL_BLK); 60 | // } 61 | atomicAdd(&shRow[colW - i * COL_BLK], valY * valW); 62 | } 63 | } 64 | __syncthreads(); 65 | int count = 0; 66 | for(size_t j = 0; j < COL_BLK; j += blockDim.x * blockDim.y) { 67 | // float v = j + tid < COL_BLK ? 
shRow[j + tid] + bias : -1; 68 | // count += __syncthreads_count(v > 0); 69 | if(j + tid < COL_BLK) { 70 | Y1[rid * neuron + i * COL_BLK + j + tid] = shRow[j + tid]; 71 | // min(T(32), max(T(0), v)); 72 | } 73 | } 74 | } 75 | } 76 | 77 | void test_benchmark_19_BF(COOMatrix &coo, BFMatrix &matrix, 78 | int neuron, int batch, int TN, 79 | int blockx, int blocky, 80 | GpuEnv &env) { 81 | 82 | float *nextfeat; 83 | float *currfeat; 84 | 85 | int *rowoff; 86 | 87 | int off_size = neuron * (neuron / TN + 1) + 1; 88 | 89 | int *rowindex; 90 | 91 | int weight_nnz = 32 * neuron; 92 | 93 | float *value; 94 | 95 | float bias = 0; 96 | int mybatch = batch; 97 | 98 | // std::vector> input(mybatch, std::vector(neuron, 0.0)); 99 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 100 | memset(input, 0, sizeof(float) * neuron * mybatch); 101 | 102 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 103 | memset(output, 0, sizeof(float) * neuron * mybatch); 104 | 105 | 106 | srand (static_cast (time(0))); 107 | for(int i = 0; i < mybatch; ++i) { 108 | for(int j = 0; j < neuron; ++j) { 109 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 110 | input[i * neuron + j] = r2; 111 | } 112 | } 113 | 114 | Safe_Call(cudaMalloc((void**)&rowoff, sizeof(int) * off_size)); 115 | Safe_Call(cudaMemcpy(rowoff, &matrix.rowoff[0], sizeof(int) * off_size, cudaMemcpyHostToDevice)); 116 | 117 | Safe_Call(cudaMalloc((void**)&rowindex, sizeof(int) * weight_nnz)); 118 | Safe_Call(cudaMemcpy(rowindex, &matrix.rowindex[0], sizeof(int) * weight_nnz, cudaMemcpyHostToDevice)); 119 | 120 | Safe_Call(cudaMalloc((void**)&value, sizeof(float) * weight_nnz)); 121 | Safe_Call(cudaMemcpy(value, &matrix.val[0], sizeof(float) * weight_nnz, cudaMemcpyHostToDevice)); 122 | 123 | Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * neuron * mybatch)); 124 | Safe_Call(cudaMemcpy(currfeat, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 125 | 126 | 
Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * neuron * mybatch)); 127 | Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * neuron * mybatch)); 128 | 129 | std::cout << "begin inference..." << std::endl; 130 | env.add_event("uiuc_kernel_timer"); 131 | env.event_start_record("uiuc_kernel_timer"); 132 | 133 | dim3 block(blockx, blocky); 134 | dim3 grid(batch); 135 | bf_spmm<<>>( 136 | currfeat, nextfeat, rowoff, rowindex, value, TN, neuron / TN, neuron 137 | ); 138 | 139 | env.event_stop_record("uiuc_kernel_timer"); 140 | float time = env.get_event_time("uiuc_kernel_timer"); 141 | 142 | Safe_Call(cudaMemcpy(output, nextfeat, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 143 | 144 | std::cout << "Kernel Exec Time [19-BF] = " << time << "ms"<< std::endl; 145 | std::cout << "Flops [19-BF] = " << float(2 * batch * neuron * 32) / time * 1000 /1e12 << "TFLOPS"<< std::endl; 146 | 147 | CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, true, true); 148 | } 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/microbenchmark/cusparse_spmm.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace ftxj { 14 | 15 | #define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}} 16 | 17 | 18 | void test_benchmark_cusparse(COOMatrix& coo, cuSPARSEMatrix &matrix, int neuron, int batch) { 19 | 20 | float * input = (float*)malloc(sizeof(float) * neuron * batch); 21 | memset(input, 0, sizeof(float) * neuron * batch); 22 | 23 | float * output = (float*)malloc(sizeof(float) * neuron * batch); 24 | memset(output, 0, sizeof(float) * neuron * batch); 25 | 26 | 27 | srand 
(static_cast (time(0))); 28 | for(int i = 0; i < batch; ++i) { 29 | for(int j = 0; j < neuron; ++j) { 30 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 31 | input[i * neuron + j] = r2; 32 | } 33 | } 34 | 35 | float *A_d; 36 | float *B_d; 37 | 38 | int* len_d; 39 | int* index_d; 40 | float* val_d; 41 | 42 | Safe_Call(cudaMalloc((void**)&A_d, sizeof(float) * neuron * batch)); 43 | Safe_Call(cudaMemcpy(A_d, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice)); 44 | 45 | Safe_Call(cudaMalloc((void**)&B_d, sizeof(float) * neuron * batch)); 46 | Safe_Call(cudaMemset(B_d, 0, sizeof(float) * neuron * batch)); 47 | 48 | 49 | Safe_Call(cudaMalloc((void**)&len_d, sizeof(int) * (neuron + 1))); 50 | Safe_Call(cudaMemcpy(len_d, matrix.len, sizeof(int) * (neuron + 1), cudaMemcpyHostToDevice)); 51 | 52 | Safe_Call(cudaMalloc((void**)&index_d, sizeof(int) * (neuron * 32))); 53 | Safe_Call(cudaMemcpy(index_d, matrix.index, sizeof(int) * (neuron * 32), cudaMemcpyHostToDevice)); 54 | 55 | Safe_Call(cudaMalloc((void**)&val_d, sizeof(float) * (neuron * 32))); 56 | Safe_Call(cudaMemcpy(val_d, matrix.val, sizeof(float) * (neuron * 32), cudaMemcpyHostToDevice)); 57 | 58 | 59 | 60 | cusparseHandle_t handle = NULL; 61 | cusparseSpMatDescr_t matA; 62 | cusparseDnMatDescr_t matB, matC; 63 | void* dBuffer = NULL; 64 | size_t bufferSize = 0; 65 | float alpha = 1.0f; 66 | float beta = 0.0f; 67 | 68 | CUSPARSE_CHECK( cusparseCreate(&handle) ) 69 | // Create sparse matrix A in CSR format 70 | 71 | CUSPARSE_CHECK(cusparseCreateCsr(&matA, neuron, neuron, 32 * neuron, 72 | len_d, index_d, val_d, 73 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 74 | CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)) 75 | // Create dense matrix B 76 | CUSPARSE_CHECK(cusparseCreateDnMat(&matB, neuron, batch, neuron, A_d, 77 | CUDA_R_32F, CUSPARSE_ORDER_COL) ) 78 | 79 | // Create dense matrix C 80 | CUSPARSE_CHECK(cusparseCreateDnMat(&matC, neuron, batch, neuron, B_d, 81 | CUDA_R_32F, CUSPARSE_ORDER_COL) ) 
82 | 83 | 84 | 85 | Safe_Call(cudaMalloc(&dBuffer, bufferSize)); 86 | 87 | 88 | CUSPARSE_CHECK(cusparseSpMM_bufferSize( 89 | handle, 90 | CUSPARSE_OPERATION_NON_TRANSPOSE, 91 | CUSPARSE_OPERATION_NON_TRANSPOSE, 92 | &alpha, matA, matB, &beta, matC, CUDA_R_32F, 93 | 94 | CUSPARSE_CSRMM_ALG1, &bufferSize) ) 95 | 96 | cudaEvent_t start, stop; 97 | cudaEventCreate(&start); 98 | cudaEventCreate(&stop); 99 | cudaEventRecord(start, 0); 100 | 101 | 102 | CUSPARSE_CHECK( cusparseSpMM(handle, 103 | CUSPARSE_OPERATION_NON_TRANSPOSE, 104 | CUSPARSE_OPERATION_NON_TRANSPOSE, 105 | &alpha, matA, matB, &beta, matC, CUDA_R_32F, 106 | CUSPARSE_MM_ALG_DEFAULT, dBuffer) ) 107 | 108 | cudaEventRecord(stop,0); 109 | cudaEventSynchronize(stop); 110 | float elapsed; 111 | cudaEventElapsedTime(&elapsed, start, stop); //ms 112 | 113 | // destroy matrix/vector descriptors 114 | CUSPARSE_CHECK( cusparseDestroySpMat(matA) ) 115 | CUSPARSE_CHECK( cusparseDestroyDnMat(matB) ) 116 | CUSPARSE_CHECK( cusparseDestroyDnMat(matC) ) 117 | CUSPARSE_CHECK( cusparseDestroy(handle) ) 118 | 119 | Safe_Call(cudaMemcpy(output, B_d, neuron * batch * sizeof(float), cudaMemcpyDeviceToHost)); 120 | 121 | std::cout << "kernel time = " << elapsed << "ms" << std::endl; 122 | std::cout << "Flops [cuSparse] = " << float(2 * batch * neuron * 32) / elapsed * 1000 /1e12 << "TFLOPS"<< std::endl; 123 | 124 | CpuSpmm::run_and_cmp(coo, input, neuron, batch, output, false); 125 | 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/microbenchmark/header.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../utils/header.h" 3 | 4 | namespace ftxj { 5 | 6 | void test_benchmark_succ_load_store(int, int, GpuEnv &); 7 | void test_benchmark_matrix_transpose(int batch, int neuron, GpuEnv &env); 8 | 9 | 10 | void test_benchmark_20_uiuc(COOMatrix&, UIUCMatrix &, int , GpuEnv &); 11 | void 
test_benchmark_row_succ_20_uiuc(COOMatrix&, std::vector &, std::vector &, int, int, GpuEnv &); 12 | void test_benchmark_row_succ_20_uiuc_transpose(COOMatrix&, std::vector &, std::vector &, int, int, GpuEnv &); 13 | void test_benchmark_row_succ_input_transpose_batch_parallel(COOMatrix&, std::vector &, std::vector &, int, int, GpuEnv &); 14 | void test_benchmark_rectangels_batch_parallel_kernel(COOMatrix&, std::vector &, std::vector &, int, int, GpuEnv &); 15 | 16 | void test_benchmark_graph_challenge(std::vector> &input, 17 | std::vector> &weight, std::vector> &row_access, 18 | int batch, int neuron, float bias,GpuEnv &env 19 | ); 20 | 21 | void test_benchmark_matrix_transpose_and_delete(int batch, int neuron, GpuEnv &env); 22 | 23 | void test_benchmark_n16384_l2_l10_kernel(COOMatrix& coo, std::vector &val, int stride, int batch, int neuron, GpuEnv &env); 24 | void test_benchmark_n16384_l11_kernel(COOMatrix& coo, std::vector &B_val, std::vector &B_index, int batch, int neuron, GpuEnv &env); 25 | 26 | void test_benchmark_fused_layer1024_0_1( 27 | std::vector> &input, 28 | std::vector& coo, 29 | std::vector> &weight, 30 | std::vector> &row_access, 31 | int batch, 32 | int neuron, 33 | float bias, 34 | GpuEnv &env 35 | ); 36 | 37 | void test_benchmark_cusparse(COOMatrix& coo, 38 | cuSPARSEMatrix &matrix, 39 | int neuron, int batch); 40 | 41 | 42 | void test_benchmark_19_BF( 43 | COOMatrix &coo, BFMatrix &matrix, 44 | int neuron, int batch, int TN, 45 | int blockx, int blocky, 46 | GpuEnv &env 47 | ); 48 | 49 | void test_benchmark_fuse_cmp_layer1024_0_1( 50 | std::vector> &input, 51 | std::vector> &weight, 52 | std::vector> &row_access, 53 | int batch, 54 | int neuron, 55 | float bias, 56 | GpuEnv &env 57 | ); 58 | // void test_benchmark_n16384_l11_kernel( 59 | // COOMatrix& coo, 60 | // std::vector &B_val, 61 | // std::vector &B_index, 62 | // std::vector &A_row_access, 63 | // std::vector &A_row_access_len, 64 | // int max_input_access, 65 | // int batch, int neuron, 
66 | // GpuEnv &env 67 | // ); 68 | 69 | 70 | 71 | void test_benchmark_SNIG( 72 | std::vector> &input, 73 | std::vector &weights, 74 | int batch, 75 | int neuron, 76 | int sec_size, 77 | int nnzs, 78 | float bias, 79 | GpuEnv &env 80 | ); 81 | void vector4_load_data_benchmark(GpuEnv &env); 82 | void test_shared_memory_mm(COOMatrix&, std::vector &val, std::vector &row_access, GpuEnv &env); 83 | }; 84 | -------------------------------------------------------------------------------- /src/microbenchmark/load-data.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | namespace ftxj { 5 | 6 | #define BLOCK_LOAD (256 * 10) 7 | #define VECTOR_BLOCK_LOAD (32 * 2) 8 | 9 | __global__ void naive_copy(float *nextfeat, float *currfeat){ 10 | extern __shared__ float shared[]; 11 | int i = blockIdx.x * BLOCK_LOAD; 12 | for(int j = threadIdx.x; j < BLOCK_LOAD; j += blockDim.x) { 13 | shared[j] = currfeat[i + j]; 14 | } 15 | __syncthreads(); 16 | for(int j = threadIdx.x; j < BLOCK_LOAD; j += blockDim.x) { 17 | nextfeat[i + j] = shared[j] + 1; 18 | } 19 | }; 20 | 21 | 22 | __global__ void uiuc_copy(float *nextfeat, float *currfeat) { 23 | extern __shared__ float shared[]; 24 | 25 | int i = blockIdx.x * 12 * 16384 + blockIdx.y * 256; 26 | 27 | for(int j = threadIdx.x; j < 12 * 256; j += blockDim.x) { 28 | shared[j] = currfeat[i + j]; 29 | } 30 | __syncthreads(); 31 | for(int j = threadIdx.x; j < 12 * 256; j += blockDim.x) { 32 | nextfeat[i + j] = shared[j] + 1; 33 | } 34 | }; 35 | 36 | __global__ void vector4_copy(float* nextfeat, float* currfeat) { 37 | int idx = blockIdx.x * VECTOR_BLOCK_LOAD; 38 | float4* pin = reinterpret_cast(currfeat); 39 | float4* pout = reinterpret_cast(nextfeat); 40 | for(int i = threadIdx.x; i < VECTOR_BLOCK_LOAD; i += blockDim.x) { 41 | pout[idx + i] = pin[idx + i]; 42 | } 43 | }; 44 | 45 | 46 | void vector4_load_data_benchmark(GpuEnv &env) { 
47 | float *nextfeat; 48 | float *currfeat; 49 | 50 | int mybatch = 60000; 51 | int neuron = 1024; 52 | 53 | std::vector> input(mybatch, std::vector(neuron, 1.0)); 54 | 55 | Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * mybatch * neuron)); 56 | Safe_Call(cudaMemcpy(currfeat, &input[0][0], sizeof(float) * mybatch * neuron, cudaMemcpyHostToDevice)); 57 | 58 | Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * mybatch * neuron)); 59 | Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * mybatch * neuron)); 60 | 61 | 62 | env.add_event("float4 copy"); 63 | env.event_start_record("float4 copy"); 64 | 65 | dim3 block(64); 66 | dim3 grid((mybatch * neuron) / BLOCK_LOAD); 67 | 68 | vector4_copy<<>>( 69 | nextfeat, currfeat 70 | ); 71 | 72 | env.event_stop_record("float4 copy"); 73 | float time2 = env.get_event_time("float4 copy"); 74 | 75 | std::cout << "float4 bandwidth = " << 2 * (mybatch * (float)neuron * sizeof(float)) / (time2 / 1000) / 1024.0 / 1024.0 / 1024.0 << "GB/s" << std::endl; 76 | 77 | std::cout << "data load and write timer = " << time2 << std::endl; 78 | } 79 | 80 | void test_benchmark_succ_load_store(int mybatch, int neuron, GpuEnv &env) { 81 | float *nextfeat; 82 | float *currfeat; 83 | std::vector> input(mybatch, std::vector(neuron, 1.0)); 84 | 85 | Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * mybatch * neuron)); 86 | Safe_Call(cudaMemcpy(currfeat, &input[0][0], sizeof(float) * mybatch * neuron, cudaMemcpyHostToDevice)); 87 | 88 | Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * mybatch * neuron)); 89 | Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * mybatch * neuron)); 90 | 91 | env.add_event("naive copy"); 92 | env.event_start_record("naive copy"); 93 | 94 | dim3 block(256); 95 | dim3 grid((mybatch * neuron) /BLOCK_LOAD); 96 | 97 | naive_copy<<>>( 98 | nextfeat, currfeat 99 | ); 100 | 101 | env.event_stop_record("naive copy"); 102 | 103 | float time1 = env.get_event_time("naive copy"); 104 | 105 | std::cout << 
"Load&Store Time [Succ] = " << time1 << "ms" << std::endl; 106 | std::cout << "Load&Store Bandwidth [Succ] = " << 2 * (mybatch * (float)neuron * sizeof(float)) / (time1 / 1000) / 1024.0 / 1024.0 / 1024.0 << "GB/s" << std::endl; 107 | 108 | } 109 | }; -------------------------------------------------------------------------------- /src/microbenchmark/matrix_transpose.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | #define TILE_DIM 64 9 | #define BLOCK_ROWS 16 10 | 11 | __global__ void matrix_transpose(float * __restrict__ odata, float * __restrict__ idata, int neuron, int batch) { 12 | 13 | __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 14 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 15 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 16 | 17 | for (int j = 0; j < TILE_DIM && (y + j) < batch; j += BLOCK_ROWS) { 18 | tile[(threadIdx.y + j)][threadIdx.x] = idata[(y + j) * neuron + x]; 19 | } 20 | 21 | __syncthreads(); 22 | 23 | 24 | x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset 25 | y = blockIdx.x * TILE_DIM + threadIdx.y; 26 | 27 | for (int j = 0; j < TILE_DIM && x < batch; j += BLOCK_ROWS) { 28 | odata[(y+j) * batch + x] = tile[threadIdx.x][threadIdx.y + j]; 29 | } 30 | }; 31 | 32 | void test_benchmark_matrix_transpose(int batch, int neuron, GpuEnv &env) { 33 | 34 | float *A; 35 | float *C; 36 | float * input = (float*)malloc(sizeof(float) * neuron * batch); 37 | memset(input, 0, sizeof(float) * neuron * batch); 38 | 39 | float * output = (float*)malloc(sizeof(float) * neuron * batch); 40 | memset(output, 0, sizeof(float) * neuron * batch); 41 | 42 | srand (static_cast (time(0))); 43 | for(int i = 0; i < batch; ++i) { 44 | for(int j = 0; j < neuron; ++j) { 45 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 46 | input[i * neuron + j] = r2; 47 | } 48 | } 49 | 
50 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * batch)); 51 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice)); 52 | 53 | Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * batch)); 54 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * batch)); 55 | 56 | std::string event = "transpose"; 57 | env.add_event(event); 58 | env.event_start_record(event); 59 | 60 | 61 | dim3 grid((neuron + TILE_DIM - 1) / TILE_DIM, (batch + TILE_DIM - 1) / TILE_DIM); 62 | dim3 block(TILE_DIM, BLOCK_ROWS); 63 | 64 | matrix_transpose<<>>( 66 | C, A, neuron, batch 67 | ); 68 | 69 | env.event_stop_record(event); 70 | 71 | float time = env.get_event_time(event); 72 | 73 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * batch, cudaMemcpyDeviceToHost)); 74 | 75 | std::cout << "Kernel Exec Time [transpose] = " << time << "ms" < 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | #define TILE_DIM 64 9 | #define BLOCK_ROWS 16 10 | 11 | __global__ void matrix_re_transpose_and_delete( 12 | float * __restrict__ odata, 13 | float * __restrict__ idata, 14 | int * __restrict__ old_to_new_map, 15 | int neuron, int batch) { 16 | 17 | __shared__ float tile[TILE_DIM][TILE_DIM + 1]; 18 | int x = blockIdx.x * TILE_DIM + threadIdx.x; 19 | int y = blockIdx.y * TILE_DIM + threadIdx.y; 20 | 21 | for (int j = 0; j < TILE_DIM && x < batch; j += BLOCK_ROWS) { 22 | tile[(threadIdx.y + j)][threadIdx.x] = idata[(y + j) * batch + x]; 23 | } 24 | 25 | __syncthreads(); 26 | 27 | 28 | x = blockIdx.y * TILE_DIM + threadIdx.x; // old row 29 | y = blockIdx.x * TILE_DIM + threadIdx.y; // old batch 30 | 31 | 32 | for (int j = 0; j < TILE_DIM && (y+j) < batch; j += BLOCK_ROWS) { 33 | if(old_to_new_map[y + j] == -1) continue; 34 | int tmp = old_to_new_map[y + j]; // new batch 35 | odata[tmp * neuron + x] = tile[threadIdx.x][threadIdx.y + j]; 36 | } 37 | }; 38 | 39 | void 
test_benchmark_matrix_transpose_and_delete(int batch, int neuron, GpuEnv &env) { 40 | 41 | float *A; 42 | float *C; 43 | int* old_to_new_map_d; 44 | 45 | float * input = (float*)malloc(sizeof(float) * neuron * batch); 46 | memset(input, 0, sizeof(float) * neuron * batch); 47 | 48 | float * output = (float*)malloc(sizeof(float) * neuron * batch); 49 | memset(output, 0, sizeof(float) * neuron * batch); 50 | 51 | srand (static_cast (time(0))); 52 | for(int i = 0; i < batch; ++i) { 53 | for(int j = 0; j < neuron; ++j) { 54 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 55 | input[i * neuron + j] = r2; 56 | } 57 | } 58 | 59 | 60 | int * old_to_new_map = (int*)malloc(sizeof(int) * batch); 61 | for(int i = 0; i < 2; ++i) { 62 | old_to_new_map[i] = -1; 63 | } 64 | for(int i = 2; i < batch; ++i) { 65 | old_to_new_map[i] = i - 2; 66 | } 67 | 68 | int new_batch = batch - 2; 69 | 70 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * batch)); 71 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice)); 72 | 73 | Safe_Call(cudaMalloc((void**)&old_to_new_map_d, sizeof(int) * batch)); 74 | Safe_Call(cudaMemcpy(old_to_new_map_d, old_to_new_map, sizeof(int) * batch, cudaMemcpyHostToDevice)); 75 | 76 | Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * batch)); 77 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * batch)); 78 | 79 | std::string event = "transpose_and_delete"; 80 | env.add_event(event); 81 | env.event_start_record(event); 82 | 83 | 84 | dim3 grid((batch + TILE_DIM - 1) / TILE_DIM, (neuron + TILE_DIM - 1) / TILE_DIM); 85 | dim3 block(TILE_DIM, BLOCK_ROWS); 86 | 87 | matrix_re_transpose_and_delete<<>>( 89 | C, A, old_to_new_map_d, neuron, batch 90 | ); 91 | 92 | env.event_stop_record(event); 93 | 94 | float time = env.get_event_time(event); 95 | 96 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * new_batch, cudaMemcpyDeviceToHost)); 97 | 98 | 99 | std::cout << output[1 * neuron + 0] << ", 
"; 100 | std::cout << std::endl; 101 | std::cout << "Kernel Exec Time [transpose] = " << time << "ms" <> &input, 7 | std::vector> &weight, 8 | std::vector> &row_access, 9 | int batch, 10 | int neuron, 11 | float bias, 12 | int gpu_index, 13 | int 14 | ); 15 | }; -------------------------------------------------------------------------------- /src/microbenchmark/n16384-l11.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | __device__ inline float __ReLU(float x) { 9 | return x<0.0?0.0:x>32.0?32.0:x; 10 | }; 11 | 12 | #define OUT_CHANNEL 16 13 | // batch parallel 14 | __global__ void n16384_l11_kernel( 15 | float * __restrict__ A, 16 | float * __restrict__ B, 17 | float * __restrict__ C, 18 | int* __restrict__ index, 19 | int batch, 20 | int neuron, 21 | float bias) { 22 | 23 | extern __shared__ float shared[]; 24 | 25 | 26 | for(int n = threadIdx.x; n < OUT_CHANNEL * 32; n += blockDim.x){ 27 | shared[n] = B[(blockIdx.y * OUT_CHANNEL * 32) + n]; 28 | } 29 | __syncthreads(); 30 | 31 | if((blockIdx.x * blockDim.x + threadIdx.x) >= batch) return; 32 | 33 | int begin_idx = blockIdx.y * OUT_CHANNEL / 16 * 32; 34 | for(int o_r = 0; o_r < OUT_CHANNEL / 16; ++o_r) { 35 | float reduce[16] = {0.0}; 36 | int idx = begin_idx + o_r * 32; 37 | for(int r = 0; r < 32; ++r) { 38 | int row_idx = index[idx + r]; 39 | float val = A[row_idx * batch + blockIdx.x * blockDim.x + threadIdx.x]; 40 | // float val = 1.0; 41 | for(int c = 0; c < 16; c += 8) { 42 | // if(o_r == 0 && blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && c == 0) { 43 | // printf("%f * %f\n", shared[o_r * 32 * 16 + r * 16 + c], val); 44 | // } 45 | reduce[c + 0] += val * shared[o_r * 32 * 16 + r * 16 + c + 0]; 46 | reduce[c + 1] += val * shared[o_r * 32 * 16 + r * 16 + c + 1]; 47 | reduce[c + 2] += val * shared[o_r * 32 * 16 + r * 16 + c + 2]; 
48 | reduce[c + 3] += val * shared[o_r * 32 * 16 + r * 16 + c + 3]; 49 | 50 | reduce[c + 4] += val * shared[o_r * 32 * 16 + r * 16 + c + 4]; 51 | reduce[c + 5] += val * shared[o_r * 32 * 16 + r * 16 + c + 5]; 52 | reduce[c + 6] += val * shared[o_r * 32 * 16 + r * 16 + c + 6]; 53 | reduce[c + 7] += val * shared[o_r * 32 * 16 + r * 16 + c + 7]; 54 | 55 | } 56 | } 57 | for(int c = 0; c < 16; ++c) { 58 | C[(blockIdx.y * OUT_CHANNEL + o_r * 16 + c) * batch + blockIdx.x * blockDim.x + threadIdx.x] = reduce[c]; 59 | } 60 | } 61 | } 62 | 63 | void test_benchmark_n16384_l11_kernel( 64 | COOMatrix& coo, 65 | std::vector &B_val, 66 | std::vector &B_index, 67 | int batch, int neuron, 68 | GpuEnv &env) { 69 | 70 | float *A; 71 | float *B; 72 | float *C; 73 | int* B_index_d; 74 | 75 | int mybatch = batch; 76 | 77 | int bias = 0; 78 | 79 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 80 | memset(input, 0, sizeof(float) * neuron * mybatch); 81 | 82 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 83 | memset(output, 0, sizeof(float) * neuron * mybatch); 84 | 85 | srand (static_cast (time(0))); 86 | for(int i = 0; i < mybatch; ++i) { 87 | for(int j = 0; j < neuron; ++j) { 88 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 89 | input[i * neuron + j] = r2; 90 | } 91 | } 92 | 93 | float* W = (float*)malloc(sizeof(float) * B_val.size()); 94 | for(int i = 0; i < B_val.size(); ++i) { 95 | W[i] = B_val[i]; 96 | } 97 | 98 | int* W_idx = (int*)malloc(sizeof(int) * B_index.size()); 99 | for(int i = 0; i < B_index.size(); ++i) { 100 | W_idx[i] = B_index[i]; 101 | } 102 | 103 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch)); 104 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 105 | 106 | Safe_Call(cudaMalloc((void**)&B, sizeof(float) * B_val.size())); 107 | Safe_Call(cudaMemcpy(B, W, sizeof(float) * B_val.size(), cudaMemcpyHostToDevice)); 108 | 109 | 
Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch)); 110 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch)); 111 | 112 | Safe_Call(cudaMalloc((void**)&B_index_d, sizeof(float) * B_index.size())); 113 | Safe_Call(cudaMemcpy(B_index_d, W_idx, sizeof(float) * B_index.size(), cudaMemcpyHostToDevice)); 114 | 115 | env.add_event("row-succ-20-uiuc-kernel"); 116 | env.event_start_record("row-succ-20-uiuc-kernel"); 117 | 118 | int blocksize = 256; 119 | dim3 block(blocksize); 120 | dim3 grid((mybatch + blocksize - 1) / blocksize, neuron / OUT_CHANNEL); 121 | 122 | n16384_l11_kernel<<>>( 123 | A, B, C, B_index_d, batch, neuron, bias 124 | ); 125 | 126 | env.event_stop_record("row-succ-20-uiuc-kernel"); 127 | 128 | float time = env.get_event_time("row-succ-20-uiuc-kernel"); 129 | 130 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 131 | 132 | std::cout << "Kernel Exec Time [20-uiuc-row-succ-transpose] = " << time << "ms" < 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | __device__ inline float __ReLU(float x){ 9 | return x<0.0?0.0:x>32.0?32.0:x; 10 | }; 11 | 12 | #define MINIBATCH 8 13 | #define UNROLL 8 14 | 15 | __global__ void n16384_l2_l11_kernel( 16 | float * __restrict__ A, 17 | float * __restrict__ B, 18 | float * __restrict__ C, 19 | int stride, 20 | int neuron, 21 | int batch, 22 | float bias) { 23 | 24 | extern __shared__ float shared[]; 25 | int start_idx1 = (blockDim.x / 16) * (blockIdx.y) * 16; 26 | int start_idx2 = (blockDim.x / 16) * (blockIdx.y) * 16 + stride; 27 | int load_num = stride > blockDim.x ? 
32 * (blockDim.x / 16) : stride + 16 * (blockDim.x / 16); 28 | int shared_size = ((load_num + 31) / 32) * 32; 29 | int col_gropu = threadIdx.x / 16; 30 | 31 | 32 | for(int n = threadIdx.x; n < load_num * MINIBATCH; n += blockDim.x){ 33 | int f = n / load_num; 34 | int k = n % load_num; 35 | int a_k = ((stride > blockDim.x) && (k >= blockDim.x)) ? (k - blockDim.x) + start_idx2 : k + start_idx1; 36 | // if(blockIdx.x == 0 && blockIdx.y == 0 && f == 0) { 37 | // printf("block 0 load %d\n", a_k); 38 | // } 39 | shared[f * shared_size + k] = A[(blockIdx.x * MINIBATCH + f) * neuron + (a_k) % neuron]; 40 | } 41 | 42 | __syncthreads(); 43 | 44 | int gap = stride >= blockDim.x ? blockDim.x : stride; 45 | 46 | float res[MINIBATCH] = {0.0}; 47 | 48 | for(int r = 0; r < 32; ++r) { 49 | float val = B[(blockIdx.y * blockDim.x * 32) + r * blockDim.x + threadIdx.x]; 50 | int idx = col_gropu * 16 + (r >= 16? r + gap - 16 : r); 51 | for(int f = 0; f < MINIBATCH / UNROLL; ++f) { 52 | if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && f == 0) { 53 | printf("%d %f * %f\n", idx, shared[(f * UNROLL + 0) * shared_size + idx], val); 54 | } 55 | res[0 + f * UNROLL] += shared[(f * UNROLL + 0) * shared_size + idx] * val; 56 | res[1 + f * UNROLL] += shared[(f * UNROLL + 1) * shared_size + idx] * val; 57 | res[2 + f * UNROLL] += shared[(f * UNROLL + 2) * shared_size + idx] * val; 58 | res[3 + f * UNROLL] += shared[(f * UNROLL + 3) * shared_size + idx] * val; 59 | res[4 + f * UNROLL] += shared[(f * UNROLL + 4) * shared_size + idx] * val; 60 | res[5 + f * UNROLL] += shared[(f * UNROLL + 5) * shared_size + idx] * val; 61 | res[6 + f * UNROLL] += shared[(f * UNROLL + 6) * shared_size + idx] * val; 62 | res[7 + f * UNROLL] += shared[(f * UNROLL + 7) * shared_size + idx] * val; 63 | } 64 | } 65 | for(int f = 0; f < MINIBATCH ; ++f) { 66 | C[(blockIdx.x * MINIBATCH + f) * neuron + blockIdx.y * 128 + threadIdx.x] = res[f]; 67 | } 68 | } 69 | 70 | void 
test_benchmark_n16384_l2_l10_kernel(COOMatrix& coo, std::vector &val, int stride, int batch, int neuron, GpuEnv &env) { 71 | float *A; 72 | float *B; 73 | float *C; 74 | 75 | int bias = 0; 76 | 77 | float * input = (float*)malloc(sizeof(float) * neuron * batch); 78 | memset(input, 0, sizeof(float) * neuron * batch); 79 | 80 | float * output = (float*)malloc(sizeof(float) * neuron * batch); 81 | memset(output, 0, sizeof(float) * neuron * batch); 82 | 83 | srand (static_cast (time(0))); 84 | for(int i = 0; i < batch; ++i) { 85 | for(int j = 0; j < neuron; ++j) { 86 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 87 | input[i * neuron + j] = r2; 88 | } 89 | } 90 | 91 | float* W = (float*)malloc(sizeof(float) * val.size()); 92 | for(int i = 0; i < val.size(); ++i) { 93 | W[i] = val[i]; 94 | } 95 | 96 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * batch)); 97 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice)); 98 | 99 | Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size())); 100 | Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice)); 101 | 102 | Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * batch)); 103 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * batch)); 104 | 105 | std::string event = "test_n16384_l2_l10"; 106 | env.add_event(event); 107 | env.event_start_record(event); 108 | 109 | int blocksize = 128; 110 | int load_num = stride > blocksize ? 
32 * (blocksize / 16) : stride + 16 * (blocksize / 16); 111 | int shared_size = ((load_num + 31) / 32) * 32; 112 | dim3 block(blocksize); 113 | dim3 grid((batch + MINIBATCH - 1)/ MINIBATCH, (neuron + 128 - 1) / 128); 114 | n16384_l2_l11_kernel<<>>( 115 | A, B, C, stride, neuron, batch, bias 116 | ); 117 | env.event_stop_record(event); 118 | float time = env.get_event_time(event); 119 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * batch, cudaMemcpyDeviceToHost)); 120 | std::cout << "Kernel Exec Time [n16384-l2-l10] = " << time << "ms" < 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | __device__ inline float __ReLU(float x){ 9 | return x<0.0?0.0:x>32.0?32.0:x; 10 | }; 11 | 12 | #define MINIBATCH 32 13 | 14 | __global__ void rectangels_batch_parallel_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index16x16, int neuron, int batch, float bias) { 15 | 16 | extern __shared__ float shared[]; 17 | 18 | for(int n = threadIdx.x; n < 128 * 32; n += blockDim.x){ 19 | shared[n] = B[(blockIdx.y * 128 * 32) + n]; 20 | } 21 | __syncthreads(); 22 | 23 | int start_idx = index16x16[blockIdx.y]; 24 | for(int f = 0; f < 256; ++f) { 25 | for(int i = threadIdx.x; i < 128; i += blockDim.x) { 26 | shared[i + 128 * 32] = 1.0; 27 | // A[(blockIdx.x * 256 + f) * neuron + (start_idx + i) % neuron]; 28 | } 29 | __syncthreads(); 30 | 31 | float res = 0; 32 | 33 | int idx_beg = (threadIdx.x / 16) * 16; 34 | 35 | for(int r = 0; r < 32; ++r) { 36 | res += shared[r * 128 + threadIdx.x] * shared[128 * 32 + idx_beg + r]; 37 | // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1 && f == 0) { 38 | // printf("%f * %f\n", shared[r * 128 + threadIdx.x], shared[128 * 32 + idx_beg + r]); 39 | // } 40 | } 41 | // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1 && f == 0) { 42 | // printf("RES = %f\n", res); 43 | // } 44 | C[(blockIdx.x * 256 + f) * neuron + 
blockIdx.y * 128 + threadIdx.x] = res; 45 | __syncthreads(); 46 | } 47 | } 48 | 49 | void test_benchmark_rectangels_batch_parallel_kernel(COOMatrix& coo, std::vector &val, std::vector &row_access, int batch, int neuron, GpuEnv &env) { 50 | 51 | float *A; 52 | float *B; 53 | float *C; 54 | int *index; 55 | 56 | int mybatch = batch; 57 | 58 | int bias = 0; 59 | 60 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 61 | memset(input, 1.0, sizeof(float) * neuron * mybatch); 62 | 63 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 64 | memset(output, 0, sizeof(float) * neuron * mybatch); 65 | 66 | // srand (static_cast (time(0))); 67 | // for(int i = 0; i < mybatch; ++i) { 68 | // for(int j = 0; j < neuron; ++j) { 69 | // float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 70 | // input[i * neuron + j] = r2; 71 | // } 72 | // } 73 | 74 | 75 | float* W = (float*)malloc(sizeof(float) * val.size()); 76 | for(int i = 0; i < val.size(); ++i) { 77 | W[i] = val[i]; 78 | } 79 | 80 | int* access = (int*)malloc(sizeof(int) * row_access.size()); 81 | for(int i = 0; i < row_access.size(); ++i) { 82 | access[i] = row_access[i]; 83 | } 84 | 85 | 86 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch)); 87 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 88 | 89 | Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size())); 90 | Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice)); 91 | 92 | Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch)); 93 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch)); 94 | 95 | Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size())); 96 | Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice)); 97 | 98 | env.add_event("row-succ-20-uiuc-kernel"); 99 | env.event_start_record("row-succ-20-uiuc-kernel"); 100 | 101 | int blocksize = 128; 102 | 
dim3 block(blocksize); 103 | dim3 grid(mybatch / (256), neuron / blocksize); 104 | 105 | rectangels_batch_parallel_kernel<<>>( 106 | A, B, C, index, neuron, batch, bias 107 | ); 108 | 109 | env.event_stop_record("row-succ-20-uiuc-kernel"); 110 | 111 | float time = env.get_event_time("row-succ-20-uiuc-kernel"); 112 | 113 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 114 | 115 | std::cout << "Kernel Exec Time [20-uiuc-row-succ] = " << time << "ms" < 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | __device__ inline float __ReLU(float x){ 9 | return x<0.0?0.0:x>32.0?32.0:x; 10 | }; 11 | 12 | #define MINIBATCH 32 13 | 14 | __global__ void uiuc_transpose_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index, int neuron, int batch, float bias) { 15 | 16 | extern __shared__ float shared[]; 17 | float reduce[MINIBATCH] = {0.0}; 18 | 19 | int groupIdx = threadIdx.x / 32; 20 | int groupNum = blockDim.x / 32; 21 | int lane = threadIdx.x % 32; 22 | 23 | for(int n = threadIdx.x; n < 256 * MINIBATCH; n += blockDim.x){ 24 | int idx = index[blockIdx.y * 256 + n / 32]; 25 | shared[n] = A[idx * batch + blockIdx.x * MINIBATCH + lane]; 26 | } 27 | __syncthreads(); 28 | 29 | for(int r = 0; r < 32; ++r){ 30 | float val = B[blockIdx.y * 256 * 32 + r * 256 + threadIdx.x]; 31 | for(int f = 0; f < MINIBATCH; f++) { 32 | // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && f == 0) { 33 | // printf("%f * %f %d\n", shared[(threadIdx.x / 32 + r) * MINIBATCH + f], val, index[blockIdx.y * 256]); 34 | // } 35 | reduce[f] += shared[(threadIdx.x / 32 + r) * MINIBATCH + f] * val; // bank conflict!! 
36 | } 37 | } 38 | 39 | __syncthreads(); 40 | // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0) { 41 | // printf("res = %f\n", reduce[0]); 42 | // } 43 | 44 | for(int f = 0; f < MINIBATCH; ++f){ 45 | shared[threadIdx.x * MINIBATCH + f] = reduce[f]; 46 | } 47 | 48 | __syncthreads(); 49 | 50 | for(int n = threadIdx.x; n < 256 * MINIBATCH; n += blockDim.x){ 51 | C[(blockIdx.y * 256 + n / MINIBATCH) * batch + blockIdx.x * MINIBATCH + n % MINIBATCH] = shared[(threadIdx.x / MINIBATCH) * MINIBATCH + (n % MINIBATCH)]; 52 | } 53 | } 54 | 55 | void test_benchmark_row_succ_20_uiuc_transpose(COOMatrix& coo, std::vector &val, std::vector &row_access, int batch, int neuron, GpuEnv &env) { 56 | 57 | float *A; 58 | float *B; 59 | float *C; 60 | int *index; 61 | 62 | int mybatch = batch; 63 | 64 | int bias = 0; 65 | 66 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 67 | memset(input, 0, sizeof(float) * neuron * mybatch); 68 | 69 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 70 | memset(output, 0, sizeof(float) * neuron * mybatch); 71 | 72 | srand (static_cast (time(0))); 73 | for(int i = 0; i < mybatch; ++i) { 74 | for(int j = 0; j < neuron; ++j) { 75 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 76 | input[i * neuron + j] = r2; 77 | } 78 | } 79 | 80 | 81 | float* W = (float*)malloc(sizeof(float) * val.size()); 82 | for(int i = 0; i < val.size(); ++i) { 83 | W[i] = val[i]; 84 | } 85 | 86 | int* access = (int*)malloc(sizeof(int) * row_access.size()); 87 | for(int i = 0; i < row_access.size(); ++i) { 88 | access[i] = row_access[i]; 89 | } 90 | 91 | 92 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch)); 93 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 94 | 95 | Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size())); 96 | Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice)); 97 | 98 | 
Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch)); 99 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch)); 100 | 101 | Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size())); 102 | Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice)); 103 | 104 | env.add_event("row-succ-20-uiuc-kernel"); 105 | env.event_start_record("row-succ-20-uiuc-kernel"); 106 | 107 | int blocksize = 256; 108 | dim3 block(blocksize); 109 | dim3 grid(mybatch / (MINIBATCH), neuron / blocksize); 110 | 111 | uiuc_transpose_kernel<<>>( 112 | A, B, C, index, neuron, batch, bias 113 | ); 114 | 115 | env.event_stop_record("row-succ-20-uiuc-kernel"); 116 | 117 | float time = env.get_event_time("row-succ-20-uiuc-kernel"); 118 | 119 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 120 | 121 | std::cout << "Kernel Exec Time [20-uiuc-row-succ-transpose] = " << time << "ms" < 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | __device__ inline float __ReLU(float x){ 9 | return x<0.0?0.0:x>32.0?32.0:x; 10 | }; 11 | 12 | #define MINIBATCH 32 13 | 14 | __global__ void uiuc_cut_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index, int neuron, int batch, float bias) { 15 | 16 | extern __shared__ float shared[]; 17 | float reduce[MINIBATCH] = {0.0}; 18 | 19 | 20 | int idx = index[(blockIdx.y * blockDim.x + threadIdx.x) / 32 + threadIdx.x % 32]; 21 | 22 | for(unsigned int f = 0; f < MINIBATCH; f++) { 23 | shared[f * blockDim.x + threadIdx.x] = A[(blockIdx.x * MINIBATCH + f) * neuron + idx]; 24 | } 25 | __syncthreads(); 26 | for(int r = 0; r < 32; ++r){ 27 | float val = B[blockIdx.y * blockDim.x * 32 + r * blockDim.x + threadIdx.x]; 28 | for(int f = 0; f < MINIBATCH; f++) { 29 | // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1 && f == 0) { 30 | // 
printf("%f * %f\n", shared[f * blockDim.x + (threadIdx.x / 32+ r)], val); 31 | // } 32 | reduce[f] += shared[f * blockDim.x + (threadIdx.x / 32 + r)] * val; 33 | } 34 | } 35 | 36 | // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1) { 37 | // printf("res = %f\n", reduce[0]); 38 | // } 39 | 40 | int widx1 = (blockIdx.y * blockDim.x) / 2; 41 | int widx2 = (blockIdx.y * blockDim.x) / 2 + 512; 42 | int wgroup = threadIdx.x / 32; 43 | 44 | int widx = threadIdx.x % 32 > 16 ? widx2 + wgroup * 16 + threadIdx.x % 32 - 16 : widx1 + wgroup * 16 + threadIdx.x % 32; 45 | 46 | for(int f = 0; f < MINIBATCH; f++) { 47 | C[(blockIdx.x * MINIBATCH + f) * neuron + widx] = reduce[f]; 48 | } 49 | } 50 | 51 | void test_benchmark_row_succ_20_uiuc(COOMatrix& coo, std::vector &val, std::vector &row_access, int batch, int neuron, GpuEnv &env) { 52 | 53 | float *A; 54 | float *B; 55 | float *C; 56 | int *index; 57 | 58 | int mybatch = batch; 59 | 60 | int bias = 0; 61 | 62 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 63 | memset(input, 0, sizeof(float) * neuron * mybatch); 64 | 65 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 66 | memset(output, 0, sizeof(float) * neuron * mybatch); 67 | 68 | srand (static_cast (time(0))); 69 | for(int i = 0; i < mybatch; ++i) { 70 | for(int j = 0; j < neuron; ++j) { 71 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 72 | input[i * neuron + j] = r2; 73 | } 74 | } 75 | 76 | 77 | float* W = (float*)malloc(sizeof(float) * val.size()); 78 | for(int i = 0; i < val.size(); ++i) { 79 | W[i] = val[i]; 80 | } 81 | 82 | int* access = (int*)malloc(sizeof(int) * row_access.size()); 83 | for(int i = 0; i < row_access.size(); ++i) { 84 | access[i] = row_access[i]; 85 | } 86 | 87 | 88 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch)); 89 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 90 | 91 | Safe_Call(cudaMalloc((void**)&B, 
sizeof(float) * val.size())); 92 | Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice)); 93 | 94 | Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch)); 95 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch)); 96 | 97 | Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size())); 98 | Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice)); 99 | 100 | env.add_event("row-succ-20-uiuc-kernel"); 101 | env.event_start_record("row-succ-20-uiuc-kernel"); 102 | 103 | int blocksize = 256; 104 | dim3 block(blocksize); 105 | dim3 grid(mybatch / (MINIBATCH), neuron / blocksize); 106 | 107 | uiuc_cut_kernel<<>>( 108 | A, B, C, index, neuron, batch, bias 109 | ); 110 | 111 | env.event_stop_record("row-succ-20-uiuc-kernel"); 112 | 113 | float time = env.get_event_time("row-succ-20-uiuc-kernel"); 114 | 115 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 116 | 117 | std::cout << "Kernel Exec Time [20-uiuc-row-succ] = " << time << "ms" < 2 | #include "../gpu_lib/header.h" 3 | #include "../utils/header.h" 4 | #include 5 | #include 6 | namespace ftxj { 7 | 8 | __device__ inline float __ReLU(float x) { 9 | return x<0.0?0.0:x>32.0?32.0:x; 10 | }; 11 | 12 | __global__ void batch_parallel_16384x32succ_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index, int neuron, int batch, float bias) { 13 | extern __shared__ float shared[]; 14 | float reduce[32] = {0.0}; 15 | 16 | for(int n = threadIdx.x; n < 32 * 32; n += blockDim.x){ 17 | shared[n] = B[(blockIdx.y * 32 * 32) + n]; 18 | } 19 | __syncthreads(); 20 | if((blockIdx.x * blockDim.x + threadIdx.x) >= batch) return; 21 | 22 | for(int r = 0; r < 32; ++r) { 23 | int row_idx = index[blockIdx.y * 32 + r]; 24 | float val = A[row_idx * batch + blockIdx.x * blockDim.x + threadIdx.x]; 25 | for(int c = 0; c < 32; ++c){ 26 | reduce[c] += shared[r * 32 + 
c] * val; 27 | } 28 | } 29 | __syncthreads(); 30 | for(int c = 0; c < 16; ++c) { 31 | C[(blockIdx.y * 16 + c) * batch + blockIdx.x * blockDim.x + threadIdx.x] = reduce[c]; 32 | } 33 | for(int c = 16; c < 32; ++c) { 34 | C[(neuron / 2 + blockIdx.y * 16 + c - 16) * batch + blockIdx.x * blockDim.x + threadIdx.x] = reduce[c]; 35 | } 36 | } 37 | 38 | void test_benchmark_row_succ_input_transpose_batch_parallel(COOMatrix& coo, std::vector &val, std::vector &row_access, int batch, int neuron, GpuEnv &env) { 39 | 40 | float *A; 41 | float *B; 42 | float *C; 43 | int *index; 44 | 45 | int mybatch = batch; 46 | 47 | int bias = 0; 48 | 49 | float * input = (float*)malloc(sizeof(float) * neuron * mybatch); 50 | memset(input, 0, sizeof(float) * neuron * mybatch); 51 | 52 | float * output = (float*)malloc(sizeof(float) * neuron * mybatch); 53 | memset(output, 0, sizeof(float) * neuron * mybatch); 54 | 55 | srand (static_cast (time(0))); 56 | for(int i = 0; i < mybatch; ++i) { 57 | for(int j = 0; j < neuron; ++j) { 58 | float r2 = static_cast (rand()) / (static_cast (RAND_MAX/32.0)); 59 | input[i * neuron + j] = r2; 60 | } 61 | } 62 | 63 | 64 | float* W = (float*)malloc(sizeof(float) * val.size()); 65 | for(int i = 0; i < val.size(); ++i) { 66 | W[i] = val[i]; 67 | } 68 | 69 | int* access = (int*)malloc(sizeof(int) * row_access.size()); 70 | for(int i = 0; i < row_access.size(); ++i) { 71 | access[i] = row_access[i]; 72 | } 73 | 74 | 75 | Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch)); 76 | Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice)); 77 | 78 | Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size())); 79 | Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice)); 80 | 81 | Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch)); 82 | Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch)); 83 | 84 | Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size())); 
85 | Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice)); 86 | 87 | env.add_event("row-succ-20-uiuc-kernel"); 88 | env.event_start_record("row-succ-20-uiuc-kernel"); 89 | 90 | int blocksize = 256; 91 | dim3 block(blocksize); 92 | dim3 grid((mybatch + blocksize - 1) / blocksize, neuron / 32); 93 | 94 | batch_parallel_16384x32succ_kernel<<>>( 95 | A, B, C, index, neuron, batch, bias 96 | ); 97 | 98 | env.event_stop_record("row-succ-20-uiuc-kernel"); 99 | 100 | float time = env.get_event_time("row-succ-20-uiuc-kernel"); 101 | 102 | Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost)); 103 | 104 | std::cout << "Kernel Exec Time [20-uiuc-row-succ-transpose] = " << time << "ms" < 8 | #include 9 | #include 10 | #include 11 | #include 12 | using namespace ftxj; 13 | 14 | 15 | std::string get_weight_file_name(int neuron, int layer) { 16 | std::string weight_file_dir = "../data/neuron"; 17 | std::string neuron_str = std::to_string(neuron); 18 | weight_file_dir += neuron_str + "/n" + neuron_str + "-l" + std::to_string(layer + 1) + ".tsv"; 19 | return weight_file_dir; 20 | } 21 | 22 | void dense_reorder(std::vector> &input, Reorder &reorder_class) { 23 | // std::vector> old = input; 24 | for(int i = 0; i < input.size(); ++i) { 25 | std::vector tmp(input[i].size()); 26 | for(int j = 0; j < input[i].size(); ++j) { 27 | auto new_j = reorder_class.reorder(j); 28 | tmp[new_j] = input[i][j]; 29 | } 30 | input[i] = tmp; 31 | } 32 | } 33 | 34 | void read_input(std::vector> &input, int neuron, int batch) { 35 | std::string input_file_name = "../data/sparse-images-"; 36 | input_file_name += std::to_string(neuron) + ".tsv"; 37 | std::ifstream input_file(input_file_name); 38 | if(!input_file){ 39 | std::cout << "FILE:" << input_file_name << " does not exists.\n"; 40 | exit(-1); 41 | } 42 | int b, n; 43 | float val; 44 | long read_num = 0; 45 | while(input_file >> b >> n >> val) { 46 | if(b <= batch) { 47 | 
read_num++; 48 | input[b - 1][n - 1] = val; 49 | if(val != 1.00) { 50 | printf("read input %d, %f\n", b, val); 51 | } 52 | } 53 | } 54 | std::cout << "Read Input success! read_numeber = " << read_num << std::endl; 55 | } 56 | 57 | int main(int argc, char* argv[]) { 58 | 59 | char hostname[MPI_MAX_PROCESSOR_NAME]; 60 | int task_count; 61 | int rank; 62 | int len; 63 | int ret; 64 | 65 | 66 | int neuron = atoi(argv[1]); 67 | int batch = atoi(argv[2]); 68 | int layer = atoi(argv[3]); 69 | // int nnzs = atoi(argv[4]); 70 | 71 | ret = MPI_Init(&argc, &argv); 72 | if (MPI_SUCCESS != ret) { 73 | printf("start mpi fail\n"); 74 | MPI_Abort(MPI_COMM_WORLD, ret); 75 | } 76 | 77 | MPI_Comm_size(MPI_COMM_WORLD, &task_count); 78 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 79 | MPI_Get_processor_name(hostname, &len); 80 | 81 | if(rank == 0) 82 | printf("task_count = %d, my rank = %d on %s\n", task_count, rank, hostname); 83 | 84 | 85 | // int neuron = 16384; 86 | // int batch = 60000; 87 | // int layer = 1920; 88 | 89 | std::map hash_map = { 90 | {65536, 4096}, 91 | {16384, 1024}, 92 | {4096, 256}, 93 | {1024, 64} 94 | }; 95 | 96 | std::map bias_map = { 97 | {65536, -0.45}, 98 | {16384, -0.4}, 99 | {4096, -0.35}, 100 | {1024, -0.3} 101 | }; 102 | 103 | std::map type_1 = { 104 | {65536, 12}, 105 | {16384, 10}, 106 | {4096, 8}, 107 | {1024, 6} 108 | }; 109 | 110 | std::vector> input(batch, std::vector(neuron)); 111 | std::vector> weight; 112 | std::vector> row_access; 113 | 114 | 115 | std::cout << "GPU[" << rank << "] " << "[BEGIN]..." << std::endl; 116 | read_input(input, neuron, batch); 117 | std::cout << "GPU[" << rank << "] " << "Read Input success!" 
<< std::endl; 118 | HashReorder hash_reorder_t(hash_map[neuron], neuron); 119 | dense_reorder(input, hash_reorder_t); 120 | 121 | for(int l = 0; l < layer; ++l) { 122 | auto weight_file = get_weight_file_name(neuron, l); 123 | COOMatrix coo(weight_file, 1, false); 124 | std::cout << "GPU[" << rank << "] " << "["<< weight_file << "] to COO success!" << std::endl; 125 | coo.reorder(hash_reorder_t); 126 | std::cout << "GPU[" << rank << "] " << "Reorder success!" << std::endl; 127 | CSRCSCMatrix csr_csc(coo); 128 | csr_csc.transpose(); 129 | BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method); 130 | std::cout << "GPU[" << rank << "] " << "Structural Info success!" << std::endl; 131 | MaxInReuseBSchedule schedule(blocks); 132 | if(l == 0) { 133 | schedule.schedule(16, 7); 134 | } 135 | else if(l < type_1[neuron]) { 136 | schedule.schedule_output_parallel(128, 1, false); 137 | } 138 | else { 139 | schedule.schedule(128, 1); 140 | } 141 | std::cout << "GPU[" << rank << "] " << "Schedule succ" << std::endl; 142 | auto data = schedule.get_data(neuron); 143 | weight.push_back(data.value); 144 | row_access.push_back(data.row_access); 145 | } 146 | int gpu_id = 0; 147 | if(rank == 0) gpu_id = 0; 148 | if(rank == 1) gpu_id = 1; 149 | if(rank == 2) gpu_id = 2; 150 | if(rank == 3) gpu_id = 3; 151 | test_benchmark_multi_gpu_graph_challenge(input, weight, row_access, (batch + 1) / 2, neuron, bias_map[neuron], gpu_id, rank); 152 | std::cout << "GPU[" << rank << "] " <<"[END]..." 
<< std::endl; 153 | MPI_Barrier(MPI_COMM_WORLD); 154 | MPI_Finalize(); 155 | return 0; 156 | } -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/add_singlegpu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/multi_gpu/add_mpi/add_singlegpu -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/add_singlegpu.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void add_kernel(int n, float a, float *x, float *y) 4 | { 5 | int i = blockIdx.x*blockDim.x + threadIdx.x; 6 | if (i < n) y[i] = a*x[i] + y[i]; 7 | } 8 | 9 | int main() 10 | { 11 | 12 | //printf("gpu count : %d\n",N_GPU); 13 | 14 | //Arrange the task of each GPU 15 | int N = 1<<30; 16 | 17 | cudaSetDevice(0); 18 | 19 | float *x, *y, *d_x, *d_y; 20 | x = (float*)malloc(N*sizeof(float)); 21 | y = (float*)malloc(N*sizeof(float)); 22 | 23 | cudaMalloc(&d_x, N*sizeof(float)); 24 | cudaMalloc(&d_y, N*sizeof(float)); 25 | 26 | for (int i = 0; i < N; i++) { 27 | x[i] = 1.0f; 28 | y[i] = 2.0f; 29 | } 30 | 31 | cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); 32 | cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); 33 | 34 | float time_elapsed=0; 35 | cudaEvent_t start,stop; 36 | cudaEventCreate(&start); //创建Event 37 | cudaEventCreate(&stop); 38 | cudaEventRecord( start,0); //记录当前时间 39 | 40 | // Perform SAXPY on 1M elements 41 | add_kernel<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y); 42 | 43 | cudaEventRecord(stop,0); //记录当前时间 44 | cudaEventSynchronize(start); //Waits for an event to complete. 
45 | cudaEventSynchronize(stop); //Waits for an event to complete.Record之前的任务 46 | cudaEventElapsedTime(&time_elapsed,start,stop); //计算时间差 47 | 48 | 49 | 50 | 51 | 52 | cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost); 53 | 54 | float maxError = 0.0f; 55 | for (int i = 0; i < N; i++) 56 | maxError = max(maxError, abs(y[i]-4.0f)); 57 | printf("Max error: %f\n", maxError); 58 | cudaEventDestroy(start); //destory the event 59 | cudaEventDestroy(stop); 60 | printf("执行时间:%f(ms)\n",time_elapsed); 61 | } -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/makefile: -------------------------------------------------------------------------------- 1 | test:saxpy.o mpi_call.o 2 | mpicxx mpi_call.o saxpy.o -L/usr/local/cuda/lib64 -lcudart -o test 3 | saxpy.o:saxpy.cu 4 | nvcc -c saxpy.cu -o saxpy.o 5 | mpi_call.o:mpi_call.cpp 6 | mpicxx -c mpi_call.cpp -o mpi_call.o 7 | clean: 8 | rm -f *.o -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/mpi_call.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "vars.h" 5 | 6 | int main(int argc, char *argv[]) { 7 | char hostname[MPI_MAX_PROCESSOR_NAME]; 8 | int task_count; 9 | int rank; 10 | int len; 11 | int ret; 12 | 13 | ret = MPI_Init(&argc, &argv); 14 | if (MPI_SUCCESS != ret) { 15 | printf("start mpi fail\n"); 16 | MPI_Abort(MPI_COMM_WORLD, ret); 17 | } 18 | 19 | MPI_Comm_size(MPI_COMM_WORLD, &task_count); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 21 | MPI_Get_processor_name(hostname, &len); 22 | 23 | printf("task_count = %d, my rank = %d on %s\n", task_count, rank, hostname); 24 | 25 | float esp_time_cpu; 26 | clock_t start_cpu, stop_cpu; 27 | 28 | start_cpu = clock();// start timing 29 | 30 | handle(rank);//在此调用用cuda写的函数 31 | stop_cpu = clock();// end timing 32 | 33 | esp_time_cpu = (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC; 
34 | 35 | printf("The time by host:\t%f(ms)\n", esp_time_cpu); 36 | 37 | MPI_Finalize(); 38 | 39 | return 0; 40 | } -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/mpi_call.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/multi_gpu/add_mpi/mpi_call.o -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/run_volta.sh: -------------------------------------------------------------------------------- 1 | mpirun -np 4 ./test -------------------------------------------------------------------------------- /src/multi_gpu/add_mpi/saxpy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | __global__ void add_kernel(int n, float a, float *x, float *y) 4 | { 5 | int i = blockIdx.x*blockDim.x + threadIdx.x; 6 | if (i < n) y[i] = a*x[i] + y[i]; 7 | } 8 | 9 | void handle(int gpu_number) 10 | { 11 | int N_GPU; 12 | cudaGetDeviceCount(&N_GPU); 13 | //printf("gpu count : %d\n",N_GPU); 14 | 15 | //Arrange the task of each GPU 16 | int N = ((1<<30)+N_GPU - 1)/N_GPU; 17 | 18 | cudaSetDevice(gpu_number); 19 | 20 | float *x, *y, *d_x, *d_y; 21 | x = (float*)malloc(N*sizeof(float)); 22 | y = (float*)malloc(N*sizeof(float)); 23 | 24 | cudaMalloc(&d_x, N*sizeof(float)); 25 | cudaMalloc(&d_y, N*sizeof(float)); 26 | 27 | for (int i = 0; i < N; i++) { 28 | x[i] = 1.0f; 29 | y[i] = 2.0f; 30 | } 31 | 32 | cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); 33 | cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice); 34 | 35 | float time_elapsed=0; 36 | cudaEvent_t start,stop; 37 | cudaEventCreate(&start); //创建Event 38 | cudaEventCreate(&stop); 39 | cudaEventRecord( start,0); //记录当前时间 40 | 41 | // Perform SAXPY on 1M elements 42 | add_kernel<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y); 43 
// Kernel: adds the constant b to one element of g_a per thread.
// The element index is the thread's global 1-D index. There is no bounds
// check, so the launch must not create more threads than g_a has elements.
__global__ void kernelAddConstant(int *g_a, const int b)
{
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    g_a[element] = g_a[element] + b;
}
// Predicate: returns 1 when data[i] == i + b for every i in [0, n),
// and 0 as soon as one element differs. An empty range (n <= 0) yields 1.
int correctResult(int *data, const int n, const int b)
{
    int pos = 0;
    // Advance while the elements match; stop at the first mismatch.
    while(pos < n && data[pos] == pos + b)
        ++pos;
    return (pos < n) ? 0 : 1;
}
For example, try omp_set_num_threads(2*num_gpus); 73 | // Recall that all variables declared inside an "omp parallel" scope are 74 | // local to each CPU thread 75 | // 76 | omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices 77 | //omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there are CUDA devices 78 | #pragma omp parallel 79 | { 80 | unsigned int cpu_thread_id = omp_get_thread_num(); 81 | unsigned int num_cpu_threads = omp_get_num_threads(); 82 | 83 | // set and check the CUDA device for this CPU thread 84 | int gpu_id = -1; 85 | cudaSetDevice(cpu_thread_id % num_gpus); // "% num_gpus" allows more CPU threads than GPU devices 86 | cudaGetDevice(&gpu_id); 87 | 88 | printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id); 89 | 90 | int *d_a = 0; // pointer to memory on the device associated with this CPU thread 91 | int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data 92 | unsigned int nbytes_per_kernel = nbytes / num_cpu_threads; 93 | dim3 gpu_threads(128); // 128 threads per block 94 | dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads)); 95 | 96 | cudaMalloc((void**)&d_a, nbytes_per_kernel); 97 | cudaMemset(d_a, 0, nbytes_per_kernel); 98 | cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice); 99 | kernelAddConstant<<>>(d_a, b); 100 | 101 | cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost); 102 | cudaFree(d_a); 103 | 104 | 105 | } 106 | printf("---------------------------\n"); 107 | 108 | if(cudaSuccess != cudaGetLastError()) 109 | printf("%s\n", cudaGetErrorString(cudaGetLastError())); 110 | 111 | 112 | //////////////////////////////////////////////////////////////// 113 | // check the result 114 | // 115 | if(correctResult(a, n, b)) 116 | printf("Test PASSED\n"); 117 | else 118 | printf("Test FAILED\n"); 119 | 120 | free(a); // free CPU memory 121 | 122 | //cudaThreadExit(); 123 | 124 
// Splits a 1M-element vector addition (y = x + y) across all visible GPUs.
// Each GPU gets its own stream and device buffers and processes Np elements;
// the partial results are gathered into one host array and checked against
// the expected value 3.0f.
int main(void)
{
    const int N = 1<<20; // 1M elements

    // Get the number of CUDA-capable GPUs
    int N_GPU = 0;
    cudaGetDeviceCount(&N_GPU);
    printf("gpu count : %d\n",N_GPU);
    if(N_GPU < 1)
    {
        // Without this guard the split below divides by zero.
        printf("no CUDA capable devices were detected\n");
        return 1;
    }

    // Arrange the task of each GPU (ceiling division)
    const int Np = (N + N_GPU - 1) / N_GPU;

    // Create GPU plans on the heap (the original used a variable-length
    // array, which is not standard C++).
    TGPUplan *plan = (TGPUplan *)malloc(N_GPU * sizeof(TGPUplan));
    // Gathered result; 4 MiB is too large to place safely on the stack.
    float *y = (float *)malloc(N * sizeof(float));
    if(plan == NULL || y == NULL)
    {
        printf("couldn't allocate CPU memory\n");
        free(plan);
        free(y);
        return 1;
    }

    // Initializing
    for(int i = 0; i < N_GPU; i++)
    {
        cudaSetDevice(i);
        cudaStreamCreate(&plan[i].stream);

        cudaMalloc((void **)&plan[i].d_x, Np * sizeof(float));
        cudaMalloc((void **)&plan[i].d_y, Np * sizeof(float));
        plan[i].h_x = (float *)malloc(Np * sizeof(float));
        plan[i].h_y = (float *)malloc(Np * sizeof(float));
        plan[i].h_yp = (float *)malloc(Np * sizeof(float));
        if(plan[i].h_x == NULL || plan[i].h_y == NULL || plan[i].h_yp == NULL)
        {
            printf("couldn't allocate CPU memory\n");
            return 1;
        }

        for(int j = 0; j < Np; j++)
        {
            plan[i].h_x[j] = 1.0f;
            plan[i].h_y[j] = 2.0f;
        }
    }

    int blockSize = 256;
    int numBlock = (Np + blockSize - 1) / blockSize;

    // NOTE(review): the clock is stopped right after the work is enqueued,
    // before any stream is synchronised, exactly as in the original.
    clock_t start, finish;
    start = clock();

    for(int i = 0; i < N_GPU; i++)
    {
        //Set device
        cudaSetDevice(i);

        //Copy input data from CPU
        cudaMemcpyAsync(plan[i].d_x, plan[i].h_x, Np * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
        cudaMemcpyAsync(plan[i].d_y, plan[i].h_y, Np * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
        //Run the kernel function on GPU
        // NOTE(review): the launch configuration was lost in the extracted
        // source; reconstructed as numBlock x blockSize on this plan's
        // stream - confirm against the original file.
        add<<<numBlock, blockSize, 0, plan[i].stream>>>(Np, plan[i].d_x, plan[i].d_y);

        //Read back GPU results
        cudaMemcpyAsync(plan[i].h_yp, plan[i].d_y, Np * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream);
    }
    finish = clock();
    float duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf("GPU Kernel time: %f\n",duration);

    //Process GPU results
    for(int i = 0; i < N_GPU; i++)
    {
        //Set device
        cudaSetDevice(i);

        //Wait for all operations to finish
        cudaStreamSynchronize(plan[i].stream);

        //Get the final results; the last GPU may own fewer than Np elements
        for(int j = 0; j < Np; j++)
            if(Np * i + j < N)
                y[Np * i + j]=plan[i].h_yp[j];

        //shut down this GPU
        cudaFree(plan[i].d_x);
        cudaFree(plan[i].d_y);
        free(plan[i].h_x);
        free(plan[i].h_y);
        free(plan[i].h_yp); // was leaked in the original
        cudaStreamDestroy(plan[i].stream); //Destroy the stream
    }

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    free(y);
    free(plan);

    return 0;

}
// Loads the sparse input images for the given neuron count into `input`.
//
// The file ../data/sparse-images-<neuron>.tsv is read as whitespace-separated
// triples "b n val" with 1-based indices b and n; val is stored at
// input[b - 1][n - 1]. Entries with b > batch are ignored. Entries whose
// indices fall outside `input` are skipped and counted, where the original
// wrote out of bounds.
//
// input  - destination matrix, indexed [b - 1][n - 1]
// neuron - selects the file name and bounds the n index
// batch  - highest b index that is read
//
// Terminates the process with exit(-1) when the file cannot be opened.
// NOTE(review): the element type of `input` was lost in the extracted source;
// float is reconstructed from the type of `val`.
void read_input(std::vector<std::vector<float>> &input, int neuron, int batch) {
    std::string input_file_name = "../data/sparse-images-";
    input_file_name += std::to_string(neuron) + ".tsv";
    std::ifstream input_file(input_file_name);
    if(!input_file){
        std::cout << "FILE:" << input_file_name << " does not exists.\n";
        exit(-1);
    }
    int b, n;
    float val;
    long read_num = 0;
    long skipped = 0;
    while(input_file >> b >> n >> val) {
        if(b > batch) {
            continue; // beyond the requested batch: ignored on purpose
        }
        // Reject indices that would address memory outside `input`.
        if(b < 1 || n < 1 || n > neuron
            || static_cast<size_t>(b) > input.size()
            || static_cast<size_t>(n) > input[b - 1].size()) {
            skipped++;
            continue;
        }
        read_num++;
        input[b - 1][n - 1] = val;
        if(val != 1.00) {
            printf("read input %d, %f\n", b, val);
        }
    }
    if(skipped != 0) {
        std::cerr << "WARNING: skipped " << skipped << " out-of-range entries in " << input_file_name << std::endl;
    }
    std::cout << "Read Input success! read_numeber = " << read_num << std::endl;
}
<< std::endl; 107 | MaxInReuseBSchedule schedule(blocks); 108 | if(l == 0) { 109 | schedule.schedule(16, 7); 110 | } 111 | else if(l < type_1[neuron]) { 112 | schedule.schedule_output_parallel(128, 1, false); 113 | } 114 | else { 115 | schedule.schedule(128, 1); 116 | } 117 | std::cout << "Schedule succ" << std::endl; 118 | auto data = schedule.get_data(neuron); 119 | weight.push_back(data.value); 120 | row_access.push_back(data.row_access); 121 | } 122 | GpuEnv env(0); 123 | test_benchmark_graph_challenge(input, weight, row_access, batch, neuron, bias_map[neuron], env); 124 | std::cout << "[END]..." << std::endl; 125 | return 0; 126 | } -------------------------------------------------------------------------------- /src/reorder/hash.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../utils/header.h" 4 | #include "reorder.h" 5 | 6 | namespace ftxj { 7 | class HashReorder : public Reorder { 8 | 9 | int buckets_num_; 10 | int max_domain_; 11 | int buckets_width; 12 | REORDER type_; 13 | 14 | int hash(int v) { 15 | if(v >= max_domain_) { 16 | std::cout << "ERROR: Hasing Function error!" 
<< std::endl; 17 | exit(-1); 18 | } 19 | int col = v % buckets_num_; 20 | int row = v / buckets_num_; 21 | return row + col * buckets_width; 22 | } 23 | public: 24 | HashReorder(int buckets_num, int max_domain, REORDER type = ALL_REORDER) 25 | : buckets_num_(buckets_num), max_domain_(max_domain), type_(type) { 26 | buckets_width = max_domain / buckets_num_; 27 | } 28 | 29 | int reorder(int r) { 30 | return hash(r); 31 | } 32 | 33 | MatrixPos new_pos(const MatrixPos &old_pos) { 34 | MatrixPos n_pos = old_pos; 35 | if(type_ == COL_REORDER || type_ == ALL_REORDER) { 36 | n_pos.col_idx = hash(n_pos.col_idx); 37 | } 38 | if(type_ == ROW_REORDER || type_ == ALL_REORDER) { 39 | n_pos.row_idx = hash(n_pos.row_idx); 40 | } 41 | return n_pos; 42 | } 43 | }; 44 | } -------------------------------------------------------------------------------- /src/reorder/header.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "reorder.h" 4 | #include "hash.h" -------------------------------------------------------------------------------- /src/reorder/reorder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../utils/matrix_base.h" 4 | 5 | namespace ftxj { 6 | 7 | enum REORDER { 8 | COL_REORDER, 9 | ROW_REORDER, 10 | ALL_REORDER 11 | }; 12 | 13 | class Reorder { 14 | public: 15 | virtual MatrixPos new_pos(const MatrixPos &old_pos) = 0; 16 | virtual int reorder(int r) = 0; 17 | 18 | }; 19 | 20 | }; -------------------------------------------------------------------------------- /src/run_bf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for col_blk in 1024 512 256 128 64 32 16 3 | do 4 | for blockDim in 1024 512 256 128 64 5 | do 6 | for((blockx=1; blockx<=blockDim; blockx+=blockx)) 7 | do 8 | blocky=`expr $blockDim / $blockx` 9 | echo "Run Config" 10 | echo $col_blk 11 | echo $blockx 12 | echo 
$blocky 13 | ./bf 1024 1000 1 2 $col_blk $blockx $blocky 14 | done 15 | done 16 | done -------------------------------------------------------------------------------- /src/utils/cpu_spmm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "matrix.h" 6 | #include "debug.h" 7 | 8 | namespace ftxj { 9 | class CpuSpmm { 10 | public: 11 | static void run_and_cmp(COOMatrix &weight, float* input, int neuron, int batch, float* output, bool T = false, bool resT = true, bool inputT = true) { 12 | weight.to_row_first_ordered(); 13 | std::vector> res(batch, std::vector(neuron, 0.0)); 14 | for(int b = 0; b < batch; ++b) { 15 | if(b % 10000 == 0) std::cout << "run " << b << "..." << std::endl; 16 | for(auto iter = weight.begin(); iter != weight.end(); ++iter) { 17 | int row = (*iter).row; 18 | int col = (*iter).col; 19 | float val = (*iter).val; 20 | float in = 0.0; 21 | if(T) { 22 | if(inputT) in = input[b * neuron + row]; 23 | else in = input[row * batch + b]; 24 | res[b][col] += in * val; 25 | // if(b == 1 && col == 16352) { 26 | // printf("%f * %f %d\n", in, val, row); 27 | // // } 28 | // if(b == 0 && col == 62) { 29 | // printf("0 %f * %f %d\n", in, val, row); 30 | // } 31 | } 32 | else { 33 | if(inputT) in = input[b * neuron + col]; 34 | else in = input[col * batch + b]; 35 | res[b][row] += in * val; 36 | // if(b == 1 && row == 16352) { 37 | // printf("%f * %f %d\n", in, val, col); 38 | // } 39 | // if(b == 0 && row == 62) { 40 | // printf("0 %f * %f %d\n", in, val, col); 41 | // } 42 | } 43 | } 44 | for(int j = 0; j < neuron; ++j) { 45 | float cmp = 0; 46 | if(resT) cmp = output[b * neuron + j]; 47 | else cmp = output[j * batch + b]; 48 | if(std::abs(res[b][j] - cmp) > 1e-3) { 49 | std::cout << b << ", " << j << " cpu=" << res[b][j] << ", gpu=" << cmp << std::endl; 50 | assert_msg(res[b][j] == cmp, "cpu gpu doesnot equals!"); 51 | } 52 | } 53 | } 54 | std::cout << 
"Compare with cpu result [Success]" << std::endl; 55 | } 56 | }; 57 | }; -------------------------------------------------------------------------------- /src/utils/cpu_spmm_fuse.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "matrix.h" 6 | #include "debug.h" 7 | 8 | namespace ftxj { 9 | class CpuSpmmFuse { 10 | public: 11 | static void run_and_cmp(std::vector &weight, float* input, int neuron, int batch, int bias, float* output, int fuse_layer, bool T = false, bool resT = true, bool inputT = true) { 12 | for(int i = 0; i < fuse_layer; ++i) { 13 | weight[i].to_row_first_ordered(); 14 | } 15 | std::vector> res1(batch, std::vector(neuron, 0.0)); 16 | std::vector> res2(batch, std::vector(neuron, 0.0)); 17 | for(int b = 0; b < batch; ++b) { 18 | if(b % 10000 == 0) std::cout << "run " << b << "..." << std::endl; 19 | for(auto iter = weight[0].begin(); iter != weight[0].end(); ++iter) { 20 | int row = (*iter).row; 21 | int col = (*iter).col; 22 | float val = (*iter).val; 23 | float in = 0.0; 24 | if(T) { 25 | if(inputT) in = input[b * neuron + row]; 26 | else in = input[row * batch + b]; 27 | res1[b][col] += in * val; 28 | // if(b == 8 && col == 0) { 29 | // printf("%f * %f = %f\n", in, val, res1[b][col]); 30 | // } 31 | } 32 | else { 33 | if(inputT) in = input[b * neuron + col]; 34 | else in = input[col * batch + b]; 35 | res1[b][row] += in * val; 36 | // if(b == 8 && row == 0) { 37 | // printf("%f * %f = %f\n", in, val, res1[b][row]); 38 | // } 39 | } 40 | } 41 | for(int j = 0; j < neuron; ++j) { 42 | // res1[b][j] = res1[b][j]; 43 | // if(b == 8 && j == 0) { 44 | // printf("res1 = %f\n", res1[b][j]); 45 | // } 46 | res1[b][j] = ((res1[b][j] + bias) > 32 ? 32.0 : ((res1[b][j] + bias) < 0) ? 
0 : res1[b][j] + bias); 47 | } 48 | } 49 | for(int l = 1; l < fuse_layer; ++l) { 50 | for(int b = 0; b < batch; ++b) { 51 | if(b % 10000 == 0) std::cout << "run l = " << l << ", b = " << b << "..." << std::endl; 52 | for(auto iter = weight[l].begin(); iter != weight[l].end(); ++iter) { 53 | int row = (*iter).row; 54 | int col = (*iter).col; 55 | float val = (*iter).val; 56 | float in = 0.0; 57 | if(T) { 58 | in = res1[b][row]; 59 | res2[b][col] += in * val; 60 | // if(b == 8 && col == 0) { 61 | // printf("%f * %f %d\n", in, val, row); 62 | // } 63 | } 64 | else { 65 | in = res1[b][col]; 66 | res2[b][row] += in * val; 67 | // if(b == 8 && row == 0) { 68 | // printf("%f * %f %d\n", in, val, col); 69 | // } 70 | } 71 | } 72 | for(int j = 0; j < neuron; ++j) { 73 | // res2[b][j] = res2[b][j]; 74 | res2[b][j] = ((res2[b][j] + bias) > 32 ? 32.0 : ((res2[b][j] + bias) < 0) ? 0 : res2[b][j] + bias); 75 | } 76 | } 77 | res1 = res2; 78 | res2 = std::vector>(batch, std::vector(neuron, 0.0)); 79 | } 80 | 81 | for(int b = 0; b < batch; ++b) { 82 | for(int j = 0; j < neuron; ++j) { 83 | float cmp = 0; 84 | if(resT) cmp = output[b * neuron + j]; 85 | else cmp = output[j * batch + b]; 86 | if(std::abs(res1[b][j] - cmp) > 1e-3) { 87 | std::cout << b << ", " << j << " cpu=" << res1[b][j] << ", gpu=" << cmp << std::endl; 88 | assert_msg(res1[b][j] == cmp, "cpu gpu doesnot equals!"); 89 | } 90 | } 91 | } 92 | 93 | std::cout << "Compare with cpu result [Success]" << std::endl; 94 | } 95 | }; 96 | }; -------------------------------------------------------------------------------- /src/utils/cpu_transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "matrix.h" 6 | #include "debug.h" 7 | 8 | namespace ftxj { 9 | class CpuTranspose { 10 | public: 11 | static void run_and_cmp(float* input, int neuron, int batch, float* output) { 12 | std::vector> res(neuron, std::vector(batch, 0.0)); 
13 | for(int b = 0; b < batch; ++b) { 14 | for(int n = 0; n < neuron; ++n) { 15 | res[n][b] = input[b * neuron + n]; 16 | assert_msg(res[n][b] == output[n * batch + b], "error!"); 17 | } 18 | } 19 | std::cout << "Compare with cpu result [Success]" << std::endl; 20 | } 21 | }; 22 | }; -------------------------------------------------------------------------------- /src/utils/cpu_transpose_and_delete.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "matrix.h" 6 | #include "debug.h" 7 | 8 | namespace ftxj { 9 | class CpuTransposeDelete { 10 | public: 11 | static void run_and_cmp(float* input, int* old_to_new_map, int old_batch, int neuron, int new_batch, float* output) { 12 | std::vector> res(new_batch, std::vector(neuron, 0.0)); 13 | for(int b = 0; b < old_batch; ++b) { 14 | if(old_to_new_map[b] == -1) continue; 15 | int new_b = old_to_new_map[b]; 16 | for(int n = 0; n < neuron; ++n) { 17 | res[new_b][n] = input[n * old_batch + b]; 18 | if(std::abs(res[new_b][n] - output[new_b * neuron + n]) > 1e-3) { 19 | std::cout << b << ", " << n << std::endl; 20 | std::cout << new_b << ", " << n << std::endl; 21 | std::cout << "currect = " << res[new_b][n] << ", error = " << output[new_b * neuron + n] << std::endl; 22 | assert_msg(res[new_b][n] == output[new_b * neuron + n], "error!"); 23 | } 24 | } 25 | } 26 | std::cout << "Compare with cpu result [Success]" << std::endl; 27 | } 28 | }; 29 | }; -------------------------------------------------------------------------------- /src/utils/debug.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace ftxj { 5 | #ifndef NDEBUG 6 | # define assert_msg(Expr, Msg) \ 7 | Debug::assert_msg_(#Expr, Expr, __FILE__, __LINE__, Msg) 8 | #else 9 | # define assert_msg(Expr, Msg) ; 10 | #endif 11 | 12 | class Debug { 13 | public: 14 | static void assert_msg_(const char* 
namespace ftxj {
    // A (row, column) position inside a matrix.
    struct MatrixPos{
        // Both indices start at 0 so that a default-constructed position
        // never exposes indeterminate values (the original left them unset).
        int row_idx = 0;
        int col_idx = 0;
        MatrixPos(int r, int c) : row_idx(r), col_idx(c) {}
        MatrixPos() {}
        // Writes "(row,col)" to std::cout without a trailing newline.
        void print() const {
            std::cout << "(" << row_idx << "," << col_idx << ")";
        }
    };
};
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/tools/3d.png -------------------------------------------------------------------------------- /tools/3d_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | import numpy as np 5 | 6 | 7 | 8 | labels_x = ['(b)\n(n)', '', '(b)\n(nn,k)', '(b,n)\n(nn,kk)', '', '', '', ''] 9 | labels_y = ['(b0,k0,n0)', '(b0,k0,n1)', '(b0,k1,n1)', '', '', '', '', ''] 10 | labels_z = ['(b0,k0,n0)', '', '', '(b,n,k,nn)', '(b,n,k,kk,nn)', '', '', ''] 11 | 12 | 13 | xs1 = [3] 14 | ys1 = [2] 15 | zs1 = [4] 16 | 17 | 18 | xs2 = [4] 19 | ys2 = [3] 20 | zs2 = [5] 21 | 22 | 23 | 24 | # 方式1:设置三维图形模式 25 | fig = plt.figure() # 创建一个画布figure,然后在这个画布上加各种元素。 26 | ax = Axes3D(fig) # 将画布作用于 Axes3D 对象上。 27 | 28 | ax.scatter(xs1,ys1,zs1) # 画出(xs1,ys1,zs1)的散点图。 29 | ax.scatter(xs2,ys2,zs2,c='r',marker='^') 30 | 31 | 32 | 33 | ax.set_xlabel('Parallelism') # 画出坐标轴 34 | ax.set_ylabel('Loop Tiling') 35 | ax.set_zlabel('Execute Order') 36 | 37 | locsx, labelsx = plt.xticks() # Get the current locations and labels. 38 | locsy, labelsy = plt.yticks() # Get the current locations and labels. 39 | # locsz, labelsz = plt.zticks() # Get the current locations and labels. 40 | 41 | plt.xticks(locsx, labels_x) # Set label locations. 42 | plt.yticks(locsy, labels_y) # Set label locations. 43 | # plt.zticks(locsz, labels_z) # Set label locations. 
def hashElement(v, buckets, max_range):
    """Return the bucket-interleaved position of index ``v``.

    Indices at or beyond ``max_range`` are returned unchanged. Any other
    index is mapped to ``v // buckets + (v % buckets) * (max_range // buckets)``.

    Integer division is used throughout so that the result stays an integer
    index; true division would produce fractional positions.
    """
    if v >= max_range:
        return v
    return v // buckets + (v % buckets) * (max_range // buckets)


num = str(1)
neuron = str(4096)
bucketnumber = 256

path = '../src/tmp.txt'


def main():
    """Scatter-plot the hashed (col, row) pairs read from ``path``.

    Each non-blank line of the input is split on tabs; field 0 is used as the
    row id and field 1 as the column id. The figure is written to ``tmp.png``.
    """
    # Imported here so that hashElement can be imported and checked on its
    # own without pulling in the plotting library.
    import matplotlib.pyplot as plt

    neuron_count = int(neuron)
    col = []
    row = []
    with open(path, 'r') as f:
        for eachline in f:
            if not eachline.strip():
                continue  # tolerate blank / trailing empty lines
            x = eachline.split('\t')
            # Ids are shifted down by one before hashing
            # (presumably the file stores 1-based ids -- confirm).
            col.append(hashElement(int(x[1]) - 1, bucketnumber, neuron_count))
            row.append(hashElement(int(x[0]) - 1, bucketnumber, neuron_count))

    plt.title("neuron:" + neuron + " bucket" + str(bucketnumber) + " layer:" + num)
    plt.xlim(xmax=64, xmin=0)
    plt.ylim(ymax=64, ymin=0)
    plt.xlabel("col")
    plt.ylabel("row")
    # Plot the lists that were actually filled above; x is the column axis
    # and y the row axis, matching the axis labels.
    plt.plot(col, row, '.')

    # savefig picks the output format from the extension, and ".fig" is not
    # a format matplotlib can write, so save as PNG.
    plt.savefig("tmp.png")

    # plt.show()


if __name__ == "__main__":
    main()
def draw_tiles():
    """Render one scatter plot per non-empty tile of the hashed matrix.

    The input file is parsed and hashed once. The square grid of
    ``tile_size`` x ``tile_size`` tiles is then walked row-block by
    row-block, and every tile containing at least one point is saved as a
    PNG. The walk stops as soon as ``draw_num`` images have been written.
    """
    import os

    # Parse and hash the input a single time instead of re-reading the whole
    # file for every tile; the context manager also closes the handle.
    points = []
    with open(open_file_path, 'r') as f:
        for eachline in f:
            x = eachline.split('\t')
            col_h = hashElement(int(x[0]) - 1)
            row_h = hashElement(int(x[1]) - 1)
            points.append((row_h, col_h))

    save_file_path = save_path_root + 'n' + str(neuron) + "/" + "l" + str(num) + "/"
    # savefig does not create missing directories.
    os.makedirs(save_file_path, exist_ok=True)

    blocks_per_side = (neuron + tile_size - 1) // tile_size
    now_num = 0
    for row_block in range(blocks_per_side):
        row_lo = tile_size * row_block
        row_hi = tile_size * (row_block + 1)
        for col_block in range(blocks_per_side):
            col_lo = tile_size * col_block
            col_hi = tile_size * (col_block + 1)

            row = []
            col = []
            for row_h, col_h in points:
                if col_lo <= col_h < col_hi and row_lo <= row_h < row_hi:
                    col.append(col_h)
                    row.append(row_h)
            print(len(row))
            if not row:
                continue

            now_num = now_num + 1
            file_name = 'l' + str(num) + '_b' + str(bucketnumber) + '_r' + str(row_block) + '_c' + str(col_block) + '_t' + str(tile_size) + '.png'
            # Start from an empty figure so points drawn for earlier tiles
            # do not accumulate on the axes.
            plt.clf()
            plt.title(file_name)
            plt.xlim(xmax=row_hi, xmin=row_lo)
            plt.ylim(ymax=col_hi, ymin=col_lo)
            plt.xlabel("row")
            plt.ylabel("col")
            plt.plot(row, col, '.')
            plt.savefig(save_file_path + file_name)
            if draw_num == now_num:
                # Requested number of images reached; nothing else to do.
                return


draw_tiles()
array[int(xx/10)]: 26 | max_v = array[int(xx/10)] 27 | 28 | 29 | 30 | plt.xlim(xmax = 256, xmin = 0) 31 | plt.ylim(ymax = max_v, ymin = 0) 32 | 33 | plt.plot(x_axis, array, '.') 34 | 35 | plt.savefig("tmp.png") -------------------------------------------------------------------------------- /tools/tmp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/tools/tmp.png --------------------------------------------------------------------------------