├── .gitignore
├── .gitmodules
├── 3rd_party
    └── nersc-roofline-master
    │   ├── ERT
    │       ├── Config
    │       │   ├── config.cori.nersc.gov.hsw
    │       │   ├── config.cori.nersc.gov.knl
    │       │   └── config.cori.nersc.gov.volta
    │       └── Kernels
    │       │   ├── kernel1.c
    │       │   ├── kernel1.h
    │       │   ├── kernel3.c
    │       │   ├── kernel3.h
    │       │   └── rep.h
    │   ├── GPP
    │       ├── KNL
    │       │   ├── CustomComplex.h
    │       │   ├── Makefile
    │       │   ├── batchjob.advisor
    │       │   ├── batchjob.likwid.survey
    │       │   ├── batchjob.notools.survey
    │       │   ├── batchjob.sde.survey
    │       │   ├── batchjob.vtune
    │       │   ├── compile.likwid.survey
    │       │   ├── compile.notools.survey
    │       │   ├── compile.tools.survey
    │       │   ├── gppCustomComplex.cpp
    │       │   ├── parse-sde2.sh
    │       │   ├── parse-vtune.py
    │       │   ├── parse-vtune.sh
    │       │   └── roofline.py
    │       └── Volta
    │       │   ├── GPUComplex.cu
    │       │   ├── GPUComplex.h
    │       │   ├── Makefile
    │       │   ├── compile.survey
    │       │   ├── gppKer_gpuComplex.cpp
    │       │   ├── ncu-section-files
    │       │       └── SpeedOfLight_HierarchicalDoubleRooflineChart.section
    │       │   └── run.survey
    │   ├── Plotting
    │       ├── plot_plot_roofline_py.eps
    │       ├── plot_plot_roofline_py.pdf
    │       ├── plot_plot_roofline_py.png
    │       ├── plot_roofline.py
    │       └── plot_roofline_hierarchical.py
    │   ├── README.md
    │   └── stream-ai-example
    │       ├── .gitignore
    │       ├── Makefile
    │       ├── READ.ME
    │       ├── README.md
    │       ├── fortran_itt_sde
    │           ├── Makefile
    │           ├── api_itt_sde.c
    │           ├── jacobi.f90
    │           └── module_itt_sde.f90
    │       ├── parse-sde.sh
    │       ├── parse-vtune2017.sh
    │       ├── parse-vtune2018.sh
    │       ├── stream-ai.sh
    │       └── stream_mpi.c
├── README.md
├── benchmark
    ├── cublas
    │   └── cublas.cu
    ├── cusparse
    │   ├── Makefile
    │   ├── Makefile.volta
    │   ├── cusparse.cu
    │   ├── run.sh
    │   └── vars.h
    ├── hpec
    │   └── 20-champions-1
    │   │   └── run.sh
    └── sputnik
    │   ├── sim.cu
    │   └── spmm.cu
├── src
    ├── BF.cpp
    ├── Makefile
    ├── Makefile.multi
    ├── Makefile.multi.big
    ├── SNIG.cpp
    ├── cost.cpp
    ├── cuSparse.cpp
    ├── fuse.cpp
    ├── fuse
    │   ├── fuse.h
    │   └── header.h
    ├── gpu_lib
    │   ├── gpu_env.h
    │   ├── gpu_runtime.h
    │   └── header.h
    ├── inspector
    │   ├── code_gen.cpp
    │   ├── code_gen.h
    │   ├── code_gen_basic.h
    │   ├── cost_model.h
    │   ├── data_inspector.h
    │   ├── gpu_block.h
    │   ├── gpu_block_scheduler.h
    │   ├── gpu_run_config.h
    │   ├── gpu_wrap.h
    │   ├── header.h
    │   ├── matrix_block.h
    │   ├── matrix_block_container.h
    │   └── matrix_block_gen.h
    ├── main.cpp
    ├── mc_test.cpp
    ├── microbenchmark
    │   ├── 20-champion.cu
    │   ├── all_network.cu
    │   ├── bf.cu
    │   ├── bf_opt.cu
    │   ├── cusparse_spmm.cu
    │   ├── fuse.cu
    │   ├── fuse_cmp.cu
    │   ├── header.h
    │   ├── load-data.cu
    │   ├── matrix_transpose.cu
    │   ├── matrix_transpose_and_delete.cu
    │   ├── multi_gpu
    │   │   ├── header.h
    │   │   ├── multi_gpu.cu
    │   │   └── multi_gpu_big.cu
    │   ├── n16284-l1.cu
    │   ├── n16384-l11.cu
    │   ├── n16384-l2-l10.cu
    │   ├── out_memory.cu
    │   ├── random.h
    │   ├── rectangels.cu
    │   ├── row-succ-20-uiuc-transpose.cu
    │   ├── row-succ-20-uiuc.cu
    │   ├── row-succ-no-transpose.cu
    │   ├── row-succ-transpose-batch-parallel.cu
    │   ├── row-succ.cu
    │   └── snig.cu
    ├── multi_gpu.cpp
    ├── multi_gpu
    │   ├── add_mpi
    │   │   ├── add_singlegpu
    │   │   ├── add_singlegpu.cu
    │   │   ├── makefile
    │   │   ├── mpi_call.cpp
    │   │   ├── mpi_call.o
    │   │   ├── run_volta.sh
    │   │   ├── saxpy.cu
    │   │   ├── saxpy.o
    │   │   ├── test
    │   │   └── vars.h
    │   ├── add_omp.cu
    │   └── add_stream.cu
    ├── network.cpp
    ├── reorder
    │   ├── hash.h
    │   ├── header.h
    │   └── reorder.h
    ├── run_bf.sh
    └── utils
    │   ├── cpu_spmm.h
    │   ├── cpu_spmm_fuse.h
    │   ├── cpu_transpose.h
    │   ├── cpu_transpose_and_delete.h
    │   ├── debug.h
    │   ├── header.h
    │   ├── matrix.h
    │   ├── matrix_base.h
    │   ├── string.h
    │   └── type.h
└── tools
    ├── 3d.png
    ├── 3d_plot.py
    ├── control_code_analysis.py
    ├── cost_model.py
    ├── edgedraw.py
    ├── get_dataset.sh
    ├── paper.cpp
    ├── plt_show.py
    ├── statistics.py
    └── tmp.png


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.gpu
 2 | *.ptx
 3 | *.cubin
 4 | *.fatbin
 5 | *.sass
 6 | 
 7 | *.bin
 8 | *.out
 9 | 
10 | *.tar.gz
11 | *.tsv
12 | *.tmp
13 | *.txt
14 | 
15 | data
16 | 
17 | data_show
18 | 
19 | 3rd_party/20-graphchallenge/SpDNN_Challenge2020/data


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "SC Sputnik"]
 2 | 	path = 3rd_party/sputnik
 3 | 	url = https://github.com/google-research/sputnik.git
 4 | 
 5 | [submodule "PPoPP TuringAs"]
 6 | 	path = 3rd_party/turingas
 7 | 	url = https://github.com/daadaada/turingas.git
 8 | 
 9 | [submodule "PPoPP GAS"]
10 | 	path = 3rd_party/gas
11 | 	url = https://github.com/daadaada/gas.git
12 | 	
13 | [submodule "CGO GPA"]
14 | 	path = 3rd_party/GPA
15 | 	url = https://github.com/Jokeren/GPA.git
16 | 
17 | [submodule "PACT SparseRT"]
18 | 	path = 3rd_party/gpu-sparsert
19 | 	url = https://github.com/marsupialtail/gpu-sparsert.git
20 | 
21 | 
22 | [submodule "20 Champions UIUC/NVIDIA"]
23 | 	path = 3rd_party/20-graphchallenge/SpDNN_Challenge2020
24 | 	url = https://github.com/merthidayetoglu/SpDNN_Challenge2020.git
25 | 
26 |     
27 | [submodule "20 Champions Utah"]
28 | 	path = 3rd_party/20-graphchallenge/SNIG
29 | 	url = https://github.com/dian-lun-lin/SNIG.git
30 | 
31 | 
32 | [submodule "20 Innovation Pitt"]
33 | 	path = 3rd_party/20-graphchallenge/DistSparseDNN
34 | 	url = https://github.com/hmofrad/DistSparseDNN.git
35 | 
36 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Config/config.cori.nersc.gov.hsw:
--------------------------------------------------------------------------------
 1 | # Cori Haswell partition
 2 | 
 3 | ERT_RESULTS Results.cori.nersc.gov.hsw
 4 | 
 5 | ERT_DRIVER  driver1
 6 | ERT_KERNEL  kernel1
 7 | 
 8 | ERT_MPI         True
 9 | ERT_MPI_CFLAGS
10 | ERT_MPI_LDFLAGS
11 | 
12 | ERT_OPENMP         True
13 | ERT_OPENMP_CFLAGS  -openmp
14 | ERT_OPENMP_LDFLAGS -openmp
15 | 
16 | ERT_FLOPS   1,2,4,8,16
17 | ERT_ALIGN   32
18 | 
19 | ERT_CC      cc
20 | ERT_CFLAGS  -O3 -fno-alias -fno-fnalias -xCORE-AVX2 -DERT_INTEL
21 | 
22 | ERT_LD      cc
23 | ERT_LDFLAGS 
24 | ERT_LDLIBS  
25 | 
26 | ERT_RUN     export OMP_NUM_THREADS=ERT_OPENMP_THREADS; export OMP_PLACES=threads; export OMP_PROC_BIND=spread; srun -n ERT_MPI_PROCS --cpu_bind=cores -c `expr 64 / ERT_MPI_PROCS` ./ERT_CODE
27 | 
28 | ERT_PROCS_THREADS  32
29 | ERT_MPI_PROCS      2,4,8,16,32
30 | ERT_OPENMP_THREADS 1-32
31 | 
32 | ERT_NUM_EXPERIMENTS 5
33 | 
34 | ERT_MEMORY_MAX 1073741824
35 | 
36 | ERT_WORKING_SET_MIN 1
37 | 
38 | ERT_TRIALS_MIN 1
39 | 
40 | ERT_GNUPLOT gnuplot
41 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Config/config.cori.nersc.gov.knl:
--------------------------------------------------------------------------------
 1 | # Cori KNL partition
 2 | 
 3 | ERT_RESULTS Results.cori.nersc.gov.knl
 4 | 
 5 | ERT_DRIVER  driver1
 6 | ERT_KERNEL  kernel1
 7 | 
 8 | ERT_MPI         True
 9 | ERT_MPI_CFLAGS
10 | ERT_MPI_LDFLAGS
11 | 
12 | ERT_OPENMP         True
13 | ERT_OPENMP_CFLAGS  -qopenmp 
14 | ERT_OPENMP_LDFLAGS -qopenmp
15 | 
16 | ERT_FLOPS   1,2,4,8,16,32,64
17 | ERT_ALIGN   64
18 | 
19 | ERT_CC      cc
20 | ERT_CFLAGS  -O3 -fno-alias -fno-fnalias -xMIC-AVX512 -DERT_INTEL
21 | 
22 | ERT_LD      cc
23 | ERT_LDFLAGS 
24 | ERT_LDLIBS  
25 | 
26 | ERT_RUN     export SLURM_CORES=$(( 256 / ERT_MPI_PROCS )); export OMP_PLACES=threads; export OMP_PROC_BIND=spread; export OMP_NUM_THREADS=ERT_OPENMP_THREADS; srun -n ERT_MPI_PROCS -c $SLURM_CORES --cpu_bind=cores ./ERT_CODE
27 | 
28 | ERT_PROCS_THREADS  256
29 | ERT_MPI_PROCS      1,4,16,64
30 | ERT_OPENMP_THREADS 1-256
31 | 
32 | ERT_NUM_EXPERIMENTS 1
33 | 
34 | ERT_STRIDE 100
35 | ERT_MEMORY_MAX 1073741824
36 | 
37 | ERT_WORKING_SET_MIN 1
38 | 
39 | ERT_TRIALS_MIN 1
40 | 
41 | ERT_GNUPLOT gnuplot
42 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Config/config.cori.nersc.gov.volta:
--------------------------------------------------------------------------------
 1 | # Cori Volta partition
 2 | 
 3 | ERT_RESULTS Results.cori.nersc.gov.volta
 4 | 
 5 | ERT_DRIVER  driver1
 6 | ERT_KERNEL  kernel1
 7 | 
 8 | ERT_GPU         True
 9 | ERT_GPU_CFLAGS  -x cu
10 | ERT_GPU_LDFLAGS 
11 | 
12 | ERT_FLOPS   1,2,4,8,16,32,64,128,256
13 | ERT_ALIGN   32
14 | 
15 | ERT_CC      nvcc
16 | ERT_CFLAGS  -O3
17 | 
18 | ERT_LD      nvcc
19 | ERT_LDFLAGS 
20 | ERT_LDLIBS  
21 | 
22 | ERT_RUN     ./ERT_CODE
23 | 
24 | ERT_BLOCKS_THREADS 163840
25 | ERT_GPU_BLOCKS     80,160,320,640,1280,2560
26 | ERT_GPU_THREADS    64,128,256,512,1024
27 | 
28 | ERT_NUM_EXPERIMENTS 1
29 | 
30 | ERT_MEMORY_MAX 1073741824
31 | 
32 | ERT_WORKING_SET_MIN 128
33 | 
34 | ERT_TRIALS_MIN 1
35 | 
36 | ERT_GNUPLOT gnuplot
37 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Kernels/kernel1.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <stdint.h>
  4 | 
  5 | #include "rep.h"
  6 | #include "kernel1.h"
  7 | /* Set every element A[0..nsize) to `value`; the #ifdef block only passes the ERT_ALIGN alignment hint to Intel / XL compilers. */
  8 | void initialize(uint64_t nsize,
  9 |                 double* __restrict__ A,
 10 |                 double value)
 11 | {
 12 | #ifdef ERT_INTEL
 13 |   __assume_aligned(A, ERT_ALIGN);
 14 | #elif __xlC__
 15 |   __alignx(ERT_ALIGN, A);
 16 | #endif
 17 |   /* Straight sequential fill of all nsize elements. */
 18 |   uint64_t i;
 19 |   for (i = 0; i < nsize; ++i) {
 20 |     A[i] = value;
 21 |   }
 22 | }
 23 | /* GPU path (ERT_GPU): block_stride is the device kernel; each thread sweeps A starting at its global index, stepping by the total thread count. */
 24 | #ifdef ERT_GPU
 25 | __global__ void block_stride(uint64_t ntrials, uint64_t nsize, double *A)
 26 | {
 27 |   uint64_t total_thr = gridDim.x * blockDim.x;
 28 |   uint64_t elem_per_thr = (nsize + (total_thr-1)) / total_thr;
 29 |   uint64_t blockOffset = blockIdx.x * blockDim.x; 
 30 |   /* start_idx is this thread's global index; consecutive elements it touches are total_thr apart. */
 31 |   uint64_t start_idx  = blockOffset + threadIdx.x;
 32 |   uint64_t end_idx    = start_idx + elem_per_thr * total_thr;
 33 |   uint64_t stride_idx = total_thr;
 34 |   /* Clamp both bounds to nsize. elem_per_thr * total_thr >= nsize, so end_idx always ends up equal to nsize. */
 35 |   if (start_idx > nsize) {
 36 |     start_idx = nsize;
 37 |   }
 38 | 
 39 |   if (end_idx > nsize) {
 40 |     end_idx = nsize;
 41 |   }
 42 |   /* Repeat the sweep ntrials times; the bits of ERT_FLOP select which KERNEL groups are compiled into the loop body. */
 43 |   double alpha = 0.5;
 44 |   uint64_t i, j;
 45 |   for (j = 0; j < ntrials; ++j) {
 46 |     for (i = start_idx; i < end_idx; i += stride_idx) {
 47 |       double beta = 0.8;
 48 | #if (ERT_FLOP & 1) == 1       /* add 1 flop */
 49 |       KERNEL1(beta,A[i],alpha);
 50 | #endif
 51 | #if (ERT_FLOP & 2) == 2       /* add 2 flops */
 52 |       KERNEL2(beta,A[i],alpha);
 53 | #endif
 54 | #if (ERT_FLOP & 4) == 4       /* add 4 flops */
 55 |       REP2(KERNEL2(beta,A[i],alpha));
 56 | #endif
 57 | #if (ERT_FLOP & 8) == 8       /* add 8 flops */
 58 |       REP4(KERNEL2(beta,A[i],alpha));
 59 | #endif
 60 | #if (ERT_FLOP & 16) == 16     /* add 16 flops */
 61 |       REP8(KERNEL2(beta,A[i],alpha));
 62 | #endif
 63 | #if (ERT_FLOP & 32) == 32     /* add 32 flops */
 64 |       REP16(KERNEL2(beta,A[i],alpha));
 65 | #endif
 66 | #if (ERT_FLOP & 64) == 64     /* add 64 flops */
 67 |       REP32(KERNEL2(beta,A[i],alpha));
 68 | #endif
 69 | #if (ERT_FLOP & 128) == 128   /* add 128 flops */
 70 |       REP64(KERNEL2(beta,A[i],alpha));
 71 | #endif
 72 | #if (ERT_FLOP & 256) == 256   /* add 256 flops */
 73 |       REP128(KERNEL2(beta,A[i],alpha));
 74 | #endif
 75 | #if (ERT_FLOP & 512) == 512   /* add 512 flops */
 76 |       REP256(KERNEL2(beta,A[i],alpha));
 77 | #endif
 78 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */
 79 |       REP512(KERNEL2(beta,A[i],alpha));
 80 | #endif
 81 |       /* Write the accumulated value back to the element. */
 82 |       A[i] = beta;
 83 |     }
 84 |     alpha = alpha * (1 - 1e-8);  /* alpha changes slightly every trial -- presumably so trials cannot be collapsed; confirm */
 85 |   }
 86 | }
 87 | /* Launch geometry (blocks, threads per block) for block_stride; defined here but never assigned in this file (kernel1.h declares them extern). */
 88 | int gpu_blocks;
 89 | int gpu_threads;
 90 | /* Host entry point: reports sizeof(double) bytes and 2 memory accesses per element, then launches block_stride on A. */
 91 | void gpuKernel(uint64_t nsize,
 92 |                uint64_t ntrials,
 93 |                double* __restrict__ A,
 94 |                int* bytes_per_elem,
 95 |                int* mem_accesses_per_elem)
 96 | {
 97 |   *bytes_per_elem        = sizeof(*A);
 98 |   *mem_accesses_per_elem = 2;
 99 | 
100 | #ifdef ERT_INTEL
101 |   __assume_aligned(A, ERT_ALIGN);
102 | #elif __xlC__
103 |   __alignx(ERT_ALIGN, A);
104 | #endif
105 |   /* Launch only: nothing in this function checks for launch errors or waits for the kernel to finish. */
106 |   block_stride <<< gpu_blocks, gpu_threads>>> (ntrials, nsize, A);
107 | }
108 | #else  /* no ERT_GPU: CPU version -- same per-element work as block_stride, run as a plain loop over A[0..nsize) */
109 | void kernel(uint64_t nsize,
110 |             uint64_t ntrials,
111 |             double* __restrict__ A,
112 |             int* bytes_per_elem,
113 |             int* mem_accesses_per_elem)
114 | {
115 |   *bytes_per_elem        = sizeof(*A);
116 |   *mem_accesses_per_elem = 2;
117 |   /* Outputs above: element size and 2 accesses per element (presumably one load + one store -- confirm against the driver). */
118 | #ifdef ERT_INTEL
119 |   __assume_aligned(A, ERT_ALIGN);
120 | #elif __xlC__
121 |   __alignx(ERT_ALIGN, A);
122 | #endif
123 |   /* ntrials full sweeps over A; the bits of ERT_FLOP select which KERNEL groups are compiled into the loop body. */
124 |   double alpha = 0.5;
125 |   uint64_t i, j;
126 |   for (j = 0; j < ntrials; ++j) {
127 | #pragma unroll (8)
128 |     for (i = 0; i < nsize; ++i) {
129 |       double beta = 0.8;
130 | #if (ERT_FLOP & 1) == 1       /* add 1 flop */
131 |       KERNEL1(beta,A[i],alpha);
132 | #endif
133 | #if (ERT_FLOP & 2) == 2       /* add 2 flops */
134 |       KERNEL2(beta,A[i],alpha);
135 | #endif
136 | #if (ERT_FLOP & 4) == 4       /* add 4 flops */
137 |       REP2(KERNEL2(beta,A[i],alpha));
138 | #endif
139 | #if (ERT_FLOP & 8) == 8       /* add 8 flops */
140 |       REP4(KERNEL2(beta,A[i],alpha));
141 | #endif
142 | #if (ERT_FLOP & 16) == 16     /* add 16 flops */
143 |       REP8(KERNEL2(beta,A[i],alpha));
144 | #endif
145 | #if (ERT_FLOP & 32) == 32     /* add 32 flops */
146 |       REP16(KERNEL2(beta,A[i],alpha));
147 | #endif
148 | #if (ERT_FLOP & 64) == 64     /* add 64 flops */
149 |       REP32(KERNEL2(beta,A[i],alpha));
150 | #endif
151 | #if (ERT_FLOP & 128) == 128   /* add 128 flops */
152 |       REP64(KERNEL2(beta,A[i],alpha));
153 | #endif
154 | #if (ERT_FLOP & 256) == 256   /* add 256 flops */
155 |       REP128(KERNEL2(beta,A[i],alpha));
156 | #endif
157 | #if (ERT_FLOP & 512) == 512   /* add 512 flops */
158 |       REP256(KERNEL2(beta,A[i],alpha));
159 | #endif
160 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */
161 |       REP512(KERNEL2(beta,A[i],alpha));
162 | #endif
163 |       /* Write the accumulated value back to the element. */
164 |       A[i] = beta;
165 |     }
166 |     alpha = alpha * (1 - 1e-8);  /* alpha changes slightly every trial -- presumably so trials cannot be collapsed; confirm */
167 |   }
168 | }
169 | #endif
170 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Kernels/kernel1.h:
--------------------------------------------------------------------------------
 1 | #ifndef KERNEL1_H
 2 | #define KERNEL1_H
 3 | 
 4 | #ifdef ERT_GPU
 5 | extern int gpu_blocks;
 6 | extern int gpu_threads;
 7 | #endif
 8 | /* Per-element operations on accumulator a: KERNEL1 is one add (a = b + c); KERNEL2 is one multiply plus one add (a = a*b + c). */
 9 | #define KERNEL1(a,b,c)   ((a) = (b) + (c))
10 | #define KERNEL2(a,b,c)   ((a) = (a)*(b) + (c))
11 | 
12 | void initialize(uint64_t nsize,
13 |                 double* __restrict__ array,
14 |                 double value);
15 | 
16 | #ifdef ERT_GPU
17 | void gpuKernel(uint64_t nsize,
18 |                uint64_t ntrials,
19 |                double* __restrict__ array,
20 |                int* bytes_per_elem,
21 |                int* mem_accesses_per_elem);
22 | #else
23 | void kernel(uint64_t nsize,
24 |             uint64_t ntrials,
25 |             double* __restrict__ array,
26 |             int* bytes_per_elem,
27 |             int* mem_accesses_per_elem);
28 | #endif
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Kernels/kernel3.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <stdint.h>
  4 | 
  5 | #include "rep.h"
  6 | #include "kernel3.h"
  7 | /* Set every element A[0..nsize) to `value`; the #ifdef block only passes the ERT_ALIGN alignment hint to Intel / XL compilers. */
  8 | void initialize(uint64_t nsize,
  9 |                 double* __restrict__ A,
 10 |                 double value)
 11 | {
 12 | #ifdef ERT_INTEL
 13 |   __assume_aligned(A, ERT_ALIGN);
 14 | #elif __xlC__
 15 |   __alignx(ERT_ALIGN, A);
 16 | #endif
 17 |   /* Straight sequential fill of all nsize elements. */
 18 |   uint64_t i;
 19 |   for (i = 0; i < nsize; ++i) {
 20 |     A[i] = value;
 21 |   }
 22 | }
 23 | /* GPU path (ERT_GPU): block_stride is the device kernel; each thread sweeps A starting at its global index, stepping by the total thread count. */
 24 | #ifdef ERT_GPU
 25 | __global__ void block_stride(uint64_t ntrials, uint64_t nsize, double *A)
 26 | {
 27 |   uint64_t total_thr = gridDim.x * blockDim.x;
 28 |   uint64_t elem_per_thr = (nsize + (total_thr-1)) / total_thr;
 29 |   uint64_t blockOffset = blockIdx.x * blockDim.x; 
 30 |   /* start_idx is this thread's global index; consecutive elements it touches are total_thr apart. */
 31 |   uint64_t start_idx  = blockOffset + threadIdx.x;
 32 |   uint64_t end_idx    = start_idx + elem_per_thr * total_thr;
 33 |   uint64_t stride_idx = total_thr;
 34 |   /* Clamp both bounds to nsize. elem_per_thr * total_thr >= nsize, so end_idx always ends up equal to nsize. */
 35 |   if (start_idx > nsize) {
 36 |     start_idx = nsize;
 37 |   }
 38 | 
 39 |   if (end_idx > nsize) {
 40 |     end_idx = nsize;
 41 |   }
 42 |   /* Repeat the sweep ntrials times; the bits of ERT_FLOP select which KERNEL groups (division-based, see kernel3.h) are compiled in. */
 43 |   double alpha = 0.5;
 44 |   uint64_t i, j;
 45 |   for (j = 0; j < ntrials; ++j) {
 46 |     for (i = start_idx; i < end_idx; i += stride_idx) {
 47 |       double beta = 0.8;
 48 | #if (ERT_FLOP & 1) == 1       /* add 1 flop */
 49 |       KERNEL1(beta,A[i],alpha);
 50 | #endif
 51 | #if (ERT_FLOP & 2) == 2       /* add 2 flops */
 52 |       KERNEL2(beta,A[i],alpha);
 53 | #endif
 54 | #if (ERT_FLOP & 4) == 4       /* add 4 flops */
 55 |       REP2(KERNEL2(beta,A[i],alpha));
 56 | #endif
 57 | #if (ERT_FLOP & 8) == 8       /* add 8 flops */
 58 |       REP4(KERNEL2(beta,A[i],alpha));
 59 | #endif
 60 | #if (ERT_FLOP & 16) == 16     /* add 16 flops */
 61 |       REP8(KERNEL2(beta,A[i],alpha));
 62 | #endif
 63 | #if (ERT_FLOP & 32) == 32     /* add 32 flops */
 64 |       REP16(KERNEL2(beta,A[i],alpha));
 65 | #endif
 66 | #if (ERT_FLOP & 64) == 64     /* add 64 flops */
 67 |       REP32(KERNEL2(beta,A[i],alpha));
 68 | #endif
 69 | #if (ERT_FLOP & 128) == 128   /* add 128 flops */
 70 |       REP64(KERNEL2(beta,A[i],alpha));
 71 | #endif
 72 | #if (ERT_FLOP & 256) == 256   /* add 256 flops */
 73 |       REP128(KERNEL2(beta,A[i],alpha));
 74 | #endif
 75 | #if (ERT_FLOP & 512) == 512   /* add 512 flops */
 76 |       REP256(KERNEL2(beta,A[i],alpha));
 77 | #endif
 78 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */
 79 |       REP512(KERNEL2(beta,A[i],alpha));
 80 | #endif
 81 |       /* Write the accumulated value back to the element. */
 82 |       A[i] = beta;
 83 |     }
 84 |     alpha = alpha * (1 - 1e-8);  /* alpha changes slightly every trial -- presumably so trials cannot be collapsed; confirm */
 85 |   }
 86 | }
 87 | /* Launch geometry (blocks, threads per block) for block_stride; defined here but never assigned in this file (kernel3.h declares them extern). */
 88 | int gpu_blocks;
 89 | int gpu_threads;
 90 | /* Host entry point: reports sizeof(double) bytes and 2 memory accesses per element, then launches block_stride on A. */
 91 | void gpuKernel(uint64_t nsize,
 92 |                uint64_t ntrials,
 93 |                double* __restrict__ A,
 94 |                int* bytes_per_elem,
 95 |                int* mem_accesses_per_elem)
 96 | {
 97 |   *bytes_per_elem        = sizeof(*A);
 98 |   *mem_accesses_per_elem = 2;
 99 | 
100 | #ifdef ERT_INTEL
101 |   __assume_aligned(A, ERT_ALIGN);
102 | #elif __xlC__
103 |   __alignx(ERT_ALIGN, A);
104 | #endif
105 |   /* Launch only: nothing in this function checks for launch errors or waits for the kernel to finish. */
106 |   block_stride <<< gpu_blocks, gpu_threads>>> (ntrials, nsize, A);
107 | }
108 | #else  /* no ERT_GPU: CPU version -- same per-element work as block_stride, run as a plain loop over A[0..nsize) */
109 | void kernel(uint64_t nsize,
110 |             uint64_t ntrials,
111 |             double* __restrict__ A,
112 |             int* bytes_per_elem,
113 |             int* mem_accesses_per_elem)
114 | {
115 |   *bytes_per_elem        = sizeof(*A);
116 |   *mem_accesses_per_elem = 2;
117 |   /* Outputs above: element size and 2 accesses per element (presumably one load + one store -- confirm against the driver). */
118 | #ifdef ERT_INTEL
119 |   __assume_aligned(A, ERT_ALIGN);
120 | #elif __xlC__
121 |   __alignx(ERT_ALIGN, A);
122 | #endif
123 |   /* ntrials full sweeps over A; the bits of ERT_FLOP select which KERNEL groups (division-based, see kernel3.h) are compiled in. */
124 |   double alpha = 0.5;
125 |   uint64_t i, j;
126 |   for (j = 0; j < ntrials; ++j) {
127 | #pragma unroll (8)
128 |     for (i = 0; i < nsize; ++i) {
129 |       double beta = 0.8;
130 | #if (ERT_FLOP & 1) == 1       /* add 1 flop */
131 |       KERNEL1(beta,A[i],alpha);
132 | #endif
133 | #if (ERT_FLOP & 2) == 2       /* add 2 flops */
134 |       KERNEL2(beta,A[i],alpha);
135 | #endif
136 | #if (ERT_FLOP & 4) == 4       /* add 4 flops */
137 |       REP2(KERNEL2(beta,A[i],alpha));
138 | #endif
139 | #if (ERT_FLOP & 8) == 8       /* add 8 flops */
140 |       REP4(KERNEL2(beta,A[i],alpha));
141 | #endif
142 | #if (ERT_FLOP & 16) == 16     /* add 16 flops */
143 |       REP8(KERNEL2(beta,A[i],alpha));
144 | #endif
145 | #if (ERT_FLOP & 32) == 32     /* add 32 flops */
146 |       REP16(KERNEL2(beta,A[i],alpha));
147 | #endif
148 | #if (ERT_FLOP & 64) == 64     /* add 64 flops */
149 |       REP32(KERNEL2(beta,A[i],alpha));
150 | #endif
151 | #if (ERT_FLOP & 128) == 128   /* add 128 flops */
152 |       REP64(KERNEL2(beta,A[i],alpha));
153 | #endif
154 | #if (ERT_FLOP & 256) == 256   /* add 256 flops */
155 |       REP128(KERNEL2(beta,A[i],alpha));
156 | #endif
157 | #if (ERT_FLOP & 512) == 512   /* add 512 flops */
158 |       REP256(KERNEL2(beta,A[i],alpha));
159 | #endif
160 | #if (ERT_FLOP & 1024) == 1024 /* add 1024 flops */
161 |       REP512(KERNEL2(beta,A[i],alpha));
162 | #endif
163 |       /* Write the accumulated value back to the element. */
164 |       A[i] = beta;
165 |     }
166 |     alpha = alpha * (1 - 1e-8);  /* alpha changes slightly every trial -- presumably so trials cannot be collapsed; confirm */
167 |   }
168 | }
169 | #endif
170 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Kernels/kernel3.h:
--------------------------------------------------------------------------------
 1 | #ifndef KERNEL3_H
 2 | #define KERNEL3_H
 3 | 
 4 | #ifdef ERT_GPU
 5 | extern int gpu_blocks;
 6 | extern int gpu_threads;
 7 | #endif
 8 | /* Division-based operations on accumulator a: KERNEL1 is one divide (a = b/a, c is unused); KERNEL2 is one divide plus one add (a = b/a + c). */
 9 | #define KERNEL1(a,b,c)   ((a) = (b)/(a) )
10 | #define KERNEL2(a,b,c)   ((a) = (b)/(a) + (c))
11 | 
12 | 
13 | void initialize(uint64_t nsize,
14 |                 double* __restrict__ array,
15 |                 double value);
16 | 
17 | #ifdef ERT_GPU
18 | void gpuKernel(uint64_t nsize,
19 |                uint64_t ntrials,
20 |                double* __restrict__ array,
21 |                int* bytes_per_elem,
22 |                int* mem_accesses_per_elem);
23 | #else
24 | void kernel(uint64_t nsize,
25 |             uint64_t ntrials,
26 |             double* __restrict__ array,
27 |             int* bytes_per_elem,
28 |             int* mem_accesses_per_elem);
29 | #endif
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/ERT/Kernels/rep.h:
--------------------------------------------------------------------------------
 1 | #ifndef REP_H
 2 | #define REP_H
 3 | /* REPn(S) expands to n copies of S joined by ';' with no trailing ';' (the caller supplies it); each level doubles the previous one. */
 4 | #define REP2(S)        S ;        S
 5 | #define REP4(S)   REP2(S);   REP2(S)
 6 | #define REP8(S)   REP4(S);   REP4(S)
 7 | #define REP16(S)  REP8(S);   REP8(S)
 8 | #define REP32(S)  REP16(S);  REP16(S)
 9 | #define REP64(S)  REP32(S);  REP32(S)
10 | #define REP128(S) REP64(S);  REP64(S)
11 | #define REP256(S) REP128(S); REP128(S)
12 | #define REP512(S) REP256(S); REP256(S)
13 | 
14 | #endif 
15 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/CustomComplex.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Templated CustomComplex class: a complex number whose real and imaginary parts may each be of any (independently chosen) arithmetic type.
  3 | */
  4 | #ifndef __CustomComplex
  5 | #define __CustomComplex
  6 | 
  7 | #include <iostream>
  8 | #include <cstdlib>
  9 | #include <memory>
 10 | #include <iomanip>
 11 | #include <cmath>
 12 | #include <omp.h>
 13 | #include <ctime>
 14 | #include <stdio.h>
 15 | #include <sys/time.h>
 16 | 
 17 | template<class re, class im>
 18 | 
 19 | class CustomComplex {
 20 | 
 21 |     private : 
 22 |     re x;
 23 |     im y;
 24 | 
 25 |     public:
 26 |     explicit CustomComplex () {
 27 |         x = 0.00;
 28 |         y = 0.00;
 29 |     }
 30 | 
 31 | 
 32 |     explicit CustomComplex(const double& a, const double& b) {
 33 |         x = a;
 34 |         y = b;
 35 |     }
 36 | 
 37 |     CustomComplex(const CustomComplex& src) {
 38 |         x = src.x;
 39 |         y = src.y;
 40 |     }
 41 | 
 42 |     CustomComplex& operator =(const CustomComplex& src) {
 43 |         x = src.x;
 44 |         y = src.y;
 45 | 
 46 |         return *this;
 47 |     }
 48 | 
 49 |     CustomComplex& operator +=(const CustomComplex& src) {
 50 |         x = src.x + this->x;
 51 |         y = src.y + this->y;
 52 | 
 53 |         return *this;
 54 |     }
 55 | 
 56 |     // In-place subtraction: *this = *this - src (component-wise); returns *this.
 57 |     CustomComplex& operator -=(const CustomComplex& src) {
 58 |         x = this->x - src.x;   // subtract src FROM this; the reverse order yields the negated result
 59 |         y = this->y - src.y;
 60 |         return *this;
 61 |     }
 62 |     // Unary minus. NOTE: negates *this in place and returns a reference to it, so evaluating "-z" also modifies z.
 63 |     CustomComplex& operator -() {
 64 |         x = -this->x;
 65 |         y = -this->y;
 66 | 
 67 |         return *this;
 68 |     }
 69 |     // Returns *this unchanged. NOTE(review): no conjugation is performed here (CustomComplex_conj does that) -- confirm this is intended.
 70 |     CustomComplex& operator ~() {
 71 |         return *this;
 72 |     }
 73 | 
 74 |     void print() const {
 75 |         printf("( %f, %f) ", this->x, this->y);
 76 |         printf("\n");
 77 |     }
 78 | 
 79 |     double get_real() const
 80 |     {
 81 |         return this->x;
 82 |     }
 83 | 
 84 |     double get_imag() const
 85 |     {
 86 |         return this->y;
 87 |     }
 88 | 
 89 |     void set_real(double val)
 90 |     {
 91 |         this->x = val;
 92 |     }
 93 | 
 94 |     void set_imag(double val) 
 95 |     {
 96 |         this->y = val;
 97 |     }
 98 | 
 99 | // 6 flops
100 |     template<class real, class imag>
101 |     friend inline CustomComplex<real,imag> operator *(const CustomComplex<real,imag> &a, const CustomComplex<real,imag> &b) {
102 |         real x_this = a.x * b.x - a.y*b.y ;
103 |         imag y_this = a.x * b.y + a.y*b.x ;
104 |         CustomComplex<real,imag> result(x_this, y_this);
105 |         return (result);
106 |     }
107 | 
108 | //2 flops
109 |     template<class real, class imag>
110 |     friend inline CustomComplex<real,imag> operator *(const CustomComplex<real,imag> &a, const double &b) {
111 |        CustomComplex<real,imag> result(a.x*b, a.y*b);
112 |        return result;
113 |     }
114 | 
115 | //2 flops
116 |     template<class real, class imag>
117 |     friend inline CustomComplex<real,imag> operator -(const double &a, CustomComplex<real,imag>& src) {
118 |         CustomComplex<real,imag> result(a - src.x, 0 - src.y);
119 |         return result;
120 |     }
121 | 
122 |     template<class real, class imag>
123 |     friend inline CustomComplex<real,imag> operator +(const double &a, CustomComplex<real,imag>& src) {
124 |         CustomComplex<real,imag> result(a + src.x, src.y);
125 |         return result;
126 |     }
127 | 
128 |     template<class real, class imag>
129 |     friend inline CustomComplex<real,imag> CustomComplex_conj(const CustomComplex<real,imag>& src) ;
130 | 
131 |     template<class real, class imag>
132 |     friend inline double CustomComplex_abs(const CustomComplex<real,imag>& src) ;
133 | 
134 |     template<class real, class imag>
135 |     friend inline double CustomComplex_real( const CustomComplex<real,imag>& src) ;
136 | 
137 |     template<class real, class imag>
138 |     friend inline double CustomComplex_imag( const CustomComplex<real,imag>& src) ;
139 | };
140 | 
141 | /*
142 |  * Return the conjugate of a complex number 
143 |  1flop
144 |  */
145 | template<class re, class im>
146 | inline CustomComplex<re, im> CustomComplex_conj(const CustomComplex<re,im>& src) {
147 | 
148 |     re re_this = src.x;
149 |     im im_this = -1 * src.y;
150 | 
151 |     CustomComplex<re,im> result(re_this, im_this);
152 |     return result;
153 | 
154 | }
155 | 
156 | /*
157 |  * Return the absolute value (magnitude) of a complex number 
158 |  */
159 | template<class re, class im>
160 | inline double CustomComplex_abs(const CustomComplex<re,im>& src) {
161 |     re re_this = src.x * src.x;
162 |     im im_this = src.y * src.y;
163 | 
164 |     re result = sqrt(re_this+im_this);
165 |     return result;
166 | }
167 | 
168 | /*
169 |  * Return the real part of a complex number 
170 |  */
171 | template<class re, class im>
172 | inline double CustomComplex_real( const CustomComplex<re,im>& src) {
173 |     return src.x;
174 | }
175 | 
176 | /*
177 |  * Return the imaginary part of a complex number 
178 |  */
179 | template<class re, class im>
180 | inline double CustomComplex_imag( const CustomComplex<re,im>& src) {
181 |     return src.y;
182 | }
183 | 
184 | #endif
185 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/Makefile:
--------------------------------------------------------------------------------
 1 | #EXE = gppKer_double.ex.likwid.div.dbl.nofma.iw6
 2 | #SRC = gppKer_double.cpp 
 3 | EXE = gppCustomComplex.ex.survey.likwid.div.cmplx.fma.iw6
 4 | SRC = gppCustomComplex.cpp 
 5 | 
 6 | #CXX = xlc++_r
 7 | #CXX = g++
 8 | CXX = CC
 9 | 
10 | LINK = ${CXX}
11 | 
12 | ifeq ($(CXX),CC)
13 | ##Intel compiler flag
14 | 	#CXXFLAGS= -g -O3 -qopenmp -qopt-report=5 -std=c++11
15 | 	CXXFLAGS= -g -O3 -qopenmp -std=c++11
16 |     CXXFLAGS+=-fma #Fused multiply and add
17 |     #CXXFLAGS+=-DTOOLS -I${VTUNE_DIR}/include 
18 |     #CXXFLAGS+=-I /usr/common/software/likwid/4.3.0/include/ -DLIKWID_PERFMON
19 | #    #CXXFLAGS+=-I /usr/common/software/likwid/4.3.0/include/ -DUSE_VTUNE -I${VTUNE_DIR}/include -DLIKWID_PERFMON
20 |     #CXXFLAGS+=-xCORE-AVX2
21 |     CXXFLAGS+=-xMIC-AVX512
22 |     LINKFLAGS=-qopenmp 
23 |     #LINKFLAGS+=-L /usr/common/software/likwid/4.3.0/lib -llikwid
24 |     #LINKFLAGS+=-L ${VTUNE_DIR}/lib64 -littnotify 
25 | 
26 | #Cray compiler flag
27 | #	CXXFLAGS= -hlist=a
28 | endif 
29 | 
30 | ifeq ($(CXX),g++)
31 | 	CXXFLAGS= -g -O3 -std=c++11 -fopenmp 
32 | 	LINKFLAGS=-fopenmp
33 | endif 
34 | 
35 | ifeq ($(CXX),xlc++_r)
36 | 	CXXFLAGS=-O3 -std=gnu++11 -g -qsmp
37 | 	LINKFLAGS=-qsmp
38 | endif 
39 | 
40 | ifeq ($(CXX),clang++)
41 | 	CXXFLAGS=-O3 -std=gnu++11 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME}
42 | 	LINKFLAGS=-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME}
43 | endif 
44 | 
45 | OBJ = $(SRC:.cpp=.o)
46 | 
47 | $(EXE): $(OBJ)  
48 | 	$(CXX) $(OBJ) -o $(EXE) $(LINKFLAGS)
49 | # Compile step for the object that $(EXE) links. The target is $(OBJ) so that this explicit rule is the one make actually uses.
50 | $(OBJ): $(SRC)
51 | 	$(CXX) -c $(SRC) $(CXXFLAGS)
52 | 
53 | clean: 
54 | 	rm -f $(OBJ) $(EXE)
55 | 
56 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/batchjob.advisor:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -n 1
 3 | #SBATCH -t 03:00:00
 4 | #SBATCH -A nstaff
 5 | #SBATCH -C knl,quad,cache
 6 | # Runs the GPP executable under advixe-cl roofline collection on KNL, then writes the roofline report to adv.html.
 7 | module swap craype-haswell craype-mic-knl
 8 | 
 9 | export OMP_NUM_THREADS=64
10 | export OMP_PROC_BIND=spread
11 | export OMP_PLACES=threads
12 | 
13 | module unload darshan
14 | module load advisor
15 | # Stop early if the work directory is missing or the results directory cannot be created; otherwise the job would run in the wrong place.
16 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL || exit 1
17 | mkdir "Results_adv_$SLURM_JOB_ID" || exit 1
18 | seqend=1
19 | 
20 | label='survey.div.cmplx.fma'
21 | res="Results_adv_$SLURM_JOB_ID/results.$label"
22 | touch "$res"
23 | for i in $(seq 1 $seqend)
24 | do 
25 |   # Program output is appended to $res; the Advisor project lands in my_adv.knl inside the results directory.
26 |   srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 advixe-cl --collect=roofline --project-dir=Results_adv_$SLURM_JOB_ID/my_adv.knl -- ./gppCustomComplex.ex.$label.iw6 512 2 32768 20 0 >> "$res"
27 | 
28 | done
29 | # Generate the report from inside the results directory so the relative project path below resolves.
30 | cd "Results_adv_$SLURM_JOB_ID/" || exit 1
31 | #advixe-cl -report roofline --project-dir Results_adv_$SLURM_JOB_ID/my_adv.knl 
32 | advixe-cl -report roofline --project-dir my_adv.knl > adv.html
33 | 
34 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/batchjob.likwid.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -n 1
 3 | #SBATCH -t 00:10:00
 4 | #SBATCH -A nstaff
 5 | #SBATCH -C knl,quad,cache
 6 | #SBATCH --perf=vtune
 7 | 
 8 | module swap craype-haswell craype-mic-knl
 9 | 
10 | export OMP_NUM_THREADS=64
11 | export OMP_PROC_BIND=spread
12 | export OMP_PLACES=threads
13 | 
14 | module unload darshan
15 | module load likwid
16 | 
17 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL
18 | mkdir Results_likwid_survey_$SLURM_JOB_ID
19 | seqend=1
20 | #arr=(1 2 3 4 5 6)
21 | arr=(6)
22 | gs=('FLOPS_DP' 'HBM_CACHE' 'L2' 'DATA')
23 | 
24 | 
25 | label='survey.likwid.div.cmplx.fma'
26 | for i in $(seq 1 $seqend)
27 | do 
28 |   for j in ${arr[@]}
29 |   do
30 |   for k in ${gs[@]}
31 |   do 
32 |     kk=${k/_/.}
33 |     echo $kk
34 |     res=Results_likwid_survey_$SLURM_JOB_ID/result.likwid.$kk.txt
35 |     touch $res
36 | 
37 |     srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 likwid-perfctr -c 0-271 -g $k ./gppCustomComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 
38 |   done 
39 | 
40 |   done
41 | done
42 | 
43 | cd Results_likwid_survey_$SLURM_JOB_ID/
44 | 
45 | echo ---------------------------------------
46 | echo "Likwid on KNL"
47 | echo ---------------------------------------
48 | # FLOPS
49 | #echo "Run with -g FLOPS_DP"
50 | mflopss=`grep AVX512 result.likwid.FLOPS.DP.txt | tail -n 1 | cut -d '|' -f 3`
51 | runtime=`grep RDTSC result.likwid.FLOPS.DP.txt | tail -n 1 | cut -d '|' -f 6`
52 | gflops=`python -c "print('{0:.3f}'.format($mflopss * $runtime/1000))"`
53 | gflopss=`python -c "print('{0:.3f}'.format($mflopss/1000))"`
54 | #echo "Runtime: $runtime"
55 | #echo "GFLOP/s: $gflopss"
56 | echo "GFLOPS: $gflops"
57 | #echo 
58 | #BYTES - DDR and MCDRAM
59 | #echo "Run with -g HBM_CACHE"
60 | hbm_mbytess=`grep "MCDRAM Memory bandwidth" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3`
61 | hbm_gbytes=`grep "MCDRAM Memory data volume" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3`
62 | ddr_mbytess=`grep "DDR Memory bandwidth" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3`
63 | ddr_gbytes=`grep "DDR Memory data volume" result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 3`
64 | runtime=`grep RDTSC result.likwid.HBM.CACHE.txt | tail -n 1 | cut -d '|' -f 6`
65 | hbm_gbytess=`python -c "print('{0:.3f}'.format($hbm_mbytess/1000))"`
66 | ddr_gbytess=`python -c "print('{0:.3f}'.format($ddr_mbytess/1000))"`
67 | #echo "Runtime: $runtime s"
68 | echo "MCDRAM Bytes: $hbm_gbytes"
69 | #echo "MCDRAM bandwidth: $hbm_gbytess GB/s"
70 | echo "DDR Bytes: $ddr_gbytes"
71 | #echo "DDR bandwidth: $ddr_gbytess GB/s"
72 | #echo 
73 | #BYTES - L2
74 | #echo "Run with -g L2"
75 | l2_mbytess=`grep "L2 bandwidth" result.likwid.L2.txt | tail -n 1 | cut -d '|' -f 3`
76 | l2_mbytes=`grep "L2 data volume" result.likwid.L2.txt | tail -n 1 | cut -d '|' -f 3` #Likwid wrongly reports MB in GB
77 | l2_gbytes=`python -c "print('{0:.3f}'.format($l2_mbytes/1000))"`
78 | runtime=`grep RDTSC result.likwid.L2.txt | tail -n 1 | cut -d '|' -f 6`
79 | l2_gbytess=`python -c "print('{0:.3f}'.format($l2_mbytess/1000))"`
80 | #echo "Runtime: $runtime s"
81 | echo "L2 Bytes: $l2_gbytes"
82 | #echo "L2 bandwidth: $l2_gbytess GB/s"
83 | #echo 
84 | #BYTES - L1
85 | #echo "Run with -g DATA (for L1 loads/stores uops)"
86 | uops_ld=`grep MEM_UOPS_RETIRED_ALL_LOADS result.likwid.DATA.txt | tail -n 1 | cut -d '|' -f 4`
87 | uops_st=`grep MEM_UOPS_RETIRED_ALL_STORES result.likwid.DATA.txt | tail -n 1 | cut -d '|' -f 4`
88 | runtime=`grep RDTSC result.likwid.DATA.txt | tail -n 1 | cut -d '|' -f 6`
89 | l1_gbytess=`python -c "print('{0:.3f}'.format(($uops_ld+$uops_st)/1000.0/1000/1000*64/$runtime))"`
90 | l1_gbytes=`python -c "print('{0:.3f}'.format(($uops_ld+$uops_st)/1000.0/1000/1000*64))"`
91 | #echo "Runtime: $runtime s"
92 | echo "L1 Bytes: $l1_gbytes"
93 | #echo "L1 bandwidth: $l1_gbytess GB/s"
94 | 
95 | 
96 | 
97 | 
98 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/batchjob.notools.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -n 1
 3 | #SBATCH -t 02:00:00
 4 | #SBATCH -A nstaff
 5 | #SBATCH -C knl,quad,cache
 6 | 
 7 | # Runs the un-instrumented gppCustomComplex survey binaries and appends
 8 | # their output to one results file per job.
 9 | 
10 | module swap craype-haswell craype-mic-knl
11 | 
12 | export OMP_NUM_THREADS=64
13 | export OMP_PROC_BIND=spread
14 | export OMP_PLACES=threads
15 | 
16 | module unload darshan
17 | 
18 | # Stop right away if the working directory is missing; otherwise the
19 | # results directory and the srun below would land in the wrong place.
20 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL || exit 1
21 | mkdir -p "Results_notools_survey_$SLURM_JOB_ID"
22 | seqend=1        # number of repetitions of the whole sweep
23 | #arr=(1 2 3 4 5 6)
24 | arr=(6)         # "iw" suffixes of the binaries to run
25 | 
26 | label='survey.div.cmplx.fma'
27 | res="Results_notools_survey_$SLURM_JOB_ID/results.$label"
28 | touch "$res"
29 | for i in $(seq 1 "$seqend")
30 | do 
31 |   for j in "${arr[@]}"
32 |   do 
33 |   srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 "./gppCustomComplex.ex.$label.iw$j" 512 2 32768 20 0 >> "$res"
34 |   done
35 | done


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/batchjob.sde.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -n 1
 3 | #SBATCH -t 01:00:00
 4 | #SBATCH -A nstaff
 5 | #SBATCH -C knl,quad,cache
 6 | #SBATCH --perf=vtune
 7 | 
 8 | module swap craype-haswell craype-mic-knl
 9 | 
10 | export OMP_NUM_THREADS=64
11 | export OMP_PROC_BIND=spread
12 | export OMP_PLACES=threads
13 | 
14 | module unload darshan
15 | export PATH=$PATH:/global/cfs/cdirs/nstaff/cjyang/P3HPC/Empirical_Roofline_Tool-1.1.0/new-GPP/BGW-Kernels/sde/sde-external-8.16.0-2018-01-30-lin/
16 | 
17 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL
18 | mkdir Results_sde_survey_$SLURM_JOB_ID
19 | seqend=1
20 | #arr=(1 2 3 4 5 6)
21 | arr=(6)
22 | 
23 | label='survey.tools.div.cmplx.fma'
24 | res=Results_sde_survey_$SLURM_JOB_ID/result.sde.out 
25 | touch $res
26 | for i in $(seq 1 $seqend)
27 | do 
28 |   for j in ${arr[@]}
29 |   do
30 |     srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 sde64 -knl -d -iform 1 -omix $res -global_region -start_ssc_mark 111:repeat -stop_ssc_mark 222:repeat -- ./gppCustomComplex.ex.$label.iw$j 512 2 32768 20 0   
31 |   done
32 | done
33 | 
34 | cd Results_sde_survey_$SLURM_JOB_ID/
35 | cp ../parse-sde2.sh .
36 | 
37 | echo ---------------------------------------
38 | echo "SDE on KNL"
39 | echo ---------------------------------------
40 | ./parse-sde2.sh result.sde.out > result.sde.out.parse
41 | flops=`grep 'Total FLOPs = ' result.sde.out.parse | cut -d '=' -f 2`
42 | gflops=`python -c "print('{0:.3f}'.format($flops/1000.0/1000/1000))"`
43 | echo GFLOPS: $gflops
44 | bytes=`grep 'Total Bytes = ' result.sde.out.parse | cut -d '=' -f 2`
45 | gbytes=`python -c "print('{0:.3f}'.format($bytes/1000.0/1000/1000))"`
46 | echo L1 Bytes: $gbytes
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/batchjob.vtune:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH -n 1
 3 | #SBATCH -t 03:00:00
 4 | #SBATCH -A nstaff
 5 | #SBATCH -C knl,quad,cache
 6 | #SBATCH --perf=vtune
 7 | 
 8 | module swap craype-haswell craype-mic-knl
 9 | 
10 | export OMP_NUM_THREADS=64
11 | export OMP_PROC_BIND=spread
12 | export OMP_PLACES=threads
13 | 
14 | module unload darshan
15 | module load vtune #/2018.up2
16 | 
17 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL
18 | mkdir Results_tools_$SLURM_JOB_ID
19 | seqend=1
20 | 
21 | label='survey.tools.div.cmplx.fma'
22 | res=Results_tools_$SLURM_JOB_ID/results.$label 
23 | touch $res
24 | for i in $(seq 1 $seqend)
25 | do 
26 |   srun -n 1 -c 272 --cpu_bind=cores --cpu-freq=1401000 vtune -start-paused -r Results_tools_$SLURM_JOB_ID/my-vtune.knl -collect memory-access -finalization-mode=none -data-limit=0 -- ./gppCustomComplex.ex.$label.iw6 512 2 32768 20 0 >> $res
27 | done
28 | 
29 | cd Results_tools_$SLURM_JOB_ID/
30 | cp ../parse-vtune.sh .
31 | cp ../parse-vtune.py .
32 | 
33 | vtune -report hw-events -group-by=package -r my-vtune.knl/ -format csv -csv-delimiter comma  > my-vtune.knl.summary
34 | 
35 | 
36 | echo ---------------------------------------
37 | echo "VTune on KNL"
38 | echo ---------------------------------------
39 | ./parse-vtune.sh my-vtune.knl.summary  > my-vtune.knl.summary.parse
40 | ddr_bytes=`grep 'Total Bytes = ' my-vtune.knl.summary.parse | tail -n 2 | head -n 1 | cut -d '=' -f 2`
41 | ddr_gbytes=`python -c "print('{0:.3f}'.format($ddr_bytes/1000.0/1000/1000))"`
42 | echo DDR Bytes: $ddr_gbytes
43 | hbm_bytes=`grep 'Total Bytes = ' my-vtune.knl.summary.parse | tail -n 1 | cut -d '=' -f 2`
44 | hbm_gbytes=`python -c "print('{0:.3f}'.format($hbm_bytes/1000.0/1000/1000))"`
45 | echo MCDRAM Bytes: $hbm_gbytes
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/compile.likwid.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | module swap craype-haswell craype-mic-knl
 3 | module load likwid
 4 | 
 5 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL
 6 | 
 7 | sed -i 's/#CXXFLAGS+=-I \/usr/CXXFLAGS+=-I \/usr/g' Makefile
 8 | sed -i 's/#LINKFLAGS+=-L \/usr/LINKFLAGS+=-L \/usr/g' Makefile
 9 | 
10 | label='survey.likwid.div.cmplx.fma'
11 | sed -i 's/CXXFLAGS+=.*fma/CXXFLAGS+=-fma/g' Makefile
12 | arr=(1 2 3 4 5 6)
13 | for i in ${arr[@]}
14 | do
15 |   sed -i "s/#define nend.*/#define nend $i/g" gppCustomComplex.cpp
16 |   sed -i "s/gppCustomComplex.ex.*/gppCustomComplex.ex.$label.iw$i/g" Makefile
17 |   make clean && make
18 | done
19 | 
20 | 
21 | sed -i 's/CXXFLAGS+=-I \/usr/#CXXFLAGS+=-I \/usr/g' Makefile
22 | sed -i 's/LINKFLAGS+=-L \/usr/#LINKFLAGS+=-L \/usr/g' Makefile
23 | 
24 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/compile.notools.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | module swap craype-haswell craype-mic-knl
 3 | 
 4 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL
 5 | 
 6 | label='survey.div.cmplx.fma'
 7 | sed -i 's/CXXFLAGS+=.*fma/CXXFLAGS+=-fma/g' Makefile
 8 | 
 9 | arr=(1 2 3 4 5 6)
10 | for i in ${arr[@]}
11 | do
12 |   sed -i "s/#define nend.*/#define nend $i/g" gppCustomComplex.cpp
13 |   sed -i "s/gppCustomComplex.ex.*/gppCustomComplex.ex.$label.iw$i/g" Makefile
14 |   make clean && make
15 | done 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/compile.tools.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | module swap craype-haswell craype-mic-knl
 3 | module load vtune
 4 | 
 5 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/KNL
 6 | 
 7 | sed -i 's/#CXXFLAGS+=-DTOOLS/CXXFLAGS+=-DTOOLS/g' Makefile
 8 | sed -i 's/#LINKFLAGS+=-L ${VTUNE/LINKFLAGS+=-L ${VTUNE/g' Makefile
 9 | 
10 | label='survey.tools.div.cmplx.fma'
11 | sed -i 's/CXXFLAGS+=.*fma/CXXFLAGS+=-fma/g' Makefile
12 | arr=(1 2 3 4 5 6)
13 | for i in ${arr[@]}
14 | do
15 |   sed -i "s/#define nend.*/#define nend $i/g" gppCustomComplex.cpp
16 |   sed -i "s/gppCustomComplex.ex.*/gppCustomComplex.ex.$label.iw$i/g" Makefile
17 |   make clean && make
18 | done
19 | 
20 | sed -i 's/CXXFLAGS+=-DTOOLS/#CXXFLAGS+=-DTOOLS/g' Makefile
21 | sed -i 's/LINKFLAGS+=-L ${VTUNE/#LINKFLAGS+=-L ${VTUNE/g' Makefile
22 | 
23 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/parse-vtune.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | file='my-vtune.knl.summary'
 3 | df=pd.read_csv(file)
 4 | 
 5 | total_read=df.filter(regex='UNC_M_CAS_COUNT.RD').sum(axis=1)*64
 6 | total_write=df.filter(regex='UNC_M_CAS_COUNT.WR').sum(axis=1)*64
 7 | print('--->DDR Report')
 8 | print('--->Total Bytes read = '+str(total_read[0]))
 9 | print('--->Total Bytes written = '+str(total_write[0]))
10 | print('--->Total Bytes = '+str(total_read[0] + total_write[0] ))
11 | 
12 | total_read=df.filter(regex='UNC_E_RPQ_INSERTS').sum(axis=1)*64
13 | total_write=df.filter(regex='UNC_E_WPQ_INSERTS').sum(axis=1)*64
14 | print('--->MCDRAM Report')
15 | print('--->Total Bytes read = '+str(total_read[0]))
16 | print('--->Total Bytes written = '+str(total_write[0]))
17 | print('--->Total Bytes = '+str(total_read[0] + total_write[0]))
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/KNL/parse-vtune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | # Parses a VTune summary report for uncore memory access counts by
3 | # loading the python module and running parse-vtune.py from the current directory.
4 | # Note: arguments given to this script are not forwarded to parse-vtune.py.
5 | module load python
6 | python ./parse-vtune.py
7 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/Volta/Makefile:
--------------------------------------------------------------------------------
 1 | EXE = gppKer_gpuComplex.ex.survey.div.cmplx.fma.iw6
 2 | SRC1 = gppKer_gpuComplex.cpp 
 3 | SRC2 = GPUComplex.cu
 4 | #EXE = gppKer_double.ex
 5 | #SRC1 = gppKer_double.cpp
 6 | #SRC2 = GPUComplex_double.cu
 7 | 
 8 | 
 9 | #CXX = xlc++
10 | #CXX = CC 
11 | #CXX = g++
12 | #CXX = clang++
13 | CXX = nvcc
14 | 
15 | LINK = ${CXX}
16 | 
17 | ifeq ($(CXX),nvcc)
18 | 	CXXFLAGS= -g -O3 -std=c++11 -Wno-deprecated-gpu-targets -arch=sm_70 
19 | #	CXXFLAGS+=-Xptxas -v --maxrregcount=150 #output usage of registers
20 | 	CXXFLAGS+=-fmad=true
21 | 	LINKFLAGS=-Wno-deprecated-gpu-targets
22 | endif 
23 | 
24 | ifeq ($(CXX),g++)
25 | 	CXXFLAGS= -g -O3 -std=c++11 -fopenmp -foffload="-lm" -foffload=nvptx-none
26 | 	LINKFLAGS=-fopenmp
27 | endif 
28 | 
29 | ifeq ($(CXX),xlc++)
30 | 	CXXFLAGS=-O3 -std=gnu++11 -g -qsmp=noauto:omp -qoffload #-Xptxas -v
31 | 	LINKFLAGS=-qsmp=noauto:omp -qoffload 
32 | endif 
33 | 
34 | ifeq ($(CXX),clang++)
35 | 	CXXFLAGS=-O3 -std=gnu++11 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME}
36 | 	LINKFLAGS=-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_HOME}
37 | endif 
38 | # Intel icc: OpenMP build with an optimization report; -x picks the target instruction set.
39 | ifeq ($(CXX),icc)
40 | 	CXXFLAGS=-O3 -qopenmp -qopt-report=5
41 | 	CXXFLAGS+=-xCORE-AVX2
42 | #	CXXFLAGS+=-xMIC-AVX512
43 | 	LINKFLAGS=-qopenmp
44 | endif 
45 | 
46 | OBJ1 = $(SRC1:.cpp=.o)
47 | OBJ2 = $(SRC2:.cu=.o)
48 | 
49 | $(EXE): $(OBJ1) $(OBJ2) 
50 | 	$(CXX) $(OBJ1) $(OBJ2) -o $(EXE) $(LINKFLAGS)
51 | 
52 | $(OBJ1): $(SRC1) 
53 | 	$(CXX) -c $(SRC1) $(CXXFLAGS)
54 | 
55 | $(OBJ2): $(SRC2) 
56 | 	$(CXX) -c $(SRC2) $(CXXFLAGS)
57 | 
58 | clean: 
59 | 	rm -f $(OBJ1) $(OBJ2) $(EXE)
60 | 
61 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/Volta/compile.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/Volta
 4 | 
 5 | module load cuda
 6 | 
 7 | label='survey.div.cmplx.fma'
 8 | sed -i 's/fmad=.*/fmad=true/g' Makefile
 9 | 
10 | #arr=(1 2 3 4 5 6)
11 | arr=(6)
12 | for i in ${arr[@]}
13 | do
14 |   sed -i "s/#define nend.*/#define nend $i/g" GPUComplex.h
15 |   sed -i "s/gppKer_gpuComplex.ex.*/gppKer_gpuComplex.ex.$label.iw$i/g" Makefile
16 |   make clean && make
17 | done
18 | 
19 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/GPP/Volta/run.survey:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #module load cuda
 4 | export CUDA_VISIBLE_DEVICES=0
 5 | 
 6 | cd /global/cscratch1/sd/cjyang/arxiv-likwid/nersc-roofline/GPP/Volta
 7 | mkdir Results_survey
 8 | seqend=1
 9 | #arr=(1 2 3 4 5 6)
10 | arr=(6)
11 | 
12 | label='survey.div.cmplx.fma'
13 | res=Results_survey/results.$label 
14 | touch $res
15 | for i in $(seq 1 $seqend)
16 | do 
17 |   for j in ${arr[@]}
18 |   do 
19 |   ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res 
20 |   done
21 | done
22 | # Metric lists passed to nvprof (metnv), nv-nsight-cu-cli (metncu10) and ncu (metncu11) below.
23 | metnv='flop_count_dp,flop_count_sp,flop_count_hp,gld_transactions,gst_transactions,atomic_transactions,local_load_transactions,local_store_transactions,shared_load_transactions,shared_store_transactions,l2_read_transactions,l2_write_transactions,dram_read_transactions,dram_write_transactions,system_read_transactions,system_write_transactions'
24 | metncu10='sm__cycles_elapsed.avg,sm__cycles_elapsed.avg.per_second,sm__sass_thread_inst_executed_op_dadd_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_thread_inst_executed_op_fadd_pred_on.sum,sm__sass_thread_inst_executed_op_fmul_pred_on.sum,sm__sass_thread_inst_executed_op_ffma_pred_on.sum,sm__sass_thread_inst_executed_op_hadd_pred_on.sum,sm__sass_thread_inst_executed_op_hmul_pred_on.sum,sm__sass_thread_inst_executed_op_hfma_pred_on.sum,sm__inst_executed_pipe_tensor.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum,l1tex__t_set_accesses_pipe_lsu_mem_global_op_red.sum,l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom.sum,l1tex__t_set_accesses_pipe_tex_mem_surface_op_red.sum,l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum,lts__t_sectors_op_read.sum,lts__t_sectors_op_write.sum,lts__t_sectors_op_atom.sum,lts__t_sectors_op_red.sum,dram__sectors_read.sum,dram__sectors_write.sum'
25 | metncu11='sm__cycles_elapsed.avg,sm__cycles_elapsed.avg.per_second,sm__sass_thread_inst_executed_op_dadd_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_fadd_pred_on.sum,sm__sass_thread_inst_executed_op_ffma_pred_on.sum,sm__sass_thread_inst_executed_op_fmul_pred_on.sum,sm__sass_thread_inst_executed_op_hadd_pred_on.sum,sm__sass_thread_inst_executed_op_hfma_pred_on.sum,sm__sass_thread_inst_executed_op_hmul_pred_on.sum,sm__inst_executed_pipe_tensor.sum,l1tex__t_bytes.sum,lts__t_bytes.sum,dram__bytes.sum'
26 | 
27 | 
28 | 
29 | label='survey.div.cmplx.fma'
30 | 
31 | j=6
32 | module load cuda/10.2.89
33 | res=Results_survey/results.nvprof.$label 
34 | touch $res
35 | which nvprof
36 | nvprof --version 
37 | srun -n1 nvprof --kernels "NumBandNgpown_kernel" --metrics $metnv ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res
38 | 
39 | res=Results_survey/results.ncu10.$label 
40 | touch $res
41 | which nv-nsight-cu-cli
42 | nv-nsight-cu-cli -v
43 | srun -n1 nv-nsight-cu-cli  -k "NumBandNgpown_kernel" --metrics $metncu10 ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res
44 | 
45 | module load nsight-compute/2020.1.0
46 | res=Results_survey/results.ncu11.$label 
47 | touch $res
48 | which ncu
49 | ncu -v
50 | srun -n1 ncu -k "NumBandNgpown_kernel" --metrics $metncu11 ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 >> $res
51 | 
52 | srun -n1 ncu -k "NumBandNgpown_kernel" -o ncu.prof --section-folder ./ncu-section-files --section SpeedOfLight_HierarchicalDoubleRooflineChart ./gppKer_gpuComplex.ex.$label.iw$j 512 2 32768 20 0 
53 | 
54 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.pdf


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/3rd_party/nersc-roofline-master/Plotting/plot_plot_roofline_py.png


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/Plotting/plot_roofline.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib
  3 | matplotlib.use('Agg')
  4 | import matplotlib.pyplot as plt
  5 | import sys
  6 | import os
  7 | import matplotlib.patches as mpatches
  8 | font = { 'size'   : 20}
  9 | plt.rc('font', **font)
 10 | 
 11 | filename = 'plot_' + sys.argv[0].replace('.','_')
 12 | markersize = 16
 13 | colors = ['b','g','r','y','m','c']
 14 | styles = ['o','s','v','^','D',">","<","*","h","H","+","1","2","3","4","8","p","d","|","_",".",","]
 15 | 
 16 | f = open(sys.argv[1], "r")
 17 | 
 18 | for line in f:
 19 |   if 'memroofs' in line:
 20 |     linesp = line.split()
 21 |     linesp = linesp[1:]
 22 |     smemroofs = [float(a) for a in linesp]
 23 |     print 'memroofs', smemroofs
 24 |   if 'mem_roof_names' in line:
 25 |     linesp = line.strip().split("\'")
 26 |     linesp = filter(lambda a: (a != ' ') and (a != ''), linesp)
 27 |     smem_roof_name  = linesp[1:]
 28 |     print 'mem_roof_names', smem_roof_name
 29 |   if 'comproofs' in line:
 30 |     linesp = line.split()
 31 |     linesp = linesp[1:]
 32 |     scomproofs  = [float(a) for a in linesp]
 33 |     print 'comproofs', scomproofs
 34 |   if 'comp_roof_names' in line:
 35 |     linesp = line.strip().split("\'")
 36 |     linesp = filter(lambda a: (a != ' ') and (a != ''), linesp)
 37 |     scomp_roof_name  = linesp[1:]
 38 |     print 'comp_roof_names', scomp_roof_name
 39 |   if 'AI' in line:
 40 |     linesp = line.split()
 41 |     linesp = linesp[1:]
 42 |     AI = [float(a) for a in linesp]
 43 |     print 'AI', AI
 44 |   if 'FLOPS' in line:
 45 |     linesp = line.split()
 46 |     linesp = linesp[1:]
 47 |     FLOPS = [float(a) for a in linesp]
 48 |     print 'FLOPS', FLOPS
 49 |   if 'labels' in line:
 50 |     linesp=line.strip().split("\'")
 51 |     linesp = filter(lambda a: (a != ' ') and (a != ''), linesp)
 52 |     labels = linesp[1:]
 53 |     print 'labels', labels
 54 | 
 55 | 
 56 | fig = plt.figure(1,figsize=(10.67,6.6))
 57 | plt.clf()
 58 | ax = fig.gca()
 59 | ax.set_xscale('log')
 60 | ax.set_yscale('log')
 61 | ax.set_xlabel('Arithmetic Intensity [FLOPs/Byte]')
 62 | ax.set_ylabel('Performance [GFLOP/sec]')
 63 | # Sampling resolution and plot window: x limits are exponents of ten, y limits are raw values.
 64 | nx = 10000
 65 | xmin = -1
 66 | xmax = 2
 67 | ymin = 50 
 68 | ymax = 30000
 69 | # Apply the window to the axes.
 70 | ax.set_xlim(10**xmin, 10**xmax)
 71 | ax.set_ylim(ymin, ymax)
 72 | # ixx: sample offset (2% of nx) used further down to position the roof labels.
 73 | ixx = int(nx*0.02)
 74 | xlim = ax.get_xlim()
 75 | ylim = ax.get_ylim()
 76 | # Elbow = x sample (and its index) just before a memory-roof line reaches a compute roof.
 77 | scomp_x_elbow = [] 
 78 | scomp_ix_elbow = [] 
 79 | smem_x_elbow = [] 
 80 | smem_ix_elbow = [] 
 81 | # Log-spaced x samples; for each compute roof, find where the first memory roof crosses it.
 82 | x = np.logspace(xmin,xmax,nx)
 83 | for roof in scomproofs:
 84 |     for ix in range(1,nx):
 85 |         if smemroofs[0] * x[ix] >= roof and smemroofs[0] * x[ix-1] < roof:
 86 |             scomp_x_elbow.append(x[ix-1])
 87 |             scomp_ix_elbow.append(ix-1)
 88 |             break
 89 | # Same search per memory roof, always against the first compute roof.
 90 | # A roof whose crossing lies outside the sampled range gets no entry in the elbow lists.
 91 | for roof in smemroofs:
 92 |     for ix in range(1,nx):
 93 |         if (scomproofs[0] <= roof * x[ix] and scomproofs[0] > roof * x[ix-1]):
 94 |             smem_x_elbow.append(x[ix-1])
 95 |             smem_ix_elbow.append(ix-1)
 96 |             break        
 97 | 
 98 | for i in range(0,len(scomproofs)):
 99 |     y = np.ones(len(x)) * scomproofs[i]  
100 |     ax.plot(x[scomp_ix_elbow[i]:],y[scomp_ix_elbow[i]:],c='k',ls='-',lw='2')
101 | 
102 | for i in range(0,len(smemroofs)):
103 |     y = x * smemroofs[i]  
104 |     ax.plot(x[:smem_ix_elbow[i]+1],y[:smem_ix_elbow[i]+1],c='k',ls='-',lw='2')
105 | 
106 | 
107 | marker_handles = list()
108 | for i in range(0,len(AI)):
109 |   ax.plot(float(AI[i]),float(FLOPS[i]),c=colors[i],marker=styles[i],linestyle='None',ms=markersize,label=labels[i])
110 |   marker_handles.append(ax.plot([],[],c=colors[i],marker=styles[i],linestyle='None',ms=markersize,label=labels[i])[0]) 
111 | 
112 | for roof in scomproofs:
113 |     ax.text(x[-ixx],roof,
114 |             scomp_roof_name[scomproofs.index(roof)] + ': ' + '{0:.1f}'.format(float(roof)) + ' GFLOP/s',
115 |             horizontalalignment='right',
116 |             verticalalignment='bottom')
117 | 
118 | for roof in smemroofs:
119 |     ang = np.arctan(np.log10(xlim[1]/xlim[0]) / np.log10(ylim[1]/ylim[0]) 
120 |                                  * fig.get_size_inches()[1]/fig.get_size_inches()[0] )
121 |     ax.text(x[ixx],x[ixx]*roof*(1+0.25*np.sin(ang)**2),
122 |             smem_roof_name[smemroofs.index(roof)] + ': ' + '{0:.1f}'.format(float(roof)) + ' GB/s',
123 |             horizontalalignment='left',
124 |             verticalalignment='bottom',
125 |             rotation=180/np.pi*ang)
126 | 
127 | 
128 | leg1 = plt.legend(handles = marker_handles,loc=4, ncol=1)
129 | ax.add_artist(leg1)
130 | 
131 | plt.savefig(filename+'.png')
132 | plt.savefig(filename+'.eps')
133 | plt.savefig(filename+'.pdf')
134 | #plt.show()
135 | 
136 | 
137 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/README.md:
--------------------------------------------------------------------------------
 1 | # nersc-roofline
 2 | 
 3 | This repo contains files necessary to generate results here 
 4 | 
 5 | https://docs.nersc.gov/programming/performance-debugging-tools/roofline/.
 6 | 
 7 | and for the following two papers
 8 | 
 9 | C. Yang, R. Gayatri, T. Kurth, P. Basu, Z. Ronaghi, A. Adetokunbo, B. Friesen, B. Cook, D. Doerfler, L. Oliker et al., An Empirical Roofline Methodology for Quantitatively Assessing Performance Portability, in 2018 IEEE/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC). IEEE, 2018, pp. 14-23.
10 | 
11 | C. Yang, Hierarchical Roofline Analysis: How to Collect Data using Performance Tools on Intel CPUs and NVIDIA GPUs, arXiv.org
12 | 
13 | The data collection methodology for Roofline analysis on NVIDIA GPUs has been updated here
14 | 
15 | https://gitlab.com/NERSC/roofline-on-nvidia-gpus
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/.gitignore:
--------------------------------------------------------------------------------
1 | stream_mpi.exe
2 | results*
3 | 
4 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/Makefile:
--------------------------------------------------------------------------------
 1 | MPICC = cc
 2 | CFLAGS = -g -O3 -dynamic -qopenmp -restrict -qopt-streaming-stores always  \
 3 |          -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=50 \
 4 |          -I$(VTUNE_AMPLIFIER_XE_2018_DIR)/include
 5 | LDFLAGS = -L$(VTUNE_AMPLIFIER_XE_2018_DIR)/lib64 -littnotify
 6 | 
 7 | stream_mpi.exe: stream_mpi.c Makefile
 8 | 	$(MPICC) $(CFLAGS) stream_mpi.c -o stream_mpi.exe $(LDFLAGS)
 9 | 
10 | clean:
11 | 	rm -f stream_mpi.exe
12 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/READ.ME:
--------------------------------------------------------------------------------
  1 | ===============================================
  2 | 
  3 | STREAM is the de facto industry standard benchmark
  4 | for measuring sustained memory bandwidth.
  5 | 
  6 | Documentation for STREAM is on the web at:
  7 |    http://www.cs.virginia.edu/stream/ref.html
  8 | 
  9 | ===============================================
 10 | NEWS
 11 | ===============================================
 12 | UPDATE: October 28 2014:
 13 | 
 14 | "stream_mpi.c" released in the Versions directory.
 15 | 
 16 | Based on Version 5.10 of stream.c, stream_mpi.c
 17 | brings the following new features:
 18 | * MPI implementation that *distributes* the arrays
 19 |   across all MPI ranks. (The older Fortran version
 20 |   of STREAM in MPI *replicates* the arrays across
 21 |   all MPI ranks.)
 22 | * Data is allocated using "posix_memalign" 
 23 |   rather than using static arrays.  Different
 24 |   compiler flags may be needed for both portability
 25 |   and optimization.
 26 |   See the READ.ME file in the Versions directory
 27 |   for more details.
 28 | * Error checking and timing done by all ranks and
 29 |   gathered by rank 0 for processing and output.
 30 | * Timing code uses barriers to ensure correct
 31 |   operation even when multiple MPI ranks run on
 32 |   shared memory systems.
 33 | 
 34 | NOTE: MPI is not a preferred implementation for
 35 |   STREAM, which is intended to measure memory
 36 |   bandwidth in shared-memory systems.  In stream_mpi,
 37 |   the MPI calls are only used to properly synchronize
 38 |   the timers (using MPI_Barrier) and to gather
 39 |   timing and error data, so the performance should 
 40 |   scale linearly with the size of the cluster.
 41 |   But it may be useful, and was an interesting 
 42 |   exercise to develop and debug.
 43 | 
 44 | ===============================================
 45 | UPDATE: January 17 2013:
 46 | 
 47 | Version 5.10 of stream.c is finally available!
 48 | 
 49 | There are no changes to what is being measured, but
 50 | a number of long-awaited improvements have been made:
 51 | 
 52 | * Updated validation code does not suffer from 
 53 |   accumulated roundoff error for large arrays.
 54 | * Defining the preprocessor variable "VERBOSE"
 55 |   when compiling will (1) cause the code to print the
 56 |   measured average relative absolute error (rather than
 57 |   simply printing "Solution Validates"), and (2) print
 58 |   the first 10 array entries with relative error exceeding
 59 |   the error tolerance.
 60 | * Array index variables have been upgraded from
 61 |   "int" to "ssize_t" to allow arrays with more
 62 |   than 2 billion elements on 64-bit systems.
 63 | * Substantial improvements to the comments in 
 64 |   the source on how to configure/compile/run the
 65 |   benchmark.
 66 | * The preprocessor variable controlling the array
 67 |   size has been changed from "N" to "STREAM_ARRAY_SIZE".
 68 | * A new preprocessor variable "STREAM_TYPE" can be
 69 |   used to override the data type from the default
 70 |   "double" to "float".
 71 |   This mechanism could also be used to change to 
 72 |   non-floating-point types, but several "printf"
 73 |   statements would need to have their formats changed
 74 |   to accommodate the modified data type.
 75 | * Some small changes in output, including printing
 76 |   array sizes in GiB as well as MiB.
 77 | * Change to the default output format to print fewer
 78 |   decimals for the bandwidth and more decimals for
 79 |   the min/max/avg execution times.
 80 | 
 81 | 
 82 | ===============================================
 83 | UPDATE: February 19 2009:
 84 | 
 85 | The most recent "official" versions have been renamed
 86 | "stream.f" and "stream.c" -- all other versions have
 87 | been moved to the "Versions" subdirectory and should be
 88 | considered obsolete.
 89 | 
 90 | The "official" timer (was "second_wall.c") has been
 91 | renamed "mysecond.c".   This is embedded in the C version
 92 | ("stream.c"), but still needs to be externally linked to
 93 | the FORTRAN version ("stream.f").  The new version defines
 94 | entry points both with and without trailing underscores,
 95 | so it *should* link automagically with any Fortran compiler.
 96 | 
 97 | ===============================================
 98 | 
 99 | STREAM is a project of "Dr. Bandwidth":
100 | 	John D. McCalpin, Ph.D.
101 | 	john@mccalpin.com
102 | 
103 | ===============================================
104 | 
105 | The STREAM web and ftp sites are currently hosted at
106 | the Department of Computer Science at the University of
107 | Virginia under the generous sponsorship of Professor Bill
108 | Wulf and Professor Alan Batson.
109 | 
110 | ===============================================
111 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/README.md:
--------------------------------------------------------------------------------
1 | # stream-ai-example
2 | 
3 | This directory contains all the files necessary to illustrate calculating 
4 | arithmetic intensity using Intel's SDE and VTune tools. 
5 | 
6 | For more information, see:
7 | 
8 | https://docs.nersc.gov/performance/arithmetic_intensity/
9 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/fortran_itt_sde/Makefile:
--------------------------------------------------------------------------------
 1 | ##
 2 | # Builds jacobi.x together with the ITT / SDE marker wrappers.
 3 | # Environment expected before running make:
 4 | #   module load vtune
 5 | #   module load sde
 6 | #   module unload darshan
 7 | #
 8 | FC=ftn
 9 | CC=cc
10 | FFLAGS= -g -dynamic -O3 -openmp
11 | CFLAGS= -g -I $(VTUNE_AMPLIFIER_XE_2016_DIR)/include
12 | # Static ITT notify library; it is passed last on the link line.
13 | LDFLAGS=$(VTUNE_AMPLIFIER_XE_2016_DIR)/lib64/libittnotify.a
14 | EXE=jacobi.x
15 | COBJ= api_itt_sde.o
16 | F90OBJ= module_itt_sde.o
17 | 
18 | # None of these targets produces a file with its own name.
19 | .PHONY: default jacobi clean
20 | default: jacobi
21 | clean:
22 | 	rm -f *.o *.mod $(EXE)
23 | %.o:   %.c
24 | 	$(CC) $(CFLAGS) -c $<
25 | %.o:    %.f90
26 | 	$(FC) $(FFLAGS) -c $<
27 | # Presumably jacobi.f90 USEs the module, so build the module first
28 | # (matters for `make -j`); TODO confirm against jacobi.f90.
29 | jacobi.o: $(F90OBJ)
30 | jacobi: $(COBJ) $(F90OBJ) jacobi.o
31 | 	$(FC) $(FFLAGS) -o $(EXE) $(COBJ) $(F90OBJ) jacobi.o $(LDFLAGS)
32 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/fortran_itt_sde/api_itt_sde.c:
--------------------------------------------------------------------------------
 1 | #include "ittnotify.h"
 2 | 
 3 | /*
 4 |  * C entry points called from Fortran through BIND(C) interfaces.
 5 |  * Each takes no arguments; (void) makes every definition a full prototype.
 6 |  */
 7 | 
 8 | /* Emits SSC mark 0x111 through the __SSC_MARK macro. */
 9 | void fortran_sde_start(void)
10 | {
11 |   __SSC_MARK(0x111);
12 | }
13 | 
14 | /* Emits SSC mark 0x222 through the __SSC_MARK macro. */
15 | void fortran_sde_stop(void)
16 | {
17 |   __SSC_MARK(0x222);
18 | }
19 | 
20 | /* Forwards to __itt_resume(). */
21 | void fortran_itt_resume(void)
22 | {
23 |   __itt_resume();
24 | }
25 | 
26 | /* Forwards to __itt_pause(). */
27 | void fortran_itt_pause(void)
28 | {
29 |   __itt_pause();
30 | }
31 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/fortran_itt_sde/module_itt_sde.f90:
--------------------------------------------------------------------------------
 1 | ! Fortran-side bindings for four argument-less C routines, plus two
 2 | ! helpers that switch SDE and ITT collection on and off together.
 3 | MODULE ITT_SDE_FORTRAN
 4 | USE, INTRINSIC :: ISO_C_BINDING
 5 | IMPLICIT NONE
 6 | 
 7 | ! Each interface body binds to the C symbol given in NAME=.
 8 | INTERFACE
 9 |    
10 |    SUBROUTINE FORTRAN_ITT_RESUME() &
11 |       BIND(C, NAME='fortran_itt_resume')
12 |    END SUBROUTINE FORTRAN_ITT_RESUME
13 | 
14 |    SUBROUTINE FORTRAN_ITT_PAUSE() &
15 |       BIND(C, NAME='fortran_itt_pause')
16 |    END SUBROUTINE FORTRAN_ITT_PAUSE
17 | 
18 |    SUBROUTINE FORTRAN_SDE_START() &
19 |       BIND(C, NAME='fortran_sde_start')
20 |    END SUBROUTINE FORTRAN_SDE_START
21 | 
22 |    SUBROUTINE FORTRAN_SDE_STOP() &
23 |       BIND(C, NAME='fortran_sde_stop')
24 |    END SUBROUTINE FORTRAN_SDE_STOP
25 | END INTERFACE
26 | 
27 | contains
28 | 
29 |    ! Begin a measured region: SDE start first, then ITT resume.
30 |    subroutine start_collection()
31 |      call fortran_sde_start()
32 |      call fortran_itt_resume()
33 |    end subroutine start_collection
34 | 
35 |    ! End a measured region in the reverse order: ITT pause, then SDE stop.
36 |    subroutine stop_collection() 
37 |     call fortran_itt_pause()
38 |     call fortran_sde_stop()
39 |    end subroutine stop_collection
40 | 
41 | END MODULE ITT_SDE_FORTRAN
42 | 


--------------------------------------------------------------------------------
/3rd_party/nersc-roofline-master/stream-ai-example/stream-ai.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --account=m888  	# Your repo goes here
 3 | #SBATCH --qos=debug
 4 | #SBATCH --nodes=1
 5 | #SBATCH --time=00:30:00
 6 | #SBATCH --job-name=stream-ai
 7 | 
 8 | # VTune SEP driver is required
 9 | #SBATCH --perf=vtune
10 | 
11 | # for Cori, set appropriately. Not required for Edison
12 | #SBATCH --constraint="knl"
13 | ##SBATCH --constraint="haswell"
14 | 
15 | ### start of script configuration parameters
16 | 
17 | # set to yes or no to select tests to run
18 | run_baseline=yes
19 | run_sde=yes
20 | run_vtune=yes
21 | 
22 | # Select the SDE flag and rank/thread layout: -knl (Cori KNL), -hsw (Cori Haswell), -ivb (Edison)
23 | if [ "$NERSC_HOST" == "cori" ]; then
24 |   if [ "$SLURM_JOB_CPUS_PER_NODE" == "272" ]; then # KNL node
25 |     myhost=cori_knl
26 |     SDE='sde -knl'
27 |     # one MPI rank per node
28 |     n=$SLURM_JOB_NUM_NODES
29 |     # number of threads/rank and virtual cores (needed for srun's -c)
30 |     t=64
31 |     vcores=$(( t * 4 ))
32 |   else                                             # Haswell node
33 |     myhost=cori_hsw
34 |     SDE='sde -hsw'
35 |     n=$(( 2 * $SLURM_JOB_NUM_NODES ))
36 |     t=16
37 |     vcores=$(( t * 2 ))
38 |   fi
39 | elif [ "$NERSC_HOST" == "edison" ]; then
40 |   myhost=edison
41 |   SDE='sde -ivb'
42 |   n=$(( 2 * $SLURM_JOB_NUM_NODES ))
43 |   t=12
44 |   vcores=$(( t * 2 ))
45 | else  # unknown host: n, t, vcores and SDE would be unset, so stop before any srun
46 |   echo "Unsupported NERSC_HOST '${NERSC_HOST}': expected cori or edison" >&2
47 |   exit 1
48 | fi
49 | 
50 | #module load sde	# requires version 8.4.0 or later
51 | #module load vtune	# script setup for Vtune 2017 or later
52 | 
53 | ### End of configuration parameters
54 | 
55 | echo "Running with $n MPI ranks and $t threads"
56 | export OMP_NUM_THREADS=$t
57 | suffix=${n}p${t}t_${SLURM_JOB_ID}   # tags every output name with layout and job id
58 | exe=./stream_mpi.exe
59 | 
60 | if [ "$run_baseline" == "yes" ]; then   # plain run, no instrumentation
61 |   echo ""
62 |   echo "--------------------------------------------------"
63 |   echo "----->> Running Stream w/o Instrumentation <<-----"
64 |   echo "--------------------------------------------------"
65 |   srun -n "$n" -c "$vcores" --cpu_bind=cores "$exe"
66 | fi
67 | 
68 | if [ "$run_sde" == "yes" ]; then   # $SDE stays unquoted below: it holds "sde <flag>" and must word-split
69 |   echo ""
70 |   echo "--------------------------------------------------"
71 |   echo "----->> Running w/SDE <<-----"
72 |   echo "--------------------------------------------------"
73 |   srun -n "$n" -c "$vcores" --cpu_bind=cores $SDE -d -iform 1 -omix sde_${suffix}.out -i -top_blocks 500 -global_region -start_ssc_mark 111:repeat -stop_ssc_mark 222:repeat -- "$exe"
74 |   echo "----->> Generating SDE Report <<-----"
75 |   echo "For performance, the SDE report is best done on an external login node"
76 |   echo "Run the following command: "
77 |   echo "\$ ./parse-sde.sh sde_${suffix}.out*"
78 | fi
79 | 
80 | if [ "$run_vtune" == "yes" ]; then   # collection only; finalize/report commands are printed, not run
81 |   echo ""
82 |   echo "--------------------------------------------------"
83 |   echo "----->> Running w/Vtune <<-----"
84 |   echo "--------------------------------------------------"
85 |   srun -n "$n" -c "$vcores" --cpu_bind=cores amplxe-cl -start-paused -r vtbw_${suffix} -collect memory-access -finalization-mode=none -trace-mpi -- "$exe"
86 |   echo "----->> Finalizing VTune and generating report <<-----"
87 |   echo "For performance, the finalize and report are best done on an external login node"
88 |   echo "Run the following commands: "
89 |   echo "Note that if using Vtune version 2017 replace \"-report hw-events -group-by=package\" with \"-report summary\" "
90 |   if [ "$myhost" == "cori_knl" ]; then   # quoted so an unset myhost is a clean mismatch, not a syntax error
91 |     echo "\$ amplxe-cl -report hw-events -group-by=package -r vtbw_${suffix} -column=UNC_M_CAS_COUNT,UNC_E_RPQ_INSERTS,UNC_E_WPQ_INSERTS -format=csv -csv-delimiter=comma > vtbw_${suffix}.summary"
92 |   else
93 |     echo "\$ amplxe-cl -report hw-events -group-by=package -r vtbw_${suffix} -column=UNC_M_CAS_COUNT -format=csv -csv-delimiter=comma > vtbw_${suffix}.summary"
94 |   fi
95 |   echo "\$ ./parse-vtune2018.sh vtbw_${suffix}.summary"
96 | fi
97 | 
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SpDNN Graph_challenge
 2 | Source code for the Sparse Deep Neural Network Graph Challenge (more details: http://graphchallenge.mit.edu/challenges).
 3 | 
 4 | 
 5 | ## Getting Started
 6 | First, clone the project and download the dataset.
 7 | ```
 8 | git clone https://github.com/CGCL-codes/Graphchallenge21.git
 9 | cd Graphchallenge21
10 | mkdir data/
11 | wget https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/dnn/neuron1024.tar.gz
12 | wget https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/mnist/sparse-images-1024.tsv.gz
13 | tar -xzf neuron1024.tar.gz
14 | gunzip sparse-images-1024.tsv.gz
15 | ```
16 | Then, compile and run the single-GPU version.
17 | ```
18 | cd src/
19 | nvcc -std=c++11 -O3 -o single.out network.cpp ./microbenchmark/all_network.cu
20 | ./single.out 1024 6000 120
21 | ```
22 | 
23 | 


--------------------------------------------------------------------------------
/benchmark/cublas/cublas.cu:
--------------------------------------------------------------------------------
  1 | #include "cuda_runtime.h"
  2 | #include "cublas_v2.h"
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <iostream>
  6 | #include <ctime>
  7 | 
  8 | 
  9 | #include <sys/time.h>
 10 | 
 11 | using namespace std;
 12 | 
 13 | // CHECK(call): evaluate a cudaError_t expression once; on failure print file:line, code and reason, then exit(1).
 14 | #define CHECK(call)                                                            \
 15 | do {                                                                           \
 16 |     const cudaError_t error = (call);                                          \
 17 |     if (error != cudaSuccess)                                                  \
 18 |     {                                                                          \
 19 |         fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
 20 |         fprintf(stderr, "code: %d, reason: %s\n", error,                       \
 21 |                 cudaGetErrorString(error));                                    \
 22 |         exit(1);                                                               \
 23 |     }                                                                          \
 24 | } while (0)
 25 | // Returns wall-clock time from gettimeofday() as seconds (microsecond resolution).
 26 | inline double seconds()
 27 | {
 28 |     struct timeval tp;
 29 |     // The timezone argument is obsolete, so NULL is passed for it.
 30 |     gettimeofday(&tp, NULL);
 31 |     return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
 32 | }
 33 | 
 34 | 
 35 | // Times one SGEMM (C = A*B, row-major host layout) with CUDA events and
 36 | // prints the elapsed seconds, the derived rate and the first element of C.
 37 | int main()
 38 | {
 39 |     srand(time(0));
 40 |     const int M = 2000;     // rows of A, rows of C
 41 |     const int N = 16384;    // columns of A, rows of B
 42 |     const int K = 16384;    // columns of B, columns of C
 43 | 
 44 |     // Element counts are kept in size_t so byte sizes never pass through int.
 45 |     const size_t countA = (size_t)M * N;
 46 |     const size_t countB = (size_t)N * K;
 47 |     const size_t countC = (size_t)M * K;
 48 | 
 49 |     float *h_A = (float*)malloc(sizeof(float) * countA);
 50 |     float *h_B = (float*)malloc(sizeof(float) * countB);
 51 |     float *h_C = (float*)malloc(sizeof(float) * countC);
 52 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
 53 |         fprintf(stderr, "Error: host allocation failed\n");
 54 |         exit(1);
 55 |     }
 56 | 
 57 |     // Deterministic inputs: every element holds its own linear index.
 58 |     for (size_t i = 0; i < countA; i++) {
 59 |         h_A[i] = (float)i;
 60 |     }
 61 |     for (size_t i = 0; i < countB; i++) {
 62 |         h_B[i] = (float)i;
 63 |     }
 64 |     cout << endl;
 65 | 
 66 |     float *d_A, *d_B, *d_C;
 67 |     CHECK(cudaMalloc((void**)&d_A, sizeof(float) * countA));
 68 |     CHECK(cudaMalloc((void**)&d_B, sizeof(float) * countB));
 69 |     CHECK(cudaMalloc((void**)&d_C, sizeof(float) * countC));
 70 |     CHECK(cudaMemcpy(d_A, h_A, sizeof(float) * countA, cudaMemcpyHostToDevice));
 71 |     CHECK(cudaMemcpy(d_B, h_B, sizeof(float) * countB, cudaMemcpyHostToDevice));
 72 | 
 73 |     float alpha = 1;
 74 |     float beta = 0;
 75 | 
 76 |     cublasHandle_t handle;
 77 |     cublasStatus_t status = cublasCreate(&handle);
 78 |     if (status != CUBLAS_STATUS_SUCCESS) {
 79 |         fprintf(stderr, "Error: cublasCreate failed, status %d\n", (int)status);
 80 |         exit(1);
 81 |     }
 82 | 
 83 |     // Only the GEMM call itself sits between the two events.
 84 |     cudaEvent_t start, stop;
 85 |     CHECK(cudaEventCreate(&start));
 86 |     CHECK(cudaEventCreate(&stop));
 87 | 
 88 |     // C = A*B. cuBLAS is column-major, so the row-major product is obtained by
 89 |     // passing B first and A second with swapped dimensions (Bt*At), which
 90 |     // leaves C laid out row-major with leading dimension K.
 91 |     CHECK(cudaEventRecord(start, 0));
 92 |     status = cublasSgemm(handle,
 93 |         CUBLAS_OP_N,
 94 |         CUBLAS_OP_N,
 95 |         K,                    // columns of B
 96 |         M,                    // rows of A
 97 |         N,                    // columns of A
 98 |         &alpha,
 99 |         d_B,
100 |         K,
101 |         d_A,
102 |         N,
103 |         &beta,
104 |         d_C,
105 |         K);
106 |     if (status != CUBLAS_STATUS_SUCCESS) {
107 |         fprintf(stderr, "Error: cublasSgemm failed, status %d\n", (int)status);
108 |         exit(1);
109 |     }
110 |     CHECK(cudaGetLastError());
111 | 
112 |     CHECK(cudaEventRecord(stop, 0));
113 |     CHECK(cudaEventSynchronize(stop));
114 | 
115 |     float elapsed;
116 |     CHECK(cudaEventElapsedTime(&elapsed, start, stop));
117 |     elapsed /= 1000.0f;       // milliseconds -> seconds
118 | 
119 |     printf("time= %lf\n", elapsed);
120 | 
121 |     // 2*M*N*K operations per second; long long keeps the product 64-bit on every platform.
122 |     float teps = (2 * (long long) M * N * K) / elapsed;
123 |     cout << "TEPS = " << teps << endl;
124 | 
125 |     CHECK(cudaMemcpy(h_C, d_C, sizeof(float) * countC, cudaMemcpyDeviceToHost));
126 | 
127 |     // Print only the first result element.
128 |     cout << h_C[0] << "  ";
129 |     cout << endl;
130 | 
131 |     // Release everything created above, including the events and the handle.
132 |     CHECK(cudaEventDestroy(start));
133 |     CHECK(cudaEventDestroy(stop));
134 |     cublasDestroy(handle);
135 |     cudaFree(d_A);
136 |     cudaFree(d_B);
137 |     cudaFree(d_C);
138 |     free(h_A);
139 |     free(h_B);
140 |     free(h_C);
141 |     return 0;
142 | }


--------------------------------------------------------------------------------
/benchmark/cusparse/Makefile:
--------------------------------------------------------------------------------
 1 | # ----- Make Macros -----
 2 | 
 3 | CXX = mpicxx
 4 | CXXFLAGS = -std=c++11 -fopenmp
 5 | OPTFLAGS = -O3
 6 | 
 7 | # nvcc compiles and links; mpicxx is its host compiler (-ccbin).
 8 | NVCC = nvcc
 9 | NVCCFLAGS = -lineinfo -O3 -std=c++11 -gencode arch=compute_60,code=sm_60 -ccbin=mpicxx -Xcompiler -fopenmp
10 | 
11 | LD_FLAGS = -ccbin=mpicxx -Xcompiler -fopenmp -lcusparse_static
12 | 
13 | TARGETS = inference
14 | OBJECTS = main.o
15 | 
16 | # ----- Make Rules -----
17 | 
18 | # all/clean are actions, not files; keep them runnable even if such files exist.
19 | .PHONY: all clean
20 | 
21 | all:	$(TARGETS)
22 | 
23 | # Every object is rebuilt when vars.h changes.
24 | %.o : %.cu vars.h
25 | 	${NVCC} ${NVCCFLAGS} $< -c -o $@
26 | 
27 | inference: $(OBJECTS)
28 | 	$(NVCC) -o $@ $(OBJECTS) $(LD_FLAGS)
29 | 
30 | clean:
31 | 	rm -f $(TARGETS) *.o *.o.* *.txt *.bin core *.html *.xml


--------------------------------------------------------------------------------
/benchmark/cusparse/Makefile.volta:
--------------------------------------------------------------------------------
 1 | # ----- Make Macros -----
 2 | 
 3 | CXX = mpicxx
 4 | CXXFLAGS = -std=c++11 -fopenmp
 5 | OPTFLAGS = -O3
 6 | 
 7 | # nvcc compiles and links; mpicxx is its host compiler (-ccbin).
 8 | # sm_60 is kept for existing users; sm_70 is added because an sm_60-only
 9 | # binary carries no code a Volta (compute capability 7.0) device can load.
10 | NVCC = nvcc
11 | NVCCFLAGS = -lineinfo -O3 -std=c++11 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -ccbin=mpicxx -Xcompiler -fopenmp
12 | 
13 | LD_FLAGS = -ccbin=mpicxx -Xcompiler -fopenmp -lcusparse_static
14 | 
15 | TARGETS = inference
16 | OBJECTS = main.o
17 | 
18 | # ----- Make Rules -----
19 | 
20 | # all/clean are actions, not files; keep them runnable even if such files exist.
21 | .PHONY: all clean
22 | 
23 | all:	$(TARGETS)
24 | 
25 | # Every object is rebuilt when vars.h changes.
26 | %.o : %.cu vars.h
27 | 	${NVCC} ${NVCCFLAGS} $< -c -o $@
28 | 
29 | inference: $(OBJECTS)
30 | 	$(NVCC) -o $@ $(OBJECTS) $(LD_FLAGS)
31 | 
32 | clean:
33 | 	rm -f $(TARGETS) *.o *.o.* *.txt *.bin core *.html *.xml


--------------------------------------------------------------------------------
/benchmark/cusparse/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Rebuilds ./inference from Makefile.volta, then runs it once for every
 3 | # (neuron, layer) pair in the loops below. Run parameters are exported;
 4 | # presumably ./inference reads them from the environment.
 5 | cp -f Makefile.volta Makefile
 6 | # Abort on a failed build rather than running a stale or missing binary.
 7 | make clean; make -j || exit 1
 8 | 
 9 | echo "Starting Benchmark"
10 | date
11 | 
12 | # Dataset root; a DATASET already set in the environment takes precedence.
13 | #export DATASET=/home/vsm2/SpDNN_Challenge2020/iostream/dataset
14 | export DATASET=${DATASET:-//home/xinjie/xinjie/graph_challenge/data}
15 | #export DATASET=/home/vsm2/dataset
16 | 
17 | # Values set per run inside the loops:
18 | #   NEURON: 1024 4096 16384 65536
19 | #   BIAS:   -0.3 -0.35 -0.4 -0.45
20 | #   INPUT:  6374505 25019051 98858913 392191985
21 | #   LAYER:  120 480 1920
22 | export BATCH=60000
23 | 
24 | export BLOCKSIZE=256
25 | export BUFFER=24
26 | 
27 | export OMP_NUM_THREADS=16
28 | 
29 | 
30 | for neuron in 1024 4096 16384
31 | do
32 | 	for layer in 120 480 1920
33 | 	do
34 | 		# BIAS and INPUT depend only on the neuron count.
35 | 		case $neuron in
36 | 			1024)  export BIAS=-0.3;  export INPUT=6374505 ;;
37 | 			4096)  export BIAS=-0.35; export INPUT=25019051 ;;
38 | 			16384) export BIAS=-0.4;  export INPUT=98858913 ;;
39 | 			65536) export BIAS=-0.45; export INPUT=392191985 ;;
40 | 		esac
41 | 
42 | 		export NEURON=$neuron
43 | 		export LAYER=$layer
44 | 
45 | 		echo "$LAYER"
46 | 		echo "$NEURON"
47 | 		echo "$BIAS"
48 | 		echo "$INPUT"
49 | 		echo "$DATASET"
50 | 		./inference
51 | 
52 | 	done
53 | 
54 | done
55 | 
56 | 
57 | #for l in 120 480 1920
58 | #do
59 | #  export LAYER=$l
60 | #  jsrun -n1 -a1 -g1 -c7 -EOMP_NUM_THREADS=7 -r1 -bpacked:7 js_task_info ./inference
61 | #  jsrun -n1 -a3 -g3 -c21 -EOMP_NUM_THREADS=7 -r1 -bpacked:7 js_task_info ./inference
62 | #  jsrun -n1 -a6 -g6 -c42 -EOMP_NUM_THREADS=7 -r1 -bpacked:7 js_task_info ./inference
63 | #  jsrun -n2 -a6 -g6 -c42 -EOMP_NUM_THREADS=7 -r1 -bpacked:7 js_task_info ./inference
64 | #  jsrun -n4 -a6 -g6 -c42 -EOMP_NUM_THREADS=7 -r1 -bpacked:7 js_task_info ./inference
65 | #done
66 | #
67 | date


--------------------------------------------------------------------------------
/benchmark/cusparse/vars.h:
--------------------------------------------------------------------------------
 1 |   
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <algorithm>
 5 | #include <vector>
 6 | #include <cuda_runtime.h>
 7 | 
 8 | //using namespace std;
 9 | 
10 | void readweights();
11 | void readinput();
12 | 
13 | void setup_gpu();
14 | void final_gpu();
15 | void infer_gpu(int);
16 | 
17 | //#define BALANCE 30 //BALANCE LAYER 0 FOR EVERY LAYER COMMENT OUT FOR TURN OFF
18 | //#define OUTOFCORE //COMMENT THIS OUT IF YOU HAVE ENOUGH MEMORY
19 | //#define OVERLAP //WORKS ONLY WHEN OUTOFCORE IS ENABLED
20 | #define INDPREC int
21 | #define VALPREC float
22 | #define FEATPREC float
23 | 
24 | 
25 | 
26 | inline void checkCuda(cudaError_t result, const char *file, const int line, bool fatal=false) {
27 |   if (result != cudaSuccess) {
28 |     fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n",  file, line, int(result),
29 |             cudaGetErrorString(result));\
30 |     if (fatal) {
31 |         exit(EXIT_FAILURE);
32 |     }
33 |   }
34 | }
35 | // Non-fatal / fatal wrappers around checkCuda that record the call site; both expansions already end in ';'.
36 | #define OR_PRINT(stmt) checkCuda(stmt, __FILE__, __LINE__);
37 | #define OR_FATAL(stmt) checkCuda(stmt, __FILE__, __LINE__, true);
38 | // Evaluates x once; on any status other than CUSPARSE_STATUS_SUCCESS prints the status and line, then exits with -1.
39 | #define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}


--------------------------------------------------------------------------------
/benchmark/hpec/20-champions-1/run.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/benchmark/hpec/20-champions-1/run.sh


--------------------------------------------------------------------------------
/src/BF.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils/header.h"
 2 | #include "reorder/header.h"
 3 | #include "inspector/header.h"
 4 | #include "gpu_lib/header.h"
 5 | #include "microbenchmark/header.h"
 6 | #include "fuse/header.h"
 7 | 
 8 | #include <functional>
 9 | #include <algorithm>
10 | using namespace ftxj;
11 | 
12 | // Benchmark driver: reads one weight layer as COO, optionally applies a
13 | // hash reorder, builds a BFMatrix from it and runs test_benchmark_19_BF.
14 | //
15 | // Arguments: neuron batch layer hash_type TN blockx blocky
16 | //   hash_type: 0 = no reorder, 1 = ROW_REORDER, 2 = COL_REORDER, 3 = ALL_REORDER
17 | // Returns 1 when arguments are missing, 0 otherwise.
18 | int main(int argc, char* argv[]) {
19 | 
20 |     std::cout << "begin" << std::endl;
21 | 
22 |     // argv[1]..argv[7] are all read below; refuse to run without them.
23 |     if(argc < 8) {
24 |         std::cerr << "Usage: exe neuron batch layer hash_type TN blockx blocky" << std::endl;
25 |         return 1;
26 |     }
27 | 
28 |     int neuron = atoi(argv[1]);
29 |     int batch = atoi(argv[2]);
30 |     int l = atoi(argv[3]);          // layer number as it appears in the file name
31 |     int hash_type = atoi(argv[4]);
32 | 
33 | 
34 |     int TN = atoi(argv[5]);
35 |     int blockx = atoi(argv[6]);
36 |     int blocky = atoi(argv[7]);
37 | 
38 |     std::string file_name = "../data/neuron"+ 
39 |         std::to_string(neuron) + "/n" + std::to_string(neuron) +"-l" + std::to_string(l) + ".tsv";
40 | 
41 |     COOMatrix coo(file_name, 1, false);
42 |     // NOTE(review): coo_cpu is reordered alongside coo but never passed on; confirm whether it is still needed.
43 |     COOMatrix coo_cpu(file_name, 1, false);
44 |     std::cout << "read coo success" << std::endl;
45 | 
46 | 
47 |     // hash_type 0, or any value outside 1..3, leaves both matrices as read.
48 |     if(hash_type == 1) {
49 |         HashReorder hash_reorder_t(64, neuron, REORDER::ROW_REORDER);
50 |         coo.reorder(hash_reorder_t);
51 |         coo_cpu.reorder(hash_reorder_t);
52 |     }
53 |     else if(hash_type == 2) {
54 |         HashReorder hash_reorder_t(64, neuron, REORDER::COL_REORDER);
55 |         coo.reorder(hash_reorder_t);
56 |         coo_cpu.reorder(hash_reorder_t);
57 |     }
58 |     else if(hash_type == 3) {
59 |         HashReorder hash_reorder_t(64, neuron, REORDER::ALL_REORDER);
60 |         coo.reorder(hash_reorder_t);
61 |         coo_cpu.reorder(hash_reorder_t);
62 |     }
63 |     
64 |     std::cout << "reorder success" << std::endl;
65 |     BFMatrix bf(coo, neuron, TN);
66 |     std::cout << "BF success" << std::endl;
67 | 
68 |     GpuEnv env(0);
69 |     // test_benchmark_succ_load_store(batch, neuron, env);
70 |     // test_benchmark_matrix_transpose(batch, neuron, env); 
71 |     // test_benchmark_matrix_transpose_and_delete(batch, neuron, env);
72 |     // return 0;
73 | 
74 |     test_benchmark_19_BF(
75 |         coo,  bf, 
76 |         neuron, batch, TN, 
77 |         blockx, blocky,
78 |         env
79 |     );
80 |     return 0;
81 | }


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
1 | run_multi_gpu_big:run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o
2 | 	mpicxx run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o -L/usr/local/cuda/lib64 -lcudart -o run_multi_gpu_big
3 | run_multi_gpu_big_cu.o:./microbenchmark/multi_gpu/multi_gpu_big.cu
4 | 	nvcc -c ./microbenchmark/multi_gpu/multi_gpu_big.cu -o run_multi_gpu_big_cu.o
5 | run_multi_gpu_big_cpp.o:multi_gpu.cpp
6 | 	mpicxx -c multi_gpu.cpp -o run_multi_gpu_big_cpp.o
7 | clean:
8 | 	rm  -f  *.o


--------------------------------------------------------------------------------
/src/Makefile.multi:
--------------------------------------------------------------------------------
1 | run_multi_gpu:multi_gpu_cu.o multi_gpu_cpp.o
2 | 	mpicxx multi_gpu_cu.o multi_gpu_cpp.o -L/usr/local/cuda/lib64 -lcudart -o run_multi_gpu
3 | multi_gpu_cu.o:./microbenchmark/multi_gpu/multi_gpu.cu
4 | 	nvcc -c ./microbenchmark/multi_gpu/multi_gpu.cu -o multi_gpu_cu.o
5 | multi_gpu_cpp.o:multi_gpu.cpp
6 | 	mpicxx -c multi_gpu.cpp -o multi_gpu_cpp.o
7 | clean:
8 | 	rm  -f  *.o


--------------------------------------------------------------------------------
/src/Makefile.multi.big:
--------------------------------------------------------------------------------
1 | run_multi_gpu_big:run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o
2 | 	mpicxx run_multi_gpu_big_cu.o run_multi_gpu_big_cpp.o -L/usr/local/cuda/lib64 -lcudart -o run_multi_gpu_big
3 | run_multi_gpu_big_cu.o:./microbenchmark/multi_gpu/multi_gpu_big.cu
4 | 	nvcc -c ./microbenchmark/multi_gpu/multi_gpu_big.cu -o run_multi_gpu_big_cu.o
5 | run_multi_gpu_big_cpp.o:multi_gpu.cpp
6 | 	mpicxx -c multi_gpu.cpp -o run_multi_gpu_big_cpp.o
7 | clean:
8 | 	rm  -f  *.o


--------------------------------------------------------------------------------
/src/SNIG.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils/header.h"
  2 | #include "reorder/header.h"
  3 | #include "inspector/header.h"
  4 | #include "gpu_lib/header.h"
  5 | #include "microbenchmark/header.h"
  6 | #include "fuse/header.h"
  7 | #include <functional>
  8 | #include <cstdlib>
  9 | #include <iostream>
 10 | #include <fstream>
 11 | using namespace ftxj;
 12 | 
 13 | 
 14 | 
 15 | 
 16 | 
 17 | size_t get_sec_size(const size_t num_neurons) {
 18 |     // Picks the section size: the largest slice of num_neurons that divides
 19 |     // num_neurons evenly and, counted in float elements, fits in
 20 |     // sharedMemPerBlock. Only device 0 is queried, so the result holds for
 21 |     // identical GPUs only.
 22 |     cudaDeviceProp device_props;
 23 |     cudaGetDeviceProperties(&device_props, 0);
 24 |     const size_t capacity = device_props.sharedMemPerBlock / sizeof(float);
 25 | 
 26 |     // Everything fits in one section: no split needed.
 27 |     if(num_neurons <= capacity) {
 28 |         return num_neurons;
 29 |     }
 30 | 
 31 |     // Otherwise take the smallest divisor >= 2 whose quotient fits.
 32 |     int divisor = 2;
 33 |     for(;;) {
 34 |         const bool divides_evenly = (num_neurons % divisor == 0);
 35 |         if(divides_evenly && (num_neurons / divisor) <= capacity) {
 36 |             break;
 37 |         }
 38 |         ++divisor;
 39 |     }
 40 |     return num_neurons / divisor;
 41 | }
 42 | 
 43 | std::string get_weight_file_name(int neuron, int layer) {
 44 |     // `layer` is 0-based here; the file name carries layer + 1.
 45 |     const std::string neuron_tag = std::to_string(neuron);
 46 |     const std::string layer_tag = std::to_string(layer + 1);
 47 |     return "../data/neuron" + neuron_tag + "/n" + neuron_tag + "-l" + layer_tag + ".tsv";
 48 | }
 49 | // Fills input[sample][neuron] from ../data/sparse-images-<neuron>.tsv; exits with -1 if the file cannot be opened.
 50 | void read_input(std::vector<std::vector<float>> &input, int neuron, int batch) {
 51 |     std::string input_file_name = "../data/sparse-images-";
 52 |     input_file_name += std::to_string(neuron) + ".tsv";
 53 |     std::ifstream input_file(input_file_name);
 54 |     if(!input_file){
 55 |         std::cout << "FILE:" << input_file_name << " does not exists.\n";
 56 |         exit(-1);
 57 |     }
 58 |     int b, n;      // 1-based sample and neuron indices as stored in the file
 59 |     float val;
 60 |     long read_num = 0;   // entries actually stored
 61 |     while(input_file >> b >> n >> val) {
 62 |         if(b < 1 || b > batch || (size_t)b > input.size()) continue;   // sample index outside the batch
 63 |         if(n < 1 || (size_t)n > input[b - 1].size()) continue;         // neuron index outside the row
 64 |         read_num++;
 65 |         input[b - 1][n - 1] = val;
 66 |         if(val != 1.00) {
 67 |             printf("read input %d, %f\n", b, val);
 68 |         }
 69 |     }
 70 |     std::cout << "Read Input success! read_numeber = " << read_num << std::endl;
 71 | }
 72 | 
 73 | // SNIG benchmark driver: reads the input batch and `layer` weight files,
 74 | // then hands everything to test_benchmark_SNIG.
 75 | //
 76 | // Arguments: neuron batch layer nnzs
 77 | // Returns 0 after a run or after printing usage, 1 on an unusable argument.
 78 | int main(int argc, char* argv[]) {
 79 | 
 80 |     if(argc != 5) {
 81 |         std::cout << "Usage: exe neuron batch layer nnzs" << std::endl;
 82 |         return 0;
 83 |     }
 84 |     int neuron = atoi(argv[1]);
 85 |     int batch = atoi(argv[2]);
 86 |     int layer = atoi(argv[3]);
 87 |     int nnzs = atoi(argv[4]);
 88 | 
 89 |     std::map<int, float> bias_map = {
 90 |         {65536, -0.45},
 91 |         {16384, -0.4},
 92 |         {4096, -0.35},
 93 |         {1024, -0.3}
 94 |     };
 95 | 
 96 |     // operator[] would silently insert a bias of 0 for an unknown size, so look it up explicitly.
 97 |     const auto bias_entry = bias_map.find(neuron);
 98 |     if(bias_entry == bias_map.end()) {
 99 |         std::cout << "Unsupported neuron count: " << neuron << std::endl;
100 |         return 1;
101 |     }
102 |     // Both values size containers below; a non-positive batch would request a huge allocation.
103 |     if(batch <= 0 || layer < 0) {
104 |         std::cout << "batch must be positive and layer non-negative" << std::endl;
105 |         return 1;
106 |     }
107 | 
108 |     int sec_size = get_sec_size(neuron);
109 |     std::cout << "[Config] sec size = " << sec_size << std::endl;
110 | 
111 |     std::vector<std::vector<float>> input(batch, std::vector<float>(neuron));
112 |     std::cout << "[BEGIN]..." << std::endl;
113 |     read_input(input, neuron, batch);
114 |     std::cout << "Read Input success!" << std::endl;
115 |     std::vector<SNIGMatrix> weights;
116 |     weights.reserve(layer);
117 |     
118 |     for(int l = 0; l < layer; ++l) {
119 |         auto weight_file = get_weight_file_name(neuron, l);
120 |         // Built in place to avoid copying each matrix into the vector.
121 |         weights.emplace_back(weight_file, 32 * neuron, sec_size, neuron);
122 |         std::cout << "["<< weight_file << "] to SNIG Matrix success!" << std::endl;
123 |     }
124 | 
125 |     GpuEnv env(0);
126 |     test_benchmark_SNIG(input, weights, batch, neuron, sec_size, nnzs, bias_entry->second, env);
127 |     
128 |     std::cout << "[END]..." << std::endl;
129 |     return 0;
130 | }


--------------------------------------------------------------------------------
/src/cost.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils/header.h"
 2 | #include "reorder/header.h"
 3 | #include "inspector/header.h"
 4 | #include "fuse/header.h"
 5 | #include <functional>
 6 | #include <cstdlib>
 7 | #include <iostream>
 8 | #include <fstream>
 9 | 
10 | #include<time.h>
11 | using namespace ftxj;
12 | 
13 | 
14 | std::string get_weight_file_name(int neuron, int layer) {
15 |     // Path of the weight file for 0-based `layer`; names on disk use layer + 1.
16 |     const std::string size_tag = std::to_string(neuron);
17 |     const std::string file_part = "/n" + size_tag + "-l" + std::to_string(layer + 1) + ".tsv";
18 |     return "../data/neuron" + size_tag + file_part;
19 | }
20 | 
21 | // Runs COOMatrix::cost_analysis on every reordered weight layer and prints
22 | // the processor time (clock()) spent inside those calls.
23 | //
24 | // Arguments: neuron layer TB1 TN1 TB2 TN2
25 | // Returns 0 after a run or after printing usage, 1 on an unsupported neuron count.
26 | int main(int argc, char* argv[]) {
27 | 
28 |     if(argc != 7) {
29 |         std::cout << "Usage: exe neuron layer TB1 TN1 TB2 TN2" << std::endl;
30 |         return 0;
31 |     }
32 |     int neuron = atoi(argv[1]);
33 |     int layer = atoi(argv[2]);
34 | 
35 | 
36 |     int TB1 = atoi(argv[3]);
37 |     int TN1 = atoi(argv[4]);
38 |     int TB2 = atoi(argv[5]);
39 |     int TN2 = atoi(argv[6]);
40 | 
41 | 
42 |     // First HashReorder argument for each supported neuron count.
43 |     std::map<int, int> hash_map = {
44 |         {65536, 4096},
45 |         {16384, 1024},
46 |         {4096, 256},
47 |         {1024, 64}
48 |     };
49 | 
50 |     // operator[] would silently insert 0 for an unknown size, so look it up explicitly.
51 |     const auto hash_entry = hash_map.find(neuron);
52 |     if(hash_entry == hash_map.end()) {
53 |         std::cout << "Unsupported neuron count: " << neuron << std::endl;
54 |         return 1;
55 |     }
56 |     HashReorder hash_reorder_t(hash_entry->second, neuron);
57 | 
58 |     std::cout << "[BEGIN]..." << std::endl;
59 | 
60 | 
61 |     clock_t total = 0;   // accumulated only around cost_analysis, not file reading or reordering
62 | 
63 |     for(int l = 0; l < layer; ++l) {
64 |         auto weight_file = get_weight_file_name(neuron, l);
65 |         COOMatrix coo(weight_file, 1, false);
66 |         std::cout << "["<< weight_file << "] to COO success!" << std::endl;
67 |         coo.reorder(hash_reorder_t);
68 |         std::cout << "Reorder success!" << std::endl;
69 | 
70 | 
71 |         clock_t startTime,endTime;
72 |         startTime = clock();
73 |         coo.cost_analysis(TB1, TN1, TB2, TN2);
74 |         endTime = clock();
75 |         total += endTime - startTime;
76 |     }
77 | 
78 |     std::cout << "time = " << (double)(total) / CLOCKS_PER_SEC << std::endl;
79 |     return 0;
80 | }


--------------------------------------------------------------------------------
/src/cuSparse.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils/header.h"
 2 | #include "reorder/header.h"
 3 | #include "inspector/header.h"
 4 | #include "gpu_lib/header.h"
 5 | #include "microbenchmark/header.h"
 6 | #include "fuse/header.h"
 7 | 
 8 | #include <functional>
 9 | #include <algorithm>
10 | using namespace ftxj;
11 | 
12 | // cuSPARSE benchmark driver: reads one weight layer as COO, optionally
13 | // applies a hash reorder, builds a cuSPARSEMatrix and runs test_benchmark_cusparse.
14 | //
15 | // Arguments: neuron batch layer hash_type
16 | //   hash_type: 0 = no reorder, 1 = ROW_REORDER, 2 = COL_REORDER, 3 = ALL_REORDER
17 | // Returns 1 when arguments are missing, 0 otherwise.
18 | int main(int argc, char* argv[]) {
19 | 
20 |     std::cout << "begin" << std::endl;
21 | 
22 |     // argv[1]..argv[4] are all read below; refuse to run without them.
23 |     if(argc < 5) {
24 |         std::cerr << "Usage: exe neuron batch layer hash_type" << std::endl;
25 |         return 1;
26 |     }
27 | 
28 |     int neuron = atoi(argv[1]);
29 |     int batch = atoi(argv[2]);
30 |     int l = atoi(argv[3]);          // layer number as it appears in the file name
31 |     int hash_type = atoi(argv[4]);
32 | 
33 |     std::string file_name = "../data/neuron"+ 
34 |         std::to_string(neuron) + "/n" + std::to_string(neuron) +"-l" + std::to_string(l) + ".tsv";
35 | 
36 |     COOMatrix coo(file_name, 1, false);
37 |     std::cout << "read coo success" << std::endl;
38 | 
39 | 
40 |     // hash_type 0, or any value outside 1..3, leaves the matrix as read.
41 |     if(hash_type == 1) {
42 |         HashReorder hash_reorder_t(64, neuron, REORDER::ROW_REORDER);
43 |         coo.reorder(hash_reorder_t);
44 |     }
45 |     else if(hash_type == 2) {
46 |         HashReorder hash_reorder_t(64, neuron, REORDER::COL_REORDER);
47 |         coo.reorder(hash_reorder_t);
48 |     }
49 |     else if(hash_type == 3) {
50 |         HashReorder hash_reorder_t(64, neuron, REORDER::ALL_REORDER);
51 |         coo.reorder(hash_reorder_t);
52 |     }
53 |     
54 |     std::cout << "reorder success" << std::endl;
55 |     cuSPARSEMatrix cusm(coo, neuron);
56 |     // NOTE(review): the message still says "BF" although a cuSPARSEMatrix was just built; text left unchanged.
57 |     std::cout << "BF success" << std::endl;
58 | 
59 |     GpuEnv env(0);
60 |     
61 |     test_benchmark_cusparse(coo, 
62 |         cusm, 
63 |         neuron, batch);
64 | 
65 |     return 0;
66 | }


--------------------------------------------------------------------------------
/src/fuse.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils/header.h"
  2 | #include "reorder/header.h"
  3 | #include "inspector/header.h"
  4 | #include "gpu_lib/header.h"
  5 | #include "microbenchmark/header.h"
  6 | #include "fuse/header.h"
  7 | #include <functional>
  8 | #include <cstdlib>
  9 | #include <iostream>
 10 | #include <fstream>
 11 | using namespace ftxj;
 12 | 
 13 | 
 14 | /**
 15 |  * Builds the relative path of one layer's weight file:
 16 |  * "../data/neuron<neuron>/n<neuron>-l<layer + 1>.tsv".
 17 |  *
 18 |  * The number embedded in the file name is `layer + 1`, so callers pass a
 19 |  * zero-based layer index.
 20 |  */
 21 | std::string get_weight_file_name(int neuron, int layer) {
 22 |     const std::string width = std::to_string(neuron);
 23 |     const std::string layer_number = std::to_string(layer + 1);
 24 |     return "../data/neuron" + width + "/n" + width + "-l" + layer_number + ".tsv";
 25 | }
 20 | 
 21 | /**
 22 |  * Permutes the columns of every row of `input` in place: the value that was
 23 |  * at column j ends up at column reorder_class.reorder(j).
 24 |  *
 25 |  * A full copy of `input` is taken first so that reads always see the
 26 |  * pre-permutation values.
 27 |  */
 28 | void dense_reorder(std::vector<std::vector<float>> &input, Reorder &reorder_class) {
 29 |     const std::vector<std::vector<float>> snapshot = input;
 30 |     for(int r = 0; r < snapshot.size(); ++r) {
 31 |         const std::vector<float> &source_row = snapshot[r];
 32 |         std::vector<float> &target_row = input[r];
 33 |         for(int c = 0; c < source_row.size(); ++c) {
 34 |             target_row[reorder_class.reorder(c)] = source_row[c];
 35 |         }
 36 |     }
 37 | }
 30 | 
 31 | /**
 32 |  * Fills `input` from "../data/sparse-images-<neuron>.tsv".
 33 |  *
 34 |  * Each record of the file is read as `b n val` (two integers and a float)
 35 |  * and stored at input[b - 1][n - 1], i.e. both indices are 1-based in the
 36 |  * file. Records whose indices fall outside `input` or whose b exceeds
 37 |  * `batch` are skipped instead of being written out of bounds.
 38 |  *
 39 |  * If the file cannot be opened a message is printed and the process exits
 40 |  * with status -1.
 41 |  */
 42 | void read_input(std::vector<std::vector<float>> &input, int neuron, int batch) {
 43 |     std::string input_file_name = "../data/sparse-images-";
 44 |     input_file_name += std::to_string(neuron) + ".tsv";
 45 |     std::ifstream input_file(input_file_name);
 46 |     if(!input_file){
 47 |         std::cout << "FILE:" << input_file_name << " does not exists.\n";
 48 |         exit(-1);
 49 |     }
 50 |     int b, n;
 51 |     float val;
 52 |     while(input_file >> b >> n >> val) {
 53 |         // Row index: must be a valid 1-based index within both the requested
 54 |         // batch and the rows actually allocated by the caller.
 55 |         if(b < 1 || b > batch || static_cast<std::size_t>(b) > input.size()) {
 56 |             continue;
 57 |         }
 58 |         std::vector<float> &row = input[b - 1];
 59 |         // Column index: vector::operator[] does no bounds checking, so guard it here.
 60 |         if(n < 1 || static_cast<std::size_t>(n) > row.size()) {
 61 |             continue;
 62 |         }
 63 |         row[n - 1] = val;
 64 |     }
 65 | }
 47 | 
 48 | /**
 49 |  * Entry point of the fused-layer benchmark driver.
 50 |  *
 51 |  * Usage: exe neuron batch layer
 52 |  *   neuron - must be one of the sizes listed in hash_map / bias_map below
 53 |  *   batch  - number of input rows to load (must be positive)
 54 |  *   layer  - number of weight layers to load, starting with file "-l1"
 55 |  *
 56 |  * Returns 0 after the benchmark ran and 1 on invalid arguments.
 57 |  */
 58 | int main(int argc, char* argv[]) {
 59 | 
 60 |     if(argc != 4) {
 61 |         std::cout << "Usage: exe neuron batch layer" << std::endl;
 62 |         return 1;
 63 |     }
 64 |     int neuron = atoi(argv[1]);
 65 |     int batch = atoi(argv[2]);
 66 |     int layer = atoi(argv[3]);
 67 | 
 68 |     std::map<int, int> hash_map = {
 69 |         {65536, 4096},
 70 |         {16384, 1024},
 71 |         {4096, 256},
 72 |         {1024, 64}
 73 |     };
 74 | 
 75 |     std::map<int, float> bias_map = {
 76 |         {65536, -0.45},
 77 |         {16384, -0.4},
 78 |         {4096, -0.35},
 79 |         {1024, -0.3}
 80 |     };
 81 | 
 82 |     // operator[] would silently insert 0 for an unknown size and feed it to
 83 |     // HashReorder / the kernel, so look the entries up explicitly.
 84 |     auto hash_entry = hash_map.find(neuron);
 85 |     auto bias_entry = bias_map.find(neuron);
 86 |     if(hash_entry == hash_map.end() || bias_entry == bias_map.end()) {
 87 |         std::cout << "Unsupported neuron count " << neuron
 88 |                   << " (expected 1024, 4096, 16384 or 65536)" << std::endl;
 89 |         return 1;
 90 |     }
 91 |     // A non-positive batch would be converted to a huge or zero vector size below.
 92 |     if(batch <= 0) {
 93 |         std::cout << "batch must be positive, got " << batch << std::endl;
 94 |         return 1;
 95 |     }
 96 | 
 97 |     std::vector<std::vector<float>> input(batch, std::vector<float>(neuron));
 98 |     std::vector<std::vector<float>> weight; 
 99 |     std::vector<std::vector<int>> row_access; 
100 | 
101 |     std::cout << "[BEGIN]..." << std::endl;
102 |     read_input(input, neuron, batch);
103 |     std::cout << "Read Input success!" << std::endl;
104 |     HashReorder hash_reorder_t(hash_entry->second, neuron);
105 |     dense_reorder(input, hash_reorder_t);
106 | 
107 |     std::vector<COOMatrix> coo_vec; 
108 | 
109 | 
110 |     // Per layer: load, reorder, convert, block, schedule, and keep the
111 |     // scheduled values / row accesses for the fused kernel.
112 |     for(int l = 0; l < layer; ++l) {
113 |         auto weight_file = get_weight_file_name(neuron, l);
114 |         COOMatrix coo(weight_file, 1, false);
115 |         std::cout << "["<< weight_file << "] to COO success!" << std::endl;
116 |         coo.reorder(hash_reorder_t);
117 |         coo_vec.push_back(coo);
118 |         std::cout << "Reorder success!" << std::endl;
119 |         CSRCSCMatrix csr_csc(coo);
120 |         csr_csc.transpose();
121 |         BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method);
122 |         std::cout << "Structural Info success!" << std::endl;
123 |         MaxInReuseBSchedule schedule(blocks);
124 |         schedule.schedule_output_parallel(128, 1, false);
125 |         std::cout << "Schedule succ" << std::endl;
126 |         auto data = schedule.get_data(neuron);
127 |         weight.push_back(data.value);
128 |         row_access.push_back(data.row_access);
129 |     }
130 |     // NOTE(review): the GPU id is hard-coded to 3 - confirm for the target machine.
131 |     GpuEnv env(3);
132 |     test_benchmark_fused_layer1024_0_1(input, coo_vec, weight, row_access, batch, neuron, bias_entry->second, env);
133 |     // test_benchmark_fuse_cmp_layer1024_0_1(input, weight, row_access, batch, neuron, bias_map[neuron], env);
134 |     
135 |     std::cout << "[END]..." << std::endl;
136 |     return 0;
137 | }


--------------------------------------------------------------------------------
/src/fuse/fuse.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "../utils/header.h"
 4 | #include <vector>
 5 | #include <algorithm>
 6 | 
 7 | namespace ftxj {
 8 |     /**
 9 |      * Tracks, per column block, which input rows are needed once one or
10 |      * more sparse matrices have been chained ("fused") in front of it.
11 |      *
12 |      * input_need_access_[b] holds the row indices required by block b.
13 |      */
14 |     class FuseLayer {
15 |         int fuse_numbers_;
16 |         std::vector<std::set<int>> input_need_access_;
17 |         std::vector<COOMatrix> fused_matrix_;
18 | 
19 |         // Adds to `rows` the row index of every entry of `matrix` whose
20 |         // column equals `col` (linear scan over all entries).
21 |         static void collect_rows_of_column(COOMatrix &matrix, int col, std::set<int> &rows) {
22 |             for(auto entry = matrix.begin(); entry != matrix.end(); ++entry) {
23 |                 if((*entry).col == col) {
24 |                     rows.insert((*entry).row);
25 |                 }
26 |             }
27 |         }
28 | 
29 |     public:
30 |         // block_cols[b] lists the columns of block b; the rows of
31 |         // `outer_matrix` that have an entry in any of those columns
32 |         // become the initial access set of block b.
33 |         FuseLayer(COOMatrix outer_matrix, std::vector<std::vector<int>> block_cols) {
34 |             fuse_numbers_ = 1;
35 |             input_need_access_ = std::vector<std::set<int>>(block_cols.size(), std::set<int>());
36 |             for(int b = 0; b < block_cols.size(); ++b) {
37 |                 for(int col : block_cols[b]) {
38 |                     collect_rows_of_column(outer_matrix, col, input_need_access_[b]);
39 |                 }
40 |             }
41 |         }
42 | 
43 |         // Prints one line per block: its index, the size of its access
44 |         // set and the comma-terminated row indices.
45 |         void print_need_access() {
46 |             int b = 0;
47 |             for(const auto &rows : input_need_access_) {
48 |                 std::cout << "block b = " << b << ",size = "<< rows.size() <<" : ";
49 |                 for(auto row : rows) {
50 |                     std::cout << row << ",";
51 |                 }
52 |                 std::cout << std::endl;
53 |                 ++b;
54 |             }
55 |         }
56 | 
57 |         // Chains `outer_matrix` in front of the current access sets: each
58 |         // row currently needed is treated as a column of `outer_matrix`,
59 |         // and the rows with an entry in that column replace it.
60 |         void fuse(COOMatrix outer_matrix) {
61 |             fuse_numbers_ += 1;
62 |             fused_matrix_.push_back(outer_matrix);
63 |             std::vector<std::set<int>> next_access(input_need_access_.size(), std::set<int>());
64 |             for(int b = 0; b < input_need_access_.size(); ++b) {
65 |                 for(auto needed : input_need_access_[b]) {
66 |                     collect_rows_of_column(outer_matrix, needed, next_access[b]);
67 |                 }
68 |             }
69 |             input_need_access_ = next_access;
70 |         }
71 |     };
72 | }


--------------------------------------------------------------------------------
/src/fuse/header.h:
--------------------------------------------------------------------------------
1 | #include "fuse.h"
2 | 


--------------------------------------------------------------------------------
/src/gpu_lib/gpu_env.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <iostream>
 4 | #include <cuda.h>
 5 | #include <stdio.h>
 6 | #include "gpu_runtime.h"
 7 | #include <string>
 8 | #include <map>
 9 | 
10 | namespace ftxj {
11 |     /**
12 |      * Selects CUDA devices and owns a set of named timing slots. Each slot
13 |      * is one stream plus a start/stop event pair created by add_event().
14 |      *
15 |      * NOTE(review): nothing in this class destroys the streams/events it
16 |      * creates; confirm whether that is intentional before adding cleanup.
17 |      */
18 |     class GpuEnv {
19 |         std::vector<cudaStream_t> streams;
20 |         std::vector<cudaEvent_t> start_event;
21 |         std::vector<cudaEvent_t> stop_event;
22 |         std::vector<std::string> event_name;
23 |         std::map<std::string, int> event_map; // slot name -> index into the vectors above
24 |         
25 |         // Resolves a slot name to its index. An unregistered name falls back
26 |         // to slot 0, matching the historic event_map[name] lookup, but only
27 |         // when at least one slot exists; with no slots at all that lookup
28 |         // indexed empty vectors, so this case aborts with a message instead.
29 |         int slot_of(const std::string &name) const {
30 |             auto found = event_map.find(name);
31 |             if(found != event_map.end()) {
32 |                 return found->second;
33 |             }
34 |             if(streams.empty()) {
35 |                 std::cerr << "GpuEnv: event \"" << name
36 |                           << "\" used before any add_event() call" << std::endl;
37 |                 exit(EXIT_FAILURE);
38 |             }
39 |             return 0;
40 |         }
41 | 
42 |     public:
43 |         // Sets up a single device, see set_up().
44 |         GpuEnv(int gpu_id, bool print_device_info = true) {
45 |             set_up(gpu_id, print_device_info);
46 |         }
47 | 
48 |         // Runs set_up() for every id of `gpu_id`, in order.
49 |         GpuEnv(std::vector<int> gpu_id, bool print_device_info = true) {
50 |             for(int id : gpu_id) {
51 |                 set_up(id, print_device_info);
52 |             }
53 |         }
54 | 
55 | 
56 |         // Calls cudaSetDevice(gpu_id). With print_device_info set, the device
57 |         // count and properties are queried as well (each through Safe_Call, so
58 |         // a failing call exits); the printing itself is currently commented out.
59 |         void set_up(int gpu_id, bool print_device_info = true) {
60 |             Safe_Call(cudaSetDevice(gpu_id));
61 |             if(print_device_info) {
62 |                 int deviceCount;
63 |                 Safe_Call(cudaGetDeviceCount(&deviceCount));
64 |                 // printf("\n");
65 |                 // printf("Device Count: %d\n",deviceCount);
66 |                 int dev = gpu_id;
67 |                 
68 |                 cudaDeviceProp deviceProp;
69 |                 Safe_Call(cudaGetDeviceProperties(&deviceProp, dev));
70 |                 // printf("Device %d name: %s\n",dev,deviceProp.name);
71 |                 // printf("Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor);
72 |                 // printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem);
73 |                 // printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem);
74 |                 // printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock);
75 |                 // printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
76 |                 // printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
77 |                 // printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock);
78 |                 // printf("Warp size: %d\n",deviceProp.warpSize);
79 |                 // printf("\n");
80 |             }
81 |         }
82 | 
83 |         // Creates a start event, a stop event and a stream, then registers
84 |         // them as slot `name`. The handles are created before being stored,
85 |         // so the vectors never hold uninitialised handles. Re-using a name
86 |         // repoints it at the newly created slot.
87 |         void add_event(std::string name = "non") {
88 |             cudaStream_t stream;
89 |             cudaEvent_t start, stop;
90 | 
91 |             Safe_Call(cudaEventCreate(&start));
92 |             Safe_Call(cudaEventCreate(&stop));
93 |             Safe_Call(cudaStreamCreate(&stream));
94 | 
95 |             streams.push_back(stream);
96 |             start_event.push_back(start);
97 |             stop_event.push_back(stop);
98 |             event_name.push_back(name);
99 | 
100 |             event_map[name] = streams.size() - 1;
101 |         }
102 | 
103 |         // Records the slot's start event on the slot's stream.
104 |         void event_start_record(std::string name = "non") {
105 |             const int slot = slot_of(name);
106 |             Safe_Call(cudaEventRecord(start_event[slot], streams[slot]));
107 |         }
108 | 
109 |         // Records the slot's stop event on the slot's stream.
110 |         void event_stop_record(std::string name = "non") {
111 |             const int slot = slot_of(name);
112 |             Safe_Call(cudaEventRecord(stop_event[slot], streams[slot]));
113 |         }
114 | 
115 |         // Synchronizes the slot's stream, then returns the value reported by
116 |         // cudaEventElapsedTime for the slot's start/stop events.
117 |         float get_event_time(std::string name = "non") {
118 |             const int slot = slot_of(name);
119 |             float res = 0.0;
120 |             Safe_Call(cudaStreamSynchronize(streams[slot]));
121 |             Safe_Call(cudaEventElapsedTime(&res, start_event[slot], stop_event[slot]));
122 |             return res;
123 |         }
124 |         
125 |         // Returns the stream belonging to slot `name`.
126 |         cudaStream_t get_stream(std::string name = "non") {
127 |             return streams[slot_of(name)];
128 |         }
129 | 
130 |     };
131 | };


--------------------------------------------------------------------------------
/src/gpu_lib/gpu_runtime.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdio>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | /**
 7 |  * Reports a failed CUDA runtime call.
 8 |  *
 9 |  * When `result` is not cudaSuccess, prints "<file>:<line>: CUDA Runtime
10 |  * Error <code>: <message>" to stderr; if `fatal` is true the process then
11 |  * exits with EXIT_FAILURE. Does nothing on success.
12 |  */
13 | inline void checkCuda(cudaError_t result, const char *file, const int line, bool fatal=false) {
14 |   if (result != cudaSuccess) {
15 |     // (A stray line-continuation backslash used to follow this statement.)
16 |     fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n",  file, line, int(result),
17 |             cudaGetErrorString(result));
18 |     if (fatal) {
19 |         exit(EXIT_FAILURE);
20 |     }
21 |   }
22 | }
23 | 
24 | // Safe_Call_Print: report a failing call and keep running.
25 | #define Safe_Call_Print(stmt) checkCuda(stmt, __FILE__, __LINE__)
26 | // Safe_Call: report a failing call and terminate the process.
27 | #define Safe_Call(stmt) checkCuda(stmt, __FILE__, __LINE__, true)
18 | 


--------------------------------------------------------------------------------
/src/gpu_lib/header.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "gpu_runtime.h"
4 | #include "gpu_env.h"


--------------------------------------------------------------------------------
/src/inspector/code_gen.cpp:
--------------------------------------------------------------------------------
  1 | #include "code_gen_basic.h"
  2 | 
  3 | 
  4 | /**
  5 |  * Builds the declaration objects for a GPU kernel named "dummy_kernel"
  6 |  * (13 parameters) and calls emit_statement() on each piece in order: the
  7 |  * kernel header, the `shared` array declaration, the `wind` initialisation,
  8 |  * a for-loop over `buff`, and the two closing scopes.
  9 |  *
 10 |  * The function header previously lacked its (empty) parameter list, which
 11 |  * is a syntax error; nothing in the body reads an argument.
 12 |  *
 13 |  * NOTE(review): work in progress - the loop body is empty, and all helper
 14 |  * types (VariableDecl, VaribaleInit, ForLoopScope, ...) are expected from
 15 |  * code_gen_basic.h, which is not visible here.
 16 |  */
 17 | void generate_20champion_code() {
 18 |     // Kernel parameters, in signature order. The vector stores pointers to
 19 |     // the locals below, so it must not outlive this function.
 20 |     std::vector<VariableDecl*> param_list_;
 21 | 
 22 |     VariableDecl nextfeat(f32, "nextfeat", Global, true);
 23 |     param_list_.push_back(&nextfeat);
 24 | 
 25 |     VariableDecl currfeat(f32, "currfeat", Global, true);
 26 |     param_list_.push_back(&currfeat);
 27 | 
 28 |     VariableDecl buffsize(i32, "buffsize", Global, false);
 29 |     param_list_.push_back(&buffsize);
 30 |  
 31 |     VariableDecl buffdispl(i32, "buffdispl", Global, true);
 32 |     param_list_.push_back(&buffdispl);
 33 | 
 34 |     VariableDecl mapdispl(i32, "mapdispl", Global, true);
 35 |     param_list_.push_back(&mapdispl);
 36 | 
 37 |     VariableDecl map(i16, "map", Global, true);
 38 |     param_list_.push_back(&map);
 39 | 
 40 |      
 41 |     VariableDecl displ(i32, "displ", Global, true);
 42 |     param_list_.push_back(&displ);
 43 | 
 44 |     VariableDecl index(i16, "index", Global, true);
 45 |     param_list_.push_back(&index);
 46 | 
 47 |     VariableDecl value(f32, "value", Global, true);
 48 |     param_list_.push_back(&value);
 49 | 
 50 |     VariableDecl bias(f32, "bias", Global, false);
 51 |     param_list_.push_back(&bias);
 52 | 
 53 |     VariableDecl neuron(i32, "neuron", Global, false);
 54 |     param_list_.push_back(&neuron);
 55 | 
 56 |     VariableDecl categories(i32, "categories", Global, true);
 57 |     param_list_.push_back(&categories);
 58 | 
 59 |     VariableDecl active(i32, "active", Global, true);
 60 |     param_list_.push_back(&active);
 61 | 
 62 |     // Kernel header.
 63 |     GpuGlobalFunction dummy_kernel("dummy_kernel", param_list_, 1024, 1);
 64 |     dummy_kernel.emit_statement();
 65 | 
 66 |     // `shared`, marked extern and declared as an array with an empty dimension list.
 67 |     VariableDecl shared(f32, "shared", Shared, false);
 68 |     shared.set_extern();
 69 | 
 70 |     VariableArrayDecl shared_array(shared, {});
 71 |     shared_array.emit_statement();
 72 | 
 73 |     // wind initialised with threadIdx.x % WARPSIZE.
 74 |     VariableDecl wind(i32, "wind", Reg, false);
 75 |     ConstantVar WARPSIZE("WARPSIZE");
 76 |     ConstantVar ThreadIdx_x("threadIdx.x");
 77 |     Operation tmp = ThreadIdx_x % WARPSIZE;    
 78 |     VaribaleInit wind_init_statement(wind, tmp);
 79 |     wind_init_statement.emit_statement();
 80 | 
 81 | 
 82 |     // Loop over `buff`, built from buffdispl[threadIdx.x], buffdispl[threadIdx.x + 1] and the constant 1.
 83 |     ArrayAccess line95_1 = buffdispl[ThreadIdx_x];
 84 |     ArrayAccess line95_2 = buffdispl[ThreadIdx_x + 1];
 85 |     ConstantVar ConstOne(1);
 86 |     VariableDecl iter_var_1(i32, "buff", Global, false);
 87 |     Variable iter_var(iter_var_1);
 88 |     ForLoopScope forloop_1(line95_1, line95_2, ConstOne, iter_var);
 89 |     forloop_1.emit_statement();
 90 | 
 91 | 
 92 | 
 93 |     // Close the for-loop scope.
 94 |     ScopeEnd forloop_1_end;
 95 |     forloop_1_end.emit_statement();
 96 | 
 97 | 
 98 |     // Close the kernel scope.
 99 |     ScopeEnd dummy_kernel_end;
100 |     dummy_kernel_end.emit_statement();
101 | }
 81 | 
 82 | 
 83 | /**
 84 |  * Emits the initialisation of an 8-element `output_tile` array and then,
 85 |  * for every block index, a block scope that visits each (block, thread)
 86 |  * entry of `block_schedule`.
 87 |  *
 88 |  * The per-block-type code generation is not written yet: both branches
 89 |  * below are empty placeholders. The second branch was spelled `else if()`
 90 |  * with no condition, which is a syntax error; it is a plain `else` until
 91 |  * the real condition is known.
 92 |  *
 93 |  * NOTE(review): `blockSize` and `threadSize` are not declared in this
 94 |  * function - presumably they are meant to come from code_gen_basic.h or
 95 |  * from the schedule; confirm before enabling this code.
 96 |  */
 97 | void generate_ramdom_block_code(Schedule &block_schedule) {
 98 |     VariableArrayDecl output_tile(f32, "output_tile", Reg, false, {8});
 99 |     ConstantVar floatZero(0.0);
100 | 
101 |     VariableArrayInit output_tile_init(&output_tile, floatZero);
102 |     output_tile_init.emit_statement();
103 |     
104 |     // NOTE(review): dense_tile_init is constructed but never emitted.
105 |     VariableDecl dense_tile(f32, "dense_value", Reg, false);
106 |     VaribaleInit dense_tile_init(dense_tile, floatZero);
107 | 
108 |     for(int b = 0; b < blockSize; ++b) {
109 |         BlockScope block_scope(b);
110 |         block_scope.emit_statement();
111 |         for(int t = 0; t < threadSize; ++t) {
112 |             MatrixBlockBase* base_block = block_schedule.get_block(b, t);
113 |             if(base_block->get_block_type() == "Random") {
114 |                 // TODO: code generation for "Random" blocks.
115 |             }
116 |             else {
117 |                 // TODO: code generation for the remaining block types.
118 |             }
119 |         }
120 |         ScopeEnd block_scope_end;
121 |         block_scope_end.emit_statement();
122 |     }
123 | 
124 | }


--------------------------------------------------------------------------------
/src/inspector/cost_model.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/inspector/cost_model.h


--------------------------------------------------------------------------------
/src/inspector/data_inspector.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | 
 4 | namespace ftxj {
 5 |     using namespace std;
 6 | 
 7 |     /**
 8 |      * Holds a list of WrapBlock items (task_distribution). The class has no
 9 |      * public members yet.
10 |      *
11 |      * NOTE(review): WrapBlock is not declared or included in this header.
12 |      */
13 |     class Inspector {
14 |         std::vector<WrapBlock> task_distribution;
15 |     public:
16 |     };
17 | };


--------------------------------------------------------------------------------
/src/inspector/gpu_block.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "matrix_block_container.h"
 3 | #include <iostream>
 4 | #include <fstream>
 5 | 
 6 | namespace ftxj {
 7 |     /**
 8 |      * A BlockContainer tagged with an (x, y) block coordinate. A coordinate
 9 |      * of -1 is printed as the placeholder "{...}".
10 |      */
11 |     class GpuBlock {
12 |         int block_idx_;
13 |         int block_idy_;
14 | 
15 |         // Writes one coordinate to std::cout, using "{...}" for -1.
16 |         static void print_coordinate(int coordinate) {
17 |             if(coordinate == -1) {
18 |                 std::cout << "{...}";
19 |                 return;
20 |             }
21 |             std::cout << coordinate;
22 |         }
23 | 
24 |     public:
25 |         BlockContainer blocks_;
26 | 
27 |         // Stores the coordinate and a copy of `blocks`.
28 |         GpuBlock(int x, int y, BlockContainer blocks)
29 |             : block_idx_(x), block_idy_(y), blocks_(blocks) {
30 |         }
31 | 
32 |         // std::vector<int> 
33 |         // Placeholder: currently does nothing.
34 |         void file_gen() {
35 | 
36 |         }
37 | 
38 |         // Prints "(x, y)" followed by a newline, then delegates to
39 |         // blocks_.print_unique().
40 |         void print() {
41 |             std::cout << "(";
42 |             print_coordinate(block_idx_);
43 |             std::cout << ", ";
44 |             print_coordinate(block_idy_);
45 |             std::cout << ")";
46 |             std::cout << "\n";
47 |             blocks_.print_unique();
48 |         }
49 |     };
50 | };


--------------------------------------------------------------------------------
/src/inspector/gpu_run_config.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace ftxj {
 4 |     /**
 5 |      * Plain value holder with three public integer fields. The fields have
 6 |      * no initialisers, so a default-constructed object leaves them unset.
 7 |      */
 8 |     class GpuRunConfig {
 9 |     public:
10 |         int block_num;            // presumably the number of blocks to launch - TODO confirm
11 |         int thread_num;           // presumably the number of threads per block - TODO confirm
12 |         int shared_memory_size;   // units (bytes vs. elements) are not visible here
13 |     };
14 | 
15 | };


--------------------------------------------------------------------------------
/src/inspector/gpu_wrap.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace ftxj {
 4 |     // AC = AB * BC 
 5 |     // dense = dense * sparse
 6 |     /**
 7 |      * Bookkeeping record for one unit of work of the product sketched
 8 |      * above. It currently only declares data members; no constructor or
 9 |      * accessor exists yet.
10 |      *
11 |      * NOTE(review): BlockContainer is not declared or included in this header.
12 |      */
13 |     class Wrap {
14 |     private:
15 |         // Identity of this wrap and of the block it belongs to.
16 |         int wrap_id_;
17 |         int block_id_;
18 | 
19 |         // Extents along the batch, output-channel and input-channel dimensions.
20 |         int batch_dim_;
21 |         int output_channel_dim_;
22 |         int input_channel_dim_;
23 | 
24 |         // Offsets along the same three dimensions.
25 |         int batch_offset_;
26 |         int output_channel_offset_;
27 |         int input_channel_offset_;
28 | 
29 |         int write_dst_;
30 |         
31 |         BlockContainer blocks_;
32 | 
33 |     public:
34 | 
35 |         static const int WRAP_SIZE = 32;
36 |     };
37 | };


--------------------------------------------------------------------------------
/src/inspector/header.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "matrix_block.h"
3 | #include "matrix_block_gen.h"
4 | #include "matrix_block_container.h"
5 | #include "gpu_block_scheduler.h"
6 | 


--------------------------------------------------------------------------------
/src/inspector/matrix_block_gen.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <vector>
  3 | 
  4 | #include "matrix_block.h"
  5 | 
  6 | #include "../utils/matrix.h"
  7 | #include "../utils/string.h"
  8 | 
  9 | namespace ftxj {
 10 | 
 11 |     /**
 12 |      * Splits the entries of a CSRCSCMatrix into rectangular blocks. Each
 13 |      * block is reported as a (start corner, end corner) pair of MatrixPos,
 14 |      * both corners inclusive.
 15 |      */
 16 |     class SparseMatrixBlockGen {
 17 | 
 18 |         // Walks row `start_pos.row_idx` starting at column `start_pos.col_idx`
 19 |         // and counts how many entries sit in consecutive columns
 20 |         // (col, col + 1, ...). Stops at the first gap or at the row's end.
 21 |         static int row_line_succ_max(MatrixPos start_pos, CSRCSCMatrix &csr_csc) {
 22 |             const int row = start_pos.row_idx;
 23 |             const int first_col = start_pos.col_idx;
 24 |             int run = 0;
 25 |             auto it = csr_csc.row_iter_begin_at(row, first_col);
 26 |             while(it != csr_csc.row_iter_end_at(row) && (*it).col == first_col + run) {
 27 |                 ++run;
 28 |                 ++it;
 29 |             }
 30 |             return run;
 31 |         }
 32 | 
 33 | 
 34 |         // Column counterpart of row_line_succ_max: counts entries of column
 35 |         // `start_pos.col_idx` lying in consecutive rows from `start_pos.row_idx`.
 36 |         static int col_line_succ_max(MatrixPos start_pos, CSRCSCMatrix &csr_csc) {
 37 |             const int first_row = start_pos.row_idx;
 38 |             const int col = start_pos.col_idx;
 39 |             int run = 0;
 40 |             auto it = csr_csc.col_iter_begin_at(first_row, col);
 41 |             while(it != csr_csc.col_iter_end_at(col) && (*it).row == first_row + run) {
 42 |                 ++run;
 43 |                 ++it;
 44 |             }
 45 |             return run;
 46 |         }
 47 | 
 48 | 
 49 |         // Finds the largest-area rectangle anchored at `start_pos`, trying
 50 |         // widths 1..min(row run, 16) and using, for each width, the smallest
 51 |         // column run seen so far as the height. Returns the inclusive end
 52 |         // corner; when the row run is 0 this is start_pos shifted by (-1, -1).
 53 |         static MatrixPos rectangels_max(MatrixPos start_pos, CSRCSCMatrix &csr_csc) {
 54 |             const int width_limit = std::min(row_line_succ_max(start_pos, csr_csc), 16);
 55 |             
 56 |             // std::cout << "row max = " << width_limit << std::endl;
 57 |             
 58 |             int height = 70000;   // running minimum; starts above any column run
 59 |             int best_area = 0;
 60 |             int best_width = 0;
 61 |             int best_height = 0;
 62 |             
 63 |             for(int offset = 0; offset < width_limit; ++offset) {
 64 |                 const int width = offset + 1;
 65 |                 const int col_run = col_line_succ_max({start_pos.row_idx, start_pos.col_idx + offset}, csr_csc);
 66 |                 // std::cout << "col max = " << col_run <<  ", " << start_pos.row_idx << ", " << start_pos.col_idx + offset << std::endl;
 67 |                 height = std::min(col_run, height);
 68 |                 const int area = height * width;
 69 |                 if(area > best_area) {
 70 |                     best_area = area;
 71 |                     best_width = width;
 72 |                     best_height = height;
 73 |                 }
 74 |             }
 75 |             return {start_pos.row_idx + best_height - 1, start_pos.col_idx + best_width - 1};
 76 |         }
 77 | 
 78 |     public:
 79 | 
 80 |         // Scans the matrix through its column iterator and emits one
 81 |         // rectangle per visited entry, skipping the entries the rectangle
 82 |         // covers in the current column. All rectangles must have the same
 83 |         // column extent; otherwise a TODO message is printed and the process
 84 |         // exits with status -1.
 85 |         static std::vector<std::pair<MatrixPos, MatrixPos>> naive_method(CSRCSCMatrix &csr_csc) {
 86 | 
 87 |             std::vector<std::pair<MatrixPos, MatrixPos>> rectangles;
 88 | 
 89 |             int end_len = 0;              // never reassigned, so the check using it below cannot fire
 90 |             int block_col_extent = -1;    // -1 until the first rectangle fixes it
 91 | 
 92 |             int lookup_col = 0;
 93 |             int lookup_row = 0;
 94 | 
 95 |             auto col_iter = csr_csc.col_iter_begin_at(lookup_row, lookup_col);
 96 |             
 97 |             for(; col_iter != csr_csc.col_iter_end(); col_iter = col_iter.next_ncol(block_col_extent)) {
 98 |                 while(col_iter != csr_csc.col_iter_end_at(lookup_col)) {
 99 |                     auto row_idx = (*col_iter).row;
100 |                     auto col_idx = (*col_iter).col;
101 |                     // std::cout << " row = " << row_idx << ", col = " <<  col_idx << std::endl;
102 |                     auto end_pos = rectangels_max(MatrixPos(row_idx, col_idx), csr_csc);
103 |                     // std::cout << "end at row = " << end_pos.row_idx << ", col = " <<  end_pos.col_idx << std::endl;
104 | 
105 |                     int col_extent = end_pos.col_idx - col_idx + 1; // original note: "how many rows long"
106 |                     int row_extent = end_pos.row_idx - row_idx + 1; // original note: "how many columns long"
107 |                     if(block_col_extent != -1 && block_col_extent != col_extent) {
108 |                         std::cout << "TODO Just support same len" << std::endl;
109 |                         exit(-1);
110 |                     }
111 |                     
112 |                     block_col_extent = col_extent;
113 | 
114 |                     if(col_extent == 0 && row_extent == 0) {
115 |                         std::cout << "TODO At least one point detected" << std::endl;
116 |                         exit(-1);
117 |                     }
118 |                     if(col_extent != end_len && end_len != 0) {
119 |                         std::cout << "TODO fix this bug" << std::endl;
120 |                         exit(-1);
121 |                     }
122 |                     // Skip the entries of this column that the rectangle already covers.
123 |                     col_iter += row_extent; 
124 |                     rectangles.push_back({MatrixPos(row_idx, col_idx), end_pos});
125 |                 }
126 |                 lookup_col += block_col_extent;
127 |             }
128 |             return rectangles;
129 |         }
130 |     };
131 | 
132 | };


--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils/header.h"
  2 | #include "reorder/header.h"
  3 | #include "inspector/header.h"
  4 | #include "gpu_lib/header.h"
  5 | #include "microbenchmark/header.h"
  6 | #include "fuse/header.h"
  7 | 
  8 | #include <functional>
  9 | #include <algorithm>
 10 | using namespace ftxj;
 11 | 
 12 | /**
 13 |  * Entry point of the single-layer kernel benchmark (neuron = 1024,
 14 |  * batch = 6000, both fixed in the code).
 15 |  *
 16 |  * Usage: exe layer
 17 |  *   layer - number embedded in the weight file name
 18 |  *           (../data/neuron1024/n1024-l<layer>.tsv) and key into stride_map
 19 |  *
 20 |  * Returns 0 after the benchmark ran, 1 when the layer argument is missing.
 21 |  */
 22 | int main(int argc, char* argv[]) {
 23 | 
 24 |     std::cout << "begin" << std::endl;
 25 | 
 26 |     // argv[1] is handed to atoi below; without an argument it is a null pointer.
 27 |     if(argc < 2) {
 28 |         std::cout << "Usage: exe layer" << std::endl;
 29 |         return 1;
 30 |     }
 31 | 
 32 | 
 33 |     int neuron = 1024;
 34 |     int batch = 6000;
 35 | 
 36 | 
 37 |     std::map<int, int> stride_map = {
 38 |         {1, 16},
 39 |         {2, 32},
 40 |         {3, 64},
 41 |         {4, 128},
 42 |         {5, 256},
 43 |         {6, 512},
 44 |         {7, 1024},
 45 |         {8, 2048},
 46 |         {9, 4096},
 47 |         {10, 8192}
 48 |     };
 49 |     int l = atoi(argv[1]);
 50 |     // int l = 5;
 51 |     std::string file_name = "../data/neuron1024/n1024-l" + std::to_string(l) + ".tsv";
 52 |     COOMatrix coo(file_name, 1, false);
 53 |     // COOMatrix coo_2("../data/neuron16384/n16384-l119.tsv", 1, true);
 54 |     // std::cout << "read coo success" << std::endl;
 55 | 
 56 |     HashReorder hash_reorder_t(64, neuron);
 57 |     coo.reorder(hash_reorder_t);
 58 |     // std::cout << "reorder success" << std::endl;
 59 | 
 60 |     // coo_2.reorder(hash_reorder_t);
 61 |     // std::vector<std::vector<int>> block_cols(16384/16);
 62 |     // for(int b = 0; b < 16384 / 16; ++b) {
 63 |     //     for(int j = 0; j < 16; ++j) {
 64 |     //         block_cols[b].push_back(b * 16 + j);
 65 |     //     }
 66 |     // }
 67 |     // FuseLayer fuse(coo, block_cols);
 68 |     // fuse.print_need_access();
 69 |     // fuse.fuse(coo_2);
 70 |     // fuse.print_need_access();
 71 |     // return 0;
 72 |     
 73 |     CSRCSCMatrix csr_csc(coo);
 74 |     std::cout << "coo to csr_csc success" << std::endl;
 75 | 
 76 |     // UIUCMatrix uiuc(csr_csc, 256, neuron);
 77 |     // std::cout << "uiuc success" << std::endl;
 78 | 
 79 |     GpuEnv env(0);
 80 |     // test_benchmark_succ_load_store(batch, neuron, env);
 81 |     // test_benchmark_matrix_transpose(batch, neuron, env); 
 82 |     // test_benchmark_matrix_transpose_and_delete(batch, neuron, env);
 83 |     // return 0;
 84 | 
 85 |     // test_benchmark_20_uiuc(coo, uiuc,  batch, env);
 86 |     // return 0;
 87 | 
 88 |     // uiuc_test_benchmark(coo, uiuc, env);
 89 |     // uiuc.print_buffdispl();
 90 |     // uiuc.print_mapdispl();
 91 |     // uiuc.print_map();
 92 |     // uiuc.print_warpdispl();
 93 |     // uiuc.print_warpindex();
 94 |     
 95 |     csr_csc.transpose();
 96 |     BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method);
 97 |     std::cout << "block container success" << std::endl;
 98 |     // blocks.print();
 99 | 
100 |     MaxInReuseBSchedule schedule(blocks);
101 |     
102 |     // schedule.schedule_output_parallel(128, 1, false);
103 |     schedule.schedule(128, 1);
104 | 
105 |     std::cout << "block schedule succ" << std::endl;
106 |     
107 |     // auto data = schedule.get_data2(neuron);
108 |     auto data = schedule.get_data(neuron);
109 |     
110 | 
111 |     // std::cout << "data size = " << data.value.size() << std::endl;
112 |     // std::cout << "data access size = " << data.row_access.size() << std::endl;
113 |     
114 |     // std::cout << "data load idx len = ";
115 |     // for(int i = 0; i < data.load_idx_row_len.size(); ++i) {
116 |     //     std::cout << data.load_idx_row_len[i] << ", ";
117 |     // }
118 |     // std::cout << std::endl;
119 |     
120 |     // std::cout << "data row access = ";
121 |     // for(int i = 0; i < data.row_access.size(); ++i) {
122 |     //     std::cout << data.row_access[i] << ", ";
123 |     // }
124 |     // std::cout << std::endl;
125 | 
126 |     // std::cout << "data value access = ";
127 |     // for(int i = 0; i < data.value_access.size(); ++i) {
128 |     //     std::cout << data.value_access[i] << ", ";
129 |     // }
130 |     // std::cout << std::endl;
131 | 
132 |     // schedule.print_schedule();
133 | 
134 |     // test_benchmark_row_succ_20_uiuc(coo, data.value, data.row_access, batch, neuron, env);
135 |     // test_benchmark_row_succ_20_uiuc_transpose(coo, data.value, data.row_access, batch, neuron, env);
136 |     // test_benchmark_row_succ_20_uiuc_transpose_no_conflict(coo, data.value, data.row_access, batch, neuron, env);
137 |     // test_benchmark_rectangels_batch_parallel_kernel(coo, data.value, data.row_access, batch, neuron, env);
138 |     // NOTE(review): stride_map[l] yields 0 for a layer outside 1..10, because
139 |     // operator[] inserts a default value - confirm whether the kernel accepts that.
140 |     test_benchmark_n16384_l2_l10_kernel(coo, data.value, stride_map[l], batch, neuron, env);
141 |     // test_benchmark_n16384_l11_kernel(coo, data.value, data.row_access, batch, neuron, env);
142 | 
143 | 
144 | 
145 |     // GpuEnv env(0);
146 |     
147 |     // vector4_load_data_benchmark(env);
148 | 
149 |     // test_shared_memory_mm(coo, data.value, data.row_access, env);
150 | 
151 |     return 0;
152 | }


--------------------------------------------------------------------------------
/src/mc_test.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils/header.h"
 2 | #include "reorder/header.h"
 3 | #include "inspector/header.h"
 4 | #include "gpu_lib/header.h"
 5 | #include "microbenchmark/header.h"
 6 | #include "fuse/header.h"
 7 | 
 8 | #include <functional>
 9 | #include <algorithm>
10 | using namespace ftxj;
11 | 
12 | int main(int argc, char* argv[]) {
13 | 
14 |     std::cout << "begin" << std::endl;
15 | 
16 | 
17 |     std::map<int, int> hash_map = {
18 |         {65536, 4096},
19 |         {16384, 1024},
20 |         {4096, 256},
21 |         {1024, 64}
22 |     };
23 |     int neuron = atoi(argv[1]);
24 |     int batch = atoi(argv[2]);
25 |     int l = atoi(argv[3]);
26 |     int hash_type = atoi(argv[4]);
27 |     
28 |     std::string file_name = "../data/neuron"+ 
29 |         std::to_string(neuron) + "/n" + std::to_string(neuron) +"-l" + std::to_string(l) + ".tsv";
30 |     COOMatrix coo(file_name, 1, false);
31 |     std::cout << "read coo success" << std::endl;
32 | 
33 |     if(hash_type == 0) {
34 |     }
35 | 
36 |     if(hash_type == 1) {
37 |         HashReorder hash_reorder_t(hash_map[neuron], neuron, REORDER::ROW_REORDER);
38 |         coo.reorder(hash_reorder_t);
39 |     }
40 |     
41 |     if(hash_type == 2) {
42 |         HashReorder hash_reorder_t(hash_map[neuron], neuron, REORDER::COL_REORDER);
43 |         coo.reorder(hash_reorder_t);
44 |     }
45 | 
46 |     if(hash_type == 3) {
47 |         HashReorder hash_reorder_t(hash_map[neuron], neuron, REORDER::ALL_REORDER);
48 |         coo.reorder(hash_reorder_t);
49 |     }
50 |     
51 |     std::cout << "reorder success" << std::endl;
52 |     
53 |     CSRCSCMatrix csr_csc(coo);
54 |     std::cout << "coo to csr_csc success" << std::endl;
55 | 
56 |     UIUCMatrix uiuc(csr_csc, 256, neuron);
57 |     std::cout << "uiuc success" << std::endl;
58 | 
59 |     GpuEnv env(0);
60 | 
61 |     test_benchmark_20_uiuc(coo, uiuc, batch, env);
62 |     return 0;
63 | 
64 | 
65 |     return 0;
66 | }


--------------------------------------------------------------------------------
/src/microbenchmark/bf.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <stdio.h>
  5 | #include <algorithm>
  6 | #include <cstdlib>
  7 | #include <cstring>
  8 | 
  9 | namespace ftxj {
 10 | 
 11 | __global__ void bf_spmm(
 12 |     
 13 |     float* Y0, // input
 14 |     float* Y1,
 15 | 
 16 |     int* roffW,  // len neuron * N_SLAB - 1
 17 |     int* colsW,  // index 32 * neuron
 18 |     float* valsW, // all 32 * neuron 0.0625
 19 | 
 20 |     int COL_BLK, // TN, shared memory size = TN 
 21 |     int N_SLAB, //  neuron / TN
 22 |     int neuron // neuron
 23 | 
 24 | 
 25 | ) {
 26 | 
 27 |   extern  __shared__ float shRow[];
 28 | 
 29 |   int tid = threadIdx.y * blockDim.x + threadIdx.x;
 30 |   int rid = blockIdx.x;
 31 | 
 32 |   __syncthreads();
 33 | 
 34 |   for(int i = 0; i < N_SLAB; i++) {
 35 |     __syncthreads();
 36 |     for(int j = threadIdx.x; j < COL_BLK; j++) {
 37 |       shRow[j] = 0;  
 38 |     }
 39 |     __syncthreads();
 40 |     for(int j = threadIdx.y; j < neuron; j += blockDim.y) {
 41 |       float valY = Y0[rid * neuron + j];
 42 |     //   if(valY == 0) {
 43 |     //     continue;
 44 |     //   }
 45 | 
 46 |       int begOffW = roffW[i * neuron + j] + threadIdx.x;
 47 |       int endOffW = roffW[i * neuron + j + 1];
 48 |       
 49 |       for(int k = begOffW; k < endOffW; k += blockDim.x) {
 50 |         int colW = colsW[k];
 51 |         float valW = valsW[k];
 52 |         // if(colW - i * COL_BLK < 0 || colW - i * COL_BLK >= 1024) {
 53 |         //   printf("bugs %d %d %d %d\n", k, i, colW, colW - i * COL_BLK);
 54 |         // }
 55 |         atomicAdd(&shRow[colW - i * COL_BLK], valY * valW);
 56 |       }
 57 |     }
 58 |     __syncthreads();
 59 |     int count = 0;
 60 |     for(size_t j = 0; j < COL_BLK; j += blockDim.x * blockDim.y) {
 61 |     //   float v = j + tid < COL_BLK ? shRow[j + tid] + bias : -1;
 62 |     //   count += __syncthreads_count(v > 0);
 63 |       if(j + tid < COL_BLK) {
 64 |         Y1[rid * neuron + i * COL_BLK + j + tid] = shRow[j + tid];
 65 |         // min(T(32), max(T(0), v));
 66 |       }
 67 |     }
 68 |   }
 69 | }
 70 | 
 71 | void test_benchmark_19_BF(COOMatrix &coo, BFMatrix &matrix, 
 72 |     int neuron, int batch, int TN, 
 73 |     int blockx, int blocky,
 74 |     GpuEnv &env) {
 75 | 
 76 |     float *nextfeat;
 77 |     float *currfeat;
 78 | 
 79 |     int *rowoff;
 80 | 
 81 |     int off_size = neuron * (neuron / TN + 1) + 1;
 82 |     
 83 |     int *rowindex;
 84 |     
 85 |     int weight_nnz = 32 * neuron;
 86 | 
 87 |     float *value; 
 88 | 
 89 |     float bias = 0;
 90 |     int mybatch = batch;
 91 | 
 92 |     // std::vector<std::vector<float>> input(mybatch, std::vector<float>(neuron, 0.0));
 93 | 	  float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
 94 | 	  memset(input, 0, sizeof(float) * neuron * mybatch);
 95 | 
 96 | 	  float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
 97 | 	  memset(output, 0, sizeof(float) * neuron * mybatch);
 98 | 
 99 | 
100 |     srand (static_cast <unsigned> (time(0)));
101 |     for(int i = 0; i < mybatch; ++i) {
102 |       for(int j = 0; j < neuron; ++j) {
103 |         float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
104 |         input[i * neuron + j] = r2;
105 |       }
106 |     }
107 |   
108 |     Safe_Call(cudaMalloc((void**)&rowoff, sizeof(int) * off_size));
109 |     Safe_Call(cudaMemcpy(rowoff, &matrix.rowoff[0], sizeof(int) * off_size, cudaMemcpyHostToDevice));
110 |     
111 |     Safe_Call(cudaMalloc((void**)&rowindex, sizeof(int) * weight_nnz));
112 |     Safe_Call(cudaMemcpy(rowindex, &matrix.rowindex[0], sizeof(int) * weight_nnz, cudaMemcpyHostToDevice));
113 | 
114 |     Safe_Call(cudaMalloc((void**)&value, sizeof(float) * weight_nnz));
115 |     Safe_Call(cudaMemcpy(value, &matrix.val[0], sizeof(float) * weight_nnz, cudaMemcpyHostToDevice));
116 | 
117 |     Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * neuron * mybatch));
118 |     Safe_Call(cudaMemcpy(currfeat, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
119 | 
120 |     Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * neuron * mybatch));
121 |     Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * neuron * mybatch));
122 | 
123 |     std::cout << "begin inference..." << std::endl; 
124 |     env.add_event("uiuc_kernel_timer");
125 |     env.event_start_record("uiuc_kernel_timer");
126 | 
127 |     dim3 block(blockx, blocky);
128 |     dim3 grid(batch);
129 |     bf_spmm<<<grid,block, sizeof(float) * TN, env.get_stream("uiuc_kernel_timer")>>>(
130 |         currfeat, nextfeat,  rowoff, rowindex, value, TN, neuron / TN, neuron
131 |     );
132 | 
133 |     env.event_stop_record("uiuc_kernel_timer");
134 |     float time = env.get_event_time("uiuc_kernel_timer"); 
135 | 
136 |     Safe_Call(cudaMemcpy(output, nextfeat, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
137 | 
138 |     std::cout << "Kernel Exec Time [19-BF] = " << time << "ms"<< std::endl;
139 |     std::cout << "Flops [19-BF] = " << float(2 * batch * neuron * 32) /  time * 1000 /1e12 << "TFLOPS"<< std::endl;
140 |     
141 | 	  CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, true, true);
142 | }
143 | 
144 | }
145 | 


--------------------------------------------------------------------------------
/src/microbenchmark/bf_opt.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <stdio.h>
  5 | #include <algorithm>
  6 | #include <cstdlib>
  7 | #include <cstring>
  8 | 
  9 | namespace ftxj {
 10 | 
 11 | __global__ void bf_spmm(
 12 |     
 13 |     float* Y0, // input
 14 |     float* Y1,
 15 | 
 16 |     int* roffW,  // len neuron * N_SLAB - 1
 17 |     int* colsW,  // index 32 * neuron
 18 |     float* valsW, // all 32 * neuron 0.0625
 19 | 
 20 |     int* no_name_len,
 21 |     int* no_name_idx,
 22 | 
 23 |     int COL_BLK, // TN, shared memory size = TN 
 24 |     int N_SLAB, //  neuron / TN
 25 |     int neuron // neuron
 26 | 
 27 | 
 28 | ) {
 29 | 
 30 |   extern  __shared__ float shRow[];
 31 | 
 32 |   int tid = threadIdx.y * blockDim.x + threadIdx.x;
 33 |   int rid = blockIdx.x;
 34 | 
 35 |   __syncthreads();
 36 | 
 37 |   for(int i = 0; i < N_SLAB; i++) {
 38 |     __syncthreads();
 39 |     for(int j = threadIdx.x; j < COL_BLK; j++) {
 40 |       shRow[j] = 0;  
 41 |     }
 42 |     __syncthreads();
 43 |     int no_name_len_beg = no_name_len[i] + threadIdx.y;
 44 |     int no_name_len_end = no_name_len[i + 1];
 45 |     for(int j = no_name_len_beg; j < no_name_len_end; j += blockDim.y) {
 46 |         int real_j = no_name_idx[j]
 47 |       float valY = Y0[rid * neuron + real_j];
 48 |     //   if(valY == 0) {
 49 |     //     continue;
 50 |     //   }
 51 | 
 52 |       int begOffW = roffW[i * neuron + real_j] + threadIdx.x;
 53 |       int endOffW = roffW[i * neuron + real_j + 1];
 54 |       
 55 |       for(int k = begOffW; k < endOffW; k += blockDim.x) {
 56 |         int colW = colsW[k];
 57 |         float valW = valsW[k];
 58 |         // if(colW - i * COL_BLK < 0 || colW - i * COL_BLK >= 1024) {
 59 |         //   printf("bugs %d %d %d %d\n", k, i, colW, colW - i * COL_BLK);
 60 |         // }
 61 |         atomicAdd(&shRow[colW - i * COL_BLK], valY * valW);
 62 |       }
 63 |     }
 64 |     __syncthreads();
 65 |     int count = 0;
 66 |     for(size_t j = 0; j < COL_BLK; j += blockDim.x * blockDim.y) {
 67 |     //   float v = j + tid < COL_BLK ? shRow[j + tid] + bias : -1;
 68 |     //   count += __syncthreads_count(v > 0);
 69 |       if(j + tid < COL_BLK) {
 70 |         Y1[rid * neuron + i * COL_BLK + j + tid] = shRow[j + tid];
 71 |         // min(T(32), max(T(0), v));
 72 |       }
 73 |     }
 74 |   }
 75 | }
 76 | 
 77 | void test_benchmark_19_BF(COOMatrix &coo, BFMatrix &matrix, 
 78 |     int neuron, int batch, int TN, 
 79 |     int blockx, int blocky,
 80 |     GpuEnv &env) {
 81 | 
 82 |     float *nextfeat;
 83 |     float *currfeat;
 84 | 
 85 |     int *rowoff;
 86 | 
 87 |     int off_size = neuron * (neuron / TN + 1) + 1;
 88 |     
 89 |     int *rowindex;
 90 |     
 91 |     int weight_nnz = 32 * neuron;
 92 | 
 93 |     float *value; 
 94 | 
 95 |     float bias = 0;
 96 |     int mybatch = batch;
 97 | 
 98 |     // std::vector<std::vector<float>> input(mybatch, std::vector<float>(neuron, 0.0));
 99 | 	  float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
100 | 	  memset(input, 0, sizeof(float) * neuron * mybatch);
101 | 
102 | 	  float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
103 | 	  memset(output, 0, sizeof(float) * neuron * mybatch);
104 | 
105 | 
106 |     srand (static_cast <unsigned> (time(0)));
107 |     for(int i = 0; i < mybatch; ++i) {
108 |       for(int j = 0; j < neuron; ++j) {
109 |         float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
110 |         input[i * neuron + j] = r2;
111 |       }
112 |     }
113 |   
114 |     Safe_Call(cudaMalloc((void**)&rowoff, sizeof(int) * off_size));
115 |     Safe_Call(cudaMemcpy(rowoff, &matrix.rowoff[0], sizeof(int) * off_size, cudaMemcpyHostToDevice));
116 |     
117 |     Safe_Call(cudaMalloc((void**)&rowindex, sizeof(int) * weight_nnz));
118 |     Safe_Call(cudaMemcpy(rowindex, &matrix.rowindex[0], sizeof(int) * weight_nnz, cudaMemcpyHostToDevice));
119 | 
120 |     Safe_Call(cudaMalloc((void**)&value, sizeof(float) * weight_nnz));
121 |     Safe_Call(cudaMemcpy(value, &matrix.val[0], sizeof(float) * weight_nnz, cudaMemcpyHostToDevice));
122 | 
123 |     Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * neuron * mybatch));
124 |     Safe_Call(cudaMemcpy(currfeat, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
125 | 
126 |     Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * neuron * mybatch));
127 |     Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * neuron * mybatch));
128 | 
129 |     std::cout << "begin inference..." << std::endl; 
130 |     env.add_event("uiuc_kernel_timer");
131 |     env.event_start_record("uiuc_kernel_timer");
132 | 
133 |     dim3 block(blockx, blocky);
134 |     dim3 grid(batch);
135 |     bf_spmm<<<grid,block, sizeof(float) * TN, env.get_stream("uiuc_kernel_timer")>>>(
136 |         currfeat, nextfeat,  rowoff, rowindex, value, TN, neuron / TN, neuron
137 |     );
138 | 
139 |     env.event_stop_record("uiuc_kernel_timer");
140 |     float time = env.get_event_time("uiuc_kernel_timer"); 
141 | 
142 |     Safe_Call(cudaMemcpy(output, nextfeat, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
143 | 
144 |     std::cout << "Kernel Exec Time [19-BF] = " << time << "ms"<< std::endl;
145 |     std::cout << "Flops [19-BF] = " << float(2 * batch * neuron * 32) /  time * 1000 /1e12 << "TFLOPS"<< std::endl;
146 |     
147 | 	  CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, true, true);
148 | }
149 | 
150 | }
151 | 


--------------------------------------------------------------------------------
/src/microbenchmark/cusparse_spmm.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <stdio.h>
  5 | #include <algorithm>
  6 | #include <cstdlib>
  7 | #include <cstring>
  8 | #include <stdio.h>
  9 | #include <iostream>
 10 | #include <cusparse.h>
 11 | #include <vector>
 12 | 
 13 | namespace ftxj {
 14 | 
 15 | #define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}
 16 | 
 17 | 
 18 | void test_benchmark_cusparse(COOMatrix& coo, cuSPARSEMatrix &matrix, int neuron, int batch) {
 19 | 
 20 | 	float * input = (float*)malloc(sizeof(float) * neuron * batch);
 21 | 	memset(input, 0, sizeof(float) * neuron * batch);
 22 | 
 23 | 	float * output = (float*)malloc(sizeof(float) * neuron * batch);
 24 | 	memset(output, 0, sizeof(float) * neuron * batch);
 25 | 
 26 | 
 27 |     srand (static_cast <unsigned> (time(0)));
 28 |     for(int i = 0; i < batch; ++i) {
 29 |       for(int j = 0; j < neuron; ++j) {
 30 |         float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 31 |         input[i * neuron + j] = r2;
 32 |       }
 33 |     }
 34 | 
 35 |     float *A_d;
 36 |     float *B_d;
 37 |     
 38 |     int* len_d;
 39 |     int* index_d;
 40 |     float* val_d;
 41 | 
 42 |     Safe_Call(cudaMalloc((void**)&A_d, sizeof(float) * neuron * batch));
 43 |     Safe_Call(cudaMemcpy(A_d, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice));
 44 | 
 45 |     Safe_Call(cudaMalloc((void**)&B_d, sizeof(float) * neuron * batch));
 46 |     Safe_Call(cudaMemset(B_d, 0, sizeof(float) * neuron * batch));
 47 | 
 48 | 
 49 |     Safe_Call(cudaMalloc((void**)&len_d, sizeof(int) * (neuron + 1)));
 50 |     Safe_Call(cudaMemcpy(len_d, matrix.len, sizeof(int) * (neuron + 1), cudaMemcpyHostToDevice));
 51 | 
 52 |     Safe_Call(cudaMalloc((void**)&index_d, sizeof(int) * (neuron * 32)));
 53 |     Safe_Call(cudaMemcpy(index_d, matrix.index, sizeof(int) * (neuron * 32), cudaMemcpyHostToDevice));
 54 |   
 55 |     Safe_Call(cudaMalloc((void**)&val_d, sizeof(float) * (neuron * 32)));
 56 |     Safe_Call(cudaMemcpy(val_d, matrix.val, sizeof(float) * (neuron * 32), cudaMemcpyHostToDevice));
 57 | 
 58 | 
 59 | 
 60 |     cusparseHandle_t handle = NULL;
 61 |     cusparseSpMatDescr_t matA;
 62 |     cusparseDnMatDescr_t matB, matC;
 63 |     void* dBuffer    = NULL;
 64 |     size_t bufferSize = 0;
 65 |     float alpha = 1.0f;
 66 |     float beta = 0.0f;
 67 | 
 68 |     CUSPARSE_CHECK( cusparseCreate(&handle) )
 69 |     // Create sparse matrix A in CSR format
 70 | 
 71 |     CUSPARSE_CHECK(cusparseCreateCsr(&matA, neuron, neuron, 32 * neuron,
 72 |                                       len_d, index_d, val_d,
 73 |                                       CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
 74 |                                       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F))
 75 |     // Create dense matrix B
 76 |     CUSPARSE_CHECK(cusparseCreateDnMat(&matB, neuron, batch, neuron, A_d,
 77 |                                     CUDA_R_32F, CUSPARSE_ORDER_COL) )
 78 |                                         
 79 |     // Create dense matrix C
 80 |     CUSPARSE_CHECK(cusparseCreateDnMat(&matC, neuron, batch, neuron, B_d,
 81 |                                     CUDA_R_32F, CUSPARSE_ORDER_COL) )
 82 |     
 83 | 
 84 |                                         
 85 |     Safe_Call(cudaMalloc(&dBuffer, bufferSize));
 86 |               
 87 | 
 88 |     CUSPARSE_CHECK(cusparseSpMM_bufferSize(
 89 |                                 handle,
 90 |                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
 91 |                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
 92 |                                 &alpha, matA, matB, &beta, matC, CUDA_R_32F,
 93 |    
 94 |                                 CUSPARSE_CSRMM_ALG1, &bufferSize) )
 95 |    
 96 |     cudaEvent_t start, stop;
 97 |     cudaEventCreate(&start);
 98 |     cudaEventCreate(&stop);
 99 |     cudaEventRecord(start, 0);
100 | 
101 |    
102 |     CUSPARSE_CHECK( cusparseSpMM(handle,
103 |             CUSPARSE_OPERATION_NON_TRANSPOSE,
104 |             CUSPARSE_OPERATION_NON_TRANSPOSE,
105 |             &alpha, matA, matB, &beta, matC, CUDA_R_32F,
106 |             CUSPARSE_MM_ALG_DEFAULT, dBuffer) )
107 |     
108 |     cudaEventRecord(stop,0);
109 |     cudaEventSynchronize(stop);
110 |     float elapsed;
111 |     cudaEventElapsedTime(&elapsed, start, stop); //ms 
112 | 
113 |     // destroy matrix/vector descriptors
114 |     CUSPARSE_CHECK( cusparseDestroySpMat(matA) )
115 |     CUSPARSE_CHECK( cusparseDestroyDnMat(matB) )
116 |     CUSPARSE_CHECK( cusparseDestroyDnMat(matC) )
117 |     CUSPARSE_CHECK( cusparseDestroy(handle) )
118 | 
119 |     Safe_Call(cudaMemcpy(output, B_d, neuron * batch  * sizeof(float), cudaMemcpyDeviceToHost));
120 | 
121 | 	std::cout << "kernel time = " << elapsed << "ms" << std::endl;
122 |     std::cout << "Flops [cuSparse] = " << float(2 * batch * neuron * 32) /  elapsed * 1000 /1e12 << "TFLOPS"<< std::endl;
123 | 
124 | 	CpuSpmm::run_and_cmp(coo, input, neuron, batch, output, false);
125 | 
126 | }
127 | }
128 | 


--------------------------------------------------------------------------------
/src/microbenchmark/header.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "../utils/header.h"
 3 | 
 4 | namespace ftxj {
 5 | 
 6 |     void test_benchmark_succ_load_store(int, int, GpuEnv &);
 7 |     void test_benchmark_matrix_transpose(int batch, int neuron, GpuEnv &env);
 8 | 
 9 | 
10 |     void test_benchmark_20_uiuc(COOMatrix&, UIUCMatrix &, int , GpuEnv &);
11 |     void test_benchmark_row_succ_20_uiuc(COOMatrix&, std::vector<float> &, std::vector<int> &, int, int, GpuEnv &);
12 |     void test_benchmark_row_succ_20_uiuc_transpose(COOMatrix&, std::vector<float> &, std::vector<int> &, int, int, GpuEnv &);
13 |     void test_benchmark_row_succ_input_transpose_batch_parallel(COOMatrix&, std::vector<float> &, std::vector<int> &, int, int, GpuEnv &);
14 |     void test_benchmark_rectangels_batch_parallel_kernel(COOMatrix&, std::vector<float> &, std::vector<int> &, int, int, GpuEnv &);
15 | 
16 |     void test_benchmark_graph_challenge(std::vector<std::vector<float>> &input, 
17 |         std::vector<std::vector<float>> &weight, std::vector<std::vector<int>> &row_access, 
18 |         int batch, int neuron, float bias,GpuEnv &env
19 |     );
20 | 
21 |     void test_benchmark_matrix_transpose_and_delete(int batch, int neuron, GpuEnv &env);
22 | 
23 |     void test_benchmark_n16384_l2_l10_kernel(COOMatrix& coo, std::vector<float> &val, int stride, int batch, int neuron, GpuEnv &env);
24 |     void test_benchmark_n16384_l11_kernel(COOMatrix& coo, std::vector<float> &B_val, std::vector<int> &B_index, int batch, int neuron, GpuEnv &env);
25 | 
26 |     void test_benchmark_fused_layer1024_0_1(
27 |         std::vector<std::vector<float>> &input,
28 |         std::vector<COOMatrix>& coo,
29 |         std::vector<std::vector<float>> &weight, 
30 |         std::vector<std::vector<int>> &row_access, 
31 |         int batch, 
32 |         int neuron, 
33 |         float bias,
34 |         GpuEnv &env
35 |     );
36 | 
37 |     void test_benchmark_cusparse(COOMatrix& coo, 
38 |         cuSPARSEMatrix &matrix, 
39 |         int neuron, int batch);
40 | 
41 |         
42 |     void test_benchmark_19_BF(
43 |         COOMatrix &coo, BFMatrix &matrix, 
44 |         int neuron, int batch, int TN, 
45 |         int blockx, int blocky,
46 |         GpuEnv &env
47 |     );
48 | 
49 |     void test_benchmark_fuse_cmp_layer1024_0_1(
50 |         std::vector<std::vector<float>> &input,
51 |         std::vector<std::vector<float>> &weight, 
52 |         std::vector<std::vector<int>> &row_access, 
53 |         int batch, 
54 |         int neuron, 
55 |         float bias,
56 |         GpuEnv &env
57 |     );
58 |     // void test_benchmark_n16384_l11_kernel(
59 |     //     COOMatrix& coo, 
60 |     //     std::vector<float> &B_val, 
61 |     //     std::vector<int> &B_index, 
62 |     //     std::vector<int> &A_row_access,
63 |     //     std::vector<int> &A_row_access_len,
64 |     //     int max_input_access,
65 |     //     int batch, int neuron, 
66 |     //     GpuEnv &env
67 |     // );
68 | 
69 | 
70 | 
71 |     void test_benchmark_SNIG(
72 |         std::vector<std::vector<float>> &input,
73 |         std::vector<SNIGMatrix> &weights, 
74 |         int batch, 
75 |         int neuron,
76 |         int sec_size,
77 |         int nnzs, 
78 |         float bias,
79 |         GpuEnv &env
80 |     );
81 |     void vector4_load_data_benchmark(GpuEnv &env);
82 |     void test_shared_memory_mm(COOMatrix&, std::vector<float> &val, std::vector<int> &row_access, GpuEnv &env);
83 | };
84 | 


--------------------------------------------------------------------------------
/src/microbenchmark/load-data.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | namespace ftxj {
  5 | 
  6 | #define BLOCK_LOAD (256 * 10)
  7 | #define VECTOR_BLOCK_LOAD (32 * 2)
  8 | 
  9 | __global__ void naive_copy(float *nextfeat, float *currfeat){
 10 |     extern __shared__ float shared[];
 11 |     int i = blockIdx.x * BLOCK_LOAD; 
 12 | 	for(int j = threadIdx.x; j < BLOCK_LOAD; j += blockDim.x) {
 13 | 		shared[j] = currfeat[i + j];
 14 | 	}
 15 |     __syncthreads();
 16 |     for(int j = threadIdx.x; j < BLOCK_LOAD; j += blockDim.x) {
 17 |         nextfeat[i + j] = shared[j] + 1;
 18 | 	}
 19 | };
 20 | 
 21 | 
 22 | __global__ void uiuc_copy(float *nextfeat, float *currfeat) {
 23 |     extern __shared__ float shared[];
 24 | 
 25 | 	int i = blockIdx.x * 12 * 16384 + blockIdx.y * 256;
 26 | 
 27 | 	for(int j = threadIdx.x; j < 12 * 256; j += blockDim.x) {
 28 |         shared[j] =  currfeat[i + j];
 29 | 	}
 30 | 	__syncthreads();
 31 |     for(int j = threadIdx.x; j < 12 * 256; j += blockDim.x) {
 32 |         nextfeat[i + j] = shared[j] + 1;
 33 | 	}
 34 | };
 35 | 
 36 | __global__ void vector4_copy(float* nextfeat, float* currfeat) {
 37 |     int idx = blockIdx.x * VECTOR_BLOCK_LOAD;
 38 | 	float4* pin = reinterpret_cast<float4*>(currfeat);
 39 | 	float4* pout = reinterpret_cast<float4*>(nextfeat);
 40 | 	for(int i = threadIdx.x; i < VECTOR_BLOCK_LOAD; i += blockDim.x) {
 41 | 		pout[idx + i] = pin[idx + i];
 42 |     }
 43 | };
 44 | 
 45 | 
 46 | void vector4_load_data_benchmark(GpuEnv &env) {
 47 |     float *nextfeat;
 48 |     float *currfeat;
 49 | 
 50 |     int mybatch = 60000;
 51 | 	int neuron = 1024;
 52 | 
 53 |     std::vector<std::vector<float>> input(mybatch, std::vector<float>(neuron, 1.0));
 54 | 
 55 |     Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * mybatch * neuron));
 56 |     Safe_Call(cudaMemcpy(currfeat, &input[0][0], sizeof(float) * mybatch * neuron, cudaMemcpyHostToDevice));
 57 | 
 58 |     Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * mybatch * neuron));
 59 |     Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * mybatch * neuron));
 60 | 
 61 | 
 62 | 	env.add_event("float4 copy");
 63 |     env.event_start_record("float4 copy");
 64 | 
 65 |     dim3 block(64);
 66 |     dim3 grid((mybatch * neuron) / BLOCK_LOAD);
 67 | 
 68 |     vector4_copy<<<grid,block, 0, env.get_stream("float4 copy")>>>(
 69 |         nextfeat, currfeat
 70 |     );
 71 | 
 72 |     env.event_stop_record("float4 copy");
 73 | 	float time2 = env.get_event_time("float4 copy");
 74 | 	
 75 |     std::cout << "float4 bandwidth = " << 2 * (mybatch * (float)neuron * sizeof(float)) / (time2 / 1000) / 1024.0 / 1024.0 / 1024.0 << "GB/s" << std::endl;
 76 |     
 77 | 	std::cout << "data load and write timer = " << time2 << std::endl;
 78 | }
 79 | 
 80 | void test_benchmark_succ_load_store(int mybatch, int neuron, GpuEnv &env) {
 81 |     float *nextfeat;
 82 |     float *currfeat;
 83 |     std::vector<std::vector<float>> input(mybatch, std::vector<float>(neuron, 1.0));
 84 | 
 85 |     Safe_Call(cudaMalloc((void**)&currfeat, sizeof(float) * mybatch * neuron));
 86 |     Safe_Call(cudaMemcpy(currfeat, &input[0][0], sizeof(float) * mybatch * neuron, cudaMemcpyHostToDevice));
 87 | 
 88 |     Safe_Call(cudaMalloc((void**)&nextfeat, sizeof(float) * mybatch * neuron));
 89 |     Safe_Call(cudaMemset(nextfeat, 0, sizeof(float) * mybatch * neuron));
 90 | 
 91 |     env.add_event("naive copy");
 92 |     env.event_start_record("naive copy");
 93 | 
 94 |     dim3 block(256);
 95 |     dim3 grid((mybatch * neuron) /BLOCK_LOAD);
 96 | 
 97 |     naive_copy<<<grid,block, BLOCK_LOAD *sizeof(float), env.get_stream("naive copy")>>>(
 98 |         nextfeat, currfeat
 99 |     );
100 | 
101 |     env.event_stop_record("naive copy");
102 | 
103 |     float time1 = env.get_event_time("naive copy"); 
104 | 	
105 | 	std::cout << "Load&Store Time [Succ] = " << time1 << "ms" << std::endl;
106 |     std::cout << "Load&Store Bandwidth [Succ] = " << 2 * (mybatch * (float)neuron * sizeof(float)) / (time1 / 1000) / 1024.0 / 1024.0 / 1024.0 << "GB/s" << std::endl;
107 |     
108 | }
109 | };


--------------------------------------------------------------------------------
/src/microbenchmark/matrix_transpose.cu:
--------------------------------------------------------------------------------
 1 | #include <cuda.h>
 2 | #include "../gpu_lib/header.h"
 3 | #include "../utils/header.h"
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | namespace ftxj {
 7 | 
 8 | #define TILE_DIM 64
 9 | #define BLOCK_ROWS 16
10 | 
11 | __global__ void matrix_transpose(float * __restrict__ odata, float * __restrict__ idata, int neuron, int batch) {
12 | 
13 |     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
14 |     int x = blockIdx.x * TILE_DIM + threadIdx.x;
15 |     int y = blockIdx.y * TILE_DIM + threadIdx.y;
16 | 
17 |     for (int j = 0; j < TILE_DIM && (y + j) < batch; j += BLOCK_ROWS) {
18 |         tile[(threadIdx.y + j)][threadIdx.x] = idata[(y + j) * neuron + x];
19 |     }
20 | 
21 |     __syncthreads();
22 | 
23 | 
24 |     x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
25 |     y = blockIdx.x * TILE_DIM + threadIdx.y;
26 | 
27 |     for (int j = 0; j < TILE_DIM && x < batch; j += BLOCK_ROWS) {
28 |         odata[(y+j) * batch + x] = tile[threadIdx.x][threadIdx.y + j];
29 |     }    
30 | };
31 | 
/**
 * Times matrix_transpose on a random batch x neuron matrix, prints the time
 * and the effective bandwidth, and checks the result on the CPU.
 *
 * @param batch   number of input rows
 * @param neuron  number of input columns
 * @param env     provides the timing event and stream named "transpose"
 */
void test_benchmark_matrix_transpose(int batch, int neuron, GpuEnv &env) {

    float *A = NULL;
    float *C = NULL;
    const size_t bytes = sizeof(float) * neuron * batch;

    // Host buffers, zero-initialised.
    float * input = (float*)calloc(1, bytes);
    float * output = (float*)calloc(1, bytes);
    if (input == NULL || output == NULL) {
        printf("test_benchmark_matrix_transpose: host allocation failed\n");
        free(input);
        free(output);
        return;
    }

    // Random values in [0, 32].
    srand (static_cast <unsigned> (time(0)));
    for(int i = 0; i < batch; ++i) {
        for(int j = 0; j < neuron; ++j) {
            float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
            input[i * neuron + j] = r2;
        }
    }

    Safe_Call(cudaMalloc((void**)&A, bytes));
    Safe_Call(cudaMemcpy(A, input, bytes, cudaMemcpyHostToDevice));

    Safe_Call(cudaMalloc((void**)&C, bytes));
    Safe_Call(cudaMemset(C, 0, bytes));

    std::string event = "transpose";
    env.add_event(event);
    env.event_start_record(event);

    dim3 grid((neuron + TILE_DIM - 1) / TILE_DIM, (batch +  TILE_DIM - 1) / TILE_DIM);
    dim3 block(TILE_DIM, BLOCK_ROWS);

    // The kernel's tile is statically sized, so no dynamic shared memory is
    // requested; the extra allocation only reduced available shared memory.
    matrix_transpose<<<grid, block, 0, env.get_stream(event)>>>(
            C, A, neuron, batch
    );

    env.event_stop_record(event);

    const float elapsed_ms = env.get_event_time(event); 

    Safe_Call(cudaMemcpy(output, C, bytes, cudaMemcpyDeviceToHost));

    std::cout << "Kernel Exec Time [transpose] = " << elapsed_ms <<  "ms" <<std::endl;

    // 4 bytes read + 4 bytes written per element.  Widen before multiplying:
    // neuron * batch evaluated in int overflows for large inputs.
    const long long data = static_cast<long long>(neuron) * batch * 8;
    const double gb = data / 1024.0 / 1024.0 / 1024.0;
    const double db = gb / elapsed_ms * 1000; 
    std::cout << "Kernel Bandwidth [transpose] = " << db <<  "GB/s" <<std::endl;

    CpuTranspose::run_and_cmp(input, neuron, batch, output);

    // Release everything this benchmark allocated.
    Safe_Call(cudaFree(A));
    Safe_Call(cudaFree(C));
    free(input);
    free(output);
}
84 | };


--------------------------------------------------------------------------------
/src/microbenchmark/matrix_transpose_and_delete.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | #define TILE_DIM 64
  9 | #define BLOCK_ROWS 16
 10 | 
 11 | __global__ void matrix_re_transpose_and_delete(
 12 |     float * __restrict__ odata, 
 13 |     float * __restrict__ idata,
 14 |     int * __restrict__ old_to_new_map,
 15 |     int neuron, int batch) {
 16 | 
 17 |     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 18 |     int x = blockIdx.x * TILE_DIM + threadIdx.x;
 19 |     int y = blockIdx.y * TILE_DIM + threadIdx.y;
 20 | 
 21 |     for (int j = 0; j < TILE_DIM && x < batch; j += BLOCK_ROWS) {
 22 |         tile[(threadIdx.y + j)][threadIdx.x] = idata[(y + j) * batch + x];
 23 |     }
 24 | 
 25 |     __syncthreads();
 26 | 
 27 | 
 28 |     x = blockIdx.y * TILE_DIM + threadIdx.x;  // old row
 29 |     y = blockIdx.x * TILE_DIM + threadIdx.y;  // old batch
 30 |     
 31 | 
 32 |     for (int j = 0; j < TILE_DIM && (y+j) < batch; j += BLOCK_ROWS) {
 33 |         if(old_to_new_map[y + j] == -1) continue;
 34 |         int tmp = old_to_new_map[y + j]; // new batch
 35 |         odata[tmp * neuron + x] = tile[threadIdx.x][threadIdx.y + j];
 36 |     }
 37 | };
 38 | 
 39 | void test_benchmark_matrix_transpose_and_delete(int batch, int neuron, GpuEnv &env) {
 40 | 
 41 |     float *A;
 42 |     float *C;
 43 |     int* old_to_new_map_d;
 44 |     
 45 | 	float * input = (float*)malloc(sizeof(float) * neuron * batch);
 46 | 	memset(input, 0, sizeof(float) * neuron * batch);
 47 | 
 48 | 	float * output = (float*)malloc(sizeof(float) * neuron * batch);
 49 | 	memset(output, 0, sizeof(float) * neuron * batch);
 50 | 
 51 | 	srand (static_cast <unsigned> (time(0)));
 52 | 	for(int i = 0; i < batch; ++i) {
 53 | 		for(int j = 0; j < neuron; ++j) {
 54 |             float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 55 | 			input[i * neuron + j] = r2;
 56 | 		}
 57 | 	}
 58 | 
 59 | 
 60 |     int * old_to_new_map = (int*)malloc(sizeof(int) * batch);
 61 |     for(int i = 0; i < 2; ++i) {
 62 |         old_to_new_map[i] = -1;
 63 |     }
 64 |     for(int i = 2; i < batch; ++i) {
 65 |         old_to_new_map[i] = i - 2;
 66 |     }
 67 | 
 68 |     int new_batch = batch - 2;
 69 |     
 70 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * batch));
 71 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice));
 72 | 
 73 |     Safe_Call(cudaMalloc((void**)&old_to_new_map_d, sizeof(int) * batch));
 74 |     Safe_Call(cudaMemcpy(old_to_new_map_d, old_to_new_map, sizeof(int) * batch, cudaMemcpyHostToDevice));
 75 | 
 76 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * batch));
 77 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * batch));
 78 | 
 79 |     std::string event = "transpose_and_delete";
 80 | 	env.add_event(event);
 81 |     env.event_start_record(event);
 82 | 
 83 |     
 84 | 	dim3 grid((batch + TILE_DIM - 1) / TILE_DIM, (neuron +  TILE_DIM - 1) / TILE_DIM);
 85 |     dim3 block(TILE_DIM, BLOCK_ROWS);
 86 | 
 87 | 	matrix_re_transpose_and_delete<<<grid, block, sizeof(float) * (TILE_DIM * TILE_DIM + TILE_DIM), 
 88 |         env.get_stream(event)>>>(
 89 |             C, A, old_to_new_map_d,  neuron, batch
 90 | 	);
 91 | 
 92 |     env.event_stop_record(event);
 93 | 
 94 |     float time = env.get_event_time(event); 
 95 | 
 96 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * new_batch, cudaMemcpyDeviceToHost));
 97 | 
 98 | 
 99 |     std::cout << output[1 * neuron + 0] << ", ";
100 |     std::cout << std::endl;
101 | 	std::cout << "Kernel Exec Time [transpose] = " << time <<  "ms" <<std::endl;
102 | 	CpuTransposeDelete::run_and_cmp(input, old_to_new_map, batch, neuron, new_batch, output);
103 | }
104 | };


--------------------------------------------------------------------------------
/src/microbenchmark/multi_gpu/header.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "../../utils/header.h"
 3 | 
 4 | namespace ftxj {
 5 |     void test_benchmark_multi_gpu_graph_challenge(
 6 |     std::vector<std::vector<float>> &input,
 7 |     std::vector<std::vector<float>> &weight, 
 8 |     std::vector<std::vector<int>> &row_access,
 9 |     int batch, 
10 |     int neuron, 
11 |     float bias,
12 |     int gpu_index,
13 |     int
14 |     );
15 | };


--------------------------------------------------------------------------------
/src/microbenchmark/n16384-l11.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | __device__ inline float __ReLU(float x) {
  9 |    return x<0.0?0.0:x>32.0?32.0:x;
 10 | };
 11 | 
 12 | #define OUT_CHANNEL 16
 13 | // batch parallel
 14 | __global__ void n16384_l11_kernel(
 15 |     float * __restrict__ A, 
 16 |     float * __restrict__ B, 
 17 |     float * __restrict__ C, 
 18 |     int* __restrict__ index, 
 19 |     int batch, 
 20 |     int neuron, 
 21 |     float bias) {
 22 |     
 23 |     extern __shared__ float shared[];
 24 | 
 25 | 
 26 |     for(int n = threadIdx.x; n < OUT_CHANNEL * 32; n += blockDim.x){
 27 |         shared[n] = B[(blockIdx.y * OUT_CHANNEL * 32) + n];
 28 |     }
 29 |     __syncthreads();
 30 | 
 31 |     if((blockIdx.x * blockDim.x + threadIdx.x) >= batch) return;
 32 | 
 33 |     int begin_idx = blockIdx.y * OUT_CHANNEL / 16 * 32;
 34 |     for(int o_r = 0; o_r < OUT_CHANNEL / 16; ++o_r) {
 35 |         float reduce[16] = {0.0};
 36 |         int idx = begin_idx + o_r * 32;
 37 |         for(int r = 0; r < 32; ++r) {
 38 |             int row_idx = index[idx + r];
 39 |             float val = A[row_idx * batch + blockIdx.x * blockDim.x + threadIdx.x];
 40 |             // float val = 1.0;
 41 |             for(int c = 0; c < 16; c += 8) {
 42 |                 // if(o_r == 0 && blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && c == 0) {
 43 |                 //     printf("%f * %f\n", shared[o_r * 32 * 16 + r * 16 + c], val);
 44 |                 // }
 45 |                 reduce[c + 0] += val * shared[o_r * 32 * 16 + r * 16 + c + 0];
 46 |                 reduce[c + 1] += val * shared[o_r * 32 * 16 + r * 16 + c + 1];
 47 |                 reduce[c + 2] += val * shared[o_r * 32 * 16 + r * 16 + c + 2];
 48 |                 reduce[c + 3] += val * shared[o_r * 32 * 16 + r * 16 + c + 3];
 49 |                 
 50 |                 reduce[c + 4] += val * shared[o_r * 32 * 16 + r * 16 + c + 4];
 51 |                 reduce[c + 5] += val * shared[o_r * 32 * 16 + r * 16 + c + 5];
 52 |                 reduce[c + 6] += val * shared[o_r * 32 * 16 + r * 16 + c + 6];
 53 |                 reduce[c + 7] += val * shared[o_r * 32 * 16 + r * 16 + c + 7];
 54 |                 
 55 |             }
 56 |         }
 57 |         for(int c = 0; c < 16; ++c) {
 58 |             C[(blockIdx.y * OUT_CHANNEL  + o_r * 16 + c) * batch + blockIdx.x * blockDim.x + threadIdx.x] = reduce[c];
 59 |         }
 60 |     }
 61 | }
 62 | 
 63 | void test_benchmark_n16384_l11_kernel(
 64 |     COOMatrix& coo, 
 65 |     std::vector<float> &B_val, 
 66 |     std::vector<int> &B_index, 
 67 |     int batch, int neuron, 
 68 |     GpuEnv &env) {
 69 | 
 70 | 	float *A;
 71 |     float *B;
 72 | 	float *C;
 73 |     int* B_index_d;
 74 | 
 75 | 	int mybatch = batch;
 76 | 
 77 | 	int bias = 0;
 78 | 
 79 | 	float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
 80 | 	memset(input, 0, sizeof(float) * neuron * mybatch);
 81 | 
 82 | 	float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
 83 | 	memset(output, 0, sizeof(float) * neuron * mybatch);
 84 | 
 85 | 	srand (static_cast <unsigned> (time(0)));
 86 | 	for(int i = 0; i < mybatch; ++i) {
 87 | 		for(int j = 0; j < neuron; ++j) {
 88 |             float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 89 | 			input[i * neuron + j] = r2;
 90 | 		}
 91 | 	}
 92 | 
 93 | 	float* W  = (float*)malloc(sizeof(float) * B_val.size());
 94 | 	for(int i = 0; i < B_val.size(); ++i) {
 95 | 		W[i] = B_val[i];
 96 | 	}
 97 | 
 98 |     int* W_idx  = (int*)malloc(sizeof(int) * B_index.size());
 99 | 	for(int i = 0; i < B_index.size(); ++i) {
100 | 		W_idx[i] = B_index[i];
101 | 	}
102 | 
103 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch));
104 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
105 | 
106 |     Safe_Call(cudaMalloc((void**)&B, sizeof(float) * B_val.size()));
107 |     Safe_Call(cudaMemcpy(B, W, sizeof(float) * B_val.size(), cudaMemcpyHostToDevice));
108 | 
109 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch));
110 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch));
111 | 
112 |     Safe_Call(cudaMalloc((void**)&B_index_d, sizeof(float) * B_index.size()));
113 |     Safe_Call(cudaMemcpy(B_index_d, W_idx, sizeof(float) * B_index.size(), cudaMemcpyHostToDevice));
114 | 
115 | 	env.add_event("row-succ-20-uiuc-kernel");
116 |     env.event_start_record("row-succ-20-uiuc-kernel");
117 | 
118 | 	int blocksize = 256;
119 | 	dim3 block(blocksize);
120 |     dim3 grid((mybatch + blocksize - 1) / blocksize,  neuron / OUT_CHANNEL);
121 | 
122 | 	n16384_l11_kernel<<<grid, block, sizeof(float) * (OUT_CHANNEL * 32), env.get_stream("row-succ-20-uiuc-kernel")>>>(
123 | 		A, B, C, B_index_d, batch, neuron, bias
124 | 	);
125 | 
126 |     env.event_stop_record("row-succ-20-uiuc-kernel");
127 | 
128 |     float time = env.get_event_time("row-succ-20-uiuc-kernel"); 
129 | 
130 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
131 | 
132 | 	std::cout << "Kernel Exec Time [20-uiuc-row-succ-transpose] = " << time <<  "ms" <<std::endl;
133 | 	CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, false, false);
134 | }
135 | };


--------------------------------------------------------------------------------
/src/microbenchmark/n16384-l2-l10.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | __device__ inline float __ReLU(float x){
  9 |    return x<0.0?0.0:x>32.0?32.0:x;
 10 | };
 11 | 
 12 | #define MINIBATCH 8
 13 | #define UNROLL 8
 14 | 
 15 | __global__ void n16384_l2_l11_kernel(
 16 | 	float * __restrict__ A, 
 17 | 	float * __restrict__ B, 
 18 | 	float * __restrict__ C, 
 19 | 	int stride,
 20 | 	int neuron, 
 21 | 	int batch, 
 22 | 	float bias) {
 23 | 
 24 | 	extern __shared__ float shared[];
 25 | 	int start_idx1 = (blockDim.x / 16) * (blockIdx.y) * 16;
 26 | 	int start_idx2 = (blockDim.x / 16) * (blockIdx.y) * 16 + stride;
 27 | 	int load_num = stride > blockDim.x ? 32 * (blockDim.x / 16) : stride + 16 * (blockDim.x / 16);
 28 | 	int shared_size = ((load_num + 31) / 32) * 32;
 29 | 	int col_gropu = threadIdx.x / 16;
 30 | 	
 31 | 
 32 | 	for(int n = threadIdx.x; n < load_num * MINIBATCH; n += blockDim.x){
 33 | 		int f = n / load_num;
 34 | 		int k = n % load_num;
 35 | 		int a_k = ((stride > blockDim.x) && (k >= blockDim.x)) ? (k - blockDim.x) + start_idx2 : k + start_idx1;
 36 | 		// if(blockIdx.x == 0 && blockIdx.y == 0 && f == 0) {
 37 | 		// 	printf("block 0 load %d\n", a_k);
 38 | 		// }
 39 | 		shared[f * shared_size + k] = A[(blockIdx.x * MINIBATCH + f) * neuron + (a_k) % neuron];
 40 | 	}
 41 | 
 42 | 	__syncthreads();
 43 | 
 44 | 	int gap = stride >= blockDim.x ? blockDim.x : stride;
 45 | 	
 46 | 	float res[MINIBATCH] = {0.0};
 47 | 	
 48 | 	for(int r = 0; r < 32; ++r) {
 49 |         float val = B[(blockIdx.y * blockDim.x * 32) + r * blockDim.x + threadIdx.x];
 50 | 		int idx = col_gropu * 16 + (r >= 16? r + gap - 16 : r);
 51 |         for(int f = 0; f < MINIBATCH / UNROLL; ++f) {
 52 | 			if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && f == 0) {
 53 | 				printf("%d %f * %f\n", idx, shared[(f * UNROLL + 0) * shared_size + idx], val);
 54 | 			}
 55 |             res[0 + f * UNROLL] += shared[(f * UNROLL + 0) * shared_size + idx] * val;
 56 |             res[1 + f * UNROLL] += shared[(f * UNROLL + 1) * shared_size + idx] * val;
 57 |             res[2 + f * UNROLL] += shared[(f * UNROLL + 2) * shared_size + idx] * val;
 58 |             res[3 + f * UNROLL] += shared[(f * UNROLL + 3) * shared_size + idx] * val;
 59 |             res[4 + f * UNROLL] += shared[(f * UNROLL + 4) * shared_size + idx] * val;
 60 |             res[5 + f * UNROLL] += shared[(f * UNROLL + 5) * shared_size + idx] * val;
 61 |             res[6 + f * UNROLL] += shared[(f * UNROLL + 6) * shared_size + idx] * val;
 62 |             res[7 + f * UNROLL] += shared[(f * UNROLL + 7) * shared_size + idx] * val;
 63 |         }
 64 |     }
 65 | 	for(int f = 0; f < MINIBATCH ; ++f) {
 66 | 		C[(blockIdx.x * MINIBATCH + f) * neuron + blockIdx.y * 128 + threadIdx.x] = res[f];
 67 | 	}
 68 | }
 69 | 
 70 | void test_benchmark_n16384_l2_l10_kernel(COOMatrix& coo, std::vector<float> &val, int stride, int batch, int neuron, GpuEnv &env) {
 71 | 	float *A;
 72 |     float *B;
 73 | 	float *C;
 74 | 
 75 | 	int bias = 0;
 76 | 
 77 | 	float * input = (float*)malloc(sizeof(float) * neuron * batch);
 78 | 	memset(input, 0, sizeof(float) * neuron * batch);
 79 | 
 80 | 	float * output = (float*)malloc(sizeof(float) * neuron * batch);
 81 | 	memset(output, 0, sizeof(float) * neuron * batch);
 82 | 
 83 | 	srand (static_cast <unsigned> (time(0)));
 84 | 	for(int i = 0; i < batch; ++i) {
 85 | 		for(int j = 0; j < neuron; ++j) {
 86 |             float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 87 | 			input[i * neuron + j] = r2;
 88 | 		}
 89 | 	}
 90 | 
 91 | 	float* W  = (float*)malloc(sizeof(float) * val.size());
 92 |     for(int i = 0; i < val.size(); ++i) {
 93 | 		W[i] = val[i];
 94 | 	}
 95 | 
 96 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * batch));
 97 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * batch, cudaMemcpyHostToDevice));
 98 | 
 99 |     Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size()));
100 |     Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice));
101 | 
102 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * batch));
103 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * batch));
104 | 
105 | 	std::string event = "test_n16384_l2_l10";
106 | 	env.add_event(event);
107 |     env.event_start_record(event);
108 | 
109 | 	int blocksize = 128;
110 | 	int load_num = stride > blocksize ? 32 * (blocksize / 16) : stride + 16 * (blocksize / 16);
111 | 	int shared_size = ((load_num + 31) / 32) * 32;
112 | 	dim3 block(blocksize);
113 |     dim3 grid((batch + MINIBATCH - 1)/ MINIBATCH, (neuron + 128 - 1) / 128);
114 | 	n16384_l2_l11_kernel<<<grid, block, sizeof(float) * (MINIBATCH * shared_size), env.get_stream(event)>>>(
115 | 		A, B, C, stride, neuron, batch, bias
116 | 	);
117 |     env.event_stop_record(event);
118 |     float time = env.get_event_time(event); 
119 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * batch, cudaMemcpyDeviceToHost));
120 | 	std::cout << "Kernel Exec Time [n16384-l2-l10] = " << time <<  "ms" <<std::endl;
121 | 	std::cout << "Kernel Exec Flops = " << (neuron * batch * 32 * 2.0) / (time / 1000.0) / 1000 / 1000 / 1000 /1000 << "TFLOPS" <<std::endl;
122 | 	CpuSpmm::run_and_cmp(coo, input, neuron, batch, output, false, true, true);
123 | }
124 | };


--------------------------------------------------------------------------------
/src/microbenchmark/random.h:
--------------------------------------------------------------------------------
#pragma once
#include <cstdlib>
#include "../utils/header.h"
 3 | namespace ftxj {
 4 | 
 5 |     class RandomGen {
 6 |     public:
 7 |         static float random_float() {
 8 | 
 9 |         }
10 |         static int random_int() {
11 |             
12 |         }
13 |     };
14 | }
15 | 


--------------------------------------------------------------------------------
/src/microbenchmark/rectangels.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | __device__ inline float __ReLU(float x){
  9 |    return x<0.0?0.0:x>32.0?32.0:x;
 10 | };
 11 | 
 12 | #define MINIBATCH 32
 13 | 
 14 | __global__ void rectangels_batch_parallel_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index16x16, int neuron, int batch, float bias) {
 15 | 
 16 | 	extern __shared__ float shared[];
 17 | 
 18 | 	for(int n = threadIdx.x; n < 128 * 32; n += blockDim.x){
 19 | 		shared[n] = B[(blockIdx.y * 128 * 32) + n];
 20 | 	}
 21 | 	__syncthreads();
 22 | 
 23 | 	int start_idx = index16x16[blockIdx.y];
 24 | 	for(int f = 0; f < 256; ++f) {
 25 | 		for(int i = threadIdx.x; i < 128; i += blockDim.x) {
 26 | 			shared[i + 128 * 32] = 1.0;
 27 | 			// A[(blockIdx.x * 256 + f) * neuron + (start_idx + i) % neuron];
 28 | 		}
 29 | 		__syncthreads();
 30 | 		
 31 | 		float res = 0;
 32 | 		
 33 | 		int idx_beg =  (threadIdx.x / 16) * 16;
 34 | 		
 35 | 		for(int r = 0; r < 32; ++r) {
 36 | 			res += shared[r * 128 + threadIdx.x] * shared[128 * 32 + idx_beg + r];
 37 | 			// if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1 && f == 0) {
 38 | 			// 	printf("%f * %f\n", shared[r * 128 + threadIdx.x], shared[128 * 32 + idx_beg + r]);
 39 | 			// }
 40 | 		}
 41 | 		// if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1 && f == 0) {
 42 | 		// 	printf("RES = %f\n", res);
 43 | 		// }
 44 | 		C[(blockIdx.x * 256 + f) * neuron + blockIdx.y * 128 + threadIdx.x] = res;
 45 | 		__syncthreads();
 46 | 	}
 47 | }
 48 | 
 49 | void test_benchmark_rectangels_batch_parallel_kernel(COOMatrix& coo, std::vector<float> &val, std::vector<int> &row_access, int batch, int neuron, GpuEnv &env) {
 50 | 
 51 | 	float *A;
 52 |     float *B;
 53 | 	float *C;
 54 | 	int *index;
 55 | 
 56 | 	int mybatch = batch;
 57 | 
 58 | 	int bias = 0;
 59 | 
 60 | 	float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
 61 | 	memset(input, 1.0, sizeof(float) * neuron * mybatch);
 62 | 
 63 | 	float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
 64 | 	memset(output, 0, sizeof(float) * neuron * mybatch);
 65 | 
 66 | 	// srand (static_cast <unsigned> (time(0)));
 67 | 	// for(int i = 0; i < mybatch; ++i) {
 68 | 	// 	for(int j = 0; j < neuron; ++j) {
 69 |     //         float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 70 | 	// 		input[i * neuron + j] = r2;
 71 | 	// 	}
 72 | 	// }
 73 | 
 74 | 
 75 | 	float* W  = (float*)malloc(sizeof(float) * val.size());
 76 | 	for(int i = 0; i < val.size(); ++i) {
 77 | 		W[i] = val[i];
 78 | 	}
 79 | 
 80 | 	int* access = (int*)malloc(sizeof(int) * row_access.size());
 81 | 	for(int i = 0; i < row_access.size(); ++i) {
 82 | 		access[i] = row_access[i];
 83 | 	}
 84 | 
 85 | 
 86 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch));
 87 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
 88 | 
 89 |     Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size()));
 90 |     Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice));
 91 | 
 92 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch));
 93 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch));
 94 | 
 95 | 	Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size()));
 96 | 	Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice));
 97 | 
 98 | 	env.add_event("row-succ-20-uiuc-kernel");
 99 |     env.event_start_record("row-succ-20-uiuc-kernel");
100 | 
101 | 	int blocksize = 128;
102 | 	dim3 block(blocksize);
103 |     dim3 grid(mybatch / (256), neuron / blocksize);
104 | 
105 | 	rectangels_batch_parallel_kernel<<<grid, block, sizeof(float) * (32 * 128 + 128 + 16), env.get_stream("row-succ-20-uiuc-kernel")>>>(
106 | 		A, B, C, index, neuron, batch, bias
107 | 	);
108 | 
109 |     env.event_stop_record("row-succ-20-uiuc-kernel");
110 | 
111 |     float time = env.get_event_time("row-succ-20-uiuc-kernel"); 
112 | 
113 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
114 | 
115 | 	std::cout << "Kernel Exec Time [20-uiuc-row-succ] = " << time <<  "ms" <<std::endl;
116 | 	std::cout << "Kernel Exec Flops = " << (neuron * mybatch * 32 * 2.0) / (time / 1000.0) / 1000 / 1000 / 1000 /1000 << "TFLOPS" <<std::endl;
117 | 
118 | 	CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, true, true);
119 | 
120 | 	
121 | }
122 | };


--------------------------------------------------------------------------------
/src/microbenchmark/row-succ-20-uiuc-transpose.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | __device__ inline float __ReLU(float x){
  9 |    return x<0.0?0.0:x>32.0?32.0:x;
 10 | };
 11 | 
 12 | #define MINIBATCH 32
 13 | 
 14 | __global__ void uiuc_transpose_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index, int neuron, int batch, float bias) {
 15 | 
 16 | 	extern __shared__ float shared[];
 17 | 	float reduce[MINIBATCH] = {0.0};
 18 | 
 19 |     int groupIdx = threadIdx.x / 32;
 20 | 	int groupNum = blockDim.x / 32;
 21 | 	int lane = threadIdx.x % 32;
 22 | 
 23 | 	for(int n = threadIdx.x; n < 256 * MINIBATCH; n += blockDim.x){
 24 | 		int idx = index[blockIdx.y * 256 + n / 32];
 25 | 		shared[n] = A[idx * batch + blockIdx.x * MINIBATCH + lane];
 26 | 	}
 27 | 	__syncthreads();
 28 |     
 29 | 	for(int r = 0; r < 32; ++r){
 30 | 		float val = B[blockIdx.y * 256 * 32 + r * 256 + threadIdx.x];
 31 | 		for(int f = 0; f < MINIBATCH; f++) {
 32 |             // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && f == 0) {
 33 | 			// 	printf("%f * %f %d\n", shared[(threadIdx.x / 32 + r) * MINIBATCH + f], val, index[blockIdx.y * 256]);
 34 | 			// }
 35 | 			reduce[f] += shared[(threadIdx.x / 32 + r) * MINIBATCH + f] * val; // bank conflict!!
 36 | 		}
 37 | 	}
 38 | 	
 39 | 	__syncthreads();
 40 |     // if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0) {
 41 | 	// 	printf("res = %f\n", reduce[0]);
 42 | 	// }
 43 | 
 44 | 	for(int f = 0; f < MINIBATCH; ++f){
 45 | 		shared[threadIdx.x * MINIBATCH + f] = reduce[f];
 46 | 	}
 47 | 	
 48 | 	__syncthreads();
 49 | 	
 50 | 	for(int n = threadIdx.x; n < 256 * MINIBATCH; n += blockDim.x){
 51 | 		C[(blockIdx.y * 256 + n / MINIBATCH) * batch + blockIdx.x * MINIBATCH + n % MINIBATCH] = shared[(threadIdx.x / MINIBATCH) * MINIBATCH + (n % MINIBATCH)]; 
 52 | 	}
 53 | }
 54 | 
 55 | void test_benchmark_row_succ_20_uiuc_transpose(COOMatrix& coo, std::vector<float> &val, std::vector<int> &row_access, int batch, int neuron, GpuEnv &env) {
 56 | 
 57 | 	float *A;
 58 |     float *B;
 59 | 	float *C;
 60 | 	int *index;
 61 | 
 62 | 	int mybatch = batch;
 63 | 
 64 | 	int bias = 0;
 65 | 
 66 | 	float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
 67 | 	memset(input, 0, sizeof(float) * neuron * mybatch);
 68 | 
 69 | 	float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
 70 | 	memset(output, 0, sizeof(float) * neuron * mybatch);
 71 | 
 72 | 	srand (static_cast <unsigned> (time(0)));
 73 | 	for(int i = 0; i < mybatch; ++i) {
 74 | 		for(int j = 0; j < neuron; ++j) {
 75 |             float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 76 | 			input[i * neuron + j] = r2;
 77 | 		}
 78 | 	}
 79 | 
 80 | 
 81 | 	float* W  = (float*)malloc(sizeof(float) * val.size());
 82 | 	for(int i = 0; i < val.size(); ++i) {
 83 | 		W[i] = val[i];
 84 | 	}
 85 | 
 86 | 	int* access = (int*)malloc(sizeof(int) * row_access.size());
 87 | 	for(int i = 0; i < row_access.size(); ++i) {
 88 | 		access[i] = row_access[i];
 89 | 	}
 90 | 
 91 | 
 92 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch));
 93 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
 94 | 
 95 |     Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size()));
 96 |     Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice));
 97 | 
 98 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch));
 99 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch));
100 | 
101 | 	Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size()));
102 | 	Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice));
103 | 
104 | 	env.add_event("row-succ-20-uiuc-kernel");
105 |     env.event_start_record("row-succ-20-uiuc-kernel");
106 | 
107 | 	int blocksize = 256;
108 | 	dim3 block(blocksize);
109 |     dim3 grid(mybatch / (MINIBATCH), neuron / blocksize);
110 | 
111 | 	uiuc_transpose_kernel<<<grid, block, sizeof(float) * (MINIBATCH * blocksize), env.get_stream("row-succ-20-uiuc-kernel")>>>(
112 | 		A, B, C, index, neuron, batch, bias
113 | 	);
114 | 
115 |     env.event_stop_record("row-succ-20-uiuc-kernel");
116 | 
117 |     float time = env.get_event_time("row-succ-20-uiuc-kernel"); 
118 | 
119 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
120 | 
121 | 	std::cout << "Kernel Exec Time [20-uiuc-row-succ-transpose] = " << time <<  "ms" <<std::endl;
122 | 	std::cout << "Kernel Exec Flops = " << (neuron * mybatch * 32 * 2.0) / (time / 1000.0) / 1000 / 1000 / 1000 /1000 << "TFLOPS" <<std::endl;
123 | 
124 | 	CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, false, false);
125 | 
126 | 	
127 | }
128 | };


--------------------------------------------------------------------------------
/src/microbenchmark/row-succ-20-uiuc.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | __device__ inline float __ReLU(float x){
  9 |    return x<0.0?0.0:x>32.0?32.0:x;
 10 | };
 11 | 
 12 | #define MINIBATCH 32
 13 | 
 14 | __global__ void uiuc_cut_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index, int neuron, int batch, float bias) {
 15 | 
 16 | 	extern __shared__ float shared[];
 17 | 	float reduce[MINIBATCH] = {0.0};
 18 | 
 19 | 
 20 | 	int idx = index[(blockIdx.y * blockDim.x + threadIdx.x) / 32 + threadIdx.x % 32];
 21 | 
 22 | 	for(unsigned int f = 0; f < MINIBATCH; f++) {
 23 | 		shared[f * blockDim.x + threadIdx.x] = A[(blockIdx.x * MINIBATCH + f) * neuron + idx];
 24 | 	}
 25 | 	__syncthreads();
 26 | 	for(int r = 0; r < 32; ++r){
 27 | 		float val = B[blockIdx.y * blockDim.x * 32 + r * blockDim.x + threadIdx.x];
 28 | 		for(int f = 0; f < MINIBATCH; f++) {
 29 | 			// if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1 && f == 0) {
 30 | 			// 	printf("%f * %f\n", shared[f * blockDim.x + (threadIdx.x / 32+ r)], val);
 31 | 			// }
 32 | 			reduce[f] += shared[f * blockDim.x + (threadIdx.x / 32 + r)] * val;
 33 | 		}
 34 | 	}
 35 | 
 36 | 	// if(blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1) {
 37 | 	// 	printf("res = %f\n", reduce[0]);
 38 | 	// }
 39 | 
 40 | 	int widx1 = (blockIdx.y * blockDim.x) / 2;
 41 | 	int widx2 = (blockIdx.y * blockDim.x) / 2 + 512;
 42 | 	int wgroup = threadIdx.x / 32;
 43 | 
 44 | 	int widx = threadIdx.x % 32 > 16 ? widx2 + wgroup * 16 + threadIdx.x % 32 - 16 : widx1 + wgroup * 16 + threadIdx.x % 32;
 45 | 
 46 | 	for(int f = 0; f < MINIBATCH; f++) {
 47 | 		C[(blockIdx.x * MINIBATCH + f) * neuron + widx] = reduce[f];
 48 | 	}
 49 | }
 50 | 
 51 | void test_benchmark_row_succ_20_uiuc(COOMatrix& coo, std::vector<float> &val, std::vector<int> &row_access, int batch, int neuron, GpuEnv &env) {
 52 | 
 53 | 	float *A;
 54 |     float *B;
 55 | 	float *C;
 56 | 	int *index;
 57 | 
 58 | 	int mybatch = batch;
 59 | 
 60 | 	int bias = 0;
 61 | 
 62 | 	float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
 63 | 	memset(input, 0, sizeof(float) * neuron * mybatch);
 64 | 
 65 | 	float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
 66 | 	memset(output, 0, sizeof(float) * neuron * mybatch);
 67 | 
 68 | 	srand (static_cast <unsigned> (time(0)));
 69 | 	for(int i = 0; i < mybatch; ++i) {
 70 | 		for(int j = 0; j < neuron; ++j) {
 71 |             float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 72 | 			input[i * neuron + j] = r2;
 73 | 		}
 74 | 	}
 75 | 
 76 | 
 77 | 	float* W  = (float*)malloc(sizeof(float) * val.size());
 78 | 	for(int i = 0; i < val.size(); ++i) {
 79 | 		W[i] = val[i];
 80 | 	}
 81 | 
 82 | 	int* access = (int*)malloc(sizeof(int) * row_access.size());
 83 | 	for(int i = 0; i < row_access.size(); ++i) {
 84 | 		access[i] = row_access[i];
 85 | 	}
 86 | 
 87 | 
 88 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch));
 89 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
 90 | 
 91 |     Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size()));
 92 |     Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice));
 93 | 
 94 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch));
 95 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch));
 96 | 
 97 | 	Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size()));
 98 | 	Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice));
 99 | 
100 | 	env.add_event("row-succ-20-uiuc-kernel");
101 |     env.event_start_record("row-succ-20-uiuc-kernel");
102 | 
103 | 	int blocksize = 256;
104 | 	dim3 block(blocksize);
105 |     dim3 grid(mybatch / (MINIBATCH), neuron / blocksize);
106 | 
107 | 	uiuc_cut_kernel<<<grid, block, sizeof(float) * (MINIBATCH * blocksize), env.get_stream("row-succ-20-uiuc-kernel")>>>(
108 | 		A, B, C, index, neuron, batch, bias
109 | 	);
110 | 
111 |     env.event_stop_record("row-succ-20-uiuc-kernel");
112 | 
113 |     float time = env.get_event_time("row-succ-20-uiuc-kernel"); 
114 | 
115 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
116 | 
117 | 	std::cout << "Kernel Exec Time [20-uiuc-row-succ] = " << time <<  "ms" <<std::endl;
118 | 	std::cout << "Kernel Exec Flops = " << (neuron * mybatch * 32 * 2.0) / (time / 1000.0) / 1000 / 1000 / 1000 /1000 << "TFLOPS" <<std::endl;
119 | 
120 | 	CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, true);
121 | 
122 | 	
123 | }
124 | };


--------------------------------------------------------------------------------
/src/microbenchmark/row-succ-transpose-batch-parallel.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include "../gpu_lib/header.h"
  3 | #include "../utils/header.h"
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | namespace ftxj {
  7 | 
  8 | __device__ inline float __ReLU(float x) {
  9 |    return x<0.0?0.0:x>32.0?32.0:x;
 10 | };
 11 | 
 12 | __global__ void batch_parallel_16384x32succ_kernel(float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int* __restrict__ index, int neuron, int batch, float bias) {
 13 | 	extern __shared__ float shared[];
 14 | 	float reduce[32] = {0.0};
 15 | 
 16 | 	for(int n = threadIdx.x; n < 32 * 32; n += blockDim.x){
 17 | 		shared[n] = B[(blockIdx.y * 32 * 32) + n];
 18 | 	}
 19 | 	__syncthreads();
 20 | 	if((blockIdx.x * blockDim.x + threadIdx.x) >= batch) return;
 21 | 	
 22 | 	for(int r = 0; r < 32; ++r) {
 23 | 		int row_idx = index[blockIdx.y * 32 + r];
 24 | 		float val = A[row_idx * batch + blockIdx.x * blockDim.x + threadIdx.x];
 25 | 		for(int c = 0; c < 32; ++c){
 26 | 			reduce[c] += shared[r * 32 + c] * val;
 27 | 		}
 28 | 	}
 29 | 	__syncthreads();
 30 | 	for(int c = 0; c < 16; ++c) {
 31 | 		C[(blockIdx.y * 16  + c) * batch + blockIdx.x * blockDim.x + threadIdx.x] = reduce[c];
 32 | 	}
 33 | 	for(int c = 16; c < 32; ++c) {
 34 | 		C[(neuron / 2 + blockIdx.y * 16  + c - 16) * batch + blockIdx.x * blockDim.x + threadIdx.x] = reduce[c];
 35 | 	}
 36 | }
 37 | 
 38 | void test_benchmark_row_succ_input_transpose_batch_parallel(COOMatrix& coo, std::vector<float> &val, std::vector<int> &row_access, int batch, int neuron, GpuEnv &env) {
 39 | 
 40 | 	float *A;
 41 |     float *B;
 42 | 	float *C;
 43 | 	int *index;
 44 | 
 45 | 	int mybatch = batch;
 46 | 
 47 | 	int bias = 0;
 48 | 
 49 | 	float * input = (float*)malloc(sizeof(float) * neuron * mybatch);
 50 | 	memset(input, 0, sizeof(float) * neuron * mybatch);
 51 | 
 52 | 	float * output = (float*)malloc(sizeof(float) * neuron * mybatch);
 53 | 	memset(output, 0, sizeof(float) * neuron * mybatch);
 54 | 
 55 | 	srand (static_cast <unsigned> (time(0)));
 56 | 	for(int i = 0; i < mybatch; ++i) {
 57 | 		for(int j = 0; j < neuron; ++j) {
 58 |             float r2 = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX/32.0));
 59 | 			input[i * neuron + j] = r2;
 60 | 		}
 61 | 	}
 62 | 
 63 | 
 64 | 	float* W  = (float*)malloc(sizeof(float) * val.size());
 65 | 	for(int i = 0; i < val.size(); ++i) {
 66 | 		W[i] = val[i];
 67 | 	}
 68 | 
 69 | 	int* access = (int*)malloc(sizeof(int) * row_access.size());
 70 | 	for(int i = 0; i < row_access.size(); ++i) {
 71 | 		access[i] = row_access[i];
 72 | 	}
 73 | 
 74 | 
 75 |     Safe_Call(cudaMalloc((void**)&A, sizeof(float) * neuron * mybatch));
 76 |     Safe_Call(cudaMemcpy(A, input, sizeof(float) * neuron * mybatch, cudaMemcpyHostToDevice));
 77 | 
 78 |     Safe_Call(cudaMalloc((void**)&B, sizeof(float) * val.size()));
 79 |     Safe_Call(cudaMemcpy(B, W, sizeof(float) * val.size(), cudaMemcpyHostToDevice));
 80 | 
 81 | 	Safe_Call(cudaMalloc((void**)&C, sizeof(float) * neuron * mybatch));
 82 |     Safe_Call(cudaMemset(C, 0, sizeof(float) * neuron * mybatch));
 83 | 
 84 | 	Safe_Call(cudaMalloc((void**)&index, sizeof(int) * row_access.size()));
 85 | 	Safe_Call(cudaMemcpy(index, access, sizeof(int) * row_access.size(), cudaMemcpyHostToDevice));
 86 | 
 87 | 	env.add_event("row-succ-20-uiuc-kernel");
 88 |     env.event_start_record("row-succ-20-uiuc-kernel");
 89 | 
 90 | 	int blocksize = 256;
 91 | 	dim3 block(blocksize);
 92 |     dim3 grid((mybatch + blocksize - 1) / blocksize,  neuron / 32);
 93 | 
 94 | 	batch_parallel_16384x32succ_kernel<<<grid, block, sizeof(float) * (32 * 32), env.get_stream("row-succ-20-uiuc-kernel")>>>(
 95 | 		A, B, C, index, neuron, batch, bias
 96 | 	);
 97 | 
 98 |     env.event_stop_record("row-succ-20-uiuc-kernel");
 99 | 
100 |     float time = env.get_event_time("row-succ-20-uiuc-kernel"); 
101 | 
102 | 	Safe_Call(cudaMemcpy(output, C, sizeof(float) * neuron * mybatch, cudaMemcpyDeviceToHost));
103 | 
104 | 	std::cout << "Kernel Exec Time [20-uiuc-row-succ-transpose] = " << time <<  "ms" <<std::endl;
105 | 	std::cout << "Kernel Exec Flops = " << (neuron * mybatch * 32 * 2.0) / (time / 1000.0) / 1000 / 1000 / 1000 /1000 << "TFLOPS" <<std::endl;
106 | 
107 | 	CpuSpmm::run_and_cmp(coo, input, neuron, mybatch, output, false, false, false);
108 | 
109 | 	
110 | }
111 | };


--------------------------------------------------------------------------------
/src/multi_gpu.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils/header.h"
  2 | #include "reorder/header.h"
  3 | #include "inspector/header.h"
  4 | // #include "gpu_lib/header.h"
  5 | #include "microbenchmark/multi_gpu/header.h"
  6 | #include "fuse/header.h"
  7 | #include <functional>
  8 | #include <cstdlib>
  9 | #include <iostream>
 10 | #include <fstream>
 11 | #include <mpi.h>
 12 | using namespace ftxj;
 13 | 
 14 | 
 15 | std::string get_weight_file_name(int neuron, int layer) {
 16 |     std::string weight_file_dir = "../data/neuron";
 17 |     std::string neuron_str = std::to_string(neuron);
 18 |     weight_file_dir += neuron_str + "/n" + neuron_str + "-l" + std::to_string(layer + 1) + ".tsv";
 19 |     return weight_file_dir;
 20 | }
 21 | 
 22 | void dense_reorder(std::vector<std::vector<float>> &input, Reorder &reorder_class) {
 23 |     // std::vector<std::vector<float>> old = input;
 24 |     for(int i = 0; i < input.size(); ++i) {
 25 |         std::vector<float> tmp(input[i].size());
 26 |         for(int j = 0; j < input[i].size(); ++j) {
 27 |             auto new_j = reorder_class.reorder(j);
 28 |             tmp[new_j] = input[i][j];
 29 |         }
 30 |         input[i] = tmp;
 31 |     }
 32 | }
 33 | 
 34 | void read_input(std::vector<std::vector<float>> &input, int neuron, int batch) {
 35 |     std::string input_file_name = "../data/sparse-images-";
 36 |     input_file_name += std::to_string(neuron) + ".tsv";
 37 |     std::ifstream input_file(input_file_name);
 38 |     if(!input_file){
 39 |         std::cout << "FILE:" << input_file_name << " does not exists.\n";
 40 |         exit(-1);
 41 |     }
 42 |     int b, n;
 43 |     float val;
 44 |     long read_num = 0;
 45 |     while(input_file >> b >> n >> val) {
 46 |         if(b <= batch) {
 47 |             read_num++;
 48 |             input[b - 1][n - 1] = val;
 49 |             if(val != 1.00) {
 50 |                 printf("read input %d, %f\n", b, val);
 51 |             }
 52 |         }
 53 |     }
 54 |     std::cout << "Read Input success! read_numeber = " << read_num << std::endl;
 55 | }
 56 | 
 57 | int main(int argc, char* argv[]) {
 58 |     
 59 |     char hostname[MPI_MAX_PROCESSOR_NAME];
 60 |     int task_count;
 61 |     int rank;
 62 |     int len;
 63 |     int ret;
 64 | 
 65 | 
 66 |     int neuron = atoi(argv[1]);
 67 |     int batch = atoi(argv[2]);
 68 |     int layer = atoi(argv[3]);
 69 |     // int nnzs = atoi(argv[4]);
 70 | 
 71 |     ret = MPI_Init(&argc, &argv);
 72 |     if (MPI_SUCCESS != ret) {
 73 |         printf("start mpi fail\n");
 74 |         MPI_Abort(MPI_COMM_WORLD, ret);
 75 |     }
 76 | 
 77 |     MPI_Comm_size(MPI_COMM_WORLD, &task_count);
 78 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 79 |     MPI_Get_processor_name(hostname, &len);
 80 | 
 81 |     if(rank == 0)
 82 |         printf("task_count = %d, my rank = %d on %s\n", task_count, rank, hostname);
 83 | 
 84 | 
 85 |     // int neuron = 16384;
 86 |     // int batch = 60000;
 87 |     // int layer = 1920;
 88 | 
 89 |     std::map<int, int> hash_map = {
 90 |         {65536, 4096},
 91 |         {16384, 1024},
 92 |         {4096, 256},
 93 |         {1024, 64}
 94 |     };
 95 | 
 96 |     std::map<int, float> bias_map = {
 97 |         {65536, -0.45},
 98 |         {16384, -0.4},
 99 |         {4096, -0.35},
100 |         {1024, -0.3}
101 |     };
102 | 
103 |     std::map<int, float> type_1 = {
104 |         {65536, 12},
105 |         {16384, 10},
106 |         {4096, 8},
107 |         {1024, 6}
108 |     };
109 | 
110 |     std::vector<std::vector<float>> input(batch, std::vector<float>(neuron));
111 |     std::vector<std::vector<float>> weight; 
112 |     std::vector<std::vector<int>> row_access; 
113 | 
114 | 
115 |     std::cout << "GPU[" << rank << "] " << "[BEGIN]..." << std::endl;
116 |     read_input(input, neuron, batch);
117 |     std::cout << "GPU[" << rank << "] " << "Read Input success!" << std::endl;
118 |     HashReorder hash_reorder_t(hash_map[neuron], neuron);
119 |     dense_reorder(input, hash_reorder_t);
120 | 
121 |     for(int l = 0; l < layer; ++l) {
122 |         auto weight_file = get_weight_file_name(neuron, l);
123 |         COOMatrix coo(weight_file, 1, false);
124 |         std::cout << "GPU[" << rank << "] " << "["<< weight_file << "] to COO success!" << std::endl;
125 |         coo.reorder(hash_reorder_t);
126 |         std::cout << "GPU[" << rank << "] " << "Reorder success!" << std::endl;
127 |         CSRCSCMatrix csr_csc(coo);
128 |         csr_csc.transpose();
129 |         BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method);
130 |         std::cout << "GPU[" << rank << "] " << "Structural Info success!" << std::endl;
131 |         MaxInReuseBSchedule schedule(blocks);
132 |         if(l == 0) {
133 |             schedule.schedule(16, 7);
134 |         }
135 |         else if(l < type_1[neuron]) {
136 |             schedule.schedule_output_parallel(128, 1, false);
137 |         }        
138 |         else {
139 |             schedule.schedule(128, 1);
140 |         }
141 |         std::cout << "GPU[" << rank << "] " << "Schedule succ" << std::endl;
142 |         auto data = schedule.get_data(neuron);
143 |         weight.push_back(data.value);
144 |         row_access.push_back(data.row_access);
145 |     }
146 |     int gpu_id = 0;
147 |     if(rank == 0) gpu_id = 0;
148 |     if(rank == 1) gpu_id = 1;
149 |     if(rank == 2) gpu_id = 2;
150 |     if(rank == 3) gpu_id = 3;
151 |     test_benchmark_multi_gpu_graph_challenge(input, weight, row_access, (batch + 1) / 2, neuron, bias_map[neuron], gpu_id, rank);
152 |     std::cout << "GPU[" << rank << "] " <<"[END]..." << std::endl;
153 |     MPI_Barrier(MPI_COMM_WORLD);
154 |     MPI_Finalize();
155 |     return 0;
156 | }


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/add_singlegpu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/multi_gpu/add_mpi/add_singlegpu


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/add_singlegpu.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | __global__ void add_kernel(int n, float a, float *x, float *y)
 4 | {
 5 |   int i = blockIdx.x*blockDim.x + threadIdx.x;
 6 |   if (i < n) y[i] = a*x[i] + y[i];
 7 | }
 8 | 
 9 | int main()
10 | {
11 |   
12 |   //printf("gpu count : %d\n",N_GPU);
13 |  
14 |    //Arrange the task of each GPU
15 |   int N = 1<<30;
16 | 
17 |   cudaSetDevice(0);
18 | 
19 |   float *x, *y, *d_x, *d_y;
20 |   x = (float*)malloc(N*sizeof(float));
21 |   y = (float*)malloc(N*sizeof(float));
22 | 
23 |   cudaMalloc(&d_x, N*sizeof(float)); 
24 |   cudaMalloc(&d_y, N*sizeof(float));
25 | 
26 |   for (int i = 0; i < N; i++) {
27 |     x[i] = 1.0f;
28 |     y[i] = 2.0f;
29 |   }
30 | 
31 |   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
32 |   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
33 | 
34 |   float time_elapsed=0;
35 |   cudaEvent_t start,stop;
36 |   cudaEventCreate(&start);    //创建Event
37 |   cudaEventCreate(&stop);
38 |   cudaEventRecord( start,0);    //记录当前时间
39 | 
40 |   // Perform SAXPY on 1M elements
41 |   add_kernel<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
42 | 
43 |   cudaEventRecord(stop,0);    //记录当前时间
44 |   cudaEventSynchronize(start);    //Waits for an event to complete.
45 |   cudaEventSynchronize(stop);    //Waits for an event to complete.Record之前的任务
46 |   cudaEventElapsedTime(&time_elapsed,start,stop);    //计算时间差
47 |   
48 |   
49 | 
50 | 
51 | 
52 |   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
53 | 
54 |   float maxError = 0.0f;
55 |   for (int i = 0; i < N; i++)
56 |     maxError = max(maxError, abs(y[i]-4.0f));
57 |   printf("Max error: %f\n", maxError);
58 |   cudaEventDestroy(start);    //destory the event
59 |   cudaEventDestroy(stop);
60 |   printf("执行时间：%f(ms)\n",time_elapsed);
61 | }


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/makefile:
--------------------------------------------------------------------------------
# Links the MPI driver (mpi_call.cpp) with the CUDA SAXPY object (saxpy.cu)
# into the executable `test`.
# `clean` is a command, not a file: keep it working even if a file named
# `clean` exists in this directory.
.PHONY: clean

test:saxpy.o mpi_call.o
	mpicxx mpi_call.o saxpy.o -L/usr/local/cuda/lib64 -lcudart -o test
saxpy.o:saxpy.cu
	nvcc -c saxpy.cu -o saxpy.o
mpi_call.o:mpi_call.cpp
	mpicxx -c mpi_call.cpp -o mpi_call.o
clean:
	rm  -f  *.o


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/mpi_call.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <mpi.h>
 3 | #include <time.h>
 4 | #include "vars.h"
 5 | 
 6 | int main(int argc, char *argv[]) {
 7 |     char hostname[MPI_MAX_PROCESSOR_NAME];
 8 |     int task_count;
 9 |     int rank;
10 |     int len;
11 |     int ret;
12 | 
13 |     ret = MPI_Init(&argc, &argv);
14 |     if (MPI_SUCCESS != ret) {
15 |         printf("start mpi fail\n");
16 |         MPI_Abort(MPI_COMM_WORLD, ret);
17 |     }
18 | 
19 |     MPI_Comm_size(MPI_COMM_WORLD, &task_count);
20 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
21 |     MPI_Get_processor_name(hostname, &len);
22 | 
23 |     printf("task_count = %d, my rank = %d on %s\n", task_count, rank, hostname);
24 |     
25 |     float esp_time_cpu;
26 | 	clock_t start_cpu, stop_cpu;
27 | 
28 |     start_cpu = clock();// start timing
29 | 
30 |     handle(rank);//在此调用用cuda写的函数
31 |     stop_cpu = clock();// end timing
32 | 
33 | 	esp_time_cpu = (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC;
34 | 
35 | 	printf("The time by host:\t%f(ms)\n", esp_time_cpu);
36 | 
37 |     MPI_Finalize();
38 | 
39 |     return 0;
40 | }


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/mpi_call.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/multi_gpu/add_mpi/mpi_call.o


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/run_volta.sh:
--------------------------------------------------------------------------------
1 | mpirun -np 4 ./test


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/saxpy.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | __global__ void add_kernel(int n, float a, float *x, float *y)
 4 | {
 5 |   int i = blockIdx.x*blockDim.x + threadIdx.x;
 6 |   if (i < n) y[i] = a*x[i] + y[i];
 7 | }
 8 | 
 9 | void handle(int gpu_number)
10 | {
11 |   int N_GPU;
12 |   cudaGetDeviceCount(&N_GPU);
13 |   //printf("gpu count : %d\n",N_GPU);
14 |  
15 |    //Arrange the task of each GPU
16 |   int N = ((1<<30)+N_GPU - 1)/N_GPU;
17 | 
18 |   cudaSetDevice(gpu_number);
19 | 
20 |   float *x, *y, *d_x, *d_y;
21 |   x = (float*)malloc(N*sizeof(float));
22 |   y = (float*)malloc(N*sizeof(float));
23 | 
24 |   cudaMalloc(&d_x, N*sizeof(float)); 
25 |   cudaMalloc(&d_y, N*sizeof(float));
26 | 
27 |   for (int i = 0; i < N; i++) {
28 |     x[i] = 1.0f;
29 |     y[i] = 2.0f;
30 |   }
31 | 
32 |   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
33 |   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
34 | 
35 |   float time_elapsed=0;
36 |   cudaEvent_t start,stop;
37 |   cudaEventCreate(&start);    //创建Event
38 |   cudaEventCreate(&stop);
39 |   cudaEventRecord( start,0);    //记录当前时间
40 | 
41 |   // Perform SAXPY on 1M elements
42 |   add_kernel<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
43 | 
44 |   cudaEventRecord(stop,0);    //记录当前时间
45 |   cudaEventSynchronize(start);    //Waits for an event to complete.
46 |   cudaEventSynchronize(stop);    //Waits for an event to complete.Record之前的任务
47 |   cudaEventElapsedTime(&time_elapsed,start,stop);    //计算时间差
48 |   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
49 | 
50 |   float maxError = 0.0f;
51 |   for (int i = 0; i < N; i++)
52 |     maxError = max(maxError, abs(y[i]-4.0f));
53 |   printf("Max error: %f\n", maxError);
54 |   cudaEventDestroy(start);    //destory the event
55 |   cudaEventDestroy(stop);
56 |   printf("card%d 执行时间：%f(ms)\n",gpu_number,time_elapsed);
57 | }


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/saxpy.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/multi_gpu/add_mpi/saxpy.o


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/src/multi_gpu/add_mpi/test


--------------------------------------------------------------------------------
/src/multi_gpu/add_mpi/vars.h:
--------------------------------------------------------------------------------
// Declaration of the CUDA-side entry point defined in saxpy.cu; mpi_call.cpp
// calls it with the MPI rank as the argument.
void handle(int);


--------------------------------------------------------------------------------
/src/multi_gpu/add_omp.cu:
--------------------------------------------------------------------------------
  1 | #include <omp.h>
  2 | #include <stdio.h>      // stdio functions are used since C++ streams aren't necessarily thread safe
  3 |  
  4 |  
  5 | // a simple kernel that simply increments each array element by b
  6 | __global__ void kernelAddConstant(int *g_a, const int b)
  7 | {
  8 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
  9 |     g_a[idx] += b;
 10 | }
 11 |  
 12 | // a predicate that checks whether each array elemen is set to its index plus b
 13 | int correctResult(int *data, const int n, const int b)
 14 | {
 15 |         for(int i = 0; i < n; i++)
 16 |                 if(data[i] != i + b)
 17 |                         return 0;
 18 |         return 1;
 19 | }
 20 |  
 21 | int main(int argc, char *argv[])
 22 | {
 23 |         int num_gpus = 0;       // number of CUDA GPUs
 24 |  
 25 |         /////////////////////////////////////////////////////////////////
 26 |         // determine the number of CUDA capable GPUs
 27 |         //
 28 |     cudaGetDeviceCount(&num_gpus);
 29 |         if(num_gpus < 1)
 30 |         {
 31 |                 printf("no CUDA capable devices were detected\n");
 32 |                 return 1;
 33 |         }
 34 |  
 35 |         /////////////////////////////////////////////////////////////////
 36 |         // display CPU and GPU configuration
 37 |         //
 38 |     printf("number of host CPUs:\t%d\n", omp_get_num_procs());
 39 |     printf("number of CUDA devices:\t%d\n", num_gpus);
 40 |     for(int i = 0; i < num_gpus; i++)
 41 |     {
 42 |         cudaDeviceProp dprop;
 43 |         cudaGetDeviceProperties(&dprop, i);
 44 |                 printf("   %d: %s\n", i, dprop.name);
 45 |     }
 46 |         printf("---------------------------\n");
 47 |  
 48 |  
 49 |     /////////////////////////////////////////////////////////////////
 50 |     // initialize data
 51 |         //
 52 |     unsigned int n = num_gpus * 8192;
 53 |     unsigned int nbytes = n * sizeof(int);
 54 |         int *a = 0;             // pointer to data on the CPU
 55 |         int b = 3;              // value by which the array is incremented
 56 |         a = (int*)malloc(nbytes);
 57 |         if(0 == a)
 58 |         {
 59 |                 printf("couldn't allocate CPU memory\n");
 60 |                 return 1;
 61 |         }
 62 |         for(unsigned int i = 0; i < n; i++)
 63 |         a[i] = i;
 64 |      
 65 |  
 66 |     ////////////////////////////////////////////////////////////////
 67 |         // run as many CPU threads as there are CUDA devices
 68 |         //   each CPU thread controls a different device, processing its
 69 |         //   portion of the data.  It's possible to use more CPU threads
 70 |         //   than there are CUDA devices, in which case several CPU
 71 |         //   threads will be allocating resources and launching kernels
 72 |         //   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
 73 |         //   Recall that all variables declared inside an "omp parallel" scope are
 74 |         //   local to each CPU thread
 75 |         //
 76 |         omp_set_num_threads(num_gpus);  // create as many CPU threads as there are CUDA devices
 77 |     //omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there are CUDA devices
 78 | #pragma omp parallel
 79 |     {
 80 |         unsigned int cpu_thread_id = omp_get_thread_num();
 81 |                 unsigned int num_cpu_threads = omp_get_num_threads();
 82 |  
 83 |                 // set and check the CUDA device for this CPU thread
 84 |                 int gpu_id = -1;
 85 |                 cudaSetDevice(cpu_thread_id % num_gpus);        // "% num_gpus" allows more CPU threads than GPU devices
 86 |                 cudaGetDevice(&gpu_id);
 87 |  
 88 |                 printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
 89 |  
 90 |                 int *d_a = 0;   // pointer to memory on the device associated with this CPU thread
 91 |                 int *sub_a = a + cpu_thread_id * n / num_cpu_threads;   // pointer to this CPU thread's portion of data
 92 |                 unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
 93 |                 dim3 gpu_threads(128);  // 128 threads per block
 94 |                 dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
 95 |  
 96 |           cudaMalloc((void**)&d_a, nbytes_per_kernel);
 97 |           cudaMemset(d_a, 0, nbytes_per_kernel);
 98 |           cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice);
 99 |         kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
100 |  
101 |           cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost);
102 |           cudaFree(d_a);
103 |  
104 |  
105 |     }
106 |         printf("---------------------------\n");
107 |  
108 |         if(cudaSuccess != cudaGetLastError())
109 |                 printf("%s\n", cudaGetErrorString(cudaGetLastError()));
110 |  
111 |  
112 |         ////////////////////////////////////////////////////////////////
113 |         // check the result
114 |         //
115 |     if(correctResult(a, n, b))
116 |         printf("Test PASSED\n");
117 |     else
118 |         printf("Test FAILED\n");
119 |  
120 |     free(a);    // free CPU memory
121 |  
122 |     //cudaThreadExit();
123 |  
124 |     return 0;
125 | }


--------------------------------------------------------------------------------
/src/multi_gpu/add_stream.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <stdlib.h>
  3 | #include <math.h>
  4 | #include <cuda_runtime.h>
  5 | // This application demonstrates how to use CUDA API to use mutiple GPUs
  6 | // Function to add the elements of two arrays
  7 |  
  8 | //Mutiple-GPU Plan Structure
  9 | typedef struct
 10 | {
 11 |     //Host-side input data
 12 |     float *h_x, *h_y;
 13 | 	  
 14 |     //Result copied back from GPU
 15 | 	  float *h_yp;
 16 |     //Device buffers
 17 |     float *d_x, *d_y;
 18 |  
 19 |     //Stream for asynchronous command execution
 20 |     cudaStream_t stream;
 21 |  
 22 | } TGPUplan;
 23 | 
 24 | 
 25 |  
 26 | // CUDA Kernel function to add the elements of two arrays on the GPU
 27 | __global__ void add(int n, float *x, float *y)
 28 | {
 29 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
 30 |   int stride = blockDim.x * gridDim.x;
 31 |   for (int i = index; i < n; i += stride)
 32 |       y[i] = x[i] + y[i];
 33 | }
 34 |  
 35 | int main(void)
 36 | {
 37 |   int N = 1<<20; // 1M elements
 38 |   
 39 |   //Get the numble of CUDA-capble GPU
 40 |   int N_GPU;
 41 |   cudaGetDeviceCount(&N_GPU);
 42 |   printf("gpu count : %d\n",N_GPU);
 43 |  
 44 |   //Arrange the task of each GPU
 45 |   int Np = (N + N_GPU - 1) / N_GPU;
 46 |   
 47 |   //Create GPU plans
 48 |   TGPUplan plan[N_GPU];
 49 |  
 50 |   //Initializing 
 51 |   for(int i = 0; i < N_GPU; i++)
 52 |   {
 53 |     cudaSetDevice(i);
 54 |     cudaStreamCreate(&plan[i].stream);
 55 |  
 56 |     cudaMalloc((void **)&plan[i].d_x, Np * sizeof(float));
 57 |     cudaMalloc((void **)&plan[i].d_y, Np * sizeof(float));
 58 |     plan[i].h_x = (float *)malloc(Np * sizeof(float));
 59 |     plan[i].h_y = (float *)malloc(Np * sizeof(float));
 60 |     plan[i].h_yp = (float *)malloc(Np * sizeof(float));
 61 |  
 62 | 	  for(int j = 0; j < Np; j++)
 63 |     {
 64 |       plan[i].h_x[j] = 1.0f;
 65 |       plan[i].h_y[j] = 2.0f;
 66 |     }
 67 |   }
 68 |  
 69 |   int blockSize = 256;
 70 |   int numBlock = (Np + blockSize - 1) / blockSize;
 71 |  
 72 | 
 73 |     // double iStart,iElaps;
 74 |     // iStart=cpuSecond();
 75 | 
 76 |   clock_t start, finish;
 77 |   start = clock();
 78 | 
 79 |   for(int i = 0; i < N_GPU; i++)
 80 |   {
 81 |     //Set device
 82 |     cudaSetDevice(i);
 83 |  
 84 |     //Copy input data from CPU
 85 |     cudaMemcpyAsync(plan[i].d_x, plan[i].h_x, Np * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
 86 |     cudaMemcpyAsync(plan[i].d_y, plan[i].h_y, Np * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream);
 87 |     //Run the kernel function on GPU
 88 |     add<<<numBlock, blockSize, 0, plan[i].stream>>>(Np, plan[i].d_x, plan[i].d_y);
 89 |     
 90 |     //Read back GPU results
 91 |     cudaMemcpyAsync(plan[i].h_yp, plan[i].d_y, Np * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream);
 92 |   }
 93 |   finish = clock();
 94 |   float duration = (double)(finish - start) / CLOCKS_PER_SEC;  
 95 |      printf("GPU Kernel time: %f\n",duration);
 96 |     // cudaDeviceSynchronize();
 97 |     // iElaps=cpuSecond()-iStart;
 98 |     // printf("GPU Kernel time: %f\n",iElaps);
 99 |   //Process GPU results
100 |   float y[N];
101 |   for(int i = 0; i < N_GPU; i++)
102 |   {
103 |     //Set device
104 |     cudaSetDevice(i);
105 |  
106 |     //Wait for all operations to finish
107 |     cudaStreamSynchronize(plan[i].stream);
108 |  
109 |     //Get the final results
110 | 	  for(int j = 0; j < Np; j++)
111 | 		  if(Np * i + j < N)
112 | 			   y[Np * i + j]=plan[i].h_yp[j];
113 | 	  
114 |     //shut down this GPU
115 |     cudaFree(plan[i].d_x);
116 |     cudaFree(plan[i].d_y);
117 |     free(plan[i].h_x);
118 |     free(plan[i].h_y);
119 |   	cudaStreamDestroy(plan[i].stream); //Destroy the stream
120 |   }
121 |  
122 |   // Check for errors (all values should be 3.0f)
123 |   float maxError = 0.0f;
124 |   for (int i = 0; i < N; i++)
125 |     maxError = fmax(maxError, fabs(y[i]-3.0f));
126 |   std::cout << "Max error: " << maxError << std::endl;
127 |  
128 |   return 0;
129 | 
130 | }


--------------------------------------------------------------------------------
/src/network.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils/header.h"
  2 | #include "reorder/header.h"
  3 | #include "inspector/header.h"
  4 | #include "gpu_lib/header.h"
  5 | #include "microbenchmark/header.h"
  6 | #include "fuse/header.h"
  7 | #include <functional>
  8 | #include <cstdlib>
  9 | #include <iostream>
 10 | #include <fstream>
 11 | using namespace ftxj;
 12 | 
 13 | 
 14 | std::string get_weight_file_name(int neuron, int layer) {
 15 |     std::string weight_file_dir = "../data/neuron";
 16 |     std::string neuron_str = std::to_string(neuron);
 17 |     weight_file_dir += neuron_str + "/n" + neuron_str + "-l" + std::to_string(layer + 1) + ".tsv";
 18 |     return weight_file_dir;
 19 | }
 20 | 
 21 | void dense_reorder(std::vector<std::vector<float>> &input, Reorder &reorder_class) {
 22 |     // std::vector<std::vector<float>> old = input;
 23 |     for(int i = 0; i < input.size(); ++i) {
 24 |         std::vector<float> tmp(input[i].size());
 25 |         for(int j = 0; j < input[i].size(); ++j) {
 26 |             auto new_j = reorder_class.reorder(j);
 27 |             tmp[new_j] = input[i][j];
 28 |         }
 29 |         input[i] = tmp;
 30 |     }
 31 | }
 32 | 
 33 | void read_input(std::vector<std::vector<float>> &input, int neuron, int batch) {
 34 |     std::string input_file_name = "../data/sparse-images-";
 35 |     input_file_name += std::to_string(neuron) + ".tsv";
 36 |     std::ifstream input_file(input_file_name);
 37 |     if(!input_file){
 38 |         std::cout << "FILE:" << input_file_name << " does not exists.\n";
 39 |         exit(-1);
 40 |     }
 41 |     int b, n;
 42 |     float val;
 43 |     long read_num = 0;
 44 |     while(input_file >> b >> n >> val) {
 45 |         if(b <= batch) {
 46 |             read_num++;
 47 |             input[b - 1][n - 1] = val;
 48 |             if(val != 1.00) {
 49 |                 printf("read input %d, %f\n", b, val);
 50 |             }
 51 |         }
 52 |     }
 53 |     std::cout << "Read Input success! read_numeber = " << read_num << std::endl;
 54 | }
 55 | 
 56 | int main(int argc, char* argv[]) {
 57 | 
 58 |     if(argc != 4) {
 59 |         std::cout << "Usage: exe neuron batch layer" << std::endl;
 60 |         return 0;
 61 |     }
 62 |     int neuron = atoi(argv[1]);
 63 |     int batch = atoi(argv[2]);
 64 |     int layer = atoi(argv[3]);
 65 | 
 66 |     std::map<int, int> hash_map = {
 67 |         {65536, 4096},
 68 |         {16384, 1024},
 69 |         {4096, 256},
 70 |         {1024, 64}
 71 |     };
 72 | 
 73 |     std::map<int, float> bias_map = {
 74 |         {65536, -0.45},
 75 |         {16384, -0.4},
 76 |         {4096, -0.35},
 77 |         {1024, -0.3}
 78 |     };
 79 | 
 80 |     std::map<int, float> type_1 = {
 81 |         {65536, 12},
 82 |         {16384, 10},
 83 |         {4096, 8},
 84 |         {1024, 6}
 85 |     };
 86 | 
 87 |     std::vector<std::vector<float>> input(batch, std::vector<float>(neuron));
 88 |     std::vector<std::vector<float>> weight; 
 89 |     std::vector<std::vector<int>> row_access; 
 90 | 
 91 |     std::cout << "[BEGIN]..." << std::endl;
 92 |     read_input(input, neuron, batch);
 93 |     std::cout << "Read Input success!" << std::endl;
 94 |     HashReorder hash_reorder_t(hash_map[neuron], neuron);
 95 |     dense_reorder(input, hash_reorder_t);
 96 | 
 97 |     for(int l = 0; l < layer; ++l) {
 98 |         auto weight_file = get_weight_file_name(neuron, l);
 99 |         COOMatrix coo(weight_file, 1, false);
100 |         std::cout << "["<< weight_file << "] to COO success!" << std::endl;
101 |         coo.reorder(hash_reorder_t);
102 |         std::cout << "Reorder success!" << std::endl;
103 |         CSRCSCMatrix csr_csc(coo);
104 |         csr_csc.transpose();
105 |         BlockContainer blocks(csr_csc, SparseMatrixBlockGen::naive_method);
106 |         std::cout << "Structural Info success!" << std::endl;
107 |         MaxInReuseBSchedule schedule(blocks);
108 |         if(l == 0) {
109 |             schedule.schedule(16, 7);
110 |         }
111 |         else if(l < type_1[neuron]) {
112 |             schedule.schedule_output_parallel(128, 1, false);
113 |         }        
114 |         else {
115 |             schedule.schedule(128, 1);
116 |         }
117 |         std::cout << "Schedule succ" << std::endl;
118 |         auto data = schedule.get_data(neuron);
119 |         weight.push_back(data.value);
120 |         row_access.push_back(data.row_access);
121 |     }
122 |     GpuEnv env(0);
123 |     test_benchmark_graph_challenge(input, weight, row_access, batch, neuron, bias_map[neuron], env);
124 |     std::cout << "[END]..." << std::endl;
125 |     return 0;
126 | }


--------------------------------------------------------------------------------
/src/reorder/hash.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "../utils/header.h"
 4 | #include "reorder.h"
 5 | 
 6 | namespace ftxj {
 7 |     class HashReorder : public Reorder {
 8 |         
 9 |         int buckets_num_;
10 |         int max_domain_;
11 |         int buckets_width;
12 |         REORDER type_;
13 | 
14 |         int hash(int v) {
15 |             if(v >= max_domain_) {
16 |                 std::cout << "ERROR: Hasing Function error!" << std::endl;
17 |                 exit(-1);
18 |             }
19 |             int col = v % buckets_num_;
20 |             int row = v / buckets_num_;
21 |             return row + col * buckets_width;
22 |         }
23 |     public:
24 |         HashReorder(int buckets_num, int max_domain, REORDER type = ALL_REORDER) 
25 |             : buckets_num_(buckets_num), max_domain_(max_domain), type_(type) {
26 |             buckets_width = max_domain / buckets_num_;
27 |         }
28 | 
29 |         int reorder(int r) {
30 |             return hash(r);
31 |         }
32 | 
33 |         MatrixPos new_pos(const MatrixPos &old_pos) {
34 |             MatrixPos n_pos = old_pos;
35 |             if(type_ == COL_REORDER || type_ == ALL_REORDER) {
36 |                 n_pos.col_idx = hash(n_pos.col_idx);
37 |             }
38 |             if(type_ == ROW_REORDER || type_ == ALL_REORDER) {
39 |                 n_pos.row_idx = hash(n_pos.row_idx);
40 |             }
41 |             return n_pos;
42 |         }
43 |     };
44 | }


--------------------------------------------------------------------------------
/src/reorder/header.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "reorder.h"
4 | #include "hash.h"


--------------------------------------------------------------------------------
/src/reorder/reorder.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "../utils/matrix_base.h"
 4 | 
 5 | namespace ftxj {
 6 | 
 7 | enum REORDER {
 8 |     COL_REORDER,
 9 |     ROW_REORDER,
10 |     ALL_REORDER
11 | };
12 | 
13 | class Reorder {
14 | public:
15 |     virtual MatrixPos new_pos(const MatrixPos &old_pos) = 0;
16 |     virtual int reorder(int r) = 0;
17 | 
18 | };
19 | 
20 | };


--------------------------------------------------------------------------------
/src/run_bf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | for col_blk in 1024 512 256 128 64 32 16
 3 | do 
 4 |     for blockDim in 1024 512 256 128 64
 5 |     do
 6 |         for((blockx=1; blockx<=blockDim; blockx+=blockx))
 7 |         do
 8 |             blocky=`expr $blockDim / $blockx`
 9 |             echo "Run Config"
10 |             echo $col_blk 
11 |             echo $blockx 
12 |             echo $blocky
13 |             ./bf 1024 1000 1 2 $col_blk $blockx $blocky
14 |         done
15 |     done
16 | done


--------------------------------------------------------------------------------
/src/utils/cpu_spmm.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <iostream>
 4 | #include <algorithm>
 5 | #include "matrix.h"
 6 | #include "debug.h"
 7 | 
 8 | namespace ftxj {
 9 |     class CpuSpmm {
10 |     public:
11 |         static void run_and_cmp(COOMatrix &weight, float* input, int neuron, int batch, float* output, bool T = false, bool resT = true, bool inputT = true) {
12 |             weight.to_row_first_ordered();
13 |             std::vector<std::vector<float>> res(batch, std::vector<float>(neuron, 0.0));
14 |             for(int b = 0; b < batch; ++b) {
15 |                 if(b % 10000 == 0) std::cout << "run " << b << "..." << std::endl;
16 |                 for(auto iter = weight.begin(); iter != weight.end(); ++iter) {
17 |                     int row = (*iter).row;
18 |                     int col = (*iter).col;
19 |                     float val = (*iter).val;
20 |                     float in = 0.0;
21 |                     if(T) {
22 |                         if(inputT) in = input[b * neuron + row];
23 |                         else in = input[row * batch + b];
24 |                         res[b][col] += in * val;
25 |                         // if(b == 1 && col == 16352) {
26 |                         //     printf("%f * %f %d\n", in, val, row);
27 |                         // // }
28 |                         // if(b == 0 && col == 62) {
29 |                         //     printf("0 %f * %f %d\n", in, val, row);
30 |                         // }
31 |                     }
32 |                     else {
33 |                         if(inputT) in = input[b * neuron + col];
34 |                         else in = input[col * batch + b];
35 |                         res[b][row] += in * val;
36 |                         // if(b == 1 && row == 16352) {
37 |                         //     printf("%f * %f %d\n", in, val, col);
38 |                         // }
39 |                         // if(b == 0 && row == 62) {
40 |                         //     printf("0 %f * %f %d\n", in, val, col);
41 |                         // }
42 |                     }
43 |                 }
44 |                 for(int j = 0; j < neuron; ++j) {
45 |                     float cmp = 0;
46 |                     if(resT) cmp = output[b * neuron + j];
47 |                     else cmp = output[j * batch + b];
48 |                     if(std::abs(res[b][j] - cmp) > 1e-3) {
49 |                         std::cout << b << ", " << j << " cpu=" << res[b][j] << ", gpu=" << cmp << std::endl;
50 |                         assert_msg(res[b][j] == cmp, "cpu gpu doesnot equals!");
51 |                     }
52 |                 }
53 |             }
54 |             std::cout << "Compare with cpu result [Success]" << std::endl;
55 |         }
56 |     };
57 | };


--------------------------------------------------------------------------------
/src/utils/cpu_spmm_fuse.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <iostream>
 4 | #include <algorithm>
 5 | #include "matrix.h"
 6 | #include "debug.h"
 7 | 
 8 | namespace ftxj {
 9 |     class CpuSpmmFuse {
10 |     public:
11 |         static void run_and_cmp(std::vector<COOMatrix> &weight, float* input, int neuron, int batch, int bias, float* output, int fuse_layer, bool T = false, bool resT = true, bool inputT = true) {
12 |             for(int i = 0; i < fuse_layer; ++i) {
13 |                 weight[i].to_row_first_ordered();
14 |             }
15 |             std::vector<std::vector<float>> res1(batch, std::vector<float>(neuron, 0.0));
16 |             std::vector<std::vector<float>> res2(batch, std::vector<float>(neuron, 0.0));
17 |             for(int b = 0; b < batch; ++b) {
18 |                 if(b % 10000 == 0) std::cout << "run " << b << "..." << std::endl;
19 |                 for(auto iter = weight[0].begin(); iter != weight[0].end(); ++iter) {
20 |                     int row = (*iter).row;
21 |                     int col = (*iter).col;
22 |                     float val = (*iter).val;
23 |                     float in = 0.0;
24 |                     if(T) {
25 |                         if(inputT) in = input[b * neuron + row];
26 |                         else in = input[row * batch + b];
27 |                         res1[b][col] += in * val;
28 |                         // if(b == 8 && col == 0) {
29 |                         //     printf("%f * %f = %f\n", in, val, res1[b][col]);
30 |                         // }
31 |                     }
32 |                     else {
33 |                         if(inputT) in = input[b * neuron + col];
34 |                         else in = input[col * batch + b];
35 |                         res1[b][row] += in * val;
36 |                         // if(b == 8 && row == 0) {
37 |                         //     printf("%f * %f = %f\n", in, val, res1[b][row]);
38 |                         // }
39 |                     }
40 |                 }
41 |                 for(int j = 0; j < neuron; ++j) {
42 |                     // res1[b][j] =  res1[b][j];  
43 |                     // if(b == 8 && j == 0) {
44 |                     //     printf("res1 = %f\n", res1[b][j]);
45 |                     // }
46 |                     res1[b][j] =   ((res1[b][j] + bias) > 32 ? 32.0 : ((res1[b][j] + bias) < 0) ? 0 : res1[b][j] + bias);
47 |                 }
48 |             }
49 |             for(int l = 1; l < fuse_layer; ++l) {
50 |                 for(int b = 0; b < batch; ++b) {
51 |                     if(b % 10000 == 0) std::cout << "run l = " << l << ", b = " << b << "..." << std::endl;
52 |                     for(auto iter = weight[l].begin(); iter != weight[l].end(); ++iter) {
53 |                         int row = (*iter).row;
54 |                         int col = (*iter).col;
55 |                         float val = (*iter).val;
56 |                         float in = 0.0;
57 |                         if(T) {
58 |                             in = res1[b][row];
59 |                             res2[b][col] += in * val;
60 |                             // if(b == 8 && col == 0) {
61 |                             //     printf("%f * %f %d\n", in, val, row);
62 |                             // }
63 |                         }
64 |                         else {
65 |                             in = res1[b][col];
66 |                             res2[b][row] += in * val;
67 |                             // if(b == 8 && row == 0) {
68 |                             //     printf("%f * %f %d\n", in, val, col);
69 |                             // }
70 |                         }
71 |                     }
72 |                     for(int j = 0; j < neuron; ++j) {
73 |                     //    res2[b][j] =  res2[b][j];
74 |                        res2[b][j] =  ((res2[b][j] + bias) > 32 ? 32.0 : ((res2[b][j] + bias) < 0) ? 0 : res2[b][j] + bias);
75 |                     }
76 |                 }
77 |                 res1 = res2;
78 |                 res2 = std::vector<std::vector<float>>(batch, std::vector<float>(neuron, 0.0));
79 |             }
80 |             
81 |             for(int b = 0; b < batch; ++b) {
82 |                 for(int j = 0; j < neuron; ++j) {
83 |                     float cmp = 0;
84 |                     if(resT) cmp = output[b * neuron + j];
85 |                     else cmp = output[j * batch + b];
86 |                     if(std::abs(res1[b][j] - cmp) > 1e-3) {
87 |                         std::cout << b << ", " << j << " cpu=" << res1[b][j] << ", gpu=" << cmp << std::endl;
88 |                         assert_msg(res1[b][j] == cmp, "cpu gpu doesnot equals!");
89 |                     }
90 |                 }
91 |             }
92 | 
93 |             std::cout << "Compare with cpu result [Success]" << std::endl;
94 |         }
95 |     };
96 | };


--------------------------------------------------------------------------------
/src/utils/cpu_transpose.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <iostream>
 4 | #include <algorithm>
 5 | #include "matrix.h"
 6 | #include "debug.h"
 7 | 
 8 | namespace ftxj {
 9 |     class CpuTranspose {
10 |     public:
11 |         static void run_and_cmp(float* input, int neuron, int batch, float* output) {
12 |             std::vector<std::vector<float>> res(neuron, std::vector<float>(batch, 0.0));
13 |             for(int b = 0; b < batch; ++b) {
14 |                 for(int n = 0; n < neuron; ++n) {
15 |                     res[n][b] = input[b * neuron + n];
16 |                     assert_msg(res[n][b] == output[n * batch + b], "error!");
17 |                 }
18 |             }
19 |             std::cout << "Compare with cpu result [Success]" << std::endl;
20 |         }
21 |     };
22 | };


--------------------------------------------------------------------------------
/src/utils/cpu_transpose_and_delete.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <iostream>
 4 | #include <algorithm>
 5 | #include "matrix.h"
 6 | #include "debug.h"
 7 | 
 8 | namespace ftxj {
 9 |     class CpuTransposeDelete {
10 |     public:
11 |         static void run_and_cmp(float* input, int* old_to_new_map, int old_batch, int neuron, int new_batch, float* output) {
12 |             std::vector<std::vector<float>> res(new_batch, std::vector<float>(neuron, 0.0));
13 |             for(int b = 0; b < old_batch; ++b) {
14 |                 if(old_to_new_map[b] == -1) continue;
15 |                 int new_b = old_to_new_map[b];
16 |                 for(int n = 0; n < neuron; ++n) {
17 |                     res[new_b][n] = input[n * old_batch + b];
18 |                     if(std::abs(res[new_b][n] - output[new_b * neuron + n]) > 1e-3) {
19 |                         std::cout << b << ", " << n << std::endl;
20 |                         std::cout << new_b << ", " << n << std::endl;
21 |                         std::cout << "currect = " << res[new_b][n] << ", error = " << output[new_b * neuron + n] << std::endl;
22 |                         assert_msg(res[new_b][n] == output[new_b * neuron + n], "error!");
23 |                     }
24 |                 }
25 |             }
26 |             std::cout << "Compare with cpu result [Success]" << std::endl;
27 |         }
28 |     };
29 | };


--------------------------------------------------------------------------------
/src/utils/debug.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <iostream>
 3 | 
 4 | namespace ftxj {
 5 |     #ifndef NDEBUG
 6 |     #   define assert_msg(Expr, Msg) \
 7 |         Debug::assert_msg_(#Expr, Expr, __FILE__, __LINE__, Msg)
 8 |     #else
 9 |     #   define assert_msg(Expr, Msg) ;
10 |     #endif
11 | 
12 |     class Debug {
13 |     public:
14 |         static void assert_msg_(const char* expr_str, bool expr, const char* file, int line, const char* msg) {
15 |             if (!expr)
16 |             {
17 |                 std::cerr << "Assert failed:\t" << msg << "\n"
18 |                     << "Expected:\t" << expr_str << "\n"
19 |                     << "Source:\t\t" << file << ", line " << line << "\n";
20 |                 abort();
21 |             }
22 |         }
23 |     };
24 | }


--------------------------------------------------------------------------------
/src/utils/header.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "debug.h"
 3 | #include "matrix_base.h"
 4 | #include "matrix.h"
 5 | #include "type.h"
 6 | #include "string.h"
 7 | #include "cpu_spmm.h"
 8 | #include "cpu_spmm_fuse.h"
 9 | #include "cpu_transpose.h"
10 | #include "cpu_transpose_and_delete.h"


--------------------------------------------------------------------------------
/src/utils/matrix_base.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <vector>
 3 | #include <iostream>
 4 | namespace ftxj {
 5 |     struct MatrixPos{
 6 |         int row_idx;
 7 |         int col_idx;
 8 |         MatrixPos(int r, int c) {row_idx = r; col_idx = c;}
 9 |         MatrixPos() {}
10 |         void print() {
11 |             std::cout << "(" << row_idx << "," << col_idx << ")";
12 |         }
13 |     };
14 | };


--------------------------------------------------------------------------------
/src/utils/string.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <iostream>
3 | 
namespace ftxj {
    // Placeholder type: it declares no members.
    class String {};
}


--------------------------------------------------------------------------------
/src/utils/type.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <vector>
3 | #include <string>
4 | 
namespace ftxj {
    // Scalar type alias; judging by its name it is meant for the values stored
    // in sparse matrices (usage is not visible in this header).
    using SparseDataType = float;
}


--------------------------------------------------------------------------------
/tools/3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/tools/3d.png


--------------------------------------------------------------------------------
/tools/3d_plot.py:
--------------------------------------------------------------------------------
 1 | import matplotlib as mpl
 2 | import matplotlib.pyplot as plt
 3 | from mpl_toolkits.mplot3d import Axes3D
 4 | import numpy as np
 5 | 
 6 | 
 7 | 
 8 | labels_x = ['(b)\n(n)', '', '(b)\n(nn,k)', '(b,n)\n(nn,kk)', '', '', '', '']
 9 | labels_y = ['(b0,k0,n0)', '(b0,k0,n1)', '(b0,k1,n1)', '', '', '', '', '']
10 | labels_z = ['(b0,k0,n0)', '', '', '(b,n,k,nn)', '(b,n,k,kk,nn)', '', '', '']
11 | 
12 | 
13 | xs1 = [3]
14 | ys1 = [2]
15 | zs1 = [4]
16 | 
17 | 
18 | xs2 = [4]
19 | ys2 = [3]
20 | zs2 = [5]
21 | 
22 | 
23 | 
24 | # 方式1：设置三维图形模式
25 | fig = plt.figure() # 创建一个画布figure，然后在这个画布上加各种元素。
26 | ax = Axes3D(fig) # 将画布作用于 Axes3D 对象上。
27 | 
28 | ax.scatter(xs1,ys1,zs1) # 画出(xs1,ys1,zs1)的散点图。
29 | ax.scatter(xs2,ys2,zs2,c='r',marker='^')
30 | 
31 | 
32 | 
33 | ax.set_xlabel('Parallelism') # 画出坐标轴
34 | ax.set_ylabel('Loop Tiling')
35 | ax.set_zlabel('Execute Order')
36 | 
37 | locsx, labelsx = plt.xticks()  # Get the current locations and labels.
38 | locsy, labelsy = plt.yticks()  # Get the current locations and labels.
39 | # locsz, labelsz = plt.zticks()  # Get the current locations and labels.
40 | 
41 | plt.xticks(locsx, labels_x)  # Set label locations.
42 | plt.yticks(locsy, labels_y)  # Set label locations.
43 | # plt.zticks(locsz, labels_z)  # Set label locations.
44 | 
45 | plt.savefig("3d.png")
46 | # plt.show()


--------------------------------------------------------------------------------
/tools/control_code_analysis.py:
--------------------------------------------------------------------------------
 1 | file = "../3rd_party/20-graphchallenge/SpDNN_Challenge2020/singlegpu/kernel.sass"
 2 | control_line = 0
 3 | with open(file) as f:
 4 |     line = f.readline()
 5 |     x = 1
 6 |     gap = 0
 7 |     while line:
 8 |         idx = line.rfind('/*')
 9 |         if idx != -1:
10 |             if control_line == 1:
11 |                 hex = "0x" + line[idx + 9 : idx + 11]
12 |                 # print(hex)
13 |                 stalls = int(hex, 16) 
14 |                 stalls = (stalls >> 1) & 0x0f
15 |                 # if yield_code == 0:
16 |                 #     print(x, gap)
17 |                 #     gap = 0
18 |                 # else:
19 |                 #     gap = gap + 1
20 |                 print(x, stalls)
21 |             control_line = (control_line + 1) % 2
22 |         line = f.readline()
23 |         x = x + 1


--------------------------------------------------------------------------------
/tools/cost_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/tools/cost_model.py


--------------------------------------------------------------------------------
/tools/edgedraw.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | 
 3 | def hashElement(v,buckets,max_range):
 4 |     if v>= max_range:
 5 |         return v
 6 |     else:
 7 |         return v/buckets+v%buckets*(max_range/buckets)
 8 |     pass
 9 | 
# Values used for hashing and for the plot title.
num = str(1)
neuron = str(4096)
bucketnumber = 256

path='../src/tmp.txt'

# Hashed column / row id of every edge in the input file.
col = []
row = []
# The with-block closes the file once all lines have been read.
with open(path,'r') as file:
    for eachline in file:
        x = eachline.split('\t')
        col.append(hashElement(int(x[1])-1,bucketnumber,int(neuron)))  # column id
        row.append(hashElement(int(x[0])-1,bucketnumber,int(neuron)))  # row id
        #a.append(int(x[1])-1)  # column id without hashing
        #b.append(int(x[0])-1)  # row id without hashing
plt.title("neuron:"+neuron+" bucket"+str(bucketnumber)+" layer:"+num)
plt.xlim(xmax=64,xmin=0)
plt.ylim(ymax=64,ymin=0)
plt.xlabel("col")
plt.ylabel("row")
# Plot the lists that are actually filled above; the names a and b only exist
# in the commented-out lines.
plt.plot(col,row,'.')

# ".fig" is not an image format savefig can write, so a PNG is written instead.
plt.savefig("tmp.png")

# plt.show()
35 | 


--------------------------------------------------------------------------------
/tools/get_dataset.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | 
 3 | DATA_SET_DIR="/home/xinjie/data/graph_challenge"
 4 | 
 5 | if [[ -z "${DATA_SET_DIR}" ]]; then 
 6 |        echo "ERROR: Please Set Data Set Dir Variable"	
 7 |        exit 0
 8 | fi
 9 | 
10 | 
11 | 
12 | 
13 | if [ $1 == "1024" ]; then 
14 |     echo "Downloading Categories"
15 |     wget -P $DATA_SET_DIR https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/dnn/neuron1024-l120-categories.tsv 
16 | 
17 |     echo "Downloading Spase Images"
18 | 
19 |     wget -P $DATA_SET_DIR https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/mnist/sparse-images-1024.tsv.gz
20 | 
21 |     echo "Downloading Weights"
22 |     wget -P $DATA_SET_DIR https://graphchallenge.s3.amazonaws.com/synthetic/sparsechallenge_2019/dnn/neuron1024.tar.gz
23 | 
24 | fi
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/tools/plt_show.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | from numpy import fromfile
 3 | import numpy as np
 4 | from scipy.sparse import coo_matrix
 5 | import sys
 6 | 
 7 | 
 8 | num = sys.argv[1]
 9 | neuron = 65536
10 | bucketnumber = 4096
11 | tile_size_str = sys.argv[2]
12 | 
13 | draw_num = sys.argv[3]
14 | draw_num = int(draw_num)
15 | 
16 | tile_size = int(tile_size_str)
17 | 
18 | open_file_path='../data/neuron65536/n65536-l'+ num + '.tsv'
19 | save_path_root = "../data_show/"
20 | 
21 | 
def hashElement(v):
    """Return the bucket-hashed position of index ``v``.

    Uses the module-level ``neuron`` and ``bucketnumber``. Indices at or beyond
    ``neuron`` are returned unchanged. Any other index maps to
    ``v // bucketnumber + (v % bucketnumber) * (neuron // bucketnumber)``.
    Floor division keeps the result an integer on Python 3 as well, where
    ``/`` would yield a float.
    """
    if v >= neuron:
        return v
    return v // bucketnumber + (v % bucketnumber) * (neuron // bucketnumber)
27 | 
28 | 
# Read and hash every edge once instead of re-opening and re-parsing the input
# file for each tile; the with-block also closes the file.
edges = []
with open(open_file_path, 'r') as file:
    for eachline in file:
        x = eachline.split('\t')
        col_h = hashElement(int(x[0]) - 1)
        row_h = hashElement(int(x[1]) - 1)
        edges.append((col_h, row_h))

# Number of tiles along each dimension (rounded up).
blocks_per_side = (neuron + tile_size - 1) // tile_size

now_num = 0
for row_block in range(0, blocks_per_side):
    row_lo = tile_size * row_block
    row_hi = tile_size * (row_block + 1)
    for col_block in range(0, blocks_per_side):
        col_lo = tile_size * col_block
        col_hi = tile_size * (col_block + 1)
        # Collect the edges whose hashed coordinates fall into this tile.
        row = []
        col = []
        for col_h, row_h in edges:
            if col_lo <= col_h < col_hi and row_lo <= row_h < row_hi:
                col.append(col_h)
                row.append(row_h)
        print(len(row))
        if len(row) == 0:
            continue
        now_num = now_num + 1
        file_name = 'l' + str(num) + '_b' + str(bucketnumber) + '_r' + str(row_block) + '_c' + str(col_block) + '_t' + str(tile_size) + '.png'
        save_file_path = save_path_root + 'n' + str(neuron) + "/" + "l" + str(num) + "/"
        # Start from an empty figure so that points of earlier tiles do not
        # pile up in the current one.
        plt.clf()
        plt.title(file_name)
        plt.xlim(xmax = row_hi, xmin = row_lo)
        plt.ylim(ymax = col_hi, ymin = col_lo)
        plt.xlabel("row")
        plt.ylabel("col")
        plt.plot(row, col, '.')
        plt.savefig(save_file_path + file_name)
        # Stop once the requested number of plots has been written.
        if draw_num == now_num:
            sys.exit()
58 |         
59 | 
60 | 


--------------------------------------------------------------------------------
/tools/statistics.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib as mpl
 3 | import matplotlib.pyplot as plt
 4 |  
 5 |  
 6 | open_file_path = "../data/neuron16384-l120-categories.tsv"
 7 | 
 8 | neuron = 16384
 9 | batch = 60000
10 | 
11 | file = open(open_file_path, 'r')
12 | 
13 | array = []
14 | x_axis = []
15 | for i in range(0, 6000):
16 |     array.append(0)
17 |     x_axis.append(i)
18 | 
19 | max_v = 0
20 | for eachline in file.readlines():
21 |     x = eachline.split(' ')
22 |     xx = int(x[0]) - 1
23 |     print(xx)
24 |     array[int(xx/10)] = array[int(xx/10)] + 1
25 |     if max_v < array[int(xx/10)]:
26 |         max_v = array[int(xx/10)]
27 | 
28 | 
29 | 
30 | plt.xlim(xmax = 256, xmin = 0)
31 | plt.ylim(ymax = max_v, ymin = 0)
32 | 
33 | plt.plot(x_axis, array, '.')
34 | 
35 | plt.savefig("tmp.png")


--------------------------------------------------------------------------------
/tools/tmp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CGCL-codes/Graphchallenge21/74273ac25bfd90162067cb24a9b7a38774a9619b/tools/tmp.png


--------------------------------------------------------------------------------