├── .gitignore ├── LICENSE ├── README.md ├── SDDMM ├── SDDMM │ ├── .gitignore │ ├── Makefile │ ├── eval_matrices │ │ ├── s50.txt │ │ ├── s70.txt │ │ ├── s80.txt │ │ ├── s90.txt │ │ ├── s95.txt │ │ └── s98.txt │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ ├── launch_sddmm_magicube_16b16b.py │ ├── launch_sddmm_magicube_4b4b.py │ ├── launch_sddmm_magicube_8b8b.py │ ├── run_sddmm_magicube.sh │ ├── sddmm_benchmark.cpp │ ├── setup.sh │ ├── spmm_benchmark.cpp │ └── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── sddmm_utils │ │ ├── compute_utils.h │ │ ├── lhs_tile.h │ │ ├── output_tile.h │ │ └── rhs_tile.h │ │ ├── spmm_utils │ │ ├── barrier.h │ │ ├── compute_utils.h │ │ ├── compute_utils.h_bk │ │ ├── compute_utils.h_bkkkk │ │ ├── compute_utils.h_more_shift_opt_4bit │ │ ├── dense_tile.h │ │ ├── memory_aligner.h │ │ ├── output_tile.h │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ ├── wmma_sddmm.cu_bk │ │ └── wmma_spmm.cu └── ablation_study │ ├── SDDMM_basic │ ├── .gitignore │ ├── Makefile │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ ├── sddmm_benchmark.cpp │ ├── setup.sh │ ├── spmm_benchmark.cpp │ └── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── sddmm_utils │ │ ├── compute_utils.h │ │ ├── lhs_tile.h │ │ ├── output_tile.h │ │ └── rhs_tile.h │ │ ├── spmm_utils │ │ ├── barrier.h │ │ ├── compute_utils.h │ │ ├── compute_utils.h_bk │ │ ├── compute_utils.h_bkkkk │ │ ├── compute_utils.h_more_shift_opt_4bit │ │ ├── dense_tile.h │ │ ├── memory_aligner.h │ │ ├── output_tile.h │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ ├── wmma_sddmm.cu_bk │ │ └── wmma_spmm.cu │ ├── SDDMM_lhs_pref │ ├── .gitignore │ ├── Makefile │ ├── include │ │ ├── bm_test_utils.h │ │ ├── 
cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ ├── sddmm_benchmark.cpp │ ├── setup.sh │ ├── spmm_benchmark.cpp │ ├── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── sddmm_utils │ │ │ ├── compute_utils.h │ │ │ ├── lhs_tile.h │ │ │ ├── output_tile.h │ │ │ └── rhs_tile.h │ │ ├── spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── compute_utils.h_bk │ │ │ ├── compute_utils.h_bkkkk │ │ │ ├── compute_utils.h_more_shift_opt_4bit │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ ├── wmma_sddmm.cu_bk │ │ └── wmma_spmm.cu │ └── usingwmma_run.sh │ ├── compile_jobs.sh │ └── sddmm_ablation_study.py ├── SpMM ├── SpMM │ ├── .gitignore │ ├── Makefile │ ├── eval_matrices │ │ ├── s50.txt │ │ ├── s70.txt │ │ ├── s80.txt │ │ ├── s90.txt │ │ ├── s95.txt │ │ └── s98.txt │ ├── file_name_server.py │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ ├── launch_spmm_magicube_16b8b.py │ ├── launch_spmm_magicube_4b4b.py │ ├── launch_spmm_magicube_8b4b.py │ ├── launch_spmm_magicube_8b8b.py │ ├── ncu_profile.py │ ├── run_spmm_magicube.sh │ ├── sddmm_benchmark.cpp │ ├── setup.sh │ ├── spmm_benchmark.cpp │ ├── spmm_pres.sh │ └── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── spmm_utils │ │ ├── barrier.h │ │ ├── compute_utils.h │ │ ├── dense_tile.h │ │ ├── memory_aligner.h │ │ ├── output_tile.h │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ └── wmma_spmm.cu └── ablation_study │ ├── 16b8b │ ├── SpMM_conflict_free │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ │ ├── bm_test_utils.h │ │ │ ├── cublas_gemm.cuh │ │ │ ├── cuda_sddmm.cuh │ │ │ ├── cuda_spmm.cuh │ │ │ ├── sputnik.h │ │ │ ├── wmma_sddmm.cuh │ │ │ └── 
wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ │ ├── cublas_gemm.cu │ │ │ ├── cuda_sddmm.cu │ │ │ ├── cuda_spmm.cu │ │ │ ├── spmm_utils │ │ │ │ ├── barrier.h │ │ │ │ ├── compute_utils.h │ │ │ │ ├── dense_tile.h │ │ │ │ ├── memory_aligner.h │ │ │ │ ├── output_tile.h │ │ │ │ └── sparse_tile.h │ │ │ ├── wmma_sddmm.cu │ │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ └── SpMM_conflict_free_prefetch │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ ├── 4b4b │ ├── SpMM_conflict_free │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ │ ├── bm_test_utils.h │ │ │ ├── cublas_gemm.cuh │ │ │ ├── cuda_sddmm.cuh │ │ │ ├── cuda_spmm.cuh │ │ │ ├── sputnik.h │ │ │ ├── wmma_sddmm.cuh │ │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ │ ├── cublas_gemm.cu │ │ │ ├── cuda_sddmm.cu │ │ │ ├── cuda_spmm.cu │ │ │ ├── spmm_utils │ │ │ │ ├── barrier.h │ │ │ │ ├── compute_utils.h │ │ │ │ ├── dense_tile.h │ │ │ │ ├── memory_aligner.h │ │ │ │ ├── output_tile.h │ │ │ │ └── sparse_tile.h │ │ │ ├── wmma_sddmm.cu │ │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ ├── SpMM_conflict_free_prefetch │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── 
include │ │ │ ├── bm_test_utils.h │ │ │ ├── cublas_gemm.cuh │ │ │ ├── cuda_sddmm.cuh │ │ │ ├── cuda_spmm.cuh │ │ │ ├── sputnik.h │ │ │ ├── wmma_sddmm.cuh │ │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ │ ├── cublas_gemm.cu │ │ │ ├── cuda_sddmm.cu │ │ │ ├── cuda_spmm.cu │ │ │ ├── spmm_utils │ │ │ │ ├── barrier.h │ │ │ │ ├── compute_utils.h │ │ │ │ ├── dense_tile.h │ │ │ │ ├── memory_aligner.h │ │ │ │ ├── output_tile.h │ │ │ │ └── sparse_tile.h │ │ │ ├── wmma_sddmm.cu │ │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ └── SpMM_conflict_free_prefetch_shuffle │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ ├── 8b4b │ ├── SpMM_conflict_free │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ │ ├── bm_test_utils.h │ │ │ ├── cublas_gemm.cuh │ │ │ ├── cuda_sddmm.cuh │ │ │ ├── cuda_spmm.cuh │ │ │ ├── sputnik.h │ │ │ ├── wmma_sddmm.cuh │ │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ │ ├── cublas_gemm.cu │ │ │ ├── cuda_sddmm.cu │ │ │ ├── cuda_spmm.cu │ │ │ ├── spmm_utils │ │ │ │ ├── barrier.h │ │ │ │ ├── compute_utils.h │ │ │ │ ├── dense_tile.h │ │ │ │ ├── memory_aligner.h │ │ │ │ ├── output_tile.h │ │ │ │ └── sparse_tile.h │ │ │ ├── 
wmma_sddmm.cu │ │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ ├── SpMM_conflict_free_prefetch │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ │ ├── bm_test_utils.h │ │ │ ├── cublas_gemm.cuh │ │ │ ├── cuda_sddmm.cuh │ │ │ ├── cuda_spmm.cuh │ │ │ ├── sputnik.h │ │ │ ├── wmma_sddmm.cuh │ │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ │ ├── cublas_gemm.cu │ │ │ ├── cuda_sddmm.cu │ │ │ ├── cuda_spmm.cu │ │ │ ├── spmm_utils │ │ │ │ ├── barrier.h │ │ │ │ ├── compute_utils.h │ │ │ │ ├── dense_tile.h │ │ │ │ ├── memory_aligner.h │ │ │ │ ├── output_tile.h │ │ │ │ └── sparse_tile.h │ │ │ ├── wmma_sddmm.cu │ │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ └── SpMM_conflict_free_prefetch_shuffle │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ ├── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ └── wmma_spmm.cu │ │ └── usingwmma_run.sh │ ├── 8b8b │ ├── SpMM_conflict_free │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ │ ├── bm_test_utils.h │ │ │ ├── cublas_gemm.cuh │ │ │ ├── cuda_sddmm.cuh │ │ │ ├── cuda_spmm.cuh │ │ │ ├── sputnik.h │ │ │ ├── wmma_sddmm.cuh │ │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ └── src │ │ │ ├── cublas_gemm.cu │ │ │ ├── cuda_sddmm.cu │ │ │ ├── cuda_spmm.cu │ │ │ ├── 
spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ │ ├── wmma_sddmm.cu │ │ │ └── wmma_spmm.cu │ └── SpMM_conflict_free_prefetch │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── file_name_server.py │ │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ │ ├── ncu_profile.py │ │ ├── run_jobs.sh │ │ ├── sddmm_benchmark.cpp │ │ ├── setup.sh │ │ ├── spmm_benchmark.cpp │ │ └── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── spmm_utils │ │ ├── barrier.h │ │ ├── compute_utils.h │ │ ├── dense_tile.h │ │ ├── memory_aligner.h │ │ ├── output_tile.h │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ └── wmma_spmm.cu │ ├── SpMM_basic │ ├── .gitignore │ ├── Makefile │ ├── file_name_server.py │ ├── include │ │ ├── bm_test_utils.h │ │ ├── cublas_gemm.cuh │ │ ├── cuda_sddmm.cuh │ │ ├── cuda_spmm.cuh │ │ ├── sputnik.h │ │ ├── wmma_sddmm.cuh │ │ └── wmma_spmm.cuh │ ├── ncu_profile.py │ ├── run_jobs.sh │ ├── sddmm_benchmark.cpp │ ├── setup.sh │ ├── spmm_benchmark.cpp │ ├── src │ │ ├── cublas_gemm.cu │ │ ├── cuda_sddmm.cu │ │ ├── cuda_spmm.cu │ │ ├── spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ ├── wmma_sddmm.cu │ │ └── wmma_spmm.cu │ └── usingwmma_run.sh │ ├── compile_jobs.sh │ └── spmm_ablation_study.sh ├── baselines ├── Dockerfile ├── Makefile ├── eval_matrices │ ├── s50.txt │ ├── s70.txt │ ├── s80.txt │ ├── s90.txt │ ├── s95.txt │ └── s98.txt ├── example │ └── example.csv ├── file_name_server.py ├── include │ ├── bm_test_utils.h │ ├── cublas_gemm.cuh │ ├── cuda_sddmm.cuh │ ├── cuda_spmm.cuh │ ├── sputnik.h │ ├── wmma_sddmm.cuh │ └── wmma_spmm.cuh ├── job_launcher.py ├── launch.py ├── launch_sddmm_cublas_fp16.py ├── 
launch_sddmm_cublas_int8.py ├── launch_sddmm_vectorSparse.py ├── launch_spmm_cublas_fp16.py ├── launch_spmm_cublas_int8.py ├── launch_spmm_cusparse_fp16.py ├── launch_spmm_cusparse_int8.py ├── launch_spmm_vectorSparse.py ├── ncu_profile.py ├── plot_blocked_ell.py ├── plot_finegrained.py ├── plot_mem_l2_l1.py ├── plot_sddmm.py ├── plot_spmm.py ├── run_sddmm_baselines.sh ├── run_spmm_baselines.sh ├── sddmm_benchmark.cpp ├── setup.sh ├── spmm_benchmark.cpp ├── src │ ├── cublas_gemm.cu │ ├── cuda_sddmm.cu │ ├── cuda_spmm.cu │ ├── spmm_utils │ │ ├── barrier.h │ │ ├── compute_utils.h │ │ ├── dense_tile.h │ │ ├── memory_aligner.h │ │ ├── output_tile.h │ │ └── sparse_tile.h │ ├── wmma_sddmm.cu │ └── wmma_spmm.cu └── usingwmma_run.sh ├── end2end_eval ├── sparse_transformer_baselines │ ├── README.md │ ├── atten_speedup.py │ ├── attention.py │ ├── cudaprofile.py │ ├── end_to_end.py │ ├── launch_cudnn_fp16.py │ ├── launch_vectorSparse.py │ ├── run.sh │ ├── sparse_encoder.py │ ├── spattention.py │ ├── src │ │ ├── cuda │ │ │ ├── sddmm.cpp │ │ │ ├── sddmm_kernel.cu │ │ │ ├── softmax.cpp │ │ │ ├── softmax_kernel.cu │ │ │ ├── spmm.cpp │ │ │ ├── spmm_kernel.cu │ │ │ └── spmm_utils │ │ │ │ ├── barrier.h │ │ │ │ ├── compute_utils.h │ │ │ │ ├── dense_tile.h │ │ │ │ ├── memory_aligner.h │ │ │ │ ├── output_tile.h │ │ │ │ └── sparse_tile.h │ │ ├── install.sh │ │ └── setup.py │ └── verify │ │ ├── __init__.py │ │ ├── bsddmm.py │ │ ├── bsoftmax.py │ │ ├── bspmm.py │ │ ├── sddmm.py │ │ ├── softmax.py │ │ ├── spmm.py │ │ └── static_mask.py └── sparse_transformer_magicube │ ├── atten_speedup.py │ ├── attention.py │ ├── cudaprofile.py │ ├── end_to_end.py │ ├── launch_magicube.py │ ├── run.sh │ ├── sparse_encoder.py │ ├── spattention.py │ ├── src │ ├── cuda │ │ ├── deq_sddmm.cpp │ │ ├── deq_sddmm_kernel.cu │ │ ├── deq_spmm.cpp │ │ ├── deq_spmm.cpp_N128 │ │ ├── deq_spmm_kernel.cu │ │ ├── deq_spmm_kernel.cu_N128 │ │ ├── q_softmax.cpp │ │ ├── q_softmax_kernel.cu │ │ ├── quantization.cpp │ │ ├── 
quantization_kernel.cu │ │ ├── sddmm.cpp │ │ ├── sddmm_kernel.cu │ │ ├── sddmm_utils │ │ │ ├── compute_utils.h │ │ │ ├── lhs_tile.h │ │ │ ├── output_tile.h │ │ │ └── rhs_tile.h │ │ ├── softmax.cpp │ │ ├── softmax_kernel.cu │ │ ├── spmm.cpp │ │ ├── spmm_kernel.cu │ │ ├── spmm_utils │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ │ └── spmm_utils_N128_bk │ │ │ ├── barrier.h │ │ │ ├── compute_utils.h │ │ │ ├── dense_tile.h │ │ │ ├── memory_aligner.h │ │ │ ├── output_tile.h │ │ │ └── sparse_tile.h │ ├── install.sh │ └── setup.py │ └── verify │ ├── __init__.py │ ├── bsddmm.py │ ├── bsoftmax.py │ ├── bspmm.py │ ├── sddmm.py │ ├── softmax.py │ ├── spmm.py │ └── static_mask.py ├── magicubeLogo.svg ├── plot ├── confinter.py ├── examples │ ├── magicube_n2n.txt │ ├── pytorch_n2n.txt │ ├── sddmm_abl_study.txt │ ├── sddmm_cublas_fp16.txt │ ├── sddmm_cublas_int8.txt │ ├── sddmm_magicube_16b16b.txt │ ├── sddmm_magicube_4b4b.txt │ ├── sddmm_magicube_8b8b.txt │ ├── sddmm_vectorSparse.txt │ ├── spmm_abl_study.txt │ ├── spmm_cublas_fp16.txt │ ├── spmm_cublas_int8.txt │ ├── spmm_cusparse_fp16.txt │ ├── spmm_cusparse_int8.txt │ ├── spmm_magicube_16b8b.txt │ ├── spmm_magicube_4b4b.txt │ ├── spmm_magicube_8b4b.txt │ ├── spmm_magicube_8b8b.txt │ ├── spmm_pres.txt │ ├── spmm_vectorSparse.txt │ └── vectorSparse_n2n.txt ├── figs │ └── .gitignore ├── gen_csv.sh ├── n2n.py ├── plot.sh ├── plot_n2n_a.py ├── plot_n2n_b.py ├── plot_n2n_c.py ├── plot_n2n_d.py ├── plot_n2n_e.py ├── plot_n2n_f.py ├── plot_n2n_g.py ├── plot_n2n_h.py ├── plot_sddmm_abl_study.py ├── plot_sddmm_all_matrices.py ├── plot_spmm_abl_study.py ├── plot_spmm_all_matrices.py ├── plot_spmm_pres.py ├── sddmm_abl_study.py ├── sddmm_all_matrices.py ├── spmm_abl_study.py ├── spmm_all_matrices.py └── spmm_pres.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | ## 
ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 3 | 4 | 5 | ################################################################## 6 | 7 | ## Project file structure ## 8 | 9 | # Source file directory: 10 | SRC_DIR = src 11 | 12 | # Object file directory: 13 | OBJ_DIR = bin 14 | 15 | # Include header file directory 16 | INC_DIR = include 17 | 18 | 19 | ################################################################## 20 | 21 | ## Compile ## 22 | 23 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 24 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 25 | 26 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 27 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 28 | 29 | # Compile main file to object file 30 | $(OBJ_DIR)/%.o : %.cpp 31 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 32 | 33 | 34 | # Compile CUDA source files to object files 35 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 36 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 37 | 38 | clean: 39 | @rm -f $(OBJ_DIR)/*.o 40 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, 
float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SDDMM/SDDMM/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif 
-------------------------------------------------------------------------------- /SDDMM/SDDMM/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SDDMM/SDDMM/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm_4b(int m_vec, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const int* __restrict__ lhs_matrix, 12 | const int* __restrict__ rhs_matrix, 13 | int* __restrict__ output_values, 14 | int vec_length); 15 | 16 | cudaError_t wmmaSddmm_8b(int m_vec, int k, int n, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const int* __restrict__ lhs_matrix, 21 | const int* __restrict__ rhs_matrix, 22 | int* __restrict__ output_values, 23 | int vec_length); 24 | 25 | cudaError_t wmmaSddmm_16b(int m_vec, int k, int n, 26 | const int* __restrict__ row_indices, 27 | const 
int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const int* __restrict__ lhs_matrix, 30 | const int* __restrict__ rhs_matrix, 31 | int* __restrict__ output_values, 32 | int vec_length); 33 | 34 | } // namespace sddmm 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/launch_sddmm_magicube_16b16b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | # Args 5 | parser = argparse.ArgumentParser(description='lauch the sddmm benchmarks') 6 | 7 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark") 8 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 9 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 10 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 11 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 12 | args = parser.parse_args() 13 | 14 | dataset_dir = os.environ.get('dataset_dir') 15 | sparsities = ['50', '70', '80', '90', '95', '98'] 16 | dimKs = [128, 256] 17 | vec_lens = [2, 4, 8] 18 | 19 | for dimK in dimKs: 20 | for vec_len in vec_lens: 21 | for sparsity in sparsities: 22 | print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity) 23 | 24 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 25 | lines = matrix_list.readlines() 26 | #for i in range(1): 27 | for i in range(len(lines)): 28 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 29 | cmd = './sddmm_benchmark %s %d %d 1 0 1 16 16' % (matrix, dimK, vec_len) 30 | os.system(cmd) 31 | 32 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/launch_sddmm_magicube_4b4b.py: -------------------------------------------------------------------------------- 1 | import os 2 | 
import argparse 3 | 4 | # Args 5 | parser = argparse.ArgumentParser(description='lauch the sddmm benchmarks') 6 | 7 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark") 8 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 9 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 10 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 11 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 12 | args = parser.parse_args() 13 | 14 | dataset_dir = os.environ.get('dataset_dir') 15 | sparsities = ['50', '70', '80', '90', '95', '98'] 16 | dimKs = [128, 256] 17 | vec_lens = [2, 4, 8] 18 | 19 | for dimK in dimKs: 20 | for vec_len in vec_lens: 21 | for sparsity in sparsities: 22 | print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity) 23 | 24 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 25 | lines = matrix_list.readlines() 26 | #for i in range(1): 27 | for i in range(len(lines)): 28 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 29 | cmd = './sddmm_benchmark %s %d %d 1 0 1 4 4' % (matrix, dimK, vec_len) 30 | os.system(cmd) 31 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/launch_sddmm_magicube_8b8b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | # Args 5 | parser = argparse.ArgumentParser(description='lauch the sddmm benchmarks') 6 | 7 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark") 8 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 9 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 10 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for 
A") 11 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 12 | args = parser.parse_args() 13 | 14 | dataset_dir = os.environ.get('dataset_dir') 15 | sparsities = ['50', '70', '80', '90', '95', '98'] 16 | dimKs = [128, 256] 17 | vec_lens = [2, 4, 8] 18 | 19 | for dimK in dimKs: 20 | for vec_len in vec_lens: 21 | for sparsity in sparsities: 22 | print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity) 23 | 24 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 25 | lines = matrix_list.readlines() 26 | #for i in range(1): 27 | for i in range(len(lines)): 28 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 29 | cmd = './sddmm_benchmark %s %d %d 1 0 1 8 8' % (matrix, dimK, vec_len) 30 | os.system(cmd) 31 | 32 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/run_sddmm_magicube.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "Tesing sddmm_magicube_16b16b" 3 | python launch_sddmm_magicube_16b16b.py > sddmm_magicube_16b16b.txt 4 | echo "Finish sddmm_magicube_16b16b" 5 | 6 | echo "Tesing sddmm_magicube_8b8b" 7 | python launch_sddmm_magicube_8b8b.py > sddmm_magicube_8b8b.txt 8 | echo "Finish sddmm_magicube_8b8b" 9 | 10 | echo "Tesing sddmm_magicube_4b4b" 11 | python launch_sddmm_magicube_4b4b.py > sddmm_magicube_4b4b.txt 12 | echo "Finish sddmm_magicube_4b4b" 13 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make sddmm_benchmark 3 | -------------------------------------------------------------------------------- /SDDMM/SDDMM/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t 
StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 3 | 4 | 5 | ################################################################## 6 | 7 | ## Project file structure ## 8 | 9 | # Source file directory: 10 | SRC_DIR = src 11 | 12 | # Object file directory: 13 | OBJ_DIR = bin 14 | 15 | # Include header file directory 16 | INC_DIR = include 17 | 18 | 19 | ################################################################## 20 | 21 | ## Compile ## 22 | 23 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 24 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 25 | 26 | 
spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 27 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 28 | 29 | # Compile main file to object file 30 | $(OBJ_DIR)/%.o : %.cpp 31 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 32 | 33 | 34 | # Compile CUDA source files to object files 35 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 36 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 37 | 38 | clean: 39 | @rm -f $(OBJ_DIR)/*.o 40 | -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* 
__restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ 
row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Prototypes of the wmmaSddmm_* entry points; the definitions are not part of
// this header.
//
// The three functions share one parameter list: three int sizes, three
// const int* index arrays, two const int* operand matrices, an int* output and
// vec_length. Each returns a cudaError_t.
//
// NOTE(review): the _4b/_8b/_16b suffixes presumably name the operand bit
// width, with low-precision values packed into the int words of lhs_matrix and
// rhs_matrix - confirm against wmma_sddmm.cu.
namespace sddmm{

cudaError_t wmmaSddmm_4b(int m_vec, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const int* __restrict__ lhs_matrix,
    const int* __restrict__ rhs_matrix,
    int* __restrict__ output_values,
    int vec_length);

cudaError_t wmmaSddmm_8b(int m_vec, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const int* __restrict__ lhs_matrix,
    const int* __restrict__ rhs_matrix,
    int* __restrict__ output_values,
    int vec_length);

cudaError_t wmmaSddmm_16b(int m_vec, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const int* __restrict__ lhs_matrix,
    const int* __restrict__ rhs_matrix,
    int* __restrict__ output_values,
    int vec_length);

} // namespace sddmm
-------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make sddmm_benchmark 3 | -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_basic/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_lhs_pref/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_lhs_pref/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 3 | 4 | 5 | ################################################################## 6 | 7 | ## Project file 
// GEMM wrapper prototypes. Only the declarations are visible in this header.
//
// Every function takes a cuBLAS handle, three int sizes (m, n, k) and three
// pointers of one element type (float or half), and returns a cublasStatus_t.
// Note the argument order: the rhs pointer comes BEFORE the lhs pointer.
//
// NOTE(review): the d_ prefix suggests device pointers and the trailing "T" in
// cublasGeMMT suggests a transposed-operand variant - confirm both against
// cublas_gemm.cu.

// float operands, float output.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// half operands, half output.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);

// float operands, float output ("T" variant).
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// half operands, half output ("T" variant).
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
// Prototypes of the cudaSddmm entry points; the definitions are not part of
// this header.
//
// The three overloads share one parameter list and differ only in the element
// types of the two dense operands and of the output:
//   1. half  lhs/rhs -> float output
//   2. half  lhs/rhs -> half  output
//   3. float lhs/rhs -> float output
// Each takes a cudaStream_t and returns a cudaError_t.
//
// NOTE(review): parameter meanings are inferred from their names only and
// should be confirmed against cuda_sddmm.cu:
//   m_vec, k, n   - presumably problem sizes, with m counted in vectors.
//   nonzeros_vec  - presumably the number of nonzero vectors in the output.
//   row_indices, row_offsets, col_indices
//                 - presumably the sparsity pattern of the output.
//   vec_length    - presumably the number of rows per vector.
namespace sddmm{

cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream) ;

cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_values,
    int vec_length, cudaStream_t stream) ;

cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const float* __restrict__ lhs_matrix,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream) ;

} // namespace sddmm
float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_lhs_pref/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Prototypes of the wmmaSddmm_* entry points; the definitions are not part of
// this header.
//
// The three functions share one parameter list: three int sizes, three
// const int* index arrays, two const int* operand matrices, an int* output and
// vec_length. Each returns a cudaError_t.
//
// NOTE(review): the _4b/_8b/_16b suffixes presumably name the operand bit
// width, with low-precision values packed into the int words of lhs_matrix and
// rhs_matrix - confirm against wmma_sddmm.cu.
namespace sddmm{

cudaError_t wmmaSddmm_4b(int m_vec, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const int* __restrict__ lhs_matrix,
    const int* __restrict__ rhs_matrix,
    int* __restrict__ output_values,
    int vec_length);

cudaError_t wmmaSddmm_8b(int m_vec, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const int* __restrict__ lhs_matrix,
    const int* __restrict__ rhs_matrix,
    int* __restrict__ output_values,
    int vec_length);

cudaError_t wmmaSddmm_16b(int m_vec, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const int* __restrict__ lhs_matrix,
    const int* __restrict__ rhs_matrix,
    int* __restrict__ output_values,
    int vec_length);

} // namespace sddmm
-------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make sddmm_benchmark 3 | -------------------------------------------------------------------------------- /SDDMM/ablation_study/SDDMM_lhs_pref/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SDDMM/ablation_study/compile_jobs.sh: -------------------------------------------------------------------------------- 1 | cd ./SDDMM_basic 2 | chmod 777 setup.sh 3 | ./setup.sh 4 | echo "SDDMM basic is compiled." 5 | 6 | cd - 7 | cd ./SDDMM_lhs_pref 8 | chmod 777 setup.sh 9 | ./setup.sh 10 | echo "SDDMM with LHS prefetch is compiled." 
# Build rules for the sddmm_benchmark and spmm_benchmark binaries.
NVCC = nvcc
NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse


##################################################################

## Project file structure ##

# Source file directory:
SRC_DIR = src

# Object file directory:
OBJ_DIR = bin

# Include header file directory
INC_DIR = include


##################################################################

## Compile ##

sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
	@$(NVCC) $(NVCC_FLAGS) $^ -o $@

spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
	@$(NVCC) $(NVCC_FLAGS) $^ -o $@

# Create the object directory on demand. It is listed below as an order-only
# prerequisite (after the "|") so that a plain `make` works on a fresh checkout
# and so that the directory's timestamp never forces a rebuild.
$(OBJ_DIR):
	@mkdir -p $@

# Compile main file to object file
$(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@


# Compile CUDA source files to object files
$(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@

# `clean` names an action, not a file: declare it phony so that a stray file
# called "clean" cannot make the target look up to date.
.PHONY: clean
clean:
	@rm -f $(OBJ_DIR)/*.o
// Prototypes of the cudaSddmm entry points; the definitions are not part of
// this header.
//
// The three overloads share one parameter list and differ only in the element
// types of the two dense operands and of the output:
//   1. half  lhs/rhs -> float output
//   2. half  lhs/rhs -> half  output
//   3. float lhs/rhs -> float output
// Each takes a cudaStream_t and returns a cudaError_t.
//
// NOTE(review): parameter meanings are inferred from their names only and
// should be confirmed against cuda_sddmm.cu:
//   m_vec, k, n   - presumably problem sizes, with m counted in vectors.
//   nonzeros_vec  - presumably the number of nonzero vectors in the output.
//   row_indices, row_offsets, col_indices
//                 - presumably the sparsity pattern of the output.
//   vec_length    - presumably the number of rows per vector.
namespace sddmm{

cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream) ;

cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_values,
    int vec_length, cudaStream_t stream) ;

cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const float* __restrict__ lhs_matrix,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream) ;

} // namespace sddmm
// Prototypes of the cudaSpmm entry points; the definitions are not part of
// this header.
//
// The three overloads share one parameter list and differ only in the element
// types of the sparse values, the dense rhs matrix and the output:
//   1. half  values/rhs -> float output
//   2. half  values/rhs -> half  output
//   3. float values/rhs -> float output
// Each returns a cudaError_t. Unlike the cudaSddmm prototypes, none of these
// takes a cudaStream_t.
//
// NOTE(review): parameter meanings are inferred from their names only and
// should be confirmed against cuda_spmm.cu:
//   m_vec, vec_length - presumably the row count in vectors and the number of
//                       rows per vector.
//   k, n              - presumably the remaining problem sizes.
//   row_indices, row_offsets, column_indices, values
//                     - presumably the sparse lhs operand.
namespace spmm{

cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_matrix) ;

cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const float* __restrict__ values,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

} // namespace spmm
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/SpMM/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, cudaStream_t stream, int algorithm) ; 
"""Run the ./spmm_benchmark sweep for the 16b8b configuration.

For every combination of dimN, vector length and sparsity level, the script
runs ``./spmm_benchmark`` once per matrix listed in
``./eval_matrices/s<sparsity>.txt``. Matrix paths are resolved against the
directory named by the ``dataset_dir`` environment variable.
"""
import os
import argparse
import subprocess

import numpy as np

SPARSITIES = ['50', '70', '80', '90', '95', '98']
DIM_NS = [128, 256]
VEC_LENS = [2, 4, 8]

# Positional arguments passed unchanged after <matrix> <dimN> <vec_len>.
# NOTE(review): the last two values presumably select the 16-bit / 8-bit
# operand precisions named in this script's file name - confirm against
# spmm_benchmark.cpp.
BENCHMARK_FIXED_ARGS = ['0', '1', '0', '1', '16', '8']


def read_matrix_names(list_path):
    """Return the non-blank lines of *list_path* with whitespace stripped.

    Stripping (rather than dropping the last character) keeps the final entry
    intact when the file has no trailing newline, and tolerates CRLF endings.
    Blank lines are skipped so they never become a bogus matrix path.
    """
    with open(list_path, 'r') as matrix_list:
        stripped = (line.strip() for line in matrix_list)
        return [name for name in stripped if name]


def build_command(matrix, dimN, vec_len):
    """Return the argv list that benchmarks *matrix* at *dimN* and *vec_len*."""
    return ['./spmm_benchmark', matrix, '%d' % dimN, '%d' % vec_len] + BENCHMARK_FIXED_ARGS


def main():
    """Parse the command line and run the whole benchmark sweep."""
    # The sweep itself is hard-coded; the parser only provides --help and
    # rejects unexpected arguments.
    parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
    parser.parse_args()

    dataset_dir = os.environ.get('dataset_dir')
    if not dataset_dir:
        # Without this check every matrix path would silently become 'None/...'.
        parser.error("the 'dataset_dir' environment variable must name the matrix dataset directory")

    # Each list is identical for every (dimN, vec_len) pair, so read it once.
    matrices = {
        sparsity: read_matrix_names('./eval_matrices/s%s.txt' % sparsity)
        for sparsity in SPARSITIES
    }

    for dimN in DIM_NS:
        for vec_len in VEC_LENS:
            for sparsity in SPARSITIES:
                # Flush so the header precedes the child's output even when
                # stdout is redirected to a file.
                print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity, flush=True)
                for name in matrices[sparsity]:
                    matrix = '%s/%s' % (dataset_dir, name)
                    # An argv list (no shell) keeps paths with spaces or shell
                    # metacharacters intact. A failing benchmark run does not
                    # stop the sweep; a missing ./spmm_benchmark binary does.
                    subprocess.run(build_command(matrix, dimN, vec_len), check=False)


if __name__ == '__main__':
    main()
"""Run the ./spmm_benchmark sweep for the 4b4b configuration.

For every combination of dimN, vector length and sparsity level, the script
runs ``./spmm_benchmark`` once per matrix listed in
``./eval_matrices/s<sparsity>.txt``. Matrix paths are resolved against the
directory named by the ``dataset_dir`` environment variable.
"""
import os
import argparse
import subprocess

import numpy as np

SPARSITIES = ['50', '70', '80', '90', '95', '98']
DIM_NS = [128, 256]
VEC_LENS = [2, 4, 8]

# Positional arguments passed unchanged after <matrix> <dimN> <vec_len>.
# NOTE(review): the last two values presumably select the 4-bit / 4-bit
# operand precisions named in this script's file name - confirm against
# spmm_benchmark.cpp.
BENCHMARK_FIXED_ARGS = ['0', '1', '0', '1', '4', '4']


def read_matrix_names(list_path):
    """Return the non-blank lines of *list_path* with whitespace stripped.

    Stripping (rather than dropping the last character) keeps the final entry
    intact when the file has no trailing newline, and tolerates CRLF endings.
    Blank lines are skipped so they never become a bogus matrix path.
    """
    with open(list_path, 'r') as matrix_list:
        stripped = (line.strip() for line in matrix_list)
        return [name for name in stripped if name]


def build_command(matrix, dimN, vec_len):
    """Return the argv list that benchmarks *matrix* at *dimN* and *vec_len*."""
    return ['./spmm_benchmark', matrix, '%d' % dimN, '%d' % vec_len] + BENCHMARK_FIXED_ARGS


def main():
    """Parse the command line and run the whole benchmark sweep."""
    # The sweep itself is hard-coded; the parser only provides --help and
    # rejects unexpected arguments.
    parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
    parser.parse_args()

    dataset_dir = os.environ.get('dataset_dir')
    if not dataset_dir:
        # Without this check every matrix path would silently become 'None/...'.
        parser.error("the 'dataset_dir' environment variable must name the matrix dataset directory")

    # Each list is identical for every (dimN, vec_len) pair, so read it once.
    matrices = {
        sparsity: read_matrix_names('./eval_matrices/s%s.txt' % sparsity)
        for sparsity in SPARSITIES
    }

    for dimN in DIM_NS:
        for vec_len in VEC_LENS:
            for sparsity in SPARSITIES:
                # Flush so the header precedes the child's output even when
                # stdout is redirected to a file.
                print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity, flush=True)
                for name in matrices[sparsity]:
                    matrix = '%s/%s' % (dataset_dir, name)
                    # An argv list (no shell) keeps paths with spaces or shell
                    # metacharacters intact. A failing benchmark run does not
                    # stop the sweep; a missing ./spmm_benchmark binary does.
                    subprocess.run(build_command(matrix, dimN, vec_len), check=False)


if __name__ == '__main__':
    main()
#parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | # Matrix paths listed in ./eval_matrices/s<sparsity>.txt are joined onto $dataset_dir. 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | # Launch ./spmm_benchmark once per (dimN, vec_len, sparsity, matrix) combination. 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | # Read the whole list inside a with-block so the file is closed before the benchmarks start. 25 | with open('./eval_matrices/s%s.txt' % sparsity, 'r') as matrix_list: 26 | lines = matrix_list.readlines() 27 | for line in lines: 28 | #for line in lines[:1]: 29 | matrix = '%s/%s' % (dataset_dir, line.strip()) # strip() keeps the last character when the final entry has no newline 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 1 8 4' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /SpMM/SpMM/launch_spmm_magicube_8b8b.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='launch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities 
= ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | for i in range(len(lines)): 28 | #for i in range(1): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 1 8 8' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /SpMM/SpMM/run_spmm_magicube.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "Tesing spmm_magicube_16b8b" 3 | python launch_spmm_magicube_16b8b.py > spmm_magicube_16b8b.txt 4 | echo "Finish spmm_magicube_16b8b" 5 | 6 | echo "Tesing spmm_magicube_8b8b" 7 | python launch_spmm_magicube_8b8b.py > spmm_magicube_8b8b.txt 8 | echo "Finish spmm_magicube_8b8b" 9 | 10 | echo "Tesing spmm_magicube_8b4b" 11 | python launch_spmm_magicube_8b4b.py > spmm_magicube_8b4b.txt 12 | echo "Finish spmm_magicube_8b4b" 13 | 14 | echo "Tesing spmm_magicube_4b4b" 15 | python launch_spmm_magicube_4b4b.py > spmm_magicube_4b4b.txt 16 | echo "Finish spmm_magicube_4b4b" 17 | -------------------------------------------------------------------------------- /SpMM/SpMM/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/SpMM/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 
1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o 
$(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 
13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 
26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 
37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | echo -e "L16-R8 \n" 5 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 16 8 6 | echo -e "\n" 7 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 16 8 8 | echo -e "\n" 9 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 16 8 10 | echo -e "\n" 11 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 16 8 12 | echo -e "\n" 13 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 
1 : base * StaticPow(base, exponent - 1); 10 | } 11 | // Barrier picks the synchronization primitive for a tile from its compile-time shape. 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | // thread_mask starts as the full warp; tiles narrower than a warp narrow it in the constructor. 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ // sub-warp tile: keep only this tile's lanes (assumes lane = thread_idx_y * width + x -- TODO confirm against the launch configuration) 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | // Sync(): block-wide barrier for tiles wider than 32 threads, masked warp barrier for 2..32 threads, no-op for a single thread. 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8 3 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4 4 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 6 | nsys profile --force-overwrite true -t cuda -o spmm_report ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4 7 | -------------------------------------------------------------------------------- 
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* 
d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace 
sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int 
vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | echo -e "L16-R8 \n" 5 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 16 8 6 | echo -e "\n" 7 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 16 8 8 | echo -e "\n" 9 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 16 8 10 | echo -e "\n" 11 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 16 8 12 | echo -e "\n" 13 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr 
uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | // Barrier picks the synchronization primitive for a tile from its compile-time shape. 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | // thread_mask starts as the full warp; tiles narrower than a warp narrow it in the constructor. 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ // sub-warp tile: keep only this tile's lanes (assumes lane = thread_idx_y * width + x -- TODO confirm against the launch configuration) 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | // Sync(): block-wide barrier for tiles wider than 32 threads, masked warp barrier for 2..32 threads, no-op for a single thread. 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8 3 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4 4 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 6 | nsys profile --force-overwrite true -t cuda -o spmm_report ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4 7 | 
-------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t 
cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int 
vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, 
cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | 5 | echo -e "L4-R4 \n" 6 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 4 4 7 | echo -e "\n" 8 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 9 | echo -e "\n" 10 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 4 4 11 | echo -e "\n" 12 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 13 | echo -e "\n" 14 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t 
exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8 3 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4 4 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 6 | nsys profile --force-overwrite true -t cuda -o spmm_report ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4 7 | 
-------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | 
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ 
output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int 
vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | 5 | echo -e "L4-R4 \n" 6 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 4 4 7 | echo -e "\n" 8 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 9 | echo -e "\n" 10 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 4 4 11 | echo -e "\n" 12 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 13 | echo -e "\n" 14 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t 
StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8 3 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4 4 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 6 | nsys profile --force-overwrite true -t cuda -o spmm_report ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4 7 | 
-------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include 
"cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ 
rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int 
vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | 5 | echo -e "L4-R4 \n" 6 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 4 4 7 | echo -e "\n" 8 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 9 | echo -e "\n" 10 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 4 4 11 | echo -e "\n" 12 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 13 | echo -e "\n" 14 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | 
__device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile < 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8 3 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4 4 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 6 | nsys profile --force-overwrite true -t cuda -o spmm_report ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 
8 0 1 1 1 8 4 7 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b4b/SpMM_conflict_free/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b4b/SpMM_conflict_free/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b4b/SpMM_conflict_free/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t 
// Declarations of the GEMM helpers; their definitions are not in this header.
// Every function takes a cuBLAS handle, three integer sizes (m, n, k) and
// three matrix pointers, and returns a cublasStatus_t. cublasGeMM and
// cublasGeMMT are each overloaded for float and for half elements.
// NOTE(review): the pointers are ordered rhs first, then lhs, then output.
// Presumably the `d_` prefix marks device memory and cublasGeMMT is a
// transposed variant - confirm both against the definitions.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// Same parameter list as above, half elements.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);

// cublasGeMMT, float elements.
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// cublasGeMMT, half elements.
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
namespace spmm{

// cudaSpmm is declared in three overloads; the definitions are not part of
// this header. Every overload returns a cudaError_t and takes the same
// parameter list, differing only in element types:
//   m_vec, vec_length, k, n  - integer sizes (presumably m_vec counts rows in
//                              units of vec_length-long vectors - TODO confirm
//                              against the kernel)
//   row_indices, row_offsets, column_indices
//                            - read-only int arrays (presumably a CSR-like
//                              sparse layout - verify against the caller)
//   values, rhs_matrix       - read-only inputs
//   output_matrix            - writable output
// All pointers are __restrict__-qualified: the caller promises that they do
// not alias one another.

// half values and rhs_matrix, float output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

// half values and rhs_matrix, half output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_matrix) ;

// float values and rhs_matrix, float output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const float* __restrict__ values,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

} // namespace spmm
namespace sddmm{

// wmmaSddmm is declared in three overloads; the definitions are not part of
// this header. Every overload returns a cudaError_t and takes the same
// parameter list, differing only in element types:
//   m_vec, k, n, nonzeros_vec - integer sizes (presumably counted in units of
//                               vec_length-long vectors - TODO confirm)
//   row_indices, row_offsets, col_indices
//                             - read-only int arrays (presumably the sparsity
//                               pattern of the output - verify against caller)
//   lhs_matrix, rhs_matrix    - read-only inputs
//   output_values             - writable output
//   vec_length                - integer vector length
//   stream                    - CUDA stream handle
//   algorithm                 - integer selector; its accepted values are not
//                               visible from this header
// All pointers are __restrict__-qualified: the caller promises that they do
// not alias one another.

// half lhs_matrix and rhs_matrix, float output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;


// half lhs_matrix and rhs_matrix, half output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;

// float lhs_matrix and rhs_matrix, float output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const float* __restrict__ lhs_matrix,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;

} // namespace sddmm
// Barrier for the group of BlockWidth threads selected by thread_idx_y.
//
// Sync() picks the cheapest primitive that covers the group:
//   BlockWidth > 32      -> __syncthreads()
//   1 < BlockWidth <= 32 -> __syncwarp(thread_mask)
//   BlockWidth == 1      -> nothing to synchronize
//
// NOTE(review): the template parameter list was lost when this text was
// extracted; `<int Tile_M, int BlockWidth>` is reconstructed from the names
// used in the body - confirm the parameter order against the call sites.
template <int Tile_M, int BlockWidth>
struct Barrier{
    // Product of the two template parameters; not used inside this struct.
    static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
    // Number of threads that synchronize together in Sync().
    static constexpr int kThreadsPerOutputTile = BlockWidth;
    // Lane mask handed to __syncwarp(); all 32 lanes unless narrowed below.
    uint32_t thread_mask = 0xffffffff;

    // thread_idx_y: index of this thread's group; it selects which run of
    // kThreadsPerOutputTile consecutive lanes ends up in the mask.
    __device__ __forceinline__ Barrier(int thread_idx_y){
        // FIX: the second comparison used to be `< 1`, which can never hold
        // for a positive width, so the sub-warp mask was dead code and every
        // sub-warp group synchronized with the full-warp mask. The intended
        // range is 1 < width < 32. For widths of 32 or more nothing changes.
        if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
            constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
            // Wrap the shift to the warp width so a block holding more than
            // one warp never shifts by 32 or more bits (undefined for a
            // 32-bit value).
            // NOTE(review): assumes groups occupy consecutive lanes of a warp
            // (blockDim.x == BlockWidth) and that BlockWidth divides 32 -
            // confirm against the kernel launch configuration.
            thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
        }
    }

    // Synchronizes the threads of this group (see the table above).
    __device__ __forceinline__ void Sync(){
        if (kThreadsPerOutputTile > 32){
            __syncthreads();
        } else if (kThreadsPerOutputTile > 1){
            __syncwarp(thread_mask);
        }
    }
};
# Build rules for the sddmm_benchmark / spmm_benchmark binaries (nvcc).

NVCC = nvcc
NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse


##################################################################

## Project file structure ##

# Source file directory:
SRC_DIR = src

# Object file directory:
OBJ_DIR = bin

# Include header file directory
INC_DIR = include


##################################################################

## Compile ##

sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
	@$(NVCC) $(NVCC_FLAGS) $^ -o $@

spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
	@$(NVCC) $(NVCC_FLAGS) $^ -o $@

# Compile main file to object file.
# $(OBJ_DIR) is an order-only prerequisite: it has to exist before the
# compiler writes into it, but its timestamp never triggers a rebuild.
$(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@


# Compile CUDA source files to object files
$(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@

# Create the object directory on demand, so `make <target>` also works when
# setup.sh (which runs `mkdir -p ./bin`) has not been executed first.
$(OBJ_DIR):
	@mkdir -p $@

# `clean` names an action, not a file: keep it runnable even if a file called
# `clean` ever appears next to this Makefile.
.PHONY: clean
clean:
	@rm -f $(OBJ_DIR)/*.o
// Declarations of the GEMM helpers; their definitions are not in this header.
// Every function takes a cuBLAS handle, three integer sizes (m, n, k) and
// three matrix pointers, and returns a cublasStatus_t. cublasGeMM and
// cublasGeMMT are each overloaded for float and for half elements.
// NOTE(review): the pointers are ordered rhs first, then lhs, then output.
// Presumably the `d_` prefix marks device memory and cublasGeMMT is a
// transposed variant - confirm both against the definitions.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// Same parameter list as above, half elements.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);

// cublasGeMMT, float elements.
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// cublasGeMMT, half elements.
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
namespace spmm{

// cudaSpmm is declared in three overloads; the definitions are not part of
// this header. Every overload returns a cudaError_t and takes the same
// parameter list, differing only in element types:
//   m_vec, vec_length, k, n  - integer sizes (presumably m_vec counts rows in
//                              units of vec_length-long vectors - TODO confirm
//                              against the kernel)
//   row_indices, row_offsets, column_indices
//                            - read-only int arrays (presumably a CSR-like
//                              sparse layout - verify against the caller)
//   values, rhs_matrix       - read-only inputs
//   output_matrix            - writable output
// All pointers are __restrict__-qualified: the caller promises that they do
// not alias one another.

// half values and rhs_matrix, float output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

// half values and rhs_matrix, half output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_matrix) ;

// float values and rhs_matrix, float output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const float* __restrict__ values,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

} // namespace spmm
namespace sddmm{

// wmmaSddmm is declared in three overloads; the definitions are not part of
// this header. Every overload returns a cudaError_t and takes the same
// parameter list, differing only in element types:
//   m_vec, k, n, nonzeros_vec - integer sizes (presumably counted in units of
//                               vec_length-long vectors - TODO confirm)
//   row_indices, row_offsets, col_indices
//                             - read-only int arrays (presumably the sparsity
//                               pattern of the output - verify against caller)
//   lhs_matrix, rhs_matrix    - read-only inputs
//   output_values             - writable output
//   vec_length                - integer vector length
//   stream                    - CUDA stream handle
//   algorithm                 - integer selector; its accepted values are not
//                               visible from this header
// All pointers are __restrict__-qualified: the caller promises that they do
// not alias one another.

// half lhs_matrix and rhs_matrix, float output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;


// half lhs_matrix and rhs_matrix, half output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;

// float lhs_matrix and rhs_matrix, float output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const float* __restrict__ lhs_matrix,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;

} // namespace sddmm
// Barrier for the group of BlockWidth threads selected by thread_idx_y.
//
// Sync() picks the cheapest primitive that covers the group:
//   BlockWidth > 32      -> __syncthreads()
//   1 < BlockWidth <= 32 -> __syncwarp(thread_mask)
//   BlockWidth == 1      -> nothing to synchronize
//
// NOTE(review): the template parameter list was lost when this text was
// extracted; `<int Tile_M, int BlockWidth>` is reconstructed from the names
// used in the body - confirm the parameter order against the call sites.
template <int Tile_M, int BlockWidth>
struct Barrier{
    // Product of the two template parameters; not used inside this struct.
    static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
    // Number of threads that synchronize together in Sync().
    static constexpr int kThreadsPerOutputTile = BlockWidth;
    // Lane mask handed to __syncwarp(); all 32 lanes unless narrowed below.
    uint32_t thread_mask = 0xffffffff;

    // thread_idx_y: index of this thread's group; it selects which run of
    // kThreadsPerOutputTile consecutive lanes ends up in the mask.
    __device__ __forceinline__ Barrier(int thread_idx_y){
        // FIX: the second comparison used to be `< 1`, which can never hold
        // for a positive width, so the sub-warp mask was dead code and every
        // sub-warp group synchronized with the full-warp mask. The intended
        // range is 1 < width < 32. For widths of 32 or more nothing changes.
        if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
            constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
            // Wrap the shift to the warp width so a block holding more than
            // one warp never shifts by 32 or more bits (undefined for a
            // 32-bit value).
            // NOTE(review): assumes groups occupy consecutive lanes of a warp
            // (blockDim.x == BlockWidth) and that BlockWidth divides 32 -
            // confirm against the kernel launch configuration.
            thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
        }
    }

    // Synchronizes the threads of this group (see the table above).
    __device__ __forceinline__ void Sync(){
        if (kThreadsPerOutputTile > 32){
            __syncthreads();
        } else if (kThreadsPerOutputTile > 1){
            __syncwarp(thread_mask);
        }
    }
};
# Build rules for the sddmm_benchmark / spmm_benchmark binaries (nvcc).

NVCC = nvcc
NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse


##################################################################

## Project file structure ##

# Source file directory:
SRC_DIR = src

# Object file directory:
OBJ_DIR = bin

# Include header file directory
INC_DIR = include


##################################################################

## Compile ##

sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
	@$(NVCC) $(NVCC_FLAGS) $^ -o $@

spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
	@$(NVCC) $(NVCC_FLAGS) $^ -o $@

# Compile main file to object file.
# $(OBJ_DIR) is an order-only prerequisite: it has to exist before the
# compiler writes into it, but its timestamp never triggers a rebuild.
$(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@


# Compile CUDA source files to object files
$(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@

# Create the object directory on demand, so `make <target>` also works when
# setup.sh (which runs `mkdir -p ./bin`) has not been executed first.
$(OBJ_DIR):
	@mkdir -p $@

# `clean` names an action, not a file: keep it runnable even if a file called
# `clean` ever appears next to this Makefile.
.PHONY: clean
clean:
	@rm -f $(OBJ_DIR)/*.o
// Declarations of the GEMM helpers; their definitions are not in this header.
// Every function takes a cuBLAS handle, three integer sizes (m, n, k) and
// three matrix pointers, and returns a cublasStatus_t. cublasGeMM and
// cublasGeMMT are each overloaded for float and for half elements.
// NOTE(review): the pointers are ordered rhs first, then lhs, then output.
// Presumably the `d_` prefix marks device memory and cublasGeMMT is a
// transposed variant - confirm both against the definitions.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// Same parameter list as above, half elements.
cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);

// cublasGeMMT, float elements.
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);

// cublasGeMMT, half elements.
cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k,
    half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
namespace spmm{

// cudaSpmm is declared in three overloads; the definitions are not part of
// this header. Every overload returns a cudaError_t and takes the same
// parameter list, differing only in element types:
//   m_vec, vec_length, k, n  - integer sizes (presumably m_vec counts rows in
//                              units of vec_length-long vectors - TODO confirm
//                              against the kernel)
//   row_indices, row_offsets, column_indices
//                            - read-only int arrays (presumably a CSR-like
//                              sparse layout - verify against the caller)
//   values, rhs_matrix       - read-only inputs
//   output_matrix            - writable output
// All pointers are __restrict__-qualified: the caller promises that they do
// not alias one another.

// half values and rhs_matrix, float output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

// half values and rhs_matrix, half output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const half* __restrict__ values,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_matrix) ;

// float values and rhs_matrix, float output_matrix.
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ column_indices,
    const float* __restrict__ values,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_matrix) ;

} // namespace spmm
namespace sddmm{

// wmmaSddmm is declared in three overloads; the definitions are not part of
// this header. Every overload returns a cudaError_t and takes the same
// parameter list, differing only in element types:
//   m_vec, k, n, nonzeros_vec - integer sizes (presumably counted in units of
//                               vec_length-long vectors - TODO confirm)
//   row_indices, row_offsets, col_indices
//                             - read-only int arrays (presumably the sparsity
//                               pattern of the output - verify against caller)
//   lhs_matrix, rhs_matrix    - read-only inputs
//   output_values             - writable output
//   vec_length                - integer vector length
//   stream                    - CUDA stream handle
//   algorithm                 - integer selector; its accepted values are not
//                               visible from this header
// All pointers are __restrict__-qualified: the caller promises that they do
// not alias one another.

// half lhs_matrix and rhs_matrix, float output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;


// half lhs_matrix and rhs_matrix, half output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const half* __restrict__ lhs_matrix,
    const half* __restrict__ rhs_matrix,
    half* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;

// float lhs_matrix and rhs_matrix, float output_values.
cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
    const int* __restrict__ row_indices,
    const int* __restrict__ row_offsets,
    const int* __restrict__ col_indices,
    const float* __restrict__ lhs_matrix,
    const float* __restrict__ rhs_matrix,
    float* __restrict__ output_values,
    int vec_length, cudaStream_t stream, int algorithm) ;

} // namespace sddmm
// Compile-time integer power: returns base raised to exponent.
// Kept as a single recursive return expression so it remains a valid
// constexpr function under -std=c++11 (the standard the Makefiles select).
__device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
    return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
}

// Barrier for the group of BlockWidth threads selected by thread_idx_y.
//
// Sync() picks the cheapest primitive that covers the group:
//   BlockWidth > 32      -> __syncthreads()
//   1 < BlockWidth <= 32 -> __syncwarp(thread_mask)
//   BlockWidth == 1      -> nothing to synchronize
//
// NOTE(review): the template parameter list was lost when this text was
// extracted; `<int Tile_M, int BlockWidth>` is reconstructed from the names
// used in the body - confirm the parameter order against the call sites.
template <int Tile_M, int BlockWidth>
struct Barrier{
    // Product of the two template parameters; not used inside this struct.
    static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
    // Number of threads that synchronize together in Sync().
    static constexpr int kThreadsPerOutputTile = BlockWidth;
    // Lane mask handed to __syncwarp(); all 32 lanes unless narrowed below.
    uint32_t thread_mask = 0xffffffff;

    // thread_idx_y: index of this thread's group; it selects which run of
    // kThreadsPerOutputTile consecutive lanes ends up in the mask.
    __device__ __forceinline__ Barrier(int thread_idx_y){
        // FIX: the second comparison used to be `< 1`, which can never hold
        // for a positive width, so the sub-warp mask was dead code and every
        // sub-warp group synchronized with the full-warp mask. The intended
        // range is 1 < width < 32. For widths of 32 or more nothing changes.
        if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
            constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
            // Wrap the shift to the warp width so a block holding more than
            // one warp never shifts by 32 or more bits (undefined for a
            // 32-bit value).
            // NOTE(review): assumes groups occupy consecutive lanes of a warp
            // (blockDim.x == BlockWidth) and that BlockWidth divides 32 -
            // confirm against the kernel launch configuration.
            thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
        }
    }

    // Synchronizes the threads of this group (see the table above).
    __device__ __forceinline__ void Sync(){
        if (kThreadsPerOutputTile > 32){
            __syncthreads();
        } else if (kThreadsPerOutputTile > 1){
            __syncwarp(thread_mask);
        }
    }
};
8 0 1 1 1 8 4 7 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t 
cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int 
vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, 
cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | echo -e "L8-R8 \n" 5 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 8 8 6 | echo -e "\n" 7 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 8 8 | echo -e "\n" 9 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 8 8 10 | echo -e "\n" 11 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 8 12 | echo -e "\n" 13 | 14 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 |
// Compile-time integer power: returns base^exponent by recursion.
// Arithmetic is unsigned 32-bit, so the result wraps modulo 2^32.
__device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
    return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
}

// Synchronization helper for the threads that cooperate on one output tile.
//
// NOTE(review): the template parameter list did not survive in this listing;
// `int Tile_M, int BlockWidth` is inferred from how the two names are used
// below -- confirm against the original header.
template <int Tile_M, int BlockWidth>
struct Barrier {
    // Total number of threads in the block (Tile_M rows of BlockWidth threads).
    static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
    // Number of threads that must synchronize with each other.
    static constexpr int kThreadsPerOutputTile = BlockWidth;
    // Mask handed to __syncwarp(); defaults to all 32 lanes.
    uint32_t thread_mask = 0xffffffff;

    // Builds the lane mask for the sub-warp selected by thread_idx_y.
    //
    // The mask is only narrowed when the group is a proper sub-warp
    // (1 < width < 32). The previous test was `< 32 && < 1`, which no
    // positive width satisfies, so the mask was never narrowed and Sync()
    // always waited on the full warp. The lower bound now matches the `> 1`
    // threshold used in Sync().
    __device__ __forceinline__ Barrier(int thread_idx_y) {
        if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)) {
            // kThreadsPerOutputTile consecutive low bits set.
            constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
            // NOTE(review): assumes thread_idx_y * kThreadsPerOutputTile < 32,
            // i.e. every sub-warp of the block lives in a single warp; a shift
            // of 32 or more is undefined -- confirm against the launch config.
            thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile);
        }
    }

    // Synchronizes the cooperating threads:
    //   width > 32      -> block-wide __syncthreads()
    //   1 < width <= 32 -> __syncwarp() on thread_mask
    //   width <= 1      -> nothing to synchronize
    __device__ __forceinline__ void Sync() {
        if (kThreadsPerOutputTile > 32) {
            __syncthreads();
        } else if (kThreadsPerOutputTile > 1) {
            __syncwarp(thread_mask);
        }
    }
};
33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | ################################################################## 20 | 21 | ## Compile ## 22 | 23 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 24 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 25 | 26 |
spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 27 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 28 | 29 | # Compile main file to object file 30 | $(OBJ_DIR)/%.o : %.cpp 31 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 32 | 33 | 34 | # Compile CUDA source files to object files 35 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 36 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 37 | 38 | clean: 39 | @rm -f $(OBJ_DIR)/*.o 40 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* 
__restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const 
int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace 
sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/run_jobs.sh: -------------------------------------------------------------------------------- 1 | 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n" 3 | 4 | echo -e "L8-R8 \n" 5 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 8 8 6 | echo -e "\n" 7 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 8 8 | echo -e "\n" 9 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 8 8 10 | echo -e "\n" 11 | ./spmm_benchmark ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 8 12 | echo -e "\n" 13 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 |
// Compile-time integer power: returns base^exponent by recursion.
// Arithmetic is unsigned 32-bit, so the result wraps modulo 2^32.
__device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
    return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
}

// Synchronization helper for the threads that cooperate on one output tile.
//
// NOTE(review): the template parameter list did not survive in this listing;
// `int Tile_M, int BlockWidth` is inferred from how the two names are used
// below -- confirm against the original header.
template <int Tile_M, int BlockWidth>
struct Barrier {
    // Total number of threads in the block (Tile_M rows of BlockWidth threads).
    static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
    // Number of threads that must synchronize with each other.
    static constexpr int kThreadsPerOutputTile = BlockWidth;
    // Mask handed to __syncwarp(); defaults to all 32 lanes.
    uint32_t thread_mask = 0xffffffff;

    // Builds the lane mask for the sub-warp selected by thread_idx_y.
    //
    // The mask is only narrowed when the group is a proper sub-warp
    // (1 < width < 32). The previous test was `< 32 && < 1`, which no
    // positive width satisfies, so the mask was never narrowed and Sync()
    // always waited on the full warp. The lower bound now matches the `> 1`
    // threshold used in Sync().
    __device__ __forceinline__ Barrier(int thread_idx_y) {
        if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)) {
            // kThreadsPerOutputTile consecutive low bits set.
            constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
            // NOTE(review): assumes thread_idx_y * kThreadsPerOutputTile < 32,
            // i.e. every sub-warp of the block lives in a single warp; a shift
            // of 32 or more is undefined -- confirm against the launch config.
            thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile);
        }
    }

    // Synchronizes the cooperating threads:
    //   width > 32      -> block-wide __syncthreads()
    //   1 < width <= 32 -> __syncwarp() on thread_mask
    //   width <= 1      -> nothing to synchronize
    __device__ __forceinline__ void Sync() {
        if (kThreadsPerOutputTile > 32) {
            __syncthreads();
        } else if (kThreadsPerOutputTile > 1) {
            __syncwarp(thread_mask);
        }
    }
};
33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | ################################################################## 20 | 21 | ## Compile ## 22 | 23 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 24 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 25 | 26 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o
$(OBJ_DIR)/cublas_gemm.o 27 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 28 | 29 | # Compile main file to object file 30 | $(OBJ_DIR)/%.o : %.cpp 31 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 32 | 33 | 34 | # Compile CUDA source files to object files 35 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 36 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 37 | 38 | clean: 39 | @rm -f $(OBJ_DIR)/*.o 40 | -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t 
stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | 
const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* 
__restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make spmm_benchmark 3 | -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 
1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template // NOTE(review): the template parameter list appears to have been lost in extraction; the body uses Tile_M and BlockWidth 13 | struct Barrier{ // Synchronization helper for the BlockWidth threads that share one output tile 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; // default mask: all 32 bits set 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ // fixed: was `< 1`, which can never hold for a positive width, so the mask below was never built 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; // kThreadsPerOutputTile low bits set 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); // move the group of bits to the slot selected by thread_idx_y 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /SpMM/ablation_study/SpMM_basic/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8 3 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4 4 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4 6 | nsys profile --force-overwrite true -t cuda -o spmm_report ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4 7 | -------------------------------------------------------------------------------- 
/SpMM/ablation_study/compile_jobs.sh: -------------------------------------------------------------------------------- 1 | cd ./SpMM_basic 2 | chmod 777 setup.sh 3 | ./setup.sh 4 | echo "SpMM basic compiled." 5 | 6 | cd - 7 | cd ./16b8b/SpMM_conflict_free 8 | chmod 777 setup.sh 9 | ./setup.sh 10 | 11 | cd - 12 | cd ./8b8b/SpMM_conflict_free 13 | chmod 777 setup.sh 14 | ./setup.sh 15 | 16 | cd - 17 | cd ./8b4b/SpMM_conflict_free 18 | chmod 777 setup.sh 19 | ./setup.sh 20 | 21 | cd - 22 | cd ./4b4b/SpMM_conflict_free 23 | chmod 777 setup.sh 24 | ./setup.sh 25 | echo "SpMM with conflict-free SM compiled." 26 | 27 | cd - 28 | cd ./16b8b/SpMM_conflict_free_prefetch 29 | chmod 777 setup.sh 30 | ./setup.sh 31 | 32 | cd - 33 | cd ./8b8b/SpMM_conflict_free_prefetch 34 | chmod 777 setup.sh 35 | ./setup.sh 36 | 37 | cd - 38 | cd ./8b4b/SpMM_conflict_free_prefetch 39 | chmod 777 setup.sh 40 | ./setup.sh 41 | 42 | cd - 43 | cd ./4b4b/SpMM_conflict_free_prefetch 44 | chmod 777 setup.sh 45 | ./setup.sh 46 | echo "SpMM with conflict-free SM + prefetch compiled." 47 | 48 | cd - 49 | cd ./8b4b/SpMM_conflict_free_prefetch_shuffle 50 | chmod 777 setup.sh 51 | ./setup.sh 52 | 53 | cd - 54 | cd ./4b4b/SpMM_conflict_free_prefetch_shuffle 55 | chmod 777 setup.sh 56 | ./setup.sh 57 | echo "SpMM with conflict-free SM + prefetch + shuffle compiled." 58 | -------------------------------------------------------------------------------- /SpMM/ablation_study/spmm_ablation_study.sh: -------------------------------------------------------------------------------- 1 | cd ./SpMM_basic 2 | chmod 777 run_jobs.sh 3 | ./run_jobs.sh 4 | echo "SpMM basic tested." 
5 | 6 | cd - 7 | cd ./16b8b/SpMM_conflict_free 8 | chmod 777 run_jobs.sh 9 | ./run_jobs.sh 10 | 11 | cd - 12 | cd ./8b8b/SpMM_conflict_free 13 | chmod 777 run_jobs.sh 14 | ./run_jobs.sh 15 | 16 | cd - 17 | cd ./8b4b/SpMM_conflict_free 18 | chmod 777 run_jobs.sh 19 | ./run_jobs.sh 20 | 21 | cd - 22 | cd ./4b4b/SpMM_conflict_free 23 | chmod 777 run_jobs.sh 24 | ./run_jobs.sh 25 | echo "SpMM with conflict-free SM tested." 26 | 27 | cd - 28 | cd ./16b8b/SpMM_conflict_free_prefetch 29 | chmod 777 run_jobs.sh 30 | ./run_jobs.sh 31 | 32 | cd - 33 | cd ./8b8b/SpMM_conflict_free_prefetch 34 | chmod 777 run_jobs.sh 35 | ./run_jobs.sh 36 | 37 | cd - 38 | cd ./8b4b/SpMM_conflict_free_prefetch 39 | chmod 777 run_jobs.sh 40 | ./run_jobs.sh 41 | 42 | cd - 43 | cd ./4b4b/SpMM_conflict_free_prefetch 44 | chmod 777 run_jobs.sh 45 | ./run_jobs.sh 46 | echo "SpMM with conflict-free SM + prefetch tested." 47 | 48 | cd - 49 | cd ./8b4b/SpMM_conflict_free_prefetch_shuffle 50 | chmod 777 run_jobs.sh 51 | ./run_jobs.sh 52 | 53 | cd - 54 | cd ./4b4b/SpMM_conflict_free_prefetch_shuffle 55 | chmod 777 run_jobs.sh 56 | ./run_jobs.sh 57 | echo "SpMM with conflict-free SM + prefetch + shuffle tested." 
58 | -------------------------------------------------------------------------------- /baselines/Makefile: -------------------------------------------------------------------------------- 1 | 2 | NVCC = nvcc 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 4 | 5 | 6 | ################################################################## 7 | 8 | ## Project file structure ## 9 | 10 | # Source file directory: 11 | SRC_DIR = src 12 | 13 | # Object file directory: 14 | OBJ_DIR = bin 15 | 16 | # Include header file directory 17 | INC_DIR = include 18 | 19 | 20 | ################################################################## 21 | 22 | ## Compile ## 23 | 24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o 25 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 26 | 27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o 28 | @$(NVCC) $(NVCC_FLAGS) $^ -o $@ 29 | 30 | # Compile main file to object file 31 | $(OBJ_DIR)/%.o : %.cpp 32 | @$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 33 | 34 | 35 | # Compile CUDA source files to object files 36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh 37 | @$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ 38 | 39 | clean: 40 | @rm -f $(OBJ_DIR)/*.o 41 | -------------------------------------------------------------------------------- /baselines/include/cublas_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef CUBLAS_GEMM_H 2 | #define CUBLAS_GEMM_H 3 | #include 4 | #include "cuda_fp16.h" 5 | 6 | 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 8 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 9 | 10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 11 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 12 | 13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int 
k, 14 | float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix); 15 | 16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 17 | half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix); 18 | 19 | #endif -------------------------------------------------------------------------------- /baselines/include/cuda_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SDDMM_H 3 | #define CUDA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream) ; 15 | 16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 17 | const int* __restrict__ row_indices, 18 | const int* __restrict__ row_offsets, 19 | const int* __restrict__ col_indices, 20 | const half* __restrict__ lhs_matrix, 21 | const half* __restrict__ rhs_matrix, 22 | half* __restrict__ output_values, 23 | int vec_length, cudaStream_t stream) ; 24 | 25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec, 26 | const int* __restrict__ row_indices, 27 | const int* __restrict__ row_offsets, 28 | const int* __restrict__ col_indices, 29 | const float* __restrict__ lhs_matrix, 30 | const float* __restrict__ rhs_matrix, 31 | float* __restrict__ output_values, 32 | int vec_length, cudaStream_t stream) ; 33 | 34 | } // namespace sddmm 35 | 36 | #endif -------------------------------------------------------------------------------- /baselines/include/cuda_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef CUDA_SPMM_H 3 | #define CUDA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | 
cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /baselines/include/sputnik.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 The Sputnik Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 17 | 18 | #include "sputnik/bias_relu/bias_relu.h" 19 | #include "sputnik/depthwise/cuda_depthwise.h" 20 | #include "sputnik/sddmm/cuda_sddmm.h" 21 | #include "sputnik/softmax/softmax.h" 22 | #include "sputnik/softmax/sparse_softmax.h" 23 | #include "sputnik/spmm/cuda_spmm.h" 24 | #include "sputnik/utils/index_format.h" 25 | 26 | 27 | #endif // THIRD_PARTY_SPUTNIK_SPUTNIK_H_ 28 | -------------------------------------------------------------------------------- /baselines/include/wmma_sddmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SDDMM_H 3 | #define WMMA_SDDMM_H 4 | 5 | namespace sddmm{ 6 | 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ col_indices, 11 | const half* __restrict__ lhs_matrix, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_values, 14 | int vec_length, cudaStream_t stream, int algorithm) ; 15 | 16 | 17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 18 | const int* __restrict__ row_indices, 19 | const int* __restrict__ row_offsets, 20 | const int* __restrict__ col_indices, 21 | const half* __restrict__ lhs_matrix, 22 | const half* __restrict__ rhs_matrix, 23 | half* __restrict__ output_values, 24 | int vec_length, cudaStream_t stream, int algorithm) ; 25 | 26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec, 27 | const int* __restrict__ row_indices, 28 | const int* __restrict__ row_offsets, 29 | const int* __restrict__ col_indices, 30 | const float* __restrict__ lhs_matrix, 31 | const float* __restrict__ rhs_matrix, 32 | float* __restrict__ output_values, 33 | int vec_length, cudaStream_t stream, int algorithm) ; 34 | 35 | } // namespace sddmm 36 | 37 | #endif 
-------------------------------------------------------------------------------- /baselines/include/wmma_spmm.cuh: -------------------------------------------------------------------------------- 1 | #include "cuda_fp16.h" 2 | #ifndef WMMA_SPMM_H 3 | #define WMMA_SPMM_H 4 | 5 | namespace spmm{ 6 | 7 | cudaError_t wmmaSpmm(int m_vec, int vec_length, int k, int n, 8 | const int* __restrict__ row_indices, 9 | const int* __restrict__ row_offsets, 10 | const int* __restrict__ column_indices, 11 | const half* __restrict__ values, 12 | const half* __restrict__ rhs_matrix, 13 | float* __restrict__ output_matrix) ; 14 | 15 | cudaError_t wmmaSpmm(int m_vec, int vec_length, int k, int n, 16 | const int* __restrict__ row_indices, 17 | const int* __restrict__ row_offsets, 18 | const int* __restrict__ column_indices, 19 | const half* __restrict__ values, 20 | const half* __restrict__ rhs_matrix, 21 | half* __restrict__ output_matrix) ; 22 | 23 | cudaError_t wmmaSpmm(int m_vec, int vec_length, int k, int n, 24 | const int* __restrict__ row_indices, 25 | const int* __restrict__ row_offsets, 26 | const int* __restrict__ column_indices, 27 | const float* __restrict__ values, 28 | const float* __restrict__ rhs_matrix, 29 | float* __restrict__ output_matrix) ; 30 | 31 | } // namespace spmm 32 | 33 | #endif -------------------------------------------------------------------------------- /baselines/launch_sddmm_cublas_fp16.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | 
#parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimKs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | 20 | for dimK in dimKs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | #for i in range(1): 28 | for i in range(len(lines)): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './sddmm_benchmark %s %d %d 0 2 1 0 0 1' % (matrix, dimK, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /baselines/launch_spmm_cublas_fp16.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, 
"vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | for i in range(len(lines)): 28 | #for i in range(1): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 -2 1' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /baselines/launch_spmm_cublas_int8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | for i in range(len(lines)): 28 | #for i in range(1): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 -1 1' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- 
/baselines/launch_spmm_cusparse_fp16.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | for i in range(len(lines)): 28 | #for i in range(1): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 2 1' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /baselines/launch_spmm_cusparse_int8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | 
#parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 4, 8] 19 | 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | for i in range(len(lines)): 28 | #for i in range(1): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 3 1' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /baselines/launch_spmm_vectorSparse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | # Args 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks') 7 | 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark") 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length") 10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix') 11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A") 12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B") 13 | args = parser.parse_args() 14 | 15 | dataset_dir = os.environ.get('dataset_dir') 16 | sparsities = ['50', '70', '80', '90', '95', '98'] 17 | dimNs = [128, 256] 18 | vec_lens = [2, 
4, 8] 19 | 20 | for dimN in dimNs: 21 | for vec_len in vec_lens: 22 | for sparsity in sparsities: 23 | print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity) 24 | 25 | matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r') 26 | lines = matrix_list.readlines() 27 | for i in range(len(lines)): 28 | #for i in range(1): 29 | matrix = '%s/%s' % (dataset_dir, lines[i][:-1]) 30 | cmd = './spmm_benchmark %s %d %d 0 1 0 1 1' % (matrix, dimN, vec_len) 31 | os.system(cmd) 32 | 33 | -------------------------------------------------------------------------------- /baselines/run_sddmm_baselines.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "Tesing sddmm_cublas_fp16" 3 | python launch_sddmm_cublas_fp16.py > sddmm_cublas_fp16.txt 4 | echo "Finish sddmm_cublas_fp16" 5 | 6 | echo "Tesing sddmm_cublas_int8" 7 | python launch_sddmm_cublas_int8.py > sddmm_cublas_int8.txt 8 | echo "Finish sddmm_cublas_int8" 9 | 10 | echo "Tesing sddmm_vectorSparse" 11 | python launch_sddmm_vectorSparse.py > sddmm_vectorSparse.txt 12 | echo "Finish sddmm_vectorSparse" 13 | -------------------------------------------------------------------------------- /baselines/run_spmm_baselines.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "Tesing spmm_cublas_fp16" 3 | python launch_spmm_cublas_fp16.py > spmm_cublas_fp16.txt 4 | echo "Finish spmm_cublas_fp16" 5 | 6 | echo "Tesing spmm_cublas_int8" 7 | python launch_spmm_cublas_int8.py > spmm_cublas_int8.txt 8 | echo "Finish spmm_cublas_int8" 9 | 10 | echo "Tesing spmm_vectorSparse" 11 | python launch_spmm_vectorSparse.py > spmm_vectorSparse.txt 12 | echo "Finish spmm_vectorSparse" 13 | 14 | echo "Tesing spmm_cusparse_fp16" 15 | python launch_spmm_cusparse_fp16.py > spmm_cusparse_fp16.txt 16 | echo "Finish spmm_cusparse_fp16" 17 | 18 | echo "Tesing spmm_cusparse_int8" 19 | python launch_spmm_cusparse_int8.py > spmm_cusparse_int8.txt 20 | echo 
"Finish spmm_cusparse_int8" 21 | -------------------------------------------------------------------------------- /baselines/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./bin 2 | make sddmm_benchmark 3 | make spmm_benchmark -------------------------------------------------------------------------------- /baselines/src/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include // NOTE(review): the header name appears to have been lost in extraction; uint32_t below needs a fixed-width integer header 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { // compile-time base^exponent by recursion 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | template // NOTE(review): the template parameter list appears to have been lost in extraction; the body uses Tile_M and BlockWidth 13 | struct Barrier{ // Synchronization helper for the BlockWidth threads that share one output tile 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; // default mask: all 32 bits set 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ // fixed: was `< 1`, which can never hold for a positive width, so the mask below was never built 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; // kThreadsPerOutputTile low bits set 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); // move the group of bits to the slot selected by thread_idx_y 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /baselines/usingwmma_run.sh: -------------------------------------------------------------------------------- 1 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 1 2 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 1 3 | 4 | ./sddmm_benchmark 
/users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 1 1 5 | ./sddmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 2 1 1 1 1 6 | 7 | 8 | 9 | 10 | 11 | 12 | no cpu checking 13 | ./spmm_benchmark /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 0 1 1 14 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/README.md: -------------------------------------------------------------------------------- 1 | # Sparse Transformer Inference 2 | 3 | This repo provides a PyTorch extension that speeds up transformer inference with fixed structured sparsity. 4 | 5 | The end-to-end speedup and memory profiles can be obtained with `end_to_end.py`. 6 | * To profile the execution time of the sparse transformer, launch `python3 end_to_end.py --model sparse` with Nsight Systems. 7 | * To profile the execution time of the dense transformer, launch `python3 end_to_end.py --model dense` with Nsight Systems. 8 | * To profile the memory of the sparse transformer, launch `python3 end_to_end.py --model sparse --mem` with Nsight Systems. 9 | * To profile the memory of the dense transformer, launch `python3 end_to_end.py --model dense --mem` with Nsight Systems. 10 | 11 | *** 12 | 13 | #### Dependencies 14 | We generate the sparse mask with `scipy.sparse`. The PyTorch version is `1.8.1+cu111`. The memory profiling is based on [`pytorch_memlab`](https://github.com/Stonesjtu/pytorch_memlab), and we annotate our program with `nvtx`. 15 | 16 | To build the custom kernels, please use `src/install.sh`. Because our kernels target the V100 GPU's tensor core architecture, only `sm70` is currently supported. 
-------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/cudaprofile.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | _cudart = ctypes.CDLL('libcudart.so') 4 | 5 | 6 | def start(): 7 | # As shown at http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PROFILER.html, 8 | # the return value will unconditionally be 0. This check is just in case it changes in 9 | # the future. 10 | ret = _cudart.cudaProfilerStart() 11 | if ret != 0: 12 | raise Exception("cudaProfilerStart() returned %d" % ret) 13 | 14 | def stop(): 15 | ret = _cudart.cudaProfilerStop() 16 | if ret != 0: 17 | raise Exception("cudaProfilerStop() returned %d" % ret) -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/run.sh: -------------------------------------------------------------------------------- 1 | nsys profile --force-overwrite true -t cuda -o trans_report python3 end_to_end.py --model sparse 2 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/src/cuda/softmax.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor csr_softmax_cuda( 4 | torch::Tensor row_indices, 5 | torch::Tensor row_offsets, 6 | torch::Tensor values, 7 | float scaler, 8 | int vec_length); 9 | 10 | 11 | torch::Tensor csr_softmax( 12 | torch::Tensor row_indices, 13 | torch::Tensor row_offsets, 14 | torch::Tensor values, 15 | float scaler, 16 | int vec_length) 17 | { 18 | return csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length); 19 | } 20 | 21 | 22 | torch::Tensor batched_csr_softmax_cuda( 23 | torch::Tensor row_indices, 24 | torch::Tensor row_offsets, 25 | torch::Tensor values, 26 | float scaler, 27 | int vec_length, 28 | int batch_size); 29 | 30 | 31 | 
torch::Tensor batched_csr_softmax( 32 | torch::Tensor row_indices, 33 | torch::Tensor row_offsets, 34 | torch::Tensor values, 35 | float scaler, 36 | int vec_length, 37 | int batch_size) 38 | { 39 | return batched_csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length, batch_size); 40 | } 41 | 42 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 43 | m.def("csr_softmax", &csr_softmax, "Custom Softmax kernel"); 44 | m.def("bcsr_softmax", &batched_csr_softmax, "Custom Batched Softmax kernel"); 45 | } -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/src/cuda/spmm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor spmm_cuda( 4 | torch::Tensor row_indices, 5 | torch::Tensor row_offsets, 6 | torch::Tensor column_indices, 7 | torch::Tensor values, 8 | torch::Tensor rhs_matrix, 9 | int vec_length); 10 | 11 | torch::Tensor spmm( 12 | torch::Tensor row_indices, 13 | torch::Tensor row_offsets, 14 | torch::Tensor column_indices, 15 | torch::Tensor values, 16 | torch::Tensor rhs_matrix, 17 | int vec_length) 18 | { 19 | return spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length); 20 | } 21 | 22 | 23 | 24 | torch::Tensor batched_spmm_cuda( 25 | torch::Tensor row_indices, 26 | torch::Tensor row_offsets, 27 | torch::Tensor column_indices, 28 | torch::Tensor values, 29 | torch::Tensor rhs_matrix, 30 | int vec_length); 31 | 32 | torch::Tensor batched_spmm( 33 | torch::Tensor row_indices, 34 | torch::Tensor row_offsets, 35 | torch::Tensor column_indices, 36 | torch::Tensor values, 37 | torch::Tensor rhs_matrix, 38 | int vec_length) 39 | { 40 | return batched_spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length); 41 | } 42 | 43 | 44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 45 | m.def("spmm", &spmm, "Custom SPMM kernel"); 46 | m.def("bspmm", &batched_spmm, "Custom Batched 
SPMM kernel"); 47 | } -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/src/cuda/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | namespace spmm{ 7 | 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 9 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 10 | } 11 | 12 | /* Barrier whose scope depends on BlockWidth: Sync() calls __syncthreads() for widths above 32, __syncwarp(thread_mask) for widths 2..32, and does nothing for width 1. */ template 13 | struct Barrier{ 14 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 15 | static constexpr int kThreadsPerOutputTile = BlockWidth; 16 | uint32_t thread_mask = 0xffffffff; 17 | 18 | __device__ __forceinline__ Barrier(int thread_idx_y){ 19 | /* Tiles narrower than a warp but wider than one thread sync on a mask of only their own kThreadsPerOutputTile lanes; other widths keep the full-warp default. The second test must be `> 1` (matching Sync()); with `< 1` this branch could never run. NOTE(review): assumes thread_idx_y * kThreadsPerOutputTile < 32 -- confirm against the launch configuration. */ if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ 20 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 21 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 22 | } 23 | } 24 | 25 | __device__ __forceinline__ void Sync(){ 26 | if (kThreadsPerOutputTile > 32){ 27 | __syncthreads(); 28 | } else if (kThreadsPerOutputTile > 1){ 29 | __syncwarp(thread_mask); 30 | } 31 | } 32 | }; 33 | } 34 | #endif -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/src/install.sh: -------------------------------------------------------------------------------- 1 | python3 -W ignore setup.py build 2 | python3 -W ignore setup.py install -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/src/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='sptrans', 6 | version='0.0.1', 7 | description='Custom library for Sparse Transformer for 
pytorch', 8 | author='Zhaodong Chen', 9 | author_email='chenzd15thu@ucsb.edu', 10 | ext_modules=[ 11 | CUDAExtension('sptrans.sddmm', 12 | ['cuda/sddmm.cpp', 'cuda/sddmm_kernel.cu'], 13 | extra_compile_args={'cxx':[], 'nvcc':['-arch=sm_80', '-lcusparse', '--ptxas-options=-v', '-lineinfo']}), 14 | CUDAExtension('sptrans.spmm', 15 | ['cuda/spmm.cpp', 'cuda/spmm_kernel.cu'], 16 | extra_compile_args={'cxx':[], 'nvcc':['-arch=sm_80', '-lcusparse', '--ptxas-options=-v', '-lineinfo']}), 17 | CUDAExtension('sptrans.softmax', 18 | ['cuda/softmax.cpp', 'cuda/softmax_kernel.cu'], 19 | extra_compile_args={'cxx':[], 'nvcc':['-arch=sm_80', '-lcusparse', '--ptxas-options=-v', '-lineinfo']}), 20 | ], 21 | cmdclass={'build_ext': BuildExtension}, 22 | install_requires=['torch'] 23 | ) 24 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_baselines/verify/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCIS/Magicube/8f92b69e9c1d7a0406eacb773ef5e79a71eda4f0/end2end_eval/sparse_transformer_baselines/verify/__init__.py -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/cudaprofile.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | _cudart = ctypes.CDLL('libcudart.so') 4 | 5 | 6 | def start(): 7 | # As shown at http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PROFILER.html, 8 | # the return value will unconditionally be 0. This check is just in case it changes in 9 | # the future. 
10 | ret = _cudart.cudaProfilerStart() 11 | if ret != 0: 12 | raise Exception("cudaProfilerStart() returned %d" % ret) 13 | 14 | def stop(): 15 | ret = _cudart.cudaProfilerStop() 16 | if ret != 0: 17 | raise Exception("cudaProfilerStop() returned %d" % ret) -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/run.sh: -------------------------------------------------------------------------------- 1 | nsys profile --force-overwrite true -t cuda -o trans_report python3 end_to_end.py --model sparse 2 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/src/cuda/quantization.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor quantization_cuda(torch::Tensor input_matrix, int bits, float scale); 4 | 5 | torch::Tensor quantization(torch::Tensor input_matrix, int bits, float scale) 6 | { 7 | return quantization_cuda(input_matrix, bits, scale); 8 | } 9 | 10 | torch::Tensor batched_quantization_cuda(torch::Tensor input_matrix, int bits, float scale); 11 | 12 | torch::Tensor batched_quantization(torch::Tensor input_matrix, int bits, float scale) 13 | { 14 | return batched_quantization_cuda(input_matrix, bits, scale); 15 | } 16 | 17 | 18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 19 | m.def("quantization", &quantization, "Custom symmetric quantization kernel"); 20 | m.def("bquantization", &batched_quantization, "Custom Batched symmetric quantization kernel"); 21 | } 22 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/src/cuda/softmax.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor csr_softmax_cuda( 4 | torch::Tensor row_indices, 5 | torch::Tensor row_offsets, 6 | torch::Tensor values, 7 | float scaler, 8 | int 
vec_length); 9 | 10 | 11 | torch::Tensor csr_softmax( 12 | torch::Tensor row_indices, 13 | torch::Tensor row_offsets, 14 | torch::Tensor values, 15 | float scaler, 16 | int vec_length) 17 | { 18 | return csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length); 19 | } 20 | 21 | 22 | torch::Tensor batched_csr_softmax_cuda( 23 | torch::Tensor row_indices, 24 | torch::Tensor row_offsets, 25 | torch::Tensor values, 26 | float scaler, 27 | int vec_length, 28 | int batch_size); 29 | 30 | 31 | torch::Tensor batched_csr_softmax( 32 | torch::Tensor row_indices, 33 | torch::Tensor row_offsets, 34 | torch::Tensor values, 35 | float scaler, 36 | int vec_length, 37 | int batch_size) 38 | { 39 | return batched_csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length, batch_size); 40 | } 41 | 42 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 43 | m.def("csr_softmax", &csr_softmax, "Custom Softmax kernel"); 44 | m.def("bcsr_softmax", &batched_csr_softmax, "Custom Batched Softmax kernel"); 45 | } -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/src/cuda/spmm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor spmm_cuda( 4 | torch::Tensor row_indices, 5 | torch::Tensor row_offsets, 6 | torch::Tensor column_indices, 7 | torch::Tensor values, 8 | torch::Tensor rhs_matrix, 9 | int vec_length); 10 | 11 | torch::Tensor spmm( 12 | torch::Tensor row_indices, 13 | torch::Tensor row_offsets, 14 | torch::Tensor column_indices, 15 | torch::Tensor values, 16 | torch::Tensor rhs_matrix, 17 | int vec_length) 18 | { 19 | return spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length); 20 | } 21 | 22 | 23 | 24 | torch::Tensor batched_spmm_cuda( 25 | torch::Tensor row_indices, 26 | torch::Tensor row_offsets, 27 | torch::Tensor column_indices, 28 | torch::Tensor values, 29 | torch::Tensor rhs_matrix, 30 | int 
vec_length); 31 | 32 | torch::Tensor batched_spmm( 33 | torch::Tensor row_indices, 34 | torch::Tensor row_offsets, 35 | torch::Tensor column_indices, 36 | torch::Tensor values, 37 | torch::Tensor rhs_matrix, 38 | int vec_length) 39 | { 40 | return batched_spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length); 41 | } 42 | 43 | 44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 45 | m.def("spmm", &spmm, "Custom SPMM kernel"); 46 | m.def("bspmm", &batched_spmm, "Custom Batched SPMM kernel"); 47 | } -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/src/cuda/spmm_utils/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | #define BARRIER_H 3 | 4 | #include 5 | 6 | 7 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 8 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 9 | } 10 | 11 | /* Barrier whose scope depends on BlockWidth: Sync() calls __syncthreads() for widths above 32, __syncwarp(thread_mask) for widths 2..32, and does nothing for width 1. */ template 12 | struct Barrier{ 13 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 14 | static constexpr int kThreadsPerOutputTile = BlockWidth; 15 | uint32_t thread_mask = 0xffffffff; 16 | 17 | __device__ __forceinline__ Barrier(int thread_idx_y){ 18 | /* Tiles narrower than a warp but wider than one thread sync on a mask of only their own kThreadsPerOutputTile lanes; other widths keep the full-warp default. The second test must be `> 1` (matching Sync()); with `< 1` this branch could never run. NOTE(review): assumes thread_idx_y * kThreadsPerOutputTile < 32 -- confirm against the launch configuration. */ if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ 19 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 20 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 21 | } 22 | } 23 | 24 | __device__ __forceinline__ void Sync(){ 25 | if (kThreadsPerOutputTile > 32){ 26 | __syncthreads(); 27 | } else if (kThreadsPerOutputTile > 1){ 28 | __syncwarp(thread_mask); 29 | } 30 | } 31 | }; 32 | #endif 33 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/src/cuda/spmm_utils_N128_bk/barrier.h: -------------------------------------------------------------------------------- 1 | #ifndef BARRIER_H 2 | 
#define BARRIER_H 3 | 4 | #include 5 | 6 | 7 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) { 8 | return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1); 9 | } 10 | 11 | /* Barrier whose scope depends on BlockWidth: Sync() calls __syncthreads() for widths above 32, __syncwarp(thread_mask) for widths 2..32, and does nothing for width 1. */ template 12 | struct Barrier{ 13 | static constexpr int kThreadsPerBlock = Tile_M * BlockWidth; 14 | static constexpr int kThreadsPerOutputTile = BlockWidth; 15 | uint32_t thread_mask = 0xffffffff; 16 | 17 | __device__ __forceinline__ Barrier(int thread_idx_y){ 18 | /* Tiles narrower than a warp but wider than one thread sync on a mask of only their own kThreadsPerOutputTile lanes; other widths keep the full-warp default. The second test must be `> 1` (matching Sync()); with `< 1` this branch could never run. NOTE(review): assumes thread_idx_y * kThreadsPerOutputTile < 32 -- confirm against the launch configuration. */ if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){ 19 | constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1; 20 | thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile); 21 | } 22 | } 23 | 24 | __device__ __forceinline__ void Sync(){ 25 | if (kThreadsPerOutputTile > 32){ 26 | __syncthreads(); 27 | } else if (kThreadsPerOutputTile > 1){ 28 | __syncwarp(thread_mask); 29 | } 30 | } 31 | }; 32 | #endif 33 | -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/src/install.sh: -------------------------------------------------------------------------------- 1 | python3 -W ignore setup.py build 2 | python3 -W ignore setup.py install -------------------------------------------------------------------------------- /end2end_eval/sparse_transformer_magicube/verify/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCIS/Magicube/8f92b69e9c1d7a0406eacb773ef5e79a71eda4f0/end2end_eval/sparse_transformer_magicube/verify/__init__.py -------------------------------------------------------------------------------- /plot/confinter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | np.random.seed(0) 7 | #data = np.random.randint(10, 30, 50) 8 | #data = 
np.array([0.01021, 0.011004, 0.010868, 0.011072, 0.011223, 0.010629, 0.011198, 0.01027, 0.010863, 0.010955, 0.010587, 0.011011, 0.010517, 0.011234, 0.011296, 0.010959]) 9 | data = np.array([0.00621, 0.007004, 0.010868, 0.011072, 0.011223, 0.010629, 0.011198, 0.01027, 0.010863, 0.010955, 0.010587, 0.011011, 0.010517, 0.011234, 0.011296, 0.010959]) 10 | 11 | meanv = np.mean(data) 12 | inter = st.norm.interval(alpha=0.95, loc=np.mean(data), scale=st.sem(data)) 13 | print("interval: ", inter, "mean: ", meanv) 14 | 15 | -------------------------------------------------------------------------------- /plot/figs/.gitignore: -------------------------------------------------------------------------------- 1 | ## ignore this file ## 2 | *.log 3 | *.o 4 | *.pdf 5 | -------------------------------------------------------------------------------- /plot/gen_csv.sh: -------------------------------------------------------------------------------- 1 | 2 | python spmm_abl_study.py 3 | python spmm_pres.py 4 | python sddmm_abl_study.py 5 | python spmm_all_matrices.py 6 | python sddmm_all_matrices.py 7 | python n2n.py 8 | 9 | -------------------------------------------------------------------------------- /plot/plot.sh: -------------------------------------------------------------------------------- 1 | 2 | python plot_spmm_abl_study.py 3 | python plot_spmm_pres.py 4 | python plot_sddmm_abl_study.py 5 | python plot_spmm_all_matrices.py 6 | python plot_sddmm_all_matrices.py 7 | python plot_n2n_a.py 8 | python plot_n2n_b.py 9 | python plot_n2n_c.py 10 | python plot_n2n_d.py 11 | python plot_n2n_e.py 12 | python plot_n2n_f.py 13 | python plot_n2n_g.py 14 | python plot_n2n_h.py 15 | -------------------------------------------------------------------------------- /plot/plot_n2n_a.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 
7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | n2n_a_data = pd.read_csv('n2n_a.csv') 12 | sns.set(rc={"lines.linewidth": 0.5}) 13 | sns.set(rc={'figure.figsize':(5, 3)}) 14 | g = sns.barplot(data=n2n_a_data, x="S0.9,Seq_l=4096,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 15 | #plt.xticks(rotation=20) 16 | g.tick_params(labelsize=8) 17 | g.set(ylim=(0, 25)) 18 | g.set_xlabel(" ", fontsize=8) 19 | g.set_title('Sparsity=0.9, Seq_len=4096, num_h=4') 20 | plt.setp(g.get_legend().get_texts(), fontsize='6') 21 | plt.setp(g.get_legend().get_title(), fontsize='6') 22 | g.figure.savefig('./figs/Figure16-a.pdf') 23 | 24 | -------------------------------------------------------------------------------- /plot/plot_n2n_b.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | n2n_b_data = pd.read_csv('n2n_b.csv') 12 | sns.set(rc={"lines.linewidth": 0.5}) 13 | sns.set(rc={'figure.figsize':(5, 3)}) 14 | g = sns.barplot(data=n2n_b_data, x="S0.9,Seq_l=4096,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 15 | g.tick_params(labelsize=8) 16 | g.set(ylim=(0, 50)) 17 | g.set_xlabel(" ", fontsize=8) 18 | g.set_title('Sparsity=0.9, Seq_len=4096, num_h=8') 19 | plt.setp(g.get_legend().get_texts(), fontsize='6') 20 | plt.setp(g.get_legend().get_title(), fontsize='6') 21 | g.figure.savefig('./figs/Figure16-b.pdf') 22 | 23 | 
-------------------------------------------------------------------------------- /plot/plot_n2n_c.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | 12 | n2n_c_data = pd.read_csv('n2n_c.csv') 13 | sns.set(rc={"lines.linewidth": 0.5}) 14 | sns.set(rc={'figure.figsize':(5, 3)}) 15 | g = sns.barplot(data=n2n_c_data, x="S0.9,Seq_l=8192,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 16 | g.tick_params(labelsize=8) 17 | g.set(ylim=(0, 70)) 18 | g.set_xlabel(" ", fontsize=8) 19 | g.set_title('Sparsity=0.9, Seq_len=8192, num_h=4') 20 | plt.setp(g.get_legend().get_texts(), fontsize='6') 21 | plt.setp(g.get_legend().get_title(), fontsize='6') 22 | g.figure.savefig('./figs/Figure16-c.pdf') 23 | 24 | -------------------------------------------------------------------------------- /plot/plot_n2n_d.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | 12 | n2n_d_data = pd.read_csv('n2n_d.csv') 13 | sns.set(rc={"lines.linewidth": 0.5}) 14 | sns.set(rc={'figure.figsize':(5, 3)}) 15 | g = sns.barplot(data=n2n_d_data, x="S0.9,Seq_l=8192,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 16 | g.tick_params(labelsize=8) 17 | g.set(ylim=(0, 150)) 18 | g.set_xlabel(" ", fontsize=8) 19 | 
g.set_title('Sparsity=0.9, Seq_len=8192, num_h=8') 20 | plt.setp(g.get_legend().get_texts(), fontsize='6') 21 | plt.setp(g.get_legend().get_title(), fontsize='6') 22 | g.figure.savefig('./figs/Figure16-d.pdf') 23 | 24 | 25 | -------------------------------------------------------------------------------- /plot/plot_n2n_e.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | 12 | 13 | n2n_e_data = pd.read_csv('n2n_e.csv') 14 | sns.set(rc={"lines.linewidth": 0.5}) 15 | sns.set(rc={'figure.figsize':(5, 3)}) 16 | g = sns.barplot(data=n2n_e_data, x="S0.95,Seq_l=4096,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 17 | #plt.xticks(rotation=20) 18 | g.tick_params(labelsize=8) 19 | g.set(ylim=(0, 25)) 20 | g.set_xlabel(" ", fontsize=8) 21 | g.set_title('Sparsity=0.95, Seq_len=4096, num_h=4') 22 | plt.setp(g.get_legend().get_texts(), fontsize='6') 23 | plt.setp(g.get_legend().get_title(), fontsize='6') 24 | g.figure.savefig('./figs/Figure16-e.pdf') 25 | 26 | -------------------------------------------------------------------------------- /plot/plot_n2n_f.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | 12 | n2n_f_data = pd.read_csv('n2n_f.csv') 13 | sns.set(rc={"lines.linewidth": 0.5}) 14 | sns.set(rc={'figure.figsize':(5, 3)}) 15 
| g = sns.barplot(data=n2n_f_data, x="S0.95,Seq_l=4096,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 16 | g.tick_params(labelsize=8) 17 | g.set(ylim=(0, 50)) 18 | g.set_xlabel(" ", fontsize=8) 19 | g.set_title('Sparsity=0.95, Seq_len=4096, num_h=8') 20 | plt.setp(g.get_legend().get_texts(), fontsize='6') 21 | plt.setp(g.get_legend().get_title(), fontsize='6') 22 | g.figure.savefig('./figs/Figure16-f.pdf') 23 | 24 | -------------------------------------------------------------------------------- /plot/plot_n2n_g.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | 12 | n2n_g_data = pd.read_csv('n2n_g.csv') 13 | sns.set(rc={"lines.linewidth": 0.5}) 14 | sns.set(rc={'figure.figsize':(5, 3)}) 15 | g = sns.barplot(data=n2n_g_data, x="S0.95,Seq_l=8192,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 16 | g.tick_params(labelsize=8) 17 | g.set(ylim=(0, 70)) 18 | g.set_xlabel(" ", fontsize=8) 19 | g.set_title('Sparsity=0.95, Seq_len=8192, num_h=4') 20 | plt.setp(g.get_legend().get_texts(), fontsize='6') 21 | plt.setp(g.get_legend().get_title(), fontsize='6') 22 | g.figure.savefig('./figs/Figure16-g.pdf') 23 | 24 | -------------------------------------------------------------------------------- /plot/plot_n2n_h.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #sns.set_context(rc = {'patch.linewidth': 0.0}) 9 | order = ['Pytorch-fp16', 
'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b'] 10 | 11 | 12 | n2n_h_data = pd.read_csv('n2n_h.csv') 13 | sns.set(rc={"lines.linewidth": 0.5}) 14 | sns.set(rc={'figure.figsize':(5, 3)}) 15 | g = sns.barplot(data=n2n_h_data, x="S0.95,Seq_l=8192,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8) 16 | g.tick_params(labelsize=8) 17 | g.set(ylim=(0, 150)) 18 | g.set_xlabel(" ", fontsize=8) 19 | g.set_title('Sparsity=0.95, Seq_len=8192, num_h=8') 20 | plt.setp(g.get_legend().get_texts(), fontsize='6') 21 | plt.setp(g.get_legend().get_title(), fontsize='6') 22 | g.figure.savefig('./figs/Figure16-h.pdf') 23 | -------------------------------------------------------------------------------- /plot/plot_sddmm_abl_study.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | sddmm_abl_study_data = pd.read_csv('sddmm_abl_study.csv') 9 | sns.set(rc={"lines.linewidth": 0.5}) 10 | sns.set(rc={'figure.figsize':(15, 5)}) 11 | g = sns.barplot(data=sddmm_abl_study_data, x="configs", y="TOP/s", hue="pres", palette="Blues_d") 12 | plt.xticks(rotation=20) 13 | g.tick_params(labelsize=8) 14 | g.set(ylim=(0, 40)) 15 | g.figure.savefig('./figs/Figure13.pdf') 16 | 17 | -------------------------------------------------------------------------------- /plot/plot_sddmm_all_matrices.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | 9 | sddmm_data = pd.read_csv('sddmm_all_matrices.csv') 10 | ##print(sddmm_data) 11 | # 12 | #fgrid = sns.FacetGrid(sddmm_data, col="vecLen", row="dimN") 13 | #fgrid.map_dataframe(sns.boxplot, x="Sparsity", y="", data=sddmm_data) 14 | # 15 
| # 16 | #fgrid.figure.savefig('test.pdf') 17 | sns.set(rc={"lines.linewidth": 0.5}) 18 | g = sns.catplot(x="sparsity", y="speedup", 19 | hue="algs", col="V", row="K", fliersize=3, 20 | data=sddmm_data, kind="box", 21 | height=4, aspect=1.6) 22 | g.set(ylim=(0.0, 3.0)) 23 | plt.axhline(1.0, linestyle='--', linewidth=2.7, color='blue') 24 | g.figure.savefig('./figs/Figure15.pdf') 25 | -------------------------------------------------------------------------------- /plot/plot_spmm_abl_study.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | order = ['V2,L16R8,S0.7', 'V8,L16R8,S0.7', 'V2,L8R8,S0.7', 'V8,L8R8,S0.7', 9 | 'V2,L8R4,S0.7', 'V8,L8R4,S0.7', 'V2,L4R4,S0.7', 'V8,L4R4,S0.7', 10 | 'V2,L16R8,S0.9', 'V8,L16R8,S0.9', 'V2,L8R8,S0.9', 'V8,L8R8,S0.9', 11 | 'V2,L8R4,S0.9', 'V8,L8R4,S0.9', 'V2,L4R4,S0.9', 'V8,L4R4,S0.9'] 12 | 13 | #sns.color_palette("Blues", as_cmap=True) 14 | spmm_abl_data = pd.read_csv('spmm_abl_study.csv') 15 | sns.set(rc={"lines.linewidth": 0.5}) 16 | sns.set(rc={'figure.figsize':(15, 5)}) 17 | g = sns.barplot(data=spmm_abl_data, x="configs", y="TOP/s", hue="opts", order=order, palette="Blues_d") 18 | plt.xticks(rotation=20) 19 | g.tick_params(labelsize=8) 20 | g.set(ylim=(0, 40)) 21 | g.figure.savefig('./figs/Figure11.pdf') 22 | 23 | -------------------------------------------------------------------------------- /plot/plot_spmm_all_matrices.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | 9 | spmm_data = pd.read_csv('spmm_all_matrices.csv') 10 | ##print(spmm_data) 11 | # 12 | #fgrid = sns.FacetGrid(spmm_data, col="vecLen", row="dimN") 13 | #fgrid.map_dataframe(sns.boxplot, x="Sparsity", y="", data=spmm_data) 
14 | # 15 | # 16 | #fgrid.figure.savefig('test.pdf') 17 | sns.set(rc={"lines.linewidth": 0.5}) 18 | g = sns.catplot(x="sparsity", y="speedup", 19 | hue="algs", col="V", row="N", fliersize=3, 20 | data=spmm_data, kind="box", 21 | height=4, aspect=1.6) 22 | g.set(ylim=(0.0, 3.0)) 23 | plt.axhline(1.0, linestyle='--', linewidth=2.7, color='blue') 24 | g.figure.savefig('./figs/Figure14.pdf') 25 | -------------------------------------------------------------------------------- /plot/plot_spmm_pres.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | import csv 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | #order = ['L4-R4', 'L8-R4', 'L12-R4', 'L16-R4', 'L8-R8', 'L16-R8', 'L16-R16'] 9 | spmm_pres_data = pd.read_csv('spmm_pres.csv') 10 | sns.set(rc={"lines.linewidth": 0.5}) 11 | sns.set(rc={'figure.figsize':(15, 5)}) 12 | #g = sns.barplot(data=spmm_pres_data, x="configs", y="TOP/s", hue="pres", palette="Blues_d", hue_order=order) 13 | g = sns.barplot(data=spmm_pres_data, x="configs", y="TOP/s", hue="pres", palette="Blues_d") 14 | plt.xticks(rotation=20) 15 | g.tick_params(labelsize=8) 16 | g.set(ylim=(0, 45)) 17 | g.figure.savefig('./figs/Figure12.pdf') 18 | 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nvtx 2 | scipy 3 | pytorch_memlab 4 | seaborn 5 | --------------------------------------------------------------------------------