├── .gitignore
├── LICENSE
├── README.md
├── SDDMM
    ├── SDDMM
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── eval_matrices
    │   │   ├── s50.txt
    │   │   ├── s70.txt
    │   │   ├── s80.txt
    │   │   ├── s90.txt
    │   │   ├── s95.txt
    │   │   └── s98.txt
    │   ├── include
    │   │   ├── bm_test_utils.h
    │   │   ├── cublas_gemm.cuh
    │   │   ├── cuda_sddmm.cuh
    │   │   ├── cuda_spmm.cuh
    │   │   ├── sputnik.h
    │   │   ├── wmma_sddmm.cuh
    │   │   └── wmma_spmm.cuh
    │   ├── launch_sddmm_magicube_16b16b.py
    │   ├── launch_sddmm_magicube_4b4b.py
    │   ├── launch_sddmm_magicube_8b8b.py
    │   ├── run_sddmm_magicube.sh
    │   ├── sddmm_benchmark.cpp
    │   ├── setup.sh
    │   ├── spmm_benchmark.cpp
    │   └── src
    │   │   ├── cublas_gemm.cu
    │   │   ├── cuda_sddmm.cu
    │   │   ├── cuda_spmm.cu
    │   │   ├── sddmm_utils
    │   │       ├── compute_utils.h
    │   │       ├── lhs_tile.h
    │   │       ├── output_tile.h
    │   │       └── rhs_tile.h
    │   │   ├── spmm_utils
    │   │       ├── barrier.h
    │   │       ├── compute_utils.h
    │   │       ├── compute_utils.h_bk
    │   │       ├── compute_utils.h_bkkkk
    │   │       ├── compute_utils.h_more_shift_opt_4bit
    │   │       ├── dense_tile.h
    │   │       ├── memory_aligner.h
    │   │       ├── output_tile.h
    │   │       └── sparse_tile.h
    │   │   ├── wmma_sddmm.cu
    │   │   ├── wmma_sddmm.cu_bk
    │   │   └── wmma_spmm.cu
    └── ablation_study
    │   ├── SDDMM_basic
    │       ├── .gitignore
    │       ├── Makefile
    │       ├── include
    │       │   ├── bm_test_utils.h
    │       │   ├── cublas_gemm.cuh
    │       │   ├── cuda_sddmm.cuh
    │       │   ├── cuda_spmm.cuh
    │       │   ├── sputnik.h
    │       │   ├── wmma_sddmm.cuh
    │       │   └── wmma_spmm.cuh
    │       ├── sddmm_benchmark.cpp
    │       ├── setup.sh
    │       ├── spmm_benchmark.cpp
    │       └── src
    │       │   ├── cublas_gemm.cu
    │       │   ├── cuda_sddmm.cu
    │       │   ├── cuda_spmm.cu
    │       │   ├── sddmm_utils
    │       │       ├── compute_utils.h
    │       │       ├── lhs_tile.h
    │       │       ├── output_tile.h
    │       │       └── rhs_tile.h
    │       │   ├── spmm_utils
    │       │       ├── barrier.h
    │       │       ├── compute_utils.h
    │       │       ├── compute_utils.h_bk
    │       │       ├── compute_utils.h_bkkkk
    │       │       ├── compute_utils.h_more_shift_opt_4bit
    │       │       ├── dense_tile.h
    │       │       ├── memory_aligner.h
    │       │       ├── output_tile.h
    │       │       └── sparse_tile.h
    │       │   ├── wmma_sddmm.cu
    │       │   ├── wmma_sddmm.cu_bk
    │       │   └── wmma_spmm.cu
    │   ├── SDDMM_lhs_pref
    │       ├── .gitignore
    │       ├── Makefile
    │       ├── include
    │       │   ├── bm_test_utils.h
    │       │   ├── cublas_gemm.cuh
    │       │   ├── cuda_sddmm.cuh
    │       │   ├── cuda_spmm.cuh
    │       │   ├── sputnik.h
    │       │   ├── wmma_sddmm.cuh
    │       │   └── wmma_spmm.cuh
    │       ├── sddmm_benchmark.cpp
    │       ├── setup.sh
    │       ├── spmm_benchmark.cpp
    │       ├── src
    │       │   ├── cublas_gemm.cu
    │       │   ├── cuda_sddmm.cu
    │       │   ├── cuda_spmm.cu
    │       │   ├── sddmm_utils
    │       │   │   ├── compute_utils.h
    │       │   │   ├── lhs_tile.h
    │       │   │   ├── output_tile.h
    │       │   │   └── rhs_tile.h
    │       │   ├── spmm_utils
    │       │   │   ├── barrier.h
    │       │   │   ├── compute_utils.h
    │       │   │   ├── compute_utils.h_bk
    │       │   │   ├── compute_utils.h_bkkkk
    │       │   │   ├── compute_utils.h_more_shift_opt_4bit
    │       │   │   ├── dense_tile.h
    │       │   │   ├── memory_aligner.h
    │       │   │   ├── output_tile.h
    │       │   │   └── sparse_tile.h
    │       │   ├── wmma_sddmm.cu
    │       │   ├── wmma_sddmm.cu_bk
    │       │   └── wmma_spmm.cu
    │       └── usingwmma_run.sh
    │   ├── compile_jobs.sh
    │   └── sddmm_ablation_study.py
├── SpMM
    ├── SpMM
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── eval_matrices
    │   │   ├── s50.txt
    │   │   ├── s70.txt
    │   │   ├── s80.txt
    │   │   ├── s90.txt
    │   │   ├── s95.txt
    │   │   └── s98.txt
    │   ├── file_name_server.py
    │   ├── include
    │   │   ├── bm_test_utils.h
    │   │   ├── cublas_gemm.cuh
    │   │   ├── cuda_sddmm.cuh
    │   │   ├── cuda_spmm.cuh
    │   │   ├── sputnik.h
    │   │   ├── wmma_sddmm.cuh
    │   │   └── wmma_spmm.cuh
    │   ├── launch_spmm_magicube_16b8b.py
    │   ├── launch_spmm_magicube_4b4b.py
    │   ├── launch_spmm_magicube_8b4b.py
    │   ├── launch_spmm_magicube_8b8b.py
    │   ├── ncu_profile.py
    │   ├── run_spmm_magicube.sh
    │   ├── sddmm_benchmark.cpp
    │   ├── setup.sh
    │   ├── spmm_benchmark.cpp
    │   ├── spmm_pres.sh
    │   └── src
    │   │   ├── cublas_gemm.cu
    │   │   ├── cuda_sddmm.cu
    │   │   ├── cuda_spmm.cu
    │   │   ├── spmm_utils
    │   │       ├── barrier.h
    │   │       ├── compute_utils.h
    │   │       ├── dense_tile.h
    │   │       ├── memory_aligner.h
    │   │       ├── output_tile.h
    │   │       └── sparse_tile.h
    │   │   ├── wmma_sddmm.cu
    │   │   └── wmma_spmm.cu
    └── ablation_study
    │   ├── 16b8b
    │       ├── SpMM_conflict_free
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │   │   ├── bm_test_utils.h
    │       │   │   ├── cublas_gemm.cuh
    │       │   │   ├── cuda_sddmm.cuh
    │       │   │   ├── cuda_spmm.cuh
    │       │   │   ├── sputnik.h
    │       │   │   ├── wmma_sddmm.cuh
    │       │   │   └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │   │   ├── cublas_gemm.cu
    │       │   │   ├── cuda_sddmm.cu
    │       │   │   ├── cuda_spmm.cu
    │       │   │   ├── spmm_utils
    │       │   │   │   ├── barrier.h
    │       │   │   │   ├── compute_utils.h
    │       │   │   │   ├── dense_tile.h
    │       │   │   │   ├── memory_aligner.h
    │       │   │   │   ├── output_tile.h
    │       │   │   │   └── sparse_tile.h
    │       │   │   ├── wmma_sddmm.cu
    │       │   │   └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │       └── SpMM_conflict_free_prefetch
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │       ├── bm_test_utils.h
    │       │       ├── cublas_gemm.cuh
    │       │       ├── cuda_sddmm.cuh
    │       │       ├── cuda_spmm.cuh
    │       │       ├── sputnik.h
    │       │       ├── wmma_sddmm.cuh
    │       │       └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │       ├── cublas_gemm.cu
    │       │       ├── cuda_sddmm.cu
    │       │       ├── cuda_spmm.cu
    │       │       ├── spmm_utils
    │       │       │   ├── barrier.h
    │       │       │   ├── compute_utils.h
    │       │       │   ├── dense_tile.h
    │       │       │   ├── memory_aligner.h
    │       │       │   ├── output_tile.h
    │       │       │   └── sparse_tile.h
    │       │       ├── wmma_sddmm.cu
    │       │       └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │   ├── 4b4b
    │       ├── SpMM_conflict_free
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │   │   ├── bm_test_utils.h
    │       │   │   ├── cublas_gemm.cuh
    │       │   │   ├── cuda_sddmm.cuh
    │       │   │   ├── cuda_spmm.cuh
    │       │   │   ├── sputnik.h
    │       │   │   ├── wmma_sddmm.cuh
    │       │   │   └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │   │   ├── cublas_gemm.cu
    │       │   │   ├── cuda_sddmm.cu
    │       │   │   ├── cuda_spmm.cu
    │       │   │   ├── spmm_utils
    │       │   │   │   ├── barrier.h
    │       │   │   │   ├── compute_utils.h
    │       │   │   │   ├── dense_tile.h
    │       │   │   │   ├── memory_aligner.h
    │       │   │   │   ├── output_tile.h
    │       │   │   │   └── sparse_tile.h
    │       │   │   ├── wmma_sddmm.cu
    │       │   │   └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │       ├── SpMM_conflict_free_prefetch
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │   │   ├── bm_test_utils.h
    │       │   │   ├── cublas_gemm.cuh
    │       │   │   ├── cuda_sddmm.cuh
    │       │   │   ├── cuda_spmm.cuh
    │       │   │   ├── sputnik.h
    │       │   │   ├── wmma_sddmm.cuh
    │       │   │   └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │   │   ├── cublas_gemm.cu
    │       │   │   ├── cuda_sddmm.cu
    │       │   │   ├── cuda_spmm.cu
    │       │   │   ├── spmm_utils
    │       │   │   │   ├── barrier.h
    │       │   │   │   ├── compute_utils.h
    │       │   │   │   ├── dense_tile.h
    │       │   │   │   ├── memory_aligner.h
    │       │   │   │   ├── output_tile.h
    │       │   │   │   └── sparse_tile.h
    │       │   │   ├── wmma_sddmm.cu
    │       │   │   └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │       └── SpMM_conflict_free_prefetch_shuffle
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │       ├── bm_test_utils.h
    │       │       ├── cublas_gemm.cuh
    │       │       ├── cuda_sddmm.cuh
    │       │       ├── cuda_spmm.cuh
    │       │       ├── sputnik.h
    │       │       ├── wmma_sddmm.cuh
    │       │       └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │       ├── cublas_gemm.cu
    │       │       ├── cuda_sddmm.cu
    │       │       ├── cuda_spmm.cu
    │       │       ├── spmm_utils
    │       │       │   ├── barrier.h
    │       │       │   ├── compute_utils.h
    │       │       │   ├── dense_tile.h
    │       │       │   ├── memory_aligner.h
    │       │       │   ├── output_tile.h
    │       │       │   └── sparse_tile.h
    │       │       ├── wmma_sddmm.cu
    │       │       └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │   ├── 8b4b
    │       ├── SpMM_conflict_free
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │   │   ├── bm_test_utils.h
    │       │   │   ├── cublas_gemm.cuh
    │       │   │   ├── cuda_sddmm.cuh
    │       │   │   ├── cuda_spmm.cuh
    │       │   │   ├── sputnik.h
    │       │   │   ├── wmma_sddmm.cuh
    │       │   │   └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │   │   ├── cublas_gemm.cu
    │       │   │   ├── cuda_sddmm.cu
    │       │   │   ├── cuda_spmm.cu
    │       │   │   ├── spmm_utils
    │       │   │   │   ├── barrier.h
    │       │   │   │   ├── compute_utils.h
    │       │   │   │   ├── dense_tile.h
    │       │   │   │   ├── memory_aligner.h
    │       │   │   │   ├── output_tile.h
    │       │   │   │   └── sparse_tile.h
    │       │   │   ├── wmma_sddmm.cu
    │       │   │   └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │       ├── SpMM_conflict_free_prefetch
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │   │   ├── bm_test_utils.h
    │       │   │   ├── cublas_gemm.cuh
    │       │   │   ├── cuda_sddmm.cuh
    │       │   │   ├── cuda_spmm.cuh
    │       │   │   ├── sputnik.h
    │       │   │   ├── wmma_sddmm.cuh
    │       │   │   └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │   │   ├── cublas_gemm.cu
    │       │   │   ├── cuda_sddmm.cu
    │       │   │   ├── cuda_spmm.cu
    │       │   │   ├── spmm_utils
    │       │   │   │   ├── barrier.h
    │       │   │   │   ├── compute_utils.h
    │       │   │   │   ├── dense_tile.h
    │       │   │   │   ├── memory_aligner.h
    │       │   │   │   ├── output_tile.h
    │       │   │   │   └── sparse_tile.h
    │       │   │   ├── wmma_sddmm.cu
    │       │   │   └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │       └── SpMM_conflict_free_prefetch_shuffle
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │       ├── bm_test_utils.h
    │       │       ├── cublas_gemm.cuh
    │       │       ├── cuda_sddmm.cuh
    │       │       ├── cuda_spmm.cuh
    │       │       ├── sputnik.h
    │       │       ├── wmma_sddmm.cuh
    │       │       └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   ├── src
    │       │       ├── cublas_gemm.cu
    │       │       ├── cuda_sddmm.cu
    │       │       ├── cuda_spmm.cu
    │       │       ├── spmm_utils
    │       │       │   ├── barrier.h
    │       │       │   ├── compute_utils.h
    │       │       │   ├── dense_tile.h
    │       │       │   ├── memory_aligner.h
    │       │       │   ├── output_tile.h
    │       │       │   └── sparse_tile.h
    │       │       ├── wmma_sddmm.cu
    │       │       └── wmma_spmm.cu
    │       │   └── usingwmma_run.sh
    │   ├── 8b8b
    │       ├── SpMM_conflict_free
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │   │   ├── bm_test_utils.h
    │       │   │   ├── cublas_gemm.cuh
    │       │   │   ├── cuda_sddmm.cuh
    │       │   │   ├── cuda_spmm.cuh
    │       │   │   ├── sputnik.h
    │       │   │   ├── wmma_sddmm.cuh
    │       │   │   └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   └── src
    │       │   │   ├── cublas_gemm.cu
    │       │   │   ├── cuda_sddmm.cu
    │       │   │   ├── cuda_spmm.cu
    │       │   │   ├── spmm_utils
    │       │   │       ├── barrier.h
    │       │   │       ├── compute_utils.h
    │       │   │       ├── dense_tile.h
    │       │   │       ├── memory_aligner.h
    │       │   │       ├── output_tile.h
    │       │   │       └── sparse_tile.h
    │       │   │   ├── wmma_sddmm.cu
    │       │   │   └── wmma_spmm.cu
    │       └── SpMM_conflict_free_prefetch
    │       │   ├── .gitignore
    │       │   ├── Makefile
    │       │   ├── file_name_server.py
    │       │   ├── include
    │       │       ├── bm_test_utils.h
    │       │       ├── cublas_gemm.cuh
    │       │       ├── cuda_sddmm.cuh
    │       │       ├── cuda_spmm.cuh
    │       │       ├── sputnik.h
    │       │       ├── wmma_sddmm.cuh
    │       │       └── wmma_spmm.cuh
    │       │   ├── ncu_profile.py
    │       │   ├── run_jobs.sh
    │       │   ├── sddmm_benchmark.cpp
    │       │   ├── setup.sh
    │       │   ├── spmm_benchmark.cpp
    │       │   └── src
    │       │       ├── cublas_gemm.cu
    │       │       ├── cuda_sddmm.cu
    │       │       ├── cuda_spmm.cu
    │       │       ├── spmm_utils
    │       │           ├── barrier.h
    │       │           ├── compute_utils.h
    │       │           ├── dense_tile.h
    │       │           ├── memory_aligner.h
    │       │           ├── output_tile.h
    │       │           └── sparse_tile.h
    │       │       ├── wmma_sddmm.cu
    │       │       └── wmma_spmm.cu
    │   ├── SpMM_basic
    │       ├── .gitignore
    │       ├── Makefile
    │       ├── file_name_server.py
    │       ├── include
    │       │   ├── bm_test_utils.h
    │       │   ├── cublas_gemm.cuh
    │       │   ├── cuda_sddmm.cuh
    │       │   ├── cuda_spmm.cuh
    │       │   ├── sputnik.h
    │       │   ├── wmma_sddmm.cuh
    │       │   └── wmma_spmm.cuh
    │       ├── ncu_profile.py
    │       ├── run_jobs.sh
    │       ├── sddmm_benchmark.cpp
    │       ├── setup.sh
    │       ├── spmm_benchmark.cpp
    │       ├── src
    │       │   ├── cublas_gemm.cu
    │       │   ├── cuda_sddmm.cu
    │       │   ├── cuda_spmm.cu
    │       │   ├── spmm_utils
    │       │   │   ├── barrier.h
    │       │   │   ├── compute_utils.h
    │       │   │   ├── dense_tile.h
    │       │   │   ├── memory_aligner.h
    │       │   │   ├── output_tile.h
    │       │   │   └── sparse_tile.h
    │       │   ├── wmma_sddmm.cu
    │       │   └── wmma_spmm.cu
    │       └── usingwmma_run.sh
    │   ├── compile_jobs.sh
    │   └── spmm_ablation_study.sh
├── baselines
    ├── Dockerfile
    ├── Makefile
    ├── eval_matrices
    │   ├── s50.txt
    │   ├── s70.txt
    │   ├── s80.txt
    │   ├── s90.txt
    │   ├── s95.txt
    │   └── s98.txt
    ├── example
    │   └── example.csv
    ├── file_name_server.py
    ├── include
    │   ├── bm_test_utils.h
    │   ├── cublas_gemm.cuh
    │   ├── cuda_sddmm.cuh
    │   ├── cuda_spmm.cuh
    │   ├── sputnik.h
    │   ├── wmma_sddmm.cuh
    │   └── wmma_spmm.cuh
    ├── job_launcher.py
    ├── launch.py
    ├── launch_sddmm_cublas_fp16.py
    ├── launch_sddmm_cublas_int8.py
    ├── launch_sddmm_vectorSparse.py
    ├── launch_spmm_cublas_fp16.py
    ├── launch_spmm_cublas_int8.py
    ├── launch_spmm_cusparse_fp16.py
    ├── launch_spmm_cusparse_int8.py
    ├── launch_spmm_vectorSparse.py
    ├── ncu_profile.py
    ├── plot_blocked_ell.py
    ├── plot_finegrained.py
    ├── plot_mem_l2_l1.py
    ├── plot_sddmm.py
    ├── plot_spmm.py
    ├── run_sddmm_baselines.sh
    ├── run_spmm_baselines.sh
    ├── sddmm_benchmark.cpp
    ├── setup.sh
    ├── spmm_benchmark.cpp
    ├── src
    │   ├── cublas_gemm.cu
    │   ├── cuda_sddmm.cu
    │   ├── cuda_spmm.cu
    │   ├── spmm_utils
    │   │   ├── barrier.h
    │   │   ├── compute_utils.h
    │   │   ├── dense_tile.h
    │   │   ├── memory_aligner.h
    │   │   ├── output_tile.h
    │   │   └── sparse_tile.h
    │   ├── wmma_sddmm.cu
    │   └── wmma_spmm.cu
    └── usingwmma_run.sh
├── end2end_eval
    ├── sparse_transformer_baselines
    │   ├── README.md
    │   ├── atten_speedup.py
    │   ├── attention.py
    │   ├── cudaprofile.py
    │   ├── end_to_end.py
    │   ├── launch_cudnn_fp16.py
    │   ├── launch_vectorSparse.py
    │   ├── run.sh
    │   ├── sparse_encoder.py
    │   ├── spattention.py
    │   ├── src
    │   │   ├── cuda
    │   │   │   ├── sddmm.cpp
    │   │   │   ├── sddmm_kernel.cu
    │   │   │   ├── softmax.cpp
    │   │   │   ├── softmax_kernel.cu
    │   │   │   ├── spmm.cpp
    │   │   │   ├── spmm_kernel.cu
    │   │   │   └── spmm_utils
    │   │   │   │   ├── barrier.h
    │   │   │   │   ├── compute_utils.h
    │   │   │   │   ├── dense_tile.h
    │   │   │   │   ├── memory_aligner.h
    │   │   │   │   ├── output_tile.h
    │   │   │   │   └── sparse_tile.h
    │   │   ├── install.sh
    │   │   └── setup.py
    │   └── verify
    │   │   ├── __init__.py
    │   │   ├── bsddmm.py
    │   │   ├── bsoftmax.py
    │   │   ├── bspmm.py
    │   │   ├── sddmm.py
    │   │   ├── softmax.py
    │   │   ├── spmm.py
    │   │   └── static_mask.py
    └── sparse_transformer_magicube
    │   ├── atten_speedup.py
    │   ├── attention.py
    │   ├── cudaprofile.py
    │   ├── end_to_end.py
    │   ├── launch_magicube.py
    │   ├── run.sh
    │   ├── sparse_encoder.py
    │   ├── spattention.py
    │   ├── src
    │       ├── cuda
    │       │   ├── deq_sddmm.cpp
    │       │   ├── deq_sddmm_kernel.cu
    │       │   ├── deq_spmm.cpp
    │       │   ├── deq_spmm.cpp_N128
    │       │   ├── deq_spmm_kernel.cu
    │       │   ├── deq_spmm_kernel.cu_N128
    │       │   ├── q_softmax.cpp
    │       │   ├── q_softmax_kernel.cu
    │       │   ├── quantization.cpp
    │       │   ├── quantization_kernel.cu
    │       │   ├── sddmm.cpp
    │       │   ├── sddmm_kernel.cu
    │       │   ├── sddmm_utils
    │       │   │   ├── compute_utils.h
    │       │   │   ├── lhs_tile.h
    │       │   │   ├── output_tile.h
    │       │   │   └── rhs_tile.h
    │       │   ├── softmax.cpp
    │       │   ├── softmax_kernel.cu
    │       │   ├── spmm.cpp
    │       │   ├── spmm_kernel.cu
    │       │   ├── spmm_utils
    │       │   │   ├── barrier.h
    │       │   │   ├── compute_utils.h
    │       │   │   ├── dense_tile.h
    │       │   │   ├── memory_aligner.h
    │       │   │   ├── output_tile.h
    │       │   │   └── sparse_tile.h
    │       │   └── spmm_utils_N128_bk
    │       │   │   ├── barrier.h
    │       │   │   ├── compute_utils.h
    │       │   │   ├── dense_tile.h
    │       │   │   ├── memory_aligner.h
    │       │   │   ├── output_tile.h
    │       │   │   └── sparse_tile.h
    │       ├── install.sh
    │       └── setup.py
    │   └── verify
    │       ├── __init__.py
    │       ├── bsddmm.py
    │       ├── bsoftmax.py
    │       ├── bspmm.py
    │       ├── sddmm.py
    │       ├── softmax.py
    │       ├── spmm.py
    │       └── static_mask.py
├── magicubeLogo.svg
├── plot
    ├── confinter.py
    ├── examples
    │   ├── magicube_n2n.txt
    │   ├── pytorch_n2n.txt
    │   ├── sddmm_abl_study.txt
    │   ├── sddmm_cublas_fp16.txt
    │   ├── sddmm_cublas_int8.txt
    │   ├── sddmm_magicube_16b16b.txt
    │   ├── sddmm_magicube_4b4b.txt
    │   ├── sddmm_magicube_8b8b.txt
    │   ├── sddmm_vectorSparse.txt
    │   ├── spmm_abl_study.txt
    │   ├── spmm_cublas_fp16.txt
    │   ├── spmm_cublas_int8.txt
    │   ├── spmm_cusparse_fp16.txt
    │   ├── spmm_cusparse_int8.txt
    │   ├── spmm_magicube_16b8b.txt
    │   ├── spmm_magicube_4b4b.txt
    │   ├── spmm_magicube_8b4b.txt
    │   ├── spmm_magicube_8b8b.txt
    │   ├── spmm_pres.txt
    │   ├── spmm_vectorSparse.txt
    │   └── vectorSparse_n2n.txt
    ├── figs
    │   └── .gitignore
    ├── gen_csv.sh
    ├── n2n.py
    ├── plot.sh
    ├── plot_n2n_a.py
    ├── plot_n2n_b.py
    ├── plot_n2n_c.py
    ├── plot_n2n_d.py
    ├── plot_n2n_e.py
    ├── plot_n2n_f.py
    ├── plot_n2n_g.py
    ├── plot_n2n_h.py
    ├── plot_sddmm_abl_study.py
    ├── plot_sddmm_all_matrices.py
    ├── plot_spmm_abl_study.py
    ├── plot_spmm_all_matrices.py
    ├── plot_spmm_pres.py
    ├── sddmm_abl_study.py
    ├── sddmm_all_matrices.py
    ├── spmm_abl_study.py
    ├── spmm_all_matrices.py
    └── spmm_pres.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/Makefile:
--------------------------------------------------------------------------------
 1 | # Build rules for the sddmm_benchmark and spmm_benchmark binaries.
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 4 | 
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | 
24 | # First target in the file, hence the default goal of a bare `make`.
25 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
26 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
27 | 
28 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
30 | 
31 | # Create the object directory on demand. It is used below as an order-only
32 | # prerequisite (after the `|`), so it never triggers rebuilds and is not
33 | # part of $^ or $<.
34 | $(OBJ_DIR):
35 | 	@mkdir -p $@
36 | 
37 | # Compile main file to object file
38 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
39 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
40 | 
41 | 
42 | # Compile CUDA source files to object files
43 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
44 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
45 | 
46 | # `clean` names an action, not a file: keep it runnable even when a file
47 | # called "clean" exists in this directory.
48 | .PHONY: clean
49 | clean:
50 | 	@rm -f $(OBJ_DIR)/*.o
51 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | // GEMM wrapper declarations: each takes a cuBLAS handle, sizes m/n/k and three matrix pointers.
 6 | // float overload. Note d_rhs_matrix precedes d_lhs_matrix; the d_ prefix suggests device memory (TODO confirm).
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // half overload of cublasGeMM with the same parameter order.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // float overload of cublasGeMMT; the trailing T presumably marks a transposed operand (confirm in cublas_gemm.cu).
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // half overload of cublasGeMMT with the same parameter order.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SDDMM/SDDMM/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | // Declares three cudaSddmm overloads that differ only in the element types of lhs/rhs/output.
 5 | namespace sddmm{
 6 | // Overload: half lhs_matrix/rhs_matrix, float output_values; work is issued on the given stream.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload: half lhs_matrix/rhs_matrix, half output_values.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload: float lhs_matrix/rhs_matrix, float output_values.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SDDMM/SDDMM/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declares three cudaSpmm overloads that differ only in the element types of values/rhs/output.
 5 | namespace spmm{
 6 | // Overload: half values and rhs_matrix, float output_matrix. Unlike cudaSddmm, no stream parameter is taken.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values and rhs_matrix, half output_matrix.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values and rhs_matrix, float output_matrix.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SDDMM/SDDMM/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | // Declares wmmaSddmm_4b / _8b / _16b; all three share one parameter list with int-typed matrix pointers.
 5 | namespace sddmm{
 6 | // _4b variant; the suffix presumably names the operand bit width packed into int words (confirm in wmma_sddmm.cu).
 7 | cudaError_t wmmaSddmm_4b(int m_vec, int k, int n,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const int* __restrict__ lhs_matrix,
12 |     const int* __restrict__ rhs_matrix,
13 |     int* __restrict__ output_values, 
14 |     int vec_length);
15 | // _8b variant with the same signature as wmmaSddmm_4b.
16 | cudaError_t wmmaSddmm_8b(int m_vec, int k, int n,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const int* __restrict__ lhs_matrix,
21 |     const int* __restrict__ rhs_matrix,
22 |     int* __restrict__ output_values, 
23 |     int vec_length);
24 | // _16b variant with the same signature as wmmaSddmm_4b.
25 | cudaError_t wmmaSddmm_16b(int m_vec, int k, int n,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const int* __restrict__ lhs_matrix,
30 |     const int* __restrict__ rhs_matrix,
31 |     int* __restrict__ output_values, 
32 |     int vec_length);
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/launch_sddmm_magicube_16b16b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | # Args
 5 | parser = argparse.ArgumentParser(description='lauch the sddmm benchmarks')
 6 | 
 7 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark")
 8 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
 9 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
10 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
11 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
12 | args = parser.parse_args()
13 | 
14 | dataset_dir = os.environ.get('dataset_dir')
15 | sparsities = ['50', '70', '80', '90', '95', '98']
16 | dimKs = [128, 256]
17 | vec_lens = [2, 4, 8]
18 | 
19 | for dimK in dimKs:
20 |     for vec_len in vec_lens:
21 |         for sparsity in sparsities:
22 |             print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity)
23 |         
24 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
25 |             lines = matrix_list.readlines()
26 |             #for i in range(1):
27 |             for i in range(len(lines)):
28 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
29 |                 cmd = './sddmm_benchmark %s %d %d 1 0 1 16 16' % (matrix, dimK, vec_len)
30 |                 os.system(cmd)
31 | 
32 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/launch_sddmm_magicube_4b4b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | # Args
 5 | parser = argparse.ArgumentParser(description='lauch the sddmm benchmarks')
 6 | 
 7 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark")
 8 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
 9 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
10 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
11 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
12 | args = parser.parse_args()
13 | 
14 | dataset_dir = os.environ.get('dataset_dir')
15 | sparsities = ['50', '70', '80', '90', '95', '98']
16 | dimKs = [128, 256]
17 | vec_lens = [2, 4, 8]
18 | 
19 | for dimK in dimKs:
20 |     for vec_len in vec_lens:
21 |         for sparsity in sparsities:
22 |             print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity)
23 |         
24 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
25 |             lines = matrix_list.readlines()
26 |             #for i in range(1):
27 |             for i in range(len(lines)):
28 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
29 |                 cmd = './sddmm_benchmark %s %d %d 1 0 1 4 4' % (matrix, dimK, vec_len)
30 |                 os.system(cmd)
31 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/launch_sddmm_magicube_8b8b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | # Args
 5 | parser = argparse.ArgumentParser(description='lauch the sddmm benchmarks')
 6 | 
 7 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark")
 8 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
 9 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
10 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
11 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
12 | args = parser.parse_args()
13 | 
14 | dataset_dir = os.environ.get('dataset_dir')
15 | sparsities = ['50', '70', '80', '90', '95', '98']
16 | dimKs = [128, 256]
17 | vec_lens = [2, 4, 8]
18 | 
19 | for dimK in dimKs:
20 |     for vec_len in vec_lens:
21 |         for sparsity in sparsities:
22 |             print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity)
23 |         
24 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
25 |             lines = matrix_list.readlines()
26 |             #for i in range(1):
27 |             for i in range(len(lines)):
28 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
29 |                 cmd = './sddmm_benchmark %s %d %d 1 0 1 8 8' % (matrix, dimK, vec_len)
30 |                 os.system(cmd)
31 | 
32 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/run_sddmm_magicube.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | echo "Tesing sddmm_magicube_16b16b"
 3 | python launch_sddmm_magicube_16b16b.py > sddmm_magicube_16b16b.txt
 4 | echo "Finish sddmm_magicube_16b16b"
 5 | 
 6 | echo "Tesing sddmm_magicube_8b8b"
 7 | python launch_sddmm_magicube_8b8b.py > sddmm_magicube_8b8b.txt
 8 | echo "Finish sddmm_magicube_8b8b"
 9 | 
10 | echo "Tesing sddmm_magicube_4b4b"
11 | python launch_sddmm_magicube_4b4b.py > sddmm_magicube_4b4b.txt
12 | echo "Finish sddmm_magicube_4b4b"
13 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # NOTE(review): presumably the Makefile's object directory -- confirm against OBJ_DIR in ./Makefile
2 | make sddmm_benchmark  # builds only the sddmm_benchmark target
3 | 


--------------------------------------------------------------------------------
/SDDMM/SDDMM/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Compile-time integer power: base^exponent, with unsigned 32-bit wraparound.
 9 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
10 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
11 | }
12 | 
13 | // Synchronizes the BlockWidth threads that share one output tile.
14 | //   BlockWidth > 32      : Sync() is a block-wide __syncthreads().
15 | //   1 < BlockWidth <= 32 : Sync() is a __syncwarp() over thread_mask.
16 | //   BlockWidth <= 1      : Sync() does nothing.
17 | template <int Tile_M, int BlockWidth>
18 | struct Barrier{
19 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
20 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
21 |     // Number of BlockWidth-wide subwarps in a 32-lane warp (1 when no subwarp mask is built).
22 |     static constexpr int kSubwarpsPerWarp =
23 |         (kThreadsPerOutputTile > 1 && kThreadsPerOutputTile < 32) ? 32 / kThreadsPerOutputTile : 1;
24 |     // Lanes that take part in __syncwarp(); defaults to the full warp.
25 |     uint32_t thread_mask = 0xffffffff;
26 | 
27 |     // thread_idx_y selects the subwarp (output tile) this thread belongs to.
28 |     // NOTE(review): assumes blockDim.x == BlockWidth and that BlockWidth divides 32 -- confirm at the launch sites.
29 |     __device__ __forceinline__ Barrier(int thread_idx_y){
30 |         // Must mirror the __syncwarp case of Sync(): a subwarp mask is only
31 |         // meaningful for tiles wider than one thread and narrower than a warp.
32 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
33 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
34 |             // Wrap the subwarp index so the shift stays inside the 32-bit mask
35 |             // when a block holds more than one warp's worth of subwarps.
36 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y % kSubwarpsPerWarp) * kThreadsPerOutputTile);
37 |         }
38 |     }
39 | 
40 |     // Barrier across the threads of one output tile (see the table above).
41 |     __device__ __forceinline__ void Sync(){
42 |         if (kThreadsPerOutputTile > 32){
43 |             __syncthreads();
44 |         } else if (kThreadsPerOutputTile > 1){
45 |             __syncwarp(thread_mask);
46 |         }
47 |     }
48 | };
49 | }
50 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore build logs and object files ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC = nvcc
 2 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 3 | 
 4 | 
 5 | ##################################################################
 6 | 
 7 | ## Project file structure ##
 8 | 
 9 | # Source file directory:
10 | SRC_DIR = src
11 | 
12 | # Object file directory:
13 | OBJ_DIR = bin
14 | 
15 | # Include header file directory
16 | INC_DIR = include
17 | 
18 | 
19 | ##################################################################
20 | 
21 | ## Compile ##
22 | 
23 | .PHONY: clean
24 | 
25 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
26 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
27 | 
28 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
30 | 
31 | # Create the object directory on demand. It is listed as an order-only
32 | # prerequisite (after the `|`) so its timestamp never forces a rebuild.
33 | $(OBJ_DIR):
34 | 	@mkdir -p $@
35 | 
36 | # Compile main file to object file
37 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
38 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
39 | 
40 | # Compile CUDA source files to object files
41 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
42 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
43 | 
44 | # Remove object files only; the linked benchmark binaries are left in place.
45 | clean:
46 | 	@rm -f $(OBJ_DIR)/*.o
47 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // cublasGeMM, float overload. In every declaration below the rhs pointer precedes the lhs pointer.
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // cublasGeMM, half overload.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload. NOTE(review): the T suffix presumably marks a transposed operand -- confirm in src/cublas_gemm.cu.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half overload.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{  // cudaSddmm overloads: identical parameter lists apart from the lhs/rhs/output element types.
 6 | // half lhs/rhs, float output.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // half lhs/rhs, half output.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // float lhs/rhs, float output.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | 
 5 | namespace spmm{  // cudaSpmm overloads: identical parameter lists apart from the values/rhs/output element types.
 6 | // half values/rhs, float output.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // half values/rhs, half output.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // float values/rhs, float output.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{  // wmmaSddmm_* entry points; the three declarations below share one parameter list and return a cudaError_t.
 6 | // _4b variant. NOTE(review): the suffix presumably names the operand bit width -- confirm in src/wmma_sddmm.cu.
 7 | cudaError_t wmmaSddmm_4b(int m_vec, int k, int n,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const int* __restrict__ lhs_matrix,  // NOTE(review): int-typed in every variant; presumably packed narrower values -- confirm
12 |     const int* __restrict__ rhs_matrix,
13 |     int* __restrict__ output_values, 
14 |     int vec_length);
15 | // _8b variant; same parameter list as wmmaSddmm_4b.
16 | cudaError_t wmmaSddmm_8b(int m_vec, int k, int n,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const int* __restrict__ lhs_matrix,
21 |     const int* __restrict__ rhs_matrix,
22 |     int* __restrict__ output_values, 
23 |     int vec_length);
24 | // _16b variant; same parameter list as wmmaSddmm_4b.
25 | cudaError_t wmmaSddmm_16b(int m_vec, int k, int n,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const int* __restrict__ lhs_matrix,
30 |     const int* __restrict__ rhs_matrix,
31 |     int* __restrict__ output_values, 
32 |     int vec_length);
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # object directory used by this directory's Makefile (OBJ_DIR = bin)
2 | make sddmm_benchmark  # builds only the sddmm_benchmark target
3 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_basic/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Compile-time integer power: base^exponent, with unsigned 32-bit wraparound.
 9 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
10 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
11 | }
12 | 
13 | // Synchronizes the BlockWidth threads that share one output tile.
14 | //   BlockWidth > 32      : Sync() is a block-wide __syncthreads().
15 | //   1 < BlockWidth <= 32 : Sync() is a __syncwarp() over thread_mask.
16 | //   BlockWidth <= 1      : Sync() does nothing.
17 | template <int Tile_M, int BlockWidth>
18 | struct Barrier{
19 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
20 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
21 |     // Number of BlockWidth-wide subwarps in a 32-lane warp (1 when no subwarp mask is built).
22 |     static constexpr int kSubwarpsPerWarp =
23 |         (kThreadsPerOutputTile > 1 && kThreadsPerOutputTile < 32) ? 32 / kThreadsPerOutputTile : 1;
24 |     // Lanes that take part in __syncwarp(); defaults to the full warp.
25 |     uint32_t thread_mask = 0xffffffff;
26 | 
27 |     // thread_idx_y selects the subwarp (output tile) this thread belongs to.
28 |     // NOTE(review): assumes blockDim.x == BlockWidth and that BlockWidth divides 32 -- confirm at the launch sites.
29 |     __device__ __forceinline__ Barrier(int thread_idx_y){
30 |         // Must mirror the __syncwarp case of Sync(): a subwarp mask is only
31 |         // meaningful for tiles wider than one thread and narrower than a warp.
32 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
33 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
34 |             // Wrap the subwarp index so the shift stays inside the 32-bit mask
35 |             // when a block holds more than one warp's worth of subwarps.
36 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y % kSubwarpsPerWarp) * kThreadsPerOutputTile);
37 |         }
38 |     }
39 | 
40 |     // Barrier across the threads of one output tile (see the table above).
41 |     __device__ __forceinline__ void Sync(){
42 |         if (kThreadsPerOutputTile > 32){
43 |             __syncthreads();
44 |         } else if (kThreadsPerOutputTile > 1){
45 |             __syncwarp(thread_mask);
46 |         }
47 |     }
48 | };
49 | }
50 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore build logs and object files ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC = nvcc
 2 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 3 | 
 4 | 
 5 | ##################################################################
 6 | 
 7 | ## Project file structure ##
 8 | 
 9 | # Source file directory:
10 | SRC_DIR = src
11 | 
12 | # Object file directory:
13 | OBJ_DIR = bin
14 | 
15 | # Include header file directory
16 | INC_DIR = include
17 | 
18 | 
19 | ##################################################################
20 | 
21 | ## Compile ##
22 | 
23 | .PHONY: clean
24 | 
25 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
26 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
27 | 
28 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
30 | 
31 | # Create the object directory on demand. It is listed as an order-only
32 | # prerequisite (after the `|`) so its timestamp never forces a rebuild.
33 | $(OBJ_DIR):
34 | 	@mkdir -p $@
35 | 
36 | # Compile main file to object file
37 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
38 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
39 | 
40 | # Compile CUDA source files to object files
41 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
42 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
43 | 
44 | # Remove object files only; the linked benchmark binaries are left in place.
45 | clean:
46 | 	@rm -f $(OBJ_DIR)/*.o
47 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // cublasGeMM, float overload. In every declaration below the rhs pointer precedes the lhs pointer.
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // cublasGeMM, half overload.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload. NOTE(review): the T suffix presumably marks a transposed operand -- confirm in src/cublas_gemm.cu.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half overload.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{  // cudaSddmm overloads: identical parameter lists apart from the lhs/rhs/output element types.
 6 | // half lhs/rhs, float output.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // half lhs/rhs, half output.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // float lhs/rhs, float output.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | 
 5 | namespace spmm{  // cudaSpmm overloads: identical parameter lists apart from the values/rhs/output element types.
 6 | // half values/rhs, float output.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // half values/rhs, half output.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // float values/rhs, float output.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{  // wmmaSddmm_* entry points; the three declarations below share one parameter list and return a cudaError_t.
 6 | // _4b variant. NOTE(review): the suffix presumably names the operand bit width -- confirm in src/wmma_sddmm.cu.
 7 | cudaError_t wmmaSddmm_4b(int m_vec, int k, int n,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const int* __restrict__ lhs_matrix,  // NOTE(review): int-typed in every variant; presumably packed narrower values -- confirm
12 |     const int* __restrict__ rhs_matrix,
13 |     int* __restrict__ output_values, 
14 |     int vec_length);
15 | // _8b variant; same parameter list as wmmaSddmm_4b.
16 | cudaError_t wmmaSddmm_8b(int m_vec, int k, int n,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const int* __restrict__ lhs_matrix,
21 |     const int* __restrict__ rhs_matrix,
22 |     int* __restrict__ output_values, 
23 |     int vec_length);
24 | // _16b variant; same parameter list as wmmaSddmm_4b.
25 | cudaError_t wmmaSddmm_16b(int m_vec, int k, int n,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const int* __restrict__ lhs_matrix,
30 |     const int* __restrict__ rhs_matrix,
31 |     int* __restrict__ output_values, 
32 |     int vec_length);
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # object directory used by this directory's Makefile (OBJ_DIR = bin)
2 | make sddmm_benchmark  # builds only the sddmm_benchmark target
3 | 


--------------------------------------------------------------------------------
/SDDMM/ablation_study/SDDMM_lhs_pref/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Compile-time integer power: base^exponent, with unsigned 32-bit wraparound.
 9 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
10 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
11 | }
12 | 
13 | // Synchronizes the BlockWidth threads that share one output tile.
14 | //   BlockWidth > 32      : Sync() is a block-wide __syncthreads().
15 | //   1 < BlockWidth <= 32 : Sync() is a __syncwarp() over thread_mask.
16 | //   BlockWidth <= 1      : Sync() does nothing.
17 | template <int Tile_M, int BlockWidth>
18 | struct Barrier{
19 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
20 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
21 |     // Number of BlockWidth-wide subwarps in a 32-lane warp (1 when no subwarp mask is built).
22 |     static constexpr int kSubwarpsPerWarp =
23 |         (kThreadsPerOutputTile > 1 && kThreadsPerOutputTile < 32) ? 32 / kThreadsPerOutputTile : 1;
24 |     // Lanes that take part in __syncwarp(); defaults to the full warp.
25 |     uint32_t thread_mask = 0xffffffff;
26 | 
27 |     // thread_idx_y selects the subwarp (output tile) this thread belongs to.
28 |     // NOTE(review): assumes blockDim.x == BlockWidth and that BlockWidth divides 32 -- confirm at the launch sites.
29 |     __device__ __forceinline__ Barrier(int thread_idx_y){
30 |         // Must mirror the __syncwarp case of Sync(): a subwarp mask is only
31 |         // meaningful for tiles wider than one thread and narrower than a warp.
32 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
33 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
34 |             // Wrap the subwarp index so the shift stays inside the 32-bit mask
35 |             // when a block holds more than one warp's worth of subwarps.
36 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y % kSubwarpsPerWarp) * kThreadsPerOutputTile);
37 |         }
38 |     }
39 | 
40 |     // Barrier across the threads of one output tile (see the table above).
41 |     __device__ __forceinline__ void Sync(){
42 |         if (kThreadsPerOutputTile > 32){
43 |             __syncthreads();
44 |         } else if (kThreadsPerOutputTile > 1){
45 |             __syncwarp(thread_mask);
46 |         }
47 |     }
48 | };
49 | }
50 | #endif


--------------------------------------------------------------------------------
/SDDMM/ablation_study/compile_jobs.sh:
--------------------------------------------------------------------------------
 1 | # Build both SDDMM ablation variants. Each build runs in its own subshell, so a
 2 | # failed `cd` cannot make the next step run in the wrong directory, and the
 3 | # success message is printed only when setup.sh exits with status 0.
 4 | # Only the execute bit is added to setup.sh; world-writable 777 is not needed to run it.
 5 | (cd ./SDDMM_basic && chmod +x setup.sh && ./setup.sh) && echo "SDDMM basic is compiled."
 6 | 
 7 | (cd ./SDDMM_lhs_pref && chmod +x setup.sh && ./setup.sh) && echo "SDDMM with LHS prefetch is compiled."
 8 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore build logs and object files ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC = nvcc
 2 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 
 3 | 
 4 | 
 5 | ##################################################################
 6 | 
 7 | ## Project file structure ##
 8 | 
 9 | # Source file directory:
10 | SRC_DIR = src
11 | 
12 | # Object file directory:
13 | OBJ_DIR = bin
14 | 
15 | # Include header file directory
16 | INC_DIR = include
17 | 
18 | 
19 | ##################################################################
20 | 
21 | ## Compile ##
22 | 
23 | .PHONY: clean
24 | 
25 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
26 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
27 | 
28 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
30 | 
31 | # Create the object directory on demand. It is listed as an order-only
32 | # prerequisite (after the `|`) so its timestamp never forces a rebuild.
33 | $(OBJ_DIR):
34 | 	@mkdir -p $@
35 | 
36 | # Compile main file to object file
37 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
38 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
39 | 
40 | # Compile CUDA source files to object files
41 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
42 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
43 | 
44 | # Remove object files only; the linked benchmark binaries are left in place.
45 | clean:
46 | 	@rm -f $(OBJ_DIR)/*.o
47 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // cublasGeMM, float overload. In every declaration below the rhs pointer precedes the lhs pointer.
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // cublasGeMM, half overload.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload. NOTE(review): the T suffix presumably marks a transposed operand -- confirm in src/cublas_gemm.cu.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half overload.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/SpMM/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{  // cudaSddmm overloads: identical parameter lists apart from the lhs/rhs/output element types.
 6 | // half lhs/rhs, float output.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // half lhs/rhs, half output.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // float lhs/rhs, float output.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/SpMM/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declarations of the cudaSpmm overloads; they differ only in the element types of values/rhs_matrix and output_matrix.
 5 | namespace spmm{
 6 | // Overload: half values/rhs inputs, float output_matrix.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values/rhs inputs, half output_matrix.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values/rhs inputs, float output_matrix.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/SpMM/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | // Declarations of the wmmaSddmm overloads; same parameters as cudaSddmm plus a trailing `algorithm` selector (values not documented in this header).
 5 | namespace sddmm{
 6 | // Overload: half lhs/rhs inputs, float output_values.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | // Overload: half lhs/rhs inputs, half output_values
16 | // (same dimension, index and stream parameters as the overload above).
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload: float lhs/rhs inputs, float output_values.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/SpMM/launch_spmm_magicube_16b8b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 1 16 8' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/launch_spmm_magicube_4b4b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 1 4 4' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/launch_spmm_magicube_8b4b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 1 8 4' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/launch_spmm_magicube_8b8b.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 1 8 8' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/run_spmm_magicube.sh:
--------------------------------------------------------------------------------
 1 | # Runs the four launch_spmm_magicube_*.py scripts in sequence; each script's stdout is redirected to its own .txt file.
 2 | echo "Tesing spmm_magicube_16b8b"
 3 | python launch_spmm_magicube_16b8b.py > spmm_magicube_16b8b.txt
 4 | echo "Finish spmm_magicube_16b8b"
 5 | # --- spmm_magicube_8b8b ---
 6 | echo "Tesing spmm_magicube_8b8b"
 7 | python launch_spmm_magicube_8b8b.py > spmm_magicube_8b8b.txt
 8 | echo "Finish spmm_magicube_8b8b"
 9 | # --- spmm_magicube_8b4b ---
10 | echo "Tesing spmm_magicube_8b4b"
11 | python launch_spmm_magicube_8b4b.py > spmm_magicube_8b4b.txt
12 | echo "Finish spmm_magicube_8b4b"
13 | # --- spmm_magicube_4b4b ---
14 | echo "Tesing spmm_magicube_4b4b"
15 | python launch_spmm_magicube_4b4b.py > spmm_magicube_4b4b.txt
16 | echo "Finish spmm_magicube_4b4b"
17 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # create ./bin before building; -p makes this a no-op if it already exists
2 | make spmm_benchmark  # build the spmm_benchmark target of the Makefile in this directory
3 | 


--------------------------------------------------------------------------------
/SpMM/SpMM/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // Compile-time base^exponent by recursion (single return statement, as C++11 constexpr requires); uint32_t arithmetic wraps modulo 2^32.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent != 0 ? StaticPow(base, exponent - 1) * base : 1;
10 | }
11 | 
12 | // Synchronizes the threads that cooperate on one output tile.
13 | //
14 | // Tile_M * BlockWidth gives kThreadsPerBlock; BlockWidth is the number of
15 | // threads per output tile (kThreadsPerOutputTile). thread_mask starts as the
16 | // full-warp mask 0xffffffff and is narrowed by the constructor for sub-warp
17 | // tiles.
18 | template <int Tile_M, int BlockWidth>
19 | struct Barrier{
20 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
21 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
22 |     uint32_t thread_mask = 0xffffffff;
23 | 
24 |     // Computes the __syncwarp mask for the tile selected by thread_idx_y.
25 |     // NOTE(review): assumes tile y occupies lanes
26 |     // [y * BlockWidth, (y + 1) * BlockWidth) modulo 32 and that BlockWidth
27 |     // divides 32 - confirm against the kernel launch configuration.
28 |     __device__ __forceinline__ Barrier(int thread_idx_y){
29 |         // Narrow the mask only for sub-warp tiles (2..31 threads). The test
30 |         // used to read `kThreadsPerOutputTile < 1`, which is never true for a
31 |         // positive width, so the mask was never narrowed and every sub-warp
32 |         // synchronized on the full-warp mask.
33 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
34 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
35 |             // Wrap the shift into 0..31: shifting a 32-bit value by 32 or
36 |             // more bits is undefined.
37 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
38 |         }
39 |     }
40 | 
41 |     // Waits for the other threads of the tile: __syncthreads() when a tile
42 |     // has more than 32 threads, __syncwarp(thread_mask) when it has 2..32
43 |     // threads, and nothing when a tile is a single thread.
44 |     __device__ __forceinline__ void Sync(){
45 |         if (kThreadsPerOutputTile > 32){
46 |             __syncthreads();
47 |         } else if (kThreadsPerOutputTile > 1){
48 |             __syncwarp(thread_mask);
49 |         }
50 |     }
51 | };
52 | } // namespace spmm
53 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 
 4 | 
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | # Link each benchmark executable from its object files ($^ = all prerequisites, $@ = target).
24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
25 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
26 | 
27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
28 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
29 | 
30 | # Compile main file to object file
31 | $(OBJ_DIR)/%.o : %.cpp
32 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
33 | 
34 | # An object is rebuilt when either its .cu source or its same-named .cuh header changes.
35 | # Compile CUDA source files to object files
36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh
37 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
38 | .PHONY: clean  # clean creates no file; phony keeps it runnable even if a file named "clean" exists
39 | clean:
40 | 	@rm -f $(OBJ_DIR)/*.o
41 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | // Declarations of the cublasGeMM / cublasGeMMT wrappers, each overloaded for float and half buffers. Note the argument order: d_rhs_matrix precedes d_lhs_matrix.
 6 | // cublasGeMM, float overload.
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // cublasGeMM, half overload.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload (NOTE(review): the T suffix presumably denotes a transposed operand - confirm in cublas_gemm.cu).
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half overload.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | // Declarations of the cudaSddmm overloads; they differ only in the element types of lhs_matrix/rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half lhs/rhs inputs, float output_values.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload: half lhs/rhs inputs, half output_values.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload: float lhs/rhs inputs, float output_values.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declarations of the cudaSpmm overloads; they differ only in the element types of values/rhs_matrix and output_matrix.
 5 | namespace spmm{
 6 | // Overload: half values/rhs inputs, float output_matrix.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values/rhs inputs, half output_matrix.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values/rhs inputs, float output_matrix.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | // Declarations of the wmmaSddmm overloads; same parameters as cudaSddmm plus a trailing `algorithm` selector (values not documented in this header).
 5 | namespace sddmm{
 6 | // Overload: half lhs/rhs inputs, float output_values.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | // Overload: half lhs/rhs inputs, half output_values
16 | // (same dimension, index and stream parameters as the overload above).
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload: float lhs/rhs inputs, float output_values.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | # Runs spmm_benchmark four times on one rn50 random_pruning matrix (the 0.7 and 0.9 directories); paths are built from ${dataset_dir}.
 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 3 | # For each directory the third argument is 2, then 8; every run ends with "16 8", matching the L16-R8 label.
 4 | echo -e "L16-R8 \n"
 5 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 16 8
 6 | echo -e "\n"
 7 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 16 8
 8 | echo -e "\n"
 9 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 16 8
10 | echo -e "\n"
11 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 16 8
12 | echo -e "\n"
13 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # create ./bin before building; -p makes this a no-op if it already exists
2 | make spmm_benchmark  # build the spmm_benchmark target of the Makefile in this directory
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // Compile-time base^exponent by recursion (single return statement, as C++11 constexpr requires); uint32_t arithmetic wraps modulo 2^32.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent != 0 ? StaticPow(base, exponent - 1) * base : 1;
10 | }
11 | 
12 | // Synchronizes the threads that cooperate on one output tile.
13 | //
14 | // Tile_M * BlockWidth gives kThreadsPerBlock; BlockWidth is the number of
15 | // threads per output tile (kThreadsPerOutputTile). thread_mask starts as the
16 | // full-warp mask 0xffffffff and is narrowed by the constructor for sub-warp
17 | // tiles.
18 | template <int Tile_M, int BlockWidth>
19 | struct Barrier{
20 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
21 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
22 |     uint32_t thread_mask = 0xffffffff;
23 | 
24 |     // Computes the __syncwarp mask for the tile selected by thread_idx_y.
25 |     // NOTE(review): assumes tile y occupies lanes
26 |     // [y * BlockWidth, (y + 1) * BlockWidth) modulo 32 and that BlockWidth
27 |     // divides 32 - confirm against the kernel launch configuration.
28 |     __device__ __forceinline__ Barrier(int thread_idx_y){
29 |         // Narrow the mask only for sub-warp tiles (2..31 threads). The test
30 |         // used to read `kThreadsPerOutputTile < 1`, which is never true for a
31 |         // positive width, so the mask was never narrowed and every sub-warp
32 |         // synchronized on the full-warp mask.
33 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
34 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
35 |             // Wrap the shift into 0..31: shifting a 32-bit value by 32 or
36 |             // more bits is undefined.
37 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
38 |         }
39 |     }
40 | 
41 |     // Waits for the other threads of the tile: __syncthreads() when a tile
42 |     // has more than 32 threads, __syncwarp(thread_mask) when it has 2..32
43 |     // threads, and nothing when a tile is a single thread.
44 |     __device__ __forceinline__ void Sync(){
45 |         if (kThreadsPerOutputTile > 32){
46 |             __syncthreads();
47 |         } else if (kThreadsPerOutputTile > 1){
48 |             __syncwarp(thread_mask);
49 |         }
50 |     }
51 | };
52 | } // namespace spmm
53 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | # Ad-hoc spmm_benchmark runs on one rn50 matrix. The dataset root defaults to the original location; set dataset_dir (as run_jobs.sh expects) to override it.
2 | matrix="${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx"
3 | ./spmm_benchmark  "$matrix" 512 8 0 0 1 1 1  # NOTE(review): one fewer trailing argument than the runs below - confirm this is still a valid invocation
4 | ./spmm_benchmark  "$matrix" 512 4 0 1 1 1 8 8
5 | ./spmm_benchmark  "$matrix" 512 4 0 1 1 1 4 4
6 | ./spmm_benchmark  "$matrix" 512 8 0 1 1 1 4 4
7 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  "$matrix" 512 8 0 1 1 1 4 4
8 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  "$matrix" 512 8 0 1 1 1 8 4


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 4 | 
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | # Link each benchmark executable from its object files ($^ = all prerequisites, $@ = target).
24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
25 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
26 | 
27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
28 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
29 | 
30 | # Compile main file to object file
31 | $(OBJ_DIR)/%.o : %.cpp
32 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
33 | 
34 | # An object is rebuilt when either its .cu source or its same-named .cuh header changes.
35 | # Compile CUDA source files to object files
36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh
37 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
38 | .PHONY: clean  # clean creates no file; phony keeps it runnable even if a file named "clean" exists
39 | clean:
40 | 	@rm -f $(OBJ_DIR)/*.o
41 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | // Declarations of the cublasGeMM / cublasGeMMT wrappers, each overloaded for float and half buffers. Note the argument order: d_rhs_matrix precedes d_lhs_matrix.
 6 | // cublasGeMM, float overload.
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // cublasGeMM, half overload.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload (NOTE(review): the T suffix presumably denotes a transposed operand - confirm in cublas_gemm.cu).
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half overload.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | // Declarations of the cudaSddmm overloads; they differ only in the element types of lhs_matrix/rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half lhs/rhs inputs, float output_values.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload: half lhs/rhs inputs, half output_values.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload: float lhs/rhs inputs, float output_values.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declarations of the cudaSpmm overloads; they differ only in the element types of values/rhs_matrix and output_matrix.
 5 | namespace spmm{
 6 | // Overload: half values/rhs inputs, float output_matrix.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values/rhs inputs, half output_matrix.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values/rhs inputs, float output_matrix.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | 
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | 
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | 
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs ./spmm_benchmark on the 0.7 and 0.9 random_pruning variants of one rn50 matrix.
 3 | # dataset_dir must name the directory that contains rn50/; abort early when it is
 4 | # missing instead of handing the benchmark a path that starts at "/rn50".
 5 | : "${dataset_dir:?dataset_dir must be set to the directory that contains rn50/}"
 6 | 
 7 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 8 | 
 9 | echo -e "L16-R8 \n"
10 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 16 8
11 | echo -e "\n"
12 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 16 8
13 | echo -e "\n"
14 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 16 8
15 | echo -e "\n"
16 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 16 8
17 | echo -e "\n"
18 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin
2 | make spmm_benchmark
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Compile-time integer power: base^exponent, with StaticPow(b, 0) == 1.
 9 | // Uses unsigned 32-bit arithmetic, so the result wraps modulo 2^32.
10 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
11 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
12 | }
13 | 
14 | // Barrier for the BlockWidth threads that cooperate on one output tile.
15 | //
16 | // Tile_M     - multiplied with BlockWidth to give the threads per block.
17 | // BlockWidth - number of threads that synchronize together in Sync().
18 | template <int Tile_M, int BlockWidth>
19 | struct Barrier{
20 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
21 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
22 |     // Lane mask handed to __syncwarp(); defaults to every lane of the warp.
23 |     uint32_t thread_mask = 0xffffffff;
24 | 
25 |     // Builds the lane mask for the tile this thread belongs to.
26 |     //
27 |     // thread_idx_y - index of this thread's tile within the thread block.
28 |     // NOTE(review): assumes callers pass threadIdx.y and launch with
29 |     // blockDim.x == BlockWidth, so a tile occupies consecutive lanes - confirm
30 |     // against the kernels that instantiate this struct.
31 |     __device__ __forceinline__ Barrier(int thread_idx_y){
32 |         // Only tiles narrower than a warp (but wider than one thread) need a
33 |         // restricted mask. The lower bound has to be `> 1`: a `< 1` test can
34 |         // never hold for a positive width and would leave this branch dead.
35 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
36 |             // BlockWidth consecutive low bits set, e.g. 0xff for BlockWidth == 8.
37 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
38 |             // Reduce the shift modulo the warp size so that tiles living in a
39 |             // later warp of the block still get a mask inside the 32 lane bits.
40 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
41 |         }
42 |     }
43 | 
44 |     // Synchronizes the threads of one output tile:
45 |     //   more than 32 threads -> block-wide __syncthreads()
46 |     //   2..32 threads        -> __syncwarp() restricted to thread_mask
47 |     //   a single thread      -> nothing to wait for
48 |     __device__ __forceinline__ void Sync(){
49 |         if (kThreadsPerOutputTile > 32){
50 |             __syncthreads();
51 |         } else if (kThreadsPerOutputTile > 1){
52 |             __syncwarp(thread_mask);
53 |         }
54 |     }
55 | };
56 | } // namespace spmm
57 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/16b8b/SpMM_conflict_free_prefetch/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1
2 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8
3 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4
4 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
6 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4
7 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 
 4 | 
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | 
24 | # "clean" names an action, not a file, so its recipe must always run.
25 | .PHONY: clean
26 | 
27 | # Link the SDDMM benchmark from its object files.
28 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
30 | 
31 | # Link the SpMM benchmark from its object files.
32 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
33 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
34 | 
35 | # Create the object directory on demand; the object rules below depend on it
36 | # as an order-only prerequisite, so its timestamp never forces a rebuild.
37 | $(OBJ_DIR):
38 | 	@mkdir -p $@
39 | 
40 | # Compile main file to object file
41 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
42 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
43 | 
44 | 
45 | # Compile CUDA source files to object files
46 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
47 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
48 | 
49 | # Remove object files (the linked benchmark binaries are left in place).
50 | clean:
51 | 	@rm -f $(OBJ_DIR)/*.o
52 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | 
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | 
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | 
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | 
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | 
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | 
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | 
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | 
 5 | namespace spmm{
 6 | 
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | 
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | 
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | 
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | 
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | 
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs ./spmm_benchmark on the 0.7 and 0.9 random_pruning variants of one rn50 matrix.
 3 | # dataset_dir must name the directory that contains rn50/; abort early when it is
 4 | # missing instead of handing the benchmark a path that starts at "/rn50".
 5 | : "${dataset_dir:?dataset_dir must be set to the directory that contains rn50/}"
 6 | 
 7 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 8 | 
 9 | 
10 | echo -e "L4-R4 \n"
11 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 4 4
12 | echo -e "\n"
13 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
14 | echo -e "\n"
15 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 4 4
16 | echo -e "\n"
17 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
18 | echo -e "\n"
19 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin
2 | make spmm_benchmark
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Compile-time integer power: base^exponent, with StaticPow(b, 0) == 1.
 9 | // Uses unsigned 32-bit arithmetic, so the result wraps modulo 2^32.
10 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
11 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
12 | }
13 | 
14 | // Barrier for the BlockWidth threads that cooperate on one output tile.
15 | //
16 | // Tile_M     - multiplied with BlockWidth to give the threads per block.
17 | // BlockWidth - number of threads that synchronize together in Sync().
18 | template <int Tile_M, int BlockWidth>
19 | struct Barrier{
20 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
21 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
22 |     // Lane mask handed to __syncwarp(); defaults to every lane of the warp.
23 |     uint32_t thread_mask = 0xffffffff;
24 | 
25 |     // Builds the lane mask for the tile this thread belongs to.
26 |     //
27 |     // thread_idx_y - index of this thread's tile within the thread block.
28 |     // NOTE(review): assumes callers pass threadIdx.y and launch with
29 |     // blockDim.x == BlockWidth, so a tile occupies consecutive lanes - confirm
30 |     // against the kernels that instantiate this struct.
31 |     __device__ __forceinline__ Barrier(int thread_idx_y){
32 |         // Only tiles narrower than a warp (but wider than one thread) need a
33 |         // restricted mask. The lower bound has to be `> 1`: a `< 1` test can
34 |         // never hold for a positive width and would leave this branch dead.
35 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
36 |             // BlockWidth consecutive low bits set, e.g. 0xff for BlockWidth == 8.
37 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
38 |             // Reduce the shift modulo the warp size so that tiles living in a
39 |             // later warp of the block still get a mask inside the 32 lane bits.
40 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
41 |         }
42 |     }
43 | 
44 |     // Synchronizes the threads of one output tile:
45 |     //   more than 32 threads -> block-wide __syncthreads()
46 |     //   2..32 threads        -> __syncwarp() restricted to thread_mask
47 |     //   a single thread      -> nothing to wait for
48 |     __device__ __forceinline__ void Sync(){
49 |         if (kThreadsPerOutputTile > 32){
50 |             __syncthreads();
51 |         } else if (kThreadsPerOutputTile > 1){
52 |             __syncwarp(thread_mask);
53 |         }
54 |     }
55 | };
56 | } // namespace spmm
57 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1
2 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8
3 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4
4 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
6 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4
7 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 4 | 
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | 
24 | # "clean" names an action, not a file, so its recipe must always run.
25 | .PHONY: clean
26 | 
27 | # Link the SDDMM benchmark from its object files.
28 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
30 | 
31 | # Link the SpMM benchmark from its object files.
32 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
33 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
34 | 
35 | # Create the object directory on demand; the object rules below depend on it
36 | # as an order-only prerequisite, so its timestamp never forces a rebuild.
37 | $(OBJ_DIR):
38 | 	@mkdir -p $@
39 | 
40 | # Compile main file to object file
41 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
42 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
43 | 
44 | 
45 | # Compile CUDA source files to object files
46 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
47 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
48 | 
49 | # Remove object files (the linked benchmark binaries are left in place).
50 | clean:
51 | 	@rm -f $(OBJ_DIR)/*.o
52 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | 
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | 
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | 
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | 
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | 
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | 
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | 
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | 
 5 | namespace spmm{
 6 | 
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | 
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | 
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | 
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | 
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | 
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs ./spmm_benchmark on the 0.7 and 0.9 random_pruning variants of one rn50 matrix.
 3 | # dataset_dir must name the directory that contains rn50/; abort early when it is
 4 | # missing instead of handing the benchmark a path that starts at "/rn50".
 5 | : "${dataset_dir:?dataset_dir must be set to the directory that contains rn50/}"
 6 | 
 7 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 8 | 
 9 | 
10 | echo -e "L4-R4 \n"
11 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 4 4
12 | echo -e "\n"
13 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
14 | echo -e "\n"
15 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 4 4
16 | echo -e "\n"
17 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
18 | echo -e "\n"
19 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin
2 | make spmm_benchmark
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Compile-time integer power: base^exponent, with StaticPow(b, 0) == 1.
 9 | // Uses unsigned 32-bit arithmetic, so the result wraps modulo 2^32.
10 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
11 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
12 | }
13 | 
14 | // Barrier for the BlockWidth threads that cooperate on one output tile.
15 | //
16 | // Tile_M     - multiplied with BlockWidth to give the threads per block.
17 | // BlockWidth - number of threads that synchronize together in Sync().
18 | template <int Tile_M, int BlockWidth>
19 | struct Barrier{
20 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
21 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
22 |     // Lane mask handed to __syncwarp(); defaults to every lane of the warp.
23 |     uint32_t thread_mask = 0xffffffff;
24 | 
25 |     // Builds the lane mask for the tile this thread belongs to.
26 |     //
27 |     // thread_idx_y - index of this thread's tile within the thread block.
28 |     // NOTE(review): assumes callers pass threadIdx.y and launch with
29 |     // blockDim.x == BlockWidth, so a tile occupies consecutive lanes - confirm
30 |     // against the kernels that instantiate this struct.
31 |     __device__ __forceinline__ Barrier(int thread_idx_y){
32 |         // Only tiles narrower than a warp (but wider than one thread) need a
33 |         // restricted mask. The lower bound has to be `> 1`: a `< 1` test can
34 |         // never hold for a positive width and would leave this branch dead.
35 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
36 |             // BlockWidth consecutive low bits set, e.g. 0xff for BlockWidth == 8.
37 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
38 |             // Reduce the shift modulo the warp size so that tiles living in a
39 |             // later warp of the block still get a mask inside the 32 lane bits.
40 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
41 |         }
42 |     }
43 | 
44 |     // Synchronizes the threads of one output tile:
45 |     //   more than 32 threads -> block-wide __syncthreads()
46 |     //   2..32 threads        -> __syncwarp() restricted to thread_mask
47 |     //   a single thread      -> nothing to wait for
48 |     __device__ __forceinline__ void Sync(){
49 |         if (kThreadsPerOutputTile > 32){
50 |             __syncthreads();
51 |         } else if (kThreadsPerOutputTile > 1){
52 |             __syncwarp(thread_mask);
53 |         }
54 |     }
55 | };
56 | } // namespace spmm
57 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1
2 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8
3 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4
4 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
6 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4
7 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 4 | 
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | 
24 | # "clean" names an action, not a file, so its recipe must always run.
25 | .PHONY: clean
26 | 
27 | # Link the SDDMM benchmark from its object files.
28 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
30 | 
31 | # Link the SpMM benchmark from its object files.
32 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
33 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
34 | 
35 | # Create the object directory on demand; the object rules below depend on it
36 | # as an order-only prerequisite, so its timestamp never forces a rebuild.
37 | $(OBJ_DIR):
38 | 	@mkdir -p $@
39 | 
40 | # Compile main file to object file
41 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
42 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
43 | 
44 | 
45 | # Compile CUDA source files to object files
46 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
47 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
48 | 
49 | # Remove object files (the linked benchmark binaries are left in place).
50 | clean:
51 | 	@rm -f $(OBJ_DIR)/*.o
52 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // NOTE(review): presumably thin wrappers over cuBLAS GEMM defined in src/cublas_gemm.cu (not shown). The rhs pointer is listed before the lhs pointer. Float overload:
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // Same parameter list with half buffers.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload. NOTE(review): the "T" suffix presumably marks a transposed operand; confirm in the .cu file.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT with half buffers.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | // Declares three cudaSddmm overloads that differ only in the element types of lhs_matrix, rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half inputs, float output. NOTE(review): the index arrays presumably describe the sparse output pattern; confirm in src/cuda_sddmm.cu.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload: half inputs, half output.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload: float inputs, float output.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declares three cudaSpmm overloads that differ only in the element types of values, rhs_matrix and output_matrix.
 5 | namespace spmm{
 6 | // Overload: half values/rhs, float output. NOTE(review): the index arrays presumably locate the entries stored in values; confirm in src/cuda_spmm.cu.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values/rhs, half output.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values/rhs, float output.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | // Declares three wmmaSddmm overloads that differ only in the element types of lhs_matrix, rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half inputs, float output. NOTE(review): `algorithm` presumably selects a kernel variant; confirm in src/wmma_sddmm.cu.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | // Overload: half inputs, half output.
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload: float inputs, float output.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | : "${dataset_dir?set dataset_dir to the dataset root that contains rn50/ before running}"
 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 3 | # Runs spmm_benchmark on the rn50/random_pruning 0.7 and 0.9 matrices; the trailing "4 4" matches the L4-R4 label.
 4 | # NOTE(review): the meaning of the numeric arguments is defined by spmm_benchmark.cpp, which is not shown here.
 5 | echo -e "L4-R4 \n"
 6 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 4 4
 7 | echo -e "\n"
 8 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
 9 | echo -e "\n"
10 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 4 4
11 | echo -e "\n"
12 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
13 | echo -e "\n"
14 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # object-file directory; matches OBJ_DIR = bin in the Makefile
2 | make spmm_benchmark  # build only the spmm_benchmark target
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // constexpr integer power: base multiplied `exponent` times (1 when exponent is 0), in wrapping uint32_t arithmetic.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent != 0 ? base * StaticPow(base, exponent - 1) : 1;
10 | }
11 | // Synchronizes the BlockWidth threads that share one output tile: __syncthreads() above 32 threads, masked __syncwarp() otherwise.
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default: all 32 lanes participate
17 |     // Builds the lane mask of this thread's subwarp. NOTE(review): assumes thread_idx_y is threadIdx.y and blockDim.x == BlockWidth; confirm at the call site.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // subwarp case, the same `> 1` bound Sync() tests
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);  // % 32 keeps the shift inside the 32-bit mask
22 |         }
23 |     }
24 |     // Barrier across the threads of one output tile; does nothing when BlockWidth is 1.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/4b4b/SpMM_conflict_free_prefetch_shuffle/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 0 1 1 1  # dataset root: $dataset_dir when set, otherwise the original hard-coded location
2 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 4 0 1 1 1 8 8
3 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 4 0 1 1 1 4 4
4 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4  # NOTE(review): pinned to one GPU UUID, which is machine-specific
6 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4  # same benchmark run under nsys, output name spmm_report
7 | # NOTE(review): the meaning of the trailing numeric arguments is defined by spmm_benchmark.cpp, which is not shown here.


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore log files and object files ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the sddmm_benchmark and spmm_benchmark executables with nvcc.
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse 
 4 | # NVCC_FLAGS is passed to both the compile (-c) recipes and the link recipes below.
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | # Link rules: $^ expands to every listed object file, $@ to the executable name.
24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
25 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
26 | 
27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
28 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
29 | # $(OBJ_DIR) is not created by these rules; it must exist before building (setup.sh runs mkdir -p ./bin).
30 | # Compile main file to object file
31 | $(OBJ_DIR)/%.o : %.cpp
32 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 
33 | 
34 | # Each .cu source is paired with a same-named .cuh in $(INC_DIR); a change to either rebuilds the object.
35 | # Compile CUDA source files to object files
36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh
37 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
38 | .PHONY: clean
39 | clean:
40 | 	@rm -f $(OBJ_DIR)/*.o
41 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // NOTE(review): presumably thin wrappers over cuBLAS GEMM defined in src/cublas_gemm.cu (not shown). The rhs pointer is listed before the lhs pointer. Float overload:
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // Same parameter list with half buffers.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload. NOTE(review): the "T" suffix presumably marks a transposed operand; confirm in the .cu file.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT with half buffers.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | // Declares three cudaSddmm overloads that differ only in the element types of lhs_matrix, rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half inputs, float output. NOTE(review): the index arrays presumably describe the sparse output pattern; confirm in src/cuda_sddmm.cu.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload: half inputs, half output.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload: float inputs, float output.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declares three cudaSpmm overloads that differ only in the element types of values, rhs_matrix and output_matrix.
 5 | namespace spmm{
 6 | // Overload: half values/rhs, float output. NOTE(review): the index arrays presumably locate the entries stored in values; confirm in src/cuda_spmm.cu.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values/rhs, half output.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values/rhs, float output.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | // Declares three wmmaSddmm overloads that differ only in the element types of lhs_matrix, rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half inputs, float output. NOTE(review): `algorithm` presumably selects a kernel variant; confirm in src/wmma_sddmm.cu.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | // Overload: half inputs, half output.
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload: float inputs, float output.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | : "${dataset_dir?set dataset_dir to the dataset root that contains rn50/ before running}"
 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 3 | # Runs spmm_benchmark on the rn50/random_pruning 0.7 and 0.9 matrices; the trailing "8 4" matches the L8-R4 label. NOTE(review): argument meanings are defined by spmm_benchmark.cpp (not shown).
 4 | echo -e "L8-R4 \n"
 5 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 4
 6 | echo -e "\n"
 7 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4
 8 | echo -e "\n"
 9 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 4
10 | echo -e "\n"
11 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4
12 | echo -e "\n"
13 | 
14 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # object-file directory; matches OBJ_DIR = bin in the Makefile
2 | make spmm_benchmark  # build only the spmm_benchmark target
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // constexpr integer power: base multiplied `exponent` times (1 when exponent is 0), in wrapping uint32_t arithmetic.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent != 0 ? base * StaticPow(base, exponent - 1) : 1;
10 | }
11 | // Synchronizes the BlockWidth threads that share one output tile: __syncthreads() above 32 threads, masked __syncwarp() otherwise.
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default: all 32 lanes participate
17 |     // Builds the lane mask of this thread's subwarp. NOTE(review): assumes thread_idx_y is threadIdx.y and blockDim.x == BlockWidth; confirm at the call site.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // subwarp case, the same `> 1` bound Sync() tests
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);  // % 32 keeps the shift inside the 32-bit mask
22 |         }
23 |     }
24 |     // Barrier across the threads of one output tile; does nothing when BlockWidth is 1.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 0 1 1 1  # dataset root: $dataset_dir when set, otherwise the original hard-coded location
2 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 4 0 1 1 1 8 8
3 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 4 0 1 1 1 4 4
4 | ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4
5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 4 4  # NOTE(review): pinned to one GPU UUID, which is machine-specific
6 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  "${dataset_dir:-/users/shigang/gitrepo/dlmc}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4  # same benchmark run under nsys, output name spmm_report
7 | # NOTE(review): the meaning of the trailing numeric arguments is defined by spmm_benchmark.cpp, which is not shown here.


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore log files and object files ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the sddmm_benchmark and spmm_benchmark executables with nvcc.
 2 | NVCC = nvcc
 3 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 4 | # NVCC_FLAGS is passed to both the compile (-c) recipes and the link recipes below.
 5 | 
 6 | ##################################################################
 7 | 
 8 | ## Project file structure ##
 9 | 
10 | # Source file directory:
11 | SRC_DIR = src
12 | 
13 | # Object file directory:
14 | OBJ_DIR = bin
15 | 
16 | # Include header file directory
17 | INC_DIR = include
18 | 
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | # Link rules: $^ expands to every listed object file, $@ to the executable name.
24 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
25 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
26 | 
27 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
28 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
29 | # $(OBJ_DIR) is not created by these rules; it must exist before building (setup.sh runs mkdir -p ./bin).
30 | # Compile main file to object file
31 | $(OBJ_DIR)/%.o : %.cpp
32 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@ 
33 | 
34 | # Each .cu source is paired with a same-named .cuh in $(INC_DIR); a change to either rebuilds the object.
35 | # Compile CUDA source files to object files
36 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh
37 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
38 | .PHONY: clean
39 | clean:
40 | 	@rm -f $(OBJ_DIR)/*.o
41 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // NOTE(review): presumably thin wrappers over cuBLAS GEMM defined in src/cublas_gemm.cu (not shown). The rhs pointer is listed before the lhs pointer. Float overload:
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
 9 | // Same parameter list with half buffers.
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float overload. NOTE(review): the "T" suffix presumably marks a transposed operand; confirm in the .cu file.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT with half buffers.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | // Declares three cudaSddmm overloads that differ only in the element types of lhs_matrix, rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half inputs, float output. NOTE(review): the index arrays presumably describe the sparse output pattern; confirm in src/cuda_sddmm.cu.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload: half inputs, half output.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload: float inputs, float output.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | 
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | // Declares three cudaSpmm overloads that differ only in the element types of values, rhs_matrix and output_matrix.
 5 | namespace spmm{
 6 | // Overload: half values/rhs, float output. NOTE(review): the index arrays presumably locate the entries stored in values; confirm in src/cuda_spmm.cu.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload: half values/rhs, half output.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload: float values/rhs, float output.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | 
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | // Declares three wmmaSddmm overloads that differ only in the element types of lhs_matrix, rhs_matrix and output_values.
 5 | namespace sddmm{
 6 | // Overload: half inputs, float output. NOTE(review): `algorithm` presumably selects a kernel variant; confirm in src/wmma_sddmm.cu.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | // Overload: half inputs, half output.
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload: float inputs, float output.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | 
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs spmm_benchmark on one rn50 matrix from the 0.7 and 0.9 random_pruning sets; the trailing "8 4" matches the L8-R4 label.
 3 | : "${dataset_dir:?dataset_dir must point at the directory that contains rn50/}"
 4 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 5 | 
 6 | echo -e "L8-R4 \n"
 7 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 4
 8 | echo -e "\n"
 9 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4
10 | echo -e "\n"
11 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 4
12 | echo -e "\n"
13 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4
14 | echo -e "\n"


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/setup.sh:
--------------------------------------------------------------------------------
1 | # Create ./bin (presumably where the Makefile writes object files), then build; make is skipped if mkdir fails.
2 | mkdir -p ./bin && make spmm_benchmark


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // StaticPow: compile-time base^exponent; the single-return recursion keeps it a valid C++11 constexpr function.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
10 | }
11 | // Barrier: synchronizes the BlockWidth threads that share one thread_idx_y (whole block when BlockWidth > 32).
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default: all 32 lanes of the warp
17 |     // Sub-warp case (1 < width < 32): mask = `width` consecutive lane bits, moved to this row's lanes.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was "< 1", which never held
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);  // % 32 keeps the shift in range; assumes lanes are laid out row by row - TODO confirm
22 |         }
23 |     }
24 |     // Sync: __syncthreads() above one warp, __syncwarp(thread_mask) for 2..32 threads, no-op for a single thread.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | # Dataset root: an exported dataset_dir wins; otherwise fall back to the path that used to be hard-coded here.
2 | dataset_dir="${dataset_dir:-/users/shigang/gitrepo/dlmc}"
3 | matrix="${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx"
4 | ./spmm_benchmark  "${matrix}" 512 8 0 0 1 1 1
5 | ./spmm_benchmark  "${matrix}" 512 4 0 1 1 1 8 8
6 | ./spmm_benchmark  "${matrix}" 512 4 0 1 1 1 4 4
7 | ./spmm_benchmark  "${matrix}" 512 8 0 1 1 1 4 4
8 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  "${matrix}" 512 8 0 1 1 1 4 4  # same run, restricted to one GPU by UUID
9 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  "${matrix}" 512 8 0 1 1 1 8 4  # profiled run; output name spmm_report


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | # NVCC_FLAGS is used by both compile and link recipes, so the -l libraries reach the link step through it.
 4 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 5 | 
 6 | 
 7 | ##################################################################
 8 | 
 9 | ## Project file structure ##
10 | 
11 | # Source file directory:
12 | SRC_DIR = src
13 | 
14 | # Object file directory:
15 | OBJ_DIR = bin
16 | 
17 | # Include header file directory
18 | INC_DIR = include
19 | 
20 | 
21 | ##################################################################
22 | 
23 | ## Compile ##
24 | 
25 | # clean is a command, not a file: keep it working even if a file named "clean" exists.
26 | .PHONY: clean
27 | 
28 | # Link rules: $^ is every object prerequisite, $@ is the executable.
29 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
30 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
31 | 
32 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
33 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
34 | 
35 | # Create the object directory on demand; it is an order-only prerequisite (after "|") so its timestamp never forces a rebuild.
36 | $(OBJ_DIR):
37 | 	@mkdir -p $@
38 | 
39 | # Compile main file to object file
40 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
41 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
42 | 
43 | # Compile CUDA source files to object files
44 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
45 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
46 | 
47 | clean:
48 | 	@rm -f $(OBJ_DIR)/*.o
49 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | 
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);  // float variant; note rhs is passed before lhs
 9 | // cublasGeMM, half variant (same parameter order).
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float variant. NOTE(review): the trailing "T" presumably marks a transposed operand - confirm in the .cu implementation.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half variant.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | // Overload 1 of 3: half lhs/rhs, float output_values. The overloads differ only in these element types.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload 2 of 3: half lhs/rhs, half output_values.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload 3 of 3: float lhs/rhs, float output_values.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | // NOTE(review): meaning of m_vec / nonzeros_vec / vec_length is not stated in this header - confirm in the .cu implementation.
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | 
 5 | namespace spmm{
 6 | // Overload 1 of 3: half values/rhs_matrix, float output_matrix. The overloads differ only in these element types.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload 2 of 3: half values/rhs_matrix, half output_matrix.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload 3 of 3: float values/rhs_matrix, float output_matrix.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | // NOTE(review): no cudaStream_t parameter here, unlike the sddmm entry points; meaning of m_vec / vec_length is not stated - confirm in the .cu implementation.
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | // Overload 1 of 3: half lhs/rhs, float output_values. The overloads differ only in these element types.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | // Overload 2 of 3: half lhs/rhs, half output_values.
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload 3 of 3: float lhs/rhs, float output_values.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | // NOTE(review): meaning of m_vec / nonzeros_vec / vec_length / algorithm is not stated in this header - confirm in the .cu implementation.
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs spmm_benchmark on one rn50 matrix from the 0.7 and 0.9 random_pruning sets; the trailing "8 4" matches the L8-R4 label.
 3 | : "${dataset_dir:?dataset_dir must point at the directory that contains rn50/}"
 4 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 5 | 
 6 | echo -e "L8-R4 \n"
 7 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 4
 8 | echo -e "\n"
 9 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4
10 | echo -e "\n"
11 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 4
12 | echo -e "\n"
13 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 4
14 | echo -e "\n"


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/setup.sh:
--------------------------------------------------------------------------------
1 | # Create ./bin (the Makefile's OBJ_DIR), then build; make is skipped if mkdir fails.
2 | mkdir -p ./bin && make spmm_benchmark


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // StaticPow: compile-time base^exponent; the single-return recursion keeps it a valid C++11 constexpr function.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
10 | }
11 | // Barrier: synchronizes the BlockWidth threads that share one thread_idx_y (whole block when BlockWidth > 32).
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default: all 32 lanes of the warp
17 |     // Sub-warp case (1 < width < 32): mask = `width` consecutive lane bits, moved to this row's lanes.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was "< 1", which never held
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);  // % 32 keeps the shift in range; assumes lanes are laid out row by row - TODO confirm
22 |         }
23 |     }
24 |     // Sync: __syncthreads() above one warp, __syncwarp(thread_mask) for 2..32 threads, no-op for a single thread.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b4b/SpMM_conflict_free_prefetch_shuffle/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | # Dataset root: an exported dataset_dir wins; otherwise fall back to the path that used to be hard-coded here.
2 | dataset_dir="${dataset_dir:-/users/shigang/gitrepo/dlmc}"
3 | matrix="${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx"
4 | ./spmm_benchmark  "${matrix}" 512 8 0 0 1 1 1
5 | ./spmm_benchmark  "${matrix}" 512 4 0 1 1 1 8 8
6 | ./spmm_benchmark  "${matrix}" 512 4 0 1 1 1 4 4
7 | ./spmm_benchmark  "${matrix}" 512 8 0 1 1 1 4 4
8 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  "${matrix}" 512 8 0 1 1 1 4 4  # same run, restricted to one GPU by UUID
9 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  "${matrix}" 512 8 0 1 1 1 8 4  # profiled run; output name spmm_report


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | # NVCC_FLAGS is used by both compile and link recipes, so the -l libraries reach the link step through it.
 4 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 5 | 
 6 | 
 7 | ##################################################################
 8 | 
 9 | ## Project file structure ##
10 | 
11 | # Source file directory:
12 | SRC_DIR = src
13 | 
14 | # Object file directory:
15 | OBJ_DIR = bin
16 | 
17 | # Include header file directory
18 | INC_DIR = include
19 | 
20 | 
21 | ##################################################################
22 | 
23 | ## Compile ##
24 | 
25 | # clean is a command, not a file: keep it working even if a file named "clean" exists.
26 | .PHONY: clean
27 | 
28 | # Link rules: $^ is every object prerequisite, $@ is the executable.
29 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
30 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
31 | 
32 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
33 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
34 | 
35 | # Create the object directory on demand; it is an order-only prerequisite (after "|") so its timestamp never forces a rebuild.
36 | $(OBJ_DIR):
37 | 	@mkdir -p $@
38 | 
39 | # Compile main file to object file
40 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
41 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
42 | 
43 | # Compile CUDA source files to object files
44 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
45 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
46 | 
47 | clean:
48 | 	@rm -f $(OBJ_DIR)/*.o
49 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | 
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);  // float variant; note rhs is passed before lhs
 9 | // cublasGeMM, half variant (same parameter order).
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float variant. NOTE(review): the trailing "T" presumably marks a transposed operand - confirm in the .cu implementation.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half variant.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | // Overload 1 of 3: half lhs/rhs, float output_values. The overloads differ only in these element types.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload 2 of 3: half lhs/rhs, half output_values.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload 3 of 3: float lhs/rhs, float output_values.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | // NOTE(review): meaning of m_vec / nonzeros_vec / vec_length is not stated in this header - confirm in the .cu implementation.
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SPMM_H
 3 | #define CUDA_SPMM_H
 4 | 
 5 | namespace spmm{
 6 | // Overload 1 of 3: half values/rhs_matrix, float output_matrix. The overloads differ only in these element types.
 7 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
 8 |     const int* __restrict__ row_indices, 
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ column_indices,
11 |     const half* __restrict__ values,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_matrix) ;
14 | // Overload 2 of 3: half values/rhs_matrix, half output_matrix.
15 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
16 |     const int* __restrict__ row_indices, 
17 |     const int* __restrict__ row_offsets,
18 |     const int* __restrict__ column_indices,
19 |     const half* __restrict__ values,
20 |     const half* __restrict__ rhs_matrix,
21 |     half* __restrict__ output_matrix) ;
22 | // Overload 3 of 3: float values/rhs_matrix, float output_matrix.
23 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n, 
24 |     const int* __restrict__ row_indices, 
25 |     const int* __restrict__ row_offsets,
26 |     const int* __restrict__ column_indices,
27 |     const float* __restrict__ values,
28 |     const float* __restrict__ rhs_matrix,
29 |     float* __restrict__ output_matrix) ;
30 | // NOTE(review): no cudaStream_t parameter here, unlike the sddmm entry points; meaning of m_vec / vec_length is not stated - confirm in the .cu implementation.
31 | } // namespace spmm
32 | 
33 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef WMMA_SDDMM_H
 3 | #define WMMA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | // Overload 1 of 3: half lhs/rhs, float output_values. The overloads differ only in these element types.
 7 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream, int algorithm) ;
15 | 
16 | // Overload 2 of 3: half lhs/rhs, half output_values.
17 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
18 |     const int* __restrict__ row_indices,
19 |     const int* __restrict__ row_offsets,
20 |     const int* __restrict__ col_indices,
21 |     const half* __restrict__ lhs_matrix,
22 |     const half* __restrict__ rhs_matrix,
23 |     half* __restrict__ output_values, 
24 |     int vec_length, cudaStream_t stream, int algorithm) ;
25 | // Overload 3 of 3: float lhs/rhs, float output_values.
26 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
27 |     const int* __restrict__ row_indices,
28 |     const int* __restrict__ row_offsets,
29 |     const int* __restrict__ col_indices,
30 |     const float* __restrict__ lhs_matrix,
31 |     const float* __restrict__ rhs_matrix,
32 |     float* __restrict__ output_values, 
33 |     int vec_length, cudaStream_t stream, int algorithm) ;
34 | // NOTE(review): meaning of m_vec / nonzeros_vec / vec_length / algorithm is not stated in this header - confirm in the .cu implementation.
35 | } // namespace sddmm
36 | 
37 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs spmm_benchmark on one rn50 matrix from the 0.7 and 0.9 random_pruning sets; the trailing "8 8" matches the L8-R8 label.
 3 | : "${dataset_dir:?dataset_dir must point at the directory that contains rn50/}"
 4 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 5 | 
 6 | echo -e "L8-R8 \n"
 7 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 8
 8 | echo -e "\n"
 9 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 8
10 | echo -e "\n"
11 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 2 0 1 1 1 8 8
12 | echo -e "\n"
13 | ./spmm_benchmark  "${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx" 512 8 0 1 1 1 8 8
14 | echo -e "\n"


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/setup.sh:
--------------------------------------------------------------------------------
1 | # Create ./bin (the Makefile's OBJ_DIR), then build; make is skipped if mkdir fails.
2 | mkdir -p ./bin && make spmm_benchmark


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // StaticPow: compile-time base^exponent; the single-return recursion keeps it a valid C++11 constexpr function.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
10 | }
11 | // Barrier: synchronizes the BlockWidth threads that share one thread_idx_y (whole block when BlockWidth > 32).
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default: all 32 lanes of the warp
17 |     // Sub-warp case (1 < width < 32): mask = `width` consecutive lane bits, moved to this row's lanes.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was "< 1", which never held
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);  // % 32 keeps the shift in range; assumes lanes are laid out row by row - TODO confirm
22 |         }
23 |     }
24 |     // Sync: __syncthreads() above one warp, __syncwarp(thread_mask) for 2..32 threads, no-op for a single thread.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | # NVCC_FLAGS is used by both compile and link recipes, so the -l libraries reach the link step through it.
 4 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 5 | 
 6 | 
 7 | ##################################################################
 8 | 
 9 | ## Project file structure ##
10 | 
11 | # Source file directory:
12 | SRC_DIR = src
13 | 
14 | # Object file directory:
15 | OBJ_DIR = bin
16 | 
17 | # Include header file directory
18 | INC_DIR = include
19 | 
20 | ##################################################################
21 | 
22 | ## Compile ##
23 | 
24 | # clean is a command, not a file: keep it working even if a file named "clean" exists.
25 | .PHONY: clean
26 | 
27 | # Link rules: $^ is every object prerequisite, $@ is the executable.
28 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
30 | 
31 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
32 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
33 | 
34 | # Create the object directory on demand; it is an order-only prerequisite (after "|") so its timestamp never forces a rebuild.
35 | $(OBJ_DIR):
36 | 	@mkdir -p $@
37 | 
38 | # Compile main file to object file
39 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
40 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
41 | 
42 | # Compile CUDA source files to object files
43 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
44 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
45 | 
46 | clean:
47 | 	@rm -f $(OBJ_DIR)/*.o
48 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | 
 7 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
 8 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);  // float variant; note rhs is passed before lhs
 9 | // cublasGeMM, half variant (same parameter order).
10 | cublasStatus_t cublasGeMM(cublasHandle_t handle, int m, int n, int k, 
11 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
12 | // cublasGeMMT, float variant. NOTE(review): the trailing "T" presumably marks a transposed operand - confirm in the .cu implementation.
13 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
14 |     float* d_rhs_matrix, float* d_lhs_matrix, float* d_output_matrix);
15 | // cublasGeMMT, half variant.
16 | cublasStatus_t cublasGeMMT(cublasHandle_t handle, int m, int n, int k, 
17 |     half* d_rhs_matrix, half* d_lhs_matrix, half* d_output_matrix);
18 | 
19 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #include "cuda_fp16.h"
 2 | #ifndef CUDA_SDDMM_H
 3 | #define CUDA_SDDMM_H
 4 | 
 5 | namespace sddmm{
 6 | // Overload 1 of 3: half lhs/rhs, float output_values. The overloads differ only in these element types.
 7 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
 8 |     const int* __restrict__ row_indices,
 9 |     const int* __restrict__ row_offsets,
10 |     const int* __restrict__ col_indices,
11 |     const half* __restrict__ lhs_matrix,
12 |     const half* __restrict__ rhs_matrix,
13 |     float* __restrict__ output_values, 
14 |     int vec_length, cudaStream_t stream) ;
15 | // Overload 2 of 3: half lhs/rhs, half output_values.
16 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
17 |     const int* __restrict__ row_indices,
18 |     const int* __restrict__ row_offsets,
19 |     const int* __restrict__ col_indices,
20 |     const half* __restrict__ lhs_matrix,
21 |     const half* __restrict__ rhs_matrix,
22 |     half* __restrict__ output_values, 
23 |     int vec_length, cudaStream_t stream) ;
24 | // Overload 3 of 3: float lhs/rhs, float output_values.
25 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
26 |     const int* __restrict__ row_indices,
27 |     const int* __restrict__ row_offsets,
28 |     const int* __restrict__ col_indices,
29 |     const float* __restrict__ lhs_matrix,
30 |     const float* __restrict__ rhs_matrix,
31 |     float* __restrict__ output_values, 
32 |     int vec_length, cudaStream_t stream) ;
33 | // NOTE(review): meaning of m_vec / nonzeros_vec / vec_length is not stated in this header - confirm in the .cu implementation.
34 | } // namespace sddmm
35 | 
36 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_SPMM_H
 2 | #define CUDA_SPMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t is used below; the header no longer depends
 6 | // on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace spmm{
11 | 
12 | // cudaSpmm is overloaded on precision; the three signatures differ only in
13 | // the element types of values / rhs_matrix / output_matrix:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // No stream parameter is taken; each overload returns a cudaError_t.
18 | // NOTE(review): m_vec / vec_length presumably describe rows grouped into
19 | // vec_length-long vectors - confirm against the definitions.
20 | 
21 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
22 |     const int* __restrict__ row_indices,
23 |     const int* __restrict__ row_offsets,
24 |     const int* __restrict__ column_indices,
25 |     const half* __restrict__ values,
26 |     const half* __restrict__ rhs_matrix,
27 |     float* __restrict__ output_matrix) ;
28 | 
29 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
30 |     const int* __restrict__ row_indices,
31 |     const int* __restrict__ row_offsets,
32 |     const int* __restrict__ column_indices,
33 |     const half* __restrict__ values,
34 |     const half* __restrict__ rhs_matrix,
35 |     half* __restrict__ output_matrix) ;
36 | 
37 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
38 |     const int* __restrict__ row_indices,
39 |     const int* __restrict__ row_offsets,
40 |     const int* __restrict__ column_indices,
41 |     const float* __restrict__ values,
42 |     const float* __restrict__ rhs_matrix,
43 |     float* __restrict__ output_matrix) ;
44 | 
45 | } // namespace spmm
46 | 
47 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef WMMA_SDDMM_H
 2 | #define WMMA_SDDMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t and cudaStream_t are used below; the header
 6 | // no longer depends on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace sddmm{
11 | 
12 | // wmmaSddmm is overloaded on precision; the three signatures differ only in
13 | // the element types of lhs_matrix / rhs_matrix / output_values:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // Every overload takes a cudaStream_t and returns a cudaError_t.
18 | // NOTE(review): the meaning of m_vec / nonzeros_vec / vec_length and of the
19 | // `algorithm` selector is fixed by the definitions, which are not part of
20 | // this header - confirm there.
21 | 
22 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
23 |     const int* __restrict__ row_indices,
24 |     const int* __restrict__ row_offsets,
25 |     const int* __restrict__ col_indices,
26 |     const half* __restrict__ lhs_matrix,
27 |     const half* __restrict__ rhs_matrix,
28 |     float* __restrict__ output_values,
29 |     int vec_length, cudaStream_t stream, int algorithm) ;
30 | 
31 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
32 |     const int* __restrict__ row_indices,
33 |     const int* __restrict__ row_offsets,
34 |     const int* __restrict__ col_indices,
35 |     const half* __restrict__ lhs_matrix,
36 |     const half* __restrict__ rhs_matrix,
37 |     half* __restrict__ output_values,
38 |     int vec_length, cudaStream_t stream, int algorithm) ;
39 | 
40 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
41 |     const int* __restrict__ row_indices,
42 |     const int* __restrict__ row_offsets,
43 |     const int* __restrict__ col_indices,
44 |     const float* __restrict__ lhs_matrix,
45 |     const float* __restrict__ rhs_matrix,
46 |     float* __restrict__ output_values,
47 |     int vec_length, cudaStream_t stream, int algorithm) ;
48 | 
49 | } // namespace sddmm
50 | 
51 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/run_jobs.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | echo -e "Evaluation perf for different precisions: N = 512, Iteration = 1024 \n"
 3 | 
 4 | echo -e "L8-R8 \n"
 5 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 8 8
 6 | echo -e "\n"
 7 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 8
 8 | echo -e "\n"
 9 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 2 0 1 1 1 8 8
10 | echo -e "\n"
11 | ./spmm_benchmark  ${dataset_dir}/rn50/random_pruning/0.9/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 8
12 | echo -e "\n"
13 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin        # create ./bin; -p makes this a no-op when it already exists
2 | make spmm_benchmark   # build the spmm_benchmark target of the Makefile in this directory
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/8b8b/SpMM_conflict_free_prefetch/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Returns base^exponent, written recursively so it can be evaluated in a
 9 | // constant expression. The arithmetic is uint32_t and wraps modulo 2^32.
10 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
11 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
12 | }
13 | 
14 | // Barrier for the BlockWidth threads that work on one output tile.
15 | //   Tile_M     - only used to derive kThreadsPerBlock = Tile_M * BlockWidth.
16 | //   BlockWidth - number of threads per output tile.
17 | template <int Tile_M, int BlockWidth>
18 | struct Barrier{
19 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
20 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
21 |     // Lane mask passed to __syncwarp(); defaults to all 32 lanes.
22 |     uint32_t thread_mask = 0xffffffff;
23 | 
24 |     // thread_idx_y selects which run of kThreadsPerOutputTile consecutive
25 |     // lanes belongs to this thread's tile.
26 |     // NOTE(review): presumably threadIdx.y with blockDim.x == BlockWidth -
27 |     // confirm at the call sites.
28 |     __device__ __forceinline__ Barrier(int thread_idx_y){
29 |         // Only a tile narrower than a warp but wider than one thread needs a
30 |         // sub-warp mask. The former test `< 32 && < 1` could never hold for a
31 |         // positive BlockWidth, so the mask was never narrowed.
32 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
33 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
34 |             // Wrap the lane offset into [0, 32) so rows that fall into a later
35 |             // warp of the block still get a valid shift. For offsets below 32
36 |             // this is identical to the unwrapped shift.
37 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
38 |         }
39 |     }
40 | 
41 |     // Tiles wider than a warp use a block-wide barrier, tiles of 2..32 threads
42 |     // use a warp-level barrier on thread_mask, and a single-thread tile needs
43 |     // no synchronization at all.
44 |     __device__ __forceinline__ void Sync(){
45 |         if (kThreadsPerOutputTile > 32){
46 |             __syncthreads();
47 |         } else if (kThreadsPerOutputTile > 1){
48 |             __syncwarp(thread_mask);
49 |         }
50 |     }
51 | };
52 | }
53 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore build logs and object files ##
2 | *.log
3 | *.o
4 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | # Compile and link flags are shared: -lcublas / -lcusparse only matter for the
 4 | # link steps and are ignored when compiling with -c.
 5 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 6 | 
 7 | 
 8 | ##################################################################
 9 | 
10 | ## Project file structure ##
11 | 
12 | # Source file directory:
13 | SRC_DIR = src
14 | 
15 | # Object file directory:
16 | OBJ_DIR = bin
17 | 
18 | # Include header file directory
19 | INC_DIR = include
20 | 
21 | ##################################################################
22 | 
23 | ## Compile ##
24 | 
25 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
26 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
27 | 
28 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
29 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
30 | 
31 | # Compile main file to object file.
32 | # "| $(OBJ_DIR)" is an order-only prerequisite: the directory is created when
33 | # missing, but its timestamp never forces a rebuild.
34 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
35 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
36 | 
37 | 
38 | # Compile CUDA source files to object files
39 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
40 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
41 | 
42 | # Create the object directory on demand so `make` works without setup.sh
43 | $(OBJ_DIR):
44 | 	@mkdir -p $@
45 | 
46 | # clean is a command, not a file: keep it runnable even if a file named
47 | # "clean" exists in this directory.
48 | .PHONY: clean
49 | clean:
50 | 	@rm -f $(OBJ_DIR)/*.o
51 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // GEMM entry points that take a cuBLAS handle and report a cublasStatus_t.
 7 | // cublasGeMM and cublasGeMMT are each overloaded for float and half buffers.
 8 | // NOTE(review): the rhs pointer precedes the lhs pointer in every signature,
 9 | // and cublasGeMMT is presumably a transposed variant - the operand layout is
10 | // fixed by the definitions, so confirm it there.
11 | 
12 | // float overload of cublasGeMM
13 | cublasStatus_t cublasGeMM(
14 |     cublasHandle_t handle,
15 |     int m, int n, int k,
16 |     float* d_rhs_matrix,
17 |     float* d_lhs_matrix,
18 |     float* d_output_matrix);
19 | 
20 | // half overload of cublasGeMM
21 | cublasStatus_t cublasGeMM(
22 |     cublasHandle_t handle,
23 |     int m, int n, int k,
24 |     half* d_rhs_matrix,
25 |     half* d_lhs_matrix,
26 |     half* d_output_matrix);
27 | 
28 | // float overload of cublasGeMMT
29 | cublasStatus_t cublasGeMMT(
30 |     cublasHandle_t handle,
31 |     int m, int n, int k,
32 |     float* d_rhs_matrix,
33 |     float* d_lhs_matrix,
34 |     float* d_output_matrix);
35 | 
36 | // half overload of cublasGeMMT
37 | cublasStatus_t cublasGeMMT(
38 |     cublasHandle_t handle,
39 |     int m, int n, int k,
40 |     half* d_rhs_matrix,
41 |     half* d_lhs_matrix,
42 |     half* d_output_matrix);
43 | 
44 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_SDDMM_H
 2 | #define CUDA_SDDMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t and cudaStream_t are used below; the header
 6 | // no longer depends on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace sddmm{
11 | 
12 | // cudaSddmm is overloaded on precision; the three signatures differ only in
13 | // the element types of lhs_matrix / rhs_matrix / output_values:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // Every overload takes a cudaStream_t and returns a cudaError_t.
18 | // NOTE(review): m_vec / nonzeros_vec presumably count rows and nonzeros in
19 | // units of vec_length-long vectors; that is fixed by the definitions, which
20 | // are not part of this header - confirm there.
21 | 
22 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
23 |     const int* __restrict__ row_indices,
24 |     const int* __restrict__ row_offsets,
25 |     const int* __restrict__ col_indices,
26 |     const half* __restrict__ lhs_matrix,
27 |     const half* __restrict__ rhs_matrix,
28 |     float* __restrict__ output_values,
29 |     int vec_length, cudaStream_t stream) ;
30 | 
31 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
32 |     const int* __restrict__ row_indices,
33 |     const int* __restrict__ row_offsets,
34 |     const int* __restrict__ col_indices,
35 |     const half* __restrict__ lhs_matrix,
36 |     const half* __restrict__ rhs_matrix,
37 |     half* __restrict__ output_values,
38 |     int vec_length, cudaStream_t stream) ;
39 | 
40 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
41 |     const int* __restrict__ row_indices,
42 |     const int* __restrict__ row_offsets,
43 |     const int* __restrict__ col_indices,
44 |     const float* __restrict__ lhs_matrix,
45 |     const float* __restrict__ rhs_matrix,
46 |     float* __restrict__ output_values,
47 |     int vec_length, cudaStream_t stream) ;
48 | 
49 | } // namespace sddmm
50 | 
51 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_SPMM_H
 2 | #define CUDA_SPMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t is used below; the header no longer depends
 6 | // on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace spmm{
11 | 
12 | // cudaSpmm is overloaded on precision; the three signatures differ only in
13 | // the element types of values / rhs_matrix / output_matrix:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // No stream parameter is taken; each overload returns a cudaError_t.
18 | // NOTE(review): m_vec / vec_length presumably describe rows grouped into
19 | // vec_length-long vectors - confirm against the definitions.
20 | 
21 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
22 |     const int* __restrict__ row_indices,
23 |     const int* __restrict__ row_offsets,
24 |     const int* __restrict__ column_indices,
25 |     const half* __restrict__ values,
26 |     const half* __restrict__ rhs_matrix,
27 |     float* __restrict__ output_matrix) ;
28 | 
29 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
30 |     const int* __restrict__ row_indices,
31 |     const int* __restrict__ row_offsets,
32 |     const int* __restrict__ column_indices,
33 |     const half* __restrict__ values,
34 |     const half* __restrict__ rhs_matrix,
35 |     half* __restrict__ output_matrix) ;
36 | 
37 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
38 |     const int* __restrict__ row_indices,
39 |     const int* __restrict__ row_offsets,
40 |     const int* __restrict__ column_indices,
41 |     const float* __restrict__ values,
42 |     const float* __restrict__ rhs_matrix,
43 |     float* __restrict__ output_matrix) ;
44 | 
45 | } // namespace spmm
46 | 
47 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef WMMA_SDDMM_H
 2 | #define WMMA_SDDMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t and cudaStream_t are used below; the header
 6 | // no longer depends on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace sddmm{
11 | 
12 | // wmmaSddmm is overloaded on precision; the three signatures differ only in
13 | // the element types of lhs_matrix / rhs_matrix / output_values:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // Every overload takes a cudaStream_t and returns a cudaError_t.
18 | // NOTE(review): the meaning of m_vec / nonzeros_vec / vec_length and of the
19 | // `algorithm` selector is fixed by the definitions, which are not part of
20 | // this header - confirm there.
21 | 
22 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
23 |     const int* __restrict__ row_indices,
24 |     const int* __restrict__ row_offsets,
25 |     const int* __restrict__ col_indices,
26 |     const half* __restrict__ lhs_matrix,
27 |     const half* __restrict__ rhs_matrix,
28 |     float* __restrict__ output_values,
29 |     int vec_length, cudaStream_t stream, int algorithm) ;
30 | 
31 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
32 |     const int* __restrict__ row_indices,
33 |     const int* __restrict__ row_offsets,
34 |     const int* __restrict__ col_indices,
35 |     const half* __restrict__ lhs_matrix,
36 |     const half* __restrict__ rhs_matrix,
37 |     half* __restrict__ output_values,
38 |     int vec_length, cudaStream_t stream, int algorithm) ;
39 | 
40 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
41 |     const int* __restrict__ row_indices,
42 |     const int* __restrict__ row_offsets,
43 |     const int* __restrict__ col_indices,
44 |     const float* __restrict__ lhs_matrix,
45 |     const float* __restrict__ rhs_matrix,
46 |     float* __restrict__ output_values,
47 |     int vec_length, cudaStream_t stream, int algorithm) ;
48 | 
49 | } // namespace sddmm
50 | 
51 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin        # create ./bin; -p makes this a no-op when it already exists
2 | make spmm_benchmark   # build the spmm_benchmark target of the Makefile in this directory
3 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | 
 8 | // Returns base^exponent, written recursively so it can be evaluated in a
 9 | // constant expression. The arithmetic is uint32_t and wraps modulo 2^32.
10 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
11 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
12 | }
13 | 
14 | // Barrier for the BlockWidth threads that work on one output tile.
15 | //   Tile_M     - only used to derive kThreadsPerBlock = Tile_M * BlockWidth.
16 | //   BlockWidth - number of threads per output tile.
17 | template <int Tile_M, int BlockWidth>
18 | struct Barrier{
19 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
20 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
21 |     // Lane mask passed to __syncwarp(); defaults to all 32 lanes.
22 |     uint32_t thread_mask = 0xffffffff;
23 | 
24 |     // thread_idx_y selects which run of kThreadsPerOutputTile consecutive
25 |     // lanes belongs to this thread's tile.
26 |     // NOTE(review): presumably threadIdx.y with blockDim.x == BlockWidth -
27 |     // confirm at the call sites.
28 |     __device__ __forceinline__ Barrier(int thread_idx_y){
29 |         // Only a tile narrower than a warp but wider than one thread needs a
30 |         // sub-warp mask. The former test `< 32 && < 1` could never hold for a
31 |         // positive BlockWidth, so the mask was never narrowed.
32 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){
33 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
34 |             // Wrap the lane offset into [0, 32) so rows that fall into a later
35 |             // warp of the block still get a valid shift. For offsets below 32
36 |             // this is identical to the unwrapped shift.
37 |             thread_mask = kBaseSubwarpMask << ((thread_idx_y * kThreadsPerOutputTile) % 32);
38 |         }
39 |     }
40 | 
41 |     // Tiles wider than a warp use a block-wide barrier, tiles of 2..32 threads
42 |     // use a warp-level barrier on thread_mask, and a single-thread tile needs
43 |     // no synchronization at all.
44 |     __device__ __forceinline__ void Sync(){
45 |         if (kThreadsPerOutputTile > 32){
46 |             __syncthreads();
47 |         } else if (kThreadsPerOutputTile > 1){
48 |             __syncwarp(thread_mask);
49 |         }
50 |     }
51 | };
52 | }
53 | #endif


--------------------------------------------------------------------------------
/SpMM/ablation_study/SpMM_basic/usingwmma_run.sh:
--------------------------------------------------------------------------------
1 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 0 1 1 1
2 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 8 8
3 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 4 4
4 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
5 | CUDA_VISIBLE_DEVICES=GPU-31acddbe-f963-b876-2508-0c529c73da36 ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 4 4
6 | nsys profile --force-overwrite true  -t cuda -o spmm_report ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 8 4
7 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/compile_jobs.sh:
--------------------------------------------------------------------------------
 1 | cd ./SpMM_basic
 2 | chmod 777 setup.sh
 3 | ./setup.sh
 4 | echo "SpMM basic compiled."
 5 | 
 6 | cd -
 7 | cd ./16b8b/SpMM_conflict_free
 8 | chmod 777 setup.sh
 9 | ./setup.sh
10 | 
11 | cd -
12 | cd ./8b8b/SpMM_conflict_free
13 | chmod 777 setup.sh
14 | ./setup.sh
15 | 
16 | cd -
17 | cd ./8b4b/SpMM_conflict_free
18 | chmod 777 setup.sh
19 | ./setup.sh
20 | 
21 | cd -
22 | cd ./4b4b/SpMM_conflict_free
23 | chmod 777 setup.sh
24 | ./setup.sh
25 | echo "SpMM with conflict-free SM compiled."
26 | 
27 | cd -
28 | cd ./16b8b/SpMM_conflict_free_prefetch
29 | chmod 777 setup.sh
30 | ./setup.sh
31 | 
32 | cd -
33 | cd ./8b8b/SpMM_conflict_free_prefetch
34 | chmod 777 setup.sh
35 | ./setup.sh
36 | 
37 | cd -
38 | cd ./8b4b/SpMM_conflict_free_prefetch
39 | chmod 777 setup.sh
40 | ./setup.sh
41 | 
42 | cd -
43 | cd ./4b4b/SpMM_conflict_free_prefetch
44 | chmod 777 setup.sh
45 | ./setup.sh
46 | echo "SpMM with conflict-free SM + prefetch compiled."
47 | 
48 | cd -
49 | cd ./8b4b/SpMM_conflict_free_prefetch_shuffle
50 | chmod 777 setup.sh
51 | ./setup.sh
52 | 
53 | cd -
54 | cd ./4b4b/SpMM_conflict_free_prefetch_shuffle
55 | chmod 777 setup.sh
56 | ./setup.sh
57 | echo "SpMM with conflict-free SM + prefetch + shuffle compiled."
58 | 


--------------------------------------------------------------------------------
/SpMM/ablation_study/spmm_ablation_study.sh:
--------------------------------------------------------------------------------
 1 | cd ./SpMM_basic
 2 | chmod 777 run_jobs.sh
 3 | ./run_jobs.sh
 4 | echo "SpMM basic tested."
 5 | 
 6 | cd -
 7 | cd ./16b8b/SpMM_conflict_free
 8 | chmod 777 run_jobs.sh
 9 | ./run_jobs.sh
10 | 
11 | cd -
12 | cd ./8b8b/SpMM_conflict_free
13 | chmod 777 run_jobs.sh
14 | ./run_jobs.sh
15 | 
16 | cd -
17 | cd ./8b4b/SpMM_conflict_free
18 | chmod 777 run_jobs.sh
19 | ./run_jobs.sh
20 | 
21 | cd -
22 | cd ./4b4b/SpMM_conflict_free
23 | chmod 777 run_jobs.sh
24 | ./run_jobs.sh
25 | echo "SpMM with conflict-free SM tested."
26 | 
27 | cd -
28 | cd ./16b8b/SpMM_conflict_free_prefetch
29 | chmod 777 run_jobs.sh
30 | ./run_jobs.sh
31 | 
32 | cd -
33 | cd ./8b8b/SpMM_conflict_free_prefetch
34 | chmod 777 run_jobs.sh
35 | ./run_jobs.sh
36 | 
37 | cd -
38 | cd ./8b4b/SpMM_conflict_free_prefetch
39 | chmod 777 run_jobs.sh
40 | ./run_jobs.sh
41 | 
42 | cd -
43 | cd ./4b4b/SpMM_conflict_free_prefetch
44 | chmod 777 run_jobs.sh
45 | ./run_jobs.sh
46 | echo "SpMM with conflict-free SM + prefetch tested."
47 | 
48 | cd -
49 | cd ./8b4b/SpMM_conflict_free_prefetch_shuffle
50 | chmod 777 run_jobs.sh
51 | ./run_jobs.sh
52 | 
53 | cd -
54 | cd ./4b4b/SpMM_conflict_free_prefetch_shuffle
55 | chmod 777 run_jobs.sh
56 | ./run_jobs.sh
57 | echo "SpMM with conflict-free SM + prefetch + shuffle tested."
58 | 


--------------------------------------------------------------------------------
/baselines/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | NVCC = nvcc
 3 | # Compile and link flags are shared: -lcublas / -lcusparse only matter for the
 4 | # link steps and are ignored when compiling with -c.
 5 | NVCC_FLAGS = -std=c++11 -arch=sm_80 -lineinfo -lcublas -lcusparse
 6 | 
 7 | 
 8 | ##################################################################
 9 | 
10 | ## Project file structure ##
11 | 
12 | # Source file directory:
13 | SRC_DIR = src
14 | 
15 | # Object file directory:
16 | OBJ_DIR = bin
17 | 
18 | # Include header file directory
19 | INC_DIR = include
20 | 
21 | 
22 | ##################################################################
23 | 
24 | ## Compile ##
25 | 
26 | sddmm_benchmark: $(OBJ_DIR)/sddmm_benchmark.o $(OBJ_DIR)/cuda_sddmm.o $(OBJ_DIR)/wmma_sddmm.o $(OBJ_DIR)/cublas_gemm.o
27 | 	@$(NVCC) $(NVCC_FLAGS) $^ -o $@
28 | 
29 | spmm_benchmark: $(OBJ_DIR)/spmm_benchmark.o $(OBJ_DIR)/cuda_spmm.o $(OBJ_DIR)/wmma_spmm.o $(OBJ_DIR)/cublas_gemm.o
30 | 	@$(NVCC) $(NVCC_FLAGS) $^  -o $@
31 | 
32 | # Compile main file to object file.
33 | # "| $(OBJ_DIR)" is an order-only prerequisite: the directory is created when
34 | # missing, but its timestamp never forces a rebuild.
35 | $(OBJ_DIR)/%.o : %.cpp | $(OBJ_DIR)
36 | 	@$(NVCC) $(NVCC_FLAGS) -x c++ -c $< -o $@
37 | 
38 | 
39 | # Compile CUDA source files to object files
40 | $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cu $(INC_DIR)/%.cuh | $(OBJ_DIR)
41 | 	@$(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@
42 | 
43 | # Create the object directory on demand
44 | $(OBJ_DIR):
45 | 	@mkdir -p $@
46 | 
47 | # clean is a command, not a file: keep it runnable even if a file named
48 | # "clean" exists in this directory.
49 | .PHONY: clean
50 | clean:
51 | 	@rm -f $(OBJ_DIR)/*.o
52 | 


--------------------------------------------------------------------------------
/baselines/include/cublas_gemm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUBLAS_GEMM_H
 2 | #define CUBLAS_GEMM_H
 3 | #include <cublas_v2.h>
 4 | #include "cuda_fp16.h"
 5 | 
 6 | // GEMM entry points that take a cuBLAS handle and report a cublasStatus_t.
 7 | // cublasGeMM and cublasGeMMT are each overloaded for float and half buffers.
 8 | // NOTE(review): the rhs pointer precedes the lhs pointer in every signature,
 9 | // and cublasGeMMT is presumably a transposed variant - the operand layout is
10 | // fixed by the definitions, so confirm it there.
11 | 
12 | // float overload of cublasGeMM
13 | cublasStatus_t cublasGeMM(
14 |     cublasHandle_t handle,
15 |     int m, int n, int k,
16 |     float* d_rhs_matrix,
17 |     float* d_lhs_matrix,
18 |     float* d_output_matrix);
19 | 
20 | // half overload of cublasGeMM
21 | cublasStatus_t cublasGeMM(
22 |     cublasHandle_t handle,
23 |     int m, int n, int k,
24 |     half* d_rhs_matrix,
25 |     half* d_lhs_matrix,
26 |     half* d_output_matrix);
27 | 
28 | // float overload of cublasGeMMT
29 | cublasStatus_t cublasGeMMT(
30 |     cublasHandle_t handle,
31 |     int m, int n, int k,
32 |     float* d_rhs_matrix,
33 |     float* d_lhs_matrix,
34 |     float* d_output_matrix);
35 | 
36 | // half overload of cublasGeMMT
37 | cublasStatus_t cublasGeMMT(
38 |     cublasHandle_t handle,
39 |     int m, int n, int k,
40 |     half* d_rhs_matrix,
41 |     half* d_lhs_matrix,
42 |     half* d_output_matrix);
43 | 
44 | #endif


--------------------------------------------------------------------------------
/baselines/include/cuda_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_SDDMM_H
 2 | #define CUDA_SDDMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t and cudaStream_t are used below; the header
 6 | // no longer depends on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace sddmm{
11 | 
12 | // cudaSddmm is overloaded on precision; the three signatures differ only in
13 | // the element types of lhs_matrix / rhs_matrix / output_values:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // Every overload takes a cudaStream_t and returns a cudaError_t.
18 | // NOTE(review): m_vec / nonzeros_vec presumably count rows and nonzeros in
19 | // units of vec_length-long vectors; that is fixed by the definitions, which
20 | // are not part of this header - confirm there.
21 | 
22 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
23 |     const int* __restrict__ row_indices,
24 |     const int* __restrict__ row_offsets,
25 |     const int* __restrict__ col_indices,
26 |     const half* __restrict__ lhs_matrix,
27 |     const half* __restrict__ rhs_matrix,
28 |     float* __restrict__ output_values,
29 |     int vec_length, cudaStream_t stream) ;
30 | 
31 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
32 |     const int* __restrict__ row_indices,
33 |     const int* __restrict__ row_offsets,
34 |     const int* __restrict__ col_indices,
35 |     const half* __restrict__ lhs_matrix,
36 |     const half* __restrict__ rhs_matrix,
37 |     half* __restrict__ output_values,
38 |     int vec_length, cudaStream_t stream) ;
39 | 
40 | cudaError_t cudaSddmm(int m_vec, int k, int n, int nonzeros_vec,
41 |     const int* __restrict__ row_indices,
42 |     const int* __restrict__ row_offsets,
43 |     const int* __restrict__ col_indices,
44 |     const float* __restrict__ lhs_matrix,
45 |     const float* __restrict__ rhs_matrix,
46 |     float* __restrict__ output_values,
47 |     int vec_length, cudaStream_t stream) ;
48 | 
49 | } // namespace sddmm
50 | 
51 | #endif


--------------------------------------------------------------------------------
/baselines/include/cuda_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef CUDA_SPMM_H
 2 | #define CUDA_SPMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t is used below; the header no longer depends
 6 | // on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace spmm{
11 | 
12 | // cudaSpmm is overloaded on precision; the three signatures differ only in
13 | // the element types of values / rhs_matrix / output_matrix:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // No stream parameter is taken; each overload returns a cudaError_t.
18 | // NOTE(review): m_vec / vec_length presumably describe rows grouped into
19 | // vec_length-long vectors - confirm against the definitions.
20 | 
21 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
22 |     const int* __restrict__ row_indices,
23 |     const int* __restrict__ row_offsets,
24 |     const int* __restrict__ column_indices,
25 |     const half* __restrict__ values,
26 |     const half* __restrict__ rhs_matrix,
27 |     float* __restrict__ output_matrix) ;
28 | 
29 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
30 |     const int* __restrict__ row_indices,
31 |     const int* __restrict__ row_offsets,
32 |     const int* __restrict__ column_indices,
33 |     const half* __restrict__ values,
34 |     const half* __restrict__ rhs_matrix,
35 |     half* __restrict__ output_matrix) ;
36 | 
37 | cudaError_t cudaSpmm(int m_vec, int vec_length, int k, int n,
38 |     const int* __restrict__ row_indices,
39 |     const int* __restrict__ row_offsets,
40 |     const int* __restrict__ column_indices,
41 |     const float* __restrict__ values,
42 |     const float* __restrict__ rhs_matrix,
43 |     float* __restrict__ output_matrix) ;
44 | 
45 | } // namespace spmm
46 | 
47 | #endif


--------------------------------------------------------------------------------
/baselines/include/sputnik.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2020 The Sputnik Authors.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef THIRD_PARTY_SPUTNIK_SPUTNIK_H_
16 | #define THIRD_PARTY_SPUTNIK_SPUTNIK_H_
17 | 
18 | #include "sputnik/bias_relu/bias_relu.h"
19 | #include "sputnik/depthwise/cuda_depthwise.h"
20 | #include "sputnik/sddmm/cuda_sddmm.h"
21 | #include "sputnik/softmax/softmax.h"
22 | #include "sputnik/softmax/sparse_softmax.h"
23 | #include "sputnik/spmm/cuda_spmm.h"
24 | #include "sputnik/utils/index_format.h"
25 | 
26 | 
27 | #endif  // THIRD_PARTY_SPUTNIK_SPUTNIK_H_
28 | 


--------------------------------------------------------------------------------
/baselines/include/wmma_sddmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef WMMA_SDDMM_H
 2 | #define WMMA_SDDMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t and cudaStream_t are used below; the header
 6 | // no longer depends on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace sddmm{
11 | 
12 | // wmmaSddmm is overloaded on precision; the three signatures differ only in
13 | // the element types of lhs_matrix / rhs_matrix / output_values:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // Every overload takes a cudaStream_t and returns a cudaError_t.
18 | // NOTE(review): the meaning of m_vec / nonzeros_vec / vec_length and of the
19 | // `algorithm` selector is fixed by the definitions, which are not part of
20 | // this header - confirm there.
21 | 
22 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
23 |     const int* __restrict__ row_indices,
24 |     const int* __restrict__ row_offsets,
25 |     const int* __restrict__ col_indices,
26 |     const half* __restrict__ lhs_matrix,
27 |     const half* __restrict__ rhs_matrix,
28 |     float* __restrict__ output_values,
29 |     int vec_length, cudaStream_t stream, int algorithm) ;
30 | 
31 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
32 |     const int* __restrict__ row_indices,
33 |     const int* __restrict__ row_offsets,
34 |     const int* __restrict__ col_indices,
35 |     const half* __restrict__ lhs_matrix,
36 |     const half* __restrict__ rhs_matrix,
37 |     half* __restrict__ output_values,
38 |     int vec_length, cudaStream_t stream, int algorithm) ;
39 | 
40 | cudaError_t wmmaSddmm(int m_vec, int k, int n, int nonzeros_vec,
41 |     const int* __restrict__ row_indices,
42 |     const int* __restrict__ row_offsets,
43 |     const int* __restrict__ col_indices,
44 |     const float* __restrict__ lhs_matrix,
45 |     const float* __restrict__ rhs_matrix,
46 |     float* __restrict__ output_values,
47 |     int vec_length, cudaStream_t stream, int algorithm) ;
48 | 
49 | } // namespace sddmm
50 | 
51 | #endif


--------------------------------------------------------------------------------
/baselines/include/wmma_spmm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef WMMA_SPMM_H
 2 | #define WMMA_SPMM_H
 3 | 
 4 | // Includes live inside the guard, and the CUDA runtime header is pulled in
 5 | // explicitly because cudaError_t is used below; the header no longer depends
 6 | // on what the includer happened to include first.
 7 | #include <cuda_runtime.h>
 8 | #include "cuda_fp16.h"
 9 | 
10 | namespace spmm{
11 | 
12 | // wmmaSpmm is overloaded on precision; the three signatures differ only in
13 | // the element types of values / rhs_matrix / output_matrix:
14 | //   half  x half  -> float
15 | //   half  x half  -> half
16 | //   float x float -> float
17 | // No stream parameter is taken; each overload returns a cudaError_t.
18 | // NOTE(review): m_vec / vec_length presumably describe rows grouped into
19 | // vec_length-long vectors - confirm against the definitions.
20 | 
21 | cudaError_t wmmaSpmm(int m_vec, int vec_length, int k, int n,
22 |     const int* __restrict__ row_indices,
23 |     const int* __restrict__ row_offsets,
24 |     const int* __restrict__ column_indices,
25 |     const half* __restrict__ values,
26 |     const half* __restrict__ rhs_matrix,
27 |     float* __restrict__ output_matrix) ;
28 | 
29 | cudaError_t wmmaSpmm(int m_vec, int vec_length, int k, int n,
30 |     const int* __restrict__ row_indices,
31 |     const int* __restrict__ row_offsets,
32 |     const int* __restrict__ column_indices,
33 |     const half* __restrict__ values,
34 |     const half* __restrict__ rhs_matrix,
35 |     half* __restrict__ output_matrix) ;
36 | 
37 | cudaError_t wmmaSpmm(int m_vec, int vec_length, int k, int n,
38 |     const int* __restrict__ row_indices,
39 |     const int* __restrict__ row_offsets,
40 |     const int* __restrict__ column_indices,
41 |     const float* __restrict__ values,
42 |     const float* __restrict__ rhs_matrix,
43 |     float* __restrict__ output_matrix) ;
44 | 
45 | } // namespace spmm
46 | 
47 | #endif


--------------------------------------------------------------------------------
/baselines/launch_sddmm_cublas_fp16.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimK', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimKs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimK in dimKs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimK: ", dimK, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             #for i in range(1):
28 |             for i in range(len(lines)):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './sddmm_benchmark %s %d %d 0 2 1 0 0 1' % (matrix, dimK, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/baselines/launch_spmm_cublas_fp16.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 -2 1' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/baselines/launch_spmm_cublas_int8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 -1 1' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/baselines/launch_spmm_cusparse_fp16.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 2 1' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/baselines/launch_spmm_cusparse_int8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 3 1' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/baselines/launch_spmm_vectorSparse.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | # Args
 6 | parser = argparse.ArgumentParser(description='lauch the spmm benchmarks')
 7 | 
 8 | #parser.add_argument('--dimN', type=int, default=256, help="the dimension N of the benchmark")
 9 | #parser.add_argument('--dimV', type=int, default=8, help="vector length")
10 | #parser.add_argument('--sparsity', choices=['50', '70', '80', '90', '95', '98'], default='70', help='sparsity of the matrix')
11 | #parser.add_argument('--preA', type=int, default=8, help="number of bits for A")
12 | #parser.add_argument('--preB', type=int, default=8, help="number of bits for B")
13 | args = parser.parse_args()
14 | 
15 | dataset_dir = os.environ.get('dataset_dir')
16 | sparsities = ['50', '70', '80', '90', '95', '98']
17 | dimNs = [128, 256]
18 | vec_lens = [2, 4, 8]
19 | 
20 | for dimN in dimNs:
21 |     for vec_len in vec_lens:
22 |         for sparsity in sparsities:
23 |             print("dimN: ", dimN, "vec_len: ", vec_len, "sparsity: ", sparsity)
24 |         
25 |             matrix_list = open('./eval_matrices/s%s.txt' % sparsity, 'r')
26 |             lines = matrix_list.readlines()
27 |             for i in range(len(lines)):
28 |             #for i in range(1):
29 |                 matrix = '%s/%s' % (dataset_dir, lines[i][:-1])
30 |                 cmd = './spmm_benchmark %s %d %d 0 1 0 1 1' % (matrix, dimN, vec_len)
31 |                 os.system(cmd)
32 | 
33 | 


--------------------------------------------------------------------------------
/baselines/run_sddmm_baselines.sh:
--------------------------------------------------------------------------------
 1 | # Run each SDDMM baseline launcher in turn, redirecting its stdout to a matching .txt file.
 2 | echo "Testing sddmm_cublas_fp16"
 3 | python launch_sddmm_cublas_fp16.py > sddmm_cublas_fp16.txt
 4 | echo "Finish sddmm_cublas_fp16"
 5 | 
 6 | echo "Testing sddmm_cublas_int8"
 7 | python launch_sddmm_cublas_int8.py > sddmm_cublas_int8.txt
 8 | echo "Finish sddmm_cublas_int8"
 9 | 
10 | echo "Testing sddmm_vectorSparse"
11 | python launch_sddmm_vectorSparse.py > sddmm_vectorSparse.txt
12 | echo "Finish sddmm_vectorSparse"
13 | 


--------------------------------------------------------------------------------
/baselines/run_spmm_baselines.sh:
--------------------------------------------------------------------------------
 1 | # Run each SpMM baseline launcher in turn, redirecting its stdout to a matching .txt file.
 2 | echo "Testing spmm_cublas_fp16"
 3 | python launch_spmm_cublas_fp16.py > spmm_cublas_fp16.txt
 4 | echo "Finish spmm_cublas_fp16"
 5 | 
 6 | echo "Testing spmm_cublas_int8"
 7 | python launch_spmm_cublas_int8.py > spmm_cublas_int8.txt
 8 | echo "Finish spmm_cublas_int8"
 9 | 
10 | echo "Testing spmm_vectorSparse"
11 | python launch_spmm_vectorSparse.py > spmm_vectorSparse.txt
12 | echo "Finish spmm_vectorSparse"
13 | 
14 | echo "Testing spmm_cusparse_fp16"
15 | python launch_spmm_cusparse_fp16.py > spmm_cusparse_fp16.txt
16 | echo "Finish spmm_cusparse_fp16"
17 | 
18 | echo "Testing spmm_cusparse_int8"
19 | python launch_spmm_cusparse_int8.py > spmm_cusparse_int8.txt
20 | echo "Finish spmm_cusparse_int8"
21 | 


--------------------------------------------------------------------------------
/baselines/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ./bin  # create ./bin if missing (-p: no error when it already exists)
2 | make sddmm_benchmark  # build the sddmm_benchmark target of the local Makefile
3 | make spmm_benchmark  # build the spmm_benchmark target; runs even if the previous make failed (no `set -e`)


--------------------------------------------------------------------------------
/baselines/src/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // Compile-time integer power: base^0 == 1, otherwise base * base^(exponent - 1); unsigned arithmetic wraps on overflow.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
10 | }
11 | // Synchronizes groups of kThreadsPerOutputTile (= BlockWidth) threads; Tile_M only enters kThreadsPerBlock.
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default mask names every lane of the warp
17 |     // Sub-warp groups (1 < BlockWidth < 32) get a mask of BlockWidth consecutive bits selected by thread_idx_y.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was `< 1`, which made this branch unreachable; Sync() already tests `> 1`
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile);  // NOTE(review): assumes thread_idx_y * BlockWidth < 32 — confirm against callers
22 |         }
23 |     }
24 |     // Block-wide barrier when a group spans more than a warp, masked warp barrier otherwise; nothing for a single thread.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/baselines/usingwmma_run.sh:
--------------------------------------------------------------------------------
 1 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 1
 2 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 4 0 1 1 1 1
 3 | 
 4 | ./sddmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 1 1 1 1
 5 | ./sddmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 2 1 1 1 1
 6 | # NOTE: every command uses an absolute, user-specific matrix path; adjust it before running elsewhere.
 7 | 
 8 | 
 9 | 
10 | 
11 | 
12 | # no cpu checking
13 | ./spmm_benchmark  /users/shigang/gitrepo/dlmc/rn50/random_pruning/0.7/bottleneck_2_block_group3_5_1.smtx 512 8 0 1 0 1 1
14 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/README.md:
--------------------------------------------------------------------------------
 1 | # Sparse Transformer Inference
 2 | 
 3 | This repo provides a PyTorch extension that speeds up transformer inference with fixed structured sparsity.
 4 | 
 5 | The end-to-end speedup & memory profiling can be obtained with `end_to_end.py`. 
 6 | * To profile the execution time of sparse transformer, launch `python3 end_to_end.py --model sparse` with nsight system.
 7 | * To profile the execution time of dense transformer, launch `python3 end_to_end.py --model dense` with nsight system.
 8 | * To profile the memory of sparse transformer, launch `python3 end_to_end.py --model sparse --mem` with nsight system.
 9 | * To profile the memory of dense transformer, launch `python3 end_to_end.py --model dense --mem` with nsight system.
10 | 
11 | ***
12 | 
13 | #### Dependencies
14 | We generate the sparse mask with `scipy.sparse`. The pytorch version is `1.8.1+cu111`. The memory profiling is based on [`pytorch_memlab`](https://github.com/Stonesjtu/pytorch_memlab), and we annotate our program with `nvtx`. 
15 | 
16 | To build the custom kernels, please use `src/install.sh`. Because our kernels target the tensor core architecture of the V100 GPU, currently only `sm70` is supported.


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/cudaprofile.py:
--------------------------------------------------------------------------------
 1 | import ctypes
 2 | 
 3 | _cudart = ctypes.CDLL('libcudart.so')
 4 | 
 5 | 
 6 | def start():
 7 |     # As shown at http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PROFILER.html,
 8 |     # the return value will unconditionally be 0. This check is just in case it changes in
 9 |     # the future.
10 |     ret = _cudart.cudaProfilerStart()
11 |     if ret != 0:
12 |         raise Exception("cudaProfilerStart() returned %d" % ret)
13 | 
14 | def stop():
15 |     ret = _cudart.cudaProfilerStop()
16 |     if ret != 0:
17 |         raise Exception("cudaProfilerStop() returned %d" % ret)


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/run.sh:
--------------------------------------------------------------------------------
1 | nsys profile --force-overwrite true  -t cuda -o trans_report python3 end_to_end.py --model sparse  # profile the sparse model with Nsight Systems: trace CUDA, overwrite any existing trans_report output
2 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/src/cuda/softmax.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | torch::Tensor csr_softmax_cuda(  // declaration only; no definition appears in this file
 4 |     torch::Tensor row_indices,
 5 |     torch::Tensor row_offsets,
 6 |     torch::Tensor values,
 7 |     float scaler,
 8 |     int vec_length);
 9 | 
10 | // Wrapper registered with pybind11 below: forwards all five arguments unchanged to csr_softmax_cuda.
11 | torch::Tensor csr_softmax(
12 |     torch::Tensor row_indices,
13 |     torch::Tensor row_offsets,
14 |     torch::Tensor values,
15 |     float scaler,
16 |     int vec_length)
17 | {
18 |     return csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length);
19 | }
20 | 
21 | 
22 | torch::Tensor batched_csr_softmax_cuda(  // declaration only; no definition appears in this file
23 |     torch::Tensor row_indices,
24 |     torch::Tensor row_offsets,
25 |     torch::Tensor values,
26 |     float scaler,
27 |     int vec_length,
28 |     int batch_size);
29 | 
30 | // Wrapper registered with pybind11 below: same arguments as csr_softmax plus batch_size, all forwarded unchanged.
31 | torch::Tensor batched_csr_softmax(
32 |     torch::Tensor row_indices,
33 |     torch::Tensor row_offsets,
34 |     torch::Tensor values,
35 |     float scaler,
36 |     int vec_length,
37 |     int batch_size)
38 | {
39 |     return batched_csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length, batch_size);
40 | }
41 | 
42 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){  // module name is supplied by the build through TORCH_EXTENSION_NAME
43 |     m.def("csr_softmax", &csr_softmax, "Custom Softmax kernel");  // Python name: csr_softmax
44 |     m.def("bcsr_softmax", &batched_csr_softmax, "Custom Batched Softmax kernel");  // Python name: bcsr_softmax -> batched_csr_softmax
45 | }


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/src/cuda/spmm.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | torch::Tensor spmm_cuda(  // declaration only; no definition appears in this file
 4 |     torch::Tensor row_indices,
 5 |     torch::Tensor row_offsets,
 6 |     torch::Tensor column_indices,
 7 |     torch::Tensor values,
 8 |     torch::Tensor rhs_matrix,
 9 |     int vec_length);
10 | // Wrapper registered with pybind11 below: forwards all six arguments unchanged to spmm_cuda.
11 | torch::Tensor spmm(
12 |     torch::Tensor row_indices,
13 |     torch::Tensor row_offsets,
14 |     torch::Tensor column_indices,
15 |     torch::Tensor values,
16 |     torch::Tensor rhs_matrix,
17 |     int vec_length)
18 | {
19 |     return spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length);
20 | }
21 | 
22 | 
23 | 
24 | torch::Tensor batched_spmm_cuda(  // declaration only; no definition appears in this file
25 |     torch::Tensor row_indices,
26 |     torch::Tensor row_offsets,
27 |     torch::Tensor column_indices,
28 |     torch::Tensor values,
29 |     torch::Tensor rhs_matrix,
30 |     int vec_length);
31 | // Wrapper registered with pybind11 below: same parameter list as spmm (no explicit batch size), forwarded unchanged.
32 | torch::Tensor batched_spmm(
33 |     torch::Tensor row_indices,
34 |     torch::Tensor row_offsets,
35 |     torch::Tensor column_indices,
36 |     torch::Tensor values,
37 |     torch::Tensor rhs_matrix,
38 |     int vec_length)
39 | {
40 |     return batched_spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length);
41 | }
42 | 
43 | 
44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){  // module name is supplied by the build through TORCH_EXTENSION_NAME
45 |     m.def("spmm", &spmm, "Custom SPMM kernel");  // Python name: spmm
46 |     m.def("bspmm", &batched_spmm, "Custom Batched SPMM kernel");  // Python name: bspmm -> batched_spmm
47 | }


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/src/cuda/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | namespace spmm{
 7 | // Compile-time integer power: base^0 == 1, otherwise base * base^(exponent - 1); unsigned arithmetic wraps on overflow.
 8 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 9 |   return exponent == 0 ? 1 : base * StaticPow(base, exponent - 1);
10 | }
11 | // Synchronizes groups of kThreadsPerOutputTile (= BlockWidth) threads; Tile_M only enters kThreadsPerBlock.
12 | template <int Tile_M, int BlockWidth>
13 | struct Barrier{
14 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
15 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
16 |     uint32_t thread_mask = 0xffffffff;  // default mask names every lane of the warp
17 |     // Sub-warp groups (1 < BlockWidth < 32) get a mask of BlockWidth consecutive bits selected by thread_idx_y.
18 |     __device__ __forceinline__ Barrier(int thread_idx_y){
19 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was `< 1`, which made this branch unreachable; Sync() already tests `> 1`
20 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
21 |             thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile);  // NOTE(review): assumes thread_idx_y * BlockWidth < 32 — confirm against callers
22 |         }
23 |     }
24 |     // Block-wide barrier when a group spans more than a warp, masked warp barrier otherwise; nothing for a single thread.
25 |     __device__ __forceinline__ void Sync(){
26 |         if (kThreadsPerOutputTile > 32){
27 |             __syncthreads();
28 |         } else if (kThreadsPerOutputTile > 1){
29 |             __syncwarp(thread_mask);
30 |         }
31 |     }
32 | };
33 | }
34 | #endif


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/src/install.sh:
--------------------------------------------------------------------------------
1 | python3 -W ignore setup.py build  # build step of setup.py; -W ignore suppresses Python warnings
2 | python3 -W ignore setup.py install  # install step; still runs if the build above failed (no `set -e`)


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/src/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension, CUDAExtension
 3 | 
 4 | setup(
 5 |     name='sptrans',
 6 |     version='0.0.1',
 7 |     description='Custom library for Sparse Transformer for pytorch',
 8 |     author='Zhaodong Chen',
 9 |     author_email='chenzd15thu@ucsb.edu',
10 |     ext_modules=[
11 |         CUDAExtension('sptrans.sddmm', 
12 |                       ['cuda/sddmm.cpp', 'cuda/sddmm_kernel.cu'],
13 |                       extra_compile_args={'cxx':[], 'nvcc':['-arch=sm_80', '-lcusparse', '--ptxas-options=-v', '-lineinfo']}),
14 |         CUDAExtension('sptrans.spmm', 
15 |                       ['cuda/spmm.cpp', 'cuda/spmm_kernel.cu'],
16 |                       extra_compile_args={'cxx':[], 'nvcc':['-arch=sm_80', '-lcusparse', '--ptxas-options=-v', '-lineinfo']}),
17 |         CUDAExtension('sptrans.softmax', 
18 |                       ['cuda/softmax.cpp', 'cuda/softmax_kernel.cu'],
19 |                       extra_compile_args={'cxx':[], 'nvcc':['-arch=sm_80', '-lcusparse', '--ptxas-options=-v', '-lineinfo']}),
20 |         ],
21 |     cmdclass={'build_ext': BuildExtension},
22 |     install_requires=['torch']
23 | )
24 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_baselines/verify/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCIS/Magicube/8f92b69e9c1d7a0406eacb773ef5e79a71eda4f0/end2end_eval/sparse_transformer_baselines/verify/__init__.py


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/cudaprofile.py:
--------------------------------------------------------------------------------
 1 | import ctypes
 2 | 
 3 | _cudart = ctypes.CDLL('libcudart.so')
 4 | 
 5 | 
 6 | def start():
 7 |     # As shown at http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PROFILER.html,
 8 |     # the return value will unconditionally be 0. This check is just in case it changes in
 9 |     # the future.
10 |     ret = _cudart.cudaProfilerStart()
11 |     if ret != 0:
12 |         raise Exception("cudaProfilerStart() returned %d" % ret)
13 | 
14 | def stop():
15 |     ret = _cudart.cudaProfilerStop()
16 |     if ret != 0:
17 |         raise Exception("cudaProfilerStop() returned %d" % ret)


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/run.sh:
--------------------------------------------------------------------------------
1 | nsys profile --force-overwrite true  -t cuda -o trans_report python3 end_to_end.py --model sparse  # profile the sparse model with Nsight Systems: trace CUDA, overwrite any existing trans_report output
2 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/src/cuda/quantization.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | torch::Tensor quantization_cuda(torch::Tensor input_matrix, int bits, float scale);  // declaration only; no definition appears in this file
 4 | // Wrapper registered with pybind11 below: forwards input_matrix, bits and scale unchanged to quantization_cuda.
 5 | torch::Tensor quantization(torch::Tensor input_matrix, int bits, float scale)
 6 | {
 7 |     return quantization_cuda(input_matrix, bits, scale);
 8 | }
 9 | 
10 | torch::Tensor batched_quantization_cuda(torch::Tensor input_matrix, int bits, float scale);  // declaration only; no definition appears in this file
11 | // Wrapper registered with pybind11 below: same parameters as quantization, forwarded unchanged to the batched entry point.
12 | torch::Tensor batched_quantization(torch::Tensor input_matrix, int bits, float scale)
13 | {
14 |     return batched_quantization_cuda(input_matrix, bits, scale);
15 | }
16 | 
17 | 
18 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){  // module name is supplied by the build through TORCH_EXTENSION_NAME
19 |     m.def("quantization", &quantization, "Custom symmetric quantization kernel");  // Python name: quantization
20 |     m.def("bquantization", &batched_quantization, "Custom Batched symmetric quantization kernel");  // Python name: bquantization -> batched_quantization
21 | }
22 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/src/cuda/softmax.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | torch::Tensor csr_softmax_cuda(  // declaration only; no definition appears in this file
 4 |     torch::Tensor row_indices,
 5 |     torch::Tensor row_offsets,
 6 |     torch::Tensor values,
 7 |     float scaler,
 8 |     int vec_length);
 9 | 
10 | // Wrapper registered with pybind11 below: forwards all five arguments unchanged to csr_softmax_cuda.
11 | torch::Tensor csr_softmax(
12 |     torch::Tensor row_indices,
13 |     torch::Tensor row_offsets,
14 |     torch::Tensor values,
15 |     float scaler,
16 |     int vec_length)
17 | {
18 |     return csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length);
19 | }
20 | 
21 | 
22 | torch::Tensor batched_csr_softmax_cuda(  // declaration only; no definition appears in this file
23 |     torch::Tensor row_indices,
24 |     torch::Tensor row_offsets,
25 |     torch::Tensor values,
26 |     float scaler,
27 |     int vec_length,
28 |     int batch_size);
29 | 
30 | // Wrapper registered with pybind11 below: same arguments as csr_softmax plus batch_size, all forwarded unchanged.
31 | torch::Tensor batched_csr_softmax(
32 |     torch::Tensor row_indices,
33 |     torch::Tensor row_offsets,
34 |     torch::Tensor values,
35 |     float scaler,
36 |     int vec_length,
37 |     int batch_size)
38 | {
39 |     return batched_csr_softmax_cuda(row_indices, row_offsets, values, scaler, vec_length, batch_size);
40 | }
41 | 
42 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){  // module name is supplied by the build through TORCH_EXTENSION_NAME
43 |     m.def("csr_softmax", &csr_softmax, "Custom Softmax kernel");  // Python name: csr_softmax
44 |     m.def("bcsr_softmax", &batched_csr_softmax, "Custom Batched Softmax kernel");  // Python name: bcsr_softmax -> batched_csr_softmax
45 | }


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/src/cuda/spmm.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | torch::Tensor spmm_cuda(  // declaration only; no definition appears in this file
 4 |     torch::Tensor row_indices,
 5 |     torch::Tensor row_offsets,
 6 |     torch::Tensor column_indices,
 7 |     torch::Tensor values,
 8 |     torch::Tensor rhs_matrix,
 9 |     int vec_length);
10 | // Wrapper registered with pybind11 below: forwards all six arguments unchanged to spmm_cuda.
11 | torch::Tensor spmm(
12 |     torch::Tensor row_indices,
13 |     torch::Tensor row_offsets,
14 |     torch::Tensor column_indices,
15 |     torch::Tensor values,
16 |     torch::Tensor rhs_matrix,
17 |     int vec_length)
18 | {
19 |     return spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length);
20 | }
21 | 
22 | 
23 | 
24 | torch::Tensor batched_spmm_cuda(  // declaration only; no definition appears in this file
25 |     torch::Tensor row_indices,
26 |     torch::Tensor row_offsets,
27 |     torch::Tensor column_indices,
28 |     torch::Tensor values,
29 |     torch::Tensor rhs_matrix,
30 |     int vec_length);
31 | // Wrapper registered with pybind11 below: same parameter list as spmm (no explicit batch size), forwarded unchanged.
32 | torch::Tensor batched_spmm(
33 |     torch::Tensor row_indices,
34 |     torch::Tensor row_offsets,
35 |     torch::Tensor column_indices,
36 |     torch::Tensor values,
37 |     torch::Tensor rhs_matrix,
38 |     int vec_length)
39 | {
40 |     return batched_spmm_cuda(row_indices, row_offsets, column_indices, values, rhs_matrix, vec_length);
41 | }
42 | 
43 | 
44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){  // module name is supplied by the build through TORCH_EXTENSION_NAME
45 |     m.def("spmm", &spmm, "Custom SPMM kernel");  // Python name: spmm
46 |     m.def("bspmm", &batched_spmm, "Custom Batched SPMM kernel");  // Python name: bspmm -> batched_spmm
47 | }


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/src/cuda/spmm_utils/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | // Compile-time integer power: base^0 == 1, otherwise base * base^(exponent - 1); unsigned arithmetic wraps on overflow.
 7 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 8 |   return exponent != 0 ? base * StaticPow(base, exponent - 1) : 1;
 9 | }
10 | // Synchronizes groups of kThreadsPerOutputTile (= BlockWidth) threads; Tile_M only enters kThreadsPerBlock.
11 | template <int Tile_M, int BlockWidth>
12 | struct Barrier{
13 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
14 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
15 |     uint32_t thread_mask = 0xffffffff;  // default mask names every lane of the warp
16 |     // Sub-warp groups (1 < BlockWidth < 32) get a mask of BlockWidth consecutive bits selected by thread_idx_y.
17 |     __device__ __forceinline__ Barrier(int thread_idx_y){
18 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was `< 1`, which made this branch unreachable; Sync() already tests `> 1`
19 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
20 |             thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile);  // NOTE(review): assumes thread_idx_y * BlockWidth < 32 — confirm against callers
21 |         }
22 |     }
23 |     // Block-wide barrier when a group spans more than a warp, masked warp barrier otherwise; nothing for a single thread.
24 |     __device__ __forceinline__ void Sync(){
25 |         if (kThreadsPerOutputTile > 32){
26 |             __syncthreads();
27 |         } else if (kThreadsPerOutputTile > 1){
28 |             __syncwarp(thread_mask);
29 |         }
30 |     }
31 | };
32 | #endif
33 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/src/cuda/spmm_utils_N128_bk/barrier.h:
--------------------------------------------------------------------------------
 1 | #ifndef BARRIER_H
 2 | #define BARRIER_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | // Compile-time integer power: base^0 == 1, otherwise base * base^(exponent - 1); unsigned arithmetic wraps on overflow.
 7 | __device__ constexpr uint32_t StaticPow(uint32_t base, uint32_t exponent) {
 8 |   return exponent != 0 ? base * StaticPow(base, exponent - 1) : 1;
 9 | }
10 | // Synchronizes groups of kThreadsPerOutputTile (= BlockWidth) threads; Tile_M only enters kThreadsPerBlock.
11 | template <int Tile_M, int BlockWidth>
12 | struct Barrier{
13 |     static constexpr int kThreadsPerBlock = Tile_M * BlockWidth;
14 |     static constexpr int kThreadsPerOutputTile = BlockWidth;
15 |     uint32_t thread_mask = 0xffffffff;  // default mask names every lane of the warp
16 |     // Sub-warp groups (1 < BlockWidth < 32) get a mask of BlockWidth consecutive bits selected by thread_idx_y.
17 |     __device__ __forceinline__ Barrier(int thread_idx_y){
18 |         if ((kThreadsPerOutputTile < 32) && (kThreadsPerOutputTile > 1)){  // was `< 1`, which made this branch unreachable; Sync() already tests `> 1`
19 |             constexpr uint32_t kBaseSubwarpMask = StaticPow(2, kThreadsPerOutputTile) - 1;
20 |             thread_mask = kBaseSubwarpMask << (thread_idx_y * kThreadsPerOutputTile);  // NOTE(review): assumes thread_idx_y * BlockWidth < 32 — confirm against callers
21 |         }
22 |     }
23 |     // Block-wide barrier when a group spans more than a warp, masked warp barrier otherwise; nothing for a single thread.
24 |     __device__ __forceinline__ void Sync(){
25 |         if (kThreadsPerOutputTile > 32){
26 |             __syncthreads();
27 |         } else if (kThreadsPerOutputTile > 1){
28 |             __syncwarp(thread_mask);
29 |         }
30 |     }
31 | };
32 | #endif
33 | 


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/src/install.sh:
--------------------------------------------------------------------------------
1 | python3 -W ignore setup.py build  # build step of setup.py; -W ignore suppresses Python warnings
2 | python3 -W ignore setup.py install  # install step; still runs if the build above failed (no `set -e`)


--------------------------------------------------------------------------------
/end2end_eval/sparse_transformer_magicube/verify/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCIS/Magicube/8f92b69e9c1d7a0406eacb773ef5e79a71eda4f0/end2end_eval/sparse_transformer_magicube/verify/__init__.py


--------------------------------------------------------------------------------
/plot/confinter.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.stats as st
 3 | import seaborn as sns
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | np.random.seed(0)
 7 | #data = np.random.randint(10, 30, 50)
 8 | #data = np.array([0.01021, 0.011004, 0.010868, 0.011072, 0.011223, 0.010629, 0.011198, 0.01027, 0.010863, 0.010955, 0.010587, 0.011011, 0.010517, 0.011234, 0.011296, 0.010959])
 9 | data = np.array([0.00621, 0.007004, 0.010868, 0.011072, 0.011223, 0.010629, 0.011198, 0.01027, 0.010863, 0.010955, 0.010587, 0.011011, 0.010517, 0.011234, 0.011296, 0.010959])
10 | 
11 | meanv = np.mean(data)
12 | inter = st.norm.interval(alpha=0.95, loc=np.mean(data), scale=st.sem(data))
13 | print("interval: ", inter, "mean: ", meanv)
14 | 
15 | 


--------------------------------------------------------------------------------
/plot/figs/.gitignore:
--------------------------------------------------------------------------------
1 | ## ignore this file ##
2 | *.log
3 | *.o
4 | *.pdf
5 | 


--------------------------------------------------------------------------------
/plot/gen_csv.sh:
--------------------------------------------------------------------------------
1 | 
# Run every data-collection script in turn (the names suggest each one writes
# the CSV that the matching plot_*.py script reads). A failing script does not
# stop the remaining ones.
for script in spmm_abl_study.py spmm_pres.py sddmm_abl_study.py \
              spmm_all_matrices.py sddmm_all_matrices.py n2n.py; do
    python "$script"
done
8 | 
9 | 


--------------------------------------------------------------------------------
/plot/plot.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | python plot_spmm_abl_study.py
 3 | python plot_spmm_pres.py
 4 | python plot_sddmm_abl_study.py
 5 | python plot_spmm_all_matrices.py
 6 | python plot_sddmm_all_matrices.py
 7 | python plot_n2n_a.py
 8 | python plot_n2n_b.py
 9 | python plot_n2n_c.py
10 | python plot_n2n_d.py
11 | python plot_n2n_e.py
12 | python plot_n2n_f.py
13 | python plot_n2n_g.py
14 | python plot_n2n_h.py
15 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_a.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | n2n_a_data = pd.read_csv('n2n_a.csv')
12 | sns.set(rc={"lines.linewidth": 0.5})
13 | sns.set(rc={'figure.figsize':(5, 3)})
14 | g = sns.barplot(data=n2n_a_data, x="S0.9,Seq_l=4096,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
15 | #plt.xticks(rotation=20)
16 | g.tick_params(labelsize=8)
17 | g.set(ylim=(0, 25))
18 | g.set_xlabel(" ", fontsize=8)
19 | g.set_title('Sparsity=0.9, Seq_len=4096, num_h=4')
20 | plt.setp(g.get_legend().get_texts(), fontsize='6')
21 | plt.setp(g.get_legend().get_title(), fontsize='6')
22 | g.figure.savefig('./figs/Figure16-a.pdf')
23 | 
24 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_b.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | n2n_b_data = pd.read_csv('n2n_b.csv')
12 | sns.set(rc={"lines.linewidth": 0.5})
13 | sns.set(rc={'figure.figsize':(5, 3)})
14 | g = sns.barplot(data=n2n_b_data, x="S0.9,Seq_l=4096,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
15 | g.tick_params(labelsize=8)
16 | g.set(ylim=(0, 50))
17 | g.set_xlabel(" ", fontsize=8)
18 | g.set_title('Sparsity=0.9, Seq_len=4096, num_h=8')
19 | plt.setp(g.get_legend().get_texts(), fontsize='6')
20 | plt.setp(g.get_legend().get_title(), fontsize='6')
21 | g.figure.savefig('./figs/Figure16-b.pdf')
22 | 
23 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_c.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | 
12 | n2n_c_data = pd.read_csv('n2n_c.csv')
13 | sns.set(rc={"lines.linewidth": 0.5})
14 | sns.set(rc={'figure.figsize':(5, 3)})
15 | g = sns.barplot(data=n2n_c_data, x="S0.9,Seq_l=8192,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
16 | g.tick_params(labelsize=8)
17 | g.set(ylim=(0, 70))
18 | g.set_xlabel(" ", fontsize=8)
19 | g.set_title('Sparsity=0.9, Seq_len=8192, num_h=4')
20 | plt.setp(g.get_legend().get_texts(), fontsize='6')
21 | plt.setp(g.get_legend().get_title(), fontsize='6')
22 | g.figure.savefig('./figs/Figure16-c.pdf')
23 | 
24 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_d.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | 
12 | n2n_d_data = pd.read_csv('n2n_d.csv')
13 | sns.set(rc={"lines.linewidth": 0.5})
14 | sns.set(rc={'figure.figsize':(5, 3)})
15 | g = sns.barplot(data=n2n_d_data, x="S0.9,Seq_l=8192,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
16 | g.tick_params(labelsize=8)
17 | g.set(ylim=(0, 150))
18 | g.set_xlabel(" ", fontsize=8)
19 | g.set_title('Sparsity=0.9, Seq_len=8192, num_h=8')
20 | plt.setp(g.get_legend().get_texts(), fontsize='6')
21 | plt.setp(g.get_legend().get_title(), fontsize='6')
22 | g.figure.savefig('./figs/Figure16-d.pdf')
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_e.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | 
12 | 
13 | n2n_e_data = pd.read_csv('n2n_e.csv')
14 | sns.set(rc={"lines.linewidth": 0.5})
15 | sns.set(rc={'figure.figsize':(5, 3)})
16 | g = sns.barplot(data=n2n_e_data, x="S0.95,Seq_l=4096,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
17 | #plt.xticks(rotation=20)
18 | g.tick_params(labelsize=8)
19 | g.set(ylim=(0, 25))
20 | g.set_xlabel(" ", fontsize=8)
21 | g.set_title('Sparsity=0.95, Seq_len=4096, num_h=4')
22 | plt.setp(g.get_legend().get_texts(), fontsize='6')
23 | plt.setp(g.get_legend().get_title(), fontsize='6')
24 | g.figure.savefig('./figs/Figure16-e.pdf')
25 | 
26 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_f.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | 
12 | n2n_f_data = pd.read_csv('n2n_f.csv')
13 | sns.set(rc={"lines.linewidth": 0.5})
14 | sns.set(rc={'figure.figsize':(5, 3)})
15 | g = sns.barplot(data=n2n_f_data, x="S0.95,Seq_l=4096,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
16 | g.tick_params(labelsize=8)
17 | g.set(ylim=(0, 50))
18 | g.set_xlabel(" ", fontsize=8)
19 | g.set_title('Sparsity=0.95, Seq_len=4096, num_h=8')
20 | plt.setp(g.get_legend().get_texts(), fontsize='6')
21 | plt.setp(g.get_legend().get_title(), fontsize='6')
22 | g.figure.savefig('./figs/Figure16-f.pdf')
23 | 
24 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_g.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | 
12 | n2n_g_data = pd.read_csv('n2n_g.csv')
13 | sns.set(rc={"lines.linewidth": 0.5})
14 | sns.set(rc={'figure.figsize':(5, 3)})
15 | g = sns.barplot(data=n2n_g_data, x="S0.95,Seq_l=8192,num_h=4", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
16 | g.tick_params(labelsize=8)
17 | g.set(ylim=(0, 70))
18 | g.set_xlabel(" ", fontsize=8)
19 | g.set_title('Sparsity=0.95, Seq_len=8192, num_h=4')
20 | plt.setp(g.get_legend().get_texts(), fontsize='6')
21 | plt.setp(g.get_legend().get_title(), fontsize='6')
22 | g.figure.savefig('./figs/Figure16-g.pdf')
23 | 
24 | 


--------------------------------------------------------------------------------
/plot/plot_n2n_h.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #sns.set_context(rc = {'patch.linewidth': 0.0})
 9 | order = ['Pytorch-fp16', 'vectorSparse-fp16', 'Magicube-16b8b', 'Magicube-8b8b', 'Magicube-8b4b', 'Magicube-4b4b']
10 | 
11 | 
12 | n2n_h_data = pd.read_csv('n2n_h.csv')
13 | sns.set(rc={"lines.linewidth": 0.5})
14 | sns.set(rc={'figure.figsize':(5, 3)})
15 | g = sns.barplot(data=n2n_h_data, x="S0.95,Seq_l=8192,num_h=8", y="Latency(ms)", hue="algs", palette="Blues_d", hue_order=order, ci=95, capsize=.1, errwidth=0.8)
16 | g.tick_params(labelsize=8)
17 | g.set(ylim=(0, 150))
18 | g.set_xlabel(" ", fontsize=8)
19 | g.set_title('Sparsity=0.95, Seq_len=8192, num_h=8')
20 | plt.setp(g.get_legend().get_texts(), fontsize='6')
21 | plt.setp(g.get_legend().get_title(), fontsize='6')
22 | g.figure.savefig('./figs/Figure16-h.pdf')
23 | 


--------------------------------------------------------------------------------
/plot/plot_sddmm_abl_study.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | sddmm_abl_study_data = pd.read_csv('sddmm_abl_study.csv')
 9 | sns.set(rc={"lines.linewidth": 0.5})
10 | sns.set(rc={'figure.figsize':(15, 5)})
11 | g = sns.barplot(data=sddmm_abl_study_data, x="configs", y="TOP/s", hue="pres", palette="Blues_d")
12 | plt.xticks(rotation=20)
13 | g.tick_params(labelsize=8)
14 | g.set(ylim=(0, 40))
15 | g.figure.savefig('./figs/Figure13.pdf')
16 | 
17 | 


--------------------------------------------------------------------------------
/plot/plot_sddmm_all_matrices.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | 
 9 | sddmm_data = pd.read_csv('sddmm_all_matrices.csv')
10 | ##print(sddmm_data)
11 | #
12 | #fgrid = sns.FacetGrid(sddmm_data, col="vecLen", row="dimN")
13 | #fgrid.map_dataframe(sns.boxplot, x="Sparsity", y="", data=sddmm_data)
14 | #
15 | #
16 | #fgrid.figure.savefig('test.pdf')
17 | sns.set(rc={"lines.linewidth": 0.5})
18 | g = sns.catplot(x="sparsity", y="speedup",
19 |                 hue="algs", col="V", row="K", fliersize=3,
20 |                 data=sddmm_data, kind="box",
21 |                 height=4, aspect=1.6)
22 | g.set(ylim=(0.0, 3.0))
23 | plt.axhline(1.0, linestyle='--', linewidth=2.7, color='blue')
24 | g.figure.savefig('./figs/Figure15.pdf')
25 | 


--------------------------------------------------------------------------------
/plot/plot_spmm_abl_study.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | order = ['V2,L16R8,S0.7', 'V8,L16R8,S0.7', 'V2,L8R8,S0.7', 'V8,L8R8,S0.7',
 9 |          'V2,L8R4,S0.7', 'V8,L8R4,S0.7', 'V2,L4R4,S0.7', 'V8,L4R4,S0.7',
10 |          'V2,L16R8,S0.9', 'V8,L16R8,S0.9', 'V2,L8R8,S0.9', 'V8,L8R8,S0.9',
11 |          'V2,L8R4,S0.9', 'V8,L8R4,S0.9', 'V2,L4R4,S0.9', 'V8,L4R4,S0.9']
12 | 
13 | #sns.color_palette("Blues", as_cmap=True)
14 | spmm_abl_data = pd.read_csv('spmm_abl_study.csv')
15 | sns.set(rc={"lines.linewidth": 0.5})
16 | sns.set(rc={'figure.figsize':(15, 5)})
17 | g = sns.barplot(data=spmm_abl_data, x="configs", y="TOP/s", hue="opts", order=order, palette="Blues_d")
18 | plt.xticks(rotation=20)
19 | g.tick_params(labelsize=8)
20 | g.set(ylim=(0, 40))
21 | g.figure.savefig('./figs/Figure11.pdf')
22 | 
23 | 


--------------------------------------------------------------------------------
/plot/plot_spmm_all_matrices.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | 
 9 | spmm_data = pd.read_csv('spmm_all_matrices.csv')
10 | ##print(spmm_data)
11 | #
12 | #fgrid = sns.FacetGrid(spmm_data, col="vecLen", row="dimN")
13 | #fgrid.map_dataframe(sns.boxplot, x="Sparsity", y="", data=spmm_data)
14 | #
15 | #
16 | #fgrid.figure.savefig('test.pdf')
17 | sns.set(rc={"lines.linewidth": 0.5})
18 | g = sns.catplot(x="sparsity", y="speedup",
19 |                 hue="algs", col="V", row="N", fliersize=3,
20 |                 data=spmm_data, kind="box",
21 |                 height=4, aspect=1.6)
22 | g.set(ylim=(0.0, 3.0))
23 | plt.axhline(1.0, linestyle='--', linewidth=2.7, color='blue')
24 | g.figure.savefig('./figs/Figure14.pdf')
25 | 


--------------------------------------------------------------------------------
/plot/plot_spmm_pres.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import six
 3 | import csv
 4 | import seaborn as sns
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | #order = ['L4-R4', 'L8-R4', 'L12-R4', 'L16-R4', 'L8-R8', 'L16-R8', 'L16-R16']
 9 | spmm_pres_data = pd.read_csv('spmm_pres.csv')
10 | sns.set(rc={"lines.linewidth": 0.5})
11 | sns.set(rc={'figure.figsize':(15, 5)})
12 | #g = sns.barplot(data=spmm_pres_data, x="configs", y="TOP/s", hue="pres", palette="Blues_d", hue_order=order)
13 | g = sns.barplot(data=spmm_pres_data, x="configs", y="TOP/s", hue="pres", palette="Blues_d")
14 | plt.xticks(rotation=20)
15 | g.tick_params(labelsize=8)
16 | g.set(ylim=(0, 45))
17 | g.figure.savefig('./figs/Figure12.pdf')
18 | 
19 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | nvtx
2 | scipy
3 | pytorch_memlab
4 | seaborn
5 | 


--------------------------------------------------------------------------------