├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── common.mk ├── src ├── cusync.cu ├── examples │ ├── sync-overhead │ │ └── sync-overhead.cu │ └── twoMatMuls │ │ ├── Makefile │ │ └── matrixMul.cu ├── include │ ├── cusync │ │ ├── cusync.h │ │ ├── cusync_defines.h │ │ ├── cusync_device_defs.h │ │ ├── device-functions.h │ │ ├── policies.h │ │ ├── tile-orders.h │ │ └── wait-kernel.h │ └── cutlass │ │ └── cusync-cutlass │ │ └── include │ │ └── cutlass │ │ ├── conv │ │ ├── device │ │ │ └── cusyncimplicit_gemm_convolution.h │ │ ├── kernel │ │ │ ├── cusyncdefault_conv2d_fprop.h │ │ │ └── implicit_cusyncgemm_convolution.h │ │ └── threadblock │ │ │ └── implicit_cusyncgemm_pipelined.h │ │ └── gemm │ │ ├── device │ │ └── cusyncgemm.h │ │ ├── kernel │ │ ├── cusyncgemm.h │ │ └── default_cusyncgemm.h │ │ └── threadblock │ │ ├── cusync_threadblock_swizzle.h │ │ ├── cusyncmma_multistage.h │ │ ├── cusyncmma_pipelined.h │ │ └── default_cusyncmma.h └── ml-bench │ ├── README.md │ ├── common.mk │ ├── plots │ ├── Makefile │ ├── common.py │ ├── mlp-gpt3-a100.png │ ├── mlp-gpt3-v100.png │ ├── mlp-llama-a100.png │ ├── mlp-llama-v100.png │ └── plotGPT.py │ ├── transformer │ ├── Makefile │ ├── allreduce_times.py │ ├── attention.cu │ ├── common.h │ ├── eval_mlp.py │ ├── mlp-lib.cu │ ├── mlp.cu │ ├── results │ │ ├── allreduce_times-12288 │ │ ├── attention-results │ │ ├── attention-results-gpt-3-cuda-12.2 │ │ ├── attention-results-gpt3 │ │ ├── attention-results-llama │ │ ├── attention-stream-k-output │ │ ├── mlp-gpt3-a100.csv │ │ ├── mlp-gpt3-v100.csv │ │ ├── mlp-llama-a100.csv │ │ ├── mlp-llama-v100.csv │ │ ├── mlp-results-2 │ │ ├── mlp-results-gpt-3 │ │ ├── mlp-results-gpt3-cuda-12.2 │ │ ├── mlp-results-in-paper │ │ ├── mlp-results-llama │ │ └── mlp-stream-k-output │ ├── streamk.cu │ ├── tile_sizes_db.py │ └── torch-baselines │ │ ├── cublasBaseline.py │ │ ├── torchAttention.py │ │ └── torchmlp.py │ └── volta_conv2d │ ├── Makefile │ ├── eval_resnet.py │ ├── resnet.cu │ ├── resnet_results.csv │ ├── torchconv2d.py │ ├── vgg-results-cuda-12.2 │ └── vgg.cu └── tests ├── cusync-test.h └── simple-test.cu /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/.gitmodules -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/Dockerfile -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/LICENSE -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/SECURITY.md -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/SUPPORT.md -------------------------------------------------------------------------------- /common.mk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/common.mk -------------------------------------------------------------------------------- /src/cusync.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/cusync.cu -------------------------------------------------------------------------------- /src/examples/sync-overhead/sync-overhead.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/examples/sync-overhead/sync-overhead.cu -------------------------------------------------------------------------------- /src/examples/twoMatMuls/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/examples/twoMatMuls/Makefile -------------------------------------------------------------------------------- /src/examples/twoMatMuls/matrixMul.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/examples/twoMatMuls/matrixMul.cu -------------------------------------------------------------------------------- /src/include/cusync/cusync.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/cusync.h -------------------------------------------------------------------------------- /src/include/cusync/cusync_defines.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/cusync_defines.h -------------------------------------------------------------------------------- /src/include/cusync/cusync_device_defs.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/cusync_device_defs.h -------------------------------------------------------------------------------- /src/include/cusync/device-functions.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/device-functions.h -------------------------------------------------------------------------------- /src/include/cusync/policies.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/policies.h -------------------------------------------------------------------------------- /src/include/cusync/tile-orders.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/tile-orders.h -------------------------------------------------------------------------------- /src/include/cusync/wait-kernel.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cusync/wait-kernel.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/conv/device/cusyncimplicit_gemm_convolution.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/conv/device/cusyncimplicit_gemm_convolution.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/conv/kernel/cusyncdefault_conv2d_fprop.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/conv/kernel/cusyncdefault_conv2d_fprop.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/conv/kernel/implicit_cusyncgemm_convolution.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/conv/kernel/implicit_cusyncgemm_convolution.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/conv/threadblock/implicit_cusyncgemm_pipelined.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/conv/threadblock/implicit_cusyncgemm_pipelined.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/device/cusyncgemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/device/cusyncgemm.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/kernel/cusyncgemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/kernel/cusyncgemm.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/kernel/default_cusyncgemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/kernel/default_cusyncgemm.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/cusync_threadblock_swizzle.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/cusync_threadblock_swizzle.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/cusyncmma_multistage.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/cusyncmma_multistage.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/cusyncmma_pipelined.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/cusyncmma_pipelined.h -------------------------------------------------------------------------------- /src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/default_cusyncmma.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/include/cutlass/cusync-cutlass/include/cutlass/gemm/threadblock/default_cusyncmma.h -------------------------------------------------------------------------------- /src/ml-bench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/README.md -------------------------------------------------------------------------------- /src/ml-bench/common.mk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/common.mk -------------------------------------------------------------------------------- /src/ml-bench/plots/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/Makefile -------------------------------------------------------------------------------- /src/ml-bench/plots/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/common.py -------------------------------------------------------------------------------- /src/ml-bench/plots/mlp-gpt3-a100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/mlp-gpt3-a100.png -------------------------------------------------------------------------------- /src/ml-bench/plots/mlp-gpt3-v100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/mlp-gpt3-v100.png -------------------------------------------------------------------------------- /src/ml-bench/plots/mlp-llama-a100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/mlp-llama-a100.png -------------------------------------------------------------------------------- /src/ml-bench/plots/mlp-llama-v100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/mlp-llama-v100.png -------------------------------------------------------------------------------- /src/ml-bench/plots/plotGPT.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/plots/plotGPT.py -------------------------------------------------------------------------------- /src/ml-bench/transformer/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/Makefile -------------------------------------------------------------------------------- /src/ml-bench/transformer/allreduce_times.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/allreduce_times.py -------------------------------------------------------------------------------- /src/ml-bench/transformer/attention.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/attention.cu -------------------------------------------------------------------------------- /src/ml-bench/transformer/common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/common.h -------------------------------------------------------------------------------- /src/ml-bench/transformer/eval_mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/eval_mlp.py -------------------------------------------------------------------------------- /src/ml-bench/transformer/mlp-lib.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/mlp-lib.cu -------------------------------------------------------------------------------- /src/ml-bench/transformer/mlp.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/mlp.cu -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/allreduce_times-12288: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/allreduce_times-12288 -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/attention-results: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/attention-results -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/attention-results-gpt-3-cuda-12.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/attention-results-gpt-3-cuda-12.2 -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/attention-results-gpt3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/attention-results-gpt3 -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/attention-results-llama: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/attention-results-llama -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/attention-stream-k-output: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/attention-stream-k-output -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-gpt3-a100.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-gpt3-a100.csv -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-gpt3-v100.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-gpt3-v100.csv -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-llama-a100.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-llama-a100.csv -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-llama-v100.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-llama-v100.csv -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-results-2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-results-2 -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-results-gpt-3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-results-gpt-3 -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-results-gpt3-cuda-12.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-results-gpt3-cuda-12.2 -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-results-in-paper: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-results-in-paper -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-results-llama: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-results-llama -------------------------------------------------------------------------------- /src/ml-bench/transformer/results/mlp-stream-k-output: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/results/mlp-stream-k-output -------------------------------------------------------------------------------- /src/ml-bench/transformer/streamk.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/streamk.cu -------------------------------------------------------------------------------- /src/ml-bench/transformer/tile_sizes_db.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/tile_sizes_db.py -------------------------------------------------------------------------------- /src/ml-bench/transformer/torch-baselines/cublasBaseline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/torch-baselines/cublasBaseline.py -------------------------------------------------------------------------------- /src/ml-bench/transformer/torch-baselines/torchAttention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/torch-baselines/torchAttention.py -------------------------------------------------------------------------------- /src/ml-bench/transformer/torch-baselines/torchmlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/transformer/torch-baselines/torchmlp.py -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/Makefile -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/eval_resnet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/eval_resnet.py -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/resnet.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/resnet.cu -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/resnet_results.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/resnet_results.csv -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/torchconv2d.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/torchconv2d.py -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/vgg-results-cuda-12.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/vgg-results-cuda-12.2 -------------------------------------------------------------------------------- /src/ml-bench/volta_conv2d/vgg.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/src/ml-bench/volta_conv2d/vgg.cu -------------------------------------------------------------------------------- /tests/cusync-test.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/tests/cusync-test.h -------------------------------------------------------------------------------- /tests/simple-test.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/cusync/HEAD/tests/simple-test.cu --------------------------------------------------------------------------------