├── measure_metric ├── measureMetricPW.cpp ├── ScopeExit.h ├── pythonInterface.cpp ├── Parser.hpp ├── Parser.h └── Utils.h ├── gpu-metrics ├── cuda_metrics │ ├── measureMetricPW.cpp │ ├── ScopeExit.h │ ├── pythonInterface.cpp │ ├── Parser.hpp │ ├── Parser.h │ └── Utils.h ├── rocm_metrics │ ├── test_rocm_metrics │ ├── Makefile │ └── test_rocm_metrics.hip ├── gpu-metrics.hpp └── README.md ├── gpu-stream ├── maxbars.pdf ├── minbars.pdf ├── cuda-stream.pdf ├── Makefile ├── rx6900xt.txt ├── a40.txt ├── l40.txt ├── h100_pcie.txt ├── past_results │ ├── h100_pcie.txt │ └── a100_40.txt ├── a100_40.txt ├── a100_80.txt ├── gh200.txt ├── mi100.txt ├── mi210.txt ├── mi300a.txt ├── mi300x.txt ├── v100.txt └── plot.py ├── gpu-latency ├── latencies.pdf ├── latencies_NV.pdf ├── latencies_AMD.pdf ├── Makefile ├── plot.py └── main.cu ├── gpu-roofline ├── L40_plot.pdf ├── series.sh ├── Makefile ├── plot.py ├── main.cu ├── mi300x.txt └── h200.txt ├── .gitignore ├── gpu-l2-cache ├── sycl │ ├── build.sh │ └── sycl-gpu-l2-cache.cpp ├── Makefile ├── plot.py └── main.cu ├── dtime.hpp ├── gpu-metrics.hpp ├── um-stream ├── Makefile └── main.cu ├── gpu-small-kernels ├── readme.md ├── Makefile ├── a40_pt.txt └── plot.py ├── cuda-memcpy ├── Makefile └── main.cu ├── unmaintained ├── cuda-busy │ ├── Makefile │ └── main.cu ├── cuda-cache-overlap │ └── Makefile ├── cuda-3d-stream │ ├── Makefile │ └── main.cu └── cuda-gapped-stream │ ├── Makefile │ └── main.cu ├── cuda-incore ├── Makefile └── main.cu ├── gpu-error.h ├── gpu-strides ├── Makefile └── h200.txt ├── gpu-l2-stream ├── Makefile └── plot.py ├── gpu-cache ├── Makefile ├── plot.py ├── mi100.txt └── mi210.txt ├── device_order.py ├── gpu-stats.h ├── MeasurementSeries.hpp ├── gpu-clock.cuh └── rocm-metrics └── rocm-metrics.hpp /measure_metric/measureMetricPW.cpp: -------------------------------------------------------------------------------- 1 | #include "measureMetricPW.hpp" 2 | 
-------------------------------------------------------------------------------- /gpu-metrics/cuda_metrics/measureMetricPW.cpp: -------------------------------------------------------------------------------- 1 | #include "measureMetricPW.hpp" 2 | -------------------------------------------------------------------------------- /gpu-stream/maxbars.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-stream/maxbars.pdf -------------------------------------------------------------------------------- /gpu-stream/minbars.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-stream/minbars.pdf -------------------------------------------------------------------------------- /gpu-latency/latencies.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-latency/latencies.pdf -------------------------------------------------------------------------------- /gpu-roofline/L40_plot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-roofline/L40_plot.pdf -------------------------------------------------------------------------------- /gpu-stream/cuda-stream.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-stream/cuda-stream.pdf -------------------------------------------------------------------------------- /gpu-latency/latencies_NV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-latency/latencies_NV.pdf -------------------------------------------------------------------------------- /.gitignore: 
/// Returns the current wall-clock time in seconds as a double.
///
/// The time is read with gettimeofday(); the whole seconds (tv_sec) and the
/// microsecond remainder (tv_usec, scaled by 1e-6) are added together.
/// The return value of gettimeofday() is not checked.
///
/// The function is defined in a header, so it is declared `inline`: without
/// it, including this header from more than one translation unit produces a
/// duplicate-symbol (ODR) error at link time. gpu-error.h follows the same
/// convention for gpuAssert().
inline double dtime() {
  struct timeval now;
  gettimeofday(&now, nullptr);
  return static_cast<double>(now.tv_sec) +
         static_cast<double>(now.tv_usec) * 1.0e-6;
}
#pragma once

/// Scope guard: stores a callable and invokes it when the guard is destroyed.
///
/// T is the type of the stored callable; it must be invocable with no
/// arguments. The guard can be moved but not copied. A moved-from guard is
/// disarmed, so the callable runs exactly once even when the compiler does
/// not elide the temporary returned by MoveScopeExit() (elision is only
/// guaranteed from C++17 on).
template <typename T>
class ScopeExit
{
public:
    /// Takes ownership of a copy of the callable `t`.
    ScopeExit(T t) : t(t) {}

    /// Transfers the callable and disarms `other` so only one guard fires.
    /// static_cast<T &&> is a move without needing <utility>.
    ScopeExit(ScopeExit &&other)
        : t(static_cast<T &&>(other.t)), armed(other.armed)
    {
        other.armed = false;
    }

    // Copying would make the callable run once per copy.
    ScopeExit(const ScopeExit &) = delete;
    ScopeExit &operator=(const ScopeExit &) = delete;

    /// Invokes the callable unless this guard was moved from.
    ~ScopeExit()
    {
        if (armed) {
            t();
        }
    }

    T t;

private:
    bool armed = true;
};

/// Builds a ScopeExit for the callable `t`, deducing its type.
template <typename T>
ScopeExit<T> MoveScopeExit(T t) {
    return ScopeExit<T>(t);
}

// Two-level expansion so that __LINE__ is expanded before token pasting,
// giving every SCOPE_EXIT in a file its own variable name.
#define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line
#define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) NV_ANONYMOUS_VARIABLE_DIRECT(name, line)

// Runs `func` when the enclosing scope ends. The statement is wrapped in a
// lambda that captures by value ([=]).
#define SCOPE_EXIT(func) const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = MoveScopeExit([=](){func;})
template 4 | 5 | class ScopeExit 6 | { 7 | public: 8 | ScopeExit(T t) : t(t) {} 9 | ~ScopeExit() { t(); } 10 | T t; 11 | }; 12 | 13 | template 14 | ScopeExit MoveScopeExit(T t) { 15 | return ScopeExit(t); 16 | }; 17 | 18 | #define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line 19 | #define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) NV_ANONYMOUS_VARIABLE_DIRECT(name, line) 20 | 21 | #define SCOPE_EXIT(func) const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = MoveScopeExit([=](){func;}) -------------------------------------------------------------------------------- /um-stream/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | # internal flags 4 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 5 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info 6 | CCFLAGS := 7 | LDFLAGS := -L/opt/cuda/lib64 -lcublas 8 | NAME := um-stream 9 | PREFIX := . 10 | 11 | 12 | $(PREFIX)/$(NAME): main.cu Makefile ../dtime.hpp ../MeasurementSeries.hpp ../gpu-error.h 13 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 14 | 15 | clean: 16 | rm -f ./$(NAME) 17 | 18 | -------------------------------------------------------------------------------- /gpu-small-kernels/readme.md: -------------------------------------------------------------------------------- 1 | # Repeated Small Kernel Performance 2 | 3 | 4 | This benchmark explores the potential for cache blocking, where kernels work on a small data set that fits into caches. Because the data set is small, and the L2 cache is fast, the kernel executes so quickly that the startup overhead of a kernel launch becomes dominant. The benchmark queues 10000 calls of a streaming SCALE kernel of varying size. Use commandline option "-graph" to use the cudaGraph/hipGraph API. 
5 | 6 | ![latency plot](repeated-stream.svg) 7 | 8 | Each device gets a fit of \$a,b\$ for the function 9 | 10 | $$T = \frac{V}{a + V/b}$$ 11 | 12 | which models the performance with a startup overhead \$a\$ and a bandwidth \$b\$ depending on the data volume \$V\$. 13 | -------------------------------------------------------------------------------- /gpu-roofline/series.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | range=1024 4 | 5 | mkdir -p build 6 | 7 | 8 | make ./build/$10 N=0 PREFIX=./build 1>&2 9 | make ./build/$11 N=1 PREFIX=./build 1>&2 & 10 | make ./build/$12 N=2 PREFIX=./build 1>&2 & 11 | 12 | 13 | for (( d=4 ; d<=$range; d+= (d / 24 + 1)*2 )) 14 | do 15 | echo $d 16 | make ./build/$1$d N=$d PREFIX=./build 1>&2 & 17 | while test $(jobs -p | wc -w) -ge 64; do 18 | sleep 1; 19 | done 20 | done 21 | 22 | while test $(jobs -p | wc -w) -ge 2; do 23 | echo $(jobs -p | wc -w) 24 | sleep 1; 25 | done 26 | 27 | wait 28 | 29 | echo "-- Finished Building --" 30 | 31 | 32 | ./build/$10 33 | ./build/$11 34 | ./build/$12 35 | 36 | 37 | for (( d=4 ; d<=$range; d+= (d / 24 + 1)*2 )) 38 | do 39 | ./build/$1$d 40 | done 41 | 42 | -------------------------------------------------------------------------------- /measure_metric/pythonInterface.cpp: -------------------------------------------------------------------------------- 1 | #include "measureMetricPW.hpp" 2 | 3 | #include 4 | 5 | extern "C" PyObject *measureMetricStop() { 6 | 7 | runTestEnd(); 8 | 9 | /*CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = { 10 | CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE}; 11 | CUPTI_API_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams)); 12 | */ 13 | auto values = NV::Metric::Eval::GetMetricValues(chipName, counterDataImage, 14 | metricNames); 15 | 16 | PyGILState_STATE gstate = PyGILState_Ensure(); 17 | 18 | 19 | PyObject *result = PyList_New(0); 20 | for (auto value : values) { 21 | 
PyList_Append(result, PyFloat_FromDouble(value)); 22 | } 23 | 24 | PyGILState_Release(gstate); 25 | 26 | return result; 27 | } 28 | -------------------------------------------------------------------------------- /gpu-metrics/cuda_metrics/pythonInterface.cpp: -------------------------------------------------------------------------------- 1 | #include "measureMetricPW.hpp" 2 | 3 | #include 4 | 5 | extern "C" PyObject *measureMetricStop() { 6 | 7 | runTestEnd(); 8 | 9 | /*CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = { 10 | CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE}; 11 | CUPTI_API_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams)); 12 | */ 13 | auto values = NV::Metric::Eval::GetMetricValues(chipName, counterDataImage, 14 | metricNames); 15 | 16 | PyGILState_STATE gstate = PyGILState_Ensure(); 17 | 18 | 19 | PyObject *result = PyList_New(0); 20 | for (auto value : values) { 21 | PyList_Append(result, PyFloat_FromDouble(value)); 22 | } 23 | 24 | PyGILState_Release(gstate); 25 | 26 | return result; 27 | } 28 | -------------------------------------------------------------------------------- /cuda-memcpy/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 9 | CCFLAGS := 10 | LDFLAGS := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda 11 | NAME := cuda-memcpy 12 | PREFIX := . 
13 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 14 | 15 | 16 | $(PREFIX)/$(NAME): main.cu Makefile 17 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 18 | 19 | clean: 20 | rm -f ./$(NAME) 21 | -------------------------------------------------------------------------------- /gpu-metrics/rocm_metrics/test_rocm_metrics.hip: -------------------------------------------------------------------------------- 1 | #include 2 | #include "rocm_metrics.hpp" 3 | 4 | 5 | __global__ void updateKernel(double* A, size_t N) { 6 | int tidx = threadIdx.x + blockDim.x * blockIdx.x; 7 | for(size_t i = tidx; i < N; i += blockDim.x * gridDim.x) { 8 | A[i] = 0.2 * A[i];; 9 | } 10 | } 11 | 12 | 13 | int main(int argc, char** argv) { 14 | initMeasureMetric(); 15 | 16 | double *dA; 17 | int bufferCount = 1024 * 1024 * 1024; 18 | hipMalloc(&dA, bufferCount * sizeof(double)); 19 | 20 | for(int i = 0; i < 10; i++) { 21 | measureBandwidthStart(); 22 | 23 | updateKernel<<<100, 1024>>>(dA, bufferCount); 24 | auto vals = measureMetricStop(); 25 | 26 | for(auto v: vals) { 27 | std::cout << v * 1024 / bufferCount << "\n"; 28 | } 29 | } 30 | 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /unmaintained/cuda-busy/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++14 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-std=c++14 -O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 9 | CCFLAGS := 10 | LDFLAGS := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml 11 | NAME := cuda-busy 12 | PREFIX := 
. 13 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 14 | 15 | $(PREFIX)/$(NAME): main.cu Makefile 16 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 17 | 18 | 19 | clean: 20 | rm -f ./$(NAME) 21 | 22 | -------------------------------------------------------------------------------- /cuda-incore/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | 4 | TEMP_NVCC := $(shell which nvcc) 5 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 6 | 7 | 8 | # internal flags 9 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 10 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 11 | CCFLAGS := 12 | LDFLAGS := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml 13 | NAME := cuda-incore 14 | PREFIX := . 
15 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 16 | 17 | $(PREFIX)/$(NAME): main.cu Makefile 18 | echo $(CUDA_HOME) 19 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 20 | 21 | 22 | clean: 23 | rm -f ./$(NAME) 24 | -------------------------------------------------------------------------------- /unmaintained/cuda-cache-overlap/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 9 | CCFLAGS := 10 | LDFLAGS := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml 11 | NAME := cuda-cache-overlap 12 | PREFIX := . 
13 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 14 | 15 | $(PREFIX)/$(NAME): main.cu Makefile 16 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 17 | 18 | 19 | clean: 20 | rm -f ./$(NAME) 21 | 22 | -------------------------------------------------------------------------------- /unmaintained/cuda-3d-stream/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall" 9 | CCFLAGS := 10 | LDFLAGS := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml -lnvperf_host -lnvperf_target 11 | NAME := cuda-stream 12 | PREFIX := . 
13 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 14 | 15 | $(PREFIX)/$(NAME): main.cu Makefile 16 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 17 | 18 | 19 | clean: 20 | rm -f ./$(NAME) 21 | -------------------------------------------------------------------------------- /unmaintained/cuda-gapped-stream/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall" 9 | CCFLAGS := 10 | LDFLAGS := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml -lnvperf_host -lnvperf_target 11 | NAME := cuda-gapped-stream 12 | PREFIX := . 
13 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 14 | 15 | $(PREFIX)/$(NAME): main.cu Makefile 16 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 17 | 18 | 19 | clean: 20 | rm -f ./$(NAME) 21 | -------------------------------------------------------------------------------- /gpu-latency/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | # internal flags 4 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 5 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info 6 | CCFLAGS := 7 | LDFLAGS := -L/opt/cuda/lib64 -lcublas -lnvidia-ml 8 | NAME := latency 9 | PREFIX := 10 | N := 1 11 | 12 | 13 | $(PREFIX)cuda-$(NAME): main.cu Makefile 14 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 15 | 16 | main.hip: main.cu 17 | hipify-perl main.cu > main.hip 18 | 19 | $(PREFIX)hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp 20 | echo $(HIP_HOME) 21 | hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -o $@ $< 22 | 23 | clean: 24 | rm -f cuda-$(NAME) hip-$(NAME) 25 | rm -f *-cuda-$(NAME) *-hip-$(NAME) 26 | rm main.hip 27 | -------------------------------------------------------------------------------- /gpu-error.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #pragma once 4 | 5 | #ifdef __NVCC__ 6 | #define GPU_ERROR(ans) \ 7 | { gpuAssert((ans), __FILE__, __LINE__); } 8 | inline void gpuAssert(cudaError_t code, const char *file, int line, 9 | bool abort = true) { 10 | if (code != cudaSuccess) { 11 | std::cerr << "GPUassert: \"" << cudaGetErrorString(code) << "\" in " 12 | << file << ": " << line << "\n"; 13 | if (abort) 14 | exit(code); 15 | } 16 | } 17 
| #elif defined __HIP__ 18 | #define GPU_ERROR(ans) \ 19 | { gpuAssert((ans), __FILE__, __LINE__); } 20 | inline void gpuAssert(hipError_t code, const char *file, int line, 21 | bool abort = true) { 22 | if (code != hipSuccess) { 23 | std::cerr << "GPUassert: \"" << hipGetErrorString(code) << "\" in " << file 24 | << ": " << line << "\n"; 25 | if (abort) 26 | exit(code); 27 | } 28 | } 29 | #endif 30 | -------------------------------------------------------------------------------- /gpu-stream/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++20 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall" 9 | CCFLAGS := 10 | NAME := stream 11 | LDFLAGS := -L/opt/cuda/lib64 -lcuda -lnvidia-ml 12 | PREFIX := 13 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 14 | 15 | $(PREFIX)cuda-$(NAME): main.cu Makefile 16 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 17 | 18 | $(PREFIX)$(NAME)-gsl: main_gsl.cu Makefile 19 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 20 | 21 | 22 | main.hip: main.cu 23 | hipify-perl main.cu > main.hip 24 | 25 | $(PREFIX)hip-$(NAME): main.hip Makefile 26 | hipcc -std=c++20 -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lhsa-runtime64 -ldl -o $@ $< 27 | 28 | clean: 29 | rm -f cuda-$(NAME) hip-$(NAME) *-hip-$(NAME) *-cuda-$(NAME) main.hip 30 | -------------------------------------------------------------------------------- /gpu-roofline/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 
| 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | # internal flags 7 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 8 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 9 | 10 | CCFLAGS := 11 | LDFLAGS := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml 12 | NAME := roof 13 | PREFIX := . 14 | N := 100 15 | 16 | $(PREFIX)/cu-$(NAME)$N: main.cu Makefile series.sh 17 | $(NVCC) -DPARN=$N $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 18 | 19 | 20 | main.hip: main.cu 21 | hipify-perl main.cu > main.hip 22 | 23 | $(PREFIX)/hip-$(NAME)$N: main.hip Makefile ../rocm-metrics/rocm-metrics.hpp 24 | echo $(HIP_HOME) 25 | hipcc -DPARN=$N -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -fopenmp -o $@ $< 26 | 27 | 28 | clean: 29 | rm -f ./$(NAME) 30 | 31 | -------------------------------------------------------------------------------- /gpu-small-kernels/Makefile: -------------------------------------------------------------------------------- 1 | ## 2 | NVCC := nvcc 3 | 4 | TEMP_NVCC := $(shell which nvcc) 5 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 6 | 7 | # internal flags 8 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 9 | NVCCFLAGS := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall" 10 | CCFLAGS := 11 | NAME := small-kernels 12 | LDFLAGS := -L/opt/cuda/lib64 -lcuda -lnvidia-ml 13 | PREFIX := . 
14 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include 15 | 16 | $(PREFIX)/cuda-$(NAME): main.cu Makefile 17 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 18 | 19 | 20 | 21 | 22 | main.hip: main.cu 23 | hipify-perl main.cu > main.hip 24 | 25 | $(PREFIX)/hip-$(NAME): main.hip Makefile 26 | hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -ldl -o $@ $< 27 | 28 | clean: 29 | rm -f cuda-$(NAME) hip-$(NAME) 30 | # Project Title 31 | # 32 | # @file 33 | # @version 0.1 34 | 35 | 36 | 37 | # end 38 | -------------------------------------------------------------------------------- /gpu-l2-cache/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | HIP_HOME := /opt/rocm 7 | 8 | 9 | # internal flags 10 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 11 | NVCCFLAGS := -std=c++17 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 12 | CCFLAGS := 13 | LDFLAGS := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml -lnvperf_host -lnvperf_target 14 | NAME := l2-cache 15 | PREFIX := . 
16 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include -I$(CUDA_HOME)/include 17 | 18 | $(PREFIX)/cuda-$(NAME): main.cu Makefile 19 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 20 | 21 | 22 | main.hip: main.cu 23 | hipify-perl main.cu > main.hip 24 | 25 | $(PREFIX)/hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp 26 | hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -ldl -o $@ $< 27 | 28 | clean: 29 | rm -f ./cuda-$(NAME) 30 | rm -f main.hip 31 | rm -f ./hip-$(NAME) 32 | -------------------------------------------------------------------------------- /gpu-strides/Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc 2 | 3 | TEMP_NVCC := $(shell which nvcc) 4 | CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev) 5 | 6 | HIP_HOME := /opt/rocm 7 | 8 | 9 | # internal flags 10 | SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g') 11 | NVCCFLAGS := -std=c++17 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall" 12 | CCFLAGS := 13 | LDFLAGS := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml -lnvperf_host -lnvperf_target 14 | NAME := strides 15 | PREFIX := . 
/// Splits a metric name string into the bare name and its suffix flags.
///
/// The input is copied into *reqName and then edited in place:
///   1. the first '\n' in the string, if any, is removed;
///   2. trailing spaces are removed;
///   3. a trailing '+' is removed and sets *keepInstances to true
///      (otherwise it is set to false);
///   4. a trailing '$' is removed and leaves *isolated at true, a trailing
///      '&' is removed and sets *isolated to false (default is true).
///
/// @param metricName     Raw metric name, possibly with suffix characters.
/// @param reqName        Receives the stripped metric name.
/// @param isolated       Receives the isolated flag ('$' or none: true,
///                       '&': false).
/// @param keepInstances  Receives whether a '+' suffix was present.
/// @return false if the name is empty on input or becomes empty at any step;
///         true otherwise. On a false return the output flags may not have
///         been written.
inline bool ParseMetricNameString(const std::string &metricName,
                                  std::string *reqName, bool *isolated,
                                  bool *keepInstances) {
  std::string &name = *reqName;
  name = metricName;
  if (name.empty()) {
    return false;
  }

  // boost program_options sometimes inserts a \n between the metric name and a
  // '&' at the end
  const size_t newlinePos = name.find('\n');
  if (newlinePos != std::string::npos) {
    name.erase(newlinePos, 1);
  }

  // Trim trailing spaces. The emptiness check comes first because removing
  // the newline above can leave an empty string (input "\n"), and calling
  // back() on an empty string is undefined behaviour.
  while (!name.empty() && name.back() == ' ') {
    name.pop_back();
  }
  if (name.empty()) {
    return false;
  }

  *keepInstances = false;
  if (name.back() == '+') {
    *keepInstances = true;
    name.pop_back();
    if (name.empty()) {
      return false;
    }
  }

  *isolated = true;
  if (name.back() == '$') {
    name.pop_back();
    if (name.empty()) {
      return false;
    }
  } else if (name.back() == '&') {
    *isolated = false;
    name.pop_back();
    if (name.empty()) {
      return false;
    }
  }

  return true;
}
# Build rules for the gpu-l2-stream benchmark (CUDA and HIP variants).
NVCC := nvcc

# CUDA_HOME is derived from the nvcc found on PATH by dropping the last two
# path components (<CUDA_HOME>/bin/nvcc).
TEMP_NVCC := $(shell which nvcc)
CUDA_HOME := $(shell echo $(TEMP_NVCC) | rev | cut -d'/' -f3- | rev)

TEMP_HIPCC := $(shell which hipcc)
HIP_HOME := /opt/rocm

# internal flags
# SM: compute capability of the first GPU reported by nvidia-smi, dot removed
# (e.g. 8.0 -> 80). Can be overridden on the command line.
SM ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
# A space is required between the -gencode value and --compiler-options;
# without it both are fused into a single, invalid argument.
NVCCFLAGS := -std=c++20 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
CCFLAGS :=
NAME := l2-stream
LDFLAGS := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml -lnvperf_host -lnvperf_target
PREFIX :=
INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include

$(PREFIX)cuda-$(NAME): main.cu Makefile
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)

$(PREFIX)$(NAME)-gsl: main_gsl.cu Makefile
	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)

# The HIP source is generated from the CUDA source.
main.hip: main.cu
	hipify-perl main.cu > main.hip

$(PREFIX)hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
	echo $(HIP_HOME)
	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -o $@ $<

# Also removes the generated main.hip, as the gpu-stream and gpu-l2-cache
# Makefiles do.
clean:
	rm -f cuda-$(NAME) hip-$(NAME) main.hip
/// Splits a user-supplied metric string into the bare metric name and its
/// trailing modifier flags.
///
/// Recognised suffixes, checked from the end of the string in this order:
///   '+'  -> *keepInstances = true
///   '$'  -> *isolated = true (also the default when no suffix is given)
///   '&'  -> *isolated = false
///
/// @param metricName    raw string, possibly with trailing blanks/modifiers
/// @param reqName       receives the cleaned metric name
/// @param isolated      receives the isolation flag
/// @param keepInstances receives the keep-instances flag
/// @return false when no metric name remains after cleaning; in that case the
///         flag outputs may not have been written.
inline bool ParseMetricNameString(const std::string &metricName,
                                  std::string *reqName, bool *isolated,
                                  bool *keepInstances) {
  std::string &name = *reqName;
  name = metricName;

  // boost program_options sometimes inserts a \n between the metric name and a
  // '&' at the end
  const size_t pos = name.find('\n');
  if (pos != std::string::npos) {
    name.erase(pos, 1);
  }

  // Trim trailing blanks. The emptiness test comes first so that inputs such
  // as "" or "\n" never reach back() on an empty string.
  while (!name.empty() && name.back() == ' ') {
    name.pop_back();
  }
  if (name.empty()) {
    return false;
  }

  *keepInstances = false;
  if (name.back() == '+') {
    *keepInstances = true;
    name.pop_back();
    if (name.empty()) {
      return false;
    }
  }

  *isolated = true;
  if (name.back() == '$') {
    name.pop_back();
    if (name.empty()) {
      return false;
    }
  } else if (name.back() == '&') {
    *isolated = false;
    name.pop_back();
    if (name.empty()) {
      return false;
    }
  }

  return true;
}
\"-Wl,-rpath,$(CUDA_HOME)/lib64/\" 14 | CCFLAGS := 15 | LDFLAGS := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda -lnvidia-ml -lnvperf_host -lnvperf_target 16 | NAME := cache 17 | PREFIX := 18 | INCLUDES := -I$(CUDA_HOME)/extras/CUPTI/include -I$(CUDA_HOME)/include 19 | 20 | 21 | 22 | $(PREFIX)cuda-$(NAME): main.cu Makefile 23 | echo $(CUDA_HOME) 24 | $(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS) 25 | 26 | 27 | main.hip: main.cu 28 | hipify-perl main.cu > main.hip 29 | 30 | $(PREFIX)hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp 31 | echo $(HIP_HOME) 32 | hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -o $@ $< 33 | 34 | clean: 35 | rm -f cuda-$(NAME) hip-$(NAME) 36 | -------------------------------------------------------------------------------- /device_order.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | 6 | plt.style.use("bmh") 7 | plt.rcParams["axes.facecolor"] = "white" 8 | 9 | 10 | device_color_palette = [ 11 | "#378ABD", 12 | "#FFB33A", 13 | "#7EC75B", 14 | "#DA5252", 15 | "#793B67", 16 | "#10CFCC", 17 | "#FFE100", 18 | "#09047f", 19 | "#296F20", 20 | ] 21 | 22 | order = [ 23 | "a40", 24 | "l40", 25 | "v100", 26 | "a100", 27 | "gh200", 28 | "mi100", 29 | "mi210", 30 | "mi300x", 31 | "rx6900xt", 32 | "mi300a", 33 | "a100_40", 34 | "h100_pcie", 35 | ] 36 | 37 | 38 | long_order = [ 39 | "NVIDIA A40", 40 | "NVIDIA L40", 41 | "Tesla V100", 42 | "NVIDIA A100-SXM4-80GB", 43 | "NVIDIA GH200 480GB", 44 | "AMD Instinct MI100", 45 | "AMD Instinct MI210", 46 | "AMD Instinct MI300X", 47 | "AMD Radeon RX 6900 XT", 48 | "NVIDIA A100-SXM4-40G", 49 | ] 50 | 51 | 52 | def getOrderNumber(f, use_order=order): 53 | for o in range(len(use_order)): 54 | if f.startswith(use_order[o]): 55 | 
def getDeviceColor(f):
    """Return the plot color for the device that name `f` belongs to.

    The short-name table is consulted first; if that position has no palette
    entry, the long-name table is tried. A position still outside the palette
    maps to a matplotlib cycle color ("C0", "C1", ...), counted from the end
    of the palette.
    """
    palette_size = len(device_color_palette)

    position = getOrderNumber(f)
    if position >= palette_size:
        position = getOrderNumber(f, use_order=long_order)

    if position < palette_size:
        return device_color_palette[position]
    return "C" + str(position - palette_size)
# Performance Counter Measurement Library for AMD and NVIDIA GPUs

This folder contains a header that provides pairs of functions:

```
void measureMetricsStart(std::vector metricNames);
std::vector measureMetricsStop();
```
The second function returns the values of the metrics specified in the start function, measured for the GPU kernel launched between the two calls. Launch only a single GPU kernel between them, otherwise it will probably crash.
For metricNames, any metric supported by your GPU can be used. Multiple metrics can be measured at the same time. The NVIDIA backend performs multiple passes if not all metrics can be profiled in a single pass; the rocprofiler backend crashes but suggests a different metric combination.

There are two more pairs of start/stop functions

```
void measureDRAMBytesStart();
std::vector measureDRAMBytesStop();

void measureL2BytesStart();
void measureL2BytesStop();
```
which contain the metric names and evaluation for a few selected GPU models. On the AMD side, they should work and have been tested on gfx90a and gfx1030; on the NVIDIA side, on sm_80 (A100). Don't forget to call

```
void initMeasureMetric();
```
before doing anything.

Example usage from gpu-l2-cache/main.cu (where it is currently commented out because it doesn't work on all models):
/// Collects repeated measurements of one quantity and reports summary
/// statistics over them.
///
/// Most queries sort the stored samples in place, so none of them is const.
/// Every statistic returns 0.0 for an empty series.
class MeasurementSeries {
public:
  /// Appends one sample.
  void add(double v) { data.push_back(v); }

  /// Mean of the samples with the smallest and the largest one dropped.
  /// With one or two samples this is the plain mean.
  double value() {
    const auto n = data.size();
    if (n == 0)
      return 0.0;
    if (n == 1)
      return data[0];
    if (n == 2)
      return (data[0] + data[1]) / 2.0;
    std::sort(begin(data), end(data));
    return std::accumulate(begin(data) + 1, end(data) - 1, 0.0) / (n - 2);
  }

  /// Median of the samples; for an even count, the mean of the two middle
  /// samples.
  double median() {
    const auto n = data.size();
    if (n == 0)
      return 0.0;

    std::sort(begin(data), end(data));
    if (n % 2 == 0) {
      // The middle pair of a sorted even-sized range is [n/2 - 1, n/2].
      return (data[n / 2 - 1] + data[n / 2]) / 2.0;
    }
    return data[n / 2];
  }

  /// Smallest sample.
  double minValue() {
    if (data.empty())
      return 0.0;
    std::sort(begin(data), end(data));
    return data.front();
  }

  /// Sample at the given rank.
  /// @param percentile fraction in [0, 1]; values outside are clamped so the
  ///        lookup can never leave the stored range.
  double getPercentile(double percentile) {
    if (data.empty())
      return 0.0;
    std::sort(begin(data), end(data));
    const double fraction = std::min(1.0, std::max(0.0, percentile));
    const auto index =
        static_cast<std::size_t>(std::round((data.size() - 1) * fraction));
    return data[index];
  }

  /// Largest sample.
  double maxValue() {
    if (data.empty())
      return 0.0;
    std::sort(begin(data), end(data));
    return data.back();
  }

  /// Range of the samples (max - min) relative to value().
  double spread() {
    if (data.size() <= 1)
      return 0.0;
    const auto extremes = std::minmax_element(begin(data), end(data));
    // std::abs is used on purpose: an unqualified abs() can resolve to the
    // integer overload and truncate sub-unit differences to zero.
    const double range = std::abs(*extremes.second - *extremes.first);
    return range / value();
  }

  /// Number of stored samples.
  int count() { return static_cast<int>(data.size()); }

private:
  std::vector<double> data;
};
/// Splits a user-supplied metric string into the bare metric name and its
/// trailing modifier flags.
///
/// Suffixes are examined from the end of the string: an optional '+' sets
/// *keepInstances, then an optional '$' (isolated, the default) or '&'
/// (not isolated) sets *isolated.
///
/// @return false when no metric name remains after cleaning; the flag outputs
///         may then be left unwritten.
inline bool ParseMetricNameString(const std::string& metricName, std::string* reqName, bool* isolated, bool* keepInstances)
{
    std::string& name = *reqName;
    name = metricName;

    // boost program_options sometimes inserts a \n between the metric name and a '&' at the end
    const size_t pos = name.find('\n');
    if (pos != std::string::npos)
    {
        name.erase(pos, 1);
    }

    // Trim trailing blanks; checking for emptiness first keeps back() from
    // being called on an empty string (e.g. for the input "\n").
    while (!name.empty() && name.back() == ' ')
    {
        name.pop_back();
    }
    if (name.empty())
    {
        return false;
    }

    *keepInstances = false;
    if (name.back() == '+')
    {
        *keepInstances = true;
        name.pop_back();
        if (name.empty())
        {
            return false;
        }
    }

    *isolated = true;
    if (name.back() == '$')
    {
        name.pop_back();
        if (name.empty())
        {
            return false;
        }
    }
    else if (name.back() == '&')
    {
        *isolated = false;
        name.pop_back();
        if (name.empty())
        {
            return false;
        }
    }

    return true;
}
} 24 | 25 | 26 | 27 | unsigned int getGPUClock() { 28 | 29 | double* dA = NULL; 30 | #ifdef __NVCC__ 31 | GPU_ERROR(cudaMalloc(&dA, sizeof(double))); 32 | #endif 33 | #ifdef __HIP__ 34 | GPU_ERROR(hipMalloc(&dA, sizeof(double))); 35 | #endif 36 | 37 | unsigned int gpu_clock; 38 | 39 | 40 | 41 | int iters = 10; 42 | 43 | powerKernel<<<1000, 1024>>>(dA, iters); 44 | 45 | double dt = 0; 46 | std::cout << "clock: "; 47 | while (dt < 0.4) { 48 | #ifdef __NVCC__ 49 | GPU_ERROR(cudaDeviceSynchronize()); 50 | #endif 51 | #ifdef __HIP__ 52 | GPU_ERROR(hipDeviceSynchronize()); 53 | #endif 54 | double t1 = dtime(); 55 | 56 | powerKernel<<<1000, 1024>>>(dA, iters); 57 | usleep(10000); 58 | 59 | #ifdef __NVCC__ 60 | nvmlInit(); 61 | nvmlDevice_t device; 62 | nvmlDeviceGetHandleByIndex(0, &device); 63 | nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &gpu_clock); 64 | GPU_ERROR(cudaDeviceSynchronize()); 65 | #endif 66 | #ifdef __HIP__ 67 | int deviceId; 68 | GPU_ERROR(hipGetDevice(&deviceId)); 69 | rsmi_status_t ret; 70 | uint32_t num_devices; 71 | uint16_t dev_id; 72 | rsmi_frequencies_t clockStruct; 73 | ret = rsmi_init(0); 74 | ret = rsmi_num_monitor_devices(&num_devices); 75 | ret = rsmi_dev_gpu_clk_freq_get(deviceId, RSMI_CLK_TYPE_SYS, &clockStruct); 76 | gpu_clock = clockStruct.frequency[clockStruct.current] / 1e6; 77 | GPU_ERROR(hipDeviceSynchronize()); 78 | #endif 79 | double t2 = dtime(); 80 | std::cout << gpu_clock << " "; 81 | std::cout.flush(); 82 | dt = t2 - t1; 83 | iters *= 2; 84 | } 85 | std::cout << "\n"; 86 | #ifdef __NVCC__ 87 | GPU_ERROR(cudaFree(dA)); 88 | #endif 89 | #ifdef __HIP__ 90 | GPU_ERROR(hipFree(dA)); 91 | #endif 92 | return gpu_clock; 93 | } 94 | -------------------------------------------------------------------------------- /um-stream/main.cu: -------------------------------------------------------------------------------- 1 | #include "../MeasurementSeries.hpp" 2 | #include "../dtime.hpp" 3 | #include "../gpu-error.h" 4 | #include 5 | #include 6 
| using namespace std; 7 | 8 | __global__ void scale(double *A, double *B, size_t N) { 9 | size_t tidx = threadIdx.x + blockDim.x * blockIdx.x; 10 | for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) { 11 | A[i] = B[i] * 1.3; 12 | } 13 | } 14 | 15 | __global__ void triad(double *A, double *B, double *C, size_t N) { 16 | size_t tidx = threadIdx.x + blockDim.x * blockIdx.x; 17 | for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) { 18 | A[i] = B[i] + C[i] * 1.3; 19 | } 20 | } 21 | 22 | int main(int argc, char **argv) { 23 | double *A, *B, *C; 24 | 25 | cout << setw(12) << "buffer size" << setw(10) << "time" << setw(9) << "spread" 26 | << setw(13) << "bandwidth\n"; 27 | 28 | const int blockSize = 256; 29 | 30 | cudaDeviceProp prop; 31 | int deviceId; 32 | GPU_ERROR(cudaGetDevice(&deviceId)); 33 | GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId)); 34 | std::string deviceName = prop.name; 35 | int smCount = prop.multiProcessorCount; 36 | int maxActiveBlocks = 0; 37 | GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, 38 | triad, blockSize, 0)); 39 | 40 | int blockCount = smCount * maxActiveBlocks; 41 | 42 | for (size_t N = 1024 * 1024; N < (size_t)1024 * 1024 * 1024 * 16; N *= 2) { 43 | GPU_ERROR(cudaMallocManaged(&A, N * sizeof(double))); 44 | GPU_ERROR(cudaMallocManaged(&B, N * sizeof(double))); 45 | GPU_ERROR(cudaMallocManaged(&C, N * sizeof(double))); 46 | 47 | triad<<>>(A, B, C, N); 48 | // scale<<<640, 256>>>(A, B, N); 49 | GPU_ERROR(cudaDeviceSynchronize()); 50 | 51 | MeasurementSeries time; 52 | for (int i = 0; i < 5; i++) { 53 | double t1 = dtime(); 54 | triad<<<640, 256>>>(A, B, C, N); 55 | GPU_ERROR(cudaDeviceSynchronize()); 56 | double t2 = dtime(); 57 | time.add(t2 - t1); 58 | } 59 | 60 | double bw = N * sizeof(double) * 3 / time.value() / 1.0e9; 61 | cout << fixed << setprecision(1) << setw(9) 62 | << 3 * N * sizeof(double) / (1 << 20) << " MB" << setw(8) 63 | << time.value() * 1000 << "ms" << setprecision(1) << 
setw(8) 64 | << time.spread() * 100 << "%" << setw(8) << bw << "GB/s\n"; 65 | 66 | GPU_ERROR(cudaFree(A)); 67 | GPU_ERROR(cudaFree(B)); 68 | GPU_ERROR(cudaFree(C)); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /gpu-l2-cache/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import csv 5 | 6 | import sys 7 | 8 | sys.path.append(".") 9 | sys.path.append("..") 10 | from device_order import * 11 | 12 | 13 | fig, ax = plt.subplots(figsize=(8, 4)) 14 | fig2, ax2 = plt.subplots(figsize=(8, 4)) 15 | 16 | 17 | for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)): 18 | if not filename.endswith(".txt"): 19 | continue 20 | 21 | with open(filename, newline="") as csvfile: 22 | 23 | csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True) 24 | sizes = [] 25 | bw = [] 26 | L2bw = [] 27 | for row in csvreader: 28 | if row[0] == "data" or not row[0].isnumeric(): 29 | continue 30 | sizes.append(float(row[2])) 31 | bw.append(float(row[6])) 32 | L2bw.append(float(row[12])) 33 | 34 | # print(sizes) 35 | # print(bw) 36 | ax.plot( 37 | sizes, 38 | bw, 39 | label=filename[:-4].upper(), 40 | color="C" + str(getOrderNumber(filename)), 41 | **lineStyle 42 | ) 43 | ax2.plot( 44 | sizes, 45 | L2bw, 46 | label=filename[:-4].upper(), 47 | color="C" + str(getOrderNumber(filename)), 48 | **lineStyle 49 | ) 50 | print(filename, getOrderNumber(filename)) 51 | 52 | 53 | ax.set_xlabel("total data volume, MB") 54 | ax.set_ylabel("GB/s") 55 | ax.set_xscale("log", base=2) 56 | 57 | 58 | ax2.set_xlabel("total data volume, kB") 59 | ax2.set_ylabel("GB/s") 60 | ax2.set_xscale("log", base=2) 61 | 62 | formatter = matplotlib.ticker.FuncFormatter(lambda x, pos: "{0:g}".format(x / 1024)) 63 | ax.get_xaxis().set_major_formatter(formatter) 64 | # ax.get_yaxis().set_major_formatter(formatter) 65 | 66 | 
#!/usr/bin/env python3
"""Plot effective and L2 bandwidth over data volume for every result file.

Each ``*.txt`` file in the current directory is read as a space-separated
table. Rows whose first field is numeric contribute one data point: column 0
is the data-set size, column 4 the effective bandwidth and column 10 the L2
bandwidth. Two figures are produced and saved as ``cuda-cache.svg`` and
``cuda-cache-l2.pdf``.
"""

import os
import csv

import sys

sys.path.append("..")
from device_order import *


def _device_label(filename):
    """Legend label for a result file.

    Uses the matching entry of ``order`` when there is one; files of devices
    that are not listed fall back to their own name without the ".txt" suffix
    instead of indexing past the end of ``order``.
    """
    position = getOrderNumber(filename)
    if position < len(order):
        return order[position].upper()
    return filename[:-4].upper()


fig, ax = plt.subplots(figsize=(8, 4))
fig2, ax2 = plt.subplots(figsize=(8, 4))


for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)):
    if not filename.endswith(".txt"):
        continue

    sizes = []
    bw = []
    L2bw = []
    with open(filename, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)
        for row in csvreader:
            # Skip blank lines, header lines and rows too short to hold the
            # L2 bandwidth column.
            if len(row) < 11 or not row[0].isnumeric():
                continue
            sizes.append(float(row[0]))
            bw.append(float(row[4]))
            L2bw.append(float(row[10]))

    # print(sizes)
    # print(bw)
    print(filename, len(sizes), getOrderNumber(filename))
    ax.plot(
        sizes,
        bw,
        label=_device_label(filename),
        color=getDeviceColor(filename),
        **lineStyle,
    )
    ax2.plot(
        sizes,
        L2bw,
        label=_device_label(filename),
        color=getDeviceColor(filename),
        **lineStyle,
    )

ax.set_xlabel("data volume per SM/CU, kB")
ax.set_ylabel("GB/s")
ax.set_xscale("log", base=2)


ax2.set_xlabel("data volume per SM/CU, kB")
ax2.set_ylabel("GB/s")
ax2.set_xscale("log", base=2)

# Tick labels switch from kB to MB at 1024 kB.
formatter = matplotlib.ticker.FuncFormatter(
    lambda x, pos: "{0:g} kB".format(x) if x < 1024 else "{0:g} MB".format(x / 1024)
)
ax.get_xaxis().set_major_formatter(formatter)
# ax.get_yaxis().set_major_formatter(formatter)

ax2.get_xaxis().set_major_formatter(formatter)
# ax2.get_yaxis().set_major_formatter(formatter)

ax.set_xticks([4, 16, 128, 256, 6 * 1024, 20 * 1024, 40 * 1024, 128 * 1024])

ax2.set_xticks([4, 16, 128, 256, 6 * 1024, 20 * 1024, 40 * 1024, 128 * 1024])

ax.set_xlim((1.8, 256 * 1024))
ax2.set_xlim((1.8, 256 * 1024))

fig.autofmt_xdate()
ax.set_ylim([0, ax.get_ylim()[1] * 1.1])

fig2.autofmt_xdate()
ax2.set_ylim([0, ax2.get_ylim()[1] * 1.1])

ax.legend()
fig.tight_layout()

ax2.legend()
fig2.tight_layout()

plt.show()
fig.savefig("cuda-cache.svg")
fig2.savefig("cuda-cache-l2.pdf")
GB/s: 506 387 463 430 461 449 11 | 96 15360 2 18.8% | GB/s: 505 404 462 430 461 452 12 | 128 20480 2 25.0% | GB/s: 503 505 459 430 458 458 13 | 160 25600 2 31.2% | GB/s: 502 508 459 430 458 458 14 | 192 30720 2 37.5% | GB/s: 502 508 458 430 459 458 15 | 224 35840 2 43.8% | GB/s: 501 508 457 430 457 458 16 | 256 40960 2 50.0% | GB/s: 501 506 457 431 456 456 17 | 288 46080 2 56.2% | GB/s: 501 505 457 431 456 456 18 | 320 51200 2 62.5% | GB/s: 500 504 456 432 456 456 19 | 352 56320 2 68.8% | GB/s: 500 504 456 434 456 456 20 | 384 61440 2 75.0% | GB/s: 500 504 456 432 455 455 21 | 416 66560 2 81.2% | GB/s: 501 504 456 436 456 456 22 | 448 71680 2 87.5% | GB/s: 500 504 456 436 456 456 23 | 480 76800 2 93.8% | GB/s: 501 504 456 437 456 456 24 | 512 81920 2 100.0% | GB/s: 500 505 457 433 455 456 25 | -------------------------------------------------------------------------------- /gpu-roofline/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import os 5 | import csv 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | 9 | import sys 10 | 11 | sys.path.append("..") 12 | from device_order import * 13 | 14 | 15 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(7, 7)) 16 | # fig2, ax2 = plt.subplots(figsize=(8, 4)) 17 | # fig3, ax3 = plt.subplots(figsize=(8, 4)) 18 | 19 | 20 | filenames = ["h200.txt", "alex_a100_40.txt", "bxx.txt"] 21 | 22 | colors = ["#349999", "#CC1343", "#649903", "#c7aa3e"] 23 | 24 | c = 0 25 | for filename in filenames: 26 | with open(filename, newline="") as csvfile: 27 | csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True) 28 | 29 | datapoints = [[]] 30 | 31 | for row in csvreader: 32 | print(row) 33 | if len(row) == 0: 34 | datapoints.append([]) 35 | 36 | elif len(row) == 16: 37 | datapoints[-1].append( 38 | [float(row[5]), float(row[9]), float(row[13]), float(row[11])] 39 | ) 40 | 41 | print(datapoints) 42 | print() 43 | 44 | for i in 
range(min(1, len(datapoints[1]))): 45 | print([d[i][1] for d in datapoints if len(d) > 0]) 46 | ax1.plot( 47 | [d[i][0] for d in datapoints if len(d) > 0], 48 | [d[i][1] / 1000 for d in datapoints if len(d) > 0], 49 | "-", 50 | color=colors[c], 51 | label=filename 52 | ) 53 | 54 | ax2.plot( 55 | [d[i][0] for d in datapoints if len(d) > 0], 56 | [d[i][2] for d in datapoints if len(d) > 0], 57 | "-", 58 | color=colors[c], 59 | ) 60 | 61 | ax3.plot( 62 | [d[i][0] for d in datapoints if len(d) > 0], 63 | [d[i][3] / 1000 for d in datapoints if len(d) > 0], 64 | "-", 65 | color=colors[c], 66 | ) 67 | c += 1 68 | 69 | 70 | ax1.legend() 71 | 72 | ax3.set_xlabel("Arithmetic Intensity, Flop/B") 73 | ax1.set_ylabel("FP32, TFlop/s") 74 | ax2.set_ylabel("Power, W") 75 | ax3.set_ylabel("Clock, GHz") 76 | 77 | 78 | ax1.set_ylim([0, ax1.get_ylim()[1]]) 79 | ax1.set_xlim([0, ax1.get_xlim()[1]]) 80 | 81 | ax2.set_ylim([0, ax2.get_ylim()[1]]) 82 | ax2.set_xlim([0, ax2.get_xlim()[1]]) 83 | 84 | ax3.set_ylim([0, ax3.get_ylim()[1]]) 85 | ax3.set_xlim([0, ax3.get_xlim()[1]]) 86 | 87 | # ax.set_xscale("log") 88 | # ax2.set_xscale("log") 89 | 90 | # ax.set_yscale("log") 91 | # ax2.set_yscale("log") 92 | 93 | 94 | fig.tight_layout() 95 | 96 | plt.savefig("L40_plot.pdf", dpi=4000) 97 | plt.show() 98 | -------------------------------------------------------------------------------- /cuda-memcpy/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../MeasurementSeries.hpp" 4 | #include "../dtime.hpp" 5 | #include "../gpu-error.h" 6 | using namespace std; 7 | 8 | int main(int argc, char** argv) { 9 | int deviceCount; 10 | GPU_ERROR(cudaGetDeviceCount(&deviceCount)); 11 | 12 | 13 | vector deviceBuffers(deviceCount, nullptr); 14 | char *host_buffer; 15 | const size_t buffer_size_bytes = (size_t)2 * 1024 * 1024 * 1024; 16 | 17 | 18 | for( int d = 0; d < deviceCount; d++) { 19 | GPU_ERROR(cudaSetDevice(d)); 20 | 
GPU_ERROR(cudaMalloc(& (deviceBuffers[d]), buffer_size_bytes)); 21 | GPU_ERROR(cudaDeviceSynchronize()); 22 | } 23 | GPU_ERROR(cudaMallocHost(&host_buffer, buffer_size_bytes)); 24 | 25 | 26 | const int num_streams = 1; 27 | cudaStream_t streams[num_streams]; 28 | 29 | for (int i = 0; i < num_streams; i++) { 30 | cudaStreamCreate(&streams[i]); 31 | } 32 | 33 | memset(host_buffer, 0, buffer_size_bytes); 34 | 35 | for (size_t transfer_size_bytes = 2 << 16; 36 | transfer_size_bytes <= buffer_size_bytes / num_streams; 37 | transfer_size_bytes *= 16) { 38 | 39 | for(int d = 0; d < deviceCount; d++) { 40 | GPU_ERROR(cudaSetDevice(d)); 41 | MeasurementSeries time; 42 | for (int sample = 0; sample < 5; sample++) { 43 | memset(host_buffer, 0, buffer_size_bytes); 44 | double t1 = dtime(); 45 | for (int stream = 0; stream < num_streams; stream++) { 46 | GPU_ERROR(cudaMemcpyAsync( 47 | deviceBuffers[d] + (size_t)stream * transfer_size_bytes, 48 | host_buffer + (size_t)stream * transfer_size_bytes, 49 | transfer_size_bytes, cudaMemcpyDefault, streams[stream])); 50 | } 51 | 52 | GPU_ERROR(cudaDeviceSynchronize()); 53 | double t2 = dtime(); 54 | time.add(t2 - t1); 55 | } 56 | double bw = num_streams * transfer_size_bytes / time.value(); 57 | cout << fixed // 58 | << "Device: " << d << " " 59 | << setw(10) << setprecision(0) << (transfer_size_bytes >> 10) 60 | << "kB " // 61 | << setprecision(2) << setw(7) << time.value() * 1000 << "ms " // 62 | << setprecision(2) << setw(7) << bw * 1e-9 << "GB/s " // 63 | << time.spread() * 100 << "%\n"; 64 | } 65 | if(deviceCount > 1) cout << "\n"; 66 | } 67 | 68 | for(int d = 0; d< deviceCount; d++) { 69 | GPU_ERROR(cudaFree(deviceBuffers[d])); 70 | } 71 | // GPU_ERROR(cudaFree(host_buffer)); 72 | GPU_ERROR(cudaFreeHost(host_buffer)); 73 | } 74 | -------------------------------------------------------------------------------- /gpu-cache/mi100.txt: -------------------------------------------------------------------------------- 1 | clock: 300 
1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 2 | data set exec time spread Eff. bw DRAM read DRAM write L2 read L2 store 3 | 4 kB 424ms 1.5% 9098.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 4 | 8 kB 469ms 6.5% 8304.3 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 5 | 16 kB 371ms 3.6% 10485.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 6 | 24 kB 1469ms 20.2% 2700.3 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 7 | 32 kB 1527ms 17.5% 2672.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 8 | 48 kB 1595ms 12.8% 2543.3 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 9 | 64 kB 1587ms 7.1% 2515.0 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 10 | 80 kB 1621ms 8.6% 2506.6 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 11 | 96 kB 1616ms 7.2% 2492.0 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 12 | 112 kB 1612ms 6.8% 2505.4 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 13 | 128 kB 1622ms 13.8% 2516.7 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 14 | 144 kB 1661ms 15.3% 2482.9 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 15 | 160 kB 1659ms 15.0% 2504.0 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 16 | 176 kB 1574ms 12.3% 2570.4 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 17 | 192 kB 1589ms 11.3% 2591.5 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 18 | 208 kB 1584ms 17.7% 2565.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 19 | 224 kB 1566ms 7.0% 2574.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 20 | 240 kB 1578ms 12.3% 2571.0 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 21 | 256 kB 1561ms 11.1% 2561.4 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 22 | 272 kB 1655ms 9.4% 2471.3 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 23 | 288 kB 1656ms 10.4% 2487.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 24 | 304 kB 1658ms 8.5% 2467.5 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 25 | 320 kB 1661ms 9.3% 2475.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 26 | -------------------------------------------------------------------------------- /gpu-strides/h200.txt: -------------------------------------------------------------------------------- 1 | clock: 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 2 | float stride 0 0 1.01 126 1.00 1.00 0.03 3 | 
float stride 1 1 1.01 126 1.00 1.00 0.04 4 | float stride 2 2 2.01 64 2.00 1.22 0.02 5 | float stride 3 3 1.63 79 1.00 1.62 0.04 6 | float stride 4 4 4.01 32.0 4.00 1.84 0.05 7 | float stride 5 5 2.00 64 1.00 2.00 0.10 8 | float stride 6 6 2.32 55 2.00 2.31 0.04 9 | float stride 7 7 2.41 53 1.00 2.41 0.03 10 | float stride 8 8 8.0 16.0 8.0 2.72 0.11 11 | float stride 9 9 3.00 42.6 1.00 3.00 0.04 12 | float stride 10 10 3.16 40.5 2.00 3.16 0.04 13 | float stride 11 11 3.41 37.5 1.00 3.41 0.08 14 | float stride 12 12 4.01 32.0 4.00 3.62 0.05 15 | float stride 13 13 4.10 31.2 1.00 4.09 0.03 16 | float stride 14 14 4.25 30.1 2.00 4.25 0.04 17 | float stride 15 15 4.60 27.8 1.00 4.59 0.05 18 | 19 | double stride 0 0 2.01 128 2.00 1.00 0.04 20 | double stride 1 1 2.01 128 2.00 1.19 0.03 21 | double stride 2 2 4.01 64 4.00 1.75 0.08 22 | double stride 3 3 2.32 110 2.00 2.31 0.05 23 | double stride 4 4 8.0 32.0 8.0 2.78 0.06 24 | double stride 5 5 3.16 81 2.00 3.16 0.05 25 | double stride 6 6 4.01 64 4.00 3.72 0.03 26 | double stride 7 7 4.32 59 2.00 4.31 0.04 27 | double stride 8 8 16.0 16.0 16.0 4.66 0.13 28 | double stride 9 9 5.3 48.7 2.00 5.2 0.05 29 | double stride 10 10 5.5 46.8 4.00 5.5 0.18 30 | double stride 11 11 6.0 42.6 2.00 6.0 0.04 31 | double stride 12 12 8.0 32.0 8.0 6.6 0.09 32 | double stride 13 13 7.0 36.5 2.00 7.0 0.06 33 | double stride 14 14 7.0 36.5 4.00 7.0 0.05 34 | double stride 15 15 8.0 32.0 2.00 8.0 0.05 35 | 36 | float block 1 4098 9.3 13.8 2.00 9.2 0.08 37 | float block 2 4098 4.82 26.6 1.00 4.81 0.09 38 | float block 4 4098 3.00 42.6 2.00 3.00 0.03 39 | float block 8 4098 4.01 32.0 4.00 2.12 0.05 40 | float block 16 4098 2.01 64 2.00 1.47 0.04 41 | float block 32 4098 1.02 125 1.00 1.00 0.04 42 | float block 64 4098 1.02 126 1.00 1.00 0.05 43 | 44 | double block 1 4098 9.0 28.4 4.00 9.0 0.07 45 | double block 2 4098 5.3 48.7 2.00 5.2 0.07 46 | double block 4 4098 4.01 64 4.00 3.25 0.05 47 | double block 8 4098 4.01 64 4.00 2.27 0.03 48 | 
double block 16 4098 2.01 128 2.00 1.38 0.04 49 | double block 32 4098 2.01 128 2.00 1.00 0.02 50 | double block 64 4098 2.01 128 2.00 1.00 0.06 51 | -------------------------------------------------------------------------------- /gpu-l2-cache/sycl/sycl-gpu-l2-cache.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace sycl; 11 | using dtype = double; 12 | 13 | int main(int argc, char **argv) { 14 | const int N = 64; 15 | std::cout << std::setw(13) << "data set" // 16 | << std::setw(12) << "exec time" // 17 | << std::setw(11) << "spread" // 18 | << std::setw(15) << "Eff. bw\n"; // 19 | 20 | sycl::queue q{sycl::gpu_selector_v,sycl::property::queue::enable_profiling{}}; 21 | std::cout << "Running on GPU:" << q.get_device().get_info()<< std::endl; 22 | 23 | 24 | for (int blockRun = 3; blockRun < 10000; blockRun += max(1.0, blockRun * 0.1)) { 25 | const int blockSize = 1024; 26 | const int blockCount = 200000; 27 | 28 | std::vector time; 29 | 30 | for (int i = 0; i < 11; i++) { 31 | const size_t bufferCount = blockRun * blockSize * N + i * 128; 32 | dtype *dA = malloc_device(bufferCount, q); 33 | dtype *dB = malloc_device(bufferCount, q); 34 | 35 | q.parallel_for(range<1>(bufferCount), [=](id<1> idx) { 36 | dA[idx] = dtype(1.1); 37 | dB[idx] = dtype(1.1); 38 | }).wait(); 39 | 40 | auto start = std::chrono::high_resolution_clock::now(); 41 | q.parallel_for(nd_range<1>(range<1>(blockCount * blockSize), range<1>(blockSize)), [=](nd_item<1> item) { 42 | int threadIdx = item.get_local_id(0); 43 | int blockIdx = item.get_group(0); 44 | 45 | dtype localSum = dtype(0); 46 | for (int i = 0; i < N / 2; i++) { 47 | int idx = (blockSize * blockRun * i + (blockIdx % blockRun) * blockSize) * 2 + threadIdx; 48 | localSum += dB[idx] * dB[idx + blockSize]; 49 | } 50 | localSum *= (dtype)1.3; 51 | if (threadIdx > 1233 || 
localSum == (dtype)23.12) 52 | dA[threadIdx] += localSum; 53 | }).wait(); 54 | auto end = std::chrono::high_resolution_clock::now(); 55 | auto elapsedtime = std::chrono::duration_cast(end - start); 56 | time.push_back(std::chrono::duration(elapsedtime).count()); 57 | 58 | free(dA, q); 59 | free(dB, q); 60 | } 61 | 62 | std::sort(time.begin(), time.end()); 63 | double blockDV = N * blockSize * sizeof(dtype); 64 | double bw = blockDV * blockCount / time[0] / 1.0e9; // time min value 65 | 66 | std::cout << std::fixed << std::setprecision(0) << std::setw(10) << blockDV / 1024 << " kB" 67 | << std::fixed << std::setprecision(0) << std::setw(10) << blockDV * blockRun / 1024 << " kB" 68 | << std::fixed << std::setprecision(0) << std::setw(10) << (time[0] * 1000.0) << "ms" 69 | << std::setprecision(1) << std::setw(10) 70 | << abs(*(begin(time)) - *(end(time) - 1)) / 71 | std::accumulate(begin(time) + 1, end(time) - 1, 0.0) / (time.size() - 2) * 100 72 | << "%" << std::setw(10) << bw << " GB/s " << std::endl; 73 | } 74 | } -------------------------------------------------------------------------------- /gpu-latency/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import csv 5 | 6 | import sys 7 | 8 | sys.path.append("..") 9 | from device_order import * 10 | 11 | 12 | devicesToInclude = [] 13 | 14 | 15 | if len(sys.argv) > 1 and sys.argv[1] == "AMD": 16 | devicesToInclude = ["MI100", "MI210", "MI300X", "RX6900XT"] 17 | 18 | if len(sys.argv) > 1 and sys.argv[1] == "NV": 19 | devicesToInclude = ["A40", "L40", "V100", "A100", "GH200"] 20 | 21 | 22 | fig, ax = plt.subplots(figsize=(6, 4)) 23 | for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)): 24 | if not filename.endswith(".txt") or getOrderNumber(filename) > len(order): 25 | continue 26 | if len(devicesToInclude) > 0 and not any( 27 | [filename.upper().startswith(d) for d in devicesToInclude] 28 | ): 29 | continue 
30 | 31 | with open(filename, newline="") as csvfile: 32 | csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True) 33 | sizes = [] 34 | min = [] 35 | max = [] 36 | avg = [] 37 | med = [] 38 | for row in csvreader: 39 | if len(row) < 8 or row[0] == "clock:": 40 | continue 41 | sizes.append(float(row[2])) 42 | avg.append(float(row[4])) 43 | med.append(float(row[5])) 44 | min.append(float(row[6])) 45 | max.append(float(row[7])) 46 | 47 | print(filename, getOrderNumber(filename)) 48 | 49 | ax.plot( 50 | sizes, 51 | med, 52 | # "-x", 53 | label=order[getOrderNumber(filename)].upper(), 54 | color=getDeviceColor(filename), 55 | **lineStyle 56 | ) 57 | 58 | plt.fill_between( 59 | sizes, min, max, alpha=0.4, color=getDeviceColor(filename), edgecolor=None 60 | ) 61 | 62 | 63 | ax.set_xlabel("chain data volume, kB") 64 | ax.set_ylabel("latency, cycles") 65 | ax.set_xscale("log", base=2) 66 | 67 | 68 | # ax.axvline(16) 69 | # ax.axvline(4*1024) 70 | 71 | formatter = matplotlib.ticker.FuncFormatter( 72 | lambda x, pos: "{0:g} kB".format(x) if x < 1024 else "{0:g} MB".format(x // 1024) 73 | ) 74 | ax.get_xaxis().set_major_formatter(formatter) 75 | # ax.get_yaxis().set_major_formatter(formatter) 76 | 77 | ax.set_xticks( 78 | [ 79 | 16, 80 | 32, 81 | 128, 82 | 256, 83 | 4 * 1024, 84 | 6 * 1024, 85 | 8 * 1024, 86 | 20 * 1024, 87 | 30 * 1024, 88 | 60 * 1024, 89 | 128 * 1024, 90 | 256 * 1024, 91 | 512 * 1024, 92 | ] 93 | ) 94 | ax.set_xlim([8, 800 * 1024]) 95 | 96 | 97 | ax.set_ylim([0, 980]) 98 | 99 | ax.set_yticks((0, 30, 100, 200, 300, 400, 500, 600, 700, 800, 900)) 100 | 101 | fig.autofmt_xdate() 102 | ax.legend() 103 | ax.set_ylim([0, ax.get_ylim()[1]]) 104 | fig.tight_layout(pad=0) 105 | fig.savefig("latencies" + ("_" + sys.argv[1] if len(sys.argv) > 1 else "") + ".svg") 106 | fig.savefig("latencies" + ("_" + sys.argv[1] if len(sys.argv) > 1 else "") + ".pdf") 107 | 108 | plt.show() 109 | 
-------------------------------------------------------------------------------- /gpu-small-kernels/a40_pt.txt: -------------------------------------------------------------------------------- 1 | 4096 64kB 71 81 81 77 66 55 2 | 4341 67kB 74 86 86 81 70 58 3 | 4601 71kB 77 91 91 86 74 62 4 | 4877 76kB 80 96 96 91 79 65 5 | 5169 80kB 82 101 102 97 83 69 6 | 5479 85kB 78 105 108 102 88 73 7 | 5807 90kB 82 108 113 108 93 77 8 | 6155 96kB 86 113 118 115 99 82 9 | 6524 101kB 91 118 125 121 105 86 10 | 6915 108kB 94 123 132 129 111 92 11 | 7329 114kB 99 128 140 136 118 97 12 | 7768 121kB 104 133 148 144 125 102 13 | 8234 128kB 110 138 156 153 132 108 14 | 8728 136kB 116 142 165 162 140 115 15 | 9251 144kB 122 148 175 172 149 122 16 | 9806 153kB 130 153 185 182 157 129 17 | 10394 162kB 137 158 195 193 167 137 18 | 11017 172kB 133 151 200 200 177 145 19 | 11678 182kB 140 159 207 210 187 154 20 | 12378 193kB 146 168 216 220 198 163 21 | 13120 205kB 152 177 226 233 210 172 22 | 13907 217kB 160 185 235 246 222 183 23 | 14741 230kB 168 196 244 260 235 193 24 | 15625 244kB 179 206 254 275 249 205 25 | 16562 258kB 175 214 262 290 264 217 26 | 17555 274kB 184 228 272 306 279 230 27 | 18608 290kB 190 239 283 323 296 244 28 | 19724 308kB 199 254 292 341 313 258 29 | 20907 326kB 210 268 303 359 332 274 30 | 22161 346kB 208 258 290 352 349 289 31 | 23490 367kB 215 273 305 357 367 305 32 | 24899 389kB 224 283 322 374 385 323 33 | 26392 412kB 238 298 340 393 409 342 34 | 27975 437kB 237 312 350 412 432 362 35 | 29653 463kB 242 330 371 430 458 382 36 | 31432 491kB 256 349 393 448 484 405 37 | 33317 520kB 257 340 411 468 509 428 38 | 35316 551kB 262 353 436 488 537 453 39 | 37434 584kB 278 370 459 507 563 478 40 | 39680 620kB 275 388 486 525 593 506 41 | 42060 657kB 288 412 511 544 621 534 42 | 44583 696kB 286 405 494 523 579 556 43 | 47257 738kB 300 418 522 551 593 582 44 | 50092 782kB 300 438 540 581 619 612 45 | 53097 829kB 313 464 571 616 650 645 46 | 56282 879kB 313 460 597 636 679 
680 47 | 59658 932kB 319 472 632 671 703 716 48 | 63237 988kB 328 501 667 711 731 753 49 | 67031 1047kB 331 499 651 737 764 787 50 | 71052 1110kB 336 513 674 778 795 825 51 | 75315 1176kB 342 531 708 822 820 865 52 | 79833 1247kB 351 535 741 874 845 904 53 | 84622 1322kB 353 563 787 922 871 946 54 | 89699 1401kB 358 559 775 896 842 -------------------------------------------------------------------------------- /gpu-stream/a40.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 1344 1 1% | GB/s: 52 20 39 73 36 32 3 | 32 2688 1 2.1% | GB/s: 104 41 78 144 71 63 4 | 48 4032 1 3.1% | GB/s: 154 60 112 208 105 93 5 | 64 5376 1 4.2% | GB/s: 205 80 150 276 138 123 6 | 80 6720 1 5.2% | GB/s: 253 97 181 331 169 149 7 | 96 8064 1 6.2% | GB/s: 304 117 217 389 200 176 8 | 112 9408 1 7.3% | GB/s: 351 134 246 435 228 194 9 | 64 10752 2 8.3% | GB/s: 400 160 291 498 268 237 10 | 160 13440 1 10.4% | GB/s: 483 188 339 549 308 254 11 | 96 16128 2 12.5% | GB/s: 589 233 408 610 377 325 12 | 128 21504 2 16.7% | GB/s: 680 305 515 653 466 391 13 | 160 26880 2 20.8% | GB/s: 680 371 576 666 540 404 14 | 192 32256 2 25.0% | GB/s: 680 436 614 670 587 447 15 | 224 37632 2 29.2% | GB/s: 680 494 631 671 616 468 16 | 256 43008 2 33.3% | GB/s: 680 553 644 670 630 512 17 | 288 48384 2 37.5% | GB/s: 680 598 652 670 641 502 18 | 320 53760 2 41.7% | GB/s: 680 639 659 670 650 531 19 | 352 59136 2 45.8% | GB/s: 680 662 657 670 658 558 20 | 384 64512 2 50.0% | GB/s: 680 677 658 670 656 597 21 | 416 69888 2 54.2% | GB/s: 680 680 658 670 656 602 22 | 448 75264 2 58.3% | GB/s: 680 680 658 670 657 621 23 | 480 80640 2 62.5% | GB/s: 680 680 658 670 657 630 24 | 512 86016 2 66.7% | GB/s: 680 680 657 670 656 641 25 | 544 91392 2 70.8% | GB/s: 680 680 657 670 655 646 26 | 576 96768 2 75.0% | GB/s: 680 680 657 670 655 656 27 | 608 102144 2 79.2% | GB/s: 680 680 656 670 654 658 28 | 640 107520 2 83.3% | GB/s: 
680 680 655 670 653 656 29 | 672 112896 2 87.5% | GB/s: 680 680 655 670 653 656 30 | 704 118272 2 91.7% | GB/s: 680 680 655 670 653 656 31 | 736 123648 2 95.8% | GB/s: 680 680 654 670 652 656 32 | 768 129024 2 100.0% | GB/s: 680 680 653 670 651 656 33 | -------------------------------------------------------------------------------- /gpu-stream/l40.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 2272 1 1% | GB/s: 81 41 78 147 73 66 3 | 32 4544 1 2.1% | GB/s: 161 83 155 290 143 130 4 | 48 6816 1 3.1% | GB/s: 242 121 224 413 211 192 5 | 64 9088 1 4.2% | GB/s: 322 163 297 532 276 251 6 | 80 11360 1 5.2% | GB/s: 403 199 357 617 337 304 7 | 96 13632 1 6.2% | GB/s: 483 240 424 690 397 359 8 | 112 15904 1 7.3% | GB/s: 564 274 479 734 450 397 9 | 64 18176 2 8.3% | GB/s: 644 321 551 781 518 474 10 | 160 22720 1 10.4% | GB/s: 805 386 634 805 596 519 11 | 96 27264 2 12.5% | GB/s: 847 470 719 826 693 641 12 | 128 36352 2 16.7% | GB/s: 847 617 774 830 758 728 13 | 160 45440 2 20.8% | GB/s: 847 740 800 830 789 761 14 | 192 54528 2 25.0% | GB/s: 847 824 773 831 812 788 15 | 224 63616 2 29.2% | GB/s: 847 844 773 831 763 769 16 | 256 72704 2 33.3% | GB/s: 847 845 795 830 764 765 17 | 288 81792 2 37.5% | GB/s: 847 845 796 830 764 763 18 | 320 90880 2 41.7% | GB/s: 847 845 796 830 770 765 19 | 352 99968 2 45.8% | GB/s: 847 845 797 830 769 765 20 | 384 109056 2 50.0% | GB/s: 847 845 797 830 771 768 21 | 416 118144 2 54.2% | GB/s: 847 845 794 830 768 768 22 | 448 127232 2 58.3% | GB/s: 847 845 797 830 769 767 23 | 480 136320 2 62.5% | GB/s: 847 845 797 830 766 767 24 | 512 145408 2 66.7% | GB/s: 847 845 797 830 769 766 25 | 544 154496 2 70.8% | GB/s: 847 845 795 830 768 765 26 | 576 163584 2 75.0% | GB/s: 847 845 795 830 768 766 27 | 608 172672 2 79.2% | GB/s: 847 845 795 830 769 765 28 | 640 181760 2 83.3% | GB/s: 846 845 781 830 770 767 29 | 672 190848 2 87.5% | GB/s: 847 845 
778 830 769 766 30 | 704 199936 2 91.7% | GB/s: 847 845 779 830 768 767 31 | 736 209024 2 95.8% | GB/s: 846 845 778 830 770 769 32 | 768 218112 2 100.0% | GB/s: 847 845 776 830 768 766 33 | -------------------------------------------------------------------------------- /gpu-stream/h100_pcie.txt: -------------------------------------------------------------------------------- 1 | blockSize threads %occ | init read scale triad 3pt 5pt 2 | 32 3648 3 % | GB/s: 228 96 183 254 168 164 3 | 64 7296 6.2 % | GB/s: 452 189 341 459 316 310 4 | 96 10944 9.4 % | GB/s: 676 277 472 635 443 436 5 | 128 14592 12.5 % | GB/s: 888 368 607 821 567 558 6 | 160 18240 15.6 % | GB/s: 1093 449 704 966 680 670 7 | 192 21888 18.8 % | GB/s: 1301 533 817 1121 794 781 8 | 224 25536 21.9 % | GB/s: 1495 612 925 1264 903 889 9 | 256 29184 25.0 % | GB/s: 1686 702 1037 1399 1005 989 10 | 288 32832 28.1 % | GB/s: 1832 764 1124 1487 1100 1082 11 | 320 36480 31.2 % | GB/s: 2015 841 1213 1564 1188 1169 12 | 352 40128 34.4 % | GB/s: 2016 908 1295 1615 1269 1250 13 | 384 43776 37.5 % | GB/s: 2016 985 1378 1644 1348 1326 14 | 416 47424 40.6 % | GB/s: 2016 1045 1439 1641 1415 1395 15 | 448 51072 43.8 % | GB/s: 2016 1116 1497 1649 1472 1453 16 | 480 54720 46.9 % | GB/s: 2016 1179 1544 1655 1521 1505 17 | 512 58368 50.0 % | GB/s: 2017 1261 1583 1675 1556 1545 18 | 544 62016 53.1 % | GB/s: 2016 1300 1591 1669 1572 1563 19 | 576 65664 56.2 % | GB/s: 2016 1362 1607 1678 1587 1579 20 | 608 69312 59.4 % | GB/s: 2018 1416 1619 1689 1598 1592 21 | 640 72960 62.5 % | GB/s: 2016 1473 1639 1712 1613 1607 22 | 672 76608 65.6 % | GB/s: 2016 1527 1638 1714 1618 1613 23 | 704 80256 68.8 % | GB/s: 2015 1578 1644 1725 1625 1619 24 | 736 83904 71.9 % | GB/s: 2016 1624 1651 1738 1632 1628 25 | 768 87552 75.0 % | GB/s: 2016 1680 1666 1755 1642 1638 26 | 800 91200 78.1 % | GB/s: 2015 1714 1663 1758 1645 1642 27 | 832 94848 81.2 % | GB/s: 2016 1759 1668 1770 1649 1647 28 | 864 98496 84.4 % | GB/s: 2016 1795 1673 1779 1654 1651 29 
| 896 102144 87.5 % | GB/s: 2016 1837 1686 1796 1663 1662 30 | 928 105792 90.6 % | GB/s: 2018 1871 1684 1800 1666 1664 31 | 960 109440 93.8 % | GB/s: 2016 1897 1688 1808 1672 1670 32 | 992 113088 96.9 % | GB/s: 2016 1919 1693 1818 1678 1675 33 | 1024 116736 100.0 % | GB/s: 2016 1942 1704 1832 1686 1683 34 | -------------------------------------------------------------------------------- /gpu-stream/past_results/h100_pcie.txt: -------------------------------------------------------------------------------- 1 | blockSize threads %occ | init read scale triad 3pt 5pt 2 | 32 3648 3 % | GB/s: 228 96 183 254 168 164 3 | 64 7296 6.2 % | GB/s: 452 189 341 459 316 310 4 | 96 10944 9.4 % | GB/s: 676 277 472 635 443 436 5 | 128 14592 12.5 % | GB/s: 888 368 607 821 567 558 6 | 160 18240 15.6 % | GB/s: 1093 449 704 966 680 670 7 | 192 21888 18.8 % | GB/s: 1301 533 817 1121 794 781 8 | 224 25536 21.9 % | GB/s: 1495 612 925 1264 903 889 9 | 256 29184 25.0 % | GB/s: 1686 702 1037 1399 1005 989 10 | 288 32832 28.1 % | GB/s: 1832 764 1124 1487 1100 1082 11 | 320 36480 31.2 % | GB/s: 2015 841 1213 1564 1188 1169 12 | 352 40128 34.4 % | GB/s: 2016 908 1295 1615 1269 1250 13 | 384 43776 37.5 % | GB/s: 2016 985 1378 1644 1348 1326 14 | 416 47424 40.6 % | GB/s: 2016 1045 1439 1641 1415 1395 15 | 448 51072 43.8 % | GB/s: 2016 1116 1497 1649 1472 1453 16 | 480 54720 46.9 % | GB/s: 2016 1179 1544 1655 1521 1505 17 | 512 58368 50.0 % | GB/s: 2017 1261 1583 1675 1556 1545 18 | 544 62016 53.1 % | GB/s: 2016 1300 1591 1669 1572 1563 19 | 576 65664 56.2 % | GB/s: 2016 1362 1607 1678 1587 1579 20 | 608 69312 59.4 % | GB/s: 2018 1416 1619 1689 1598 1592 21 | 640 72960 62.5 % | GB/s: 2016 1473 1639 1712 1613 1607 22 | 672 76608 65.6 % | GB/s: 2016 1527 1638 1714 1618 1613 23 | 704 80256 68.8 % | GB/s: 2015 1578 1644 1725 1625 1619 24 | 736 83904 71.9 % | GB/s: 2016 1624 1651 1738 1632 1628 25 | 768 87552 75.0 % | GB/s: 2016 1680 1666 1755 1642 1638 26 | 800 91200 78.1 % | GB/s: 2015 1714 1663 1758 
1645 1642 27 | 832 94848 81.2 % | GB/s: 2016 1759 1668 1770 1649 1647 28 | 864 98496 84.4 % | GB/s: 2016 1795 1673 1779 1654 1651 29 | 896 102144 87.5 % | GB/s: 2016 1837 1686 1796 1663 1662 30 | 928 105792 90.6 % | GB/s: 2018 1871 1684 1800 1666 1664 31 | 960 109440 93.8 % | GB/s: 2016 1897 1688 1808 1672 1670 32 | 992 113088 96.9 % | GB/s: 2016 1919 1693 1818 1678 1675 33 | 1024 116736 100.0 % | GB/s: 2016 1942 1704 1832 1686 1683 34 | -------------------------------------------------------------------------------- /gpu-cache/mi210.txt: -------------------------------------------------------------------------------- 1 | clock: 800 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 2 | data set exec time spread Eff. bw DRAM read DRAM write L2 read L2 store 3 | 4 kB 326ms 1.0% 10227.6 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 4 | 8 kB 316ms 0.8% 10555.5 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 5 | 16 kB 317ms 1.0% 10518.7 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 6 | 24 kB 647ms 48.6% 7019.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 7 | 32 kB 521ms 42.3% 8437.3 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 8 | 48 kB 973ms 45.7% 4695.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 9 | 64 kB 924ms 46.2% 4941.5 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 10 | 80 kB 1093ms 47.4% 4246.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 11 | 96 kB 1113ms 47.6% 4172.5 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 12 | 112 kB 999ms 45.4% 4551.6 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 13 | 128 kB 1044ms 44.3% 4299.3 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 14 | 144 kB 1171ms 42.7% 3802.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 15 | 160 kB 1166ms 42.0% 3775.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 16 | 176 kB 862ms 38.0% 4860.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 17 | 192 kB 859ms 38.1% 4922.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 18 | 208 kB 858ms 39.3% 4953.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 19 | 224 kB 858ms 39.0% 4930.7 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 20 | 240 kB 854ms 38.1% 4929.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 21 | 256 kB 852ms 36.7% 
4847.0 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 22 | 272 kB 1321ms 41.9% 3234.9 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 23 | 288 kB 1327ms 42.0% 3253.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 24 | 304 kB 1326ms 40.9% 3229.6 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 25 | 320 kB 1331ms 39.7% 3213.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 26 | 336 kB 1338ms 39.8% 3209.6 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 27 | 352 kB 1339ms 40.2% 3200.6 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 28 | 368 kB 1335ms 39.9% 3200.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 29 | 384 kB 1332ms 42.3% 3244.7 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 30 | 400 kB 1326ms 41.5% 3219.7 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 31 | 416 kB 1305ms 42.3% 3209.2 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 32 | 432 kB 1276ms 43.7% 3245.9 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 33 | 448 kB 1247ms 47.9% 3345.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 34 | 464 kB 1207ms 52.1% 3489.1 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 35 | 480 kB 1159ms 54.1% 3704.8 GB/s 0 GB/s 0 GB/s 0 GB/s 0 GB/s 36 | -------------------------------------------------------------------------------- /gpu-stream/a100_40.txt: -------------------------------------------------------------------------------- 1 | blockSize threads %occ | init read scale triad 3pt 5pt 2 | 16 1728 0.8 % | GB/s: 54 20 39 55 37 37 3 | 32 3456 1.6 % | GB/s: 107 41 77 110 73 72 4 | 48 5184 2.3 % | GB/s: 161 61 112 158 109 107 5 | 64 6912 3.1 % | GB/s: 208 82 151 212 143 140 6 | 96 10368 4.7 % | GB/s: 317 119 215 300 205 201 7 | 128 13824 6.2 % | GB/s: 412 162 292 406 270 265 8 | 160 17280 7.8 % | GB/s: 510 193 337 465 322 314 9 | 192 20736 9.4 % | GB/s: 609 235 407 562 389 383 10 | 224 24192 10.9 % | GB/s: 697 264 450 618 433 422 11 | 256 27648 12.5 % | GB/s: 802 310 548 743 506 498 12 | 320 34560 15.6 % | GB/s: 976 377 630 858 609 595 13 | 384 41472 18.8 % | GB/s: 1159 449 749 1006 714 698 14 | 448 48384 21.9 % | GB/s: 1329 514 836 1112 812 792 15 | 512 55296 25.0 % | GB/s: 1501 592 956 1232 909 887 16 | 576 62208 28.1 % | GB/s: 1539 645 1021 1290 993 966 17 | 640 69120 
31.2 % | GB/s: 1538 713 1112 1333 1081 1052 18 | 704 76032 34.4 % | GB/s: 1538 769 1180 1332 1151 1119 19 | 768 82944 37.5 % | GB/s: 1539 838 1245 1346 1209 1180 20 | 832 89856 40.6 % | GB/s: 1539 887 1284 1344 1258 1231 21 | 896 96768 43.8 % | GB/s: 1538 946 1318 1353 1298 1277 22 | 960 103680 46.9 % | GB/s: 1539 989 1324 1353 1316 1305 23 | 1024 110592 50.0 % | GB/s: 1536 1064 1338 1360 1325 1322 24 | 1088 117504 53.1 % | GB/s: 1537 1079 1338 1360 1331 1327 25 | 1152 124416 56.2 % | GB/s: 1539 1138 1347 1365 1337 1335 26 | 1216 131328 59.4 % | GB/s: 1537 1175 1347 1366 1339 1336 27 | 1280 138240 62.5 % | GB/s: 1537 1224 1354 1370 1343 1339 28 | 1344 145152 65.6 % | GB/s: 1537 1251 1355 1372 1347 1344 29 | 1408 152064 68.8 % | GB/s: 1538 1316 1365 1375 1352 1348 30 | 1472 158976 71.9 % | GB/s: 1538 1335 1363 1368 1353 1349 31 | 1536 165888 75.0 % | GB/s: 1539 1372 1367 1365 1356 1351 32 | 1600 172800 78.1 % | GB/s: 1536 1379 1366 1367 1358 1353 33 | 1664 179712 81.2 % | GB/s: 1539 1403 1368 1366 1360 1356 34 | 1728 186624 84.4 % | GB/s: 1539 1419 1369 1366 1362 1357 35 | 1792 193536 87.5 % | GB/s: 1539 1440 1373 1363 1363 1359 36 | 1856 200448 90.6 % | GB/s: 1538 1453 1374 1364 1365 1361 37 | 1920 207360 93.8 % | GB/s: 1538 1473 1376 1362 1367 1364 38 | 1984 214272 96.9 % | GB/s: 1537 1483 1377 1362 1369 1365 39 | 2048 221184 100.0 % | GB/s: 1539 1500 1382 1358 1371 1368 40 | -------------------------------------------------------------------------------- /gpu-stream/past_results/a100_40.txt: -------------------------------------------------------------------------------- 1 | blockSize threads %occ | init read scale triad 3pt 5pt 2 | 16 1728 0.8 % | GB/s: 54 20 39 55 37 37 3 | 32 3456 1.6 % | GB/s: 107 41 77 110 73 72 4 | 48 5184 2.3 % | GB/s: 161 61 112 158 109 107 5 | 64 6912 3.1 % | GB/s: 208 82 151 212 143 140 6 | 96 10368 4.7 % | GB/s: 317 119 215 300 205 201 7 | 128 13824 6.2 % | GB/s: 412 162 292 406 270 265 8 | 160 17280 7.8 % | GB/s: 510 193 337 465 322 
314 9 | 192 20736 9.4 % | GB/s: 609 235 407 562 389 383 10 | 224 24192 10.9 % | GB/s: 697 264 450 618 433 422 11 | 256 27648 12.5 % | GB/s: 802 310 548 743 506 498 12 | 320 34560 15.6 % | GB/s: 976 377 630 858 609 595 13 | 384 41472 18.8 % | GB/s: 1159 449 749 1006 714 698 14 | 448 48384 21.9 % | GB/s: 1329 514 836 1112 812 792 15 | 512 55296 25.0 % | GB/s: 1501 592 956 1232 909 887 16 | 576 62208 28.1 % | GB/s: 1539 645 1021 1290 993 966 17 | 640 69120 31.2 % | GB/s: 1538 713 1112 1333 1081 1052 18 | 704 76032 34.4 % | GB/s: 1538 769 1180 1332 1151 1119 19 | 768 82944 37.5 % | GB/s: 1539 838 1245 1346 1209 1180 20 | 832 89856 40.6 % | GB/s: 1539 887 1284 1344 1258 1231 21 | 896 96768 43.8 % | GB/s: 1538 946 1318 1353 1298 1277 22 | 960 103680 46.9 % | GB/s: 1539 989 1324 1353 1316 1305 23 | 1024 110592 50.0 % | GB/s: 1536 1064 1338 1360 1325 1322 24 | 1088 117504 53.1 % | GB/s: 1537 1079 1338 1360 1331 1327 25 | 1152 124416 56.2 % | GB/s: 1539 1138 1347 1365 1337 1335 26 | 1216 131328 59.4 % | GB/s: 1537 1175 1347 1366 1339 1336 27 | 1280 138240 62.5 % | GB/s: 1537 1224 1354 1370 1343 1339 28 | 1344 145152 65.6 % | GB/s: 1537 1251 1355 1372 1347 1344 29 | 1408 152064 68.8 % | GB/s: 1538 1316 1365 1375 1352 1348 30 | 1472 158976 71.9 % | GB/s: 1538 1335 1363 1368 1353 1349 31 | 1536 165888 75.0 % | GB/s: 1539 1372 1367 1365 1356 1351 32 | 1600 172800 78.1 % | GB/s: 1536 1379 1366 1367 1358 1353 33 | 1664 179712 81.2 % | GB/s: 1539 1403 1368 1366 1360 1356 34 | 1728 186624 84.4 % | GB/s: 1539 1419 1369 1366 1362 1357 35 | 1792 193536 87.5 % | GB/s: 1539 1440 1373 1363 1363 1359 36 | 1856 200448 90.6 % | GB/s: 1538 1453 1374 1364 1365 1361 37 | 1920 207360 93.8 % | GB/s: 1538 1473 1376 1362 1367 1364 38 | 1984 214272 96.9 % | GB/s: 1537 1483 1377 1362 1369 1365 39 | 2048 221184 100.0 % | GB/s: 1539 1500 1382 1358 1371 1368 40 | -------------------------------------------------------------------------------- /gpu-stream/a100_80.txt: 
-------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 1728 1 0.8% | GB/s: 53 21 40 72 38 38 3 | 32 3456 1 1.6% | GB/s: 106 42 80 142 76 74 4 | 48 5184 1 2.3% | GB/s: 158 63 115 205 112 110 5 | 64 6912 1 3.1% | GB/s: 210 85 159 280 146 143 6 | 80 8640 1 3.9% | GB/s: 261 103 183 329 179 176 7 | 96 10368 1 4.7% | GB/s: 312 123 223 394 211 207 8 | 112 12096 1 5.5% | GB/s: 362 142 250 445 244 240 9 | 64 13824 2 6.2% | GB/s: 406 166 303 534 278 273 10 | 160 17280 1 7.8% | GB/s: 503 199 351 611 333 325 11 | 96 20736 2 9.4% | GB/s: 601 242 424 744 403 396 12 | 128 27648 2 12.5% | GB/s: 792 323 579 993 527 518 13 | 160 34560 2 15.6% | GB/s: 965 389 662 1130 635 619 14 | 192 41472 2 18.8% | GB/s: 1148 465 791 1324 748 730 15 | 224 48384 2 21.9% | GB/s: 1317 531 885 1443 854 832 16 | 256 55296 2 25.0% | GB/s: 1488 615 1025 1571 961 936 17 | 288 62208 2 28.1% | GB/s: 1657 669 1088 1582 1049 1018 18 | 320 69120 2 31.2% | GB/s: 1806 740 1195 1633 1147 1113 19 | 352 76032 2 34.4% | GB/s: 1891 801 1276 1628 1229 1190 20 | 384 82944 2 37.5% | GB/s: 1901 872 1379 1667 1311 1269 21 | 416 89856 2 40.6% | GB/s: 1896 924 1429 1651 1375 1330 22 | 448 96768 2 43.8% | GB/s: 1901 986 1498 1673 1438 1402 23 | 480 103680 2 46.9% | GB/s: 1900 1037 1520 1670 1472 1444 24 | 512 110592 2 50.0% | GB/s: 1908 1119 1577 1697 1516 1491 25 | 544 117504 2 53.1% | GB/s: 1900 1134 1574 1688 1531 1507 26 | 576 124416 2 56.2% | GB/s: 1898 1198 1609 1705 1560 1543 27 | 608 131328 2 59.4% | GB/s: 1900 1246 1613 1704 1575 1555 28 | 640 138240 2 62.5% | GB/s: 1905 1306 1646 1726 1598 1579 29 | 672 145152 2 65.6% | GB/s: 1900 1341 1639 1720 1609 1591 30 | 704 152064 2 68.8% | GB/s: 1901 1393 1660 1734 1626 1608 31 | 736 158976 2 71.9% | GB/s: 1894 1432 1660 1735 1633 1618 32 | 768 165888 2 75.0% | GB/s: 1901 1491 1682 1753 1644 1629 33 | 800 172800 2 78.1% | GB/s: 1898 1511 1674 1748 1653 1636 34 | 832 179712 2 
81.2% | GB/s: 1896 1555 1689 1760 1662 1645 35 | 864 186624 2 84.4% | GB/s: 1898 1582 1689 1761 1667 1651 36 | 896 193536 2 87.5% | GB/s: 1900 1622 1706 1775 1672 1656 37 | 928 200448 2 90.6% | GB/s: 1896 1645 1700 1773 1680 1662 38 | 960 207360 2 93.8% | GB/s: 1898 1687 1711 1781 1686 1670 39 | 992 214272 2 96.9% | GB/s: 1893 1713 1712 1783 1691 1674 40 | 1024 221184 2 100.0% | GB/s: 1894 1776 1737 1794 1702 1686 41 | -------------------------------------------------------------------------------- /gpu-stream/gh200.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 2112 1 0.8% | GB/s: 77 31 61 108 58 57 3 | 32 4224 1 1.6% | GB/s: 153 62 120 216 113 111 4 | 48 6336 1 2.3% | GB/s: 228 91 174 308 168 165 5 | 64 8448 1 3.1% | GB/s: 303 122 237 418 219 215 6 | 80 10560 1 3.9% | GB/s: 376 150 281 492 272 267 7 | 96 12672 1 4.7% | GB/s: 449 180 342 592 320 314 8 | 112 14784 1 5.5% | GB/s: 521 208 384 660 372 365 9 | 64 16896 2 6.2% | GB/s: 591 245 464 798 426 419 10 | 160 21120 1 7.8% | GB/s: 731 295 539 907 507 499 11 | 96 25344 2 9.4% | GB/s: 888 362 655 1097 614 604 12 | 128 33792 2 12.5% | GB/s: 1167 486 854 1408 783 771 13 | 160 42240 2 15.6% | GB/s: 1435 589 980 1643 928 916 14 | 192 50688 2 18.8% | GB/s: 1703 704 1122 1897 1069 1055 15 | 224 59136 2 21.9% | GB/s: 1930 806 1252 2132 1206 1187 16 | 256 67584 2 25.0% | GB/s: 2186 935 1397 2367 1340 1320 17 | 288 76032 2 28.1% | GB/s: 2401 1012 1511 2544 1464 1442 18 | 320 84480 2 31.2% | GB/s: 2634 1118 1635 2714 1585 1561 19 | 352 92928 2 34.4% | GB/s: 2833 1211 1751 2859 1696 1670 20 | 384 101376 2 37.5% | GB/s: 3052 1320 1868 2991 1805 1775 21 | 416 109824 2 40.6% | GB/s: 3228 1402 1969 3092 1895 1867 22 | 448 118272 2 43.8% | GB/s: 3408 1496 2070 3187 1994 1961 23 | 480 126720 2 46.9% | GB/s: 3546 1580 2164 3267 2090 2052 24 | 512 135168 2 50.0% | GB/s: 3718 1715 2269 3338 2190 2154 25 | 544 143616 2 53.1% | 
GB/s: 3870 1746 2341 3388 2271 2237 26 | 576 152064 2 56.2% | GB/s: 3944 1830 2425 3433 2353 2317 27 | 608 160512 2 59.4% | GB/s: 3941 1911 2508 3469 2439 2400 28 | 640 168960 2 62.5% | GB/s: 3944 2009 2587 3501 2525 2489 29 | 672 177408 2 65.6% | GB/s: 3941 2074 2657 3535 2593 2560 30 | 704 185856 2 68.8% | GB/s: 3939 2148 2727 3585 2661 2627 31 | 736 194304 2 71.9% | GB/s: 3941 2220 2791 3648 2731 2696 32 | 768 202752 2 75.0% | GB/s: 3941 2314 2854 3686 2799 2762 33 | 800 211200 2 78.1% | GB/s: 3940 2358 2909 3713 2853 2821 34 | 832 219648 2 81.2% | GB/s: 3936 2422 2960 3731 2905 2873 35 | 864 228096 2 84.4% | GB/s: 3942 2488 3008 3745 2951 2924 36 | 896 236544 2 87.5% | GB/s: 3939 2556 3052 3759 3000 2972 37 | 928 244992 2 90.6% | GB/s: 3942 2609 3090 3768 3041 3016 38 | 960 253440 2 93.8% | GB/s: 3939 2669 3126 3773 3080 3057 39 | 992 261888 2 96.9% | GB/s: 3942 2729 3155 3779 3112 3092 40 | 1024 270336 2 100.0% | GB/s: 3942 2775 3174 3783 3138 3120 41 | -------------------------------------------------------------------------------- /gpu-stream/mi100.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 1920 1 0.6% | GB/s: 33 24 37 68 36 34 3 | 32 3840 1 1.3% | GB/s: 65 48 74 134 70 67 4 | 48 5760 1 1.9% | GB/s: 97 74 108 199 104 100 5 | 64 7680 1 2.5% | GB/s: 128 98 142 264 136 130 6 | 80 9600 1 3.1% | GB/s: 160 122 178 321 169 161 7 | 96 11520 1 3.8% | GB/s: 190 145 211 377 201 192 8 | 112 13440 1 4.4% | GB/s: 221 169 244 432 231 220 9 | 64 15360 2 5.0% | GB/s: 256 194 280 494 268 257 10 | 160 19200 1 6.2% | GB/s: 308 235 337 577 320 302 11 | 96 23040 2 7.5% | GB/s: 378 291 405 671 383 366 12 | 128 30720 2 10.0% | GB/s: 493 352 495 802 464 444 13 | 160 38400 2 12.5% | GB/s: 605 472 624 922 594 546 14 | 192 46080 2 15.0% | GB/s: 714 559 700 959 663 625 15 | 224 53760 2 17.5% | GB/s: 815 646 787 982 740 699 16 | 256 61440 2 20.0% | GB/s: 901 693 837 969 793 
757 17 | 288 69120 2 22.5% | GB/s: 967 796 922 932 870 828 18 | 320 76800 2 25.0% | GB/s: 1032 697 969 892 891 882 19 | 352 84480 2 27.5% | GB/s: 1032 724 925 855 901 863 20 | 384 92160 2 30.0% | GB/s: 1071 724 908 779 860 852 21 | 416 99840 2 32.5% | GB/s: 1034 760 809 778 806 794 22 | 448 107520 2 35.0% | GB/s: 1078 750 918 772 861 859 23 | 480 115200 2 37.5% | GB/s: 1051 802 806 770 805 794 24 | 512 122880 2 40.0% | GB/s: 1095 796 807 767 796 780 25 | 544 130560 2 42.5% | GB/s: 1050 833 806 767 788 782 26 | 576 138240 2 45.0% | GB/s: 1088 876 801 753 784 778 27 | 608 145920 2 47.5% | GB/s: 920 882 803 753 792 780 28 | 640 153600 2 50.0% | GB/s: 944 894 804 758 796 783 29 | 672 161280 2 52.5% | GB/s: 824 900 798 756 792 775 30 | 704 168960 2 55.0% | GB/s: 798 901 799 748 782 778 31 | 736 176640 2 57.5% | GB/s: 810 892 791 743 771 764 32 | 768 184320 2 60.0% | GB/s: 795 879 780 727 777 774 33 | 800 192000 2 62.5% | GB/s: 795 883 782 729 778 783 34 | 832 199680 2 65.0% | GB/s: 805 892 793 741 789 784 35 | 864 207360 2 67.5% | GB/s: 798 891 791 746 790 787 36 | 896 215040 2 70.0% | GB/s: 802 882 779 720 773 776 37 | 928 222720 2 72.5% | GB/s: 789 876 776 720 771 779 38 | 960 230400 2 75.0% | GB/s: 791 825 781 727 780 774 39 | 992 238080 2 77.5% | GB/s: 791 818 780 727 787 785 40 | 1024 245760 2 80.0% | GB/s: 793 737 784 718 778 772 41 | -------------------------------------------------------------------------------- /gpu-stream/mi210.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 1664 1 0.8% | GB/s: 33 24 36 65 35 35 3 | 32 3328 1 1.6% | GB/s: 67 48 72 129 69 68 4 | 48 4992 1 2.3% | GB/s: 97 70 102 183 100 99 5 | 64 6656 1 3.1% | GB/s: 128 92 133 243 129 127 6 | 80 8320 1 3.9% | GB/s: 158 115 166 294 162 158 7 | 96 9984 1 4.7% | GB/s: 188 137 198 350 191 188 8 | 112 11648 1 5.5% | GB/s: 217 161 226 396 218 213 9 | 64 13312 2 6.2% | GB/s: 253 185 257 465 248 
245 10 | 160 16640 1 7.8% | GB/s: 303 223 312 534 297 289 11 | 96 19968 2 9.4% | GB/s: 372 277 376 642 360 353 12 | 128 26624 2 12.5% | GB/s: 487 345 463 825 447 437 13 | 160 33280 2 15.6% | GB/s: 594 445 598 944 558 538 14 | 192 39936 2 18.8% | GB/s: 704 518 684 1055 645 618 15 | 224 46592 2 21.9% | GB/s: 805 593 766 1114 724 692 16 | 256 53248 2 25.0% | GB/s: 907 662 844 1144 796 759 17 | 288 59904 2 28.1% | GB/s: 994 742 921 1170 871 830 18 | 320 66560 2 31.2% | GB/s: 1091 819 996 1194 941 893 19 | 352 73216 2 34.4% | GB/s: 1165 866 1045 1182 983 909 20 | 384 79872 2 37.5% | GB/s: 1242 912 1095 1189 1045 960 21 | 416 86528 2 40.6% | GB/s: 1323 1005 1148 1194 1094 975 22 | 448 93184 2 43.8% | GB/s: 1410 1075 1192 1207 1144 1022 23 | 480 99840 2 46.9% | GB/s: 1442 1120 1219 1205 1152 1021 24 | 512 106496 2 50.0% | GB/s: 1446 1159 1230 1217 1156 1029 25 | 544 113152 2 53.1% | GB/s: 1483 1203 1255 1219 1160 1056 26 | 576 119808 2 56.2% | GB/s: 1507 1281 1282 1232 1209 1093 27 | 608 126464 2 59.4% | GB/s: 1499 1325 1297 1228 1209 1092 28 | 640 133120 2 62.5% | GB/s: 1500 1373 1311 1231 1230 1120 29 | 672 139776 2 65.6% | GB/s: 1504 1392 1317 1223 1224 1112 30 | 704 146432 2 68.8% | GB/s: 1519 1396 1330 1227 1240 1132 31 | 736 153088 2 71.9% | GB/s: 1505 1397 1346 1217 1237 1127 32 | 768 159744 2 75.0% | GB/s: 1518 1362 1356 1216 1247 1142 33 | 800 166400 2 78.1% | GB/s: 1519 1383 1372 1215 1246 1138 34 | 832 173056 2 81.2% | GB/s: 1533 1372 1394 1221 1258 1156 35 | 864 179712 2 84.4% | GB/s: 1513 1372 1401 1220 1257 1149 36 | 896 186368 2 87.5% | GB/s: 1518 1373 1412 1222 1262 1163 37 | 928 193024 2 90.6% | GB/s: 1510 1365 1391 1230 1264 1164 38 | 960 199680 2 93.8% | GB/s: 1529 1377 1388 1230 1271 1180 39 | 992 206336 2 96.9% | GB/s: 1523 1380 1369 1230 1266 1177 40 | 1024 212992 2 100.0% | GB/s: 1511 1370 1362 1225 1256 1143 41 | -------------------------------------------------------------------------------- /gpu-stream/mi300a.txt: 
-------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 3648 1 0.8% | GB/s: 105 57 93 171 91 90 3 | 32 7296 1 1.6% | GB/s: 210 114 183 335 179 177 4 | 48 10944 1 2.3% | GB/s: 308 167 267 483 261 258 5 | 64 14592 1 3.1% | GB/s: 408 223 355 642 344 340 6 | 80 18240 1 3.9% | GB/s: 499 262 421 748 414 405 7 | 96 21888 1 4.7% | GB/s: 596 318 505 890 494 485 8 | 112 25536 1 5.5% | GB/s: 690 371 579 998 572 561 9 | 64 29184 2 6.2% | GB/s: 807 447 677 1194 656 648 10 | 160 36480 1 7.8% | GB/s: 953 508 781 1337 762 742 11 | 96 43776 2 9.4% | GB/s: 1170 640 933 1551 917 899 12 | 128 58368 2 12.5% | GB/s: 1483 846 1200 1874 1169 1142 13 | 160 72960 2 15.6% | GB/s: 1681 1011 1369 2081 1356 1288 14 | 192 87552 2 18.8% | GB/s: 1867 1196 1549 2318 1546 1465 15 | 224 102144 2 21.9% | GB/s: 2026 1338 1691 2466 1671 1575 16 | 256 116736 2 25.0% | GB/s: 2174 1470 1837 2626 1796 1712 17 | 288 131328 2 28.1% | GB/s: 2337 1614 1933 2620 1877 1774 18 | 320 145920 2 31.2% | GB/s: 2487 1759 2059 2590 1996 1882 19 | 352 160512 2 34.4% | GB/s: 2594 1836 2146 2634 2082 1929 20 | 384 175104 2 37.5% | GB/s: 2705 1959 2273 2642 2203 2030 21 | 416 189696 2 40.6% | GB/s: 2655 2008 2342 2633 2242 2038 22 | 448 204288 2 43.8% | GB/s: 2712 2113 2446 2691 2339 2144 23 | 480 218880 2 46.9% | GB/s: 2699 2189 2525 2754 2382 2155 24 | 512 233472 2 50.0% | GB/s: 2778 2269 2645 2894 2492 2321 25 | 544 248064 2 53.1% | GB/s: 2831 2349 2655 2852 2554 2308 26 | 576 262656 2 56.2% | GB/s: 2839 2442 2686 2894 2637 2378 27 | 608 277248 2 59.4% | GB/s: 2863 2462 2590 2877 2645 2356 28 | 640 291840 2 62.5% | GB/s: 2942 2540 2652 2904 2626 2361 29 | 672 306432 2 65.6% | GB/s: 2923 2592 2738 2950 2684 2366 30 | 704 321024 2 68.8% | GB/s: 2925 2657 2759 2944 2723 2409 31 | 736 335616 2 71.9% | GB/s: 2887 2688 2687 2928 2689 2372 32 | 768 350208 2 75.0% | GB/s: 2880 2660 2690 2955 2727 2405 33 | 800 364800 2 78.1% | GB/s: 
2854 2640 2679 2935 2708 2394 34 | 832 379392 2 81.2% | GB/s: 2891 2602 2696 2963 2648 2443 35 | 864 393984 2 84.4% | GB/s: 2844 2602 2700 2956 2637 2530 36 | 896 408576 2 87.5% | GB/s: 2973 2650 2850 2978 2780 2459 37 | 928 423168 2 90.6% | GB/s: 2830 2632 2865 2993 2651 2590 38 | 960 437760 2 93.8% | GB/s: 3015 2573 2873 2994 2670 2628 39 | 992 452352 2 96.9% | GB/s: 2924 2558 2887 2983 2682 2612 40 | 1024 466944 2 100.0% | GB/s: 3069 2677 2956 3034 2744 2522 41 | -------------------------------------------------------------------------------- /gpu-stream/mi300x.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 4864 1 0.8% | GB/s: 136 75 122 233 117 116 3 | 32 9728 1 1.6% | GB/s: 266 148 242 454 231 228 4 | 48 14592 1 2.3% | GB/s: 394 208 339 626 337 333 5 | 64 19456 1 3.1% | GB/s: 524 282 459 839 440 434 6 | 80 24320 1 3.9% | GB/s: 639 338 539 977 535 524 7 | 96 29184 1 4.7% | GB/s: 758 408 638 1148 635 623 8 | 112 34048 1 5.5% | GB/s: 879 474 728 1316 728 714 9 | 64 38912 2 6.2% | GB/s: 1030 554 855 1560 824 815 10 | 160 48640 1 7.8% | GB/s: 1213 655 978 1732 975 951 11 | 96 58368 2 9.4% | GB/s: 1489 808 1156 2065 1159 1139 12 | 128 77824 2 12.5% | GB/s: 1948 1049 1534 2647 1451 1423 13 | 160 97280 2 15.6% | GB/s: 2348 1291 1695 2942 1685 1643 14 | 192 116736 2 18.8% | GB/s: 2735 1494 1935 3266 1924 1880 15 | 224 136192 2 21.9% | GB/s: 2974 1719 2148 3454 2142 2079 16 | 256 155648 2 25.0% | GB/s: 3377 1959 2729 3740 2432 2336 17 | 288 175104 2 28.1% | GB/s: 3431 2153 2514 3735 2477 2383 18 | 320 194560 2 31.2% | GB/s: 3596 2343 2704 3829 2623 2512 19 | 352 214016 2 34.4% | GB/s: 3684 2510 2866 3839 2780 2598 20 | 384 233472 2 37.5% | GB/s: 3916 2657 3088 3944 2954 2772 21 | 416 252928 2 40.6% | GB/s: 3959 2791 3189 3894 3039 2797 22 | 448 272384 2 43.8% | GB/s: 4081 2907 3354 3964 3191 2937 23 | 480 291840 2 46.9% | GB/s: 4147 3043 3492 3955 3250 2939 24 | 
512 311296 2 50.0% | GB/s: 4319 3098 3427 3169 3469 3104 25 | 544 330752 2 53.1% | GB/s: 4269 3272 3696 3964 3440 3060 26 | 576 350208 2 56.2% | GB/s: 4298 3320 3791 3968 3536 3108 27 | 608 369664 2 59.4% | GB/s: 4402 3406 3820 3931 3580 3116 28 | 640 389120 2 62.5% | GB/s: 4532 3479 3927 4000 3694 3221 29 | 672 408576 2 65.6% | GB/s: 4607 3540 3944 3962 3690 3195 30 | 704 428032 2 68.8% | GB/s: 4609 3622 3988 3974 3754 3273 31 | 736 447488 2 71.9% | GB/s: 4534 3670 4017 3952 3782 3277 32 | 768 466944 2 75.0% | GB/s: 4849 3756 3924 3947 3903 3436 33 | 800 486400 2 78.1% | GB/s: 4663 3775 4008 3914 3830 3339 34 | 832 505856 2 81.2% | GB/s: 4579 3875 4033 3942 3896 3403 35 | 864 525312 2 84.4% | GB/s: 4611 3864 3993 3932 3852 3395 36 | 896 544768 2 87.5% | GB/s: 4751 3932 4058 3957 3926 3461 37 | 928 564224 2 90.6% | GB/s: 4640 3935 3964 3906 3918 3458 38 | 960 583680 2 93.8% | GB/s: 4638 4052 4002 3893 3935 3499 39 | 992 603136 2 96.9% | GB/s: 4617 4001 3589 3818 3880 3516 40 | 1024 622592 2 100.0% | GB/s: 4801 4199 3997 3866 4146 3677 41 | -------------------------------------------------------------------------------- /gpu-stream/v100.txt: -------------------------------------------------------------------------------- 1 | block smBlocks threads occ% | init read scale triad 3pt 5pt 2 | 16 1280 1 0.8% | GB/s: 39 19 34 64 33 32 3 | 32 2560 1 1.6% | GB/s: 78 36 67 123 63 61 4 | 48 3840 1 2.3% | GB/s: 116 54 98 175 93 90 5 | 64 5120 1 3.1% | GB/s: 155 70 130 230 123 119 6 | 80 6400 1 3.9% | GB/s: 192 87 157 269 151 147 7 | 96 7680 1 4.7% | GB/s: 229 103 184 316 178 173 8 | 112 8960 1 5.5% | GB/s: 267 119 212 352 204 198 9 | 64 10240 2 6.2% | GB/s: 306 139 246 400 233 226 10 | 160 12800 1 7.8% | GB/s: 369 165 281 452 272 263 11 | 96 15360 2 9.4% | GB/s: 454 200 335 524 325 318 12 | 128 20480 2 12.5% | GB/s: 599 263 426 638 406 397 13 | 160 25600 2 15.6% | GB/s: 724 322 487 682 478 464 14 | 192 30720 2 18.8% | GB/s: 850 376 555 722 540 527 15 | 224 35840 2 21.9% | GB/s: 
897 432 612 737 599 585 16 | 256 40960 2 25.0% | GB/s: 897 491 675 759 648 635 17 | 288 46080 2 28.1% | GB/s: 897 539 700 755 689 675 18 | 320 51200 2 31.2% | GB/s: 897 557 733 764 720 708 19 | 352 56320 2 34.4% | GB/s: 897 608 749 765 740 731 20 | 384 61440 2 37.5% | GB/s: 897 647 765 774 753 744 21 | 416 66560 2 40.6% | GB/s: 897 686 769 770 763 755 22 | 448 71680 2 43.8% | GB/s: 897 708 776 772 770 764 23 | 480 76800 2 46.9% | GB/s: 897 728 780 773 776 771 24 | 512 81920 2 50.0% | GB/s: 897 746 784 777 778 774 25 | 544 87040 2 53.1% | GB/s: 897 749 787 774 783 778 26 | 576 92160 2 56.2% | GB/s: 897 752 790 776 786 782 27 | 608 97280 2 59.4% | GB/s: 897 755 792 776 789 785 28 | 640 102400 2 62.5% | GB/s: 897 773 794 781 789 785 29 | 672 107520 2 65.6% | GB/s: 897 787 796 779 792 788 30 | 704 112640 2 68.8% | GB/s: 897 804 797 780 794 790 31 | 736 117760 2 71.9% | GB/s: 897 813 799 780 796 792 32 | 768 122880 2 75.0% | GB/s: 897 824 801 783 796 794 33 | 800 128000 2 78.1% | GB/s: 897 829 802 782 799 795 34 | 832 133120 2 81.2% | GB/s: 897 838 803 784 800 796 35 | 864 138240 2 84.4% | GB/s: 897 842 805 784 801 797 36 | 896 143360 2 87.5% | GB/s: 897 850 805 785 801 798 37 | 928 148480 2 90.6% | GB/s: 897 854 807 785 803 800 38 | 960 153600 2 93.8% | GB/s: 897 859 807 787 804 801 39 | 992 158720 2 96.9% | GB/s: 897 863 808 787 804 801 40 | 1024 163840 2 100.0% | GB/s: 897 867 809 788 804 801 41 | -------------------------------------------------------------------------------- /unmaintained/cuda-3d-stream/main.cu: -------------------------------------------------------------------------------- 1 | #include "../MeasurementSeries.hpp" 2 | #include "../measure_metric/measureMetricPW.hpp" 3 | #include "../dtime.hpp" 4 | #include "../gpu-error.h" 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | const size_t xdim = 2000; 12 | const size_t ydim = 1000; 13 | const size_t zdim = 100; 14 | const size_t buffer_size = (size_t) xdim * ydim * zdim; 15 
| double *dA, *dB; 16 | 17 | template 18 | __global__ void init_kernel(T *A, const T *__restrict__ B, 19 | const T *__restrict__ C, const T *__restrict__ D, 20 | const size_t N) { 21 | size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; 22 | for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) { 23 | A[i] = 0.1; 24 | } 25 | } 26 | 27 | template 28 | __global__ void scale_kernel(T *A, const T *__restrict__ B) { 29 | __shared__ double spoiler[1024]; 30 | int tidx = threadIdx.x + blockIdx.x * blockDim.x; 31 | int tidy = threadIdx.y + blockIdx.y * blockDim.y; 32 | int tidz = threadIdx.z + blockIdx.z * blockDim.z; 33 | if (tidx >= xdim || tidy >= ydim || tidz >= zdim) 34 | return; 35 | 36 | if (threadIdx.x > 1243) 37 | spoiler[threadIdx.x] = B[threadIdx.x]; 38 | 39 | size_t idx = tidz * xdim * ydim + tidy * xdim + tidx; 40 | A[idx] = B[idx] * 1.2; 41 | 42 | if (threadIdx.x > 1243) 43 | A[idx] = spoiler[idx]; 44 | } 45 | 46 | void measureFunc(dim3 blockSize) { 47 | 48 | GPU_ERROR(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); 49 | MeasurementSeries time; 50 | 51 | dim3 grid = dim3((xdim - 1) / blockSize.x + 1, (ydim - 1) / blockSize.y + 1, 52 | (zdim - 1) / blockSize.z + 1); 53 | 54 | scale_kernel<<>>(dA, dB); 55 | 56 | nvmlDevice_t device; 57 | int deviceId; 58 | cudaGetDevice(&deviceId); 59 | nvmlDeviceGetHandleByIndex(deviceId, &device); 60 | 61 | for (int iter = 0; iter < 10; iter++) { 62 | GPU_ERROR(cudaDeviceSynchronize()); 63 | double t1 = dtime(); 64 | GPU_ERROR(cudaDeviceSynchronize()); 65 | scale_kernel<<>>(dA, dB); 66 | scale_kernel<<>>(dA, dB); 67 | GPU_ERROR(cudaDeviceSynchronize()); 68 | double t2 = dtime(); 69 | time.add((t2 - t1) / 2); 70 | } 71 | 72 | measureBandwidthStart(); 73 | scale_kernel<<>>(dA, dB); 74 | auto metrics = measureMetricStop(); 75 | 76 | cudaDeviceProp prop; 77 | GPU_ERROR(cudaGetDevice(&deviceId)); 78 | GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId)); 79 | std::string deviceName = prop.name; 80 | int smCount = 
prop.multiProcessorCount; 81 | int maxActiveBlocks = 0; 82 | GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor( 83 | &maxActiveBlocks, scale_kernel, blockSize.x*blockSize.y*blockSize.z, 0)); 84 | 85 | 86 | cout << fixed << setprecision(0) << "(" << setw(4) << blockSize.x << "," 87 | << setw(4) << blockSize.y << "," << setw(4) << blockSize.z << ") " 88 | << maxActiveBlocks << " " 89 | << setw(2) << " " << setw(5) 90 | << buffer_size * 2 * sizeof(double) / time.median() * 1e-9 << " " 91 | << (maxActiveBlocks*smCount*blockSize.x*blockSize.y*blockSize.z) * time.median() * 1.41e9 / buffer_size << " " 92 | 93 | << setprecision(0) << setw(8) << metrics[0] / time.value() / 1.0e9 << " GB/s " // 94 | << setprecision(0) << setw(8) << metrics[1] / time.value() / 1.0e9 << " GB/s " // 95 | << setprecision(0) << setw(8) << metrics[2]*32 / time.value() / 1.0e9 << " GB/s " // 96 | << setprecision(0) << setw(8) << metrics[3]*32 / time.value() / 1.0e9 << " GB/s " << endl; // 97 | cout.flush(); 98 | } 99 | 100 | int main(int argc, char **argv) { 101 | nvmlInit(); 102 | GPU_ERROR(cudaMalloc(&dA, buffer_size * sizeof(double))); 103 | GPU_ERROR(cudaMalloc(&dB, buffer_size * sizeof(double))); 104 | 105 | init_kernel<<<256, 400>>>(dB, dB, dB, dB, buffer_size); 106 | init_kernel<<<256, 400>>>(dA, dA, dA, dA, buffer_size); 107 | GPU_ERROR(cudaDeviceSynchronize()); 108 | 109 | for (int blockDimX : {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}) { 110 | for (int blockDimY : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}) { 111 | for (int blockDimZ : {1, 2, 4, 8, 16, 32, 64}) { 112 | int threadCount = blockDimX * blockDimY * blockDimZ; 113 | 114 | if (threadCount != 256) //threadCount > 1024 || threadCount < 64) 115 | continue; 116 | 117 | measureFunc(dim3(blockDimX, blockDimY, blockDimZ)); 118 | } 119 | } 120 | } 121 | 122 | cudaDeviceProp prop; 123 | int deviceId; 124 | GPU_ERROR(cudaGetDevice(&deviceId)); 125 | GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId)); 126 | std::string 
deviceName = prop.name; 127 | 128 | GPU_ERROR(cudaFree(dA)); 129 | GPU_ERROR(cudaFree(dB)); 130 | } 131 | -------------------------------------------------------------------------------- /gpu-latency/main.cu: -------------------------------------------------------------------------------- 1 | #include "../MeasurementSeries.hpp" 2 | #include "../dtime.hpp" 3 | #include "../gpu-clock.cuh" 4 | #include "../gpu-error.h" 5 | // #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | typedef int64_t dtype; 16 | 17 | __device__ unsigned int smid() { 18 | unsigned int r; 19 | 20 | asm("mov.u32 %0, %%smid;" : "=r"(r)); 21 | 22 | return r; 23 | } 24 | 25 | template 26 | __global__ void pchase(T *buf, T *__restrict__ dummy_buf, int64_t N) { 27 | 28 | int tidx = threadIdx.x + blockIdx.x * blockDim.x; 29 | int64_t *idx = buf; 30 | 31 | const int unroll_factor = 32; 32 | #pragma unroll 1 33 | for (int64_t n = 0; n < N; n += unroll_factor) { 34 | #pragma unroll 35 | for (int u = 0; u < unroll_factor; u++) { 36 | idx = (int64_t *)*idx; 37 | } 38 | } 39 | 40 | if (tidx > 12313) { 41 | dummy_buf[0] = (int64_t)idx; 42 | } 43 | } 44 | 45 | int main(int argc, char **argv) { 46 | 47 | #ifdef __NVCC__ 48 | GPU_ERROR(cudaFuncSetAttribute( 49 | pchase, cudaFuncAttributePreferredSharedMemoryCarveout, 0)); 50 | #endif 51 | unsigned int clock = getGPUClock(); 52 | 53 | const int cl_size = 128 / sizeof(int64_t); 54 | const int skip_factor = 1; 55 | 56 | std::random_device rd; 57 | std::mt19937 g(rd()); 58 | 59 | for (int64_t LEN = 16; LEN < (1 << 24); LEN = LEN * 1.042 + 1 + rand() % 11) { 60 | if (LEN * skip_factor * cl_size * sizeof(dtype) > 120 * 1024 * 1024) 61 | LEN *= 1.1; 62 | 63 | MeasurementSeries times; 64 | const int64_t iters = max(LEN, (int64_t)100000); 65 | 66 | for (int i = 0; i < 21; i++) { 67 | 68 | vector order(LEN); 69 | int64_t *buf = NULL; 70 | int64_t *dbuf = NULL; 71 | dtype *dummy_buf = NULL; 72 | 
73 | GPU_ERROR( 74 | cudaMallocManaged(&buf, skip_factor * cl_size * LEN * sizeof(dtype))); 75 | GPU_ERROR(cudaMalloc(&dbuf, skip_factor * cl_size * LEN * sizeof(dtype))); 76 | GPU_ERROR(cudaMallocManaged(&dummy_buf, sizeof(dtype))); 77 | for (int64_t i = 0; i < LEN; i++) { 78 | order[i] = i + 1; 79 | } 80 | order[LEN - 1] = 0; 81 | 82 | shuffle(begin(order), end(order) - 1, g); 83 | 84 | for (int cl_lane = 0; cl_lane < cl_size; cl_lane++) { 85 | dtype idx = 0; 86 | for (int64_t i = 0; i < LEN; i++) { 87 | 88 | buf[(idx * cl_size + cl_lane) * skip_factor] = 89 | skip_factor * 90 | (order[i] * cl_size + cl_lane + (order[i] == 0 ? 1 : 0)); 91 | idx = order[i]; 92 | } 93 | } 94 | buf[skip_factor * (order[LEN - 2] * cl_size + cl_size - 1)] = 0; 95 | 96 | for (int64_t n = 0; n < LEN * cl_size * skip_factor; n++) { 97 | buf[n] = (int64_t)dbuf + buf[n] * sizeof(int64_t *); 98 | } 99 | 100 | GPU_ERROR(cudaMemcpy(dbuf, buf, 101 | skip_factor * cl_size * LEN * sizeof(dtype), 102 | cudaMemcpyHostToDevice)); 103 | 104 | pchase<<<1, 1>>>(buf, dummy_buf, iters); 105 | pchase<<<1, 1>>>(buf, dummy_buf, iters); 106 | 107 | cudaEvent_t start, stop; 108 | GPU_ERROR(cudaEventCreate(&start)); 109 | GPU_ERROR(cudaEventCreate(&stop)); 110 | 111 | GPU_ERROR(cudaDeviceSynchronize()); 112 | 113 | GPU_ERROR(cudaEventRecord(start)); 114 | pchase<<<1, 1>>>(buf, dummy_buf, iters); 115 | GPU_ERROR(cudaEventRecord(stop)); 116 | 117 | GPU_ERROR(cudaEventSynchronize(stop)); 118 | float milliseconds = 0; 119 | GPU_ERROR(cudaEventElapsedTime(&milliseconds, start, stop)); 120 | 121 | times.add(milliseconds / 1000); 122 | 123 | GPU_ERROR(cudaGetLastError()); 124 | GPU_ERROR(cudaFree(buf)); 125 | GPU_ERROR(cudaFree(dbuf)); 126 | GPU_ERROR(cudaFree(dummy_buf)); 127 | } 128 | double dt = times.value(); 129 | double dtmed = times.median(); 130 | double dtmin = times.getPercentile(0.05); 131 | double dtmax = times.getPercentile(0.95); 132 | cout << setw(9) << iters << " " << setw(5) << clock << " " // 133 | 
<< setw(8) << skip_factor * LEN * cl_size * sizeof(dtype) / 1024.0 134 | << " " // 135 | << fixed // 136 | << setprecision(1) << setw(8) << dt * 1000 << " " // 137 | << setw(7) << setprecision(1) 138 | << (double)dt / iters * clock * 1000 * 1000 << " " 139 | << (double)dtmed / iters * clock * 1000 * 1000 << " " 140 | << (double)dtmin / iters * clock * 1000 * 1000 << " " 141 | << (double)dtmax / iters * clock * 1000 * 1000 << "\n" 142 | << flush; 143 | } 144 | cout << "\n"; 145 | } 146 | -------------------------------------------------------------------------------- /measure_metric/Utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #define RETURN_IF_NVPW_ERROR(retval, actual) \ 5 | do { \ 6 | NVPA_Status status = actual; \ 7 | if (NVPA_STATUS_SUCCESS != status) { \ 8 | fprintf(stderr, "FAILED: %s with error %s\n", #actual, NV::Metric::Utils::GetNVPWResultString(status)); \ 9 | return retval; \ 10 | } \ 11 | } while (0) 12 | 13 | namespace NV { 14 | namespace Metric { 15 | namespace Utils { 16 | 17 | static const char* GetNVPWResultString(NVPA_Status status) { 18 | const char* errorMsg = NULL; 19 | switch (status) 20 | { 21 | case NVPA_STATUS_ERROR: 22 | errorMsg = "NVPA_STATUS_ERROR"; 23 | break; 24 | case NVPA_STATUS_INTERNAL_ERROR: 25 | errorMsg = "NVPA_STATUS_INTERNAL_ERROR"; 26 | break; 27 | case NVPA_STATUS_NOT_INITIALIZED: 28 | errorMsg = "NVPA_STATUS_NOT_INITIALIZED"; 29 | break; 30 | case NVPA_STATUS_NOT_LOADED: 31 | errorMsg = "NVPA_STATUS_NOT_LOADED"; 32 | break; 33 | case NVPA_STATUS_FUNCTION_NOT_FOUND: 34 | errorMsg = "NVPA_STATUS_FUNCTION_NOT_FOUND"; 35 | break; 36 | case NVPA_STATUS_NOT_SUPPORTED: 37 | errorMsg = "NVPA_STATUS_NOT_SUPPORTED"; 38 | break; 39 | case NVPA_STATUS_NOT_IMPLEMENTED: 40 | errorMsg = "NVPA_STATUS_NOT_IMPLEMENTED"; 41 | break; 42 | case NVPA_STATUS_INVALID_ARGUMENT: 43 | errorMsg = "NVPA_STATUS_INVALID_ARGUMENT"; 44 | break; 45 | case 
NVPA_STATUS_INVALID_METRIC_ID: 46 | errorMsg = "NVPA_STATUS_INVALID_METRIC_ID"; 47 | break; 48 | case NVPA_STATUS_DRIVER_NOT_LOADED: 49 | errorMsg = "NVPA_STATUS_DRIVER_NOT_LOADED"; 50 | break; 51 | case NVPA_STATUS_OUT_OF_MEMORY: 52 | errorMsg = "NVPA_STATUS_OUT_OF_MEMORY"; 53 | break; 54 | case NVPA_STATUS_INVALID_THREAD_STATE: 55 | errorMsg = "NVPA_STATUS_INVALID_THREAD_STATE"; 56 | break; 57 | case NVPA_STATUS_FAILED_CONTEXT_ALLOC: 58 | errorMsg = "NVPA_STATUS_FAILED_CONTEXT_ALLOC"; 59 | break; 60 | case NVPA_STATUS_UNSUPPORTED_GPU: 61 | errorMsg = "NVPA_STATUS_UNSUPPORTED_GPU"; 62 | break; 63 | case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION: 64 | errorMsg = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION"; 65 | break; 66 | case NVPA_STATUS_OBJECT_NOT_REGISTERED: 67 | errorMsg = "NVPA_STATUS_OBJECT_NOT_REGISTERED"; 68 | break; 69 | case NVPA_STATUS_INSUFFICIENT_PRIVILEGE: 70 | errorMsg = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE"; 71 | break; 72 | case NVPA_STATUS_INVALID_CONTEXT_STATE: 73 | errorMsg = "NVPA_STATUS_INVALID_CONTEXT_STATE"; 74 | break; 75 | case NVPA_STATUS_INVALID_OBJECT_STATE: 76 | errorMsg = "NVPA_STATUS_INVALID_OBJECT_STATE"; 77 | break; 78 | case NVPA_STATUS_RESOURCE_UNAVAILABLE: 79 | errorMsg = "NVPA_STATUS_RESOURCE_UNAVAILABLE"; 80 | break; 81 | case NVPA_STATUS_DRIVER_LOADED_TOO_LATE: 82 | errorMsg = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE"; 83 | break; 84 | case NVPA_STATUS_INSUFFICIENT_SPACE: 85 | errorMsg = "NVPA_STATUS_INSUFFICIENT_SPACE"; 86 | break; 87 | case NVPA_STATUS_OBJECT_MISMATCH: 88 | errorMsg = "NVPA_STATUS_OBJECT_MISMATCH"; 89 | break; 90 | case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED: 91 | errorMsg = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED"; 92 | break; 93 | default: 94 | break; 95 | } 96 | 97 | return errorMsg; 98 | } 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /gpu-l2-cache/main.cu: -------------------------------------------------------------------------------- 
1 | #include "../MeasurementSeries.hpp" 2 | #include "../dtime.hpp" 3 | #include "../gpu-error.h" 4 | #include "../gpu-metrics/gpu-metrics.hpp" 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | using dtype = double; 11 | dtype *dA, *dB; 12 | 13 | __global__ void initKernel(dtype *A, size_t N) { 14 | size_t tidx = blockDim.x * blockIdx.x + threadIdx.x; 15 | for (int idx = tidx; idx < N; idx += blockDim.x * gridDim.x) { 16 | A[idx] = dtype(1.1); 17 | } 18 | } 19 | 20 | template 21 | __global__ void sumKernel(dtype *__restrict__ A, const dtype *__restrict__ B, 22 | int blockRun) { 23 | dtype localSum = dtype(0); 24 | 25 | for (int i = 0; i < N / 2; i++) { 26 | int idx = 27 | (blockDim.x * blockRun * i + (blockIdx.x % blockRun) * BLOCKSIZE) * 2 + 28 | threadIdx.x; 29 | localSum += B[idx] * B[idx + BLOCKSIZE]; 30 | } 31 | 32 | localSum *= (dtype)1.3; 33 | if (threadIdx.x > 1233 || localSum == (dtype)23.12) 34 | A[threadIdx.x] += localSum; 35 | } 36 | template 37 | double callKernel(int blockCount, int blockRun) { 38 | sumKernel<<>>(dA, dB, blockRun); 39 | GPU_ERROR(cudaPeekAtLastError()); 40 | return 0.0; 41 | } 42 | template void measure(int blockRun) { 43 | 44 | const int blockSize = 1024; 45 | 46 | cudaDeviceProp prop; 47 | int deviceId; 48 | GPU_ERROR(cudaGetDevice(&deviceId)); 49 | GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId)); 50 | std::string deviceName = prop.name; 51 | int smCount = prop.multiProcessorCount; 52 | int maxActiveBlocks = 0; 53 | GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor( 54 | &maxActiveBlocks, sumKernel, blockSize, 0)); 55 | 56 | int blockCount = 200000; 57 | 58 | // GPU_ERROR(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); 59 | 60 | MeasurementSeries time; 61 | MeasurementSeries dram_read; 62 | MeasurementSeries dram_write; 63 | MeasurementSeries L2_read; 64 | MeasurementSeries L2_write; 65 | 66 | GPU_ERROR(cudaDeviceSynchronize()); 67 | for (int i = 0; i < 11; i++) { 68 | const size_t bufferCount = 
blockRun * blockSize * N + i * 128; 69 | GPU_ERROR(cudaMalloc(&dA, bufferCount * sizeof(dtype))); 70 | initKernel<<<52, 256>>>(dA, bufferCount); 71 | GPU_ERROR(cudaMalloc(&dB, bufferCount * sizeof(dtype))); 72 | initKernel<<<52, 256>>>(dB, bufferCount); 73 | GPU_ERROR(cudaDeviceSynchronize()); 74 | 75 | double t1 = dtime(); 76 | callKernel(blockCount, blockRun); 77 | GPU_ERROR(cudaDeviceSynchronize()); 78 | double t2 = dtime(); 79 | time.add(t2 - t1); 80 | 81 | /* measureDRAMBytesStart(); 82 | callKernel(blockCount, blockRun); 83 | auto metrics = measureDRAMBytesStop(); 84 | dram_read.add(metrics[0]); 85 | dram_write.add(metrics[1]); 86 | 87 | measureL2BytesStart(); 88 | callKernel(blockCount, blockRun); 89 | metrics = measureL2BytesStop(); 90 | L2_read.add(metrics[0]); 91 | L2_write.add(metrics[1]);*/ 92 | GPU_ERROR(cudaFree(dA)); 93 | GPU_ERROR(cudaFree(dB)); 94 | } 95 | 96 | double blockDV = N * blockSize * sizeof(dtype); 97 | 98 | double bw = blockDV * blockCount / time.minValue() / 1.0e9; 99 | cout << fixed << setprecision(0) << setw(10) << blockDV / 1024 << " kB" // 100 | << fixed << setprecision(0) << setw(10) << blockDV * blockRun / 1024 101 | << " kB" // 102 | << setprecision(0) << setw(10) << time.minValue() * 1000.0 << "ms" // 103 | << setprecision(1) << setw(10) << time.spread() * 100 << "%" // 104 | << setw(10) << bw << " GB/s " // 105 | << setprecision(0) << setw(6) 106 | << dram_read.median() / time.minValue() / 1.0e9 << " GB/s " // 107 | << setprecision(0) << setw(6) 108 | << dram_write.median() / time.minValue() / 1.0e9 << " GB/s " // 109 | << setprecision(0) << setw(6) 110 | << L2_read.median() / time.minValue() / 1.0e9 << " GB/s " // 111 | << setprecision(0) << setw(6) 112 | << L2_write.median() / time.minValue() / 1.0e9 << " GB/s " << endl; // 113 | } 114 | 115 | size_t constexpr expSeries(size_t N) { 116 | size_t val = 20; 117 | for (size_t i = 0; i < N; i++) { 118 | val = val * 1.04 + 1; 119 | } 120 | return val; 121 | } 122 | 123 | int 
main(int argc, char **argv) { 124 | initMeasureMetric(); 125 | cout << setw(13) << "data set" // 126 | << setw(12) << "exec time" // 127 | << setw(11) << "spread" // 128 | << setw(15) << "Eff. bw\n"; // 129 | 130 | for (int i = 3; i < 10000; i += max(1.0, i * 0.1)) { 131 | #ifdef __NVCC__ 132 | measure<64>(i); 133 | #else 134 | measure<64>(i); 135 | #endif 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /gpu-metrics/cuda_metrics/Utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #define RETURN_IF_NVPW_ERROR(retval, actual) \ 5 | do { \ 6 | NVPA_Status status = actual; \ 7 | if (NVPA_STATUS_SUCCESS != status) { \ 8 | fprintf(stderr, "FAILED: %s with error %s\n", #actual, NV::Metric::Utils::GetNVPWResultString(status)); \ 9 | return retval; \ 10 | } \ 11 | } while (0) 12 | 13 | namespace NV { 14 | namespace Metric { 15 | namespace Utils { 16 | 17 | static const char* GetNVPWResultString(NVPA_Status status) { 18 | const char* errorMsg = NULL; 19 | switch (status) 20 | { 21 | case NVPA_STATUS_ERROR: 22 | errorMsg = "NVPA_STATUS_ERROR"; 23 | break; 24 | case NVPA_STATUS_INTERNAL_ERROR: 25 | errorMsg = "NVPA_STATUS_INTERNAL_ERROR"; 26 | break; 27 | case NVPA_STATUS_NOT_INITIALIZED: 28 | errorMsg = "NVPA_STATUS_NOT_INITIALIZED"; 29 | break; 30 | case NVPA_STATUS_NOT_LOADED: 31 | errorMsg = "NVPA_STATUS_NOT_LOADED"; 32 | break; 33 | case NVPA_STATUS_FUNCTION_NOT_FOUND: 34 | errorMsg = "NVPA_STATUS_FUNCTION_NOT_FOUND"; 35 | break; 36 | case NVPA_STATUS_NOT_SUPPORTED: 37 | errorMsg = "NVPA_STATUS_NOT_SUPPORTED"; 38 | break; 39 | case NVPA_STATUS_NOT_IMPLEMENTED: 40 | errorMsg = "NVPA_STATUS_NOT_IMPLEMENTED"; 41 | break; 42 | case NVPA_STATUS_INVALID_ARGUMENT: 43 | errorMsg = "NVPA_STATUS_INVALID_ARGUMENT"; 44 | break; 45 | case NVPA_STATUS_INVALID_METRIC_ID: 46 | errorMsg = "NVPA_STATUS_INVALID_METRIC_ID"; 47 | break; 48 | case 
NVPA_STATUS_DRIVER_NOT_LOADED: 49 | errorMsg = "NVPA_STATUS_DRIVER_NOT_LOADED"; 50 | break; 51 | case NVPA_STATUS_OUT_OF_MEMORY: 52 | errorMsg = "NVPA_STATUS_OUT_OF_MEMORY"; 53 | break; 54 | case NVPA_STATUS_INVALID_THREAD_STATE: 55 | errorMsg = "NVPA_STATUS_INVALID_THREAD_STATE"; 56 | break; 57 | case NVPA_STATUS_FAILED_CONTEXT_ALLOC: 58 | errorMsg = "NVPA_STATUS_FAILED_CONTEXT_ALLOC"; 59 | break; 60 | case NVPA_STATUS_UNSUPPORTED_GPU: 61 | errorMsg = "NVPA_STATUS_UNSUPPORTED_GPU"; 62 | break; 63 | case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION: 64 | errorMsg = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION"; 65 | break; 66 | case NVPA_STATUS_OBJECT_NOT_REGISTERED: 67 | errorMsg = "NVPA_STATUS_OBJECT_NOT_REGISTERED"; 68 | break; 69 | case NVPA_STATUS_INSUFFICIENT_PRIVILEGE: 70 | errorMsg = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE"; 71 | break; 72 | case NVPA_STATUS_INVALID_CONTEXT_STATE: 73 | errorMsg = "NVPA_STATUS_INVALID_CONTEXT_STATE"; 74 | break; 75 | case NVPA_STATUS_INVALID_OBJECT_STATE: 76 | errorMsg = "NVPA_STATUS_INVALID_OBJECT_STATE"; 77 | break; 78 | case NVPA_STATUS_RESOURCE_UNAVAILABLE: 79 | errorMsg = "NVPA_STATUS_RESOURCE_UNAVAILABLE"; 80 | break; 81 | case NVPA_STATUS_DRIVER_LOADED_TOO_LATE: 82 | errorMsg = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE"; 83 | break; 84 | case NVPA_STATUS_INSUFFICIENT_SPACE: 85 | errorMsg = "NVPA_STATUS_INSUFFICIENT_SPACE"; 86 | break; 87 | case NVPA_STATUS_OBJECT_MISMATCH: 88 | errorMsg = "NVPA_STATUS_OBJECT_MISMATCH"; 89 | break; 90 | case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED: 91 | errorMsg = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED"; 92 | break; 93 | default: 94 | break; 95 | } 96 | 97 | return errorMsg; 98 | } 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /unmaintained/cuda-gapped-stream/main.cu: -------------------------------------------------------------------------------- 1 | #include "../MeasurementSeries.hpp" 2 | #include 
"../measure_metric/measureMetricPW.hpp" 3 | #include "../dtime.hpp" 4 | #include "../gpu-error.h" 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | const size_t elementCount = 4 * 1024 * 1024 * 1024ull; 12 | double *dA, *dB; 13 | 14 | template 15 | __global__ void init_kernel(T *A, const T *__restrict__ B, 16 | const T *__restrict__ C, const T *__restrict__ D, 17 | const size_t N) { 18 | size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; 19 | for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) { 20 | A[i] = 0.1; 21 | } 22 | } 23 | 24 | template 25 | __global__ void scale_kernel(T *A, const T *__restrict__ B, int blocks, int spacing) { 26 | size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; 27 | if (tidx >= elementCount) 28 | return; 29 | 30 | size_t idx = ((tidx * spacing) % elementCount + (tidx*spacing) / elementCount) % elementCount; 31 | 32 | 33 | T temp = B[idx]; 34 | 35 | if(temp == 12223.0 && threadIdx.x > 10000) 36 | A[idx] = 1.2; // = B[idx] * 1.2; 37 | } 38 | 39 | void measureFunc(int blocks, int spacing) { 40 | 41 | MeasurementSeries time; 42 | int blockSize = 256; 43 | int gridSize = (elementCount - 1) / blockSize + 1; 44 | 45 | scale_kernel<<>>(dA, dB, blocks, spacing); 46 | 47 | nvmlDevice_t device; 48 | int deviceId; 49 | cudaGetDevice(&deviceId); 50 | nvmlDeviceGetHandleByIndex(deviceId, &device); 51 | 52 | for (int iter = 0; iter < 7; iter++) { 53 | GPU_ERROR(cudaDeviceSynchronize()); 54 | double t1 = dtime(); 55 | GPU_ERROR(cudaDeviceSynchronize()); 56 | scale_kernel<<>>(dA, dB, blocks, spacing); 57 | scale_kernel<<>>(dA, dB, blocks, spacing); 58 | GPU_ERROR(cudaDeviceSynchronize()); 59 | double t2 = dtime(); 60 | time.add((t2 - t1) / 2); 61 | } 62 | 63 | measureMetricStart({"dram__bytes_read.sum", "dram__bytes_write.sum"}); 64 | scale_kernel<<>>(dA, dB, blocks, spacing); 65 | GPU_ERROR(cudaDeviceSynchronize()); 66 | auto dram_metrics = measureMetricStop(); 67 | 68 | 
measureMetricStart({"lts__t_sectors_srcunit_tex.sum", 69 | "lts__t_sectors_srcunit_ltcfabric.sum", 70 | "lts__t_sectors.sum"}); 71 | scale_kernel<<>>(dA, dB, blocks, spacing); 72 | GPU_ERROR(cudaDeviceSynchronize()); 73 | auto l2_metrics = measureMetricStop(); 74 | 75 | measureMetricStart({"lts__t_tag_requests.sum", 76 | "lts__t_tag_requests.avg.pct_of_peak_sustained_elapsed"}); 77 | 78 | scale_kernel<<>>(dA, dB, blocks, spacing); 79 | GPU_ERROR(cudaDeviceSynchronize()); 80 | auto tag_requests = measureMetricStop(); 81 | 82 | cudaDeviceProp prop; 83 | GPU_ERROR(cudaGetDevice(&deviceId)); 84 | GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId)); 85 | std::string deviceName = prop.name; 86 | int smCount = prop.multiProcessorCount; 87 | int maxActiveBlocks = 0; 88 | GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor( 89 | &maxActiveBlocks, scale_kernel, blockSize, 0)); 90 | 91 | 92 | cout << fixed << setprecision(0) 93 | << maxActiveBlocks << " " 94 | << setw(2) << " " << setw(5) 95 | << blocks << " " << setw(5) 96 | << spacing << " eff: " 97 | << elementCount * sizeof(double) / time.value() * 1e-9 << " GB/s " 98 | << setprecision(0) << setw(8) << dram_metrics[0] / time.value() / 1.0e9 << " GB/s " // 99 | << setprecision(0) << setw(8) << l2_metrics[0]*32 / time.value() / 1.0e9 << " GB/s " // 100 | << setprecision(0) << setw(8) << l2_metrics[1]*32 / time.value() / 1.0e9 << " GB/s " // 101 | << setprecision(0) << setw(8) << l2_metrics[2]*32 / time.value() / 1.0e9 << " GB/s " // 102 | << setprecision(0) << setw(8) << tag_requests[0] / time.value() / 1.41e9 << " /cyc " // 103 | << setprecision(0) << setw(8) << tag_requests[1] << " % "; // 104 | // 105 | cout << " " << setprecision(2) << setw(5) << dram_metrics[0] / (elementCount * sizeof(double)) << " "; 106 | cout << " " << setprecision(2) << setw(5) << l2_metrics[0]*32 / (elementCount * sizeof(double)) << " "; 107 | cout << " " << setprecision(2) << setw(5) << l2_metrics[1]*32 / (elementCount * sizeof(double)) << " 
"; 108 | cout << " " << setprecision(2) << setw(5) << l2_metrics[2]*32 / (elementCount * sizeof(double)) << " "; 109 | cout << " " << setprecision(3) << setw(5) << tag_requests[0] / (elementCount) << " "; 110 | 111 | 112 | cout << std::endl; 113 | } 114 | 115 | int main(int argc, char **argv) { 116 | int maxSpacing = 512 * 1024 * 1024; 117 | size_t bufferSize = elementCount * sizeof(double); 118 | nvmlInit(); 119 | //GPU_ERROR(cudaMalloc(&dA, bufferSize)); 120 | GPU_ERROR(cudaMalloc(&dB, bufferSize)); 121 | 122 | init_kernel<<<256, 400>>>(dB, dB, dB, dB, elementCount ); 123 | //init_kernel<<<256, 400>>>(dA, dA, dA, dA, elementCount * maxSpacing); 124 | GPU_ERROR(cudaDeviceSynchronize()); 125 | 126 | cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); 127 | 128 | for(int blocks = 1; blocks <= 1; blocks *=2) { 129 | for(int spacing = 1; spacing <= maxSpacing; spacing *= 2) { 130 | measureFunc(blocks, spacing); 131 | } 132 | } 133 | 134 | 135 | //GPU_ERROR(cudaFree(dA)); 136 | GPU_ERROR(cudaFree(dB)); 137 | } 138 | -------------------------------------------------------------------------------- /gpu-l2-stream/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import csv 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | 8 | import sys 9 | 10 | sys.path.append("..") 11 | from device_order import * 12 | 13 | 14 | fig, ax = plt.subplots(figsize=(6, 4)) 15 | fig2, ax2 = plt.subplots(figsize=(6, 4)) 16 | 17 | 18 | maxbars = {} 19 | minbars = {} 20 | 21 | devicesToInclude = [ 22 | "a40", 23 | "l40", 24 | "v100", 25 | "a100_80", 26 | "gh200", 27 | "mi210", 28 | "rx6900xt", 29 | "mi300x", 30 | ] 31 | 32 | 33 | for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)): 34 | if not filename.endswith(".txt") or not any( 35 | [True if filename.lower().startswith(f) else False for f in devicesToInclude] 36 | ): 37 | continue 38 | with open(filename, newline="") 
#!/usr/bin/env python3
"""Plot the bandwidth results stored in the *.txt files of this directory.

Two figures are written:
  * gpu-l2-stream.pdf          - per dataset size, the measurement with the
                                 highest median bandwidth (line) together with
                                 its min/max band
  * gpu-l2-stream-scaling.pdf  - median bandwidth over thread count, sampled
                                 at the dataset size closest to SCALING_SIZE
"""

import os
import csv
import matplotlib.pyplot as plt

# Imported explicitly: the tick formatter below refers to `matplotlib.ticker`,
# and `import matplotlib.pyplot as plt` alone does not bind the name
# `matplotlib` in this script.
import matplotlib.ticker
import numpy as np

import sys

sys.path.append("..")
from device_order import *


fig, ax = plt.subplots(figsize=(6, 4))
fig2, ax2 = plt.subplots(figsize=(6, 4))


maxbars = {}
minbars = {}

devicesToInclude = [
    "a40",
    "l40",
    "v100",
    "a100_80",
    "gh200",
    "mi210",
    "rx6900xt",
    "mi300x",
]

# Dataset size (same unit as the size column of the result files) at which the
# scaling plot samples the bandwidth.
SCALING_SIZE = 2000
# Thread counts at or above this value are left out of the scaling plot.
SCALING_THREAD_LIMIT = 400000


def bandwidthNearSize(perSize, target):
    """Return (size, bandwidth) of the entry of perSize closest to target.

    The search starts from size 0 with bandwidth 0, so an entry is only chosen
    when it is strictly closer to target than 0 is; otherwise (0, 0) is
    returned. On ties the entry seen first wins.
    """
    closestSize, bandwidth = 0, 0
    for size, bw in perSize.items():
        if abs(size - target) < abs(closestSize - target):
            closestSize, bandwidth = size, bw
    return closestSize, bandwidth


for filename in sorted(os.listdir("."), key=getOrderNumber):
    isResultFile = filename.endswith(".txt") and any(
        filename.lower().startswith(device) for device in devicesToInclude
    )
    if not isResultFile:
        continue

    # Bandwidth statistics, indexed as data[threads][size].
    miniData = {}
    mediData = {}
    maxiData = {}

    with open(filename, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)

        for row in csvreader:
            # Data rows have at least 18 columns and start with a number;
            # everything else is skipped.
            if len(row) < 18 or not row[0].isnumeric():
                continue

            print(row)
            threads = int(row[2])
            size = int(row[3])

            miniData.setdefault(threads, {})[size] = float(row[10])
            mediData.setdefault(threads, {})[size] = float(row[11])
            maxiData.setdefault(threads, {})[size] = float(row[12])

    # For every dataset size keep the min/median/max triple of the thread
    # count that reached the highest median bandwidth.
    miniBWPerSize = {}
    mediBWPerSize = {}
    maxBWPerSize = {}

    for threads, perSize in mediData.items():
        for size, medi in perSize.items():
            if size not in mediBWPerSize or mediBWPerSize[size] < medi:
                maxBWPerSize[size] = maxiData[threads][size]
                mediBWPerSize[size] = medi
                miniBWPerSize[size] = miniData[threads][size]

    # Hand matplotlib plain lists instead of dict views.
    sizes = list(maxBWPerSize.keys())

    ax.fill_between(
        sizes,
        list(miniBWPerSize.values()),
        list(maxBWPerSize.values()),
        alpha=0.4,
        color=getDeviceColor(filename),
        edgecolor=None,
    )
    ax.plot(
        sizes,
        list(mediBWPerSize.values()),
        color=getDeviceColor(filename),
        label=order[getOrderNumber(filename)].upper(),
    )
    if len(sizes) > 0:
        ax.set_xlim([sizes[0], sizes[-1]])

    # Scaling plot: thread count and bandwidth are collected as pairs, so the
    # thread-limit filter can never shift the values against their x
    # positions (the previous slice-by-length only lined up when all thread
    # counts below the limit came first).
    scalingThreads = []
    scalingBWs = []
    closestSize = 0
    for threads, perSize in mediData.items():
        closestSize, bandwidth = bandwidthNearSize(perSize, SCALING_SIZE)
        if threads < SCALING_THREAD_LIMIT:
            scalingThreads.append(threads)
            scalingBWs.append(bandwidth)

    ax2.plot(
        scalingThreads,
        scalingBWs,
        label=filename[:-4].upper(),
        color=getDeviceColor(filename),
    )

    print(closestSize)

    print(filename, getOrderNumber(filename))


ax.set_xlabel("dataset size, MB")
ax.set_ylabel("Bandwidth, GB/s")

ax.legend()
ax.set_ylim([0, ax.get_ylim()[1]])

ax.set_xscale("log", base=2)
# Values below 1024 are labelled with a "kB" suffix, larger ones are divided
# by 1024 and printed without a unit.
formatter = matplotlib.ticker.FuncFormatter(
    lambda x, pos: "{0:g} kB".format(x) if x < 1024 else "{0:g}".format(x / 1024)
)
ax.get_xaxis().set_major_formatter(formatter)
ax.set_xticks(
    [
        1024,
        2048,
        4096,
        8192,
        20 * 1024,
        40 * 1024,
        96 * 1024,
        256 * 1024,
        512 * 1024,
    ]
)


fig.tight_layout()
fig.savefig("gpu-l2-stream.pdf", dpi=300)


ax2.set_xlabel("threads")
ax2.set_ylabel("Bandwidth, GB/s")


ax2.legend()
ax2.set_xlim([0, 370000])
ax2.set_ylim([0, ax2.get_ylim()[1]])

fig2.tight_layout()
fig2.savefig("gpu-l2-stream-scaling.pdf", dpi=300)


plt.show()
// NOTE(review): the template parameter lists are missing in the extracted
// source. They are restored as <typename T> and <typename T, int N, int M>
// from how T, N and M are used in the bodies; confirm the parameter order
// against the call sites.

/// Writes 1.1 to A[0..N). Each thread starts at its global thread index and
/// advances by the total number of launched threads.
template <typename T> __global__ void initKernel(T *A, size_t N) {
  const size_t first = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
  const size_t step = (size_t)blockDim.x * gridDim.x;
  // The index is size_t like the bound N (it used to be an int).
  for (size_t idx = first; idx < N; idx += step) {
    A[idx] = 1.1;
  }
}

/// Per outer iteration: M accumulators t[m], each updated N / M times with
/// t = t * 0.9 + 0.5, the updates of the M accumulators being interleaved.
/// A value is stored to A[0] only if it exceeds 22313.
template <typename T, int N, int M>
__global__ void FMA_mixed(T p, T *A, int iters) {
#pragma unroll(1)
  for (int iter = 0; iter < iters; iter++) {
    T acc[M];
#pragma unroll
    for (int m = 0; m < M; m++) {
      acc[m] = p + threadIdx.x + iter + m;
    }
#pragma unroll
    for (int n = 0; n < N / M; n++) {
#pragma unroll
      for (int m = 0; m < M; m++) {
        acc[m] = acc[m] * (T)0.9 + (T)0.5;
      }
    }
#pragma unroll
    for (int m = 0; m < M; m++) {
      if (acc[m] > (T)22313.0) {
        A[0] = acc[m];
      }
    }
  }
}

/// Per outer iteration: M chains, processed one after the other, each
/// applying t = t * 0.9 + 0.5 N times. A value is stored to A[0] only if it
/// exceeds 22313.
template <typename T, int N, int M>
__global__ void FMA_separated(T p, T *A, int iters) {

  for (int iter = 0; iter < iters; iter++) {
#pragma unroll
    for (int m = 0; m < M; m++) {
      T t = p + threadIdx.x + iter + m;
      for (int n = 0; n < N; n++) {
        t = t * (T)0.9 + (T)0.5;
      }
      if (t > (T)22313.0) {
        A[0] = t;
      }
    }
  }
}

/// Per outer iteration: M chains of N dependent divisions
/// t = 0.1 / (t + 0.2); each chain result is stored to A[threadIdx.x + iter].
template <typename T, int N, int M>
__global__ void DIV_separated(T p, T *A, int iters) {

#pragma unroll(1)
  for (int iter = 0; iter < iters; iter++) {
    for (int m = 0; m < M; m++) {
      T t = p + threadIdx.x + iter + m;

      for (int n = 0; n < N; n++) {
        // Constants are cast to T, as in the FMA kernels. With bare double
        // literals a float instantiation would evaluate the chain in double
        // precision.
        t = (T)0.1 / (t + (T)0.2);
      }

      A[threadIdx.x + iter] = t;
    }
  }
}

/// Per outer iteration: M chains of N dependent square roots
/// t = sqrt(t + 0.2); each chain result is stored to A[threadIdx.x + iter].
template <typename T, int N, int M>
__global__ void SQRT_separated(T p, T *A, int iters) {

#pragma unroll(1)
  for (int iter = 0; iter < iters; iter++) {

    for (int m = 0; m < M; m++) {
      T t = p + threadIdx.x + iter + m;

      for (int n = 0; n < N; n++) {
        // Constant cast to T so that the sqrt overload for T is selected
        // instead of promoting the argument to double.
        t = sqrt(t + (T)0.2);
      }

      A[threadIdx.x + iter] = t;
    }
  }
}
// GPU clock as returned by getGPUClock(); presumably in MHz, since measure()
// scales it by 1.0e6 -- confirm against gpu-clock.cuh.
unsigned int gpu_clock = 0;

// NOTE(review): all template argument lists and the <<<...>>> launch
// configurations of `kernel` are missing in the extracted source. They are
// restored from the variables that are otherwise unused (blockCount,
// blockSize) and from the use of T, N and M; confirm against the original.

/**
 * Runs `kernel` in a single block of 32 * warpCount threads and converts the
 * measured run time into clock cycles per N * iters * warpCount.
 *
 * @param warpCount number of warps in the block
 * @param kernel    kernel taking (start value, output buffer, iterations)
 * @return time.value() * gpu_clock * 1.0e6 / N / iters / warpCount
 */
template <typename T, int N, int M>
double measure(int warpCount, void (*kernel)(T, T *, int)) {
  const int iters = 10000;
  const int blockSize = 32 * warpCount;
  const int blockCount = 1;
  const size_t bufferLen = (size_t)iters * 2;

  MeasurementSeries time;

  T *dA;
  GPU_ERROR(cudaMalloc(&dA, bufferLen * sizeof(T)));
  initKernel<<<52, 256>>>(dA, bufferLen);
  GPU_ERROR(cudaDeviceSynchronize());

  // One untimed launch before the timed one.
  kernel<<<blockCount, blockSize>>>((T)0.32, dA, iters);
  GPU_ERROR(cudaDeviceSynchronize());

  for (int i = 0; i < 1; i++) {
    const double t1 = dtime();
    kernel<<<blockCount, blockSize>>>((T)0.32, dA, iters);
    GPU_ERROR(cudaDeviceSynchronize());
    const double t2 = dtime();
    time.add(t2 - t1);
  }
  // Checked like every other CUDA call in this function.
  GPU_ERROR(cudaFree(dA));

  // The NVML device handle that used to be fetched at the top was never
  // used and has been dropped.
  return time.value() * gpu_clock * 1.0e6 / N / iters / warpCount;
}

/// Measures the FMA, DIV and SQRT kernels with M streams for one warp count
/// and stores the results in r[0], r[1] and r[2] under {warpCount, M}.
// NOTE(review): in the extracted source the DIV/SQRT statements wrap onto
// two lines where the FMA one does not, so their stripped argument lists were
// probably longer than <T, N, M> (e.g. a scaled N) -- confirm.
template <typename T, int N, int M>
static void measureStreams(vector<map<pair<int, int>, double>> &r,
                           int warpCount) {
  r[0][{warpCount, M}] = measure<T, N, M>(warpCount, FMA_mixed<T, N, M>);
  r[1][{warpCount, M}] = measure<T, N, M>(warpCount, DIV_separated<T, N, M>);
  r[2][{warpCount, M}] = measure<T, N, M>(warpCount, SQRT_separated<T, N, M>);
}

/**
 * Measures all three kernels for warp counts 1, 2, 4, ... up to maxWarpCount
 * and 1, 2, 4 and 8 streams, then prints one table per kernel: one row per
 * warp count, one column per stream count, and a blank line after each table.
 */
template <typename T> void measureTabular(int maxWarpCount) {

  vector<map<pair<int, int>, double>> r(3);
  const int N = 1024;
  for (int warpCount = 1; warpCount <= maxWarpCount; warpCount *= 2) {
    measureStreams<T, N, 1>(r, warpCount);
    measureStreams<T, N, 2>(r, warpCount);
    measureStreams<T, N, 4>(r, warpCount);
    measureStreams<T, N, 8>(r, warpCount);
  }

  for (int i = 0; i < 3; i++) {
    for (int warpCount = 1; warpCount <= maxWarpCount; warpCount *= 2) {
      for (int streams = 1; streams <= 8; streams *= 2) {
        cout << setw(7) << setprecision(3) << r[i][{warpCount, streams}] << " ";
      }
      cout << "\n";
    }
    cout << "\n";
  }
}

int main(int argc, char **argv) {
  gpu_clock = getGPUClock();
  // NOTE(review): the element types of the two calls were stripped from the
  // extracted source; float and double are assumed.
  measureTabular<float>(32);
  measureTabular<double>(32);
}
// NOTE(review): the template parameter lists are missing in the extracted
// source. They are restored from the names used in the bodies (T, N, M,
// BLOCKSIZE); confirm the parameter order against the call sites in main().

/// Sets data[idx] = idx for idx in [0, data_len). Each thread starts at its
/// global thread index and advances by the total number of launched threads.
template <typename T> __global__ void initKernel(T *data, size_t data_len) {
  // 64-bit index: data_len is a size_t and main() computes it as
  // blockCount * BLOCKSIZE * M, which can exceed what an int can hold.
  const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  const size_t step = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = first; idx < data_len; idx += step) {
    data[idx] = idx;
  }
}

/**
 * Each block reads BLOCKSIZE * M elements from dA and from dB, two elements
 * per thread and outer step. Every loaded pair starts a value v = a - b,
 * which is then updated N times with v = v * a - b. Thread 0 of every block
 * stores its sum to dC[blockIdx.x].
 */
template <typename T, int N, int M, int BLOCKSIZE>
__global__ void testfun(T *const __restrict__ dA, T *const __restrict__ dB,
                        T *dC) {
  // size_t so that the block offset is not computed in 32-bit arithmetic.
  const size_t blockOffset = (size_t)blockIdx.x * BLOCKSIZE * M;
  T *sA = dA + threadIdx.x + blockOffset;
  T *sB = dB + threadIdx.x + blockOffset;

  T sum = 0;

  //#pragma unroll 1
  for (int i = 0; i < M; i += 2) {
    T a = sA[i * BLOCKSIZE];
    T b = sB[i * BLOCKSIZE];
    T v = a - b;
    T a2 = sA[(i + 1) * BLOCKSIZE];
    T b2 = sB[(i + 1) * BLOCKSIZE];
    T v2 = a2 - b2;
    //#pragma unroll N
    // The counter was renamed from `i`, which shadowed the outer loop index.
    for (int n = 0; n < N; n++) {
      v = v * a - b;
      // NOTE(review): the second chain also multiplies by a and subtracts b,
      // whereas testfun_max_power uses a2 and b2 -- confirm this is intended.
      v2 = v2 * a - b;
    }
    sum += v + v2;
  }
  if (threadIdx.x == 0)
    dC[blockIdx.x] = sum;
}
// NOTE(review): template parameter lists, template arguments and the
// <<<...>>> launch configurations are missing in the extracted source. They
// are restored from the names used in the code (dtype, N, M, BLOCKSIZE,
// blockCount); confirm against the original.

/**
 * Variant of testfun in which two consecutive blocks read the same data
 * (the block offset uses blockIdx.x / 2) and the second chain uses its own
 * operands a2 and b2. Thread 0 of every block stores its sum to
 * dC[blockIdx.x].
 */
template <typename T, int N, int M, int BLOCKSIZE>
__global__ void testfun_max_power(T *const __restrict__ dA,
                                  T *const __restrict__ dB, T *dC) {
  // size_t so that the block offset is not computed in 32-bit arithmetic.
  const size_t blockOffset = (size_t)(blockIdx.x / 2) * BLOCKSIZE * M;
  T *sA = dA + threadIdx.x + blockOffset;
  T *sB = dB + threadIdx.x + blockOffset;

  T sum = 0;

  // #pragma unroll 1
  for (int i = 0; i < M; i += 2) {
    T a = sA[i * BLOCKSIZE];
    T b = sB[i * BLOCKSIZE];
    T v = a - b;
    T a2 = sA[(i + 1) * BLOCKSIZE];
    T b2 = sB[(i + 1) * BLOCKSIZE];
    T v2 = a2 - b2;
    // The counter was renamed from `i`, which shadowed the outer loop index.
    for (int n = 0; n < N; n++) {
      v = v * a - b;
      v2 = v2 * a2 - b2;
    }
    sum += v + v2;
  }
  if (threadIdx.x == 0)
    dC[blockIdx.x] = sum;
}

/**
 * Runs testfun on every CUDA device at once, one OpenMP thread per device.
 * While the kernels run, each thread samples getGPUStats() about once per
 * millisecond; afterwards the threads print one result line each, in thread
 * order.
 */
int main(int argc, char **argv) {

  typedef float dtype;
  const int M = 4000;
  // PARN is a constant from the Makefile, set via -DPARN=X
  const int N = PARN;
  const int BLOCKSIZE = 256;

  int nDevices;
  GPU_ERROR(cudaGetDeviceCount(&nDevices));

#pragma omp parallel num_threads(nDevices)
  {
    GPU_ERROR(cudaSetDevice(omp_get_thread_num()));
#pragma omp barrier
    int deviceId;
    GPU_ERROR(cudaGetDevice(&deviceId));
    cudaDeviceProp prop;
    GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId));

    // Launch as many blocks as the occupancy query reports for the device.
    int numBlocks;
    GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks, testfun<dtype, N, M, BLOCKSIZE>, BLOCKSIZE, 0));
    int blockCount = prop.multiProcessorCount * numBlocks;

    size_t data_len = (size_t)blockCount * BLOCKSIZE * M;
    dtype *dA = NULL;
    dtype *dB = NULL;
    dtype *dC = NULL;
    size_t iters = 1000;

    GPU_ERROR(cudaMalloc(&dA, data_len * sizeof(dtype)));
    GPU_ERROR(cudaMalloc(&dB, data_len * sizeof(dtype)));
    GPU_ERROR(cudaMalloc(&dC, data_len * sizeof(dtype)));
#pragma omp barrier
    initKernel<<<blockCount, BLOCKSIZE>>>(dA, data_len);
    initKernel<<<blockCount, BLOCKSIZE>>>(dB, data_len);
    initKernel<<<blockCount, BLOCKSIZE>>>(dC, data_len);
    GPU_ERROR(cudaDeviceSynchronize());

#pragma omp barrier

    cudaEvent_t start, stop;
    GPU_ERROR(cudaEventCreate(&start));
    GPU_ERROR(cudaEventCreate(&stop));

    GPU_ERROR(cudaEventRecord(start));
    for (size_t iter = 0; iter < iters; iter++) {
      testfun<dtype, N, M, BLOCKSIZE><<<blockCount, BLOCKSIZE>>>(dA, dB, dC);
    }
    GPU_ERROR(cudaEventRecord(stop));

    MeasurementSeries powerSeries;
    MeasurementSeries clockSeries;
    MeasurementSeries temperatureSeries;

    // Sample the device statistics until the stop event has completed.
    do {
      usleep(1000);
      auto stats = getGPUStats(deviceId);
      powerSeries.add(stats.power);
      clockSeries.add(stats.clock);
      temperatureSeries.add(stats.temperature);
    } while (cudaEventQuery(stop) == cudaErrorNotReady);

    GPU_ERROR(cudaEventSynchronize(stop));
    GPU_ERROR(cudaGetLastError());

    float milliseconds;
    GPU_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
    float dt = milliseconds / 1000;

    // The events were created above and are no longer needed.
    GPU_ERROR(cudaEventDestroy(start));
    GPU_ERROR(cudaEventDestroy(stop));

#pragma omp barrier
#pragma omp for ordered schedule(static, 1)
    for (int i = 0; i < omp_get_num_threads(); i++) {
#pragma omp ordered
      {
        cout << setprecision(3) << fixed << deviceId << " " << blockCount
             << " blocks " << setw(3) << N << " its "
             << (2.0 + N * 2.0) / (2.0 * sizeof(dtype)) << " Fl/B "
             << setprecision(0) << setw(5)
             << iters * 2 * data_len * sizeof(dtype) / dt * 1.0e-9
             << " GB/s " << setw(6)
             << iters * (2 + N * 2) * data_len / dt * 1.0e-9 << " GF/s "
             << clockSeries.median() << " Mhz "
             << powerSeries.median() / 1000 << " W "
             << temperatureSeries.median() << "°C\n";
      }
    }
    GPU_ERROR(cudaFree(dA));
    GPU_ERROR(cudaFree(dB));
    GPU_ERROR(cudaFree(dC));
  }
  cout << "\n";
}
// Device buffers shared by measure() and main().
double *dA, *dB;

using kernel_ptr_type = void (*)(int iters, double *A, const double *B);

// NOTE(review): the template parameter list is missing in the extracted
// source. It is restored from the names used in the body (N, UNROLL,
// DOTPRODUCT); confirm the order against the call in measure().

/**
 * Each thread first runs a short summation loop whose trip count,
 * (warp index % 5) * 11, depends on its warp, and then walks `iters` times
 * over N elements of A (and of B when DOTPRODUCT is set) in steps of 32
 * doubles, accumulating into `sum`. A is written only if sum equals -12.3.
 */
template <int N, int UNROLL, bool DOTPRODUCT>
__global__ __launch_bounds__(1024, 1) void kernel(int iters, double *A,
                                                  double *B) {

  const int widx = threadIdx.x / 32;
  double sum = 0.0;
#pragma unroll(1)
  for (int w = 0; w < (widx % 5) * 11; w++) {
    sum += w;
  }

  // Per-thread base pointers. They used to be called dA and dB, which
  // shadowed the file-level buffers of the same name.
  double *threadA = A + threadIdx.x;
  double *threadB = B + threadIdx.x;

#pragma unroll(1)
  for (int iter = 0; iter < iters; iter++) {
#pragma unroll(UNROLL)
    for (int n = 0; n < N; n++) {
      if (DOTPRODUCT)
        sum += threadA[n * 32] * threadB[n * 32];
      else
        sum += threadA[n * 32];
    }
  }

  if (sum == -12.3) {
    A[threadIdx.x] = sum;
  }
}
/**
 * Evaluates a small analytical cycle model and prints its components.
 *
 * The estimate is
 *   Ttotal = Tint + max(max(Tld, TL1lat) + Tdp, TL1thru)
 * with Tint = 2 * Iint, Tld = 4 * Ild, Tdp = 4 * Idp. TL1lat starts at 32 and
 * TL1thru at ClL1 * Nsm; both are then refined twice using the ratio
 * TL1thru / Ttotal of the previous estimate.
 *
 * Prints Tdp, Tld and Tint (width 5 each) followed by a label naming the
 * term that determines the maximum.
 *
 * NOTE(review): going by their names the parameters are presumably
 * instruction counts (Iint, Ild, Idp), a warp count (Nsm) and L1 cycles per
 * warp (ClL1) -- confirm with the author.
 *
 * @return the final Ttotal
 */
double pred(int Iint, int Ild, int Idp, int Nsm, int ClL1) {
  const int Tdp = Idp * 4;
  const int Tld = Ild * 4;
  const int Tint = Iint * 2;
  const int l1Demand = ClL1 * Nsm;

  int TL1lat = 32;
  int TL1thru = l1Demand;
  int Ttotal = Tint + std::max(std::max(Tld, TL1lat) + Tdp, TL1thru);

  std::cout << std::setw(5) << Tdp << " ";
  std::cout << std::setw(5) << Tld << " ";
  std::cout << std::setw(5) << Tint << " ";

  // Two refinement passes. Both updates of a pass use the TL1thru / Ttotal
  // ratio of the previous pass; the results are truncated to int as before.
  for (int pass = 0; pass < 2; pass++) {
    const double l1Share = (double)TL1thru / Ttotal;
    TL1lat = static_cast<int>(32 + l1Share * 16);
    TL1thru = static_cast<int>(l1Demand * (1.0f + l1Share) * 0.5f);
    Ttotal = Tint + std::max(std::max(Tld, TL1lat) + Tdp, TL1thru);
  }

  const int latencyPath = std::max(Tld, TL1lat) + Tdp;

  std::string cont = "Tint + ";
  // Strict comparison first: with the former ">=" the equality branch below
  // could never be reached.
  if (TL1thru > latencyPath) {
    cont += " TL1thru ";
  } else if (TL1thru == latencyPath) {
    cont += " | ";
  } else {
    cont += "( ";
    if (Tld > TL1lat) {
      cont += "Tld";
    } else if (Tld == TL1lat) {
      cont += "TL1lat|Tld";
    } else {
      cont += "TL1lat";
    }
    cont += " + Tdp)";
  }

  std::cout << cont << " ";

  return Ttotal;
}
// NOTE(review): the template parameter list of measure(), the template
// arguments of `kernel` and its <<<...>>> launch configuration are missing in
// the extracted source. They are restored from the names used in the code
// and from the measure<DV, UNROLL, DOTPRODUCT> calls in main(); confirm
// against the original.

/**
 * Times kernel<DV / 32, UNROLL, DOTPRODUCT> in a single block of `blockSize`
 * threads (20 runs, minimum time is used) and prints the result.
 *
 * @param blockSize threads in the one launched block
 * @param concise   if true print only the cycle figure, otherwise print
 *                  UNROLL, time in ms, spread in %, bandwidth, cycles and the
 *                  prediction of pred() divided by UNROLL
 */
template <int DV, int UNROLL, bool DOTPRODUCT>
void measure(int blockSize, bool concise = false) {

  // Sanity messages only; the measurement runs regardless.
  if (DV % (32 * UNROLL) != 0)
    cout << DV << " % " << 32 * UNROLL << " != 0\n";

  if (DV * 8 * 2 > 128 * 1024)
    cout << DV * 8 * 2 << " > " << 128 * 1024 << "\n";

  if (DV * 8 * 2 < 64 * 1024)
    cout << DV * 8 * 2 << " < " << 64 * 1024 << "\n";

  int blockCount = 1;
  const int N = DV / 32;
  int iters = 100000 / N;

  GPU_ERROR(cudaFuncSetCacheConfig(kernel<N, UNROLL, DOTPRODUCT>,
                                   cudaFuncCachePreferL1));

  MeasurementSeries time;
  for (int i = 0; i < 20; i++) {

    GPU_ERROR(cudaDeviceSynchronize());
    double t1 = dtime();

    kernel<N, UNROLL, DOTPRODUCT><<<blockCount, blockSize>>>(iters, dA, dB);

    GPU_ERROR(cudaDeviceSynchronize());
    double t2 = dtime();
    time.add(t2 - t1);
  }
  GPU_ERROR(cudaGetLastError());

  double spread = (time.median() - time.minValue()) / time.median() * 100;
  double dt = time.minValue();
  double bw = (DOTPRODUCT ? 2 : 1) * DV * iters * sizeof(double) / dt / 1e9;
  // Converts time to cycles with a fixed clock of 1.38e9 Hz.
  double cyc = dt / (DV * iters) * 1.38e9 * 32;

  if (concise) {
    cout << fixed << setprecision(2) << setw(7) << cyc << " ";
  } else {

    cout << fixed << setprecision(2);
    cout << setw(3) << UNROLL << " "      //
         << setw(8) << dt * 1000 << " "   //
         << setw(8) << spread << " "      //
         << setw(8) << bw << " "          //
         << setw(8) << cyc << " -- ";

    int Iint = 10;
    int Ild = UNROLL * (DOTPRODUCT ? 2 : 1);
    int Idp = UNROLL;
    int ClL1 = Ild * 2;
    int Nsm = max(1, blockSize / 32);

    // Arguments in the order of pred(Iint, Ild, Idp, Nsm, ClL1). The call
    // used to pass ClL1 and Nsm the other way round, which went unnoticed
    // because pred() only uses their product.
    cout << setw(5) << pred(Iint, Ild, Idp, Nsm, ClL1) / UNROLL << " ";

    cout << "\n";
  }
}

/// Allocates and fills the two managed buffers, then runs the measurement
/// series for block sizes 32, 64, ..., 1024.
int main(int argc, char **argv) {

  size_t maxBufferSize = 1024 * 1024;
  GPU_ERROR(cudaMallocManaged(&dA, sizeof(double) * maxBufferSize));
  GPU_ERROR(cudaMallocManaged(&dB, sizeof(double) * maxBufferSize));
  for (size_t i = 0; i < maxBufferSize; i++) {
    dA[i] = 1.2;
    dB[i] = 1.21;
  }

  bool concise = false;
  const bool dotProduct = false;
  for (int blockSize = 32; blockSize <= 1024; blockSize *= 2) {
    measure<8 * 512, 1, dotProduct>(blockSize, concise);
    measure<8 * 512, 2, dotProduct>(blockSize, concise);
    measure<3 * 2048, 3, dotProduct>(blockSize, concise);
    measure<8 * 512, 4, dotProduct>(blockSize, concise);
    measure<6 * 1024, 6, dotProduct>(blockSize, concise);
    measure<8 * 512, 8, dotProduct>(blockSize, concise);
    measure<9 * 512, 9, dotProduct>(blockSize, concise);
    measure<3 * 2048, 12, dotProduct>(blockSize, concise);
    measure<8 * 512, 16, dotProduct>(blockSize, concise);
    measure<9 * 512, 18, dotProduct>(blockSize, concise);
    measure<6 * 1024, 24, dotProduct>(blockSize, concise);
    measure<27 * 256, 27, dotProduct>(blockSize, concise);
    measure<8 * 512, 32, dotProduct>(blockSize, concise);
    cout << "\n";
  }
  GPU_ERROR(cudaFree(dA));
  GPU_ERROR(cudaFree(dB));
  return 0;
}
#!/usr/bin/env python3
"""Plot the stream benchmark results stored in the *.txt files of this directory.

Draws the "scale" bandwidth over the thread count for every selected device
(cuda-stream.svg / cuda-stream.pdf) and collects, per result file, the
read/scale/triad/init bandwidths of the last row (maxbars) and of the row
whose thread count is closest to 10000 (minbars).
"""

import os
import csv
import matplotlib.pyplot as plt

# Imported explicitly: the tick formatter below refers to `matplotlib.ticker`,
# and `import matplotlib.pyplot as plt` alone does not bind the name
# `matplotlib` in this script.
import matplotlib.ticker
import numpy as np

import sys

sys.path.append("..")
from device_order import *

fig, ax = plt.subplots(figsize=(6, 4))


maxbars = {}
minbars = {}

devicesToInclude = [
    "a40",
    "l40",
    "v100",
    "a100_80",
    "gh200",
    "mi210",
    "rx6900xt",
    "mi300x",
    # "mi300a",
]


for filename in sorted(os.listdir("."), key=getOrderNumber):
    isResultFile = filename.endswith(".txt") and any(
        filename.lower().startswith(device) for device in devicesToInclude
    )
    if not isResultFile:
        continue

    threads = []
    init = []
    read = []
    scale = []
    triad = []

    with open(filename, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)

        for row in csvreader:
            # Length check first: csv.reader yields an empty list for a blank
            # line, and indexing row[0] on it raised an IndexError.
            if len(row) < 12 or row[0].startswith("block"):
                continue

            threads.append(int(row[1]))
            init.append(float(row[6]))
            read.append(float(row[7]))
            scale.append(float(row[8]))
            triad.append(float(row[9]))

    if len(threads) < 1:
        continue

    ax.plot(
        np.array(threads),
        scale,
        label=order[getOrderNumber(filename)].upper(),
        color=getDeviceColor(filename),
        **lineStyle
    )
    print(filename, getOrderNumber(filename))

    # Bandwidths of the last row of the file.
    maxbars[filename] = [
        read[-1],
        scale[-1],
        triad[-1],
        init[-1],
    ]

    # Row whose thread count is closest to 10000; the first such row wins on
    # ties.
    mClosest = min(range(len(threads)), key=lambda m: abs(threads[m] - 10000))

    print(threads[mClosest])
    minbars[filename] = [
        read[mClosest],
        scale[mClosest],
        triad[mClosest],
        init[mClosest],
    ]


ax.set_xlabel("threads")
ax.set_ylabel("DRAM bandwidth, GB/s")

ax.legend()

ax.set_ylim([0, ax.get_ylim()[1]])
ax.set_xlim([0, 400000])

# Show thread counts in thousands, e.g. 100000 -> "100K".
formatter = matplotlib.ticker.FuncFormatter(lambda x, pos: "{:.0f}K".format(x // 1000))
ax.get_xaxis().set_major_formatter(formatter)

fig.tight_layout(pad=0)
fig.savefig("cuda-stream.svg", dpi=300)
fig.savefig("cuda-stream.pdf", dpi=300)


plt.show()

print(maxbars)
def plotXbars(xbars, filename):
    """Draw a grouped bar chart of xbars and save it to filename.

    xbars maps a result file name to a list of bandwidth values; one group of
    bars is drawn per key and one bar per list position, labelled in the order
    read, scale, triad, init, 1D3PT, 1D5PT. Nothing is drawn or saved when
    xbars is empty.
    """
    # Previously an empty dict failed on the `[0]` lookup below.
    if not xbars:
        return

    fig2, ax2 = plt.subplots(figsize=(6, 3))

    barLabels = ["read", "scale", "triad", "init", "1D3PT", "1D5PT"]
    valueCount = len(list(xbars.values())[0])
    barWidth = 0.8 / valueCount

    for m in range(valueCount):
        ax2.bar(
            # Centre the valueCount bars of a group around its integer position.
            np.arange(len(xbars)) + barWidth * (m + 0.5 - valueCount / 2),
            [values[m] for values in xbars.values()],
            width=barWidth,
            color=device_color_palette[m],
            label=barLabels[m],
        )

    # Tick labels come from the dict that is actually plotted; the previous
    # version read the global `maxbars` here regardless of the argument.
    names = list(xbars.keys())
    print(names)
    ax2.set_xticks(range(len(names)))
    ax2.set_xticklabels([order[getOrderNumber(f)].upper() for f in names])
    ax2.set_ylabel("DRAM Bandwidth, GB/s")
    fig2.autofmt_xdate()
    ax2.legend()
    fig2.tight_layout(pad=0)
    fig2.savefig(filename, dpi=300)
    plt.show()


plotXbars(maxbars, "maxbars.pdf")
plotXbars(minbars, "minbars.pdf")
| 24 12 | 28 13 | 32 14 | 36 15 | 40 16 | 44 17 | 48 18 | 54 19 | 60 20 | 66 21 | 72 22 | 80 23 | 88 24 | 96 25 | 106 26 | 116 27 | 126 28 | 138 29 | 150 30 | 164 31 | 178 32 | 194 33 | 212 34 | 230 35 | 250 36 | 272 37 | 296 38 | 322 39 | 350 40 | 380 41 | 412 42 | 448 43 | 486 44 | 528 45 | 574 46 | 622 47 | 674 48 | 732 49 | 794 50 | 862 51 | 934 52 | 1012 53 | 1 54 | -- Finished Building -- 55 | 0 2432 blocks 0 its 0.250 Fl/B 4073 GB/s 1018 GF/s 1254 Mhz 750 W 61°C 56 | 57 | 0 2432 blocks 1 its 0.500 Fl/B 4006 GB/s 2003 GF/s 1248 Mhz 750 W 63°C 58 | 59 | 0 2432 blocks 2 its 0.750 Fl/B 3986 GB/s 2990 GF/s 1243 Mhz 750 W 64°C 60 | 61 | 0 2432 blocks 4 its 1.250 Fl/B 3889 GB/s 4861 GF/s 1243 Mhz 750 W 66°C 62 | 63 | 0 2432 blocks 6 its 1.750 Fl/B 3896 GB/s 6818 GF/s 1249 Mhz 750 W 67°C 64 | 65 | 0 2432 blocks 8 its 2.250 Fl/B 3860 GB/s 8684 GF/s 1255 Mhz 750 W 66°C 66 | 67 | 0 2432 blocks 10 its 2.750 Fl/B 3840 GB/s 10559 GF/s 1266 Mhz 750 W 66°C 68 | 69 | 0 2432 blocks 12 its 3.250 Fl/B 3787 GB/s 12307 GF/s 1251 Mhz 750 W 67°C 70 | 71 | 0 2432 blocks 14 its 3.750 Fl/B 3762 GB/s 14109 GF/s 1251 Mhz 750 W 67°C 72 | 73 | 0 2432 blocks 16 its 4.250 Fl/B 3743 GB/s 15909 GF/s 1204 Mhz 750 W 68°C 74 | 75 | 0 2432 blocks 18 its 4.750 Fl/B 3731 GB/s 17723 GF/s 1144 Mhz 750 W 69°C 76 | 77 | 0 2432 blocks 20 its 5.250 Fl/B 3710 GB/s 19475 GF/s 1093 Mhz 750 W 70°C 78 | 79 | 0 2432 blocks 22 its 5.750 Fl/B 3686 GB/s 21193 GF/s 1047 Mhz 750 W 70°C 80 | 81 | 0 2432 blocks 24 its 6.250 Fl/B 3646 GB/s 22788 GF/s 1056 Mhz 750 W 70°C 82 | 83 | 0 2432 blocks 28 its 7.250 Fl/B 3582 GB/s 25970 GF/s 1061 Mhz 750 W 70°C 84 | 85 | 0 2432 blocks 32 its 8.250 Fl/B 3571 GB/s 29461 GF/s 1046 Mhz 750 W 71°C 86 | 87 | 0 2432 blocks 36 its 9.250 Fl/B 3515 GB/s 32512 GF/s 1041 Mhz 750 W 72°C 88 | 89 | 0 2432 blocks 40 its 10.250 Fl/B 3464 GB/s 35506 GF/s 1030 Mhz 750 W 72°C 90 | 91 | 0 2432 blocks 44 its 11.250 Fl/B 3398 GB/s 38226 GF/s 1010 Mhz 750 W 73°C 92 | 93 | 0 2432 blocks 48 its 12.250 
Fl/B 3342 GB/s 40940 GF/s 989 Mhz 750 W 73°C 94 | 95 | 0 2432 blocks 54 its 13.750 Fl/B 3255 GB/s 44756 GF/s 980 Mhz 749 W 74°C 96 | 97 | 0 2432 blocks 60 its 15.250 Fl/B 3137 GB/s 47841 GF/s 914 Mhz 750 W 74°C 98 | 99 | 0 2432 blocks 66 its 16.750 Fl/B 3019 GB/s 50574 GF/s 943 Mhz 750 W 75°C 100 | 101 | 0 2432 blocks 72 its 18.250 Fl/B 2911 GB/s 53125 GF/s 975 Mhz 749 W 76°C 102 | 103 | 0 2432 blocks 80 its 20.250 Fl/B 2834 GB/s 57384 GF/s 981 Mhz 750 W 76°C 104 | 105 | 0 2432 blocks 88 its 22.250 Fl/B 2656 GB/s 59103 GF/s 1000 Mhz 750 W 77°C 106 | 107 | 0 2432 blocks 96 its 24.250 Fl/B 2585 GB/s 62680 GF/s 1009 Mhz 750 W 77°C 108 | 109 | 0 2432 blocks 106 its 26.750 Fl/B 1993 GB/s 53307 GF/s 1051 Mhz 750 W 79°C 110 | 111 | 0 2432 blocks 116 its 29.250 Fl/B 2097 GB/s 61330 GF/s 1037 Mhz 750 W 79°C 112 | 113 | 0 2432 blocks 126 its 31.750 Fl/B 1989 GB/s 63135 GF/s 1055 Mhz 750 W 79°C 114 | 115 | 0 2432 blocks 138 its 34.750 Fl/B 1851 GB/s 64308 GF/s 1052 Mhz 750 W 79°C 116 | 117 | 0 2432 blocks 150 its 37.750 Fl/B 1704 GB/s 64325 GF/s 1058 Mhz 751 W 80°C 118 | 119 | 0 2432 blocks 164 its 41.250 Fl/B 1634 GB/s 67390 GF/s 1073 Mhz 751 W 80°C 120 | 121 | 0 2432 blocks 178 its 44.750 Fl/B 1362 GB/s 60939 GF/s 1156 Mhz 751 W 82°C 122 | 123 | 0 2432 blocks 194 its 48.750 Fl/B 1245 GB/s 60712 GF/s 1140 Mhz 750 W 82°C 124 | 125 | 0 2432 blocks 212 its 53.250 Fl/B 1321 GB/s 70324 GF/s 1187 Mhz 751 W 81°C 126 | 127 | 0 2432 blocks 230 its 57.750 Fl/B 1334 GB/s 77051 GF/s 1202 Mhz 751 W 81°C 128 | 129 | 0 2432 blocks 250 its 62.750 Fl/B 1206 GB/s 75647 GF/s 1203 Mhz 751 W 81°C 130 | 131 | 0 2432 blocks 272 its 68.250 Fl/B 1162 GB/s 79328 GF/s 1228 Mhz 751 W 81°C 132 | 133 | 0 2432 blocks 296 its 74.250 Fl/B 1085 GB/s 80589 GF/s 1241 Mhz 751 W 81°C 134 | 135 | 0 2432 blocks 322 its 80.750 Fl/B 1018 GB/s 82230 GF/s 1259 Mhz 751 W 80°C 136 | 137 | 0 2432 blocks 350 its 87.750 Fl/B 949 GB/s 83258 GF/s 1272 Mhz 751 W 81°C 138 | 139 | 0 2432 blocks 380 its 95.250 Fl/B 878 GB/s 
83621 GF/s 1273 Mhz 751 W 82°C 140 | 141 | 0 2432 blocks 412 its 103.250 Fl/B 767 GB/s 79183 GF/s 1278 Mhz 745 W 81°C 142 | 143 | 0 2432 blocks 448 its 112.250 Fl/B 764 GB/s 85721 GF/s 1292 Mhz 750 W 80°C 144 | 145 | 0 2432 blocks 486 its 121.750 Fl/B 710 GB/s 86456 GF/s 1305 Mhz 748 W 79°C 146 | 147 | 0 2432 blocks 528 its 132.250 Fl/B 667 GB/s 88246 GF/s 1312 Mhz 750 W 81°C 148 | 149 | 0 2432 blocks 574 its 143.750 Fl/B 617 GB/s 88723 GF/s 1320 Mhz 748 W 82°C 150 | 151 | 0 2432 blocks 622 its 155.750 Fl/B 508 GB/s 79101 GF/s 1427 Mhz 733 W 82°C 152 | 153 | 0 2432 blocks 674 its 168.750 Fl/B 473 GB/s 79819 GF/s 1439 Mhz 731 W 82°C 154 | 155 | 0 2432 blocks 732 its 183.250 Fl/B 496 GB/s 90942 GF/s 1366 Mhz 745 W 82°C 156 | 157 | 0 2432 blocks 794 its 198.750 Fl/B 410 GB/s 81524 GF/s 1464 Mhz 730 W 82°C 158 | 159 | 0 2432 blocks 862 its 215.750 Fl/B 382 GB/s 82440 GF/s 1476 Mhz 730 W 81°C 160 | 161 | 0 2432 blocks 934 its 233.750 Fl/B 355 GB/s 83022 GF/s 1486 Mhz 728 W 81°C 162 | 163 | 0 2432 blocks 1012 its 253.250 Fl/B 381 GB/s 96524 GF/s 1401 Mhz 747 W 82°C 164 | 165 | -------------------------------------------------------------------------------- /gpu-roofline/h200.txt: -------------------------------------------------------------------------------- 1 | 8 2 | 16 3 | 24 4 | 32 5 | 40 6 | 48 7 | 56 8 | 64 9 | 72 10 | 80 11 | 88 12 | 96 13 | 104 14 | 112 15 | 120 16 | 128 17 | 136 18 | 144 19 | 152 20 | 160 21 | 168 22 | 176 23 | 184 24 | 192 25 | 200 26 | 208 27 | 216 28 | 224 29 | 232 30 | 240 31 | 248 32 | 256 33 | 264 34 | 272 35 | 280 36 | 288 37 | 296 38 | 304 39 | 312 40 | 320 41 | 328 42 | 336 43 | 344 44 | 352 45 | 360 46 | 368 47 | 376 48 | 384 49 | 392 50 | 400 51 | 408 52 | 416 53 | 424 54 | 432 55 | 440 56 | 448 57 | 456 58 | 464 59 | 472 60 | 480 61 | 488 62 | 496 63 | 504 64 | 512 65 | 2 66 | -- Finished Building -- 67 | 0 1056 blocks 0 its 0.250 Fl/B 3759 GB/s 940 GF/s 1980 Mhz 572 W 57°C 68 | 69 | 0 1056 blocks 1 its 0.500 Fl/B 3780 GB/s 1890 GF/s 
1980 Mhz 578 W 58°C 70 | 71 | 0 1056 blocks 2 its 0.750 Fl/B 3763 GB/s 2822 GF/s 1980 Mhz 583 W 58°C 72 | 73 | 0 1056 blocks 4 its 1.250 Fl/B 3779 GB/s 4724 GF/s 1980 Mhz 597 W 59°C 74 | 75 | 0 1056 blocks 8 its 2.250 Fl/B 3760 GB/s 8459 GF/s 1980 Mhz 615 W 60°C 76 | 77 | 0 1056 blocks 16 its 4.250 Fl/B 3735 GB/s 15874 GF/s 1980 Mhz 650 W 62°C 78 | 79 | 0 1056 blocks 24 its 6.250 Fl/B 3629 GB/s 22683 GF/s 1980 Mhz 675 W 63°C 80 | 81 | 0 1056 blocks 32 its 8.250 Fl/B 3232 GB/s 26667 GF/s 1980 Mhz 653 W 64°C 82 | 83 | 0 1056 blocks 40 its 10.250 Fl/B 2690 GB/s 27571 GF/s 1980 Mhz 595 W 63°C 84 | 85 | 0 1056 blocks 48 its 12.250 Fl/B 2323 GB/s 28456 GF/s 1980 Mhz 558 W 63°C 86 | 87 | 0 1056 blocks 56 its 14.250 Fl/B 2029 GB/s 28920 GF/s 1980 Mhz 526 W 63°C 88 | 89 | 0 1056 blocks 64 its 16.250 Fl/B 2204 GB/s 35812 GF/s 1980 Mhz 583 W 65°C 90 | 91 | 0 1056 blocks 72 its 18.250 Fl/B 2002 GB/s 36541 GF/s 1980 Mhz 562 W 65°C 92 | 93 | 0 1056 blocks 80 its 20.250 Fl/B 1838 GB/s 37227 GF/s 1980 Mhz 547 W 65°C 94 | 95 | 0 1056 blocks 88 its 22.250 Fl/B 1700 GB/s 37819 GF/s 1980 Mhz 531 W 65°C 96 | 97 | 0 1056 blocks 96 its 24.250 Fl/B 1578 GB/s 38276 GF/s 1980 Mhz 519 W 65°C 98 | 99 | 0 1056 blocks 104 its 26.250 Fl/B 1475 GB/s 38727 GF/s 1980 Mhz 511 W 65°C 100 | 101 | 0 1056 blocks 112 its 28.250 Fl/B 1365 GB/s 38569 GF/s 1980 Mhz 501 W 65°C 102 | 103 | 0 1056 blocks 120 its 30.250 Fl/B 1303 GB/s 39404 GF/s 1980 Mhz 495 W 65°C 104 | 105 | 0 1056 blocks 128 its 32.250 Fl/B 1213 GB/s 39109 GF/s 1980 Mhz 491 W 65°C 106 | 107 | 0 1056 blocks 136 its 34.250 Fl/B 1155 GB/s 39570 GF/s 1980 Mhz 479 W 65°C 108 | 109 | 0 1056 blocks 144 its 36.250 Fl/B 1095 GB/s 39692 GF/s 1980 Mhz 473 W 65°C 110 | 111 | 0 1056 blocks 152 its 38.250 Fl/B 1045 GB/s 39981 GF/s 1980 Mhz 467 W 65°C 112 | 113 | 0 1056 blocks 160 its 40.250 Fl/B 989 GB/s 39827 GF/s 1980 Mhz 467 W 65°C 114 | 115 | 0 1056 blocks 168 its 42.250 Fl/B 954 GB/s 40308 GF/s 1980 Mhz 463 W 65°C 116 | 117 | 0 1056 blocks 176 its 
44.250 Fl/B 910 GB/s 40254 GF/s 1980 Mhz 455 W 64°C 118 | 119 | 0 1056 blocks 184 its 46.250 Fl/B 876 GB/s 40524 GF/s 1980 Mhz 450 W 64°C 120 | 121 | 0 1056 blocks 192 its 48.250 Fl/B 837 GB/s 40399 GF/s 1980 Mhz 450 W 64°C 122 | 123 | 0 1056 blocks 200 its 50.250 Fl/B 813 GB/s 40870 GF/s 1980 Mhz 443 W 64°C 124 | 125 | 0 1056 blocks 208 its 52.250 Fl/B 784 GB/s 40956 GF/s 1980 Mhz 442 W 64°C 126 | 127 | 0 1056 blocks 216 its 54.250 Fl/B 761 GB/s 41269 GF/s 1980 Mhz 440 W 64°C 128 | 129 | 0 1056 blocks 224 its 56.250 Fl/B 727 GB/s 40898 GF/s 1980 Mhz 439 W 64°C 130 | 131 | 0 1056 blocks 232 its 58.250 Fl/B 709 GB/s 41309 GF/s 1980 Mhz 431 W 64°C 132 | 133 | 0 1056 blocks 240 its 60.250 Fl/B 690 GB/s 41602 GF/s 1980 Mhz 433 W 64°C 134 | 135 | 0 1056 blocks 248 its 62.250 Fl/B 667 GB/s 41499 GF/s 1980 Mhz 425 W 64°C 136 | 137 | 0 1056 blocks 256 its 64.250 Fl/B 642 GB/s 41232 GF/s 1980 Mhz 428 W 64°C 138 | 139 | 0 1056 blocks 264 its 66.250 Fl/B 629 GB/s 41687 GF/s 1980 Mhz 422 W 63°C 140 | 141 | 0 1056 blocks 272 its 68.250 Fl/B 611 GB/s 41706 GF/s 1980 Mhz 422 W 63°C 142 | 143 | 0 1056 blocks 280 its 70.250 Fl/B 595 GB/s 41795 GF/s 1980 Mhz 421 W 63°C 144 | 145 | 0 1056 blocks 288 its 72.250 Fl/B 575 GB/s 41519 GF/s 1980 Mhz 420 W 63°C 146 | 147 | 0 1056 blocks 296 its 74.250 Fl/B 566 GB/s 42006 GF/s 1980 Mhz 414 W 63°C 148 | 149 | 0 1056 blocks 304 its 76.250 Fl/B 548 GB/s 41818 GF/s 1980 Mhz 415 W 63°C 150 | 151 | 0 1056 blocks 312 its 78.250 Fl/B 536 GB/s 41925 GF/s 1980 Mhz 412 W 63°C 152 | 153 | 0 1056 blocks 320 its 80.250 Fl/B 520 GB/s 41724 GF/s 1980 Mhz 413 W 63°C 154 | 155 | 0 1056 blocks 328 its 82.250 Fl/B 512 GB/s 42115 GF/s 1980 Mhz 408 W 63°C 156 | 157 | 0 1056 blocks 336 its 84.250 Fl/B 500 GB/s 42103 GF/s 1980 Mhz 408 W 63°C 158 | 159 | 0 1056 blocks 344 its 86.250 Fl/B 488 GB/s 42129 GF/s 1980 Mhz 405 W 63°C 160 | 161 | 0 1056 blocks 352 its 88.250 Fl/B 474 GB/s 41817 GF/s 1980 Mhz 409 W 63°C 162 | 163 | 0 1056 blocks 360 its 90.250 Fl/B 469 GB/s 
42329 GF/s 1980 Mhz 405 W 63°C 164 | 165 | 0 1056 blocks 368 its 92.250 Fl/B 456 GB/s 42111 GF/s 1980 Mhz 403 W 63°C 166 | 167 | 0 1056 blocks 376 its 94.250 Fl/B 447 GB/s 42092 GF/s 1980 Mhz 402 W 63°C 168 | 169 | 0 1056 blocks 384 its 96.250 Fl/B 436 GB/s 41961 GF/s 1980 Mhz 402 W 63°C 170 | 171 | 0 1056 blocks 392 its 98.250 Fl/B 432 GB/s 42426 GF/s 1980 Mhz 396 W 63°C 172 | 173 | 0 1056 blocks 400 its 100.250 Fl/B 425 GB/s 42558 GF/s 1980 Mhz 398 W 63°C 174 | 175 | 0 1056 blocks 408 its 102.250 Fl/B 415 GB/s 42442 GF/s 1980 Mhz 396 W 63°C 176 | 177 | -------------------------------------------------------------------------------- /gpu-small-kernels/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import csv 5 | import numpy as np 6 | import math 7 | from random import * 8 | 9 | 10 | import sys 11 | 12 | sys.path.append("..") 13 | from device_order import * 14 | 15 | fig, ax = plt.subplots(figsize=(10, 6)) 16 | 17 | 18 | maxbars = {} 19 | minbars = {} 20 | 21 | peakBW = [897, 1555, 2039, 2039, 1229, 1638] 22 | 23 | 24 | filesToInclude = ["L40", "A100", "RX6900XT", "MI210", "H200"] 25 | 26 | # filesToInclude = ["L40", "RX6900XT"] 27 | 28 | 29 | def getIncludeNumber(filename): 30 | for i in range(len(filesToInclude)): 31 | if filename.upper().startswith(filesToInclude[i]): 32 | return i 33 | return len(filesToInclude) + 1 34 | 35 | 36 | def fitValues(xdata, ydata, color=None): 37 | ydata[2:-2] = ( 38 | ydata[0:-4] + ydata[1:-3] + ydata[2:-2] + ydata[3:-1] + ydata[4:] 39 | ) / 5 40 | 41 | from scipy.optimize import curve_fit 42 | 43 | # def func(x, a, b, c): 44 | # return a * np.exp(-b * np.exp(-c * x)) 45 | 46 | def func(x, a, b): 47 | return x / (a / 1e9 + (x / 1e9 / b)) 48 | 49 | best = 0 50 | lim = 1 51 | bestLim = lim 52 | perr = -1 53 | 54 | while lim + 1 < len(xdata): 55 | lim += 1 56 | if xdata[lim] < 3 * 1024 * 1024 or xdata[lim] > 100 * 1024 * 1024: 57 | continue 58 
| 59 | popt, pcov, infodict, mesg, ier = curve_fit( 60 | func, 61 | xdata[:lim], 62 | ydata[:lim], 63 | bounds=([0, 0], [np.inf, np.inf]), 64 | full_output=True, 65 | ) 66 | # print(popt) 67 | # print(pcov) 68 | # print(mesg) 69 | perr = np.diag(pcov)[0] * np.diag(pcov)[1] 70 | if perr < best or best == 0: 71 | best = perr 72 | bestLim = lim 73 | 74 | print("%d fit: a=%5.0f ns, b=%5.0f GB/s," % (lim, popt[0], popt[1])) 75 | print() 76 | # print(perr) 77 | 78 | lim = bestLim 79 | popt, pcov, infodict, mesg, ier = curve_fit( 80 | func, 81 | xdata[:lim], 82 | ydata[:lim], 83 | bounds=([0, 0], [np.inf, np.inf]), 84 | full_output=True, 85 | ) 86 | print(lim, best) 87 | 88 | # xdata = np.array([*list(xdata), *[i / 25 for i in range(1, 25)]]) 89 | # xdata.sort() 90 | 91 | plt.plot( 92 | xdata[:lim] / 1024, 93 | func(xdata[:lim], *popt) / 1e9, 94 | "-", 95 | color="black", # icolor, 96 | label="fit: a=%5.0f ns, b=%5.0f GB/s," % (popt[0], popt[1]), 97 | zorder=-1, 98 | linewidth=2, 99 | alpha=1.0, 100 | ) 101 | return perr 102 | 103 | 104 | def fitCurve(splitA, splitB, color=None): 105 | fitValues( 106 | sizes[splitA:splitB], 107 | np.array( 108 | [max([v[b] if b < len(v) else 0 for b in range(len(bw[0]))]) for v in bw][ 109 | splitA:splitB 110 | ] 111 | ) 112 | * 1e9, 113 | color, 114 | ) 115 | 116 | 117 | def getOrderNumber(f): 118 | for o in range(len(order)): 119 | if f.startswith(order[o]): 120 | return o 121 | return len(order) + 1 122 | 123 | 124 | def getData(filename): 125 | with open(filename, newline="") as csvfile: 126 | csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True) 127 | dims = [] 128 | bw = [] 129 | 130 | for row in csvreader: 131 | if row[0] == "blockSize": 132 | continue 133 | dims.append(float(row[0])) 134 | values = [] 135 | for r in row[2:]: 136 | if len(r) == 0: 137 | continue 138 | values.append(float(r)) 139 | bw.append(values) 140 | 141 | return dims, bw 142 | 143 | 144 | blockSizes = [ 145 | (xblock, 1024 // xblock) for xblock 
in [4, 8, 16, 32, 64, 128, 256, 512, 1024] 146 | ] 147 | 148 | 149 | def getColor(b): 150 | return tuple(min(1.0, math.log2(c) / math.log2(128) * 1.4) for c in b) 151 | 152 | 153 | for filename in sorted(sorted(os.listdir(".")), key=lambda f1: getOrderNumber(f1)): 154 | if ( 155 | any([filename.upper().startswith(f) for f in filesToInclude]) 156 | and not "linear" in filename 157 | and not "graph" in filename 158 | and not "pt" in filename 159 | and not "gsync" in filename 160 | ): 161 | dims, bw = getData(filename) 162 | if len(bw) < 3: 163 | continue 164 | 165 | dims = np.array(dims) 166 | sizes = dims * 16 167 | 168 | lineStyle["marker"] = None # "|" if "graph" in filename.lower() else "_" 169 | lineStyle["linewidth"] = 2 170 | lineStyle["linestyle"] = ( 171 | "-." 172 | if "gsync" in filename.lower() 173 | else ( 174 | ":" 175 | if "pt" in filename.lower() 176 | else "--" if "graph" in filename.lower() else "-" 177 | ) 178 | ) 179 | b = 2 180 | ax.plot( 181 | sizes / 1024, 182 | [max([v[b] if b < len(v) else 0 for b in range(len(bw[0]))]) for v in bw], 183 | label=filename[:-4].upper(), 184 | color="C" + str(getOrderNumber(filename)), 185 | **lineStyle, 186 | zorder=0 187 | ) 188 | 189 | # rx6900 190 | # fitCurve(0, 80) 191 | # fitCurve(84, 110) 192 | # fitCurve(110, 139) 193 | # fitCurve(146, 240) 194 | 195 | # mi210 196 | # fitCurve(0, 76) 197 | # fitCurve(98, 160) 198 | 199 | # A100 200 | # fitCurve(0, 102) 201 | # fitCurve(117, 220) 202 | 203 | # L100 204 | # fitCurve(0, 128) 205 | # fitCurve(146, 195) 206 | 207 | # v100 208 | fitCurve(2, 120) 209 | 210 | # fitCurve(102, 250) 211 | 212 | 213 | def func(x, a, b): 214 | return x / (a / 1e9 + (x * 16 / 1e9 / b)) 215 | 216 | 217 | # values = np.arange(256, 32 * 1024, 256) 218 | # ax.plot( 219 | # values, 220 | # func(values * 1024 / 8, 3000, 2100) * 1e-9 * 16, 221 | # color="red", 222 | # linewidth=3, 223 | # label="MI300X, \n fit: a = 15000 GB/s, \n b = 3000 ns", 224 | # ) 225 | 226 | # values = np.arange(32 * 
1024, 1024 * 1024, 256) 227 | # ax.plot( 228 | # values, func(values * 1024 / 8, 3000, 500) * 1e-9 * 16, color="red", linewidth=3 229 | # ) 230 | 231 | 232 | ax.set_xlabel("grid size, kB") 233 | ax.set_ylabel("GB/s") 234 | ax.set_xscale("log") 235 | 236 | ax.set_xscale("log") 237 | ax.set_xticks([128, 256, 512, 1024, 2048, 8192, 20 * 1024, 64 * 1024]) 238 | ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) 239 | 240 | # ax.axhline(1400, linestyle="--", color="C1") 241 | # ax.axhline(800, linestyle="--", color="C0") 242 | 243 | # ax.grid() 244 | ax.legend() 245 | ax.set_ylim([0, ax.get_ylim()[1]]) 246 | ax.set_xlim([64, 512 * 1024]) 247 | 248 | fig.tight_layout() 249 | fig.savefig("repeated-stream.svg", dpi=300) 250 | 251 | 252 | plt.show() 253 | -------------------------------------------------------------------------------- /rocm-metrics/rocm-metrics.hpp: -------------------------------------------------------------------------------- 1 | 2 | /****************************************************************************** 3 | Copyright (c) 2018 Advanced Micro Devices, Inc. and Dominik Ernst 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 18 | THE SOFTWARE. 19 | *******************************************************************************/ 20 | 21 | #ifndef ROCM_METRICS_H_ 22 | #define ROCM_METRICS_H_ 23 | 24 | #include "hip/hip_runtime.h" 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #define HSA_ASSERT(x) (assert((x) == HSA_STATUS_SUCCESS)) 32 | 33 | #define ROCP_CALL_CK(call) \ 34 | do { \ 35 | hsa_status_t _status = call; \ 36 | if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) { \ 37 | const char *profErr; \ 38 | rocprofiler_error_string(&profErr); \ 39 | std::cout << "ERROR: function call \n \"" << #call \ 40 | << "\" at " __FILE__ ":" << __LINE__ \ 41 | << " \n failed with status " << _status << ": \" " << profErr \ 42 | << "\"\n"; \ 43 | } \ 44 | } while (0); 45 | 46 | hsa_agent_t agent_info_arr[16]; 47 | unsigned agent_info_arr_len; 48 | 49 | static hsa_status_t _count_devices(hsa_agent_t agent, void *data) { 50 | unsigned *count = (unsigned *)data; 51 | hsa_device_type_t type; 52 | hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); 53 | assert(status == HSA_STATUS_SUCCESS); 54 | if (type == HSA_DEVICE_TYPE_GPU) { 55 | agent_info_arr[(*count)++] = agent; 56 | } 57 | return status; 58 | } 59 | 60 | static unsigned _get_device_count(void) { 61 | unsigned count = 0; 62 | hsa_status_t status = hsa_iterate_agents(&_count_devices, &count); 63 | assert(status == HSA_STATUS_SUCCESS); 64 | return count; 65 | } 66 | 67 | static hsa_agent_t _get_agent(unsigned gpu_id) { 68 | return agent_info_arr[gpu_id]; 69 | } 70 | 71 | hsa_status_t info_data_callback(const rocprofiler_info_data_t info, 72 | void *data) { 73 | 74 | std::cout << "info data callback\n"; 75 | 
switch (info.kind) { 76 | case ROCPROFILER_INFO_KIND_METRIC: { 77 | if (info.metric.expr != NULL) { 78 | std::cout << "Derived counter: gpu-agent" << info.agent_index << " " 79 | << info.metric.name << ": " << info.metric.description; 80 | 81 | std::cout << info.metric.name << " = " << info.metric.expr << "\n"; 82 | } else { 83 | std::cout << "Basic counter: gpu-agent" << info.agent_index << ": " 84 | << info.metric.name << "\n"; 85 | if (info.metric.instances > 1) { 86 | std::cout << "[0-" << info.metric.instances - 1 << "]\n"; 87 | } 88 | std::cout << " : " << info.metric.description; 89 | std::cout << " block " << info.metric.block_name << " has " 90 | << info.metric.block_counters << " counters\n"; 91 | } 92 | break; 93 | } 94 | default: 95 | return HSA_STATUS_ERROR; 96 | } 97 | return HSA_STATUS_SUCCESS; 98 | } 99 | void printMetrics(hsa_agent_t agent) { 100 | ROCP_CALL_CK( rocprofiler_iterate_info( 101 | &agent, ROCPROFILER_INFO_KIND_METRIC, info_data_callback, NULL)); 102 | 103 | } 104 | 105 | hsa_agent_t agent; 106 | // Profiling context 107 | rocprofiler_t *context = NULL; 108 | 109 | const unsigned feature_count = 2; 110 | rocprofiler_feature_t feature[feature_count]; 111 | double prevValues[feature_count]; 112 | 113 | 114 | void measureBandwidthStart() { 115 | hipDeviceSynchronize(); 116 | // Start counters and sample them in the loop with the sampling rate 117 | } 118 | 119 | std::vector measureMetricStop() { 120 | hipDeviceSynchronize(); 121 | 122 | 123 | std::vector results(6,0); 124 | 125 | ROCP_CALL_CK( rocprofiler_read(context, 0)); 126 | ROCP_CALL_CK( rocprofiler_get_data(context, 0)); 127 | ROCP_CALL_CK( rocprofiler_get_metrics(context)); 128 | // print_results(feature, feature_count); 129 | 130 | 131 | double v1 = (feature[0].data.result_double - prevValues[0]); 132 | double v2 = (feature[1].data.result_double - prevValues[1]); 133 | 134 | 135 | results[0] = v1 * 32; 136 | results[2] = (v2 * 32) / 32; 137 | 138 | 139 | 140 | for (unsigned i = 0; 
i < feature_count; ++i) { 141 | const rocprofiler_feature_t *p = &feature[i]; 142 | //std::cout << p->name << ": "; 143 | 144 | double val = 0; 145 | switch(p->data.kind) { 146 | case ROCPROFILER_DATA_KIND_INT64: 147 | val = p->data.result_int64; 148 | break; 149 | case ROCPROFILER_DATA_KIND_DOUBLE: 150 | val = p->data.result_double; 151 | break; 152 | default: 153 | std::cout << "Undefined data kind: " << p->data.kind << "\n"; 154 | assert(0); 155 | } 156 | //std::cout << "= " << val << ", Delta: " << val - prevValues[i] << "\n"; 157 | prevValues[i] = val; 158 | } 159 | 160 | 161 | 162 | // Stop counters 163 | //ROCP_CALL_CK( rocprofiler_stop(context, 0)); 164 | 165 | return results; 166 | } 167 | 168 | void initMeasureMetric() { 169 | setenv("HSA_TOOLS_LIB", "/opt/rocm/rocprofiler/lib/librocprofiler64.so", 1); 170 | setenv("ROCP_METRICS", "/opt/rocm/lib/rocprofiler/metrics.xml", 1); 171 | 172 | 173 | HSA_ASSERT(hsa_init()); 174 | hsa_status_t status = HSA_STATUS_ERROR; 175 | // HSA agent 176 | 177 | unsigned gpu_count = _get_device_count(); 178 | agent_info_arr_len = gpu_count; 179 | 180 | for (unsigned gpu_id = 0; gpu_id < gpu_count; ++gpu_id) { 181 | hsa_agent_t agent = _get_agent(gpu_id); 182 | std::cout << "Agent " << gpu_id << "\n"; 183 | } 184 | 185 | agent = _get_agent(0); 186 | 187 | //printMetrics(agent); 188 | 189 | // Profiling feature objects 190 | 191 | // Counters and metrics 192 | feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; 193 | feature[0].name = "TCP_TOTAL_CACHE_ACCESSES_sum"; 194 | feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; 195 | feature[1].name = "TCP_TCC_READ_REQ_sum"; 196 | 197 | //feature[2].kind = ROCPROFILER_FEATURE_KIND_METRIC; 198 | //feature[2].name = "FETCH_SIZE"; 199 | //feature[3].kind = ROCPROFILER_FEATURE_KIND_METRIC; 200 | //feature[3].name = "WRITE_SIZE"; 201 | 202 | // Creating profiling context with standalone queue 203 | rocprofiler_properties_t properties = {}; 204 | properties.queue_depth = 128; 205 | uint32_t 
mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE | 206 | ROCPROFILER_MODE_SINGLEGROUP; 207 | 208 | properties.queue_depth = 128; 209 | 210 | ROCP_CALL_CK(rocprofiler_open(agent, feature, feature_count, &context, mode, 211 | &properties)); 212 | 213 | ROCP_CALL_CK(rocprofiler_start(context, 0)); 214 | } 215 | 216 | #endif // ROCM-METRICS_H_ 217 | --------------------------------------------------------------------------------