├── measure_metric
    ├── measureMetricPW.cpp
    ├── ScopeExit.h
    ├── pythonInterface.cpp
    ├── Parser.hpp
    ├── Parser.h
    └── Utils.h
├── gpu-metrics
    ├── cuda_metrics
    │   ├── measureMetricPW.cpp
    │   ├── ScopeExit.h
    │   ├── pythonInterface.cpp
    │   ├── Parser.hpp
    │   ├── Parser.h
    │   └── Utils.h
    ├── rocm_metrics
    │   ├── test_rocm_metrics
    │   ├── Makefile
    │   └── test_rocm_metrics.hip
    ├── gpu-metrics.hpp
    └── README.md
├── gpu-stream
    ├── maxbars.pdf
    ├── minbars.pdf
    ├── cuda-stream.pdf
    ├── Makefile
    ├── rx6900xt.txt
    ├── a40.txt
    ├── l40.txt
    ├── h100_pcie.txt
    ├── past_results
    │   ├── h100_pcie.txt
    │   └── a100_40.txt
    ├── a100_40.txt
    ├── a100_80.txt
    ├── gh200.txt
    ├── mi100.txt
    ├── mi210.txt
    ├── mi300a.txt
    ├── mi300x.txt
    ├── v100.txt
    └── plot.py
├── gpu-latency
    ├── latencies.pdf
    ├── latencies_NV.pdf
    ├── latencies_AMD.pdf
    ├── Makefile
    ├── plot.py
    └── main.cu
├── gpu-roofline
    ├── L40_plot.pdf
    ├── series.sh
    ├── Makefile
    ├── plot.py
    ├── main.cu
    ├── mi300x.txt
    └── h200.txt
├── .gitignore
├── gpu-l2-cache
    ├── sycl
    │   ├── build.sh
    │   └── sycl-gpu-l2-cache.cpp
    ├── Makefile
    ├── plot.py
    └── main.cu
├── dtime.hpp
├── gpu-metrics.hpp
├── um-stream
    ├── Makefile
    └── main.cu
├── gpu-small-kernels
    ├── readme.md
    ├── Makefile
    ├── a40_pt.txt
    └── plot.py
├── cuda-memcpy
    ├── Makefile
    └── main.cu
├── unmaintained
    ├── cuda-busy
    │   ├── Makefile
    │   └── main.cu
    ├── cuda-cache-overlap
    │   └── Makefile
    ├── cuda-3d-stream
    │   ├── Makefile
    │   └── main.cu
    └── cuda-gapped-stream
    │   ├── Makefile
    │   └── main.cu
├── cuda-incore
    ├── Makefile
    └── main.cu
├── gpu-error.h
├── gpu-strides
    ├── Makefile
    └── h200.txt
├── gpu-l2-stream
    ├── Makefile
    └── plot.py
├── gpu-cache
    ├── Makefile
    ├── plot.py
    ├── mi100.txt
    └── mi210.txt
├── device_order.py
├── gpu-stats.h
├── MeasurementSeries.hpp
├── gpu-clock.cuh
└── rocm-metrics
    └── rocm-metrics.hpp


/measure_metric/measureMetricPW.cpp:
--------------------------------------------------------------------------------
1 | #include "measureMetricPW.hpp"
2 | 


--------------------------------------------------------------------------------
/gpu-metrics/cuda_metrics/measureMetricPW.cpp:
--------------------------------------------------------------------------------
1 | #include "measureMetricPW.hpp"
2 | 


--------------------------------------------------------------------------------
/gpu-stream/maxbars.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-stream/maxbars.pdf


--------------------------------------------------------------------------------
/gpu-stream/minbars.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-stream/minbars.pdf


--------------------------------------------------------------------------------
/gpu-latency/latencies.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-latency/latencies.pdf


--------------------------------------------------------------------------------
/gpu-roofline/L40_plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-roofline/L40_plot.pdf


--------------------------------------------------------------------------------
/gpu-stream/cuda-stream.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-stream/cuda-stream.pdf


--------------------------------------------------------------------------------
/gpu-latency/latencies_NV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-latency/latencies_NV.pdf


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | 
3 | # Compiled CUDA/HIP binaries
4 | cuda-*
5 | cu-*
6 | hip-*
7 | *.o
8 | *.so
9 | *.a


--------------------------------------------------------------------------------
/gpu-latency/latencies_AMD.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-latency/latencies_AMD.pdf


--------------------------------------------------------------------------------
/gpu-metrics/rocm_metrics/test_rocm_metrics:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RRZE-HPC/gpu-benches/HEAD/gpu-metrics/rocm_metrics/test_rocm_metrics


--------------------------------------------------------------------------------
/gpu-l2-cache/sycl/build.sh:
--------------------------------------------------------------------------------
1 | clang++ -O3 -fsycl -fsycl-targets=nvptx64-nvidia-cuda sycl-gpu-l2-cache.cpp -o  sycl-gpu-l2-cache -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80
2 | ./sycl-gpu-l2-cache


--------------------------------------------------------------------------------
/dtime.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <sys/time.h>
 3 | 
 4 | double dtime() {
 5 |   double tseconds = 0;
 6 |   struct timeval t;
 7 |   gettimeofday(&t, NULL);
 8 |   tseconds = (double)t.tv_sec + (double)t.tv_usec * 1.0e-6;
 9 |   return tseconds;
10 | }
11 | 


--------------------------------------------------------------------------------
/gpu-metrics.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_MEASURE_METRICS_H_
 2 | #define GPU_MEASURE_METRICS_H_
 3 | 
 4 | 
 5 | #ifdef __NVCC__
 6 | #include "measure_metric/measureMetricPW.hpp"
 7 | #elif defined __HIP__
 8 | #include "rocm-metrics/rocm-metrics.hpp"
 9 | #endif
10 | #endif // GPU_MEASURE_METRICS_H_
11 | 


--------------------------------------------------------------------------------
/gpu-metrics/gpu-metrics.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_MEASURE_METRICS_H_
 2 | #define GPU_MEASURE_METRICS_H_
 3 | 
 4 | 
 5 | #ifdef __NVCC__
 6 | #include "cuda_metrics/measureMetricPW.hpp"
 7 | #elif defined __HIP__
 8 | #include "rocm_metrics/rocm_metrics.hpp"
 9 | #endif
10 | #endif // GPU_MEASURE_METRICS_H_
11 | 


--------------------------------------------------------------------------------
/gpu-metrics/rocm_metrics/Makefile:
--------------------------------------------------------------------------------
 1 | HIP_HOME := /opt/rocm
 2 | 
 3 | HIPFLAGS := -ldl -O3 -std=c++2a -I/opt/rocm/include/hip $(shell python3-config --includes) -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64
 4 | 
 5 | 
 6 | 
 7 | test_rocm_metrics: test_rocm_metrics.hip
 8 | 	hipcc $< -o $@ $(HIPFLAGS)
 9 | 
10 | # end
11 | 


--------------------------------------------------------------------------------
/measure_metric/ScopeExit.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | template <typename T>
 4 | 
 5 | class ScopeExit
 6 | {
 7 | public:
 8 |     ScopeExit(T t) : t(t) {}
 9 |     ~ScopeExit() { t(); }
10 |     T t;
11 | };
12 | 
13 | template <typename T>
14 | ScopeExit<T> MoveScopeExit(T t) {
15 |     return ScopeExit<T>(t);
16 | };
17 | 
18 | #define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line
19 | #define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) NV_ANONYMOUS_VARIABLE_DIRECT(name, line)
20 | 
21 | #define SCOPE_EXIT(func) const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = MoveScopeExit([=](){func;})


--------------------------------------------------------------------------------
/gpu-metrics/cuda_metrics/ScopeExit.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | template <typename T>
 4 | 
 5 | class ScopeExit
 6 | {
 7 | public:
 8 |     ScopeExit(T t) : t(t) {}
 9 |     ~ScopeExit() { t(); }
10 |     T t;
11 | };
12 | 
13 | template <typename T>
14 | ScopeExit<T> MoveScopeExit(T t) {
15 |     return ScopeExit<T>(t);
16 | };
17 | 
18 | #define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line
19 | #define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) NV_ANONYMOUS_VARIABLE_DIRECT(name, line)
20 | 
21 | #define SCOPE_EXIT(func) const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = MoveScopeExit([=](){func;})


--------------------------------------------------------------------------------
/um-stream/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | # internal flags
 4 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 5 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info
 6 | CCFLAGS     := 
 7 | LDFLAGS     := -L/opt/cuda/lib64 -lcublas
 8 | NAME 		:= um-stream
 9 | PREFIX		:= .
10 | 
11 | 
12 | $(PREFIX)/$(NAME): main.cu Makefile ../dtime.hpp ../MeasurementSeries.hpp ../gpu-error.h
13 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
14 | 
15 | clean:
16 | 	rm -f ./$(NAME)
17 | 
18 | 


--------------------------------------------------------------------------------
/gpu-small-kernels/readme.md:
--------------------------------------------------------------------------------
 1 | # Repeated Small Kernel Performance
 2 | 
 3 | 
 4 | This benchmark explores the potential for cache blocking, where kernels work on a small data set that fits into caches. Because the data set is small, and the L2 cache is fast, the kernel executes so quickly that the startup overhead of a kernel launch becomes dominant. The benchmark queues 10000 calls of a streaming SCALE kernel of varying size. Use commandline option "-graph" to use the cudaGraph/hipGraph API. 
 5 | 
 6 | ![latency plot](repeated-stream.svg)
 7 | 
 8 | Each device gets a fit of \$a,b\$ for the function
 9 | 
10 | $$T = \frac{V}{a + V/b}$$
11 | 
12 | which models the performance with a startup overhead \$a\$ and a bandwidth \$b\$ depending on the data volume \$V\$. 
13 | 


--------------------------------------------------------------------------------
/gpu-roofline/series.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | range=1024
 4 | 
 5 | mkdir -p build
 6 | 
 7 |     
 8 | make ./build/$10 N=0 PREFIX=./build 1>&2
 9 | make ./build/$11 N=1 PREFIX=./build 1>&2 &
10 | make ./build/$12 N=2 PREFIX=./build 1>&2 &
11 | 
12 | 
13 | for (( d=4 ; d<=$range; d+= (d /  24 + 1)*2  ))
14 | do
15 |     echo $d
16 |     make ./build/$1$d N=$d PREFIX=./build 1>&2 &
17 |     while test $(jobs -p | wc -w) -ge 64; do
18 |         sleep 1;
19 |     done
20 | done
21 | 
22 | while test $(jobs -p | wc -w) -ge 2; do
23 |     echo $(jobs -p | wc -w)
24 |     sleep 1;
25 | done
26 | 
27 | wait
28 | 
29 | echo "-- Finished Building --"
30 | 
31 | 
32 | ./build/$10
33 | ./build/$11
34 | ./build/$12
35 | 
36 | 
37 | for (( d=4 ; d<=$range; d+= (d /  24 + 1)*2  ))
38 | do
39 |     ./build/$1$d
40 | done
41 | 
42 | 


--------------------------------------------------------------------------------
/measure_metric/pythonInterface.cpp:
--------------------------------------------------------------------------------
 1 | #include "measureMetricPW.hpp"
 2 | 
 3 | #include <Python.h>
 4 | 
 5 | extern "C" PyObject *measureMetricStop() {
 6 | 
 7 |   runTestEnd();
 8 | 
 9 |   /*CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = {
10 |       CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE};
11 |   CUPTI_API_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams));
12 | */
13 |   auto values = NV::Metric::Eval::GetMetricValues(chipName, counterDataImage,
14 |                                                   metricNames);
15 | 
16 |   PyGILState_STATE gstate = PyGILState_Ensure();
17 | 
18 | 
19 |   PyObject *result = PyList_New(0);
20 |   for (auto value : values) {
21 |     PyList_Append(result, PyFloat_FromDouble(value));
22 |   }
23 | 
24 |   PyGILState_Release(gstate);
25 | 
26 |   return result;
27 | }
28 | 


--------------------------------------------------------------------------------
/gpu-metrics/cuda_metrics/pythonInterface.cpp:
--------------------------------------------------------------------------------
 1 | #include "measureMetricPW.hpp"
 2 | 
 3 | #include <Python.h>
 4 | 
 5 | extern "C" PyObject *measureMetricStop() {
 6 | 
 7 |   runTestEnd();
 8 | 
 9 |   /*CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = {
10 |       CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE};
11 |   CUPTI_API_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams));
12 | */
13 |   auto values = NV::Metric::Eval::GetMetricValues(chipName, counterDataImage,
14 |                                                   metricNames);
15 | 
16 |   PyGILState_STATE gstate = PyGILState_Ensure();
17 | 
18 | 
19 |   PyObject *result = PyList_New(0);
20 |   for (auto value : values) {
21 |     PyList_Append(result, PyFloat_FromDouble(value));
22 |   }
23 | 
24 |   PyGILState_Release(gstate);
25 | 
26 |   return result;
27 | }
28 | 


--------------------------------------------------------------------------------
/cuda-memcpy/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\"
 9 | CCFLAGS     :=
10 | LDFLAGS     := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda
11 | NAME 		:= cuda-memcpy
12 | PREFIX		:= .
13 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
14 | 
15 | 
16 | $(PREFIX)/$(NAME): main.cu Makefile
17 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
18 | 
19 | clean:
20 | 	rm -f ./$(NAME)
21 | 


--------------------------------------------------------------------------------
/gpu-metrics/rocm_metrics/test_rocm_metrics.hip:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include "rocm_metrics.hpp"
 3 | 
 4 | 
 5 | __global__ void updateKernel(double* A, size_t N) {
 6 |     int tidx = threadIdx.x + blockDim.x * blockIdx.x;
 7 |     for(size_t i = tidx; i < N; i += blockDim.x * gridDim.x) {
 8 |         A[i] = 0.2 * A[i];;
 9 |     }
10 | }
11 | 
12 | 
13 | int main(int argc, char** argv) {
14 |     initMeasureMetric();
15 | 
16 |     double *dA;
17 |     int bufferCount = 1024 * 1024 * 1024;
18 |     hipMalloc(&dA, bufferCount * sizeof(double));
19 | 
20 |     for(int i = 0; i < 10; i++) {
21 |         measureBandwidthStart();
22 | 
23 |         updateKernel<<<100, 1024>>>(dA, bufferCount);
24 |         auto vals = measureMetricStop();
25 | 
26 |         for(auto v: vals) {
27 |             std::cout << v * 1024 / bufferCount << "\n";
28 |         }
29 |     }
30 | 
31 |     return 0;
32 | }
33 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-busy/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++14 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-std=c++14 -O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 
 9 | CCFLAGS     := 
10 | LDFLAGS     := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml
11 | NAME 		:= cuda-busy
12 | PREFIX		:= .
13 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
14 | 
15 | $(PREFIX)/$(NAME): main.cu Makefile
16 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
17 | 
18 | 
19 | clean:
20 | 	rm -f ./$(NAME)
21 | 
22 | 


--------------------------------------------------------------------------------
/cuda-incore/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | 
 4 | TEMP_NVCC := $(shell which nvcc)
 5 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 6 | 
 7 | 
 8 | # internal flags
 9 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
10 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 
11 | CCFLAGS     := 
12 | LDFLAGS     := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml
13 | NAME 		:= cuda-incore
14 | PREFIX		:= .
15 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
16 | 
17 | $(PREFIX)/$(NAME): main.cu Makefile
18 | 	echo $(CUDA_HOME)
19 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
20 | 
21 | 
22 | clean:
23 | 	rm -f ./$(NAME)
24 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-cache-overlap/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" 
 9 | CCFLAGS     := 
10 | LDFLAGS     := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml
11 | NAME 		:= cuda-cache-overlap
12 | PREFIX		:= .
13 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
14 | 
15 | $(PREFIX)/$(NAME): main.cu Makefile
16 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
17 | 
18 | 
19 | clean:
20 | 	rm -f ./$(NAME)
21 | 
22 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-3d-stream/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
 9 | CCFLAGS     := 
10 | LDFLAGS     := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml -lnvperf_host -lnvperf_target
11 | NAME 		:= cuda-stream
12 | PREFIX		:= .
13 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
14 | 
15 | $(PREFIX)/$(NAME): main.cu Makefile
16 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
17 | 
18 | 
19 | clean:
20 | 	rm -f ./$(NAME)
21 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-gapped-stream/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
 9 | CCFLAGS     := 
10 | LDFLAGS     := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml -lnvperf_host -lnvperf_target
11 | NAME 		:= cuda-gapped-stream
12 | PREFIX		:= .
13 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
14 | 
15 | $(PREFIX)/$(NAME): main.cu Makefile
16 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
17 | 
18 | 
19 | clean:
20 | 	rm -f ./$(NAME)
21 | 


--------------------------------------------------------------------------------
/gpu-latency/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | # internal flags
 4 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 5 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info
 6 | CCFLAGS     := 
 7 | LDFLAGS     := -L/opt/cuda/lib64 -lcublas -lnvidia-ml
 8 | NAME 		:= latency
 9 | PREFIX		:=
10 | N 			:= 1
11 | 
12 | 
13 | $(PREFIX)cuda-$(NAME): main.cu Makefile
14 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
15 | 
16 | main.hip: main.cu
17 | 	hipify-perl main.cu > main.hip
18 | 
19 | $(PREFIX)hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
20 | 	echo $(HIP_HOME)
21 | 	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -o $@ $<
22 | 
# Remove built binaries and the hipified source. Every rm uses -f so that
# `make clean` succeeds even when main.hip was never generated.
clean:
	rm -f cuda-$(NAME) hip-$(NAME)
	rm -f *-cuda-$(NAME) *-hip-$(NAME)
	rm -f main.hip
27 | 


--------------------------------------------------------------------------------
/gpu-error.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #pragma once
 4 | 
 5 | #ifdef __NVCC__
 6 | #define GPU_ERROR(ans)                                                         \
 7 |   { gpuAssert((ans), __FILE__, __LINE__); }
 8 | inline void gpuAssert(cudaError_t code, const char *file, int line,
 9 |                       bool abort = true) {
10 |   if (code != cudaSuccess) {
11 |     std::cerr << "GPUassert: \"" << cudaGetErrorString(code) << "\"  in "
12 |               << file << ": " << line << "\n";
13 |     if (abort)
14 |       exit(code);
15 |   }
16 | }
17 | #elif defined __HIP__
18 | #define GPU_ERROR(ans)                                                         \
19 |   { gpuAssert((ans), __FILE__, __LINE__); }
20 | inline void gpuAssert(hipError_t code, const char *file, int line,
21 |                       bool abort = true) {
22 |   if (code != hipSuccess) {
23 |     std::cerr << "GPUassert: \"" << hipGetErrorString(code) << "\"  in " << file
24 |               << ": " << line << "\n";
25 |     if (abort)
26 |       exit(code);
27 |   }
28 | }
29 | #endif
30 | 


--------------------------------------------------------------------------------
/gpu-stream/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++20 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
 9 | CCFLAGS     := 
10 | NAME 		:= stream
11 | LDFLAGS     := -L/opt/cuda/lib64 -lcuda -lnvidia-ml
12 | PREFIX		:=
13 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
14 | 
15 | $(PREFIX)cuda-$(NAME): main.cu Makefile
16 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
17 | 
18 | $(PREFIX)$(NAME)-gsl: main_gsl.cu Makefile
19 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
20 | 
21 | 
22 | main.hip: main.cu
23 | 	hipify-perl main.cu > main.hip
24 | 
25 | $(PREFIX)hip-$(NAME): main.hip Makefile
26 | 	hipcc -std=c++20  -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lhsa-runtime64 -ldl -o $@ $<
27 | 
28 | clean:
29 | 	rm -f cuda-$(NAME) hip-$(NAME) *-hip-$(NAME) *-cuda-$(NAME) main.hip
30 | 


--------------------------------------------------------------------------------
/gpu-roofline/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | # internal flags
 7 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 8 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\"
 9 | 
10 | CCFLAGS     := 
11 | LDFLAGS     := -L/opt/cuda/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml
12 | NAME 		:= roof
13 | PREFIX		:= .
14 | N 			:= 100
15 | 
16 | $(PREFIX)/cu-$(NAME)$N: main.cu Makefile series.sh
17 | 	$(NVCC) -DPARN=$N $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
18 | 
19 | 
20 | main.hip: main.cu
21 | 	hipify-perl main.cu > main.hip
22 | 
23 | $(PREFIX)/hip-$(NAME)$N: main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
24 | 	echo $(HIP_HOME)
25 | 	hipcc -DPARN=$N -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -fopenmp -o $@ $<
26 | 
27 | 
# Remove the binaries this Makefile actually produces
# ($(PREFIX)/cu-$(NAME)$N and $(PREFIX)/hip-$(NAME)$N for any N) plus the
# hipified source. The previous recipe removed ./$(NAME), which no rule builds.
clean:
	rm -f $(PREFIX)/cu-$(NAME)* $(PREFIX)/hip-$(NAME)* main.hip
30 | 
31 | 


--------------------------------------------------------------------------------
/gpu-small-kernels/Makefile:
--------------------------------------------------------------------------------
 1 | ##
 2 | NVCC := nvcc
 3 | 
 4 | TEMP_NVCC := $(shell which nvcc)
 5 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 6 | 
 7 | # internal flags
 8 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
 9 | NVCCFLAGS   := -std=c++11 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
10 | CCFLAGS     :=
11 | NAME 		:= small-kernels
12 | LDFLAGS     := -L/opt/cuda/lib64 -lcuda -lnvidia-ml
13 | PREFIX		:= .
14 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
15 | 
16 | $(PREFIX)/cuda-$(NAME): main.cu Makefile
17 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
18 | 
19 | 
20 | 
21 | 
22 | main.hip: main.cu
23 | 	hipify-perl main.cu > main.hip
24 | 
25 | $(PREFIX)/hip-$(NAME): main.hip Makefile
26 | 	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -ldl -o $@ $<
27 | 
# Remove both binaries and the main.hip generated by the hipify rule.
clean:
	rm -f cuda-$(NAME) hip-$(NAME) main.hip
30 | # Project Title
31 | #
32 | # @file
33 | # @version 0.1
34 | 
35 | 
36 | 
37 | # end
38 | 


--------------------------------------------------------------------------------
/gpu-l2-cache/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | HIP_HOME :=  /opt/rocm
 7 | 
 8 | 
 9 | # internal flags
10 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
11 | NVCCFLAGS   := -std=c++17 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\"
12 | CCFLAGS     := 
13 | LDFLAGS     := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml -lnvperf_host -lnvperf_target
14 | NAME 		:= l2-cache
15 | PREFIX		:= .
16 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include -I$(CUDA_HOME)/include
17 | 
18 | $(PREFIX)/cuda-$(NAME): main.cu Makefile
19 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
20 | 
21 | 
22 | main.hip: main.cu
23 | 	hipify-perl main.cu > main.hip
24 | 
25 | $(PREFIX)/hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
26 | 	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -ldl -o $@ $<
27 | 
28 | clean:
29 | 	rm -f ./cuda-$(NAME)
30 | 	rm -f main.hip
31 | 	rm -f ./hip-$(NAME)
32 | 


--------------------------------------------------------------------------------
/gpu-strides/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | HIP_HOME :=  /opt/rocm
 7 | 
 8 | 
 9 | # internal flags
10 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
11 | NVCCFLAGS   := -std=c++17 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
12 | CCFLAGS     := 
13 | LDFLAGS     := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml -lnvperf_host -lnvperf_target
14 | NAME 		:= strides
15 | PREFIX		:= .
16 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
17 | 
18 | $(PREFIX)/cuda-$(NAME): main.cu Makefile
19 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
20 | 
21 | 
22 | 
23 | main.hip: main.cu
24 | 	hipify-perl main.cu > main.hip
25 | 
26 | $(PREFIX)/hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
27 | 	echo $(HIP_HOME)
28 | 	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -mcumode -ldl -o $@ $<
29 | 
30 | 
31 | 
32 | 
# Remove both binaries and the main.hip generated by the hipify rule.
clean:
	rm -f cuda-$(NAME) hip-$(NAME) main.hip
35 | 


--------------------------------------------------------------------------------
/measure_metric/Parser.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
 5 | namespace NV {
 6 | namespace Metric {
 7 | namespace Parser {
 8 | inline bool ParseMetricNameString(const std::string &metricName,
 9 |                                   std::string *reqName, bool *isolated,
10 |                                   bool *keepInstances) {
11 |   std::string &name = *reqName;
12 |   name = metricName;
13 |   if (name.empty()) {
14 |     return false;
15 |   }
16 | 
17 |   // boost program_options sometimes inserts a \n between the metric name and a
18 |   // '&' at the end
19 |   size_t pos = name.find('\n');
20 |   if (pos != std::string::npos) {
21 |     name.erase(pos, 1);
22 |   }
23 | 
24 |   // trim whitespace
25 |   while (name.back() == ' ') {
26 |     name.pop_back();
27 |     if (name.empty()) {
28 |       return false;
29 |     }
30 |   }
31 | 
32 |   *keepInstances = false;
33 |   if (name.back() == '+') {
34 |     *keepInstances = true;
35 |     name.pop_back();
36 |     if (name.empty()) {
37 |       return false;
38 |     }
39 |   }
40 | 
41 |   *isolated = true;
42 |   if (name.back() == '$') {
43 |     name.pop_back();
44 |     if (name.empty()) {
45 |       return false;
46 |     }
47 |   } else if (name.back() == '&') {
48 |     *isolated = false;
49 |     name.pop_back();
50 |     if (name.empty()) {
51 |       return false;
52 |     }
53 |   }
54 | 
55 |   return true;
56 | }
57 | } // namespace Parser
58 | } // namespace Metric
59 | } // namespace NV
60 | 


--------------------------------------------------------------------------------
/gpu-l2-stream/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | TEMP_HIPCC := $(shell which hipcc)
 7 | HIP_HOME :=  /opt/rocm
 8 | 
 9 | # internal flags
10 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
11 | NVCCFLAGS   := -std=c++20 -O3 -gencode arch=compute_$(SM),code=sm_$(SM)--compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler "-Wall"
12 | CCFLAGS     := 
13 | NAME 		:= l2-stream
14 | LDFLAGS     := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml -lnvperf_host -lnvperf_target
15 | PREFIX		:=
16 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include
17 | 
18 | $(PREFIX)cuda-$(NAME): main.cu Makefile
19 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
20 | 
21 | $(PREFIX)$(NAME)-gsl: main_gsl.cu Makefile
22 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
23 | 
24 | 
25 | 
26 | 
27 | main.hip: main.cu
28 | 	hipify-perl main.cu > main.hip
29 | 
30 | 
31 | $(PREFIX)hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
32 | 	echo $(HIP_HOME)
33 | 	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -o $@ $<
34 | 
35 | clean:
36 | 	rm -f cuda-$(NAME) hip-$(NAME)
37 | 


--------------------------------------------------------------------------------
/gpu-metrics/cuda_metrics/Parser.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
 5 | namespace NV {
 6 | namespace Metric {
 7 | namespace Parser {
 8 | inline bool ParseMetricNameString(const std::string &metricName,
 9 |                                   std::string *reqName, bool *isolated,
10 |                                   bool *keepInstances) {
11 |   std::string &name = *reqName;
12 |   name = metricName;
13 |   if (name.empty()) {
14 |     return false;
15 |   }
16 | 
17 |   // boost program_options sometimes inserts a \n between the metric name and a
18 |   // '&' at the end
19 |   size_t pos = name.find('\n');
20 |   if (pos != std::string::npos) {
21 |     name.erase(pos, 1);
22 |   }
23 | 
24 |   // trim whitespace
25 |   while (name.back() == ' ') {
26 |     name.pop_back();
27 |     if (name.empty()) {
28 |       return false;
29 |     }
30 |   }
31 | 
32 |   *keepInstances = false;
33 |   if (name.back() == '+') {
34 |     *keepInstances = true;
35 |     name.pop_back();
36 |     if (name.empty()) {
37 |       return false;
38 |     }
39 |   }
40 | 
41 |   *isolated = true;
42 |   if (name.back() == '$') {
43 |     name.pop_back();
44 |     if (name.empty()) {
45 |       return false;
46 |     }
47 |   } else if (name.back() == '&') {
48 |     *isolated = false;
49 |     name.pop_back();
50 |     if (name.empty()) {
51 |       return false;
52 |     }
53 |   }
54 | 
55 |   return true;
56 | }
57 | } // namespace Parser
58 | } // namespace Metric
59 | } // namespace NV
60 | 


--------------------------------------------------------------------------------
/gpu-cache/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC := nvcc
 2 | 
 3 | TEMP_NVCC := $(shell which nvcc)
 4 | CUDA_HOME :=  $(shell echo $(TEMP_NVCC) | rev |  cut -d'/' -f3- | rev)
 5 | 
 6 | TEMP_HIPCC := $(shell which hipcc)
 7 | HIP_HOME :=  /opt/rocm
 8 | 
 9 | #(shell echo $(TEMP_HIPCC) | rev |  cut -d'/' -f4- | rev)
10 | 
11 | # internal flags
12 | SM      	?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/\.//g')
13 | NVCCFLAGS   := -std=c++17 -O3 -gencode arch=compute_$(SM),code=sm_$(SM) --compiler-options="-O2 -pipe -Wall -fopenmp -g" -Xcompiler -rdynamic --generate-line-info  -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/extras/CUPTI/lib64/\" -Xcompiler \"-Wl,-rpath,$(CUDA_HOME)/lib64/\"
14 | CCFLAGS     := 
15 | LDFLAGS     := -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/extras/CUPTI/lib64 -lcupti -lcuda   -lnvidia-ml -lnvperf_host -lnvperf_target
16 | NAME 		:= cache
17 | PREFIX		:=
18 | INCLUDES 	:=  -I$(CUDA_HOME)/extras/CUPTI/include -I$(CUDA_HOME)/include
19 | 
20 | 
21 | 
22 | $(PREFIX)cuda-$(NAME): main.cu Makefile
23 | 	echo $(CUDA_HOME)
24 | 	$(NVCC) $(NVCCFLAGS) $(INCLUDES) -o $@ $< $(LDFLAGS)
25 | 
26 | 
27 | main.hip: main.cu
28 | 	hipify-perl main.cu > main.hip
29 | 
30 | $(PREFIX)hip-$(NAME): main.hip Makefile ../rocm-metrics/rocm-metrics.hpp
31 | 	echo $(HIP_HOME)
32 | 	hipcc -std=c++20 -I$(HIP_HOME)/include/rocprofiler/ -I$(HIP_HOME)/hsa/include/hsa -L$(HIP_HOME)/rocprofiler/lib -lrocprofiler64 -lhsa-runtime64 -lrocm_smi64 -ldl -o $@ $<
33 | 
34 | clean:
35 | 	rm -f cuda-$(NAME) hip-$(NAME)
36 | 


--------------------------------------------------------------------------------
/device_order.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import matplotlib
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | plt.style.use("bmh")
 7 | plt.rcParams["axes.facecolor"] = "white"
 8 | 
 9 | 
# Fixed colors, indexed by the position returned from getOrderNumber().
# getDeviceColor() falls back to matplotlib "C<n>" cycle colors for positions
# beyond the end of this list.
device_color_palette = [
    "#378ABD",
    "#FFB33A",
    "#7EC75B",
    "#DA5252",
    "#793B67",
    "#10CFCC",
    "#FFE100",
    "#09047f",
    "#296F20",
]

# Name prefixes in display order. getOrderNumber() returns the index of the
# FIRST entry that the given string starts with.
# NOTE(review): because matching is first-prefix-wins, a name starting with
# "a100_40" already matches "a100" (index 3), so the later "a100_40" entry
# looks unreachable -- confirm whether that is intended.
order = [
    "a40",
    "l40",
    "v100",
    "a100",
    "gh200",
    "mi100",
    "mi210",
    "mi300x",
    "rx6900xt",
    "mi300a",
    "a100_40",
    "h100_pcie",
]


# Alternative prefix list that getDeviceColor() tries when the position from
# `order` falls outside the palette. The entries look like full device-name
# strings -- presumably as reported by the GPU runtime; verify against callers.
long_order = [
    "NVIDIA A40",
    "NVIDIA L40",
    "Tesla V100",
    "NVIDIA A100-SXM4-80GB",
    "NVIDIA GH200 480GB",
    "AMD Instinct MI100",
    "AMD Instinct MI210",
    "AMD Instinct MI300X",
    "AMD Radeon RX 6900 XT",
    "NVIDIA A100-SXM4-40G",
]
50 | 
51 | 
def getOrderNumber(f, use_order=order):
    """Return the index of the first entry in ``use_order`` that ``f`` starts with.

    When no entry is a prefix of ``f`` the result is ``len(use_order) + 1``,
    which is past the end of the list, so unmatched names compare greater than
    every matched one.
    """
    matching_positions = (
        position for position, prefix in enumerate(use_order) if f.startswith(prefix)
    )
    return next(matching_positions, len(use_order) + 1)
57 | 
58 | 
def getDeviceColor(f):
    """Pick a plot color for the device name or file name ``f``.

    The position of ``f`` in ``order`` selects an entry of
    ``device_color_palette``. If that position is outside the palette, the
    lookup is repeated against ``long_order``. If the position is still
    outside the palette, a matplotlib cycle color string ``"C<k>"`` is
    returned, with ``k`` being the position minus the palette length.
    """
    palette_size = len(device_color_palette)

    slot = getOrderNumber(f)
    if slot >= palette_size:
        slot = getOrderNumber(f, use_order=long_order)

    if slot < palette_size:
        return device_color_palette[slot]
    return "C" + str(slot - palette_size)
67 | 
68 | 
# Shared keyword arguments that the plot scripts unpack into ax.plot(...).
lineStyle = {"linewidth": 2.0, "alpha": 1, "markersize": 3, "marker": ""}
70 | 


--------------------------------------------------------------------------------
/gpu-stats.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_STATS_H_
 2 | #define GPU_STATS_H_
 3 | 
 4 | #ifdef __NVCC__
 5 | #include <nvml.h>
 6 | #elif defined __HIP__
 7 | #include <rocm_smi/rocm_smi.h>
 8 | #endif
 9 | 
/// Snapshot of the device readings collected by getGPUStats().
/// NOTE(review): units are whatever the backend reports after the scaling done
/// in getGPUStats() -- confirm against the NVML / ROCm SMI documentation.
struct GPU_stats {
  double clock;
  double power;
  double temperature;
};

/// Queries clock, power and temperature of one GPU through NVML (when built
/// with nvcc) or ROCm SMI (when built as HIP).
///
/// The management library is initialized on the first call only. Queries are
/// best effort: a reading whose query fails is reported as 0. When neither
/// backend is compiled in, all readings are 0.
///
/// Declared `inline` because this definition lives in a header and may be
/// included from several translation units.
///
/// @param deviceId  device index passed to the management library
/// @return the collected readings
inline GPU_stats getGPUStats(int deviceId) {
#ifdef __NVCC__
  static bool initialized = false;
  if (!initialized) {
    initialized = true;
    nvmlInit();
  }
  unsigned int power = 0;
  unsigned int clock = 0;
  unsigned int temperature = 0;

  nvmlDevice_t device;
  // Only query through a valid handle; on failure `device` is not assigned.
  if (nvmlDeviceGetHandleByIndex(deviceId, &device) == NVML_SUCCESS) {
    nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &clock);
    nvmlDeviceGetPowerUsage(device, &power);
    nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
  }

  // Explicit casts: unsigned int -> double inside braces is a narrowing
  // conversion.
  return {static_cast<double>(clock), static_cast<double>(power),
          static_cast<double>(temperature)};
#elif defined __HIP__

  static bool initialized = false;
  if (!initialized) {
    initialized = true;
    rsmi_init(0);
  }

  uint64_t power = 0;
  int64_t temperature = 0;
  int currentClock = 0;
  rsmi_frequencies_t clockStruct;

  rsmi_dev_temp_metric_get(deviceId, RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_CURRENT,
                           &temperature);
  rsmi_dev_power_ave_get(deviceId, 0, &power);
  // Index into the frequency table only when the query filled it in.
  if (rsmi_dev_gpu_clk_freq_get(deviceId, RSMI_CLK_TYPE_SYS, &clockStruct) ==
      RSMI_STATUS_SUCCESS) {
    currentClock =
        static_cast<int>(clockStruct.frequency[clockStruct.current] / 1e6);
  }

  // Integer scaling of the raw readings by 1/1000.
  power /= 1000;
  temperature /= 1000;

  return {static_cast<double>(currentClock), static_cast<double>(power),
          static_cast<double>(temperature)};
#else
  // No management library compiled in: report zeros instead of falling off the
  // end of a value-returning function.
  (void)deviceId;
  return {0.0, 0.0, 0.0};
#endif
}
61 | 
62 | #endif // GPU-STATS_H_
63 | 


--------------------------------------------------------------------------------
/gpu-metrics/README.md:
--------------------------------------------------------------------------------
 1 | # Performance Counter Measurement Library for AMD and NVIDIA GPUs
 2 | 
 3 | This folder contains a header that provides pairs of functions:
 4 | 
 5 | ```
 6 | void measureMetricsStart(std::vector<const char *> metricNames);
 7 | std::vector<double> measureMetricsStop();
 8 | ```
The second function returns the metrics specified in the start function, measured for the GPU kernel launched between the two calls. Launch only a single GPU kernel between them; otherwise it will probably crash.
For metricNames, any metric supported by your GPU can be used. Multiple metrics can be measured at the same time. The NVIDIA backend performs multiple passes if not all metrics can be profiled in a single pass; the rocprofiler backend crashes but suggests a different metric combination.
11 | 
There are two more pairs of start/stop functions:
13 | 
14 | ```
void measureDRAMBytesStart();
std::vector<double> measureDRAMBytesStop();

void measureL2BytesStart();
std::vector<double> measureL2BytesStop();
20 | ```
which contain the metric names and their evaluation for a small selection of GPU models. On the AMD side, they should work and have been tested on gfx90a and gfx1030; on the NVIDIA side, on sm_80 (A100). Don't forget to call
22 | 
23 | ```
24 | void initMeasureMetric();
25 | ```
26 | before doing anything.
27 | 
Example usage from gpu-l2-cache/main.cu (where it is currently commented out because it doesn't work on all models):
29 | 
30 | ```
31 | measureDRAMBytesStart();
32 | callKernel<N, blockSize>(blockCount, blockRun);
33 | auto metrics = measureDRAMBytesStop();
34 | dram_read.add(metrics[0]);
35 | dram_write.add(metrics[1]);
36 | 
37 | measureL2BytesStart();
38 | callKernel<N, blockSize>(blockCount, blockRun);
39 | metrics = measureL2BytesStop();
40 | L2_read.add(metrics[0]);
41 | L2_write.add(metrics[1]);
42 | ```
43 | 
The underlying APIs (NVIDIA PerfWorks and rocprofiler) are unstable and fragile, and tend to break if anything is slightly off. Issues and comments are welcome.
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/MeasurementSeries.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <algorithm>
 3 | #include <cmath>
 4 | #include <numeric>
 5 | #include <vector>
 6 | 
 7 | class MeasurementSeries {
 8 | public:
 9 |   void add(double v) { data.push_back(v); }
10 |   double value() {
11 |     if (data.size() == 0)
12 |       return 0.0;
13 |     if (data.size() == 1)
14 |       return data[0];
15 |     if (data.size() == 2)
16 |       return (data[0] + data[1]) / 2.0;
17 |     std::sort(begin(data), end(data));
18 |     return std::accumulate(begin(data) + 1, end(data) - 1, 0.0) /
19 |            (data.size() - 2);
20 |   }
21 |   double median() {
22 |     if (data.size() == 0)
23 |       return 0.0;
24 |     if (data.size() == 1)
25 |       return data[0];
26 |     if (data.size() == 2)
27 |       return (data[0] + data[1]) / 2.0;
28 | 
29 |     std::sort(begin(data), end(data));
30 |     if (data.size() % 2 == 0) {
31 |       return (data[data.size() / 2] + data[data.size() / 2 + 1]) / 2;
32 |     }
33 |     return data[data.size() / 2];
34 |   }
35 | 
36 |   double minValue() {
37 |     if (data.size() == 0)
38 |       return 0.0;
39 |     std::sort(begin(data), end(data));
40 |     return *begin(data);
41 |   }
42 | 
43 |   double getPercentile(double percentile) {
44 |     if (data.size() == 0)
45 |       return 0.0;
46 |     std::sort(begin(data), end(data));
47 |     int index = (int)round((data.size() - 1) * percentile);
48 |     return data[index];
49 |   }
50 | 
51 |   double maxValue() {
52 |     if (data.size() == 0)
53 |       return 0.0;
54 |     std::sort(begin(data), end(data));
55 |     return data.back();
56 |   }
57 | 
58 |   double spread() {
59 |     if (data.size() <= 1)
60 |       return 0.0;
61 |     if (data.size() == 2)
62 |       return abs(data[0] - data[1]) / value();
63 |     std::sort(begin(data), end(data));
64 |     return abs(*(begin(data)) - *(end(data) - 1)) / value();
65 |   }
66 |   int count() { return data.size(); }
67 | 
68 | private:
69 |   std::vector<double> data;
70 | };
71 | 


--------------------------------------------------------------------------------
/measure_metric/Parser.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
 5 | 
 6 | namespace NV {
 7 |     namespace Metric {
 8 |         namespace Parser {
 9 |             inline bool ParseMetricNameString(const std::string& metricName, std::string* reqName, bool* isolated, bool* keepInstances)
10 |             {
11 |                 std::string& name = *reqName;
12 |                 name = metricName;
13 |                 if (name.empty())
14 |                 {
15 |                     return false;
16 |                 }
17 | 
18 |                 // boost program_options sometimes inserts a \n between the metric name and a '&' at the end
19 |                 size_t pos = name.find('\n');
20 |                 if (pos != std::string::npos)
21 |                 {
22 |                     name.erase(pos, 1);
23 |                 }
24 | 
25 |                 // trim whitespace
26 |                 while (name.back() == ' ')
27 |                 {
28 |                     name.pop_back();
29 |                     if (name.empty())
30 |                     {
31 |                         return false;
32 |                     }
33 |                 }
34 | 
35 |                 *keepInstances = false;
36 |                 if (name.back() == '+')
37 |                 {
38 |                     *keepInstances = true;
39 |                     name.pop_back();
40 |                     if (name.empty())
41 |                     {
42 |                         return false;
43 |                     }
44 |                 }
45 | 
46 |                 *isolated = true;
47 |                 if (name.back() == '$')
48 |                 {
49 |                     name.pop_back();
50 |                     if (name.empty())
51 |                     {
52 |                         return false;
53 |                     }
54 |                 }
55 |                 else if (name.back() == '&')
56 |                 {
57 |                     *isolated = false;
58 |                     name.pop_back();
59 |                     if (name.empty())
60 |                     {
61 |                         return false;
62 |                     }
63 |                 }
64 | 
65 |                 return true;
66 |             }
67 |         }
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/gpu-metrics/cuda_metrics/Parser.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
 5 | 
 6 | namespace NV {
 7 |     namespace Metric {
 8 |         namespace Parser {
 9 |             inline bool ParseMetricNameString(const std::string& metricName, std::string* reqName, bool* isolated, bool* keepInstances)
10 |             {
11 |                 std::string& name = *reqName;
12 |                 name = metricName;
13 |                 if (name.empty())
14 |                 {
15 |                     return false;
16 |                 }
17 | 
18 |                 // boost program_options sometimes inserts a \n between the metric name and a '&' at the end
19 |                 size_t pos = name.find('\n');
20 |                 if (pos != std::string::npos)
21 |                 {
22 |                     name.erase(pos, 1);
23 |                 }
24 | 
25 |                 // trim whitespace
26 |                 while (name.back() == ' ')
27 |                 {
28 |                     name.pop_back();
29 |                     if (name.empty())
30 |                     {
31 |                         return false;
32 |                     }
33 |                 }
34 | 
35 |                 *keepInstances = false;
36 |                 if (name.back() == '+')
37 |                 {
38 |                     *keepInstances = true;
39 |                     name.pop_back();
40 |                     if (name.empty())
41 |                     {
42 |                         return false;
43 |                     }
44 |                 }
45 | 
46 |                 *isolated = true;
47 |                 if (name.back() == '$')
48 |                 {
49 |                     name.pop_back();
50 |                     if (name.empty())
51 |                     {
52 |                         return false;
53 |                     }
54 |                 }
55 |                 else if (name.back() == '&')
56 |                 {
57 |                     *isolated = false;
58 |                     name.pop_back();
59 |                     if (name.empty())
60 |                     {
61 |                         return false;
62 |                     }
63 |                 }
64 | 
65 |                 return true;
66 |             }
67 |         }
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/gpu-clock.cuh:
--------------------------------------------------------------------------------
 1 | #include "dtime.hpp"
 2 | #include "gpu-error.h"
 3 | #include <unistd.h>
 4 | #include <iostream>
 5 | 
 6 | #ifdef __NVCC__
 7 | #include <nvml.h>
 8 | #endif
 9 | #ifdef __HIP__
10 | #include <rocm_smi/rocm_smi.h>
11 | #endif
12 | 
13 | 
// Busy-work kernel used by getGPUClock() to keep the GPU occupied.
// Every thread loads A[0], applies `iters` dependent multiply-subtract updates
// scaled by its global thread index, and stores the result back to A[0].
//   A     - device pointer; only element 0 is read and written
//   iters - number of loop iterations per thread
__global__ void powerKernel(double* A, int iters) {
    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    double start = A[0];
    // Unroll factor 1 requests that the loop is not unrolled.
    #pragma unroll 1
    for(int i = 0; i < iters; i++) {
        start -= (tidx*0.1)*start;
    }
    // All threads store to the same element without synchronization.
    // getGPUClock() never reads the value back.
    A[0] = start;
}
24 | 
25 | 
26 | 
27 | unsigned int getGPUClock() {
28 | 
29 |     double* dA = NULL;
30 | #ifdef __NVCC__
31 |     GPU_ERROR(cudaMalloc(&dA, sizeof(double)));
32 | #endif
33 | #ifdef __HIP__
34 |     GPU_ERROR(hipMalloc(&dA, sizeof(double)));
35 | #endif
36 | 
37 |     unsigned int gpu_clock;
38 | 
39 | 
40 | 
41 |     int iters = 10;
42 | 
43 |     powerKernel<<<1000, 1024>>>(dA, iters);
44 | 
45 |   double dt = 0;
46 |   std::cout << "clock: ";
47 |   while (dt < 0.4) {
48 | #ifdef __NVCC__
49 |     GPU_ERROR(cudaDeviceSynchronize());
50 | #endif
51 | #ifdef __HIP__
52 |     GPU_ERROR(hipDeviceSynchronize());
53 | #endif
54 |     double t1 = dtime();
55 | 
56 |     powerKernel<<<1000, 1024>>>(dA, iters);
57 |     usleep(10000);
58 | 
59 | #ifdef __NVCC__
60 |     nvmlInit();
61 |     nvmlDevice_t device;
62 |     nvmlDeviceGetHandleByIndex(0, &device);
63 |     nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &gpu_clock);
64 |     GPU_ERROR(cudaDeviceSynchronize());
65 | #endif
66 | #ifdef __HIP__
67 |   int deviceId;
68 |     GPU_ERROR(hipGetDevice(&deviceId));
69 |     rsmi_status_t ret;
70 |     uint32_t num_devices;
71 |     uint16_t dev_id;
72 |     rsmi_frequencies_t clockStruct;
73 |     ret = rsmi_init(0);
74 |     ret = rsmi_num_monitor_devices(&num_devices);
75 |     ret = rsmi_dev_gpu_clk_freq_get(deviceId, RSMI_CLK_TYPE_SYS, &clockStruct);
76 |     gpu_clock = clockStruct.frequency[clockStruct.current] / 1e6;
77 |     GPU_ERROR(hipDeviceSynchronize());
78 | #endif
79 |     double t2 = dtime();
80 |     std::cout << gpu_clock << " ";
81 |     std::cout.flush();
82 |     dt = t2 - t1;
83 |     iters *= 2;
84 |   }
85 |   std::cout << "\n";
86 | #ifdef __NVCC__
87 |     GPU_ERROR(cudaFree(dA));
88 | #endif
89 | #ifdef __HIP__
90 |     GPU_ERROR(hipFree(dA));
91 | #endif
92 |   return gpu_clock;
93 | }
94 | 


--------------------------------------------------------------------------------
/um-stream/main.cu:
--------------------------------------------------------------------------------
 1 | #include "../MeasurementSeries.hpp"
 2 | #include "../dtime.hpp"
 3 | #include "../gpu-error.h"
 4 | #include <iomanip>
 5 | #include <iostream>
 6 | using namespace std;
 7 | 
 8 | __global__ void scale(double *A, double *B, size_t N) {
 9 |   size_t tidx = threadIdx.x + blockDim.x * blockIdx.x;
10 |   for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) {
11 |     A[i] = B[i] * 1.3;
12 |   }
13 | }
14 | 
15 | __global__ void triad(double *A, double *B, double *C, size_t N) {
16 |   size_t tidx = threadIdx.x + blockDim.x * blockIdx.x;
17 |   for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) {
18 |     A[i] = B[i] + C[i] * 1.3;
19 |   }
20 | }
21 | 
22 | int main(int argc, char **argv) {
23 |   double *A, *B, *C;
24 | 
25 |   cout << setw(12) << "buffer size" << setw(10) << "time" << setw(9) << "spread"
26 |        << setw(13) << "bandwidth\n";
27 | 
28 |   const int blockSize = 256;
29 | 
30 |   cudaDeviceProp prop;
31 |   int deviceId;
32 |   GPU_ERROR(cudaGetDevice(&deviceId));
33 |   GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId));
34 |   std::string deviceName = prop.name;
35 |   int smCount = prop.multiProcessorCount;
36 |   int maxActiveBlocks = 0;
37 |   GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks,
38 |                                                           triad, blockSize, 0));
39 | 
40 |   int blockCount = smCount * maxActiveBlocks;
41 | 
42 |   for (size_t N = 1024 * 1024; N < (size_t)1024 * 1024 * 1024 * 16; N *= 2) {
43 |     GPU_ERROR(cudaMallocManaged(&A, N * sizeof(double)));
44 |     GPU_ERROR(cudaMallocManaged(&B, N * sizeof(double)));
45 |     GPU_ERROR(cudaMallocManaged(&C, N * sizeof(double)));
46 | 
47 |     triad<<<blockCount, blockSize>>>(A, B, C, N);
48 |     //	scale<<<640, 256>>>(A, B, N);
49 |     GPU_ERROR(cudaDeviceSynchronize());
50 | 
51 |     MeasurementSeries time;
52 |     for (int i = 0; i < 5; i++) {
53 |       double t1 = dtime();
54 |       triad<<<640, 256>>>(A, B, C, N);
55 |       GPU_ERROR(cudaDeviceSynchronize());
56 |       double t2 = dtime();
57 |       time.add(t2 - t1);
58 |     }
59 | 
60 |     double bw = N * sizeof(double) * 3 / time.value() / 1.0e9;
61 |     cout << fixed << setprecision(1) << setw(9)
62 |          << 3 * N * sizeof(double) / (1 << 20) << " MB" << setw(8)
63 |          << time.value() * 1000 << "ms" << setprecision(1) << setw(8)
64 |          << time.spread() * 100 << "%" << setw(8) << bw << "GB/s\n";
65 | 
66 |     GPU_ERROR(cudaFree(A));
67 |     GPU_ERROR(cudaFree(B));
68 |     GPU_ERROR(cudaFree(C));
69 |   }
70 | }
71 | 


--------------------------------------------------------------------------------
/gpu-l2-cache/plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import csv
 5 | 
 6 | import sys
 7 | 
 8 | sys.path.append(".")
 9 | sys.path.append("..")
10 | from device_order import *
11 | 
12 | 
fig, ax = plt.subplots(figsize=(8, 4))
fig2, ax2 = plt.subplots(figsize=(8, 4))


# Plot one line per *.txt result file in the current directory, visiting the
# files in device order. Columns 2, 6 and 12 of each data row are used as the
# x value and the two y values.
for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)):
    if not filename.endswith(".txt"):
        continue

    with open(filename, newline="") as csvfile:

        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)
        sizes = []
        bw = []
        L2bw = []
        for row in csvreader:
            # csv.reader yields [] for blank lines; skip those together with
            # header rows and any row that does not start with a number.
            if not row or row[0] == "data" or not row[0].isnumeric():
                continue
            sizes.append(float(row[2]))
            bw.append(float(row[6]))
            L2bw.append(float(row[12]))

        # print(sizes)
        # print(bw)
        device_rank = getOrderNumber(filename)
        series_label = filename[:-4].upper()
        series_color = "C" + str(device_rank)
        ax.plot(sizes, bw, label=series_label, color=series_color, **lineStyle)
        ax2.plot(sizes, L2bw, label=series_label, color=series_color, **lineStyle)
        print(filename, device_rank)
51 | 
52 | 
# Axis labels and log2 x scale for both figures.
ax.set_xlabel("total data volume, MB")
ax.set_ylabel("GB/s")
ax.set_xscale("log", base=2)


# NOTE(review): this label says kB while the same x/1024 tick formatter as on
# `ax` (labelled MB) is applied below -- confirm which unit is intended.
ax2.set_xlabel("total data volume, kB")
ax2.set_ylabel("GB/s")
ax2.set_xscale("log", base=2)

# Tick labels show the tick value divided by 1024.
formatter = matplotlib.ticker.FuncFormatter(lambda x, pos: "{0:g}".format(x / 1024))
ax.get_xaxis().set_major_formatter(formatter)
# ax.get_yaxis().set_major_formatter(formatter)

ax2.get_xaxis().set_major_formatter(formatter)
# NOTE(review): the y axis of ax2 also gets the /1024 formatter although its
# label stays "GB/s" -- confirm this is intended.
ax2.get_yaxis().set_major_formatter(formatter)

ax.set_xticks([1024, 4 * 1024, 8 * 1024, 20 * 1024, 40 * 1024, 128 * 1024, 1024 * 1024])

ax2.set_xticks([1024, 6 * 1024, 20 * 1024, 40 * 1024, 128 * 1024])

fig.autofmt_xdate()
# Start the y axis at 0 and keep the automatically chosen upper limit.
ax.set_ylim([0, ax.get_ylim()[1]])
ax.set_xlim([1024 * 1.5, 1024 * 1024])

fig2.autofmt_xdate()
ax2.set_xlim([1024 * 1.5, 1024 * 1024])

# Overrides the lower x limit of `ax` set a few lines above (1024 * 1.5 -> 1024).
ax.set_xlim([1024, ax.get_xlim()[1]])
ax.legend()
fig.tight_layout()

ax2.legend()
fig2.tight_layout()

# Only the first figure is written to disk; saving the second is disabled.
plt.show()
fig.savefig("cuda-cache.svg")
# fig2.savefig("cuda-cache-l2.svg")
90 | 


--------------------------------------------------------------------------------
/gpu-cache/plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import csv
 5 | 
 6 | import sys
 7 | 
 8 | sys.path.append("..")
 9 | from device_order import *
10 | 
11 | 
fig, ax = plt.subplots(figsize=(8, 4))
fig2, ax2 = plt.subplots(figsize=(8, 4))


# Plot one line per *.txt result file in the current directory, visiting the
# files in device order. Columns 0, 4 and 10 of each data row are used as the
# x value and the two y values.
for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)):
    if not filename.endswith(".txt"):
        continue

    with open(filename, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)
        sizes = []
        bw = []
        L2bw = []
        for row in csvreader:
            # csv.reader yields [] for blank lines; skip those together with
            # header rows and any row that does not start with a number.
            if not row or row[0] == "data" or not row[0].isnumeric():
                continue
            sizes.append(float(row[0]))
            bw.append(float(row[4]))
            L2bw.append(float(row[10]))

        # print(sizes)
        # print(bw)
        device_rank = getOrderNumber(filename)
        print(filename, len(sizes), device_rank)

        # getOrderNumber() returns an index past the end of `order` for files
        # of unlisted devices; label those by file name instead of raising
        # IndexError.
        if device_rank < len(order):
            series_label = order[device_rank].upper()
        else:
            series_label = filename[:-4].upper()
        series_color = getDeviceColor(filename)

        ax.plot(sizes, bw, label=series_label, color=series_color, **lineStyle)
        ax2.plot(sizes, L2bw, label=series_label, color=series_color, **lineStyle)
53 | 
# Axis labels and log2 x scale for both figures.
ax.set_xlabel("data volume per SM/CU, kB")
ax.set_ylabel("GB/s")
ax.set_xscale("log", base=2)


ax2.set_xlabel("data volume per SM/CU, kB")
ax2.set_ylabel("GB/s")
ax2.set_xscale("log", base=2)

# Tick values below 1024 are labelled as kB; larger ones are divided by 1024
# and labelled as MB.
formatter = matplotlib.ticker.FuncFormatter(
    lambda x, pos: "{0:g} kB".format(x) if x < 1024 else "{0:g} MB".format(x / 1024)
)
ax.get_xaxis().set_major_formatter(formatter)
# ax.get_yaxis().set_major_formatter(formatter)

ax2.get_xaxis().set_major_formatter(formatter)
# ax2.get_yaxis().set_major_formatter(formatter)

ax.set_xticks([4, 16, 128, 256, 6 * 1024, 20 * 1024, 40 * 1024, 128 * 1024])

ax2.set_xticks([4, 16, 128, 256, 6 * 1024, 20 * 1024, 40 * 1024, 128 * 1024])

ax.set_xlim((1.8, 256 * 1024))
ax2.set_xlim((1.8, 256 * 1024))

fig.autofmt_xdate()
# Start the y axis at 0 and extend the automatic upper limit by 10%.
ax.set_ylim([0, ax.get_ylim()[1] * 1.1])

fig2.autofmt_xdate()
ax2.set_ylim([0, ax2.get_ylim()[1] * 1.1])

ax.legend()
fig.tight_layout()

ax2.legend()
fig2.tight_layout()

# The figures are written to disk after the interactive window is shown.
plt.show()
fig.savefig("cuda-cache.svg")
fig2.savefig("cuda-cache-l2.pdf")
94 | 


--------------------------------------------------------------------------------
/gpu-stream/rx6900xt.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      1280       1      2%     |  GB/s:          73         41         65        125         61         58
 3 |   32      2560       1    3.1%     |  GB/s:         145         82        129        237        119        113
 4 |   48      3840       1    4.7%     |  GB/s:         210        120        188        330        179        169
 5 |   64      5120       1    6.2%     |  GB/s:         278        160        246        398        235        223
 6 |   80      6400       1    7.8%     |  GB/s:         344        194        303        419        288        273
 7 |   96      7680       1    9.4%     |  GB/s:         412        230        354        430        336        319
 8 |  112      8960       1   10.9%     |  GB/s:         474        263        404        428        379        357
 9 |   64     10240       2   12.5%     |  GB/s:         459        259        392        431        360        342
10 |  160     12800       1   15.6%     |  GB/s:         506        387        463        430        461        449
11 |   96     15360       2   18.8%     |  GB/s:         505        404        462        430        461        452
12 |  128     20480       2   25.0%     |  GB/s:         503        505        459        430        458        458
13 |  160     25600       2   31.2%     |  GB/s:         502        508        459        430        458        458
14 |  192     30720       2   37.5%     |  GB/s:         502        508        458        430        459        458
15 |  224     35840       2   43.8%     |  GB/s:         501        508        457        430        457        458
16 |  256     40960       2   50.0%     |  GB/s:         501        506        457        431        456        456
17 |  288     46080       2   56.2%     |  GB/s:         501        505        457        431        456        456
18 |  320     51200       2   62.5%     |  GB/s:         500        504        456        432        456        456
19 |  352     56320       2   68.8%     |  GB/s:         500        504        456        434        456        456
20 |  384     61440       2   75.0%     |  GB/s:         500        504        456        432        455        455
21 |  416     66560       2   81.2%     |  GB/s:         501        504        456        436        456        456
22 |  448     71680       2   87.5%     |  GB/s:         500        504        456        436        456        456
23 |  480     76800       2   93.8%     |  GB/s:         501        504        456        437        456        456
24 |  512     81920       2  100.0%     |  GB/s:         500        505        457        433        455        456
25 | 


--------------------------------------------------------------------------------
/gpu-roofline/plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | 
 4 | import os
 5 | import csv
 6 | import matplotlib.pyplot as plt
 7 | import numpy as np
 8 | 
 9 | import sys
10 | 
11 | sys.path.append("..")
12 | from device_order import *
13 | 
14 | 
# Three stacked panels that share the x quantity (labelled "Arithmetic Intensity"
# below): throughput on top, power in the middle, clock at the bottom.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(7, 7))

# Measurement files to overlay; each one that yields data gets the next color.
filenames = ["h200.txt", "alex_a100_40.txt", "bxx.txt"]

colors = ["#349999", "#CC1343", "#649903", "#c7aa3e"]

c = 0
for filename in filenames:
    with open(filename, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)

        # A blank line starts a new group; only rows with exactly 16 fields are
        # treated as data. Each stored entry is
        # [x value, value plotted on ax1, value plotted on ax2, value plotted on ax3].
        datapoints = [[]]

        for row in csvreader:
            print(row)
            if len(row) == 0:
                datapoints.append([])
            elif len(row) == 16:
                datapoints[-1].append(
                    [float(row[5]), float(row[9]), float(row[13]), float(row[11])]
                )

    print(datapoints)
    print()

    # The series count is derived from the second group. A file without any
    # blank-line separator has no second group, which previously raised
    # IndexError; report it and move on to the next file instead.
    if len(datapoints) < 2:
        print("no second data group in " + filename + ", skipping", file=sys.stderr)
        continue

    groups = [d for d in datapoints if len(d) > 0]
    color = colors[c % len(colors)]

    # At most one series per file: entry i of every non-empty group.
    for i in range(min(1, len(datapoints[1]))):
        xs = [d[i][0] for d in groups]

        print([d[i][1] for d in groups])
        # The /1000 scaling matches the "TFlop/s" and "GHz" axis labels below.
        ax1.plot(xs, [d[i][1] / 1000 for d in groups], "-", color=color, label=filename)
        ax2.plot(xs, [d[i][2] for d in groups], "-", color=color)
        ax3.plot(xs, [d[i][3] / 1000 for d in groups], "-", color=color)
        c += 1


ax1.legend()

ax3.set_xlabel("Arithmetic Intensity, Flop/B")
ax1.set_ylabel("FP32, TFlop/s")
ax2.set_ylabel("Power, W")
ax3.set_ylabel("Clock, GHz")

# Anchor both axes of every panel at zero while keeping the auto-scaled upper limit.
for axis in (ax1, ax2, ax3):
    axis.set_ylim([0, axis.get_ylim()[1]])
    axis.set_xlim([0, axis.get_xlim()[1]])

# ax.set_xscale("log")
# ax2.set_xscale("log")

# ax.set_yscale("log")
# ax2.set_yscale("log")


fig.tight_layout()

plt.savefig("L40_plot.pdf", dpi=4000)
plt.show()
98 | 


--------------------------------------------------------------------------------
/cuda-memcpy/main.cu:
--------------------------------------------------------------------------------
 1 | #include <iomanip>
 2 | #include <iostream>
 3 | #include "../MeasurementSeries.hpp"
 4 | #include "../dtime.hpp"
 5 | #include "../gpu-error.h"
 6 | using namespace std;
 7 | 
 8 | int main(int argc, char** argv) {
 9 |     int deviceCount;
10 |     GPU_ERROR(cudaGetDeviceCount(&deviceCount));
11 | 
12 | 
13 |     vector<char*> deviceBuffers(deviceCount, nullptr);
14 |     char *host_buffer;
15 |     const size_t buffer_size_bytes = (size_t)2 * 1024 * 1024 * 1024;
16 | 
17 | 
18 |     for( int d  = 0; d < deviceCount; d++) {
19 |         GPU_ERROR(cudaSetDevice(d));
20 |         GPU_ERROR(cudaMalloc(& (deviceBuffers[d]), buffer_size_bytes));
21 |         GPU_ERROR(cudaDeviceSynchronize());
22 |     }
23 |     GPU_ERROR(cudaMallocHost(&host_buffer, buffer_size_bytes));
24 | 
25 | 
26 |     const int num_streams = 1;
27 |     cudaStream_t streams[num_streams];
28 | 
29 |     for (int i = 0; i < num_streams; i++) {
30 |         cudaStreamCreate(&streams[i]);
31 |     }
32 | 
33 |     memset(host_buffer, 0, buffer_size_bytes);
34 | 
35 |     for (size_t transfer_size_bytes = 2 << 16;
36 |        transfer_size_bytes <= buffer_size_bytes / num_streams;
37 |        transfer_size_bytes *= 16) {
38 | 
39 |         for(int d = 0; d < deviceCount; d++) {
40 |             GPU_ERROR(cudaSetDevice(d));
41 |             MeasurementSeries time;
42 |             for (int sample = 0; sample < 5; sample++) {
43 |                 memset(host_buffer, 0, buffer_size_bytes);
44 |                 double t1 = dtime();
45 |                 for (int stream = 0; stream < num_streams; stream++) {
46 |                     GPU_ERROR(cudaMemcpyAsync(
47 |                                   deviceBuffers[d] + (size_t)stream * transfer_size_bytes,
48 |                                   host_buffer + (size_t)stream * transfer_size_bytes,
49 |                                   transfer_size_bytes, cudaMemcpyDefault, streams[stream]));
50 |                 }
51 | 
52 |                 GPU_ERROR(cudaDeviceSynchronize());
53 |                 double t2 = dtime();
54 |                 time.add(t2 - t1);
55 |             }
56 |             double bw = num_streams * transfer_size_bytes / time.value();
57 |             cout << fixed  //
58 |                 << "Device: " << d << "   "
59 |                  << setw(10) << setprecision(0) << (transfer_size_bytes >> 10)
60 |                  << "kB  "                                                      //
61 |                  << setprecision(2) << setw(7) << time.value() * 1000 << "ms "  //
62 |                  << setprecision(2) << setw(7) << bw * 1e-9 << "GB/s   "        //
63 |                  << time.spread() * 100 << "%\n";
64 |         }
65 |         if(deviceCount > 1) cout << "\n";
66 |     }
67 | 
68 |     for(int d = 0; d< deviceCount; d++) {
69 |         GPU_ERROR(cudaFree(deviceBuffers[d]));
70 | }
71 |     //  GPU_ERROR(cudaFree(host_buffer));
72 |     GPU_ERROR(cudaFreeHost(host_buffer));
73 | }
74 | 


--------------------------------------------------------------------------------
/gpu-cache/mi100.txt:
--------------------------------------------------------------------------------
 1 | clock: 300 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 1502 
 2 |      data set   exec time     spread        Eff. bw       DRAM read      DRAM write         L2 read       L2 store
 3 |          4 kB       424ms       1.5%    9098.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 4 |          8 kB       469ms       6.5%    8304.3 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 5 |         16 kB       371ms       3.6%   10485.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 6 |         24 kB      1469ms      20.2%    2700.3 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 7 |         32 kB      1527ms      17.5%    2672.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 8 |         48 kB      1595ms      12.8%    2543.3 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 9 |         64 kB      1587ms       7.1%    2515.0 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
10 |         80 kB      1621ms       8.6%    2506.6 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
11 |         96 kB      1616ms       7.2%    2492.0 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
12 |        112 kB      1612ms       6.8%    2505.4 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
13 |        128 kB      1622ms      13.8%    2516.7 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
14 |        144 kB      1661ms      15.3%    2482.9 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
15 |        160 kB      1659ms      15.0%    2504.0 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
16 |        176 kB      1574ms      12.3%    2570.4 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
17 |        192 kB      1589ms      11.3%    2591.5 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
18 |        208 kB      1584ms      17.7%    2565.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
19 |        224 kB      1566ms       7.0%    2574.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
20 |        240 kB      1578ms      12.3%    2571.0 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
21 |        256 kB      1561ms      11.1%    2561.4 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
22 |        272 kB      1655ms       9.4%    2471.3 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
23 |        288 kB      1656ms      10.4%    2487.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
24 |        304 kB      1658ms       8.5%    2467.5 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
25 |        320 kB      1661ms       9.3%    2475.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
26 | 


--------------------------------------------------------------------------------
/gpu-strides/h200.txt:
--------------------------------------------------------------------------------
 1 | clock: 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 
 2 | float stride     0    0    1.01    126   1.00   1.00   0.03 
 3 | float stride     1    1    1.01    126   1.00   1.00   0.04 
 4 | float stride     2    2    2.01     64   2.00   1.22   0.02 
 5 | float stride     3    3    1.63     79   1.00   1.62   0.04 
 6 | float stride     4    4    4.01   32.0   4.00   1.84   0.05 
 7 | float stride     5    5    2.00     64   1.00   2.00   0.10 
 8 | float stride     6    6    2.32     55   2.00   2.31   0.04 
 9 | float stride     7    7    2.41     53   1.00   2.41   0.03 
10 | float stride     8    8     8.0   16.0    8.0   2.72   0.11 
11 | float stride     9    9    3.00   42.6   1.00   3.00   0.04 
12 | float stride    10   10    3.16   40.5   2.00   3.16   0.04 
13 | float stride    11   11    3.41   37.5   1.00   3.41   0.08 
14 | float stride    12   12    4.01   32.0   4.00   3.62   0.05 
15 | float stride    13   13    4.10   31.2   1.00   4.09   0.03 
16 | float stride    14   14    4.25   30.1   2.00   4.25   0.04 
17 | float stride    15   15    4.60   27.8   1.00   4.59   0.05 
18 | 
19 | double stride     0    0    2.01    128   2.00   1.00   0.04 
20 | double stride     1    1    2.01    128   2.00   1.19   0.03 
21 | double stride     2    2    4.01     64   4.00   1.75   0.08 
22 | double stride     3    3    2.32    110   2.00   2.31   0.05 
23 | double stride     4    4     8.0   32.0    8.0   2.78   0.06 
24 | double stride     5    5    3.16     81   2.00   3.16   0.05 
25 | double stride     6    6    4.01     64   4.00   3.72   0.03 
26 | double stride     7    7    4.32     59   2.00   4.31   0.04 
27 | double stride     8    8    16.0   16.0   16.0   4.66   0.13 
28 | double stride     9    9     5.3   48.7   2.00    5.2   0.05 
29 | double stride    10   10     5.5   46.8   4.00    5.5   0.18 
30 | double stride    11   11     6.0   42.6   2.00    6.0   0.04 
31 | double stride    12   12     8.0   32.0    8.0    6.6   0.09 
32 | double stride    13   13     7.0   36.5   2.00    7.0   0.06 
33 | double stride    14   14     7.0   36.5   4.00    7.0   0.05 
34 | double stride    15   15     8.0   32.0   2.00    8.0   0.05 
35 | 
36 | float block     1  4098     9.3   13.8   2.00    9.2   0.08 
37 | float block     2  4098    4.82   26.6   1.00   4.81   0.09 
38 | float block     4  4098    3.00   42.6   2.00   3.00   0.03 
39 | float block     8  4098    4.01   32.0   4.00   2.12   0.05 
40 | float block    16  4098    2.01     64   2.00   1.47   0.04 
41 | float block    32  4098    1.02    125   1.00   1.00   0.04 
42 | float block    64  4098    1.02    126   1.00   1.00   0.05 
43 | 
44 | double block     1  4098     9.0   28.4   4.00    9.0   0.07 
45 | double block     2  4098     5.3   48.7   2.00    5.2   0.07 
46 | double block     4  4098    4.01     64   4.00   3.25   0.05 
47 | double block     8  4098    4.01     64   4.00   2.27   0.03 
48 | double block    16  4098    2.01    128   2.00   1.38   0.04 
49 | double block    32  4098    2.01    128   2.00   1.00   0.02 
50 | double block    64  4098    2.01    128   2.00   1.00   0.06 
51 | 


--------------------------------------------------------------------------------
/gpu-l2-cache/sycl/sycl-gpu-l2-cache.cpp:
--------------------------------------------------------------------------------
 1 | #include <sycl/sycl.hpp>
 2 | #include <iostream>
 3 | #include <chrono>
 4 | #include <stdlib.h>
 5 | #include <numeric>
 6 | #include <iomanip>
 7 | #include <algorithm>
 8 | #include <vector>
 9 | 
10 | using namespace sycl;
11 | using dtype = double;
12 | 
13 | int main(int argc, char **argv) {
14 |   const int N = 64;
15 |   std::cout << std::setw(13) << "data set"   //
16 |        << std::setw(12) << "exec time"  //
17 |        << std::setw(11) << "spread"     //
18 |        << std::setw(15) << "Eff. bw\n"; //
19 | 
20 |   sycl::queue q{sycl::gpu_selector_v,sycl::property::queue::enable_profiling{}};
21 |   std::cout << "Running on GPU:" << q.get_device().get_info<sycl::info::device::name>()<< std::endl;
22 |      
23 | 
24 |   for (int blockRun = 3; blockRun < 10000; blockRun += max(1.0, blockRun * 0.1)) {
25 |     const int blockSize = 1024;
26 |     const int blockCount = 200000;
27 | 
28 |     std::vector<double> time;
29 | 
30 |     for (int i = 0; i < 11; i++) {
31 |       const size_t bufferCount = blockRun * blockSize * N + i * 128;
32 |       dtype *dA = malloc_device<dtype>(bufferCount, q);
33 |       dtype *dB = malloc_device<dtype>(bufferCount, q);
34 | 
35 |       q.parallel_for(range<1>(bufferCount), [=](id<1> idx) {
36 |         dA[idx] = dtype(1.1);
37 |         dB[idx] = dtype(1.1);
38 |       }).wait();
39 | 
40 |       auto start = std::chrono::high_resolution_clock::now();
41 |       q.parallel_for(nd_range<1>(range<1>(blockCount * blockSize), range<1>(blockSize)), [=](nd_item<1> item) {
42 |         int threadIdx = item.get_local_id(0);
43 |         int blockIdx = item.get_group(0);
44 | 
45 |         dtype localSum = dtype(0);
46 |         for (int i = 0; i < N / 2; i++) {
47 |           int idx = (blockSize * blockRun * i + (blockIdx % blockRun) * blockSize) * 2 + threadIdx;
48 |           localSum += dB[idx] * dB[idx + blockSize];
49 |         }
50 |         localSum *= (dtype)1.3;
51 |         if (threadIdx > 1233 || localSum == (dtype)23.12)
52 |           dA[threadIdx] += localSum;
53 |       }).wait();
54 |       auto end = std::chrono::high_resolution_clock::now();
55 |       auto elapsedtime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
56 |       time.push_back(std::chrono::duration<double>(elapsedtime).count());
57 | 
58 |       free(dA, q);
59 |       free(dB, q);
60 |     }
61 | 
62 |     std::sort(time.begin(), time.end());
63 |     double blockDV = N * blockSize * sizeof(dtype);
64 |     double bw = blockDV * blockCount / time[0] / 1.0e9; // time min value
65 | 
66 |     std::cout << std::fixed << std::setprecision(0) << std::setw(10) << blockDV / 1024 << " kB"
67 |               << std::fixed << std::setprecision(0) << std::setw(10) << blockDV * blockRun / 1024 << " kB"
68 |               << std::fixed << std::setprecision(0) << std::setw(10) << (time[0] * 1000.0) << "ms"
69 |               << std::setprecision(1) << std::setw(10)
70 |               << abs(*(begin(time)) - *(end(time) - 1)) /
71 |                      std::accumulate(begin(time) + 1, end(time) - 1, 0.0) / (time.size() - 2) * 100
72 |               << "%" << std::setw(10) << bw << " GB/s   " << std::endl;
73 |   }
74 | }


--------------------------------------------------------------------------------
/gpu-latency/plot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import os
  4 | import csv
  5 | 
  6 | import sys
  7 | 
  8 | sys.path.append("..")
  9 | from device_order import *
 10 | 
 11 | 
 12 | devicesToInclude = []
 13 | 
 14 | 
 15 | if len(sys.argv) > 1 and sys.argv[1] == "AMD":
 16 |     devicesToInclude = ["MI100", "MI210", "MI300X", "RX6900XT"]
 17 | 
 18 | if len(sys.argv) > 1 and sys.argv[1] == "NV":
 19 |     devicesToInclude = ["A40", "L40", "V100", "A100", "GH200"]
 20 | 
 21 | 
 22 | fig, ax = plt.subplots(figsize=(6, 4))
 23 | for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)):
 24 |     if not filename.endswith(".txt") or getOrderNumber(filename) > len(order):
 25 |         continue
 26 |     if len(devicesToInclude) > 0 and not any(
 27 |         [filename.upper().startswith(d) for d in devicesToInclude]
 28 |     ):
 29 |         continue
 30 | 
 31 |     with open(filename, newline="") as csvfile:
 32 |         csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)
 33 |         sizes = []
 34 |         min = []
 35 |         max = []
 36 |         avg = []
 37 |         med = []
 38 |         for row in csvreader:
 39 |             if len(row) < 8 or row[0] == "clock:":
 40 |                 continue
 41 |             sizes.append(float(row[2]))
 42 |             avg.append(float(row[4]))
 43 |             med.append(float(row[5]))
 44 |             min.append(float(row[6]))
 45 |             max.append(float(row[7]))
 46 | 
 47 |         print(filename, getOrderNumber(filename))
 48 | 
 49 |         ax.plot(
 50 |             sizes,
 51 |             med,
 52 |             # "-x",
 53 |             label=order[getOrderNumber(filename)].upper(),
 54 |             color=getDeviceColor(filename),
 55 |             **lineStyle
 56 |         )
 57 | 
 58 |         plt.fill_between(
 59 |             sizes, min, max, alpha=0.4, color=getDeviceColor(filename), edgecolor=None
 60 |         )
 61 | 
 62 | 
 63 | ax.set_xlabel("chain data volume, kB")
 64 | ax.set_ylabel("latency, cycles")
 65 | ax.set_xscale("log", base=2)
 66 | 
 67 | 
 68 | # ax.axvline(16)
 69 | # ax.axvline(4*1024)
 70 | 
 71 | formatter = matplotlib.ticker.FuncFormatter(
 72 |     lambda x, pos: "{0:g} kB".format(x) if x < 1024 else "{0:g} MB".format(x // 1024)
 73 | )
 74 | ax.get_xaxis().set_major_formatter(formatter)
 75 | # ax.get_yaxis().set_major_formatter(formatter)
 76 | 
 77 | ax.set_xticks(
 78 |     [
 79 |         16,
 80 |         32,
 81 |         128,
 82 |         256,
 83 |         4 * 1024,
 84 |         6 * 1024,
 85 |         8 * 1024,
 86 |         20 * 1024,
 87 |         30 * 1024,
 88 |         60 * 1024,
 89 |         128 * 1024,
 90 |         256 * 1024,
 91 |         512 * 1024,
 92 |     ]
 93 | )
 94 | ax.set_xlim([8, 800 * 1024])
 95 | 
 96 | 
 97 | ax.set_ylim([0, 980])
 98 | 
 99 | ax.set_yticks((0, 30, 100, 200, 300, 400, 500, 600, 700, 800, 900))
100 | 
101 | fig.autofmt_xdate()
102 | ax.legend()
103 | ax.set_ylim([0, ax.get_ylim()[1]])
104 | fig.tight_layout(pad=0)
105 | fig.savefig("latencies" + ("_" + sys.argv[1] if len(sys.argv) > 1 else "") + ".svg")
106 | fig.savefig("latencies" + ("_" + sys.argv[1] if len(sys.argv) > 1 else "") + ".pdf")
107 | 
108 | plt.show()
109 | 


--------------------------------------------------------------------------------
/gpu-small-kernels/a40_pt.txt:
--------------------------------------------------------------------------------
 1 | 4096  64kB     71     81     81     77     66     55  
 2 | 4341  67kB     74     86     86     81     70     58  
 3 | 4601  71kB     77     91     91     86     74     62  
 4 | 4877  76kB     80     96     96     91     79     65  
 5 | 5169  80kB     82    101    102     97     83     69  
 6 | 5479  85kB     78    105    108    102     88     73  
 7 | 5807  90kB     82    108    113    108     93     77  
 8 | 6155  96kB     86    113    118    115     99     82  
 9 | 6524  101kB     91    118    125    121    105     86  
10 | 6915  108kB     94    123    132    129    111     92  
11 | 7329  114kB     99    128    140    136    118     97  
12 | 7768  121kB    104    133    148    144    125    102  
13 | 8234  128kB    110    138    156    153    132    108  
14 | 8728  136kB    116    142    165    162    140    115  
15 | 9251  144kB    122    148    175    172    149    122  
16 | 9806  153kB    130    153    185    182    157    129  
17 | 10394  162kB    137    158    195    193    167    137  
18 | 11017  172kB    133    151    200    200    177    145  
19 | 11678  182kB    140    159    207    210    187    154  
20 | 12378  193kB    146    168    216    220    198    163  
21 | 13120  205kB    152    177    226    233    210    172  
22 | 13907  217kB    160    185    235    246    222    183  
23 | 14741  230kB    168    196    244    260    235    193  
24 | 15625  244kB    179    206    254    275    249    205  
25 | 16562  258kB    175    214    262    290    264    217  
26 | 17555  274kB    184    228    272    306    279    230  
27 | 18608  290kB    190    239    283    323    296    244  
28 | 19724  308kB    199    254    292    341    313    258  
29 | 20907  326kB    210    268    303    359    332    274  
30 | 22161  346kB    208    258    290    352    349    289  
31 | 23490  367kB    215    273    305    357    367    305  
32 | 24899  389kB    224    283    322    374    385    323  
33 | 26392  412kB    238    298    340    393    409    342  
34 | 27975  437kB    237    312    350    412    432    362  
35 | 29653  463kB    242    330    371    430    458    382  
36 | 31432  491kB    256    349    393    448    484    405  
37 | 33317  520kB    257    340    411    468    509    428  
38 | 35316  551kB    262    353    436    488    537    453  
39 | 37434  584kB    278    370    459    507    563    478  
40 | 39680  620kB    275    388    486    525    593    506  
41 | 42060  657kB    288    412    511    544    621    534  
42 | 44583  696kB    286    405    494    523    579    556  
43 | 47257  738kB    300    418    522    551    593    582  
44 | 50092  782kB    300    438    540    581    619    612  
45 | 53097  829kB    313    464    571    616    650    645  
46 | 56282  879kB    313    460    597    636    679    680  
47 | 59658  932kB    319    472    632    671    703    716  
48 | 63237  988kB    328    501    667    711    731    753  
49 | 67031  1047kB    331    499    651    737    764    787  
50 | 71052  1110kB    336    513    674    778    795    825  
51 | 75315  1176kB    342    531    708    822    820    865  
52 | 79833  1247kB    351    535    741    874    845    904  
53 | 84622  1322kB    353    563    787    922    871    946  
54 | 89699  1401kB    358    559    775    896    842  


--------------------------------------------------------------------------------
/gpu-stream/a40.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      1344       1      1%     |  GB/s:          52         20         39         73         36         32
 3 |   32      2688       1    2.1%     |  GB/s:         104         41         78        144         71         63
 4 |   48      4032       1    3.1%     |  GB/s:         154         60        112        208        105         93
 5 |   64      5376       1    4.2%     |  GB/s:         205         80        150        276        138        123
 6 |   80      6720       1    5.2%     |  GB/s:         253         97        181        331        169        149
 7 |   96      8064       1    6.2%     |  GB/s:         304        117        217        389        200        176
 8 |  112      9408       1    7.3%     |  GB/s:         351        134        246        435        228        194
 9 |   64     10752       2    8.3%     |  GB/s:         400        160        291        498        268        237
10 |  160     13440       1   10.4%     |  GB/s:         483        188        339        549        308        254
11 |   96     16128       2   12.5%     |  GB/s:         589        233        408        610        377        325
12 |  128     21504       2   16.7%     |  GB/s:         680        305        515        653        466        391
13 |  160     26880       2   20.8%     |  GB/s:         680        371        576        666        540        404
14 |  192     32256       2   25.0%     |  GB/s:         680        436        614        670        587        447
15 |  224     37632       2   29.2%     |  GB/s:         680        494        631        671        616        468
16 |  256     43008       2   33.3%     |  GB/s:         680        553        644        670        630        512
17 |  288     48384       2   37.5%     |  GB/s:         680        598        652        670        641        502
18 |  320     53760       2   41.7%     |  GB/s:         680        639        659        670        650        531
19 |  352     59136       2   45.8%     |  GB/s:         680        662        657        670        658        558
20 |  384     64512       2   50.0%     |  GB/s:         680        677        658        670        656        597
21 |  416     69888       2   54.2%     |  GB/s:         680        680        658        670        656        602
22 |  448     75264       2   58.3%     |  GB/s:         680        680        658        670        657        621
23 |  480     80640       2   62.5%     |  GB/s:         680        680        658        670        657        630
24 |  512     86016       2   66.7%     |  GB/s:         680        680        657        670        656        641
25 |  544     91392       2   70.8%     |  GB/s:         680        680        657        670        655        646
26 |  576     96768       2   75.0%     |  GB/s:         680        680        657        670        655        656
27 |  608    102144       2   79.2%     |  GB/s:         680        680        656        670        654        658
28 |  640    107520       2   83.3%     |  GB/s:         680        680        655        670        653        656
29 |  672    112896       2   87.5%     |  GB/s:         680        680        655        670        653        656
30 |  704    118272       2   91.7%     |  GB/s:         680        680        655        670        653        656
31 |  736    123648       2   95.8%     |  GB/s:         680        680        654        670        652        656
32 |  768    129024       2  100.0%     |  GB/s:         680        680        653        670        651        656
33 | 


--------------------------------------------------------------------------------
/gpu-stream/l40.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      2272       1      1%     |  GB/s:          81         41         78        147         73         66
 3 |   32      4544       1    2.1%     |  GB/s:         161         83        155        290        143        130
 4 |   48      6816       1    3.1%     |  GB/s:         242        121        224        413        211        192
 5 |   64      9088       1    4.2%     |  GB/s:         322        163        297        532        276        251
 6 |   80     11360       1    5.2%     |  GB/s:         403        199        357        617        337        304
 7 |   96     13632       1    6.2%     |  GB/s:         483        240        424        690        397        359
 8 |  112     15904       1    7.3%     |  GB/s:         564        274        479        734        450        397
 9 |   64     18176       2    8.3%     |  GB/s:         644        321        551        781        518        474
10 |  160     22720       1   10.4%     |  GB/s:         805        386        634        805        596        519
11 |   96     27264       2   12.5%     |  GB/s:         847        470        719        826        693        641
12 |  128     36352       2   16.7%     |  GB/s:         847        617        774        830        758        728
13 |  160     45440       2   20.8%     |  GB/s:         847        740        800        830        789        761
14 |  192     54528       2   25.0%     |  GB/s:         847        824        773        831        812        788
15 |  224     63616       2   29.2%     |  GB/s:         847        844        773        831        763        769
16 |  256     72704       2   33.3%     |  GB/s:         847        845        795        830        764        765
17 |  288     81792       2   37.5%     |  GB/s:         847        845        796        830        764        763
18 |  320     90880       2   41.7%     |  GB/s:         847        845        796        830        770        765
19 |  352     99968       2   45.8%     |  GB/s:         847        845        797        830        769        765
20 |  384    109056       2   50.0%     |  GB/s:         847        845        797        830        771        768
21 |  416    118144       2   54.2%     |  GB/s:         847        845        794        830        768        768
22 |  448    127232       2   58.3%     |  GB/s:         847        845        797        830        769        767
23 |  480    136320       2   62.5%     |  GB/s:         847        845        797        830        766        767
24 |  512    145408       2   66.7%     |  GB/s:         847        845        797        830        769        766
25 |  544    154496       2   70.8%     |  GB/s:         847        845        795        830        768        765
26 |  576    163584       2   75.0%     |  GB/s:         847        845        795        830        768        766
27 |  608    172672       2   79.2%     |  GB/s:         847        845        795        830        769        765
28 |  640    181760       2   83.3%     |  GB/s:         846        845        781        830        770        767
29 |  672    190848       2   87.5%     |  GB/s:         847        845        778        830        769        766
30 |  704    199936       2   91.7%     |  GB/s:         847        845        779        830        768        767
31 |  736    209024       2   95.8%     |  GB/s:         846        845        778        830        770        769
32 |  768    218112       2  100.0%     |  GB/s:         847        845        776        830        768        766
33 | 


--------------------------------------------------------------------------------
/gpu-stream/h100_pcie.txt:
--------------------------------------------------------------------------------
 1 | blockSize   threads       %occ  |                init       read       scale     triad       3pt        5pt
 2 |        32        3648      3 %  |  GB/s:         228         96        183        254        168        164
 3 |        64        7296    6.2 %  |  GB/s:         452        189        341        459        316        310
 4 |        96       10944    9.4 %  |  GB/s:         676        277        472        635        443        436
 5 |       128       14592   12.5 %  |  GB/s:         888        368        607        821        567        558
 6 |       160       18240   15.6 %  |  GB/s:        1093        449        704        966        680        670
 7 |       192       21888   18.8 %  |  GB/s:        1301        533        817       1121        794        781
 8 |       224       25536   21.9 %  |  GB/s:        1495        612        925       1264        903        889
 9 |       256       29184   25.0 %  |  GB/s:        1686        702       1037       1399       1005        989
10 |       288       32832   28.1 %  |  GB/s:        1832        764       1124       1487       1100       1082
11 |       320       36480   31.2 %  |  GB/s:        2015        841       1213       1564       1188       1169
12 |       352       40128   34.4 %  |  GB/s:        2016        908       1295       1615       1269       1250
13 |       384       43776   37.5 %  |  GB/s:        2016        985       1378       1644       1348       1326
14 |       416       47424   40.6 %  |  GB/s:        2016       1045       1439       1641       1415       1395
15 |       448       51072   43.8 %  |  GB/s:        2016       1116       1497       1649       1472       1453
16 |       480       54720   46.9 %  |  GB/s:        2016       1179       1544       1655       1521       1505
17 |       512       58368   50.0 %  |  GB/s:        2017       1261       1583       1675       1556       1545
18 |       544       62016   53.1 %  |  GB/s:        2016       1300       1591       1669       1572       1563
19 |       576       65664   56.2 %  |  GB/s:        2016       1362       1607       1678       1587       1579
20 |       608       69312   59.4 %  |  GB/s:        2018       1416       1619       1689       1598       1592
21 |       640       72960   62.5 %  |  GB/s:        2016       1473       1639       1712       1613       1607
22 |       672       76608   65.6 %  |  GB/s:        2016       1527       1638       1714       1618       1613
23 |       704       80256   68.8 %  |  GB/s:        2015       1578       1644       1725       1625       1619
24 |       736       83904   71.9 %  |  GB/s:        2016       1624       1651       1738       1632       1628
25 |       768       87552   75.0 %  |  GB/s:        2016       1680       1666       1755       1642       1638
26 |       800       91200   78.1 %  |  GB/s:        2015       1714       1663       1758       1645       1642
27 |       832       94848   81.2 %  |  GB/s:        2016       1759       1668       1770       1649       1647
28 |       864       98496   84.4 %  |  GB/s:        2016       1795       1673       1779       1654       1651
29 |       896      102144   87.5 %  |  GB/s:        2016       1837       1686       1796       1663       1662
30 |       928      105792   90.6 %  |  GB/s:        2018       1871       1684       1800       1666       1664
31 |       960      109440   93.8 %  |  GB/s:        2016       1897       1688       1808       1672       1670
32 |       992      113088   96.9 %  |  GB/s:        2016       1919       1693       1818       1678       1675
33 |      1024      116736  100.0 %  |  GB/s:        2016       1942       1704       1832       1686       1683
34 | 


--------------------------------------------------------------------------------
/gpu-stream/past_results/h100_pcie.txt:
--------------------------------------------------------------------------------
 1 | blockSize   threads       %occ  |                init       read       scale     triad       3pt        5pt
 2 |        32        3648      3 %  |  GB/s:         228         96        183        254        168        164
 3 |        64        7296    6.2 %  |  GB/s:         452        189        341        459        316        310
 4 |        96       10944    9.4 %  |  GB/s:         676        277        472        635        443        436
 5 |       128       14592   12.5 %  |  GB/s:         888        368        607        821        567        558
 6 |       160       18240   15.6 %  |  GB/s:        1093        449        704        966        680        670
 7 |       192       21888   18.8 %  |  GB/s:        1301        533        817       1121        794        781
 8 |       224       25536   21.9 %  |  GB/s:        1495        612        925       1264        903        889
 9 |       256       29184   25.0 %  |  GB/s:        1686        702       1037       1399       1005        989
10 |       288       32832   28.1 %  |  GB/s:        1832        764       1124       1487       1100       1082
11 |       320       36480   31.2 %  |  GB/s:        2015        841       1213       1564       1188       1169
12 |       352       40128   34.4 %  |  GB/s:        2016        908       1295       1615       1269       1250
13 |       384       43776   37.5 %  |  GB/s:        2016        985       1378       1644       1348       1326
14 |       416       47424   40.6 %  |  GB/s:        2016       1045       1439       1641       1415       1395
15 |       448       51072   43.8 %  |  GB/s:        2016       1116       1497       1649       1472       1453
16 |       480       54720   46.9 %  |  GB/s:        2016       1179       1544       1655       1521       1505
17 |       512       58368   50.0 %  |  GB/s:        2017       1261       1583       1675       1556       1545
18 |       544       62016   53.1 %  |  GB/s:        2016       1300       1591       1669       1572       1563
19 |       576       65664   56.2 %  |  GB/s:        2016       1362       1607       1678       1587       1579
20 |       608       69312   59.4 %  |  GB/s:        2018       1416       1619       1689       1598       1592
21 |       640       72960   62.5 %  |  GB/s:        2016       1473       1639       1712       1613       1607
22 |       672       76608   65.6 %  |  GB/s:        2016       1527       1638       1714       1618       1613
23 |       704       80256   68.8 %  |  GB/s:        2015       1578       1644       1725       1625       1619
24 |       736       83904   71.9 %  |  GB/s:        2016       1624       1651       1738       1632       1628
25 |       768       87552   75.0 %  |  GB/s:        2016       1680       1666       1755       1642       1638
26 |       800       91200   78.1 %  |  GB/s:        2015       1714       1663       1758       1645       1642
27 |       832       94848   81.2 %  |  GB/s:        2016       1759       1668       1770       1649       1647
28 |       864       98496   84.4 %  |  GB/s:        2016       1795       1673       1779       1654       1651
29 |       896      102144   87.5 %  |  GB/s:        2016       1837       1686       1796       1663       1662
30 |       928      105792   90.6 %  |  GB/s:        2018       1871       1684       1800       1666       1664
31 |       960      109440   93.8 %  |  GB/s:        2016       1897       1688       1808       1672       1670
32 |       992      113088   96.9 %  |  GB/s:        2016       1919       1693       1818       1678       1675
33 |      1024      116736  100.0 %  |  GB/s:        2016       1942       1704       1832       1686       1683
34 | 


--------------------------------------------------------------------------------
/gpu-cache/mi210.txt:
--------------------------------------------------------------------------------
 1 | clock: 800 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 1700 
 2 |      data set   exec time     spread        Eff. bw       DRAM read      DRAM write         L2 read       L2 store
 3 |          4 kB       326ms       1.0%   10227.6 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 4 |          8 kB       316ms       0.8%   10555.5 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 5 |         16 kB       317ms       1.0%   10518.7 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 6 |         24 kB       647ms      48.6%    7019.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 7 |         32 kB       521ms      42.3%    8437.3 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 8 |         48 kB       973ms      45.7%    4695.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
 9 |         64 kB       924ms      46.2%    4941.5 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
10 |         80 kB      1093ms      47.4%    4246.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
11 |         96 kB      1113ms      47.6%    4172.5 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
12 |        112 kB       999ms      45.4%    4551.6 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
13 |        128 kB      1044ms      44.3%    4299.3 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
14 |        144 kB      1171ms      42.7%    3802.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
15 |        160 kB      1166ms      42.0%    3775.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
16 |        176 kB       862ms      38.0%    4860.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
17 |        192 kB       859ms      38.1%    4922.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
18 |        208 kB       858ms      39.3%    4953.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
19 |        224 kB       858ms      39.0%    4930.7 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
20 |        240 kB       854ms      38.1%    4929.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
21 |        256 kB       852ms      36.7%    4847.0 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
22 |        272 kB      1321ms      41.9%    3234.9 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
23 |        288 kB      1327ms      42.0%    3253.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
24 |        304 kB      1326ms      40.9%    3229.6 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
25 |        320 kB      1331ms      39.7%    3213.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
26 |        336 kB      1338ms      39.8%    3209.6 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
27 |        352 kB      1339ms      40.2%    3200.6 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
28 |        368 kB      1335ms      39.9%    3200.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
29 |        384 kB      1332ms      42.3%    3244.7 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
30 |        400 kB      1326ms      41.5%    3219.7 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
31 |        416 kB      1305ms      42.3%    3209.2 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
32 |        432 kB      1276ms      43.7%    3245.9 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
33 |        448 kB      1247ms      47.9%    3345.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
34 |        464 kB      1207ms      52.1%    3489.1 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
35 |        480 kB      1159ms      54.1%    3704.8 GB/s         0 GB/s          0 GB/s          0 GB/s          0 GB/s 
36 | 


--------------------------------------------------------------------------------
/gpu-stream/a100_40.txt:
--------------------------------------------------------------------------------
 1 | blockSize   threads       %occ  |                init       read       scale     triad       3pt        5pt
 2 |        16        1728    0.8 %  |  GB/s:          54         20         39         55         37         37
 3 |        32        3456    1.6 %  |  GB/s:         107         41         77        110         73         72
 4 |        48        5184    2.3 %  |  GB/s:         161         61        112        158        109        107
 5 |        64        6912    3.1 %  |  GB/s:         208         82        151        212        143        140
 6 |        96       10368    4.7 %  |  GB/s:         317        119        215        300        205        201
 7 |       128       13824    6.2 %  |  GB/s:         412        162        292        406        270        265
 8 |       160       17280    7.8 %  |  GB/s:         510        193        337        465        322        314
 9 |       192       20736    9.4 %  |  GB/s:         609        235        407        562        389        383
10 |       224       24192   10.9 %  |  GB/s:         697        264        450        618        433        422
11 |       256       27648   12.5 %  |  GB/s:         802        310        548        743        506        498
12 |       320       34560   15.6 %  |  GB/s:         976        377        630        858        609        595
13 |       384       41472   18.8 %  |  GB/s:        1159        449        749       1006        714        698
14 |       448       48384   21.9 %  |  GB/s:        1329        514        836       1112        812        792
15 |       512       55296   25.0 %  |  GB/s:        1501        592        956       1232        909        887
16 |       576       62208   28.1 %  |  GB/s:        1539        645       1021       1290        993        966
17 |       640       69120   31.2 %  |  GB/s:        1538        713       1112       1333       1081       1052
18 |       704       76032   34.4 %  |  GB/s:        1538        769       1180       1332       1151       1119
19 |       768       82944   37.5 %  |  GB/s:        1539        838       1245       1346       1209       1180
20 |       832       89856   40.6 %  |  GB/s:        1539        887       1284       1344       1258       1231
21 |       896       96768   43.8 %  |  GB/s:        1538        946       1318       1353       1298       1277
22 |       960      103680   46.9 %  |  GB/s:        1539        989       1324       1353       1316       1305
23 |      1024      110592   50.0 %  |  GB/s:        1536       1064       1338       1360       1325       1322
24 |      1088      117504   53.1 %  |  GB/s:        1537       1079       1338       1360       1331       1327
25 |      1152      124416   56.2 %  |  GB/s:        1539       1138       1347       1365       1337       1335
26 |      1216      131328   59.4 %  |  GB/s:        1537       1175       1347       1366       1339       1336
27 |      1280      138240   62.5 %  |  GB/s:        1537       1224       1354       1370       1343       1339
28 |      1344      145152   65.6 %  |  GB/s:        1537       1251       1355       1372       1347       1344
29 |      1408      152064   68.8 %  |  GB/s:        1538       1316       1365       1375       1352       1348
30 |      1472      158976   71.9 %  |  GB/s:        1538       1335       1363       1368       1353       1349
31 |      1536      165888   75.0 %  |  GB/s:        1539       1372       1367       1365       1356       1351
32 |      1600      172800   78.1 %  |  GB/s:        1536       1379       1366       1367       1358       1353
33 |      1664      179712   81.2 %  |  GB/s:        1539       1403       1368       1366       1360       1356
34 |      1728      186624   84.4 %  |  GB/s:        1539       1419       1369       1366       1362       1357
35 |      1792      193536   87.5 %  |  GB/s:        1539       1440       1373       1363       1363       1359
36 |      1856      200448   90.6 %  |  GB/s:        1538       1453       1374       1364       1365       1361
37 |      1920      207360   93.8 %  |  GB/s:        1538       1473       1376       1362       1367       1364
38 |      1984      214272   96.9 %  |  GB/s:        1537       1483       1377       1362       1369       1365
39 |      2048      221184  100.0 %  |  GB/s:        1539       1500       1382       1358       1371       1368
40 | 


--------------------------------------------------------------------------------
/gpu-stream/past_results/a100_40.txt:
--------------------------------------------------------------------------------
 1 | blockSize   threads       %occ  |                init       read       scale     triad       3pt        5pt
 2 |        16        1728    0.8 %  |  GB/s:          54         20         39         55         37         37
 3 |        32        3456    1.6 %  |  GB/s:         107         41         77        110         73         72
 4 |        48        5184    2.3 %  |  GB/s:         161         61        112        158        109        107
 5 |        64        6912    3.1 %  |  GB/s:         208         82        151        212        143        140
 6 |        96       10368    4.7 %  |  GB/s:         317        119        215        300        205        201
 7 |       128       13824    6.2 %  |  GB/s:         412        162        292        406        270        265
 8 |       160       17280    7.8 %  |  GB/s:         510        193        337        465        322        314
 9 |       192       20736    9.4 %  |  GB/s:         609        235        407        562        389        383
10 |       224       24192   10.9 %  |  GB/s:         697        264        450        618        433        422
11 |       256       27648   12.5 %  |  GB/s:         802        310        548        743        506        498
12 |       320       34560   15.6 %  |  GB/s:         976        377        630        858        609        595
13 |       384       41472   18.8 %  |  GB/s:        1159        449        749       1006        714        698
14 |       448       48384   21.9 %  |  GB/s:        1329        514        836       1112        812        792
15 |       512       55296   25.0 %  |  GB/s:        1501        592        956       1232        909        887
16 |       576       62208   28.1 %  |  GB/s:        1539        645       1021       1290        993        966
17 |       640       69120   31.2 %  |  GB/s:        1538        713       1112       1333       1081       1052
18 |       704       76032   34.4 %  |  GB/s:        1538        769       1180       1332       1151       1119
19 |       768       82944   37.5 %  |  GB/s:        1539        838       1245       1346       1209       1180
20 |       832       89856   40.6 %  |  GB/s:        1539        887       1284       1344       1258       1231
21 |       896       96768   43.8 %  |  GB/s:        1538        946       1318       1353       1298       1277
22 |       960      103680   46.9 %  |  GB/s:        1539        989       1324       1353       1316       1305
23 |      1024      110592   50.0 %  |  GB/s:        1536       1064       1338       1360       1325       1322
24 |      1088      117504   53.1 %  |  GB/s:        1537       1079       1338       1360       1331       1327
25 |      1152      124416   56.2 %  |  GB/s:        1539       1138       1347       1365       1337       1335
26 |      1216      131328   59.4 %  |  GB/s:        1537       1175       1347       1366       1339       1336
27 |      1280      138240   62.5 %  |  GB/s:        1537       1224       1354       1370       1343       1339
28 |      1344      145152   65.6 %  |  GB/s:        1537       1251       1355       1372       1347       1344
29 |      1408      152064   68.8 %  |  GB/s:        1538       1316       1365       1375       1352       1348
30 |      1472      158976   71.9 %  |  GB/s:        1538       1335       1363       1368       1353       1349
31 |      1536      165888   75.0 %  |  GB/s:        1539       1372       1367       1365       1356       1351
32 |      1600      172800   78.1 %  |  GB/s:        1536       1379       1366       1367       1358       1353
33 |      1664      179712   81.2 %  |  GB/s:        1539       1403       1368       1366       1360       1356
34 |      1728      186624   84.4 %  |  GB/s:        1539       1419       1369       1366       1362       1357
35 |      1792      193536   87.5 %  |  GB/s:        1539       1440       1373       1363       1363       1359
36 |      1856      200448   90.6 %  |  GB/s:        1538       1453       1374       1364       1365       1361
37 |      1920      207360   93.8 %  |  GB/s:        1538       1473       1376       1362       1367       1364
38 |      1984      214272   96.9 %  |  GB/s:        1537       1483       1377       1362       1369       1365
39 |      2048      221184  100.0 %  |  GB/s:        1539       1500       1382       1358       1371       1368
40 | 


--------------------------------------------------------------------------------
/gpu-stream/a100_80.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      1728       1    0.8%     |  GB/s:          53         21         40         72         38         38
 3 |   32      3456       1    1.6%     |  GB/s:         106         42         80        142         76         74
 4 |   48      5184       1    2.3%     |  GB/s:         158         63        115        205        112        110
 5 |   64      6912       1    3.1%     |  GB/s:         210         85        159        280        146        143
 6 |   80      8640       1    3.9%     |  GB/s:         261        103        183        329        179        176
 7 |   96     10368       1    4.7%     |  GB/s:         312        123        223        394        211        207
 8 |  112     12096       1    5.5%     |  GB/s:         362        142        250        445        244        240
 9 |   64     13824       2    6.2%     |  GB/s:         406        166        303        534        278        273
10 |  160     17280       1    7.8%     |  GB/s:         503        199        351        611        333        325
11 |   96     20736       2    9.4%     |  GB/s:         601        242        424        744        403        396
12 |  128     27648       2   12.5%     |  GB/s:         792        323        579        993        527        518
13 |  160     34560       2   15.6%     |  GB/s:         965        389        662       1130        635        619
14 |  192     41472       2   18.8%     |  GB/s:        1148        465        791       1324        748        730
15 |  224     48384       2   21.9%     |  GB/s:        1317        531        885       1443        854        832
16 |  256     55296       2   25.0%     |  GB/s:        1488        615       1025       1571        961        936
17 |  288     62208       2   28.1%     |  GB/s:        1657        669       1088       1582       1049       1018
18 |  320     69120       2   31.2%     |  GB/s:        1806        740       1195       1633       1147       1113
19 |  352     76032       2   34.4%     |  GB/s:        1891        801       1276       1628       1229       1190
20 |  384     82944       2   37.5%     |  GB/s:        1901        872       1379       1667       1311       1269
21 |  416     89856       2   40.6%     |  GB/s:        1896        924       1429       1651       1375       1330
22 |  448     96768       2   43.8%     |  GB/s:        1901        986       1498       1673       1438       1402
23 |  480    103680       2   46.9%     |  GB/s:        1900       1037       1520       1670       1472       1444
24 |  512    110592       2   50.0%     |  GB/s:        1908       1119       1577       1697       1516       1491
25 |  544    117504       2   53.1%     |  GB/s:        1900       1134       1574       1688       1531       1507
26 |  576    124416       2   56.2%     |  GB/s:        1898       1198       1609       1705       1560       1543
27 |  608    131328       2   59.4%     |  GB/s:        1900       1246       1613       1704       1575       1555
28 |  640    138240       2   62.5%     |  GB/s:        1905       1306       1646       1726       1598       1579
29 |  672    145152       2   65.6%     |  GB/s:        1900       1341       1639       1720       1609       1591
30 |  704    152064       2   68.8%     |  GB/s:        1901       1393       1660       1734       1626       1608
31 |  736    158976       2   71.9%     |  GB/s:        1894       1432       1660       1735       1633       1618
32 |  768    165888       2   75.0%     |  GB/s:        1901       1491       1682       1753       1644       1629
33 |  800    172800       2   78.1%     |  GB/s:        1898       1511       1674       1748       1653       1636
34 |  832    179712       2   81.2%     |  GB/s:        1896       1555       1689       1760       1662       1645
35 |  864    186624       2   84.4%     |  GB/s:        1898       1582       1689       1761       1667       1651
36 |  896    193536       2   87.5%     |  GB/s:        1900       1622       1706       1775       1672       1656
37 |  928    200448       2   90.6%     |  GB/s:        1896       1645       1700       1773       1680       1662
38 |  960    207360       2   93.8%     |  GB/s:        1898       1687       1711       1781       1686       1670
39 |  992    214272       2   96.9%     |  GB/s:        1893       1713       1712       1783       1691       1674
40 | 1024    221184       2  100.0%     |  GB/s:        1894       1776       1737       1794       1702       1686
41 | 


--------------------------------------------------------------------------------
/gpu-stream/gh200.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      2112       1    0.8%     |  GB/s:          77         31         61        108         58         57
 3 |   32      4224       1    1.6%     |  GB/s:         153         62        120        216        113        111
 4 |   48      6336       1    2.3%     |  GB/s:         228         91        174        308        168        165
 5 |   64      8448       1    3.1%     |  GB/s:         303        122        237        418        219        215
 6 |   80     10560       1    3.9%     |  GB/s:         376        150        281        492        272        267
 7 |   96     12672       1    4.7%     |  GB/s:         449        180        342        592        320        314
 8 |  112     14784       1    5.5%     |  GB/s:         521        208        384        660        372        365
 9 |   64     16896       2    6.2%     |  GB/s:         591        245        464        798        426        419
10 |  160     21120       1    7.8%     |  GB/s:         731        295        539        907        507        499
11 |   96     25344       2    9.4%     |  GB/s:         888        362        655       1097        614        604
12 |  128     33792       2   12.5%     |  GB/s:        1167        486        854       1408        783        771
13 |  160     42240       2   15.6%     |  GB/s:        1435        589        980       1643        928        916
14 |  192     50688       2   18.8%     |  GB/s:        1703        704       1122       1897       1069       1055
15 |  224     59136       2   21.9%     |  GB/s:        1930        806       1252       2132       1206       1187
16 |  256     67584       2   25.0%     |  GB/s:        2186        935       1397       2367       1340       1320
17 |  288     76032       2   28.1%     |  GB/s:        2401       1012       1511       2544       1464       1442
18 |  320     84480       2   31.2%     |  GB/s:        2634       1118       1635       2714       1585       1561
19 |  352     92928       2   34.4%     |  GB/s:        2833       1211       1751       2859       1696       1670
20 |  384    101376       2   37.5%     |  GB/s:        3052       1320       1868       2991       1805       1775
21 |  416    109824       2   40.6%     |  GB/s:        3228       1402       1969       3092       1895       1867
22 |  448    118272       2   43.8%     |  GB/s:        3408       1496       2070       3187       1994       1961
23 |  480    126720       2   46.9%     |  GB/s:        3546       1580       2164       3267       2090       2052
24 |  512    135168       2   50.0%     |  GB/s:        3718       1715       2269       3338       2190       2154
25 |  544    143616       2   53.1%     |  GB/s:        3870       1746       2341       3388       2271       2237
26 |  576    152064       2   56.2%     |  GB/s:        3944       1830       2425       3433       2353       2317
27 |  608    160512       2   59.4%     |  GB/s:        3941       1911       2508       3469       2439       2400
28 |  640    168960       2   62.5%     |  GB/s:        3944       2009       2587       3501       2525       2489
29 |  672    177408       2   65.6%     |  GB/s:        3941       2074       2657       3535       2593       2560
30 |  704    185856       2   68.8%     |  GB/s:        3939       2148       2727       3585       2661       2627
31 |  736    194304       2   71.9%     |  GB/s:        3941       2220       2791       3648       2731       2696
32 |  768    202752       2   75.0%     |  GB/s:        3941       2314       2854       3686       2799       2762
33 |  800    211200       2   78.1%     |  GB/s:        3940       2358       2909       3713       2853       2821
34 |  832    219648       2   81.2%     |  GB/s:        3936       2422       2960       3731       2905       2873
35 |  864    228096       2   84.4%     |  GB/s:        3942       2488       3008       3745       2951       2924
36 |  896    236544       2   87.5%     |  GB/s:        3939       2556       3052       3759       3000       2972
37 |  928    244992       2   90.6%     |  GB/s:        3942       2609       3090       3768       3041       3016
38 |  960    253440       2   93.8%     |  GB/s:        3939       2669       3126       3773       3080       3057
39 |  992    261888       2   96.9%     |  GB/s:        3942       2729       3155       3779       3112       3092
40 | 1024    270336       2  100.0%     |  GB/s:        3942       2775       3174       3783       3138       3120
41 | 


--------------------------------------------------------------------------------
/gpu-stream/mi100.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      1920       1    0.6%     |  GB/s:          33         24         37         68         36         34
 3 |   32      3840       1    1.3%     |  GB/s:          65         48         74        134         70         67
 4 |   48      5760       1    1.9%     |  GB/s:          97         74        108        199        104        100
 5 |   64      7680       1    2.5%     |  GB/s:         128         98        142        264        136        130
 6 |   80      9600       1    3.1%     |  GB/s:         160        122        178        321        169        161
 7 |   96     11520       1    3.8%     |  GB/s:         190        145        211        377        201        192
 8 |  112     13440       1    4.4%     |  GB/s:         221        169        244        432        231        220
 9 |   64     15360       2    5.0%     |  GB/s:         256        194        280        494        268        257
10 |  160     19200       1    6.2%     |  GB/s:         308        235        337        577        320        302
11 |   96     23040       2    7.5%     |  GB/s:         378        291        405        671        383        366
12 |  128     30720       2   10.0%     |  GB/s:         493        352        495        802        464        444
13 |  160     38400       2   12.5%     |  GB/s:         605        472        624        922        594        546
14 |  192     46080       2   15.0%     |  GB/s:         714        559        700        959        663        625
15 |  224     53760       2   17.5%     |  GB/s:         815        646        787        982        740        699
16 |  256     61440       2   20.0%     |  GB/s:         901        693        837        969        793        757
17 |  288     69120       2   22.5%     |  GB/s:         967        796        922        932        870        828
18 |  320     76800       2   25.0%     |  GB/s:        1032        697        969        892        891        882
19 |  352     84480       2   27.5%     |  GB/s:        1032        724        925        855        901        863
20 |  384     92160       2   30.0%     |  GB/s:        1071        724        908        779        860        852
21 |  416     99840       2   32.5%     |  GB/s:        1034        760        809        778        806        794
22 |  448    107520       2   35.0%     |  GB/s:        1078        750        918        772        861        859
23 |  480    115200       2   37.5%     |  GB/s:        1051        802        806        770        805        794
24 |  512    122880       2   40.0%     |  GB/s:        1095        796        807        767        796        780
25 |  544    130560       2   42.5%     |  GB/s:        1050        833        806        767        788        782
26 |  576    138240       2   45.0%     |  GB/s:        1088        876        801        753        784        778
27 |  608    145920       2   47.5%     |  GB/s:         920        882        803        753        792        780
28 |  640    153600       2   50.0%     |  GB/s:         944        894        804        758        796        783
29 |  672    161280       2   52.5%     |  GB/s:         824        900        798        756        792        775
30 |  704    168960       2   55.0%     |  GB/s:         798        901        799        748        782        778
31 |  736    176640       2   57.5%     |  GB/s:         810        892        791        743        771        764
32 |  768    184320       2   60.0%     |  GB/s:         795        879        780        727        777        774
33 |  800    192000       2   62.5%     |  GB/s:         795        883        782        729        778        783
34 |  832    199680       2   65.0%     |  GB/s:         805        892        793        741        789        784
35 |  864    207360       2   67.5%     |  GB/s:         798        891        791        746        790        787
36 |  896    215040       2   70.0%     |  GB/s:         802        882        779        720        773        776
37 |  928    222720       2   72.5%     |  GB/s:         789        876        776        720        771        779
38 |  960    230400       2   75.0%     |  GB/s:         791        825        781        727        780        774
39 |  992    238080       2   77.5%     |  GB/s:         791        818        780        727        787        785
40 | 1024    245760       2   80.0%     |  GB/s:         793        737        784        718        778        772
41 | 


--------------------------------------------------------------------------------
/gpu-stream/mi210.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      1664       1    0.8%     |  GB/s:          33         24         36         65         35         35
 3 |   32      3328       1    1.6%     |  GB/s:          67         48         72        129         69         68
 4 |   48      4992       1    2.3%     |  GB/s:          97         70        102        183        100         99
 5 |   64      6656       1    3.1%     |  GB/s:         128         92        133        243        129        127
 6 |   80      8320       1    3.9%     |  GB/s:         158        115        166        294        162        158
 7 |   96      9984       1    4.7%     |  GB/s:         188        137        198        350        191        188
 8 |  112     11648       1    5.5%     |  GB/s:         217        161        226        396        218        213
 9 |   64     13312       2    6.2%     |  GB/s:         253        185        257        465        248        245
10 |  160     16640       1    7.8%     |  GB/s:         303        223        312        534        297        289
11 |   96     19968       2    9.4%     |  GB/s:         372        277        376        642        360        353
12 |  128     26624       2   12.5%     |  GB/s:         487        345        463        825        447        437
13 |  160     33280       2   15.6%     |  GB/s:         594        445        598        944        558        538
14 |  192     39936       2   18.8%     |  GB/s:         704        518        684       1055        645        618
15 |  224     46592       2   21.9%     |  GB/s:         805        593        766       1114        724        692
16 |  256     53248       2   25.0%     |  GB/s:         907        662        844       1144        796        759
17 |  288     59904       2   28.1%     |  GB/s:         994        742        921       1170        871        830
18 |  320     66560       2   31.2%     |  GB/s:        1091        819        996       1194        941        893
19 |  352     73216       2   34.4%     |  GB/s:        1165        866       1045       1182        983        909
20 |  384     79872       2   37.5%     |  GB/s:        1242        912       1095       1189       1045        960
21 |  416     86528       2   40.6%     |  GB/s:        1323       1005       1148       1194       1094        975
22 |  448     93184       2   43.8%     |  GB/s:        1410       1075       1192       1207       1144       1022
23 |  480     99840       2   46.9%     |  GB/s:        1442       1120       1219       1205       1152       1021
24 |  512    106496       2   50.0%     |  GB/s:        1446       1159       1230       1217       1156       1029
25 |  544    113152       2   53.1%     |  GB/s:        1483       1203       1255       1219       1160       1056
26 |  576    119808       2   56.2%     |  GB/s:        1507       1281       1282       1232       1209       1093
27 |  608    126464       2   59.4%     |  GB/s:        1499       1325       1297       1228       1209       1092
28 |  640    133120       2   62.5%     |  GB/s:        1500       1373       1311       1231       1230       1120
29 |  672    139776       2   65.6%     |  GB/s:        1504       1392       1317       1223       1224       1112
30 |  704    146432       2   68.8%     |  GB/s:        1519       1396       1330       1227       1240       1132
31 |  736    153088       2   71.9%     |  GB/s:        1505       1397       1346       1217       1237       1127
32 |  768    159744       2   75.0%     |  GB/s:        1518       1362       1356       1216       1247       1142
33 |  800    166400       2   78.1%     |  GB/s:        1519       1383       1372       1215       1246       1138
34 |  832    173056       2   81.2%     |  GB/s:        1533       1372       1394       1221       1258       1156
35 |  864    179712       2   84.4%     |  GB/s:        1513       1372       1401       1220       1257       1149
36 |  896    186368       2   87.5%     |  GB/s:        1518       1373       1412       1222       1262       1163
37 |  928    193024       2   90.6%     |  GB/s:        1510       1365       1391       1230       1264       1164
38 |  960    199680       2   93.8%     |  GB/s:        1529       1377       1388       1230       1271       1180
39 |  992    206336       2   96.9%     |  GB/s:        1523       1380       1369       1230       1266       1177
40 | 1024    212992       2  100.0%     |  GB/s:        1511       1370       1362       1225       1256       1143
41 | 


--------------------------------------------------------------------------------
/gpu-stream/mi300a.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      3648       1    0.8%     |  GB/s:         105         57         93        171         91         90
 3 |   32      7296       1    1.6%     |  GB/s:         210        114        183        335        179        177
 4 |   48     10944       1    2.3%     |  GB/s:         308        167        267        483        261        258
 5 |   64     14592       1    3.1%     |  GB/s:         408        223        355        642        344        340
 6 |   80     18240       1    3.9%     |  GB/s:         499        262        421        748        414        405
 7 |   96     21888       1    4.7%     |  GB/s:         596        318        505        890        494        485
 8 |  112     25536       1    5.5%     |  GB/s:         690        371        579        998        572        561
 9 |   64     29184       2    6.2%     |  GB/s:         807        447        677       1194        656        648
10 |  160     36480       1    7.8%     |  GB/s:         953        508        781       1337        762        742
11 |   96     43776       2    9.4%     |  GB/s:        1170        640        933       1551        917        899
12 |  128     58368       2   12.5%     |  GB/s:        1483        846       1200       1874       1169       1142
13 |  160     72960       2   15.6%     |  GB/s:        1681       1011       1369       2081       1356       1288
14 |  192     87552       2   18.8%     |  GB/s:        1867       1196       1549       2318       1546       1465
15 |  224    102144       2   21.9%     |  GB/s:        2026       1338       1691       2466       1671       1575
16 |  256    116736       2   25.0%     |  GB/s:        2174       1470       1837       2626       1796       1712
17 |  288    131328       2   28.1%     |  GB/s:        2337       1614       1933       2620       1877       1774
18 |  320    145920       2   31.2%     |  GB/s:        2487       1759       2059       2590       1996       1882
19 |  352    160512       2   34.4%     |  GB/s:        2594       1836       2146       2634       2082       1929
20 |  384    175104       2   37.5%     |  GB/s:        2705       1959       2273       2642       2203       2030
21 |  416    189696       2   40.6%     |  GB/s:        2655       2008       2342       2633       2242       2038
22 |  448    204288       2   43.8%     |  GB/s:        2712       2113       2446       2691       2339       2144
23 |  480    218880       2   46.9%     |  GB/s:        2699       2189       2525       2754       2382       2155
24 |  512    233472       2   50.0%     |  GB/s:        2778       2269       2645       2894       2492       2321
25 |  544    248064       2   53.1%     |  GB/s:        2831       2349       2655       2852       2554       2308
26 |  576    262656       2   56.2%     |  GB/s:        2839       2442       2686       2894       2637       2378
27 |  608    277248       2   59.4%     |  GB/s:        2863       2462       2590       2877       2645       2356
28 |  640    291840       2   62.5%     |  GB/s:        2942       2540       2652       2904       2626       2361
29 |  672    306432       2   65.6%     |  GB/s:        2923       2592       2738       2950       2684       2366
30 |  704    321024       2   68.8%     |  GB/s:        2925       2657       2759       2944       2723       2409
31 |  736    335616       2   71.9%     |  GB/s:        2887       2688       2687       2928       2689       2372
32 |  768    350208       2   75.0%     |  GB/s:        2880       2660       2690       2955       2727       2405
33 |  800    364800       2   78.1%     |  GB/s:        2854       2640       2679       2935       2708       2394
34 |  832    379392       2   81.2%     |  GB/s:        2891       2602       2696       2963       2648       2443
35 |  864    393984       2   84.4%     |  GB/s:        2844       2602       2700       2956       2637       2530
36 |  896    408576       2   87.5%     |  GB/s:        2973       2650       2850       2978       2780       2459
37 |  928    423168       2   90.6%     |  GB/s:        2830       2632       2865       2993       2651       2590
38 |  960    437760       2   93.8%     |  GB/s:        3015       2573       2873       2994       2670       2628
39 |  992    452352       2   96.9%     |  GB/s:        2924       2558       2887       2983       2682       2612
40 | 1024    466944       2  100.0%     |  GB/s:        3069       2677       2956       3034       2744       2522
41 | 


--------------------------------------------------------------------------------
/gpu-stream/mi300x.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      4864       1    0.8%     |  GB/s:         136         75        122        233        117        116
 3 |   32      9728       1    1.6%     |  GB/s:         266        148        242        454        231        228
 4 |   48     14592       1    2.3%     |  GB/s:         394        208        339        626        337        333
 5 |   64     19456       1    3.1%     |  GB/s:         524        282        459        839        440        434
 6 |   80     24320       1    3.9%     |  GB/s:         639        338        539        977        535        524
 7 |   96     29184       1    4.7%     |  GB/s:         758        408        638       1148        635        623
 8 |  112     34048       1    5.5%     |  GB/s:         879        474        728       1316        728        714
 9 |   64     38912       2    6.2%     |  GB/s:        1030        554        855       1560        824        815
10 |  160     48640       1    7.8%     |  GB/s:        1213        655        978       1732        975        951
11 |   96     58368       2    9.4%     |  GB/s:        1489        808       1156       2065       1159       1139
12 |  128     77824       2   12.5%     |  GB/s:        1948       1049       1534       2647       1451       1423
13 |  160     97280       2   15.6%     |  GB/s:        2348       1291       1695       2942       1685       1643
14 |  192    116736       2   18.8%     |  GB/s:        2735       1494       1935       3266       1924       1880
15 |  224    136192       2   21.9%     |  GB/s:        2974       1719       2148       3454       2142       2079
16 |  256    155648       2   25.0%     |  GB/s:        3377       1959       2729       3740       2432       2336
17 |  288    175104       2   28.1%     |  GB/s:        3431       2153       2514       3735       2477       2383
18 |  320    194560       2   31.2%     |  GB/s:        3596       2343       2704       3829       2623       2512
19 |  352    214016       2   34.4%     |  GB/s:        3684       2510       2866       3839       2780       2598
20 |  384    233472       2   37.5%     |  GB/s:        3916       2657       3088       3944       2954       2772
21 |  416    252928       2   40.6%     |  GB/s:        3959       2791       3189       3894       3039       2797
22 |  448    272384       2   43.8%     |  GB/s:        4081       2907       3354       3964       3191       2937
23 |  480    291840       2   46.9%     |  GB/s:        4147       3043       3492       3955       3250       2939
24 |  512    311296       2   50.0%     |  GB/s:        4319       3098       3427       3169       3469       3104
25 |  544    330752       2   53.1%     |  GB/s:        4269       3272       3696       3964       3440       3060
26 |  576    350208       2   56.2%     |  GB/s:        4298       3320       3791       3968       3536       3108
27 |  608    369664       2   59.4%     |  GB/s:        4402       3406       3820       3931       3580       3116
28 |  640    389120       2   62.5%     |  GB/s:        4532       3479       3927       4000       3694       3221
29 |  672    408576       2   65.6%     |  GB/s:        4607       3540       3944       3962       3690       3195
30 |  704    428032       2   68.8%     |  GB/s:        4609       3622       3988       3974       3754       3273
31 |  736    447488       2   71.9%     |  GB/s:        4534       3670       4017       3952       3782       3277
32 |  768    466944       2   75.0%     |  GB/s:        4849       3756       3924       3947       3903       3436
33 |  800    486400       2   78.1%     |  GB/s:        4663       3775       4008       3914       3830       3339
34 |  832    505856       2   81.2%     |  GB/s:        4579       3875       4033       3942       3896       3403
35 |  864    525312       2   84.4%     |  GB/s:        4611       3864       3993       3932       3852       3395
36 |  896    544768       2   87.5%     |  GB/s:        4751       3932       4058       3957       3926       3461
37 |  928    564224       2   90.6%     |  GB/s:        4640       3935       3964       3906       3918       3458
38 |  960    583680       2   93.8%     |  GB/s:        4638       4052       4002       3893       3935       3499
39 |  992    603136       2   96.9%     |  GB/s:        4617       4001       3589       3818       3880       3516
40 | 1024    622592       2  100.0%     |  GB/s:        4801       4199       3997       3866       4146       3677
41 | 


--------------------------------------------------------------------------------
/gpu-stream/v100.txt:
--------------------------------------------------------------------------------
 1 | block smBlocks   threads    occ%   |                init       read       scale     triad       3pt        5pt
 2 |   16      1280       1    0.8%     |  GB/s:          39         19         34         64         33         32
 3 |   32      2560       1    1.6%     |  GB/s:          78         36         67        123         63         61
 4 |   48      3840       1    2.3%     |  GB/s:         116         54         98        175         93         90
 5 |   64      5120       1    3.1%     |  GB/s:         155         70        130        230        123        119
 6 |   80      6400       1    3.9%     |  GB/s:         192         87        157        269        151        147
 7 |   96      7680       1    4.7%     |  GB/s:         229        103        184        316        178        173
 8 |  112      8960       1    5.5%     |  GB/s:         267        119        212        352        204        198
 9 |   64     10240       2    6.2%     |  GB/s:         306        139        246        400        233        226
10 |  160     12800       1    7.8%     |  GB/s:         369        165        281        452        272        263
11 |   96     15360       2    9.4%     |  GB/s:         454        200        335        524        325        318
12 |  128     20480       2   12.5%     |  GB/s:         599        263        426        638        406        397
13 |  160     25600       2   15.6%     |  GB/s:         724        322        487        682        478        464
14 |  192     30720       2   18.8%     |  GB/s:         850        376        555        722        540        527
15 |  224     35840       2   21.9%     |  GB/s:         897        432        612        737        599        585
16 |  256     40960       2   25.0%     |  GB/s:         897        491        675        759        648        635
17 |  288     46080       2   28.1%     |  GB/s:         897        539        700        755        689        675
18 |  320     51200       2   31.2%     |  GB/s:         897        557        733        764        720        708
19 |  352     56320       2   34.4%     |  GB/s:         897        608        749        765        740        731
20 |  384     61440       2   37.5%     |  GB/s:         897        647        765        774        753        744
21 |  416     66560       2   40.6%     |  GB/s:         897        686        769        770        763        755
22 |  448     71680       2   43.8%     |  GB/s:         897        708        776        772        770        764
23 |  480     76800       2   46.9%     |  GB/s:         897        728        780        773        776        771
24 |  512     81920       2   50.0%     |  GB/s:         897        746        784        777        778        774
25 |  544     87040       2   53.1%     |  GB/s:         897        749        787        774        783        778
26 |  576     92160       2   56.2%     |  GB/s:         897        752        790        776        786        782
27 |  608     97280       2   59.4%     |  GB/s:         897        755        792        776        789        785
28 |  640    102400       2   62.5%     |  GB/s:         897        773        794        781        789        785
29 |  672    107520       2   65.6%     |  GB/s:         897        787        796        779        792        788
30 |  704    112640       2   68.8%     |  GB/s:         897        804        797        780        794        790
31 |  736    117760       2   71.9%     |  GB/s:         897        813        799        780        796        792
32 |  768    122880       2   75.0%     |  GB/s:         897        824        801        783        796        794
33 |  800    128000       2   78.1%     |  GB/s:         897        829        802        782        799        795
34 |  832    133120       2   81.2%     |  GB/s:         897        838        803        784        800        796
35 |  864    138240       2   84.4%     |  GB/s:         897        842        805        784        801        797
36 |  896    143360       2   87.5%     |  GB/s:         897        850        805        785        801        798
37 |  928    148480       2   90.6%     |  GB/s:         897        854        807        785        803        800
38 |  960    153600       2   93.8%     |  GB/s:         897        859        807        787        804        801
39 |  992    158720       2   96.9%     |  GB/s:         897        863        808        787        804        801
40 | 1024    163840       2  100.0%     |  GB/s:         897        867        809        788        804        801
41 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-3d-stream/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../MeasurementSeries.hpp"
  2 | #include "../measure_metric/measureMetricPW.hpp"
  3 | #include "../dtime.hpp"
  4 | #include "../gpu-error.h"
  5 | #include <iomanip>
  6 | #include <iostream>
  7 | #include <nvml.h>
  8 | 
  9 | using namespace std;
 10 | 
 11 | // Grid extents; scale_kernel uses x as the fastest-varying index.
 12 | const size_t xdim = 2000, ydim = 1000, zdim = 100;
 13 | // Number of elements in each of the two device buffers below.
 14 | const size_t buffer_size = xdim * ydim * zdim;
 15 | double *dA, *dB;
 16 | // Fills A[0..N) with 0.1 using a grid-stride loop; B, C and D are never read.
 17 | template <typename T>
 18 | __global__ void init_kernel(T *A, const T *__restrict__ B,
 19 |                             const T *__restrict__ C, const T *__restrict__ D,
 20 |                             const size_t N) {
 21 |   const size_t stride = blockDim.x * gridDim.x;  // total threads in the grid
 22 |   size_t i = threadIdx.x + blockIdx.x * blockDim.x;
 23 |   for (; i < N; i += stride)
 24 |     A[i] = 0.1;
 25 | }
 26 | // Scales the 3D volume: each in-range thread writes A[idx] = B[idx] * 1.2 for one (x, y, z) point.
 27 | template <typename T>
 28 | __global__ void scale_kernel(T *A, const T *__restrict__ B) {
 29 |   __shared__ double spoiler[1024];  // only referenced by the two threadIdx.x > 1243 branches
 30 |   int tidx = threadIdx.x + blockIdx.x * blockDim.x;
 31 |   int tidy = threadIdx.y + blockIdx.y * blockDim.y;
 32 |   int tidz = threadIdx.z + blockIdx.z * blockDim.z;
 33 |   if (tidx >= xdim || tidy >= ydim || tidz >= zdim)  // thread lies outside the volume
 34 |     return;
 35 |   // NOTE(review): main() launches at most 1024 threads in x, so threadIdx.x > 1243 looks unreachable - presumably a compiler "spoiler"; confirm.
 36 |   if (threadIdx.x > 1243)
 37 |     spoiler[threadIdx.x] = B[threadIdx.x];
 38 |   // Linearized index with x fastest, then y, then z.
 39 |   size_t idx = tidz * xdim * ydim + tidy * xdim + tidx;
 40 |   A[idx] = B[idx] * 1.2;
 41 |   // NOTE(review): spoiler holds 1024 doubles but is indexed with the global idx here.
 42 |   if (threadIdx.x > 1243)
 43 |     A[idx] = spoiler[idx];
 44 | }
 45 | // Benchmarks scale_kernel for one block shape and prints one result line: the shape,
 46 | // max resident blocks per SM, bytes moved per median launch time (x1e-9), a second
 47 | // figure derived from the median time, and four rates from measureMetricStop().
 48 | void measureFunc(dim3 blockSize) {
 49 |   GPU_ERROR(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
 50 |   MeasurementSeries time;
 51 |   // Enough blocks per dimension to cover the whole xdim x ydim x zdim volume.
 52 |   dim3 grid = dim3((xdim - 1) / blockSize.x + 1, (ydim - 1) / blockSize.y + 1,
 53 |                    (zdim - 1) / blockSize.z + 1);
 54 | 
 55 |   // Warm-up launch; report a bad launch configuration instead of timing nothing.
 56 |   scale_kernel<<<grid, blockSize>>>(dA, dB);
 57 |   GPU_ERROR(cudaGetLastError());
 58 | 
 59 |   for (int iter = 0; iter < 10; iter++) {
 60 |     GPU_ERROR(cudaDeviceSynchronize());
 61 |     double t1 = dtime();
 62 |     GPU_ERROR(cudaDeviceSynchronize());
 63 |     scale_kernel<<<grid, blockSize>>>(dA, dB);
 64 |     scale_kernel<<<grid, blockSize>>>(dA, dB);
 65 |     GPU_ERROR(cudaDeviceSynchronize());
 66 |     double t2 = dtime();
 67 |     time.add((t2 - t1) / 2);  // two launches per sample -> time per launch
 68 |   }
 69 | 
 70 |   // One extra launch bracketed by the metric-measurement helpers.
 71 |   measureBandwidthStart();
 72 |   scale_kernel<<<grid, blockSize>>>(dA, dB);
 73 |   auto metrics = measureMetricStop();
 74 | 
 75 |   int deviceId = 0;
 76 |   cudaDeviceProp prop;
 77 |   GPU_ERROR(cudaGetDevice(&deviceId));
 78 |   GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId));
 79 |   int smCount = prop.multiProcessorCount;
 80 |   int maxActiveBlocks = 0;
 81 |   GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 82 |       &maxActiveBlocks, scale_kernel<double>, blockSize.x*blockSize.y*blockSize.z, 0));
 83 | 
 84 |   // Read the statistics once instead of once per printed column.
 85 |   const auto dtMedian = time.median();
 86 |   const auto dtValue = time.value();
 87 |   // NOTE(review): 1.41e9 below looks like a hard-coded clock in Hz - confirm.
 88 |   cout << fixed << setprecision(0) << "(" << setw(4) << blockSize.x << ","
 89 |        << setw(4) << blockSize.y << "," << setw(4) << blockSize.z << ")      "
 90 |        << maxActiveBlocks << " "
 91 |        << setw(2) << " " << setw(5)
 92 |        << buffer_size * 2 * sizeof(double) / dtMedian * 1e-9 << "  "
 93 |        << (maxActiveBlocks*smCount*blockSize.x*blockSize.y*blockSize.z) * dtMedian * 1.41e9 / buffer_size << " "
 94 |        << setprecision(0) << setw(8) << metrics[0] / dtValue / 1.0e9 << " GB/s "
 95 |        << setprecision(0) << setw(8) << metrics[1] / dtValue / 1.0e9 << " GB/s "
 96 |        << setprecision(0) << setw(8) << metrics[2]*32 / dtValue / 1.0e9 << " GB/s "
 97 |        << setprecision(0) << setw(8) << metrics[3]*32 / dtValue / 1.0e9 << " GB/s " << endl;
 98 | }
 99 | // Allocates and fills both buffers, then benchmarks scale_kernel for every block
100 | // shape from the lists below whose thread count is exactly 256.
101 | int main(int argc, char **argv) {
102 |   nvmlInit();
103 |   GPU_ERROR(cudaMalloc(&dA, buffer_size * sizeof(double)));
104 |   GPU_ERROR(cudaMalloc(&dB, buffer_size * sizeof(double)));
105 | 
106 |   init_kernel<<<256, 400>>>(dB, dB, dB, dB, buffer_size);
107 |   init_kernel<<<256, 400>>>(dA, dA, dA, dA, buffer_size);
108 |   GPU_ERROR(cudaGetLastError());  // reports an invalid launch of the two fills above
109 |   GPU_ERROR(cudaDeviceSynchronize());
110 | 
111 |   // Only shapes with exactly this many threads are measured; every other
112 |   // combination of the three lists is skipped.
113 |   const int targetThreadCount = 256;
114 |   for (int blockDimX : {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}) {
115 |     for (int blockDimY : {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}) {
116 |       for (int blockDimZ : {1, 2, 4, 8, 16, 32, 64}) {
117 |         if (blockDimX * blockDimY * blockDimZ != targetThreadCount)
118 |           continue;
119 | 
120 |         measureFunc(dim3(blockDimX, blockDimY, blockDimZ));
121 |       }
122 |     }
123 |   }
124 | 
125 |   // Release the device buffers and the NVML library before exiting.
126 |   GPU_ERROR(cudaFree(dA));
127 |   GPU_ERROR(cudaFree(dB));
128 |   nvmlShutdown();  // pairs with nvmlInit() above
129 |   return 0;
130 | }
131 | 


--------------------------------------------------------------------------------
/gpu-latency/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../MeasurementSeries.hpp"
  2 | #include "../dtime.hpp"
  3 | #include "../gpu-clock.cuh"
  4 | #include "../gpu-error.h"
  5 | // #include <algorithm>
  6 | #include <cuComplex.h>
  7 | #include <cuda_runtime.h>
  8 | #include <iomanip>
  9 | #include <iostream>
 10 | #include <random>
 11 | #include <sys/time.h>
 12 | 
 13 | using namespace std;
 14 | 
 15 | using dtype = int64_t;  // type passed to pchase<> and used for the buffer size arithmetic
 16 | // Returns the calling thread's value of the PTX special register %smid.
 17 | __device__ unsigned int smid() {
 18 |   unsigned int sm_id;
 19 |   // Inline PTX: move the 32-bit %smid register into the output operand.
 20 |   asm("mov.u32 %0, %%smid;" : "=r"(sm_id));
 21 | 
 22 |   return sm_id;
 23 | }
 24 | // Pointer-chase kernel: starting at buf, performs N dependent loads (N is rounded up to a multiple of 32).
 25 | template <typename T>
 26 | __global__ void pchase(T *buf, T *__restrict__ dummy_buf, int64_t N) {
 27 |   // Every element visited holds the address of the next element to load.
 28 |   int tidx = threadIdx.x + blockIdx.x * blockDim.x;
 29 |   int64_t *idx = buf;  // requires T* to convert to int64_t*
 30 |   // Outer loop is kept rolled (unroll 1); the inner loop of 32 dependent loads is unrolled.
 31 |   const int unroll_factor = 32;
 32 | #pragma unroll 1
 33 |   for (int64_t n = 0; n < N; n += unroll_factor) {
 34 | #pragma unroll
 35 |     for (int u = 0; u < unroll_factor; u++) {
 36 |       idx = (int64_t *)*idx;  // loaded value becomes the next address
 37 |     }
 38 |   }
 39 |   // False for the <<<1, 1>>> launches in this file; the store presumably only keeps idx live - confirm.
 40 |   if (tidx > 12313) {
 41 |     dummy_buf[0] = (int64_t)idx;
 42 |   }
 43 | }
 44 | 
 45 | int main(int argc, char **argv) {
 46 |   // Pointer-chase driver: sweeps the chain length LEN, times a single-thread pchase launch, prints one row per LEN.
 47 | #ifdef __NVCC__
 48 |   GPU_ERROR(cudaFuncSetAttribute(
 49 |       pchase<dtype>, cudaFuncAttributePreferredSharedMemoryCarveout, 0));
 50 | #endif
 51 |   unsigned int clock = getGPUClock();
 52 |   // Every chain node occupies cl_size consecutive entries of the buffer, one entry per "lane".
 53 |   const int cl_size = 128 / sizeof(int64_t);
 54 |   const int skip_factor = 1;
 55 | 
 56 |   std::random_device rd;
 57 |   std::mt19937 g(rd());
 58 |   // LEN is multiplied by 1.042 and incremented by 1 + rand() % 11 after every step.
 59 |   for (int64_t LEN = 16; LEN < (1 << 24); LEN = LEN * 1.042 + 1 + rand() % 11) {
 60 |     if (LEN * skip_factor * cl_size * sizeof(dtype) > 120 * 1024 * 1024)
 61 |       LEN *= 1.1; // buffers above 120 MiB take an additional 10% step
 62 | 
 63 |     MeasurementSeries times;
 64 |     const int64_t iters = max(LEN, (int64_t)100000);
 65 |     // 21 samples per LEN, each with freshly allocated buffers and a newly shuffled chain.
 66 |     for (int i = 0; i < 21; i++) {
 67 | 
 68 |       vector<int64_t> order(LEN);
 69 |       int64_t *buf = NULL;
 70 |       int64_t *dbuf = NULL;
 71 |       dtype *dummy_buf = NULL;
 72 | 
 73 |       GPU_ERROR(
 74 |           cudaMallocManaged(&buf, skip_factor * cl_size * LEN * sizeof(dtype)));
 75 |       GPU_ERROR(cudaMalloc(&dbuf, skip_factor * cl_size * LEN * sizeof(dtype)));
 76 |       GPU_ERROR(cudaMallocManaged(&dummy_buf, sizeof(dtype)));
 77 |       for (int64_t i = 0; i < LEN; i++) {
 78 |         order[i] = i + 1;
 79 |       }
 80 |       order[LEN - 1] = 0;
 81 |       // order = shuffled 1..LEN-1 followed by 0: a cycle over all nodes that ends back at node 0.
 82 |       shuffle(begin(order), end(order) - 1, g);
 83 |       // The same cycle is laid out once per lane.
 84 |       for (int cl_lane = 0; cl_lane < cl_size; cl_lane++) {
 85 |         dtype idx = 0;
 86 |         for (int64_t i = 0; i < LEN; i++) {
 87 |           // Node idx stores the entry index of its successor; the step back to node 0 moves to lane + 1.
 88 |           buf[(idx * cl_size + cl_lane) * skip_factor] =
 89 |               skip_factor *
 90 |               (order[i] * cl_size + cl_lane + (order[i] == 0 ? 1 : 0));
 91 |           idx = order[i];
 92 |         }
 93 |       }
 94 |       buf[skip_factor * (order[LEN - 2] * cl_size + cl_size - 1)] = 0; // last lane wraps to entry 0
 95 |       // Convert entry indices into addresses inside the device buffer dbuf.
 96 |       for (int64_t n = 0; n < LEN * cl_size * skip_factor; n++) {
 97 |         buf[n] = (int64_t)dbuf + buf[n] * sizeof(int64_t *);
 98 |       }
 99 | 
100 |       GPU_ERROR(cudaMemcpy(dbuf, buf,
101 |                            skip_factor * cl_size * LEN * sizeof(dtype),
102 |                            cudaMemcpyHostToDevice));
103 |       // Two untimed launches precede the measured one; the kernel starts at buf, whose entries point into dbuf.
104 |       pchase<dtype><<<1, 1>>>(buf, dummy_buf, iters);
105 |       pchase<dtype><<<1, 1>>>(buf, dummy_buf, iters);
106 | 
107 |       cudaEvent_t start, stop;
108 |       GPU_ERROR(cudaEventCreate(&start));
109 |       GPU_ERROR(cudaEventCreate(&stop));
110 | 
111 |       GPU_ERROR(cudaDeviceSynchronize());
112 | 
113 |       GPU_ERROR(cudaEventRecord(start));
114 |       pchase<dtype><<<1, 1>>>(buf, dummy_buf, iters);
115 |       GPU_ERROR(cudaEventRecord(stop));
116 | 
117 |       GPU_ERROR(cudaEventSynchronize(stop));
118 |       float milliseconds = 0;
119 |       GPU_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
120 |       GPU_ERROR(cudaEventDestroy(start)); // events are created per sample, so release them per sample
121 |       GPU_ERROR(cudaEventDestroy(stop));
122 |       times.add(milliseconds / 1000); // stored in seconds
123 |       GPU_ERROR(cudaGetLastError());
124 |       GPU_ERROR(cudaFree(buf));
125 |       GPU_ERROR(cudaFree(dbuf));
126 |       GPU_ERROR(cudaFree(dummy_buf));
127 |     }
128 |     double dt = times.value();
129 |     double dtmed = times.median();
130 |     double dtmin = times.getPercentile(0.05);
131 |     double dtmax = times.getPercentile(0.95); // row: iters, clock, buffer kB, dt in ms, then 4x time/iter * clock * 1e6 (cycles if clock is MHz - TODO confirm)
132 |     cout << setw(9) << iters << " " << setw(5) << clock << " " //
133 |          << setw(8) << skip_factor * LEN * cl_size * sizeof(dtype) / 1024.0
134 |          << " "                                            //
135 |          << fixed                                          //
136 |          << setprecision(1) << setw(8) << dt * 1000 << " " //
137 |          << setw(7) << setprecision(1)
138 |          << (double)dt / iters * clock * 1000 * 1000 << " "
139 |          << (double)dtmed / iters * clock * 1000 * 1000 << " "
140 |          << (double)dtmin / iters * clock * 1000 * 1000 << " "
141 |          << (double)dtmax / iters * clock * 1000 * 1000 << "\n"
142 |          << flush;
143 |   }
144 |   cout << "\n";
145 | }
146 | 


--------------------------------------------------------------------------------
/measure_metric/Utils.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <nvperf_host.h>
  3 | 
  4 | #define RETURN_IF_NVPW_ERROR(retval, actual) /* runs `actual`; on failure logs it to stderr and returns `retval` from the enclosing function */ \
  5 | do {                                                                                \
  6 |     NVPA_Status status = actual;                                                    \
  7 |     if (NVPA_STATUS_SUCCESS != status) {                                            \
  8 |         fprintf(stderr, "FAILED: %s with error %s\n", #actual, NV::Metric::Utils::GetNVPWResultString(status)); \
  9 |         return retval;                                                              \
 10 |     }                                                                               \
 11 | } while (0) /* do/while(0) so the expansion behaves as a single statement */
 12 | 
 13 | namespace NV {
 14 |     namespace Metric {
 15 |         namespace Utils {
 16 |             // Maps an NVPA_Status code to the name of its enumerator; unlisted codes yield "NVPA_STATUS_UNKNOWN".
 17 |             static const char* GetNVPWResultString(NVPA_Status status) {
 18 |                 const char* errorMsg = "NVPA_STATUS_UNKNOWN"; // non-null fallback: the result is printed with %s
 19 |                 switch (status)
 20 |                 {
 21 |                 case NVPA_STATUS_ERROR:
 22 |                     errorMsg = "NVPA_STATUS_ERROR";
 23 |                     break;
 24 |                 case NVPA_STATUS_INTERNAL_ERROR:
 25 |                     errorMsg = "NVPA_STATUS_INTERNAL_ERROR";
 26 |                     break;
 27 |                 case NVPA_STATUS_NOT_INITIALIZED:
 28 |                     errorMsg = "NVPA_STATUS_NOT_INITIALIZED";
 29 |                     break;
 30 |                 case NVPA_STATUS_NOT_LOADED:
 31 |                     errorMsg = "NVPA_STATUS_NOT_LOADED";
 32 |                     break;
 33 |                 case NVPA_STATUS_FUNCTION_NOT_FOUND:
 34 |                     errorMsg = "NVPA_STATUS_FUNCTION_NOT_FOUND";
 35 |                     break;
 36 |                 case NVPA_STATUS_NOT_SUPPORTED:
 37 |                     errorMsg = "NVPA_STATUS_NOT_SUPPORTED";
 38 |                     break;
 39 |                 case NVPA_STATUS_NOT_IMPLEMENTED:
 40 |                     errorMsg = "NVPA_STATUS_NOT_IMPLEMENTED";
 41 |                     break;
 42 |                 case NVPA_STATUS_INVALID_ARGUMENT:
 43 |                     errorMsg = "NVPA_STATUS_INVALID_ARGUMENT";
 44 |                     break;
 45 |                 case NVPA_STATUS_INVALID_METRIC_ID:
 46 |                     errorMsg = "NVPA_STATUS_INVALID_METRIC_ID";
 47 |                     break;
 48 |                 case NVPA_STATUS_DRIVER_NOT_LOADED:
 49 |                     errorMsg = "NVPA_STATUS_DRIVER_NOT_LOADED";
 50 |                     break;
 51 |                 case NVPA_STATUS_OUT_OF_MEMORY:
 52 |                     errorMsg = "NVPA_STATUS_OUT_OF_MEMORY";
 53 |                     break;
 54 |                 case NVPA_STATUS_INVALID_THREAD_STATE:
 55 |                     errorMsg = "NVPA_STATUS_INVALID_THREAD_STATE";
 56 |                     break;
 57 |                 case NVPA_STATUS_FAILED_CONTEXT_ALLOC:
 58 |                     errorMsg = "NVPA_STATUS_FAILED_CONTEXT_ALLOC";
 59 |                     break;
 60 |                 case NVPA_STATUS_UNSUPPORTED_GPU:
 61 |                     errorMsg = "NVPA_STATUS_UNSUPPORTED_GPU";
 62 |                     break;
 63 |                 case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION:
 64 |                     errorMsg = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION";
 65 |                     break;
 66 |                 case NVPA_STATUS_OBJECT_NOT_REGISTERED:
 67 |                     errorMsg = "NVPA_STATUS_OBJECT_NOT_REGISTERED";
 68 |                     break;
 69 |                 case NVPA_STATUS_INSUFFICIENT_PRIVILEGE:
 70 |                     errorMsg = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE";
 71 |                     break;
 72 |                 case NVPA_STATUS_INVALID_CONTEXT_STATE:
 73 |                     errorMsg = "NVPA_STATUS_INVALID_CONTEXT_STATE";
 74 |                     break;
 75 |                 case NVPA_STATUS_INVALID_OBJECT_STATE:
 76 |                     errorMsg = "NVPA_STATUS_INVALID_OBJECT_STATE";
 77 |                     break;
 78 |                 case NVPA_STATUS_RESOURCE_UNAVAILABLE:
 79 |                     errorMsg = "NVPA_STATUS_RESOURCE_UNAVAILABLE";
 80 |                     break;
 81 |                 case NVPA_STATUS_DRIVER_LOADED_TOO_LATE:
 82 |                     errorMsg = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE";
 83 |                     break;
 84 |                 case NVPA_STATUS_INSUFFICIENT_SPACE:
 85 |                     errorMsg = "NVPA_STATUS_INSUFFICIENT_SPACE";
 86 |                     break;
 87 |                 case NVPA_STATUS_OBJECT_MISMATCH:
 88 |                     errorMsg = "NVPA_STATUS_OBJECT_MISMATCH";
 89 |                     break;
 90 |                 case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED:
 91 |                     errorMsg = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED";
 92 |                     break;
 93 |                 default: // any status without a case above keeps the fallback text
 94 |                     break;
 95 |                 }
 96 | 
 97 |                 return errorMsg;
 98 |             }
 99 |         }
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/gpu-l2-cache/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../MeasurementSeries.hpp"
  2 | #include "../dtime.hpp"
  3 | #include "../gpu-error.h"
  4 | #include "../gpu-metrics/gpu-metrics.hpp"
  5 | #include <iomanip>
  6 | #include <iostream>
  7 | 
  8 | using namespace std;
  9 | 
 10 | using dtype = double;
 11 | dtype *dA, *dB;
 12 | 
 13 | __global__ void initKernel(dtype *A, size_t N) { // sets A[0..N) to 1.1; each thread starts at its global index and steps by the total thread count
 14 |   size_t tidx = blockDim.x * blockIdx.x + threadIdx.x;
 15 |   for (size_t idx = tidx; idx < N; idx += blockDim.x * gridDim.x) { // size_t index: matches N, no narrowing or signed overflow for large N
 16 |     A[idx] = dtype(1.1);
 17 |   }
 18 | }
 19 | 
 20 | template <int N, int BLOCKSIZE> // N: elements read per thread (N/2 iterations, two loads each); BLOCKSIZE: offset between the two loads
 21 | __global__ void sumKernel(dtype *__restrict__ A, const dtype *__restrict__ B,
 22 |                           int blockRun) {
 23 |   dtype localSum = dtype(0);
 24 |   // Blocks are folded onto blockRun data slices via blockIdx.x % blockRun; iteration i advances by 2 * blockDim.x * blockRun elements.
 25 |   for (int i = 0; i < N / 2; i++) {
 26 |     int idx =
 27 |         (blockDim.x * blockRun * i + (blockIdx.x % blockRun) * BLOCKSIZE) * 2 +
 28 |         threadIdx.x;
 29 |     localSum += B[idx] * B[idx + BLOCKSIZE];
 30 |   }
 31 |   // Store guard: threadIdx.x > 1233 cannot hold for the 1024-thread launch used by measure(), so A is written only if localSum == 23.12.
 32 |   localSum *= (dtype)1.3;
 33 |   if (threadIdx.x > 1233 || localSum == (dtype)23.12)
 34 |     A[threadIdx.x] += localSum;
 35 | }
 36 | template <int N, int blockSize> // launches sumKernel<N, blockSize> on the global buffers dA and dB
 37 | double callKernel(int blockCount, int blockRun) {
 38 |   sumKernel<N, blockSize><<<blockCount, blockSize>>>(dA, dB, blockRun);
 39 |   GPU_ERROR(cudaPeekAtLastError()); // reports launch errors; does not wait for the kernel to finish
 40 |   return 0.0; // the return value is always 0.0
 41 | }
 42 | template <int N> void measure(int blockRun) {
 43 |   // Times sumKernel on buffers of blockRun * blockSize * N elements (11 samples) and prints one result row.
 44 |   const int blockSize = 1024;
 45 | 
 46 |   cudaDeviceProp prop;
 47 |   int deviceId;
 48 |   GPU_ERROR(cudaGetDevice(&deviceId));
 49 |   GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId));
 50 |   std::string deviceName = prop.name; // deviceName, smCount and maxActiveBlocks are not used further in this function
 51 |   int smCount = prop.multiProcessorCount;
 52 |   int maxActiveBlocks = 0;
 53 |   GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 54 |       &maxActiveBlocks, sumKernel<N, blockSize>, blockSize, 0));
 55 | 
 56 |   int blockCount = 200000;
 57 | 
 58 |   // GPU_ERROR(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
 59 | 
 60 |   MeasurementSeries time;
 61 |   MeasurementSeries dram_read; // the four metric series below receive no samples while the metric section is commented out
 62 |   MeasurementSeries dram_write;
 63 |   MeasurementSeries L2_read;
 64 |   MeasurementSeries L2_write;
 65 | 
 66 |   GPU_ERROR(cudaDeviceSynchronize());
 67 |   for (int i = 0; i < 11; i++) {
 68 |     const size_t bufferCount = blockRun * blockSize * N + i * 128; // allocation grows by 128 elements per sample
 69 |     GPU_ERROR(cudaMalloc(&dA, bufferCount * sizeof(dtype)));
 70 |     initKernel<<<52, 256>>>(dA, bufferCount);
 71 |     GPU_ERROR(cudaMalloc(&dB, bufferCount * sizeof(dtype)));
 72 |     initKernel<<<52, 256>>>(dB, bufferCount);
 73 |     GPU_ERROR(cudaDeviceSynchronize());
 74 |     // Host-side timing around launch plus device synchronization.
 75 |     double t1 = dtime();
 76 |     callKernel<N, blockSize>(blockCount, blockRun);
 77 |     GPU_ERROR(cudaDeviceSynchronize());
 78 |     double t2 = dtime();
 79 |     time.add(t2 - t1);
 80 | 
 81 |     /* measureDRAMBytesStart();
 82 |      callKernel<N, blockSize>(blockCount, blockRun);
 83 |      auto metrics = measureDRAMBytesStop();
 84 |      dram_read.add(metrics[0]);
 85 |      dram_write.add(metrics[1]);
 86 | 
 87 |      measureL2BytesStart();
 88 |      callKernel<N, blockSize>(blockCount, blockRun);
 89 |      metrics = measureL2BytesStop();
 90 |      L2_read.add(metrics[0]);
 91 |      L2_write.add(metrics[1]);*/
 92 |     GPU_ERROR(cudaFree(dA));
 93 |     GPU_ERROR(cudaFree(dB));
 94 |   }
 95 |   // blockDV: bytes read by one thread block (N elements per thread, blockSize threads).
 96 |   double blockDV = N * blockSize * sizeof(dtype);
 97 |   // Bandwidth uses the fastest sample; the last four columns rely on median() of series that were never filled - NOTE(review): confirm MeasurementSeries handles that.
 98 |   double bw = blockDV * blockCount / time.minValue() / 1.0e9;
 99 |   cout << fixed << setprecision(0) << setw(10) << blockDV / 1024 << " kB" //
100 |        << fixed << setprecision(0) << setw(10) << blockDV * blockRun / 1024
101 |        << " kB"                                                           //
102 |        << setprecision(0) << setw(10) << time.minValue() * 1000.0 << "ms" //
103 |        << setprecision(1) << setw(10) << time.spread() * 100 << "%"       //
104 |        << setw(10) << bw << " GB/s   "                                    //
105 |        << setprecision(0) << setw(6)
106 |        << dram_read.median() / time.minValue() / 1.0e9 << " GB/s " //
107 |        << setprecision(0) << setw(6)
108 |        << dram_write.median() / time.minValue() / 1.0e9 << " GB/s " //
109 |        << setprecision(0) << setw(6)
110 |        << L2_read.median() / time.minValue() / 1.0e9 << " GB/s " //
111 |        << setprecision(0) << setw(6)
112 |        << L2_write.median() / time.minValue() / 1.0e9 << " GB/s " << endl; //
113 | }
114 | 
115 | size_t constexpr expSeries(size_t N) { // returns the value obtained by starting at 20 and applying val = val * 1.04 + 1 exactly N times
116 |   size_t val = 20;
117 |   for (size_t i = 0; i < N; i++) {
118 |     val = val * 1.04 + 1; // computed in double, then converted back to size_t on every step
119 |   }
120 |   return val;
121 | }
122 | 
123 | int main(int argc, char **argv) { // prints a header line, then calls measure<64> for an increasing sequence of blockRun values
124 |   initMeasureMetric();
125 |   cout << setw(13) << "data set"   //
126 |        << setw(12) << "exec time"  //
127 |        << setw(11) << "spread"     //
128 |        << setw(15) << "Eff. bw\n"; //
129 |   // i starts at 3 and grows by max(1, 0.1 * i) per step (the sum is converted back to int) until it reaches 10000.
130 |   for (int i = 3; i < 10000; i += max(1.0, i * 0.1)) {
131 | #ifdef __NVCC__ // both branches currently make the same call
132 |     measure<64>(i);
133 | #else
134 |     measure<64>(i);
135 | #endif
136 |   }
137 | }
138 | 


--------------------------------------------------------------------------------
/gpu-metrics/cuda_metrics/Utils.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <nvperf_host.h>
  3 | 
  4 | #define RETURN_IF_NVPW_ERROR(retval, actual) /* runs `actual`; on failure logs it to stderr and returns `retval` from the enclosing function */ \
  5 | do {                                                                                \
  6 |     NVPA_Status status = actual;                                                    \
  7 |     if (NVPA_STATUS_SUCCESS != status) {                                            \
  8 |         fprintf(stderr, "FAILED: %s with error %s\n", #actual, NV::Metric::Utils::GetNVPWResultString(status)); \
  9 |         return retval;                                                              \
 10 |     }                                                                               \
 11 | } while (0) /* do/while(0) so the expansion behaves as a single statement */
 12 | 
 13 | namespace NV {
 14 |     namespace Metric {
 15 |         namespace Utils {
 16 |             // Maps an NVPA_Status code to the name of its enumerator; unlisted codes yield "NVPA_STATUS_UNKNOWN".
 17 |             static const char* GetNVPWResultString(NVPA_Status status) {
 18 |                 const char* errorMsg = "NVPA_STATUS_UNKNOWN"; // non-null fallback: the result is printed with %s
 19 |                 switch (status)
 20 |                 {
 21 |                 case NVPA_STATUS_ERROR:
 22 |                     errorMsg = "NVPA_STATUS_ERROR";
 23 |                     break;
 24 |                 case NVPA_STATUS_INTERNAL_ERROR:
 25 |                     errorMsg = "NVPA_STATUS_INTERNAL_ERROR";
 26 |                     break;
 27 |                 case NVPA_STATUS_NOT_INITIALIZED:
 28 |                     errorMsg = "NVPA_STATUS_NOT_INITIALIZED";
 29 |                     break;
 30 |                 case NVPA_STATUS_NOT_LOADED:
 31 |                     errorMsg = "NVPA_STATUS_NOT_LOADED";
 32 |                     break;
 33 |                 case NVPA_STATUS_FUNCTION_NOT_FOUND:
 34 |                     errorMsg = "NVPA_STATUS_FUNCTION_NOT_FOUND";
 35 |                     break;
 36 |                 case NVPA_STATUS_NOT_SUPPORTED:
 37 |                     errorMsg = "NVPA_STATUS_NOT_SUPPORTED";
 38 |                     break;
 39 |                 case NVPA_STATUS_NOT_IMPLEMENTED:
 40 |                     errorMsg = "NVPA_STATUS_NOT_IMPLEMENTED";
 41 |                     break;
 42 |                 case NVPA_STATUS_INVALID_ARGUMENT:
 43 |                     errorMsg = "NVPA_STATUS_INVALID_ARGUMENT";
 44 |                     break;
 45 |                 case NVPA_STATUS_INVALID_METRIC_ID:
 46 |                     errorMsg = "NVPA_STATUS_INVALID_METRIC_ID";
 47 |                     break;
 48 |                 case NVPA_STATUS_DRIVER_NOT_LOADED:
 49 |                     errorMsg = "NVPA_STATUS_DRIVER_NOT_LOADED";
 50 |                     break;
 51 |                 case NVPA_STATUS_OUT_OF_MEMORY:
 52 |                     errorMsg = "NVPA_STATUS_OUT_OF_MEMORY";
 53 |                     break;
 54 |                 case NVPA_STATUS_INVALID_THREAD_STATE:
 55 |                     errorMsg = "NVPA_STATUS_INVALID_THREAD_STATE";
 56 |                     break;
 57 |                 case NVPA_STATUS_FAILED_CONTEXT_ALLOC:
 58 |                     errorMsg = "NVPA_STATUS_FAILED_CONTEXT_ALLOC";
 59 |                     break;
 60 |                 case NVPA_STATUS_UNSUPPORTED_GPU:
 61 |                     errorMsg = "NVPA_STATUS_UNSUPPORTED_GPU";
 62 |                     break;
 63 |                 case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION:
 64 |                     errorMsg = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION";
 65 |                     break;
 66 |                 case NVPA_STATUS_OBJECT_NOT_REGISTERED:
 67 |                     errorMsg = "NVPA_STATUS_OBJECT_NOT_REGISTERED";
 68 |                     break;
 69 |                 case NVPA_STATUS_INSUFFICIENT_PRIVILEGE:
 70 |                     errorMsg = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE";
 71 |                     break;
 72 |                 case NVPA_STATUS_INVALID_CONTEXT_STATE:
 73 |                     errorMsg = "NVPA_STATUS_INVALID_CONTEXT_STATE";
 74 |                     break;
 75 |                 case NVPA_STATUS_INVALID_OBJECT_STATE:
 76 |                     errorMsg = "NVPA_STATUS_INVALID_OBJECT_STATE";
 77 |                     break;
 78 |                 case NVPA_STATUS_RESOURCE_UNAVAILABLE:
 79 |                     errorMsg = "NVPA_STATUS_RESOURCE_UNAVAILABLE";
 80 |                     break;
 81 |                 case NVPA_STATUS_DRIVER_LOADED_TOO_LATE:
 82 |                     errorMsg = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE";
 83 |                     break;
 84 |                 case NVPA_STATUS_INSUFFICIENT_SPACE:
 85 |                     errorMsg = "NVPA_STATUS_INSUFFICIENT_SPACE";
 86 |                     break;
 87 |                 case NVPA_STATUS_OBJECT_MISMATCH:
 88 |                     errorMsg = "NVPA_STATUS_OBJECT_MISMATCH";
 89 |                     break;
 90 |                 case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED:
 91 |                     errorMsg = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED";
 92 |                     break;
 93 |                 default: // any status without a case above keeps the fallback text
 94 |                     break;
 95 |                 }
 96 | 
 97 |                 return errorMsg;
 98 |             }
 99 |         }
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-gapped-stream/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../MeasurementSeries.hpp"
  2 | #include "../measure_metric/measureMetricPW.hpp"
  3 | #include "../dtime.hpp"
  4 | #include "../gpu-error.h"
  5 | #include <iomanip>
  6 | #include <iostream>
  7 | #include <nvml.h>
  8 | 
  9 | using namespace std;
 10 | 
 11 | const size_t elementCount = 4 * 1024 * 1024 * 1024ull;
 12 | double *dA, *dB;
 13 | 
 14 | template <typename T> // sets A[0..N) to 0.1; B, C and D are accepted but not read
 15 | __global__ void init_kernel(T *A, const T *__restrict__ B,
 16 |                             const T *__restrict__ C, const T *__restrict__ D,
 17 |                             const size_t N) {
 18 |   size_t tidx = threadIdx.x + blockIdx.x * blockDim.x;
 19 |   for (size_t i = tidx; i < N; i += blockDim.x * gridDim.x) { // each thread steps by the total number of launched threads
 20 |     A[i] = 0.1;
 21 |   }
 22 | }
 23 | 
 24 | template <typename T> // each thread loads one element of B at an index derived from its thread id and `spacing`; `blocks` is not used
 25 | __global__ void scale_kernel(T *A, const T *__restrict__ B, int blocks, int spacing) {
 26 |   size_t tidx = threadIdx.x + blockIdx.x * blockDim.x;
 27 |   if (tidx >= elementCount)
 28 |     return;
 29 |   // Index = (tidx * spacing) wrapped into [0, elementCount), shifted by the number of completed wraps.
 30 |   size_t idx = ((tidx * spacing) % elementCount + (tidx*spacing) / elementCount) % elementCount;
 31 | 
 32 | 
 33 |   T temp = B[idx];
 34 |   // The store needs threadIdx.x > 10000 in addition to a specific value; measureFunc launches 256 threads per block.
 35 |   if(temp == 12223.0 && threadIdx.x > 10000)
 36 |       A[idx] = 1.2; // = B[idx] * 1.2;
 37 | }
 38 | 
 39 | void measureFunc(int blocks, int spacing) {
 40 |   // Times scale_kernel for the given spacing, collects three groups of hardware metrics in separate runs, prints one row.
 41 |   MeasurementSeries time;
 42 |   int blockSize = 256;
 43 |   int gridSize = (elementCount - 1) / blockSize + 1; // one thread per element, rounded up to whole blocks
 44 | 
 45 |   scale_kernel<<<gridSize, blockSize>>>(dA, dB, blocks, spacing);
 46 | 
 47 |   nvmlDevice_t device; // the NVML handle is fetched but not used further in this function
 48 |   int deviceId;
 49 |   cudaGetDevice(&deviceId);
 50 |   nvmlDeviceGetHandleByIndex(deviceId, &device);
 51 |   // 7 timed samples; each sample covers two back-to-back launches and records half of the elapsed time.
 52 |   for (int iter = 0; iter < 7; iter++) {
 53 |     GPU_ERROR(cudaDeviceSynchronize());
 54 |     double t1 = dtime();
 55 |     GPU_ERROR(cudaDeviceSynchronize());
 56 |     scale_kernel<<<gridSize, blockSize>>>(dA, dB, blocks, spacing);
 57 |     scale_kernel<<<gridSize, blockSize>>>(dA, dB, blocks, spacing);
 58 |     GPU_ERROR(cudaDeviceSynchronize());
 59 |     double t2 = dtime();
 60 |     time.add((t2 - t1) / 2);
 61 |   }
 62 |   // Each metric group below is measured around its own single kernel launch.
 63 |   measureMetricStart({"dram__bytes_read.sum", "dram__bytes_write.sum"});
 64 |   scale_kernel<<<gridSize, blockSize>>>(dA, dB, blocks, spacing);
 65 |   GPU_ERROR(cudaDeviceSynchronize());
 66 |   auto dram_metrics = measureMetricStop();
 67 | 
 68 |   measureMetricStart({"lts__t_sectors_srcunit_tex.sum",
 69 |                       "lts__t_sectors_srcunit_ltcfabric.sum",
 70 |                       "lts__t_sectors.sum"});
 71 |   scale_kernel<<<gridSize, blockSize>>>(dA, dB, blocks, spacing);
 72 |   GPU_ERROR(cudaDeviceSynchronize());
 73 |   auto l2_metrics = measureMetricStop();
 74 | 
 75 |   measureMetricStart({"lts__t_tag_requests.sum",
 76 |           "lts__t_tag_requests.avg.pct_of_peak_sustained_elapsed"});
 77 | 
 78 |   scale_kernel<<<gridSize, blockSize>>>(dA, dB, blocks, spacing);
 79 |   GPU_ERROR(cudaDeviceSynchronize());
 80 |   auto tag_requests = measureMetricStop();
 81 | 
 82 |   cudaDeviceProp prop;
 83 |   GPU_ERROR(cudaGetDevice(&deviceId));
 84 |   GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId));
 85 |   std::string deviceName = prop.name; // deviceName and smCount are not used further in this function
 86 |   int smCount = prop.multiProcessorCount;
 87 |   int maxActiveBlocks = 0;
 88 |   GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 89 |       &maxActiveBlocks, scale_kernel<double>, blockSize, 0));
 90 | 
 91 |   // Row: occupancy, blocks, spacing, rates per second (sector counts * 32; tag requests / 1.41e9 - presumably a fixed clock, TODO confirm), then ratios per buffer byte/element.
 92 |   cout << fixed << setprecision(0)
 93 |        << maxActiveBlocks << " "
 94 |        << setw(2) << " " << setw(5)
 95 |        << blocks << " " << setw(5)
 96 |        << spacing << "       eff:  "
 97 |        << elementCount  * sizeof(double) / time.value() * 1e-9 << " GB/s "
 98 |        << setprecision(0) << setw(8) << dram_metrics[0] / time.value() / 1.0e9 << " GB/s "    //
 99 |        << setprecision(0) << setw(8) << l2_metrics[0]*32 / time.value() / 1.0e9 << " GB/s "    //
100 |        << setprecision(0) << setw(8) << l2_metrics[1]*32 / time.value() / 1.0e9 << " GB/s "    //
101 |        << setprecision(0) << setw(8) << l2_metrics[2]*32 / time.value() / 1.0e9 << " GB/s "   //
102 |        << setprecision(0) << setw(8) << tag_requests[0] / time.value() / 1.41e9 << " /cyc "   //
103 |        << setprecision(0) << setw(8) << tag_requests[1] << " % ";   //
104 |                                                                                              //
105 |   cout << "   "  << setprecision(2) << setw(5) << dram_metrics[0] / (elementCount * sizeof(double)) << "  ";
106 |   cout << "   "  << setprecision(2) << setw(5) << l2_metrics[0]*32 / (elementCount * sizeof(double)) << "  ";
107 |   cout << "   "  << setprecision(2) << setw(5) << l2_metrics[1]*32 / (elementCount * sizeof(double)) << "  ";
108 |   cout << "   "  << setprecision(2) << setw(5) << l2_metrics[2]*32 / (elementCount * sizeof(double)) << " ";
109 |   cout << "   "  << setprecision(3) << setw(5) << tag_requests[0] / (elementCount) << " ";
110 | 
111 | 
112 |   cout << std::endl;
113 | }
114 | 
115 | int main(int argc, char **argv) { // allocates and fills dB, then runs measureFunc for spacings 1, 2, 4, ... up to maxSpacing
116 |     int maxSpacing = 512 * 1024 * 1024;
117 |     size_t bufferSize = elementCount * sizeof(double);
118 |     nvmlInit();
119 |     //GPU_ERROR(cudaMalloc(&dA, bufferSize));
120 |     GPU_ERROR(cudaMalloc(&dB, bufferSize)); // only dB is allocated; dA stays a null global and is still passed to the kernels
121 | 
122 |     init_kernel<<<256, 400>>>(dB, dB, dB, dB, elementCount );
123 |     //init_kernel<<<256, 400>>>(dA, dA, dA, dA, elementCount * maxSpacing);
124 |     GPU_ERROR(cudaDeviceSynchronize());
125 | 
126 |     cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // return value is not checked
127 | 
128 |     for(int blocks = 1; blocks <= 1; blocks *=2) { // runs once, with blocks == 1
129 |         for(int spacing = 1; spacing <= maxSpacing; spacing *= 2) {
130 |             measureFunc(blocks, spacing);
131 |         }
132 |     }
133 | 
134 | 
135 |     //GPU_ERROR(cudaFree(dA));
136 |     GPU_ERROR(cudaFree(dB));
137 | }
138 | 


--------------------------------------------------------------------------------
/gpu-l2-stream/plot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import os
  4 | import csv
  5 | import matplotlib.pyplot as plt
  6 | import numpy as np
  7 | 
  8 | import sys
  9 | 
 10 | sys.path.append("..")
 11 | from device_order import *
 12 | 
 13 | 
 14 | fig, ax = plt.subplots(figsize=(6, 4))  # bandwidth over data set size
 15 | fig2, ax2 = plt.subplots(figsize=(6, 4))  # bandwidth over thread count
 16 | 
 17 | 
 18 | maxbars = {}  # maxbars and minbars are not used in the rest of this script
 19 | minbars = {}
 20 | 
 21 | devicesToInclude = [  # only result files whose lower-cased name starts with one of these prefixes are plotted
 22 |     "a40",
 23 |     "l40",
 24 |     "v100",
 25 |     "a100_80",
 26 |     "gh200",
 27 |     "mi210",
 28 |     "rx6900xt",
 29 |     "mi300x",
 30 | ]
 31 | 
 32 | 
 33 | for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)):  # one pass per result file, ordered by getOrderNumber
 34 |     if not filename.endswith(".txt") or not any(
 35 |         [True if filename.lower().startswith(f) else False for f in devicesToInclude]
 36 |     ):
 37 |         continue
 38 |     with open(filename, newline="") as csvfile:
 39 |         csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)
 40 |         # Each dict below maps thread count -> {size -> value}.
 41 |         mediData = {}
 42 |         maxiData = {}
 43 |         miniData = {}
 44 |         readData = {}
 45 |         triadData = {}
 46 |         initData = {}
 47 | 
 48 |         for row in csvreader:
 49 |             if len(row) < 18 or not row[0].isnumeric():  # skip rows that are too short or do not start with a number
 50 |                 continue
 51 | 
 52 |             print(row)
 53 |             threads = int(row[2])
 54 |             size = int(row[3])
 55 |             mini = float(row[10])
 56 |             medi = float(row[11])
 57 |             maxi = float(row[12])
 58 | 
 59 |             read = float(row[8])
 60 |             triad = float(row[14])
 61 |             init = float(row[17])
 62 | 
 63 |             if threads not in mediData:
 64 |                 mediData[threads] = {}
 65 |                 maxiData[threads] = {}
 66 |                 miniData[threads] = {}
 67 |                 readData[threads] = {}
 68 |                 triadData[threads] = {}
 69 |                 initData[threads] = {}
 70 | 
 71 |             mediData[threads][size] = medi
 72 |             maxiData[threads][size] = maxi
 73 |             miniData[threads][size] = mini
 74 |             readData[threads][size] = read
 75 |             triadData[threads][size] = triad
 76 |             initData[threads][size] = init
 77 | 
 78 |         # ax.scatter(
 79 |         #    [v for b in data for v in data[b].keys()],
 80 |         #    [v for b in data for v in data[b].values()],
 81 |         #    label=filename[:-4].upper(),
 82 |         #    color=getDeviceColor(filename),
 83 |         #    alpha=0.2,
 84 |         #    #   **lineStyle
 85 |         # )
 86 | 
 87 |         miniBWPerSize = {}
 88 |         maxBWPerSize = {}
 89 |         mediBWPerSize = {}
 90 |         # For every size keep the min/median/max of the thread count with the highest median.
 91 |         for threads in mediData.keys():
 92 |             for size in mediData[threads].keys():
 93 |                 if (
 94 |                     size not in mediBWPerSize
 95 |                     or mediBWPerSize[size] < mediData[threads][size]
 96 |                 ):
 97 |                     maxBWPerSize[size] = maxiData[threads][size]
 98 |                     mediBWPerSize[size] = mediData[threads][size]
 99 |                     miniBWPerSize[size] = miniData[threads][size]
100 | 
101 |         ax.fill_between(
102 |             maxBWPerSize.keys(),
103 |             miniBWPerSize.values(),
104 |             maxBWPerSize.values(),
105 |             alpha=0.4,
106 |             color=getDeviceColor(filename),
107 |             edgecolor=None,
108 |         )
109 |         ax.plot(
110 |             maxBWPerSize.keys(),
111 |             mediBWPerSize.values(),
112 |             color=getDeviceColor(filename),
113 |             label=order[getOrderNumber(filename)].upper(),
114 |             # *lineStyle,
115 |         )
116 |         if len(maxBWPerSize) > 0:
117 |             ax.set_xlim([list(maxBWPerSize.keys())[0], list(maxBWPerSize.keys())[-1]])
118 | 
119 |         bws = []
120 |         # bws gets one entry per thread count: the median at the size closest to 2000 (0 if no size is closer than the initial 0).
121 |         closestSize = 0
122 |         for b in mediData.values():
123 |             bws.append(0)
124 |             closestSize = 0
125 |             for v in b.items():
126 |                 if abs(v[0] - 2000) < abs(closestSize - 2000):
127 | 
128 |                     bws[-1] = v[1]
129 |                     closestSize = v[0]
130 |         # NOTE(review): the slice below pairs bws with the filtered keys by position; this assumes thread counts >= 400000 come last.
131 |         ax2.plot(
132 |             [k for k in mediData.keys() if k < 400000],
133 |             bws[: len([k for k in mediData.keys() if k < 400000])],
134 |             label=filename[:-4].upper(),
135 |             color=getDeviceColor(filename),
136 |             # *lineStyle,
137 |         )
138 | 
139 |         print(closestSize)
140 | 
141 |         print(filename, getOrderNumber(filename))
142 | 
143 | 
144 | ########ax.set_xticks(threads[::5])
145 | # ax.set_xticklabels(threads, rotation="vertical")
146 | ax.set_xlabel("dataset size, MB")
147 | ax.set_ylabel("Bandwidth, GB/s")
148 | 
149 | # ax.axhline(1400, linestyle="--", color="C1")
150 | # ax.axhline(800, linestyle="--", color="C0")
151 | 
152 | # ax.grid()
153 | ax.legend()
154 | ax.set_ylim([0, ax.get_ylim()[1]])
155 | 
156 | ax.set_xscale("log", base=2)
157 | formatter = matplotlib.ticker.FuncFormatter(
158 |     lambda x, pos: "{0:g} kB".format(x) if x < 1024 else "{0:g}".format(x / 1024)
159 | )
160 | ax.get_xaxis().set_major_formatter(formatter)
161 | ax.set_xticks(
162 |     [
163 |         1024,
164 |         2048,
165 |         4096,
166 |         8192,
167 |         20 * 1024,
168 |         40 * 1024,
169 |         96 * 1024,
170 |         256 * 1024,
171 |         512 * 1024,
172 |     ]
173 | )
174 | 
175 | 
176 | fig.tight_layout()
177 | fig.savefig("gpu-l2-stream.pdf", dpi=300)
178 | 
179 | 
180 | ax2.set_xlabel("threads")
181 | ax2.set_ylabel("Bandwidth, GB/s")
182 | 
183 | 
184 | ax2.legend()
185 | ax2.set_xlim([0, 370000])
186 | ax2.set_ylim([0, ax2.get_ylim()[1]])
187 | 
188 | fig2.tight_layout()
189 | fig2.savefig("gpu-l2-stream-scaling.pdf", dpi=300)
190 | 
191 | 
192 | plt.show()
193 | 


--------------------------------------------------------------------------------
/cuda-incore/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../MeasurementSeries.hpp"
  2 | #include "../dtime.hpp"
  3 | #include "../gpu-clock.cuh"
  4 | #include "../gpu-error.h"
  5 | #include "../metrics.cuh"
  6 | #include <iomanip>
  7 | #include <iostream>
  8 | #include <map>
  9 | 
 10 | using namespace std;
 11 | 
 12 | template <typename T> __global__ void initKernel(T *A, size_t N) { // sets A[0..N) to 1.1; each thread starts at its global index and steps by the total thread count
 13 |   size_t tidx = blockDim.x * blockIdx.x + threadIdx.x;
 14 |   for (size_t idx = tidx; idx < N; idx += blockDim.x * gridDim.x) { // size_t index: matches N, no narrowing or signed overflow for large N
 15 |     A[idx] = 1.1;
 16 |   }
 17 | }
 18 | 
 19 | template <typename T, int N, int M> // per outer iteration: M independent values, each updated N / M times, with the updates interleaved across the M values
 20 | __global__ void FMA_mixed(T p, T *A, int iters) {
 21 | #pragma unroll(1)
 22 |   for (int iter = 0; iter < iters; iter++) {
 23 |     T t[M];
 24 | #pragma unroll
 25 |     for (int m = 0; m < M; m++) {
 26 |       t[m] = p + threadIdx.x + iter + m; // start values differ per thread, iteration and slot
 27 |     }
 28 | #pragma unroll
 29 |     for (int n = 0; n < N / M; n++) {
 30 | #pragma unroll
 31 |       for (int m = 0; m < M; m++) {
 32 |         t[m] = t[m] * (T)0.9 + (T)0.5; // multiply-add on slot m; slot m depends only on its own previous value
 33 |       }
 34 |     }
 35 | #pragma unroll
 36 |     for (int m = 0; m < M; m++) {
 37 |       if (t[m] > (T)22313.0) { // data-dependent store, so the results stay observable
 38 |         A[0] = t[m];
 39 |       }
 40 |     }
 41 |   }
 42 | }
 43 | 
 44 | template <typename T, int N, int M>
 45 | __global__ void FMA_separated(T p, T *A, int iters) {
 46 | 
 47 |   for (int iter = 0; iter < iters; iter++) {
 48 | #pragma unroll
 49 |     for (int m = 0; m < M; m++) {
 50 |       T t = p + threadIdx.x + iter + m;
 51 |       for (int n = 0; n < N; n++) {
 52 |         t = t * (T)0.9 + (T)0.5;
 53 |       }
 54 |       if (t > (T)22313.0) {
 55 |         A[0] = t;
 56 |       }
 57 |     }
 58 |   }
 59 | }
 60 | 
 61 | template <typename T, int N, int M>
 62 | __global__ void DIV_separated(T p, T *A, int iters) {
 63 | 
 64 | #pragma unroll(1)
 65 |   for (int iter = 0; iter < iters; iter++) {
 66 |     for (int m = 0; m < M; m++) {
 67 |       T t = p + threadIdx.x + iter + m;
 68 | 
 69 |       for (int n = 0; n < N; n++) {
 70 |         t = 0.1 / (t + 0.2);
 71 |       }
 72 | 
 73 |       A[threadIdx.x + iter] = t;
 74 |     }
 75 |   }
 76 | }
 77 | 
 78 | template <typename T, int N, int M>
 79 | __global__ void SQRT_separated(T p, T *A, int iters) {
 80 | 
 81 | #pragma unroll(1)
 82 |   for (int iter = 0; iter < iters; iter++) {
 83 | 
 84 |     for (int m = 0; m < M; m++) {
 85 |       T t = p + threadIdx.x + iter + m;
 86 | 
 87 |       for (int n = 0; n < N; n++) {
 88 |         t = sqrt(t + 0.2);
 89 |       }
 90 | 
 91 |       A[threadIdx.x + iter] = t;
 92 |     }
 93 |   }
 94 | }
 95 | 
 96 | unsigned int gpu_clock = 0;
 97 | 
 98 | template <typename T, int N, int M>
 99 | double measure(int warpCount, void (*kernel)(T, T *, int)) {
100 |   nvmlDevice_t device;
101 |   nvmlDeviceGetHandleByIndex(0, &device);
102 | 
103 |   const int iters = 10000;
104 |   const int blockSize = 32 * warpCount;
105 |   const int blockCount = 1;
106 | 
107 |   MeasurementSeries time;
108 | 
109 |   T *dA;
110 |   GPU_ERROR(cudaMalloc(&dA, iters * 2 * sizeof(T)));
111 |   initKernel<<<52, 256>>>(dA, iters * 2);
112 |   GPU_ERROR(cudaDeviceSynchronize());
113 | 
114 |   kernel<<<blockCount, blockSize>>>((T)0.32, dA, iters);
115 |   GPU_ERROR(cudaDeviceSynchronize());
116 |   for (int i = 0; i < 1; i++) {
117 |     double t1 = dtime();
118 |     kernel<<<blockCount, blockSize>>>((T)0.32, dA, iters);
119 |     GPU_ERROR(cudaDeviceSynchronize());
120 |     double t2 = dtime();
121 |     time.add(t2 - t1);
122 |   }
123 |   cudaFree(dA);
124 | 
125 |   double rcpThru = time.value() * gpu_clock * 1.0e6 / N / iters / warpCount;
126 |   /*cout << setprecision(1) << fixed << typeid(T).name() << " " << setw(5) << N
127 |        << " " << warpCount << " " << setw(5) << M << " "
128 |        << " " << setw(5) << time.value() * 100 << " " << setw(5)
129 |        << time.spread() * 100 << "%   " << setw(5) << setprecision(2) << rcpThru
130 |        << "  " << setw(9) << clock << "MHz\n" ;*/
131 |   return rcpThru;
132 | }
133 | 
134 | template <typename T> void measureTabular(int maxWarpCount) {
135 | 
136 |   vector<map<pair<int, int>, double>> r(3);
137 |   const int N = 1024;
138 |   for (int warpCount = 1; warpCount <= maxWarpCount; warpCount *= 2) {
139 |     r[0][{warpCount, 1}] = measure<T, N, 1>(warpCount, FMA_mixed<T, N, 1>);
140 |     r[1][{warpCount, 1}] =
141 |         measure<T, N / 8, 1>(warpCount, DIV_separated<T, N / 8, 1>);
142 |     r[2][{warpCount, 1}] =
143 |         measure<T, N / 8, 1>(warpCount, SQRT_separated<T, N / 8, 1>);
144 |     r[0][{warpCount, 2}] = measure<T, N, 2>(warpCount, FMA_mixed<T, N, 2>);
145 |     r[1][{warpCount, 2}] =
146 |         measure<T, N / 8, 2>(warpCount, DIV_separated<T, N / 8, 2>);
147 |     r[2][{warpCount, 2}] =
148 |         measure<T, N / 8, 2>(warpCount, SQRT_separated<T, N / 8, 2>);
149 |     r[0][{warpCount, 4}] = measure<T, N, 4>(warpCount, FMA_mixed<T, N, 4>);
150 |     r[1][{warpCount, 4}] =
151 |         measure<T, N / 8, 4>(warpCount, DIV_separated<T, N / 8, 4>);
152 |     r[2][{warpCount, 4}] =
153 |         measure<T, N / 8, 4>(warpCount, SQRT_separated<T, N / 8, 4>);
154 |     r[0][{warpCount, 8}] = measure<T, N, 8>(warpCount, FMA_mixed<T, N, 8>);
155 |     r[1][{warpCount, 8}] =
156 |         measure<T, N / 8, 8>(warpCount, DIV_separated<T, N / 8, 8>);
157 |     r[2][{warpCount, 8}] =
158 |         measure<T, N / 8, 8>(warpCount, SQRT_separated<T, N / 8, 8>);
159 |     // cout << "\n";
160 |   }
161 | 
162 |   for (int i = 0; i < 3; i++) {
163 |     for (int warpCount = 1; warpCount <= maxWarpCount; warpCount *= 2) {
164 |       for (int streams = 1; streams <= 8; streams *= 2) {
165 |         cout << setw(7) << setprecision(3) << r[i][{warpCount, streams}] << " ";
166 |       }
167 |       cout << "\n";
168 |     }
169 |     cout << "\n";
170 |   }
171 | }
172 | 
173 | int main(int argc, char **argv) {
174 |   gpu_clock = getGPUClock();
175 |   measureTabular<float>(32);
176 |   measureTabular<double>(32);
177 | }
178 | 


--------------------------------------------------------------------------------
/gpu-roofline/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../dtime.hpp"
  2 | #include "../gpu-error.h"
  3 | #include <cuComplex.h>
  4 | #include <cuda_runtime.h>
  5 | #include <iomanip>
  6 | #include <iostream>
  7 | #include <omp.h>
  8 | #include <sys/time.h>
  9 | #include <unistd.h>
 10 | 
 11 | #include "../MeasurementSeries.hpp"
 12 | 
 13 | #include "../gpu-stats.h"
 14 | 
 15 | using namespace std;
 16 | 
 17 | template <typename T> __global__ void initKernel(T *data, size_t data_len) {
 18 |   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
 19 |   for (int idx = tidx; idx < data_len; idx += gridDim.x * blockDim.x) {
 20 |     data[idx] = idx;
 21 |   }
 22 | }
 23 | 
 24 | template <typename T, int N, int M, int BLOCKSIZE>
 25 | __global__ void testfun(T *const __restrict__ dA, T *const __restrict__ dB,
 26 |                         T *dC) {
 27 |   T *sA = dA + threadIdx.x + blockIdx.x * BLOCKSIZE * M;
 28 |   T *sB = dB + threadIdx.x + blockIdx.x * BLOCKSIZE * M;
 29 | 
 30 |   T sum = 0;
 31 | 
 32 | //#pragma unroll 1
 33 |   for (int i = 0; i < M; i += 2) {
 34 |     T a = sA[i * BLOCKSIZE];
 35 |     T b = sB[i * BLOCKSIZE];
 36 |     T v = a - b;
 37 |     T a2 = sA[(i + 1) * BLOCKSIZE];
 38 |     T b2 = sB[(i + 1) * BLOCKSIZE];
 39 |     T v2 = a2 - b2;
 40 |     //#pragma unroll N
 41 |     for (int i = 0; i < N; i++) {
 42 |       v = v * a - b;
 43 |       v2 = v2 * a - b;
 44 |     }
 45 |     sum += v + v2;
 46 |   }
 47 |   if (threadIdx.x == 0)
 48 |     dC[blockIdx.x] = sum;
 49 | }
 50 | 
 51 | template <typename T, int N, int M, int BLOCKSIZE>
 52 | __global__ void testfun_max_power(T *const __restrict__ dA,
 53 |                                   T *const __restrict__ dB, T *dC) {
 54 |   T *sA = dA + threadIdx.x + (blockIdx.x / 2) * BLOCKSIZE * M;
 55 |   T *sB = dB + threadIdx.x + (blockIdx.x / 2) * BLOCKSIZE * M;
 56 | 
 57 |   T sum = 0;
 58 | 
 59 |   // #pragma unroll 1
 60 |   for (int i = 0; i < M; i += 2) {
 61 |     T a = sA[i * BLOCKSIZE];
 62 |     T b = sB[i * BLOCKSIZE];
 63 |     T v = a - b;
 64 |     T a2 = sA[(i + 1) * BLOCKSIZE];
 65 |     T b2 = sB[(i + 1) * BLOCKSIZE];
 66 |     T v2 = a2 - b2;
 67 |     for (int i = 0; i < N; i++) {
 68 |       v = v * a - b;
 69 |       v2 = v2 * a2 - b2;
 70 |     }
 71 |     sum += v + v2;
 72 |   }
 73 |   if (threadIdx.x == 0)
 74 |     dC[blockIdx.x] = sum;
 75 | }
 76 | 
 77 | int main(int argc, char **argv) {
 78 | 
 79 |   typedef float dtype;
 80 |   const int M = 4000;
 81 |   // PARN is a constant from the Makefile, set via -DPARN=X
 82 |   const int N = PARN;
 83 |   const int BLOCKSIZE = 256;
 84 | 
 85 |   int nDevices;
 86 |   GPU_ERROR(cudaGetDeviceCount(&nDevices));
 87 | 
 88 | #pragma omp parallel num_threads(nDevices)
 89 |   {
 90 |     GPU_ERROR(cudaSetDevice(omp_get_thread_num()));
 91 | #pragma omp barrier
 92 |     int deviceId;
 93 |     GPU_ERROR(cudaGetDevice(&deviceId));
 94 |     cudaDeviceProp prop;
 95 |     GPU_ERROR(cudaGetDeviceProperties(&prop, deviceId));
 96 |     int numBlocks;
 97 | 
 98 |     GPU_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 99 |         &numBlocks, testfun<dtype, N, M, BLOCKSIZE>, BLOCKSIZE, 0));
100 |     int blockCount = prop.multiProcessorCount * numBlocks;
101 | 
102 |     size_t data_len = (size_t)blockCount * BLOCKSIZE * M;
103 |     dtype *dA = NULL;
104 |     dtype *dB = NULL;
105 |     dtype *dC = NULL;
106 |     size_t iters = 1000;
107 | 
108 |     GPU_ERROR(cudaMalloc(&dA, data_len * sizeof(dtype)));
109 |     GPU_ERROR(cudaMalloc(&dB, data_len * sizeof(dtype)));
110 |     GPU_ERROR(cudaMalloc(&dC, data_len * sizeof(dtype)));
111 | #pragma omp barrier
112 |     initKernel<<<blockCount, 256>>>(dA, data_len);
113 |     initKernel<<<blockCount, 256>>>(dB, data_len);
114 |     initKernel<<<blockCount, 256>>>(dC, data_len);
115 |     GPU_ERROR(cudaDeviceSynchronize());
116 | 
117 | #pragma omp barrier
118 | 
119 |     cudaEvent_t start, stop;
120 |     GPU_ERROR(cudaEventCreate(&start));
121 |     GPU_ERROR(cudaEventCreate(&stop));
122 | 
123 |     GPU_ERROR(cudaEventRecord(start));
124 |     for (size_t iter = 0; iter < iters; iter++) {
125 |       testfun<dtype, N, M, BLOCKSIZE><<<blockCount, BLOCKSIZE>>>(dA, dB, dC);
126 |     }
127 |     GPU_ERROR(cudaEventRecord(stop));
128 | 
129 |     MeasurementSeries powerSeries;
130 |     MeasurementSeries clockSeries;
131 |     MeasurementSeries temperatureSeries;
132 | 
133 |     do {
134 |       usleep(1000);
135 |       auto stats = getGPUStats(deviceId);
136 |       powerSeries.add(stats.power);
137 |       clockSeries.add(stats.clock);
138 |       temperatureSeries.add(stats.temperature);
139 |     } while (cudaEventQuery(stop) == cudaErrorNotReady);
140 | 
141 |     GPU_ERROR(cudaEventSynchronize(stop));
142 |     GPU_ERROR(cudaGetLastError());
143 | 
144 |     float milliseconds;
145 |     GPU_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
146 |     float dt = milliseconds / 1000;
147 | 
148 | #pragma omp barrier
149 | #pragma omp for ordered schedule(static, 1)
150 |     for (int i = 0; i < omp_get_num_threads(); i++) {
151 | #pragma omp ordered
152 |       {
153 | 
154 |         //for (auto v : clockSeries)
155 |         //  std::cout << v << " ";
156 |         //std::cout << "\n"; 
157 |         cout << setprecision(3) << fixed << deviceId << " " << blockCount
158 |              << " blocks   " << setw(3) << N << " its      "
159 |              << (2.0 + N * 2.0) / (2.0 * sizeof(dtype)) << " Fl/B      "
160 |              << setprecision(0) << setw(5)
161 |              << iters * 2 * data_len * sizeof(dtype) / dt * 1.0e-9
162 |              << " GB/s    " << setw(6)
163 |              << iters * (2 + N * 2) * data_len / dt * 1.0e-9 << " GF/s   "
164 |              << clockSeries.median() << " Mhz   "
165 |              << powerSeries.median() / 1000 << " W   "
166 |              << temperatureSeries.median() << "°C\n";
167 |       }
168 |     }
169 |     GPU_ERROR(cudaFree(dA));
170 |     GPU_ERROR(cudaFree(dB));
171 |     GPU_ERROR(cudaFree(dC));
172 |   }
173 |   cout << "\n";
174 | }
175 | 


--------------------------------------------------------------------------------
/unmaintained/cuda-busy/main.cu:
--------------------------------------------------------------------------------
  1 | #include "../MeasurementSeries.hpp"
  2 | #include "../dtime.hpp"
  3 | #include "../gpu-error.h"
  4 | #include "../metrics.cuh"
  5 | #include <iomanip>
  6 | #include <iostream>
  7 | 
  8 | using namespace std;
  9 | 
 10 | double *dA, *dB;
 11 | 
 12 | using kernel_ptr_type = void (*)(int iters, double *A, const double *B);
 13 | 
 14 | template <int N, int UNROLL, bool DOTPRODUCT>
 15 | __global__ __launch_bounds__(1024, 1) void kernel(int iters, double *A,
 16 |                                                   double *B) {
 17 | 
 18 |   int widx = threadIdx.x / 32;
 19 |   double sum = 0.0;
 20 | #pragma unroll(1)
 21 |   for (int w = 0; w < (widx % 5) * 11; w++) {
 22 |     sum += w;
 23 |   }
 24 | 
 25 |   double *dA = A + threadIdx.x;
 26 |   double *dB = B + threadIdx.x;
 27 | 
 28 | #pragma unroll(1)
 29 |   for (int iter = 0; iter < iters; iter++) {
 30 | #pragma unroll(UNROLL)
 31 |     for (int n = 0; n < N; n++) {
 32 |       if (DOTPRODUCT)
 33 |         sum += dA[n * 32] * dB[n * 32];
 34 |       else
 35 |         sum += dA[n * 32];
 36 |     }
 37 |   }
 38 | 
 39 |   if (sum == -12.3) {
 40 |     A[threadIdx.x] = sum;
 41 |   }
 42 | }
 43 | 
 44 | double pred(int Iint, int Ild, int Idp, int Nsm, int ClL1) {
 45 |   int Nq = ceil((double)Nsm / 4);
 46 | 
 47 |   int Tdp = Idp * 4;
 48 |   int Tld = Ild * 4;
 49 |   int Tint = Iint * 2;
 50 |   int TL1lat = 32;
 51 |   int TL1thru = ClL1 * Nsm;
 52 | 
 53 |   int Ttotal = Tint + max(max(Tld, TL1lat) + Tdp, TL1thru);
 54 | 
 55 |   cout << setw(5) << Tdp << " ";
 56 |   cout << setw(5) << Tld << " ";
 57 |   cout << setw(5) << Tint << " ";
 58 | 
 59 | 
 60 |   TL1lat = 32 + (double)TL1thru / Ttotal * 16;
 61 |   TL1thru = ClL1 * Nsm * (1.0f + (double)TL1thru / Ttotal) * 0.5f;
 62 |   Ttotal = Tint + max(max(Tld, TL1lat) + Tdp, TL1thru);
 63 | 
 64 | 
 65 |   
 66 |   TL1lat = 32 + (double)TL1thru / Ttotal * 16;
 67 |   TL1thru = ClL1 * Nsm * (1.0f + (double)TL1thru / Ttotal) * 0.5f;
 68 |   Ttotal = Tint + max(max(Tld, TL1lat) + Tdp, TL1thru);
 69 | 
 70 |   string cont = "Tint + ";
 71 |   if (TL1thru >= max(Tld, TL1lat) + Tdp) {
 72 |     cont += " TL1thru ";
 73 |   } else if (TL1thru == max(Tld, TL1lat) + Tdp) {
 74 |     cont += " | ";
 75 |   } else {
 76 |     cont += "( ";
 77 |     if (Tld > TL1lat) {
 78 |       cont += "Tld";
 79 |     } else if (Tld == TL1lat) {
 80 |       cont += "TL1lat|Tld";
 81 |     } else {
 82 |       cont += "TL1lat";
 83 |     }
 84 |     cont += " + Tdp)";
 85 |   }
 86 | 
 87 |   cout << cont << "  ";
 88 | 
 89 |   return Ttotal;
 90 | }
 91 | 
 92 | template <int DV, int UNROLL, bool DOTPRODUCT>
 93 | void measure(int blockSize, bool concise = false) {
 94 | 
 95 |   if (DV % (32 * UNROLL) != 0)
 96 |     cout << DV << " % " << 32 * UNROLL << " != 0\n";
 97 | 
 98 |   if (DV * 8 * 2 > 128 * 1024)
 99 |     cout << DV * 8 * 2 << " > " << 128 * 1024 << "\n";
100 | 
101 |   if (DV * 8 * 2 < 64 * 1024)
102 |     cout << DV * 8 * 2 << " < " << 64 * 1024 << "\n";
103 | 
104 |   int blockCount = 1;
105 |   const int N = DV / 32;
106 |   int iters = 100000 / N;
107 | 
108 |   GPU_ERROR(cudaFuncSetCacheConfig(kernel<N, UNROLL, DOTPRODUCT>,
109 |                                    cudaFuncCachePreferL1));
110 | 
111 |   MeasurementSeries time;
112 |   for (int i = 0; i < 20; i++) {
113 | 
114 |     GPU_ERROR(cudaDeviceSynchronize());
115 |     double t1 = dtime();
116 | 
117 |     kernel<N, UNROLL, DOTPRODUCT><<<blockCount, blockSize>>>(iters, dA, dB);
118 | 
119 |     GPU_ERROR(cudaDeviceSynchronize());
120 |     double t2 = dtime();
121 |     time.add(t2 - t1);
122 |   }
123 |   GPU_ERROR(cudaGetLastError());
124 | 
125 |   double spread = (time.median() - time.minValue()) / time.median() * 100;
126 |   double dt = time.minValue();
127 |   double bw = (DOTPRODUCT ? 2 : 1) * DV * iters * sizeof(double) / dt / 1e9;
128 |   double cyc = dt / (DV * iters) * 1.38e9 * 32;
129 | 
130 |   if (concise) {
131 |     cout << fixed << setprecision(2) << setw(7) << cyc << " ";
132 |   } else {
133 | 
134 |     cout << fixed << setprecision(2);
135 |     cout << setw(3) << UNROLL << "  "     //
136 |          << setw(8) << dt * 1000 << "   " //
137 |          << setw(8) << spread << "   "    //
138 |          << setw(8) << bw << "   "        //
139 |          << setw(8) << cyc << " -- ";
140 |     // << setw(8)
141 |     //<< (20.0 + max(UNROLL * (DOTPRODUCT ? 8 : 4), 30) + UNROLL * 8) /
142 |     //        UNROLL
143 | 
144 |     int Iint = 10;
145 |     int Ild = UNROLL * (DOTPRODUCT ? 2 : 1);
146 |     int Idp = UNROLL;
147 |     int ClL1 = Ild * 2;
148 |     int Nsm = max(1, blockSize / 32);
149 |     int Nq = max(1, blockSize / 32 / 4);
150 | 
151 |     cout << setw(5) << pred(Iint, Ild, Idp, ClL1, Nsm) / UNROLL << " ";
152 | 
153 |     cout << "\n";
154 |   }
155 | }
156 | 
157 | int main(int argc, char **argv) {
158 | 
159 |   size_t maxBufferSize = 1024 * 1024;
160 |   GPU_ERROR(cudaMallocManaged(&dA, sizeof(double) * maxBufferSize));
161 |   GPU_ERROR(cudaMallocManaged(&dB, sizeof(double) * maxBufferSize));
162 |   for (size_t i = 0; i < maxBufferSize; i++) {
163 |     dA[i] = 1.2;
164 |     dB[i] = 1.21;
165 |   }
166 | 
167 |   bool concise = false;
168 |   const bool dotProduct = false;
169 |   for (int blockSize = 32; blockSize <= 1024; blockSize *= 2) {
170 |     measure<8 * 512, 1, dotProduct>(blockSize, concise);
171 |     measure<8 * 512, 2, dotProduct>(blockSize, concise);
172 |     measure<3 * 2048, 3, dotProduct>(blockSize, concise);
173 |     measure<8 * 512, 4, dotProduct>(blockSize, concise);
174 |     measure<6 * 1024, 6, dotProduct>(blockSize, concise);
175 |     measure<8 * 512, 8, dotProduct>(blockSize, concise);
176 |     measure<9 * 512, 9, dotProduct>(blockSize, concise);
177 |     measure<3 * 2048, 12, dotProduct>(blockSize, concise);
178 |     measure<8 * 512, 16, dotProduct>(blockSize, concise);
179 |     measure<9 * 512, 18, dotProduct>(blockSize, concise);
180 |     measure<6 * 1024, 24, dotProduct>(blockSize, concise);
181 |     measure<27 * 256, 27, dotProduct>(blockSize, concise);
182 |     measure<8 * 512, 32, dotProduct>(blockSize, concise);
183 |     cout << "\n";
184 |   }
185 |   GPU_ERROR(cudaFree(dA));
186 |   GPU_ERROR(cudaFree(dB));
187 |   return 0;
188 | }
189 | 


--------------------------------------------------------------------------------
/gpu-stream/plot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import os
  4 | import csv
  5 | import matplotlib.pyplot as plt
  6 | import numpy as np
  7 | 
  8 | import sys
  9 | 
 10 | sys.path.append("..")
 11 | from device_order import *
 12 | 
 13 | # fig, (ax, ax2) = plt.subplots(1, 2, sharey=True, facecolor="w", figsize=(8, 5))
 14 | fig, ax = plt.subplots(figsize=(6, 4))
 15 | 
 16 | # fig2, ax2 = plt.subplots(figsize=(8, 4))
 17 | # fig3, ax3 = plt.subplots(figsize=(8, 4))
 18 | 
 19 | 
 20 | maxbars = {}
 21 | minbars = {}
 22 | 
 23 | devicesToInclude = [
 24 |     "a40",
 25 |     "l40",
 26 |     "v100",
 27 |     "a100_80",
 28 |     "gh200",
 29 |     "mi210",
 30 |     "rx6900xt",
 31 |     "mi300x",
 32 |     # "mi300a",
 33 | ]
 34 | 
 35 | 
 36 | for filename in sorted(os.listdir("."), key=lambda f1: getOrderNumber(f1)):
 37 |     if not filename.endswith(".txt") or not any(
 38 |         [True if filename.lower().startswith(f) else False for f in devicesToInclude]
 39 |     ):
 40 |         continue
 41 |     with open(filename, newline="") as csvfile:
 42 |         csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)
 43 |         threads = []
 44 |         locs = []
 45 |         init = []
 46 |         read = []
 47 |         scale = []
 48 |         triad = []
 49 |         stencil3pt = []
 50 |         stencil5pt = []
 51 | 
 52 |         for row in csvreader:
 53 |             if row[0].startswith("block") or len(row) < 12:
 54 |                 continue
 55 | 
 56 |             # print(row)
 57 |             threads.append(int(row[1]))
 58 |             init.append(float(row[6]))
 59 |             read.append(float(row[7]))
 60 |             scale.append(float(row[8]))
 61 |             triad.append(float(row[9]))
 62 |             locs.append(float(row[2]))
 63 |             stencil3pt.append(float(row[10]))
 64 |             stencil5pt.append(float(row[11]))
 65 | 
 66 |         if len(threads) < 1:
 67 |             continue
 68 | 
 69 |         # locs = threads#[15 + l / 6 if l > 15 else l for l in locs]
 70 |         # print(locs)
 71 |         # print(threads)
 72 |         # ax.plot(locs, init,  "-v", label=filename, color="C" + str(color))
 73 |         ax.plot(
 74 |             np.array(threads),
 75 |             scale,
 76 |             label=order[getOrderNumber(filename)].upper(),
 77 |             color=getDeviceColor(filename),
 78 |             **lineStyle
 79 |         )
 80 |         # ax2.plot(
 81 |         #    np.array(threads),
 82 |         #    triad,
 83 |         #    label=filename[:-4].upper(),
 84 |         #    color=getDeviceColor(filename),
 85 |         #    **lineStyle
 86 |         # )
 87 |         print(filename, getOrderNumber(filename))
 88 | 
 89 |         # ax.plot(threads, triad, "-<", label=filename, color="C" + str(color))
 90 |         # ax.plot(threads, read, "-^", label=filename, color="C" + str(color))
 91 | 
 92 |         maxbars[filename] = [
 93 |             read[-1],
 94 |             scale[-1],
 95 |             triad[-1],
 96 |             init[-1],
 97 |             # stencil3pt[-1],
 98 |             # stencil5pt[-1],
 99 |         ]
100 | 
101 |         mClosest = 0
102 |         for m in range(len(threads)):
103 |             if abs(threads[m] - 10000) < abs(threads[mClosest] - 10000):
104 |                 mClosest = m
105 | 
106 |         print(threads[mClosest])
107 |         minbars[filename] = [
108 |             read[mClosest],
109 |             scale[mClosest],
110 |             triad[mClosest],
111 |             init[mClosest],
112 |             # stencil3pt[0],
113 |             # stencil5pt[0],
114 |         ]
115 | 
116 | 
117 | ########ax.set_xticks(threads[::5])
118 | # ax.set_xticklabels(threads, rotation="vertical")
# The tick formatter below needs the matplotlib.ticker submodule; the imports
# at the top of this script only bind matplotlib.pyplot as plt.
import matplotlib.ticker

ax.set_xlabel("threads")
ax.set_ylabel("DRAM bandwidth, GB/s")

# ax.axhline(1400, linestyle="--", color="C1")
# ax.axhline(800, linestyle="--", color="C0")

# ax.grid()
#
#
# ax.set_xscale("log")
ax.legend()

# Anchor both axes at zero.
ax.set_ylim([0, ax.get_ylim()[1]])
ax.set_xlim([0, 400000])

# Show thread counts in thousands, e.g. 100000 -> "100K".
formatter = matplotlib.ticker.FuncFormatter(lambda x, pos: "{:.0f}K".format(x // 1000))
ax.get_xaxis().set_major_formatter(formatter)

fig.tight_layout(pad=0)
fig.savefig("cuda-stream.svg", dpi=300)
fig.savefig("cuda-stream.pdf", dpi=300)


plt.show()

print(maxbars)
145 | 
146 | 
def plotXbars(xbars, filename):
    """Draw a grouped bar chart of per-device bandwidths and save it.

    xbars maps a result file name to a list of bandwidth values in the order
    read, scale, triad, init (optionally followed by 1D3PT, 1D5PT). One group
    of bars is drawn per entry, one bar per value, and the figure is written
    to `filename` and then shown.
    """
    # Nothing to draw; indexing the first entry below would fail on an
    # empty dict.
    if not xbars:
        print("plotXbars: no data for", filename)
        return

    fig2, ax2 = plt.subplots(figsize=(6, 3))

    seriesLabels = ["read", "scale", "triad", "init", "1D3PT", "1D5PT"]
    deviceFiles = list(xbars.keys())
    valueCount = len(xbars[deviceFiles[0]])
    barWidth = 0.8 / valueCount

    for m in range(valueCount):
        # Center the group of valueCount bars on the integer tick position.
        ax2.bar(
            np.arange(len(xbars)) + barWidth * (m + 0.5 - valueCount / 2),
            [values[m] for values in xbars.values()],
            width=barWidth,
            color=device_color_palette[m],
            label=seriesLabels[m],
        )

    # Ticks and labels are derived from the data that was passed in, so they
    # always match the bars that were drawn.
    print(deviceFiles)
    ax2.set_xticks(range(len(deviceFiles)))
    ax2.set_xticklabels([order[getOrderNumber(f)].upper() for f in deviceFiles])
    ax2.set_ylabel("DRAM Bandwidth, GB/s")
    fig2.autofmt_xdate()
    ax2.legend()
    fig2.tight_layout(pad=0)
    fig2.savefig(filename, dpi=300)
    plt.show()


plotXbars(maxbars, "maxbars.pdf")
plotXbars(minbars, "minbars.pdf")
196 | 


--------------------------------------------------------------------------------
/gpu-roofline/mi300x.txt:
--------------------------------------------------------------------------------
  1 | 4
  2 | 6
  3 | 8
  4 | 10
  5 | 12
  6 | 14
  7 | 16
  8 | 18
  9 | 20
 10 | 22
 11 | 24
 12 | 28
 13 | 32
 14 | 36
 15 | 40
 16 | 44
 17 | 48
 18 | 54
 19 | 60
 20 | 66
 21 | 72
 22 | 80
 23 | 88
 24 | 96
 25 | 106
 26 | 116
 27 | 126
 28 | 138
 29 | 150
 30 | 164
 31 | 178
 32 | 194
 33 | 212
 34 | 230
 35 | 250
 36 | 272
 37 | 296
 38 | 322
 39 | 350
 40 | 380
 41 | 412
 42 | 448
 43 | 486
 44 | 528
 45 | 574
 46 | 622
 47 | 674
 48 | 732
 49 | 794
 50 | 862
 51 | 934
 52 | 1012
 53 | 1
 54 | -- Finished Building --
 55 | 0 2432 blocks     0 its      0.250 Fl/B       4073 GB/s      1018 GF/s   1254 Mhz   750 W   61°C
 56 | 
 57 | 0 2432 blocks     1 its      0.500 Fl/B       4006 GB/s      2003 GF/s   1248 Mhz   750 W   63°C
 58 | 
 59 | 0 2432 blocks     2 its      0.750 Fl/B       3986 GB/s      2990 GF/s   1243 Mhz   750 W   64°C
 60 | 
 61 | 0 2432 blocks     4 its      1.250 Fl/B       3889 GB/s      4861 GF/s   1243 Mhz   750 W   66°C
 62 | 
 63 | 0 2432 blocks     6 its      1.750 Fl/B       3896 GB/s      6818 GF/s   1249 Mhz   750 W   67°C
 64 | 
 65 | 0 2432 blocks     8 its      2.250 Fl/B       3860 GB/s      8684 GF/s   1255 Mhz   750 W   66°C
 66 | 
 67 | 0 2432 blocks    10 its      2.750 Fl/B       3840 GB/s     10559 GF/s   1266 Mhz   750 W   66°C
 68 | 
 69 | 0 2432 blocks    12 its      3.250 Fl/B       3787 GB/s     12307 GF/s   1251 Mhz   750 W   67°C
 70 | 
 71 | 0 2432 blocks    14 its      3.750 Fl/B       3762 GB/s     14109 GF/s   1251 Mhz   750 W   67°C
 72 | 
 73 | 0 2432 blocks    16 its      4.250 Fl/B       3743 GB/s     15909 GF/s   1204 Mhz   750 W   68°C
 74 | 
 75 | 0 2432 blocks    18 its      4.750 Fl/B       3731 GB/s     17723 GF/s   1144 Mhz   750 W   69°C
 76 | 
 77 | 0 2432 blocks    20 its      5.250 Fl/B       3710 GB/s     19475 GF/s   1093 Mhz   750 W   70°C
 78 | 
 79 | 0 2432 blocks    22 its      5.750 Fl/B       3686 GB/s     21193 GF/s   1047 Mhz   750 W   70°C
 80 | 
 81 | 0 2432 blocks    24 its      6.250 Fl/B       3646 GB/s     22788 GF/s   1056 Mhz   750 W   70°C
 82 | 
 83 | 0 2432 blocks    28 its      7.250 Fl/B       3582 GB/s     25970 GF/s   1061 Mhz   750 W   70°C
 84 | 
 85 | 0 2432 blocks    32 its      8.250 Fl/B       3571 GB/s     29461 GF/s   1046 Mhz   750 W   71°C
 86 | 
 87 | 0 2432 blocks    36 its      9.250 Fl/B       3515 GB/s     32512 GF/s   1041 Mhz   750 W   72°C
 88 | 
 89 | 0 2432 blocks    40 its      10.250 Fl/B       3464 GB/s     35506 GF/s   1030 Mhz   750 W   72°C
 90 | 
 91 | 0 2432 blocks    44 its      11.250 Fl/B       3398 GB/s     38226 GF/s   1010 Mhz   750 W   73°C
 92 | 
 93 | 0 2432 blocks    48 its      12.250 Fl/B       3342 GB/s     40940 GF/s   989 Mhz   750 W   73°C
 94 | 
 95 | 0 2432 blocks    54 its      13.750 Fl/B       3255 GB/s     44756 GF/s   980 Mhz   749 W   74°C
 96 | 
 97 | 0 2432 blocks    60 its      15.250 Fl/B       3137 GB/s     47841 GF/s   914 Mhz   750 W   74°C
 98 | 
 99 | 0 2432 blocks    66 its      16.750 Fl/B       3019 GB/s     50574 GF/s   943 Mhz   750 W   75°C
100 | 
101 | 0 2432 blocks    72 its      18.250 Fl/B       2911 GB/s     53125 GF/s   975 Mhz   749 W   76°C
102 | 
103 | 0 2432 blocks    80 its      20.250 Fl/B       2834 GB/s     57384 GF/s   981 Mhz   750 W   76°C
104 | 
105 | 0 2432 blocks    88 its      22.250 Fl/B       2656 GB/s     59103 GF/s   1000 Mhz   750 W   77°C
106 | 
107 | 0 2432 blocks    96 its      24.250 Fl/B       2585 GB/s     62680 GF/s   1009 Mhz   750 W   77°C
108 | 
109 | 0 2432 blocks   106 its      26.750 Fl/B       1993 GB/s     53307 GF/s   1051 Mhz   750 W   79°C
110 | 
111 | 0 2432 blocks   116 its      29.250 Fl/B       2097 GB/s     61330 GF/s   1037 Mhz   750 W   79°C
112 | 
113 | 0 2432 blocks   126 its      31.750 Fl/B       1989 GB/s     63135 GF/s   1055 Mhz   750 W   79°C
114 | 
115 | 0 2432 blocks   138 its      34.750 Fl/B       1851 GB/s     64308 GF/s   1052 Mhz   750 W   79°C
116 | 
117 | 0 2432 blocks   150 its      37.750 Fl/B       1704 GB/s     64325 GF/s   1058 Mhz   751 W   80°C
118 | 
119 | 0 2432 blocks   164 its      41.250 Fl/B       1634 GB/s     67390 GF/s   1073 Mhz   751 W   80°C
120 | 
121 | 0 2432 blocks   178 its      44.750 Fl/B       1362 GB/s     60939 GF/s   1156 Mhz   751 W   82°C
122 | 
123 | 0 2432 blocks   194 its      48.750 Fl/B       1245 GB/s     60712 GF/s   1140 Mhz   750 W   82°C
124 | 
125 | 0 2432 blocks   212 its      53.250 Fl/B       1321 GB/s     70324 GF/s   1187 Mhz   751 W   81°C
126 | 
127 | 0 2432 blocks   230 its      57.750 Fl/B       1334 GB/s     77051 GF/s   1202 Mhz   751 W   81°C
128 | 
129 | 0 2432 blocks   250 its      62.750 Fl/B       1206 GB/s     75647 GF/s   1203 Mhz   751 W   81°C
130 | 
131 | 0 2432 blocks   272 its      68.250 Fl/B       1162 GB/s     79328 GF/s   1228 Mhz   751 W   81°C
132 | 
133 | 0 2432 blocks   296 its      74.250 Fl/B       1085 GB/s     80589 GF/s   1241 Mhz   751 W   81°C
134 | 
135 | 0 2432 blocks   322 its      80.750 Fl/B       1018 GB/s     82230 GF/s   1259 Mhz   751 W   80°C
136 | 
137 | 0 2432 blocks   350 its      87.750 Fl/B        949 GB/s     83258 GF/s   1272 Mhz   751 W   81°C
138 | 
139 | 0 2432 blocks   380 its      95.250 Fl/B        878 GB/s     83621 GF/s   1273 Mhz   751 W   82°C
140 | 
141 | 0 2432 blocks   412 its      103.250 Fl/B        767 GB/s     79183 GF/s   1278 Mhz   745 W   81°C
142 | 
143 | 0 2432 blocks   448 its      112.250 Fl/B        764 GB/s     85721 GF/s   1292 Mhz   750 W   80°C
144 | 
145 | 0 2432 blocks   486 its      121.750 Fl/B        710 GB/s     86456 GF/s   1305 Mhz   748 W   79°C
146 | 
147 | 0 2432 blocks   528 its      132.250 Fl/B        667 GB/s     88246 GF/s   1312 Mhz   750 W   81°C
148 | 
149 | 0 2432 blocks   574 its      143.750 Fl/B        617 GB/s     88723 GF/s   1320 Mhz   748 W   82°C
150 | 
151 | 0 2432 blocks   622 its      155.750 Fl/B        508 GB/s     79101 GF/s   1427 Mhz   733 W   82°C
152 | 
153 | 0 2432 blocks   674 its      168.750 Fl/B        473 GB/s     79819 GF/s   1439 Mhz   731 W   82°C
154 | 
155 | 0 2432 blocks   732 its      183.250 Fl/B        496 GB/s     90942 GF/s   1366 Mhz   745 W   82°C
156 | 
157 | 0 2432 blocks   794 its      198.750 Fl/B        410 GB/s     81524 GF/s   1464 Mhz   730 W   82°C
158 | 
159 | 0 2432 blocks   862 its      215.750 Fl/B        382 GB/s     82440 GF/s   1476 Mhz   730 W   81°C
160 | 
161 | 0 2432 blocks   934 its      233.750 Fl/B        355 GB/s     83022 GF/s   1486 Mhz   728 W   81°C
162 | 
163 | 0 2432 blocks   1012 its      253.250 Fl/B        381 GB/s     96524 GF/s   1401 Mhz   747 W   82°C
164 | 
165 | 


--------------------------------------------------------------------------------
/gpu-roofline/h200.txt:
--------------------------------------------------------------------------------
  1 | 8
  2 | 16
  3 | 24
  4 | 32
  5 | 40
  6 | 48
  7 | 56
  8 | 64
  9 | 72
 10 | 80
 11 | 88
 12 | 96
 13 | 104
 14 | 112
 15 | 120
 16 | 128
 17 | 136
 18 | 144
 19 | 152
 20 | 160
 21 | 168
 22 | 176
 23 | 184
 24 | 192
 25 | 200
 26 | 208
 27 | 216
 28 | 224
 29 | 232
 30 | 240
 31 | 248
 32 | 256
 33 | 264
 34 | 272
 35 | 280
 36 | 288
 37 | 296
 38 | 304
 39 | 312
 40 | 320
 41 | 328
 42 | 336
 43 | 344
 44 | 352
 45 | 360
 46 | 368
 47 | 376
 48 | 384
 49 | 392
 50 | 400
 51 | 408
 52 | 416
 53 | 424
 54 | 432
 55 | 440
 56 | 448
 57 | 456
 58 | 464
 59 | 472
 60 | 480
 61 | 488
 62 | 496
 63 | 504
 64 | 512
 65 | 2
 66 | -- Finished Building --
 67 | 0 1056 blocks     0 its      0.250 Fl/B       3759 GB/s       940 GF/s   1980 Mhz   572 W   57°C
 68 | 
 69 | 0 1056 blocks     1 its      0.500 Fl/B       3780 GB/s      1890 GF/s   1980 Mhz   578 W   58°C
 70 | 
 71 | 0 1056 blocks     2 its      0.750 Fl/B       3763 GB/s      2822 GF/s   1980 Mhz   583 W   58°C
 72 | 
 73 | 0 1056 blocks     4 its      1.250 Fl/B       3779 GB/s      4724 GF/s   1980 Mhz   597 W   59°C
 74 | 
 75 | 0 1056 blocks     8 its      2.250 Fl/B       3760 GB/s      8459 GF/s   1980 Mhz   615 W   60°C
 76 | 
 77 | 0 1056 blocks    16 its      4.250 Fl/B       3735 GB/s     15874 GF/s   1980 Mhz   650 W   62°C
 78 | 
 79 | 0 1056 blocks    24 its      6.250 Fl/B       3629 GB/s     22683 GF/s   1980 Mhz   675 W   63°C
 80 | 
 81 | 0 1056 blocks    32 its      8.250 Fl/B       3232 GB/s     26667 GF/s   1980 Mhz   653 W   64°C
 82 | 
 83 | 0 1056 blocks    40 its      10.250 Fl/B       2690 GB/s     27571 GF/s   1980 Mhz   595 W   63°C
 84 | 
 85 | 0 1056 blocks    48 its      12.250 Fl/B       2323 GB/s     28456 GF/s   1980 Mhz   558 W   63°C
 86 | 
 87 | 0 1056 blocks    56 its      14.250 Fl/B       2029 GB/s     28920 GF/s   1980 Mhz   526 W   63°C
 88 | 
 89 | 0 1056 blocks    64 its      16.250 Fl/B       2204 GB/s     35812 GF/s   1980 Mhz   583 W   65°C
 90 | 
 91 | 0 1056 blocks    72 its      18.250 Fl/B       2002 GB/s     36541 GF/s   1980 Mhz   562 W   65°C
 92 | 
 93 | 0 1056 blocks    80 its      20.250 Fl/B       1838 GB/s     37227 GF/s   1980 Mhz   547 W   65°C
 94 | 
 95 | 0 1056 blocks    88 its      22.250 Fl/B       1700 GB/s     37819 GF/s   1980 Mhz   531 W   65°C
 96 | 
 97 | 0 1056 blocks    96 its      24.250 Fl/B       1578 GB/s     38276 GF/s   1980 Mhz   519 W   65°C
 98 | 
 99 | 0 1056 blocks   104 its      26.250 Fl/B       1475 GB/s     38727 GF/s   1980 Mhz   511 W   65°C
100 | 
101 | 0 1056 blocks   112 its      28.250 Fl/B       1365 GB/s     38569 GF/s   1980 Mhz   501 W   65°C
102 | 
103 | 0 1056 blocks   120 its      30.250 Fl/B       1303 GB/s     39404 GF/s   1980 Mhz   495 W   65°C
104 | 
105 | 0 1056 blocks   128 its      32.250 Fl/B       1213 GB/s     39109 GF/s   1980 Mhz   491 W   65°C
106 | 
107 | 0 1056 blocks   136 its      34.250 Fl/B       1155 GB/s     39570 GF/s   1980 Mhz   479 W   65°C
108 | 
109 | 0 1056 blocks   144 its      36.250 Fl/B       1095 GB/s     39692 GF/s   1980 Mhz   473 W   65°C
110 | 
111 | 0 1056 blocks   152 its      38.250 Fl/B       1045 GB/s     39981 GF/s   1980 Mhz   467 W   65°C
112 | 
113 | 0 1056 blocks   160 its      40.250 Fl/B        989 GB/s     39827 GF/s   1980 Mhz   467 W   65°C
114 | 
115 | 0 1056 blocks   168 its      42.250 Fl/B        954 GB/s     40308 GF/s   1980 Mhz   463 W   65°C
116 | 
117 | 0 1056 blocks   176 its      44.250 Fl/B        910 GB/s     40254 GF/s   1980 Mhz   455 W   64°C
118 | 
119 | 0 1056 blocks   184 its      46.250 Fl/B        876 GB/s     40524 GF/s   1980 Mhz   450 W   64°C
120 | 
121 | 0 1056 blocks   192 its      48.250 Fl/B        837 GB/s     40399 GF/s   1980 Mhz   450 W   64°C
122 | 
123 | 0 1056 blocks   200 its      50.250 Fl/B        813 GB/s     40870 GF/s   1980 Mhz   443 W   64°C
124 | 
125 | 0 1056 blocks   208 its      52.250 Fl/B        784 GB/s     40956 GF/s   1980 Mhz   442 W   64°C
126 | 
127 | 0 1056 blocks   216 its      54.250 Fl/B        761 GB/s     41269 GF/s   1980 Mhz   440 W   64°C
128 | 
129 | 0 1056 blocks   224 its      56.250 Fl/B        727 GB/s     40898 GF/s   1980 Mhz   439 W   64°C
130 | 
131 | 0 1056 blocks   232 its      58.250 Fl/B        709 GB/s     41309 GF/s   1980 Mhz   431 W   64°C
132 | 
133 | 0 1056 blocks   240 its      60.250 Fl/B        690 GB/s     41602 GF/s   1980 Mhz   433 W   64°C
134 | 
135 | 0 1056 blocks   248 its      62.250 Fl/B        667 GB/s     41499 GF/s   1980 Mhz   425 W   64°C
136 | 
137 | 0 1056 blocks   256 its      64.250 Fl/B        642 GB/s     41232 GF/s   1980 Mhz   428 W   64°C
138 | 
139 | 0 1056 blocks   264 its      66.250 Fl/B        629 GB/s     41687 GF/s   1980 Mhz   422 W   63°C
140 | 
141 | 0 1056 blocks   272 its      68.250 Fl/B        611 GB/s     41706 GF/s   1980 Mhz   422 W   63°C
142 | 
143 | 0 1056 blocks   280 its      70.250 Fl/B        595 GB/s     41795 GF/s   1980 Mhz   421 W   63°C
144 | 
145 | 0 1056 blocks   288 its      72.250 Fl/B        575 GB/s     41519 GF/s   1980 Mhz   420 W   63°C
146 | 
147 | 0 1056 blocks   296 its      74.250 Fl/B        566 GB/s     42006 GF/s   1980 Mhz   414 W   63°C
148 | 
149 | 0 1056 blocks   304 its      76.250 Fl/B        548 GB/s     41818 GF/s   1980 Mhz   415 W   63°C
150 | 
151 | 0 1056 blocks   312 its      78.250 Fl/B        536 GB/s     41925 GF/s   1980 Mhz   412 W   63°C
152 | 
153 | 0 1056 blocks   320 its      80.250 Fl/B        520 GB/s     41724 GF/s   1980 Mhz   413 W   63°C
154 | 
155 | 0 1056 blocks   328 its      82.250 Fl/B        512 GB/s     42115 GF/s   1980 Mhz   408 W   63°C
156 | 
157 | 0 1056 blocks   336 its      84.250 Fl/B        500 GB/s     42103 GF/s   1980 Mhz   408 W   63°C
158 | 
159 | 0 1056 blocks   344 its      86.250 Fl/B        488 GB/s     42129 GF/s   1980 Mhz   405 W   63°C
160 | 
161 | 0 1056 blocks   352 its      88.250 Fl/B        474 GB/s     41817 GF/s   1980 Mhz   409 W   63°C
162 | 
163 | 0 1056 blocks   360 its      90.250 Fl/B        469 GB/s     42329 GF/s   1980 Mhz   405 W   63°C
164 | 
165 | 0 1056 blocks   368 its      92.250 Fl/B        456 GB/s     42111 GF/s   1980 Mhz   403 W   63°C
166 | 
167 | 0 1056 blocks   376 its      94.250 Fl/B        447 GB/s     42092 GF/s   1980 Mhz   402 W   63°C
168 | 
169 | 0 1056 blocks   384 its      96.250 Fl/B        436 GB/s     41961 GF/s   1980 Mhz   402 W   63°C
170 | 
171 | 0 1056 blocks   392 its      98.250 Fl/B        432 GB/s     42426 GF/s   1980 Mhz   396 W   63°C
172 | 
173 | 0 1056 blocks   400 its      100.250 Fl/B        425 GB/s     42558 GF/s   1980 Mhz   398 W   63°C
174 | 
175 | 0 1056 blocks   408 its      102.250 Fl/B        415 GB/s     42442 GF/s   1980 Mhz   396 W   63°C
176 | 
177 | 


--------------------------------------------------------------------------------
/gpu-small-kernels/plot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import os
  4 | import csv
  5 | import numpy as np
  6 | import math
  7 | from random import *
  8 | 
  9 | 
 10 | import sys
 11 | 
 12 | sys.path.append("..")
 13 | from device_order import *
 14 | 
# One figure with a single axes object; every curve below is drawn onto `ax`.
# NOTE(review): `plt` is not imported in this file, so it presumably comes from
# the star-import of device_order -- confirm.
fig, ax = plt.subplots(figsize=(10, 6))


# The three containers below are not referenced again anywhere in this script.
maxbars = {}
minbars = {}

peakBW = [897, 1555, 2039, 2039, 1229, 1638]


# Upper-case file name prefixes: a file in the working directory is plotted
# only if its upper-cased name starts with one of these.
filesToInclude = ["L40", "A100", "RX6900XT", "MI210", "H200"]

# filesToInclude = ["L40", "RX6900XT"]
 27 | 
 28 | 
 29 | def getIncludeNumber(filename):
 30 |     for i in range(len(filesToInclude)):
 31 |         if filename.upper().startswith(filesToInclude[i]):
 32 |             return i
 33 |     return len(filesToInclude) + 1
 34 | 
 35 | 
def fitValues(xdata, ydata, color=None):
    """Fit a two-parameter saturation model to a prefix of the data and plot it.

    The model is ``y = x / (a / 1e9 + x / 1e9 / b)``; the printed labels
    report ``a`` in ns and ``b`` in GB/s.

    Parameters:
        xdata: x values; compared against 3 * 1024 * 1024 and
            100 * 1024 * 1024 below (presumably sizes in bytes -- confirm
            against the caller).
        ydata: y values. Modified IN PLACE by the smoothing step.
            Assumes a numpy array (element-wise ``+`` and ``/``).
        color: accepted but not used; the fitted line is always drawn black.

    Returns:
        The `perr` value of the LAST candidate evaluated in the search loop,
        or -1 if no candidate fell into the size window.
        NOTE(review): this is not necessarily the error of the best fit that
        is plotted -- confirm whether `best` was intended.
    """
    # 5-point moving average over the interior samples; the first two and the
    # last two entries keep their original values.
    ydata[2:-2] = (
        ydata[0:-4] + ydata[1:-3] + ydata[2:-2] + ydata[3:-1] + ydata[4:]
    ) / 5

    from scipy.optimize import curve_fit

    # def func(x, a, b, c):
    #    return a * np.exp(-b * np.exp(-c * x))

    def func(x, a, b):
        return x / (a / 1e9 + (x / 1e9 / b))

    # `best == 0` doubles as the "nothing recorded yet" marker.
    best = 0
    lim = 1
    bestLim = lim
    perr = -1

    # Try every prefix length `lim` in 2 .. len(xdata) - 1 and keep the one
    # with the smallest error measure.
    while lim + 1 < len(xdata):
        lim += 1
        # Only consider cut-offs whose x value lies inside the window; note
        # that xdata[lim] is the first sample NOT included in xdata[:lim].
        if xdata[lim] < 3 * 1024 * 1024 or xdata[lim] > 100 * 1024 * 1024:
            continue

        popt, pcov, infodict, mesg, ier = curve_fit(
            func,
            xdata[:lim],
            ydata[:lim],
            bounds=([0, 0], [np.inf, np.inf]),
            full_output=True,
        )
        # print(popt)
        # print(pcov)
        # print(mesg)
        # Error measure: product of the two diagonal entries of the
        # covariance matrix returned by curve_fit.
        perr = np.diag(pcov)[0] * np.diag(pcov)[1]
        if perr < best or best == 0:
            best = perr
            bestLim = lim

        print("%d fit: a=%5.0f ns,   b=%5.0f GB/s," % (lim, popt[0], popt[1]))
    print()
    # print(perr)

    # Redo the fit with the selected prefix length so popt matches bestLim.
    # NOTE(review): if no candidate was inside the window, bestLim is still 1
    # and this fit runs on a single sample -- confirm this cannot happen.
    lim = bestLim
    popt, pcov, infodict, mesg, ier = curve_fit(
        func,
        xdata[:lim],
        ydata[:lim],
        bounds=([0, 0], [np.inf, np.inf]),
        full_output=True,
    )
    print(lim, best)

    # xdata = np.array([*list(xdata), *[i / 25 for i in range(1, 25)]])
    # xdata.sort()

    # Draw the fitted model over the fitted range; x is divided by 1024 and
    # y by 1e9 before plotting.
    plt.plot(
        xdata[:lim] / 1024,
        func(xdata[:lim], *popt) / 1e9,
        "-",
        color="black",  # icolor,
        label="fit: a=%5.0f ns, b=%5.0f GB/s," % (popt[0], popt[1]),
        zorder=-1,
        linewidth=2,
        alpha=1.0,
    )
    return perr
102 | 
103 | 
def fitCurve(splitA, splitB, color=None):
    """Fit and plot the model over rows `splitA:splitB` of the current data set.

    Reads the module-level `sizes` and `bw`. For every row of `bw` the
    largest value is taken (rows shorter than the first row are padded with
    0), the result is sliced to `splitA:splitB`, scaled by 1e9 and handed to
    `fitValues` together with the matching slice of `sizes`.
    """
    sizeWindow = sizes[splitA:splitB]

    rowMaxima = []
    for row in bw:
        rowMaxima.append(
            max(row[col] if col < len(row) else 0 for col in range(len(bw[0])))
        )

    scaledMaxima = np.array(rowMaxima[splitA:splitB]) * 1e9
    fitValues(sizeWindow, scaledMaxima, color)
115 | 
116 | 
def getOrderNumber(f):
    """Return the index of the first entry of `order` that `f` starts with.

    Falls back to `len(order) + 1` when no entry is a prefix of `f`. The
    comparison is case-sensitive.
    """
    return next(
        (rank for rank, prefix in enumerate(order) if f.startswith(prefix)),
        len(order) + 1,
    )
122 | 
123 | 
def getData(filename):
    """Parse a space-separated measurement file.

    Each data row contributes its first field (as float) to `dims` and the
    non-empty fields from the third one onwards (as floats) to `bw`; the
    second field is ignored. Rows whose first field is "blockSize" are
    treated as headers and skipped. Blank or whitespace-only lines are
    skipped as well instead of raising.

    Parameters:
        filename: path of the file to read.

    Returns:
        (dims, bw): a list of floats and a list of float lists, one entry
        per data row.

    Raises:
        ValueError: if a data field cannot be converted to float.
        OSError: if the file cannot be opened.
    """
    dims = []
    bw = []

    with open(filename, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=" ", skipinitialspace=True)

        for row in csvreader:
            # csv.reader yields [] for an empty line and [""] for a line that
            # only contains spaces; neither carries data.
            if not row or row[0] == "":
                continue
            if row[0] == "blockSize":
                continue

            dims.append(float(row[0]))
            # Trailing spaces produce empty fields, which are dropped here.
            bw.append([float(field) for field in row[2:] if len(field) > 0])

    return dims, bw
142 | 
143 | 
# Pairs (x, 1024 // x) for the powers of two x = 4 .. 1024, i.e.
# (4, 256), (8, 128), ..., (1024, 1).
blockSizes = [(1 << shift, 1024 >> shift) for shift in range(2, 11)]
147 | 
148 | 
def getColor(b):
    """Map every entry of `b` to a value of at most 1.0 on a log2 scale.

    Each entry c becomes min(1.0, log2(c) / log2(128) * 1.4); the results
    are returned as a tuple in the same order as `b`.
    """
    channels = []
    for c in b:
        scaled = math.log2(c) / math.log2(128) * 1.4
        channels.append(min(1.0, scaled))
    return tuple(channels)
151 | 
152 | 
# Walk the working directory in device order and, for every selected
# measurement file, plot the per-row maximum against the data size and
# overlay a fitted curve.
for filename in sorted(sorted(os.listdir(".")), key=getOrderNumber):
    # Only files whose upper-cased name starts with a requested prefix.
    if not any(filename.upper().startswith(prefix) for prefix in filesToInclude):
        continue
    # Skip the measurement variants; this substring test is case-sensitive.
    if (
        "linear" in filename
        or "graph" in filename
        or "pt" in filename
        or "gsync" in filename
    ):
        continue

    dims, bw = getData(filename)
    if len(bw) < 3:
        continue

    dims = np.array(dims)
    sizes = dims * 16

    lineStyle["marker"] = None  # "|" if "graph" in filename.lower() else "_"
    lineStyle["linewidth"] = 2
    # Line style by variant tag (case-insensitive here).
    if "gsync" in filename.lower():
        lineStyle["linestyle"] = "-."
    elif "pt" in filename.lower():
        lineStyle["linestyle"] = ":"
    elif "graph" in filename.lower():
        lineStyle["linestyle"] = "--"
    else:
        lineStyle["linestyle"] = "-"

    b = 2
    # y value per row: the largest entry, rows shorter than the first row
    # are padded with 0.
    ax.plot(
        sizes / 1024,
        [
            max(row[col] if col < len(row) else 0 for col in range(len(bw[0])))
            for row in bw
        ],
        label=filename[:-4].upper(),
        color="C" + str(getOrderNumber(filename)),
        **lineStyle,
        zorder=0
    )

    # rx6900
    # fitCurve(0, 80)
    # fitCurve(84, 110)
    # fitCurve(110, 139)
    # fitCurve(146, 240)

    # mi210
    # fitCurve(0, 76)
    # fitCurve(98, 160)

    # A100
    # fitCurve(0, 102)
    # fitCurve(117, 220)

    # L100
    # fitCurve(0, 128)
    # fitCurve(146, 195)

    # v100
    fitCurve(2, 120)

    # fitCurve(102, 250)
211 | 
212 | 
def func(x, a, b):
    """Evaluate the model x / (a / 1e9 + x * 16 / 1e9 / b).

    NOTE(review): presumably `a` is a latency in ns and `b` a bandwidth in
    GB/s, with 16 bytes moved per unit of `x` -- confirm.
    """
    fixedTerm = a / 1e9
    scalingTerm = x * 16 / 1e9 / b
    return x / (fixedTerm + scalingTerm)
215 | 
216 | 
217 | # values = np.arange(256, 32 * 1024, 256)
218 | # ax.plot(
219 | #    values,
220 | #    func(values * 1024 / 8, 3000, 2100) * 1e-9 * 16,
221 | #    color="red",
222 | #    linewidth=3,
223 | #    label="MI300X, \n fit: a = 15000 GB/s, \n      b = 3000 ns",
224 | # )
225 | 
226 | # values = np.arange(32 * 1024, 1024 * 1024, 256)
227 | # ax.plot(
228 | #    values, func(values * 1024 / 8, 3000, 500) * 1e-9 * 16, color="red", linewidth=3
229 | # )
230 | 
231 | 
# Axis labels.
ax.set_xlabel("grid size, kB")
ax.set_ylabel("GB/s")

# Logarithmic x axis with explicit tick positions printed as plain numbers.
# The scale is set once, before the ticks and the formatter are installed.
ax.set_xscale("log")
ax.set_xticks([128, 256, 512, 1024, 2048, 8192, 20 * 1024, 64 * 1024])
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())

# ax.axhline(1400, linestyle="--", color="C1")
# ax.axhline(800, linestyle="--", color="C0")

# ax.grid()
ax.legend()
# Pin the lower y limit to 0 and keep the automatically chosen upper limit.
ax.set_ylim([0, ax.get_ylim()[1]])
ax.set_xlim([64, 512 * 1024])

fig.tight_layout()
fig.savefig("repeated-stream.svg", dpi=300)


plt.show()
253 | 


--------------------------------------------------------------------------------
/rocm-metrics/rocm-metrics.hpp:
--------------------------------------------------------------------------------
  1 | 
  2 | /******************************************************************************
  3 | Copyright (c) 2018 Advanced Micro Devices, Inc. and Dominik Ernst
  4 | Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | of this software and associated documentation files (the "Software"), to deal
  6 | in the Software without restriction, including without limitation the rights
  7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | copies of the Software, and to permit persons to whom the Software is
  9 | furnished to do so, subject to the following conditions:
 10 | The above copyright notice and this permission notice shall be included in
 11 | all copies or substantial portions of the Software.
 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 18 | THE SOFTWARE.
 19 | *******************************************************************************/
 20 | 
 21 | #ifndef ROCM_METRICS_H_
 22 | #define ROCM_METRICS_H_
 23 | 
 24 | #include "hip/hip_runtime.h"
 25 | #include <hsa/hsa.h>
 26 | #include <iostream>
 27 | #include <rocprofiler.h>
 28 | #include <vector>
 29 | #include <unistd.h>
 30 | 
 31 | #define HSA_ASSERT(x) (assert((x) == HSA_STATUS_SUCCESS))
 32 | 
 33 | #define ROCP_CALL_CK(call)                                                     \
 34 |   do {                                                                         \
 35 |     hsa_status_t _status = call;                                               \
 36 |     if (_status != HSA_STATUS_SUCCESS && _status != HSA_STATUS_INFO_BREAK) {   \
 37 |       const char *profErr;                                                     \
 38 |       rocprofiler_error_string(&profErr);                                      \
 39 |       std::cout << "ERROR: function call \n \"" << #call                       \
 40 |                 << "\" at " __FILE__ ":" << __LINE__                           \
 41 |                 << " \n failed with status " << _status << ": \" " << profErr  \
 42 |                 << "\"\n";                                                     \
 43 |     }                                                                          \
 44 |   } while (0);
 45 | 
// GPU agents collected by _count_devices(), in the order they are reported.
// Fixed capacity of 16 entries.
hsa_agent_t agent_info_arr[16];
// Number of collected GPU agents; assigned in initMeasureMetric().
unsigned agent_info_arr_len;
 48 | 
 49 | static hsa_status_t _count_devices(hsa_agent_t agent, void *data) {
 50 |   unsigned *count = (unsigned *)data;
 51 |   hsa_device_type_t type;
 52 |   hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
 53 |   assert(status == HSA_STATUS_SUCCESS);
 54 |   if (type == HSA_DEVICE_TYPE_GPU) {
 55 |     agent_info_arr[(*count)++] = agent;
 56 |   }
 57 |   return status;
 58 | }
 59 | 
 60 | static unsigned _get_device_count(void) {
 61 |   unsigned count = 0;
 62 |   hsa_status_t status = hsa_iterate_agents(&_count_devices, &count);
 63 |   assert(status == HSA_STATUS_SUCCESS);
 64 |   return count;
 65 | }
 66 | 
 67 | static hsa_agent_t _get_agent(unsigned gpu_id) {
 68 |   return agent_info_arr[gpu_id];
 69 | }
 70 | 
 71 | hsa_status_t info_data_callback(const rocprofiler_info_data_t info,
 72 |                                 void *data) {
 73 | 
 74 |     std::cout << "info data callback\n";
 75 |   switch (info.kind) {
 76 |   case ROCPROFILER_INFO_KIND_METRIC: {
 77 |     if (info.metric.expr != NULL) {
 78 |       std::cout << "Derived counter:  gpu-agent" << info.agent_index << " "
 79 |                 << info.metric.name << ": " << info.metric.description;
 80 | 
 81 |       std::cout << info.metric.name << " = " << info.metric.expr << "\n";
 82 |     } else {
 83 |       std::cout << "Basic counter:  gpu-agent" << info.agent_index << ": "
 84 |                 << info.metric.name << "\n";
 85 |       if (info.metric.instances > 1) {
 86 |         std::cout << "[0-" << info.metric.instances - 1 << "]\n";
 87 |       }
 88 |       std::cout << " : " << info.metric.description;
 89 |       std::cout << "      block " << info.metric.block_name << " has "
 90 |                 << info.metric.block_counters << " counters\n";
 91 |     }
 92 |     break;
 93 |   }
 94 |   default:
 95 |     return HSA_STATUS_ERROR;
 96 |   }
 97 |   return HSA_STATUS_SUCCESS;
 98 | }
// Prints every metric that rocprofiler reports for `agent` by iterating the
// metric info records with info_data_callback(). A failing iteration is
// reported on std::cout by ROCP_CALL_CK; nothing is returned to the caller.
void printMetrics(hsa_agent_t agent) {
  ROCP_CALL_CK( rocprofiler_iterate_info(
      &agent, ROCPROFILER_INFO_KIND_METRIC, info_data_callback, NULL));

}
104 | 
// Agent that is profiled; assigned in initMeasureMetric().
hsa_agent_t agent;
// Profiling context
// Created by rocprofiler_open() in initMeasureMetric(), NULL until then.
rocprofiler_t *context = NULL;

// Number of profiled features and the feature descriptors themselves.
const unsigned feature_count = 2;
rocprofiler_feature_t feature[feature_count];
// Feature values seen by the previous measureMetricStop() call; used there
// to turn the running values into per-interval deltas.
double prevValues[feature_count];
112 | 
113 | 
114 | void measureBandwidthStart() {
115 |     hipDeviceSynchronize();
116 |   // Start counters and sample them in the loop with the sampling rate
117 | }
118 | 
119 | std::vector<double> measureMetricStop() {
120 |   hipDeviceSynchronize();
121 | 
122 | 
123 |   std::vector<double> results(6,0);
124 | 
125 |   ROCP_CALL_CK( rocprofiler_read(context, 0));
126 |   ROCP_CALL_CK( rocprofiler_get_data(context, 0));
127 |   ROCP_CALL_CK( rocprofiler_get_metrics(context));
128 |   // print_results(feature, feature_count);
129 | 
130 | 
131 |   double v1 = (feature[0].data.result_double - prevValues[0]);
132 |   double v2 = (feature[1].data.result_double - prevValues[1]);
133 | 
134 | 
135 |   results[0] = v1 * 32;
136 |   results[2] = (v2 * 32) / 32;
137 | 
138 |   
139 | 
140 |   for (unsigned i = 0; i < feature_count; ++i) {
141 |       const rocprofiler_feature_t *p = &feature[i];
142 |       //std::cout << p->name << ": ";
143 | 
144 |       double val = 0;
145 |       switch(p->data.kind) {
146 |           case ROCPROFILER_DATA_KIND_INT64:
147 |               val = p->data.result_int64;
148 |               break;
149 |           case ROCPROFILER_DATA_KIND_DOUBLE:
150 |               val = p->data.result_double;
151 |               break;
152 |           default:
153 |               std::cout << "Undefined data kind: " << p->data.kind << "\n";
154 |               assert(0);
155 |       }
156 |       //std::cout << "= " << val << ", Delta: " << val - prevValues[i] << "\n";
157 |       prevValues[i] = val;
158 |   }
159 | 
160 | 
161 | 
162 |   // Stop counters
163 |   //ROCP_CALL_CK( rocprofiler_stop(context, 0));
164 | 
165 |   return results;
166 | }
167 | 
168 | void initMeasureMetric() {
169 |   setenv("HSA_TOOLS_LIB", "/opt/rocm/rocprofiler/lib/librocprofiler64.so", 1);
170 |   setenv("ROCP_METRICS", "/opt/rocm/lib/rocprofiler/metrics.xml", 1);
171 | 
172 | 
173 |   HSA_ASSERT(hsa_init());
174 |   hsa_status_t status = HSA_STATUS_ERROR;
175 |   // HSA agent
176 | 
177 |   unsigned gpu_count = _get_device_count();
178 |   agent_info_arr_len = gpu_count;
179 | 
180 |   for (unsigned gpu_id = 0; gpu_id < gpu_count; ++gpu_id) {
181 |     hsa_agent_t agent = _get_agent(gpu_id);
182 |     std::cout << "Agent " << gpu_id << "\n";
183 |   }
184 | 
185 |   agent = _get_agent(0);
186 | 
187 |   //printMetrics(agent);
188 | 
189 |   // Profiling feature objects
190 | 
191 |   // Counters and metrics
192 |   feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC;
193 |   feature[0].name = "TCP_TOTAL_CACHE_ACCESSES_sum";
194 |   feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC;
195 |   feature[1].name = "TCP_TCC_READ_REQ_sum";
196 | 
197 |   //feature[2].kind = ROCPROFILER_FEATURE_KIND_METRIC;
198 |   //feature[2].name = "FETCH_SIZE";
199 |   //feature[3].kind = ROCPROFILER_FEATURE_KIND_METRIC;
200 |   //feature[3].name = "WRITE_SIZE";
201 | 
202 |   // Creating profiling context with standalone queue
203 |   rocprofiler_properties_t properties = {};
204 |   properties.queue_depth = 128;
205 |   uint32_t mode = ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_CREATEQUEUE |
206 |                   ROCPROFILER_MODE_SINGLEGROUP;
207 | 
208 |   properties.queue_depth = 128;
209 | 
210 |   ROCP_CALL_CK(rocprofiler_open(agent, feature, feature_count, &context, mode,
211 |                                 &properties));
212 | 
213 |   ROCP_CALL_CK(rocprofiler_start(context, 0));
214 | }
215 | 
216 | #endif // ROCM-METRICS_H_
217 | 


--------------------------------------------------------------------------------